Merge tag 'v3.10.34' into linux-linaro-lsk
[firefly-linux-kernel-4.4.55.git] / kernel / sched / fair.c
index c7ab8eab5427893ab20ef0ad5dccd5b796331e00..c6d617b2c506d1b64be175b6ac5908174d51609f 100644 (file)
 #include <linux/task_work.h>
 
 #include <trace/events/sched.h>
+#include <linux/sysfs.h>
+#include <linux/vmalloc.h>
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/*
+ * Include the cpufreq header so that a notifier can be registered and
+ * the load-scaling code can track the current CPU frequency.
+ */
+#include <linux/cpufreq.h>
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
 
 #include "sched.h"
 
+
 /*
  * Targeted preemption latency for CPU-bound tasks:
  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
@@ -1208,8 +1217,91 @@ static u32 __compute_runnable_contrib(u64 n)
        return contrib + runnable_avg_yN_sum[n];
 }
 
-/*
- * We can represent the historical contribution to runnable average as the
+#ifdef CONFIG_SCHED_HMP
+#define HMP_VARIABLE_SCALE_SHIFT 16ULL
+struct hmp_global_attr {
+       struct attribute attr;
+       ssize_t (*show)(struct kobject *kobj,
+                       struct attribute *attr, char *buf);
+       ssize_t (*store)(struct kobject *a, struct attribute *b,
+                       const char *c, size_t count);
+       int *value;
+       int (*to_sysfs)(int);
+       int (*from_sysfs)(int);
+       ssize_t (*to_sysfs_text)(char *buf, int buf_size);
+};
+
+#define HMP_DATA_SYSFS_MAX 8
+
+struct hmp_data_struct {
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+       int freqinvar_load_scale_enabled;
+#endif
+       int multiplier; /* used to scale the time delta */
+       struct attribute_group attr_group;
+       struct attribute *attributes[HMP_DATA_SYSFS_MAX + 1];
+       struct hmp_global_attr attr[HMP_DATA_SYSFS_MAX];
+} hmp_data;
+
+static u64 hmp_variable_scale_convert(u64 delta);
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* Frequency-Invariant Load Modification:
+ * Loads are calculated as in PJT's patch; however, we also scale the
+ * current contribution in line with the frequency of the CPU that the
+ * task was executed on.
+ * In this version, we use a simple linear scale derived from the maximum
+ * frequency reported by CPUFreq. As an example:
+ *
+ * Consider that we ran a task for 100% of the previous interval.
+ *
+ * Our CPU was under asynchronous frequency control through one of the
+ * CPUFreq governors.
+ *
+ * The CPUFreq governor reports that it is able to scale the CPU between
+ * 500MHz and 1GHz.
+ *
+ * During the period, the CPU was running at 1GHz.
+ *
+ * In this case, our load contribution for that period is calculated as
+ * 1 * (number_of_active_microseconds)
+ *
+ * This results in our task being able to accumulate maximum load as normal.
+ *
+ *
+ * Consider now that our CPU was executing at 500MHz.
+ *
+ * We now scale the load contribution such that it is calculated as
+ * 0.5 * (number_of_active_microseconds)
+ *
+ * Our task can only record 50% maximum load during this period.
+ *
+ * This represents the task consuming 50% of the CPU's *possible* compute
+ * capacity. However the task did consume 100% of the CPU's *available*
+ * compute capacity which is the value seen by the CPUFreq governor and
+ * user-side CPU Utilization tools.
+ *
+ * Scaling the tracked load by the CPU's current frequency accurately
+ * represents the consumption of possible compute capacity and allows
+ * HMP migration's simple threshold strategy to interact more
+ * predictably with CPUFreq's asynchronous compute capacity changes.
+ */
+#define SCHED_FREQSCALE_SHIFT 10
+struct cpufreq_extents {
+       u32 curr_scale;
+       u32 min;
+       u32 max;
+       u32 flags;
+};
+/* Flag set when the governor in use only allows one frequency.
+ * Disables scaling.
+ */
+#define SCHED_LOAD_FREQINVAR_SINGLEFREQ 0x01
+
+static struct cpufreq_extents freq_scale[CONFIG_NR_CPUS];
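+
+/*
+ * Illustrative sketch only, not part of this patch: assuming the cpufreq
+ * transition notifier (not shown in this hunk) fills
+ * freq_scale[cpu].curr_scale with
+ * (current_freq << SCHED_FREQSCALE_SHIFT) / max_freq, scaling a raw time
+ * delta reduces to the multiply-and-shift below. At 500MHz out of a 1GHz
+ * maximum, curr_scale is 512 and a 100us delta contributes only 50us of
+ * load, matching the example above.
+ */
+#if 0  /* example only */
+static inline u64 hmp_example_scale_delta(u64 delta, int cpu)
+{
+       u32 scale = freq_scale[cpu].curr_scale; /* 1024 == running at max freq */
+
+       return (delta * scale) >> SCHED_FREQSCALE_SHIFT;
+}
+#endif /* example only */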
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+#endif /* CONFIG_SCHED_HMP */
+
+/* We can represent the historical contribution to runnable average as the
  * coefficients of a geometric series.  To do this we sub-divide our runnable
  * history into segments of approximately 1ms (1024us); label the segment that
  * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
@@ -1238,13 +1330,24 @@ static u32 __compute_runnable_contrib(u64 n)
  */
 static __always_inline int __update_entity_runnable_avg(u64 now,
                                                        struct sched_avg *sa,
-                                                       int runnable)
+                                                       int runnable,
+                                                       int running,
+                                                       int cpu)
 {
        u64 delta, periods;
        u32 runnable_contrib;
        int delta_w, decayed = 0;
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+       u64 scaled_delta;
+       u32 scaled_runnable_contrib;
+       int scaled_delta_w;
+       u32 curr_scale = 1024;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
 
        delta = now - sa->last_runnable_update;
+#ifdef CONFIG_SCHED_HMP
+       delta = hmp_variable_scale_convert(delta);
+#endif
        /*
         * This should only happen when time goes backwards, which it
         * unfortunately does during sched clock init when we swap over to TSC.
@@ -1263,6 +1366,12 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
                return 0;
        sa->last_runnable_update = now;
 
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+       /* retrieve scale factor for load */
+       if (hmp_data.freqinvar_load_scale_enabled)
+               curr_scale = freq_scale[cpu].curr_scale;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+
        /* delta_w is the amount already accumulated against our next period */
        delta_w = sa->runnable_avg_period % 1024;
        if (delta + delta_w >= 1024) {
@@ -1275,8 +1384,20 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
                 * period and accrue it.
                 */
                delta_w = 1024 - delta_w;
+               /* scale runnable time if necessary */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+               scaled_delta_w = (delta_w * curr_scale)
+                               >> SCHED_FREQSCALE_SHIFT;
+               if (runnable)
+                       sa->runnable_avg_sum += scaled_delta_w;
+               if (running)
+                       sa->usage_avg_sum += scaled_delta_w;
+#else
                if (runnable)
                        sa->runnable_avg_sum += delta_w;
+               if (running)
+                       sa->usage_avg_sum += delta_w;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
                sa->runnable_avg_period += delta_w;
 
                delta -= delta_w;
@@ -1284,22 +1405,49 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
                /* Figure out how many additional periods this update spans */
                periods = delta / 1024;
                delta %= 1024;
-
+               /* decay the load we have accumulated so far */
                sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
                                                  periods + 1);
                sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
                                                     periods + 1);
-
+               sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1);
+               /* add the contribution from this period */
                /* Efficiently calculate \sum (1..n_period) 1024*y^i */
                runnable_contrib = __compute_runnable_contrib(periods);
+               /* Apply load scaling if necessary.
+                * Note that multiplying the whole series by the scale
+                * factor is the same as multiplying each term.
+                */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+               scaled_runnable_contrib = (runnable_contrib * curr_scale)
+                               >> SCHED_FREQSCALE_SHIFT;
+               if (runnable)
+                       sa->runnable_avg_sum += scaled_runnable_contrib;
+               if (running)
+                       sa->usage_avg_sum += scaled_runnable_contrib;
+#else
                if (runnable)
                        sa->runnable_avg_sum += runnable_contrib;
+               if (running)
+                       sa->usage_avg_sum += runnable_contrib;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
                sa->runnable_avg_period += runnable_contrib;
        }
 
        /* Remainder of delta accrued against u_0` */
+       /* scale if necessary */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+       scaled_delta = ((delta * curr_scale) >> SCHED_FREQSCALE_SHIFT);
+       if (runnable)
+               sa->runnable_avg_sum += scaled_delta;
+       if (running)
+               sa->usage_avg_sum += scaled_delta;
+#else
        if (runnable)
                sa->runnable_avg_sum += delta;
+       if (running)
+               sa->usage_avg_sum += delta;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
        sa->runnable_avg_period += delta;
 
        return decayed;
@@ -1312,12 +1460,9 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
        u64 decays = atomic64_read(&cfs_rq->decay_counter);
 
        decays -= se->avg.decay_count;
-       if (!decays)
-               return 0;
-
-       se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+       if (decays)
+               se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
        se->avg.decay_count = 0;
-
        return decays;
 }
 
@@ -1345,16 +1490,28 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
                                                  struct cfs_rq *cfs_rq)
 {
        struct task_group *tg = cfs_rq->tg;
-       long contrib;
+       long contrib, usage_contrib;
 
        /* The fraction of a cpu used by this cfs_rq */
        contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
                          sa->runnable_avg_period + 1);
        contrib -= cfs_rq->tg_runnable_contrib;
 
-       if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
+       usage_contrib = div_u64(sa->usage_avg_sum << NICE_0_SHIFT,
+                               sa->runnable_avg_period + 1);
+       usage_contrib -= cfs_rq->tg_usage_contrib;
+
+       /*
+        * contrib/usage at this point represent deltas, only update if they
+        * are substantive.
+        */
+       if ((abs(contrib) > cfs_rq->tg_runnable_contrib / 64) ||
+           (abs(usage_contrib) > cfs_rq->tg_usage_contrib / 64)) {
                atomic_add(contrib, &tg->runnable_avg);
                cfs_rq->tg_runnable_contrib += contrib;
+
+               atomic_add(usage_contrib, &tg->usage_avg);
+               cfs_rq->tg_usage_contrib += usage_contrib;
        }
 }
 
@@ -1415,12 +1572,18 @@ static inline void __update_task_entity_contrib(struct sched_entity *se)
        contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
        contrib /= (se->avg.runnable_avg_period + 1);
        se->avg.load_avg_contrib = scale_load(contrib);
+       trace_sched_task_load_contrib(task_of(se), se->avg.load_avg_contrib);
+       contrib = se->avg.runnable_avg_sum * scale_load_down(NICE_0_LOAD);
+       contrib /= (se->avg.runnable_avg_period + 1);
+       se->avg.load_avg_ratio = scale_load(contrib);
+       trace_sched_task_runnable_ratio(task_of(se), se->avg.load_avg_ratio);
 }
 
 /* Compute the current contribution to load_avg by se, return any delta */
-static long __update_entity_load_avg_contrib(struct sched_entity *se)
+static long __update_entity_load_avg_contrib(struct sched_entity *se, long *ratio)
 {
        long old_contrib = se->avg.load_avg_contrib;
+       long old_ratio   = se->avg.load_avg_ratio;
 
        if (entity_is_task(se)) {
                __update_task_entity_contrib(se);
@@ -1429,6 +1592,8 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
                __update_group_entity_contrib(se);
        }
 
+       if (ratio)
+               *ratio = se->avg.load_avg_ratio - old_ratio;
        return se->avg.load_avg_contrib - old_contrib;
 }
 
@@ -1448,9 +1613,13 @@ static inline void update_entity_load_avg(struct sched_entity *se,
                                          int update_cfs_rq)
 {
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
-       long contrib_delta;
+       long contrib_delta, ratio_delta;
        u64 now;
+       int cpu = -1;   /* not used in normal case */
 
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+       cpu = cfs_rq->rq->cpu;
+#endif
        /*
         * For a group entity we need to use their owned cfs_rq_clock_task() in
         * case they are the parent of a throttled hierarchy.
@@ -1460,18 +1629,21 @@ static inline void update_entity_load_avg(struct sched_entity *se,
        else
                now = cfs_rq_clock_task(group_cfs_rq(se));
 
-       if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
+       if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq,
+                       cfs_rq->curr == se, cpu))
                return;
 
-       contrib_delta = __update_entity_load_avg_contrib(se);
+       contrib_delta = __update_entity_load_avg_contrib(se, &ratio_delta);
 
        if (!update_cfs_rq)
                return;
 
-       if (se->on_rq)
+       if (se->on_rq) {
                cfs_rq->runnable_load_avg += contrib_delta;
-       else
+               rq_of(cfs_rq)->avg.load_avg_ratio += ratio_delta;
+       } else {
                subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+       }
 }
 
 /*
@@ -1504,8 +1676,17 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 {
-       __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
+       int cpu = -1;   /* not used in normal case */
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+       cpu = rq->cpu;
+#endif
+       __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable,
+                                    runnable, cpu);
        __update_tg_runnable_avg(&rq->avg, &rq->cfs);
+       trace_sched_rq_runnable_ratio(cpu_of(rq), rq->avg.load_avg_ratio);
+       trace_sched_rq_runnable_load(cpu_of(rq), rq->cfs.runnable_load_avg);
+       trace_sched_rq_nr_running(cpu_of(rq), rq->nr_running, rq->nr_iowait.counter);
 }
 
 /* Add the load generated by se into cfs_rq's child load-average */
@@ -1547,6 +1728,8 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
        }
 
        cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+       rq_of(cfs_rq)->avg.load_avg_ratio += se->avg.load_avg_ratio;
+
        /* we force update consideration on load-balancer moves */
        update_cfs_rq_blocked_load(cfs_rq, !wakeup);
 }
@@ -1565,6 +1748,8 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
        update_cfs_rq_blocked_load(cfs_rq, !sleep);
 
        cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+       rq_of(cfs_rq)->avg.load_avg_ratio -= se->avg.load_avg_ratio;
+
        if (sleep) {
                cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
                se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
@@ -1893,6 +2078,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
                 */
                update_stats_wait_end(cfs_rq, se);
                __dequeue_entity(cfs_rq, se);
+               update_entity_load_avg(se, 1);
        }
 
        update_stats_curr_start(cfs_rq, se);
@@ -3341,6 +3527,708 @@ done:
        return target;
 }
 
+#ifdef CONFIG_SCHED_HMP
+/*
+ * Heterogeneous multiprocessor (HMP) optimizations
+ *
+ * The cpu types are distinguished using a list of hmp_domains,
+ * each of which represents one cpu type via a cpumask.
+ * The list is assumed to be ordered by compute capacity, with the
+ * fastest domain first.
+ */
+DEFINE_PER_CPU(struct hmp_domain *, hmp_cpu_domain);
+static const int hmp_max_tasks = 5;
+
+extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list);
+
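+/*
+ * Illustrative example (assumed topology, not taken from this patch):
+ * on a 2 + 4 big.LITTLE system where CPUs 0-1 form the fast cluster and
+ * CPUs 2-5 the slow one, arch_get_hmp_domains() is expected to place the
+ * fast domain {0-1} first in hmp_domains and the slow domain {2-5}
+ * second, so that hmp_cpu_is_fastest(0) and hmp_cpu_is_slowest(4) hold.
+ */
+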
+/* Setup hmp_domains */
+static int __init hmp_cpu_mask_setup(void)
+{
+       char buf[64];
+       struct hmp_domain *domain;
+       struct list_head *pos;
+       int dc, cpu;
+
+       pr_debug("Initializing HMP scheduler:\n");
+
+       /* Initialize hmp_domains using platform code */
+       arch_get_hmp_domains(&hmp_domains);
+       if (list_empty(&hmp_domains)) {
+               pr_debug("HMP domain list is empty!\n");
+               return 0;
+       }
+
+       /* Print hmp_domains */
+       dc = 0;
+       list_for_each(pos, &hmp_domains) {
+               domain = list_entry(pos, struct hmp_domain, hmp_domains);
+               cpulist_scnprintf(buf, 64, &domain->possible_cpus);
+               pr_debug("  HMP domain %d: %s\n", dc, buf);
+
+               for_each_cpu_mask(cpu, domain->possible_cpus) {
+                       per_cpu(hmp_cpu_domain, cpu) = domain;
+               }
+               dc++;
+       }
+
+       return 1;
+}
+
+static struct hmp_domain *hmp_get_hmp_domain_for_cpu(int cpu)
+{
+       struct hmp_domain *domain;
+       struct list_head *pos;
+
+       list_for_each(pos, &hmp_domains) {
+               domain = list_entry(pos, struct hmp_domain, hmp_domains);
+               if (cpumask_test_cpu(cpu, &domain->possible_cpus))
+                       return domain;
+       }
+       return NULL;
+}
+
+static void hmp_online_cpu(int cpu)
+{
+       struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu);
+
+       if (domain)
+               cpumask_set_cpu(cpu, &domain->cpus);
+}
+
+static void hmp_offline_cpu(int cpu)
+{
+       struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu);
+
+       if (domain)
+               cpumask_clear_cpu(cpu, &domain->cpus);
+}
+/*
+ * Needed to determine heaviest tasks etc.
+ */
+static inline unsigned int hmp_cpu_is_fastest(int cpu);
+static inline unsigned int hmp_cpu_is_slowest(int cpu);
+static inline struct hmp_domain *hmp_slower_domain(int cpu);
+static inline struct hmp_domain *hmp_faster_domain(int cpu);
+
+/* must hold runqueue lock for queue se is currently on */
+static struct sched_entity *hmp_get_heaviest_task(
+                               struct sched_entity *se, int migrate_up)
+{
+       int num_tasks = hmp_max_tasks;
+       struct sched_entity *max_se = se;
+       unsigned long int max_ratio = se->avg.load_avg_ratio;
+       const struct cpumask *hmp_target_mask = NULL;
+
+       if (migrate_up) {
+               struct hmp_domain *hmp;
+               if (hmp_cpu_is_fastest(cpu_of(se->cfs_rq->rq)))
+                       return max_se;
+
+               hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq));
+               hmp_target_mask = &hmp->cpus;
+       }
+       /* The currently running task is not on the runqueue */
+       se = __pick_first_entity(cfs_rq_of(se));
+
+       while (num_tasks && se) {
+               if (entity_is_task(se) &&
+                       (se->avg.load_avg_ratio > max_ratio &&
+                        hmp_target_mask &&
+                        cpumask_intersects(hmp_target_mask,
+                               tsk_cpus_allowed(task_of(se))))) {
+                       max_se = se;
+                       max_ratio = se->avg.load_avg_ratio;
+               }
+               se = __pick_next_entity(se);
+               num_tasks--;
+       }
+       return max_se;
+}
+
+static struct sched_entity *hmp_get_lightest_task(
+                               struct sched_entity *se, int migrate_down)
+{
+       int num_tasks = hmp_max_tasks;
+       struct sched_entity *min_se = se;
+       unsigned long int min_ratio = se->avg.load_avg_ratio;
+       const struct cpumask *hmp_target_mask = NULL;
+
+       if (migrate_down) {
+               struct hmp_domain *hmp;
+               if (hmp_cpu_is_slowest(cpu_of(se->cfs_rq->rq)))
+                       return min_se;
+               hmp = hmp_slower_domain(cpu_of(se->cfs_rq->rq));
+               hmp_target_mask = &hmp->cpus;
+       }
+       /* The currently running task is not on the runqueue */
+       se = __pick_first_entity(cfs_rq_of(se));
+
+       while (num_tasks && se) {
+               if (entity_is_task(se) &&
+                       (se->avg.load_avg_ratio < min_ratio &&
+                       hmp_target_mask &&
+                               cpumask_intersects(hmp_target_mask,
+                               tsk_cpus_allowed(task_of(se))))) {
+                       min_se = se;
+                       min_ratio = se->avg.load_avg_ratio;
+               }
+               se = __pick_next_entity(se);
+               num_tasks--;
+       }
+       return min_se;
+}
+
+/*
+ * Migration thresholds should be in the range [0..1023]
+ * hmp_up_threshold: min. load required for migrating tasks to a faster cpu
+ * hmp_down_threshold: max. load allowed for tasks migrating to a slower cpu
+ *
+ * hmp_up_prio: only up-migrate tasks with prio < hmp_up_prio (high priority)
+ * hmp_next_up_threshold: Delay before next up migration (1024 ~= 1 ms)
+ * hmp_next_down_threshold: Delay before next down migration (1024 ~= 1 ms)
+ *
+ * Small Task Packing:
+ * We can choose to fill the littlest CPUs in an HMP system rather than
+ * the typical spreading mechanic. This behavior is controllable using
+ * two variables.
+ * hmp_packing_enabled: runtime control over pack/spread
+ * hmp_full_threshold: Consider a CPU with this much unweighted load full
+ */
+unsigned int hmp_up_threshold = 700;
+unsigned int hmp_down_threshold = 512;
+#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
+unsigned int hmp_up_prio = NICE_TO_PRIO(CONFIG_SCHED_HMP_PRIO_FILTER_VAL);
+#endif
+unsigned int hmp_next_up_threshold = 4096;
+unsigned int hmp_next_down_threshold = 4096;
+
+#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
+#ifndef CONFIG_ARCH_VEXPRESS_TC2
+unsigned int hmp_packing_enabled = 1;
+unsigned int hmp_full_threshold = (NICE_0_LOAD * 9) / 8;
+#else
+/* TC2 has a sharp power consumption curve at around 800MHz, so
+ * we aim to spread the load around that frequency.
+ */
+unsigned int hmp_packing_enabled;
+unsigned int hmp_full_threshold = 650;  /* ~80% of the 800MHz capacity * NICE_0_LOAD */
+#endif
+#endif
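+
+/*
+ * Worked example (illustrative): with hmp_up_threshold = 700 a task must
+ * build up a load_avg_ratio of at least 700/1024 (~68% of one CPU) before
+ * it is eligible for up-migration, and a normal-priority task only
+ * qualifies for down-migration once its ratio has dropped below
+ * hmp_down_threshold = 512 (50%). hmp_next_up_threshold = 4096 is compared
+ * against the time since the last up-migration shifted right by 10 bits
+ * (roughly microseconds), so consecutive up-migrations of the same task
+ * are at least ~4ms apart.
+ */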
+
+static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se);
+static unsigned int hmp_down_migration(int cpu, struct sched_entity *se);
+static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd,
+                                               int *min_cpu, struct cpumask *affinity);
+
+static inline struct hmp_domain *hmp_smallest_domain(void)
+{
+       return list_entry(hmp_domains.prev, struct hmp_domain, hmp_domains);
+}
+
+/* Check if cpu is in fastest hmp_domain */
+static inline unsigned int hmp_cpu_is_fastest(int cpu)
+{
+       struct list_head *pos;
+
+       pos = &hmp_cpu_domain(cpu)->hmp_domains;
+       return pos == hmp_domains.next;
+}
+
+/* Check if cpu is in slowest hmp_domain */
+static inline unsigned int hmp_cpu_is_slowest(int cpu)
+{
+       struct list_head *pos;
+
+       pos = &hmp_cpu_domain(cpu)->hmp_domains;
+       return list_is_last(pos, &hmp_domains);
+}
+
+/* Next (slower) hmp_domain relative to cpu */
+static inline struct hmp_domain *hmp_slower_domain(int cpu)
+{
+       struct list_head *pos;
+
+       pos = &hmp_cpu_domain(cpu)->hmp_domains;
+       return list_entry(pos->next, struct hmp_domain, hmp_domains);
+}
+
+/* Previous (faster) hmp_domain relative to cpu */
+static inline struct hmp_domain *hmp_faster_domain(int cpu)
+{
+       struct list_head *pos;
+
+       pos = &hmp_cpu_domain(cpu)->hmp_domains;
+       return list_entry(pos->prev, struct hmp_domain, hmp_domains);
+}
+
+/*
+ * Selects the least-loaded cpu in the previous (faster) hmp_domain
+ */
+static inline unsigned int hmp_select_faster_cpu(struct task_struct *tsk,
+                                                       int cpu)
+{
+       int lowest_cpu = NR_CPUS;
+       __always_unused int lowest_ratio;
+       struct hmp_domain *hmp;
+
+       if (hmp_cpu_is_fastest(cpu))
+               hmp = hmp_cpu_domain(cpu);
+       else
+               hmp = hmp_faster_domain(cpu);
+
+       lowest_ratio = hmp_domain_min_load(hmp, &lowest_cpu,
+                       tsk_cpus_allowed(tsk));
+
+       return lowest_cpu;
+}
+
+/*
+ * Selects the least-loaded cpu in the next (slower) hmp_domain
+ */
+static inline unsigned int hmp_select_slower_cpu(struct task_struct *tsk,
+                                                       int cpu)
+{
+       int lowest_cpu = NR_CPUS;
+       struct hmp_domain *hmp;
+       __always_unused int lowest_ratio;
+
+       if (hmp_cpu_is_slowest(cpu))
+               hmp = hmp_cpu_domain(cpu);
+       else
+               hmp = hmp_slower_domain(cpu);
+
+       lowest_ratio = hmp_domain_min_load(hmp, &lowest_cpu,
+                       tsk_cpus_allowed(tsk));
+
+       return lowest_cpu;
+}
+#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
+/*
+ * Select the 'best' candidate little CPU to wake up on.
+ * Implements a packing strategy which examines CPU in
+ * logical CPU order, and selects the first which will
+ * have at least 10% capacity available, according to
+ * both tracked load of the runqueue and the task.
+ */
+static inline unsigned int hmp_best_little_cpu(struct task_struct *tsk,
+               int cpu)
+{
+       int tmp_cpu;
+       unsigned long estimated_load;
+       struct hmp_domain *hmp;
+       struct sched_avg *avg;
+       struct cpumask allowed_hmp_cpus;
+
+       if (!hmp_packing_enabled ||
+                       tsk->se.avg.load_avg_ratio > ((NICE_0_LOAD * 90)/100))
+               return hmp_select_slower_cpu(tsk, cpu);
+
+       if (hmp_cpu_is_slowest(cpu))
+               hmp = hmp_cpu_domain(cpu);
+       else
+               hmp = hmp_slower_domain(cpu);
+
+       /* respect affinity */
+       cpumask_and(&allowed_hmp_cpus, &hmp->cpus,
+                       tsk_cpus_allowed(tsk));
+
+       for_each_cpu_mask(tmp_cpu, allowed_hmp_cpus) {
+               avg = &cpu_rq(tmp_cpu)->avg;
+               /* estimate new rq load if we add this task */
+               estimated_load = avg->load_avg_ratio +
+                               tsk->se.avg.load_avg_ratio;
+               if (estimated_load <= hmp_full_threshold) {
+                       cpu = tmp_cpu;
+                       break;
+               }
+       }
+       /* if no match was found, the task uses the initial value */
+       return cpu;
+}
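+
+/*
+ * Illustrative example (assuming NICE_0_LOAD == 1024): the default
+ * hmp_full_threshold is (1024 * 9) / 8 = 1152 (650 on TC2). A waking task
+ * with load_avg_ratio = 200 would skip a little CPU whose runqueue ratio
+ * is already 1000 (estimated 1200 > 1152) but be placed on one at 900
+ * (1100 <= 1152). Tasks above 90% of NICE_0_LOAD bypass packing and use
+ * the normal least-loaded slower CPU instead.
+ */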
+#endif
+static inline void hmp_next_up_delay(struct sched_entity *se, int cpu)
+{
+       /* hack - always use clock from first online CPU */
+       u64 now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task;
+       se->avg.hmp_last_up_migration = now;
+       se->avg.hmp_last_down_migration = 0;
+       cpu_rq(cpu)->avg.hmp_last_up_migration = now;
+       cpu_rq(cpu)->avg.hmp_last_down_migration = 0;
+}
+
+static inline void hmp_next_down_delay(struct sched_entity *se, int cpu)
+{
+       /* hack - always use clock from first online CPU */
+       u64 now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task;
+       se->avg.hmp_last_down_migration = now;
+       se->avg.hmp_last_up_migration = 0;
+       cpu_rq(cpu)->avg.hmp_last_down_migration = now;
+       cpu_rq(cpu)->avg.hmp_last_up_migration = 0;
+}
+
+/*
+ * Heterogeneous multiprocessor (HMP) optimizations
+ *
+ * These functions allow changing how quickly the load_avg_ratio grows;
+ * by default it goes from 0 to 0.5 in LOAD_AVG_PERIOD = 32ms.
+ * This can now be changed with /sys/kernel/hmp/load_avg_period_ms.
+ *
+ * These functions also allow changing the up and down thresholds of HMP
+ * using /sys/kernel/hmp/{up,down}_threshold.
+ * Both must be between 0 and 1023. The threshold that is compared
+ * to the load_avg_ratio is up_threshold/1024 and down_threshold/1024.
+ *
+ * For instance, if load_avg_period_ms = 64 and up_threshold = 512, a
+ * previously idle task with a load of 0 will reach the threshold after
+ * 64ms of busy looping.
+ *
+ * Changing load_avg_period_ms has the same effect as changing the
+ * default scaling factor Y=1002/1024 in the load_avg_ratio computation to
+ * (1002/1024.0)^(LOAD_AVG_PERIOD/load_avg_period_ms), but the latter
+ * could trigger overflows.
+ * For instance, with Y = 1023/1024 in __update_task_entity_contrib()
+ * "contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);"
+ * could overflow for a weight > 2^12 even though load_avg_contrib
+ * should still fit in a 32-bit result. That does not happen when the
+ * delta time is instead multiplied by 1/22 and load_avg_period_ms is
+ * set to 706.
+ */
+
+/*
+ * Scaling the delta time speeds up or slows down the growth of the
+ * per-entity load_avg_ratio.
+ * The scale factor hmp_data.multiplier is a fixed-point number with
+ * (32 - HMP_VARIABLE_SCALE_SHIFT) integer bits and
+ * HMP_VARIABLE_SCALE_SHIFT fractional bits.
+ */
+static inline u64 hmp_variable_scale_convert(u64 delta)
+{
+#ifdef CONFIG_HMP_VARIABLE_SCALE
+       u64 high = delta >> 32ULL;
+       u64 low = delta & 0xffffffffULL;
+       low *= hmp_data.multiplier;
+       high *= hmp_data.multiplier;
+       return (low >> HMP_VARIABLE_SCALE_SHIFT)
+                       + (high << (32ULL - HMP_VARIABLE_SCALE_SHIFT));
+#else
+       return delta;
+#endif
+}
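+
+/*
+ * Worked example (illustrative): writing 64 to
+ * /sys/kernel/hmp/load_avg_period_ms stores
+ * hmp_data.multiplier = (LOAD_AVG_PERIOD << 16) / 64 = (32 << 16) / 64,
+ * i.e. 0.5 in 16.16 fixed point. hmp_variable_scale_convert() then halves
+ * every delta, so load_avg_ratio ramps at half its default speed and
+ * needs roughly 64ms of busy looping instead of 32ms to reach the same
+ * value.
+ */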
+
+static ssize_t hmp_show(struct kobject *kobj,
+                               struct attribute *attr, char *buf)
+{
+       struct hmp_global_attr *hmp_attr =
+               container_of(attr, struct hmp_global_attr, attr);
+       int temp;
+
+       if (hmp_attr->to_sysfs_text != NULL)
+               return hmp_attr->to_sysfs_text(buf, PAGE_SIZE);
+
+       temp = *(hmp_attr->value);
+       if (hmp_attr->to_sysfs != NULL)
+               temp = hmp_attr->to_sysfs(temp);
+
+       return (ssize_t)sprintf(buf, "%d\n", temp);
+}
+
+static ssize_t hmp_store(struct kobject *a, struct attribute *attr,
+                               const char *buf, size_t count)
+{
+       int temp;
+       ssize_t ret = count;
+       struct hmp_global_attr *hmp_attr =
+               container_of(attr, struct hmp_global_attr, attr);
+       char *str = vmalloc(count + 1);
+
+       if (str == NULL)
+               return -ENOMEM;
+       memcpy(str, buf, count);
+       str[count] = 0;
+       if (sscanf(str, "%d", &temp) < 1) {
+               ret = -EINVAL;
+       } else {
+               if (hmp_attr->from_sysfs != NULL)
+                       temp = hmp_attr->from_sysfs(temp);
+               if (temp < 0)
+                       ret = -EINVAL;
+               else
+                       *(hmp_attr->value) = temp;
+       }
+       vfree(str);
+       return ret;
+}
+
+static ssize_t hmp_print_domains(char *outbuf, int outbufsize)
+{
+       char buf[64];
+       const char nospace[] = "%s", space[] = " %s";
+       const char *fmt = nospace;
+       struct hmp_domain *domain;
+       struct list_head *pos;
+       int outpos = 0;
+       list_for_each(pos, &hmp_domains) {
+               domain = list_entry(pos, struct hmp_domain, hmp_domains);
+               if (cpumask_scnprintf(buf, 64, &domain->possible_cpus)) {
+                       outpos += sprintf(outbuf+outpos, fmt, buf);
+                       fmt = space;
+               }
+       }
+       strcat(outbuf, "\n");
+       return outpos+1;
+}
+
+#ifdef CONFIG_HMP_VARIABLE_SCALE
+static int hmp_period_tofrom_sysfs(int value)
+{
+       return (LOAD_AVG_PERIOD << HMP_VARIABLE_SCALE_SHIFT) / value;
+}
+#endif
+/* max value for threshold is 1024 */
+static int hmp_threshold_from_sysfs(int value)
+{
+       if (value > 1024)
+               return -1;
+       return value;
+}
+#if defined(CONFIG_SCHED_HMP_LITTLE_PACKING) || \
+               defined(CONFIG_HMP_FREQUENCY_INVARIANT_SCALE)
+/* toggle control is only 0,1 off/on */
+static int hmp_toggle_from_sysfs(int value)
+{
+       if (value < 0 || value > 1)
+               return -1;
+       return value;
+}
+#endif
+#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
+/* packing value must be non-negative */
+static int hmp_packing_from_sysfs(int value)
+{
+       if (value < 0)
+               return -1;
+       return value;
+}
+#endif
+static void hmp_attr_add(
+       const char *name,
+       int *value,
+       int (*to_sysfs)(int),
+       int (*from_sysfs)(int),
+       ssize_t (*to_sysfs_text)(char *, int),
+       umode_t mode)
+{
+       int i = 0;
+       while (hmp_data.attributes[i] != NULL) {
+               i++;
+               if (i >= HMP_DATA_SYSFS_MAX)
+                       return;
+       }
+       if (mode)
+               hmp_data.attr[i].attr.mode = mode;
+       else
+               hmp_data.attr[i].attr.mode = 0644;
+       hmp_data.attr[i].show = hmp_show;
+       hmp_data.attr[i].store = hmp_store;
+       hmp_data.attr[i].attr.name = name;
+       hmp_data.attr[i].value = value;
+       hmp_data.attr[i].to_sysfs = to_sysfs;
+       hmp_data.attr[i].from_sysfs = from_sysfs;
+       hmp_data.attr[i].to_sysfs_text = to_sysfs_text;
+       hmp_data.attributes[i] = &hmp_data.attr[i].attr;
+       hmp_data.attributes[i + 1] = NULL;
+}
+
+static int hmp_attr_init(void)
+{
+       int ret;
+
+       memset(&hmp_data, 0, sizeof(hmp_data));
+       hmp_attr_add("hmp_domains",
+               NULL,
+               NULL,
+               NULL,
+               hmp_print_domains,
+               0444);
+       hmp_attr_add("up_threshold",
+               &hmp_up_threshold,
+               NULL,
+               hmp_threshold_from_sysfs,
+               NULL,
+               0);
+       hmp_attr_add("down_threshold",
+               &hmp_down_threshold,
+               NULL,
+               hmp_threshold_from_sysfs,
+               NULL,
+               0);
+#ifdef CONFIG_HMP_VARIABLE_SCALE
+       /* by default load_avg_period_ms == LOAD_AVG_PERIOD
+        * meaning no change
+        */
+       hmp_data.multiplier = hmp_period_tofrom_sysfs(LOAD_AVG_PERIOD);
+       hmp_attr_add("load_avg_period_ms",
+               &hmp_data.multiplier,
+               hmp_period_tofrom_sysfs,
+               hmp_period_tofrom_sysfs,
+               NULL,
+               0);
+#endif
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+       /* default frequency-invariant scaling ON */
+       hmp_data.freqinvar_load_scale_enabled = 1;
+       hmp_attr_add("frequency_invariant_load_scale",
+               &hmp_data.freqinvar_load_scale_enabled,
+               NULL,
+               hmp_toggle_from_sysfs,
+               NULL,
+               0);
+#endif
+#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
+       hmp_attr_add("packing_enable",
+               &hmp_packing_enabled,
+               NULL,
+               hmp_toggle_from_sysfs,
+               NULL,
+               0);
+       hmp_attr_add("packing_limit",
+               &hmp_full_threshold,
+               NULL,
+               hmp_packing_from_sysfs,
+               NULL,
+               0);
+#endif
+       hmp_data.attr_group.name = "hmp";
+       hmp_data.attr_group.attrs = hmp_data.attributes;
+       ret = sysfs_create_group(kernel_kobj,
+               &hmp_data.attr_group);
+       return ret;
+}
+late_initcall(hmp_attr_init);
+/*
+ * Return the load of the lowest-loaded CPU in a given HMP domain.
+ * min_cpu optionally points to an int to receive that CPU.
+ * affinity optionally points to a cpumask containing the
+ * CPUs to be considered. Note:
+ *   + min_cpu = NR_CPUS only if no CPUs are in the set of
+ *     affinity && hmp_domain cpus
+ *   + min_cpu will always otherwise equal one of the CPUs in
+ *     the hmp domain
+ *   + when more than one CPU has the same load, the one which
+ *     is least-recently-disturbed by an HMP migration will be
+ *     selected
+ *   + if all CPUs are equally loaded or idle and the times are
+ *     all the same, the first in the set will be used
+ *   + if affinity is not set, cpu_online_mask is used
+ */
+static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd,
+                                               int *min_cpu, struct cpumask *affinity)
+{
+       int cpu;
+       int min_cpu_runnable_temp = NR_CPUS;
+       u64 min_target_last_migration = ULLONG_MAX;
+       u64 curr_last_migration;
+       unsigned long min_runnable_load = INT_MAX;
+       unsigned long contrib;
+       struct sched_avg *avg;
+       struct cpumask temp_cpumask;
+       /*
+        * only look at CPUs allowed if specified,
+        * otherwise look at all online CPUs in the
+        * right HMP domain
+        */
+       cpumask_and(&temp_cpumask, &hmpd->cpus, affinity ? affinity : cpu_online_mask);
+
+       for_each_cpu_mask(cpu, temp_cpumask) {
+               avg = &cpu_rq(cpu)->avg;
+               /* used for both up and down migration */
+               curr_last_migration = avg->hmp_last_up_migration ?
+                       avg->hmp_last_up_migration : avg->hmp_last_down_migration;
+
+               contrib = avg->load_avg_ratio;
+               /*
+                * Consider a runqueue completely busy if there is any load
+                * on it. Definitely not the best for overall fairness, but
+                * does well in typical Android use cases.
+                */
+               if (contrib)
+                       contrib = 1023;
+
+               if ((contrib < min_runnable_load) ||
+                       (contrib == min_runnable_load &&
+                        curr_last_migration < min_target_last_migration)) {
+                       /*
+                        * if the load is the same target the CPU with
+                        * the longest time since a migration.
+                        * This is to spread migration load between
+                        * members of a domain more evenly when the
+                        * domain is fully loaded
+                        */
+                       min_runnable_load = contrib;
+                       min_cpu_runnable_temp = cpu;
+                       min_target_last_migration = curr_last_migration;
+               }
+       }
+
+       if (min_cpu)
+               *min_cpu = min_cpu_runnable_temp;
+
+       return min_runnable_load;
+}
+
+/*
+ * Calculate the task starvation.
+ * This is the ratio of actually running time vs. runnable time.
+ * If the two are equal, the task is getting the cpu time it needs or
+ * it is alone on the cpu and the cpu is fully utilized.
+ */
+static inline unsigned int hmp_task_starvation(struct sched_entity *se)
+{
+       u32 starvation;
+
+       starvation = se->avg.usage_avg_sum * scale_load_down(NICE_0_LOAD);
+       starvation /= (se->avg.runnable_avg_sum + 1);
+
+       return scale_load(starvation);
+}
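+
+/*
+ * Illustrative example (assuming NICE_0_LOAD == 1024): a task with
+ * usage_avg_sum = 600 and runnable_avg_sum = 800 ran for 75% of the time
+ * it was runnable, giving a starvation value of roughly 768. Values above
+ * 768 (the task already gets more than ~75% of its runnable time on the
+ * CPU) cause hmp_offload_down() below to abort the offload.
+ */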
+
+static inline unsigned int hmp_offload_down(int cpu, struct sched_entity *se)
+{
+       int min_usage;
+       int dest_cpu = NR_CPUS;
+
+       if (hmp_cpu_is_slowest(cpu))
+               return NR_CPUS;
+
+       /* Is there an idle CPU in the current domain */
+       min_usage = hmp_domain_min_load(hmp_cpu_domain(cpu), NULL, NULL);
+       if (min_usage == 0) {
+               trace_sched_hmp_offload_abort(cpu, min_usage, "load");
+               return NR_CPUS;
+       }
+
+       /* Is the task alone on the cpu? */
+       if (cpu_rq(cpu)->cfs.h_nr_running < 2) {
+               trace_sched_hmp_offload_abort(cpu,
+                       cpu_rq(cpu)->cfs.h_nr_running, "nr_running");
+               return NR_CPUS;
+       }
+
+       /* Is the task actually starving? A task losing 25% or more of its
+        * runnable time (running/runnable ratio <= 768/1024) is starving.
+        */
+       if (hmp_task_starvation(se) > 768) {
+               trace_sched_hmp_offload_abort(cpu, hmp_task_starvation(se),
+                       "starvation");
+               return NR_CPUS;
+       }
+
+       /* Does the slower domain have any idle CPUs? */
+       min_usage = hmp_domain_min_load(hmp_slower_domain(cpu), &dest_cpu,
+                       tsk_cpus_allowed(task_of(se)));
+
+       if (min_usage == 0) {
+               trace_sched_hmp_offload_succeed(cpu, dest_cpu);
+               return dest_cpu;
+       }
+       trace_sched_hmp_offload_abort(cpu, min_usage, "slowdomain");
+       return NR_CPUS;
+}
+#endif /* CONFIG_SCHED_HMP */
+
 /*
  * sched_balance_self: balance the current task (running on cpu) in domains
  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
@@ -3365,6 +4253,19 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
        if (p->nr_cpus_allowed == 1)
                return prev_cpu;
 
+#ifdef CONFIG_SCHED_HMP
+       /* always put non-kernel forking tasks on a big domain */
+       if (p->mm && (sd_flag & SD_BALANCE_FORK)) {
+               new_cpu = hmp_select_faster_cpu(p, prev_cpu);
+               if (new_cpu != NR_CPUS) {
+                       hmp_next_up_delay(&p->se, new_cpu);
+                       return new_cpu;
+               }
+               /* failed to perform HMP fork balance, use normal balance */
+               new_cpu = cpu;
+       }
+#endif
+
        if (sd_flag & SD_BALANCE_WAKE) {
                if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
                        want_affine = 1;
@@ -3439,15 +4340,50 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
 unlock:
        rcu_read_unlock();
 
-       return new_cpu;
-}
+#ifdef CONFIG_SCHED_HMP
+       prev_cpu = task_cpu(p);
 
-/*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+       if (hmp_up_migration(prev_cpu, &new_cpu, &p->se)) {
+               hmp_next_up_delay(&p->se, new_cpu);
+               trace_sched_hmp_migrate(p, new_cpu, HMP_MIGRATE_WAKEUP);
+               return new_cpu;
+       }
+       if (hmp_down_migration(prev_cpu, &p->se)) {
+#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
+               new_cpu = hmp_best_little_cpu(p, prev_cpu);
+#else
+               new_cpu = hmp_select_slower_cpu(p, prev_cpu);
+#endif
+               if (new_cpu != prev_cpu) {
+                       hmp_next_down_delay(&p->se, new_cpu);
+                       trace_sched_hmp_migrate(p, new_cpu, HMP_MIGRATE_WAKEUP);
+                       return new_cpu;
+               }
+       }
+       /* Make sure that the task stays in its previous hmp domain */
+       if (!cpumask_test_cpu(new_cpu, &hmp_cpu_domain(prev_cpu)->cpus))
+               return prev_cpu;
+#endif
+
+       return new_cpu;
+}
+
+/*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
  * removed when useful for applications beyond shares distribution (e.g.
  * load-balance).
  */
 #ifdef CONFIG_FAIR_GROUP_SCHED
+
+#ifdef CONFIG_NO_HZ_COMMON
+static int nohz_test_cpu(int cpu);
+#else
+static inline int nohz_test_cpu(int cpu)
+{
+       return 0;
+}
+#endif
+
 /*
  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
  * cfs_rq_of(p) references at time of call are still valid and identify the
@@ -3467,6 +4403,25 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
         * be negative here since on-rq tasks have decay-count == 0.
         */
        if (se->avg.decay_count) {
+               /*
+                * If we migrate a sleeping task away from a CPU
+                * which has the tick stopped, then both the clock_task
+                * and decay_counter will be out of date for that CPU
+                * and we will not decay load correctly.
+                */
+               if (!se->on_rq && nohz_test_cpu(task_cpu(p))) {
+                       struct rq *rq = cpu_rq(task_cpu(p));
+                       unsigned long flags;
+                       /*
+                        * Current CPU cannot be holding rq->lock in this
+                        * circumstance, but another might be. We must hold
+                        * rq->lock before we go poking around in its clocks
+                        */
+                       raw_spin_lock_irqsave(&rq->lock, flags);
+                       update_rq_clock(rq);
+                       update_cfs_rq_blocked_load(cfs_rq, 0);
+                       raw_spin_unlock_irqrestore(&rq->lock, flags);
+               }
                se->avg.decay_count = -__synchronize_entity_decay(se);
                atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
        }
@@ -3972,7 +4927,6 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
         * 1) task is cache cold, or
         * 2) too many balance attempts have failed.
         */
-
        tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
        if (!tsk_cache_hot ||
                env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
@@ -5257,7 +6211,9 @@ out_one_pinned:
 out:
        return ld_moved;
 }
-
+#ifdef CONFIG_SCHED_HMP
+static unsigned int hmp_idle_pull(int this_cpu);
+#endif
 /*
  * idle_balance is called by schedule() if this_cpu is about to become
  * idle. Attempts to pull tasks from other CPUs.
@@ -5302,7 +6258,10 @@ void idle_balance(int this_cpu, struct rq *this_rq)
                }
        }
        rcu_read_unlock();
-
+#ifdef CONFIG_SCHED_HMP
+       if (!pulled_task)
+               pulled_task = hmp_idle_pull(this_cpu);
+#endif
        raw_spin_lock(&this_rq->lock);
 
        if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
@@ -5395,12 +6354,65 @@ static struct {
        unsigned long next_balance;     /* in jiffy units */
 } nohz ____cacheline_aligned;
 
+/*
+ * nohz_test_cpu is used when load tracking is enabled. The FAIR_GROUP_SCHED
+ * dependency below may be removed when load tracking guards are
+ * removed.
+ */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static int nohz_test_cpu(int cpu)
+{
+       return cpumask_test_cpu(cpu, nohz.idle_cpus_mask);
+}
+#endif
+
+#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
+/*
+ * Decide if the tasks on the busy CPUs in the
+ * littlest domain would benefit from an idle balance
+ */
+static int hmp_packing_ilb_needed(int cpu)
+{
+       struct hmp_domain *hmp;
+       /* always allow ilb on non-slowest domain */
+       if (!hmp_cpu_is_slowest(cpu))
+               return 1;
+
+       /* if disabled, use normal ILB behaviour */
+       if (!hmp_packing_enabled)
+               return 1;
+
+       hmp = hmp_cpu_domain(cpu);
+       for_each_cpu_and(cpu, &hmp->cpus, nohz.idle_cpus_mask) {
+               /* only idle balance if a CPU is loaded over threshold */
+               if (cpu_rq(cpu)->avg.load_avg_ratio > hmp_full_threshold)
+                       return 1;
+       }
+       return 0;
+}
+#endif
+
 static inline int find_new_ilb(int call_cpu)
 {
        int ilb = cpumask_first(nohz.idle_cpus_mask);
+#ifdef CONFIG_SCHED_HMP
+       int ilb_needed = 1;
+
+       /* restrict nohz balancing to occur in the same hmp domain */
+       ilb = cpumask_first_and(nohz.idle_cpus_mask,
+                       &((struct hmp_domain *)hmp_cpu_domain(call_cpu))->cpus);
+
+#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
+       if (ilb < nr_cpu_ids)
+               ilb_needed = hmp_packing_ilb_needed(ilb);
+#endif
 
+       if (ilb_needed && ilb < nr_cpu_ids && idle_cpu(ilb))
+               return ilb;
+#else
        if (ilb < nr_cpu_ids && idle_cpu(ilb))
                return ilb;
+#endif
 
        return nr_cpu_ids;
 }
@@ -5677,6 +6689,18 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
        if (time_before(now, nohz.next_balance))
                return 0;
 
+#ifdef CONFIG_SCHED_HMP
+       /*
+        * Bail out if there are no nohz CPUs in our
+        * HMP domain, since we will move tasks between
+        * domains through wakeup and force balancing
+        * as necessary based upon task load.
+        */
+       if (cpumask_first_and(nohz.idle_cpus_mask,
+                       &((struct hmp_domain *)hmp_cpu_domain(cpu))->cpus) >= nr_cpu_ids)
+               return 0;
+#endif
+
        if (rq->nr_running >= 2)
                goto need_kick;
 
@@ -5709,6 +6733,558 @@ need_kick:
 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
 #endif
 
+#ifdef CONFIG_SCHED_HMP
+/* Check if task should migrate to a faster cpu */
+static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se)
+{
+       struct task_struct *p = task_of(se);
+       int temp_target_cpu;
+       u64 now;
+
+       if (hmp_cpu_is_fastest(cpu))
+               return 0;
+
+#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
+       /* Filter by task priority */
+       if (p->prio >= hmp_up_prio)
+               return 0;
+#endif
+       if (se->avg.load_avg_ratio < hmp_up_threshold)
+               return 0;
+
+       /* Let the task load settle before doing another up migration */
+       /* hack - always use clock from first online CPU */
+       now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task;
+       if (((now - se->avg.hmp_last_up_migration) >> 10)
+                                       < hmp_next_up_threshold)
+               return 0;
+
+       /* hmp_domain_min_load only returns 0 for an
+        * idle CPU or 1023 for any partly-busy one.
+        * Be explicit about requirement for an idle CPU.
+        */
+       if (hmp_domain_min_load(hmp_faster_domain(cpu), &temp_target_cpu,
+                       tsk_cpus_allowed(p)) == 0 && temp_target_cpu != NR_CPUS) {
+               if (target_cpu)
+                       *target_cpu = temp_target_cpu;
+               return 1;
+       }
+       return 0;
+}
+
+/* Check if task should migrate to a slower cpu */
+static unsigned int hmp_down_migration(int cpu, struct sched_entity *se)
+{
+       struct task_struct *p = task_of(se);
+       u64 now;
+
+       if (hmp_cpu_is_slowest(cpu)) {
+#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
+               if (hmp_packing_enabled)
+                       return 1;
+               else
+#endif
+               return 0;
+       }
+
+#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
+       /* Filter by task priority */
+       if ((p->prio >= hmp_up_prio) &&
+               cpumask_intersects(&hmp_slower_domain(cpu)->cpus,
+                                       tsk_cpus_allowed(p))) {
+               return 1;
+       }
+#endif
+
+       /* Let the task load settle before doing another down migration */
+       /* hack - always use clock from first online CPU */
+       now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task;
+       if (((now - se->avg.hmp_last_down_migration) >> 10)
+                                       < hmp_next_down_threshold)
+               return 0;
+
+       if (cpumask_intersects(&hmp_slower_domain(cpu)->cpus,
+                                       tsk_cpus_allowed(p))
+               && se->avg.load_avg_ratio < hmp_down_threshold) {
+               return 1;
+       }
+       return 0;
+}
+
+/*
+ * hmp_can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
+ * Ideally this function should be merged with can_migrate_task() to avoid
+ * redundant code.
+ */
+static int hmp_can_migrate_task(struct task_struct *p, struct lb_env *env)
+{
+       int tsk_cache_hot = 0;
+
+       /*
+        * We do not migrate tasks that are:
+        * 1) running (obviously), or
+        * 2) cannot be migrated to this CPU due to cpus_allowed
+        */
+       if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+               schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+               return 0;
+       }
+       env->flags &= ~LBF_ALL_PINNED;
+
+       if (task_running(env->src_rq, p)) {
+               schedstat_inc(p, se.statistics.nr_failed_migrations_running);
+               return 0;
+       }
+
+       /*
+        * Aggressive migration if:
+        * 1) task is cache cold, or
+        * 2) too many balance attempts have failed.
+        */
+
+       tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
+       if (!tsk_cache_hot ||
+               env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
+#ifdef CONFIG_SCHEDSTATS
+               if (tsk_cache_hot) {
+                       schedstat_inc(env->sd, lb_hot_gained[env->idle]);
+                       schedstat_inc(p, se.statistics.nr_forced_migrations);
+               }
+#endif
+               return 1;
+       }
+
+       return 1;
+}
+
+/*
+ * move_specific_task tries to move a specific task.
+ * Returns 1 if successful and 0 otherwise.
+ * Called with both runqueues locked.
+ */
+static int move_specific_task(struct lb_env *env, struct task_struct *pm)
+{
+       struct task_struct *p, *n;
+
+       list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+               if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
+                                     env->dst_cpu))
+                       continue;
+
+               if (!hmp_can_migrate_task(p, env))
+                       continue;
+               /* Check if we found the right task */
+               if (p != pm)
+                       continue;
+
+               move_task(p, env);
+               /*
+                * Right now, this is only the third place move_task()
+                * is called, so we can safely collect move_task()
+                * stats here rather than inside move_task().
+                */
+               schedstat_inc(env->sd, lb_gained[env->idle]);
+               return 1;
+       }
+       return 0;
+}
+
+/*
+ * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to
+ * migrate a specific task from one runqueue to another.
+ * hmp_force_up_migration uses this to push a currently running task
+ * off a runqueue.
+ * Based on active_load_balance_stop_cpu and can potentially be merged.
+ */
+static int hmp_active_task_migration_cpu_stop(void *data)
+{
+       struct rq *busiest_rq = data;
+       struct task_struct *p = busiest_rq->migrate_task;
+       int busiest_cpu = cpu_of(busiest_rq);
+       int target_cpu = busiest_rq->push_cpu;
+       struct rq *target_rq = cpu_rq(target_cpu);
+       struct sched_domain *sd;
+
+       raw_spin_lock_irq(&busiest_rq->lock);
+       /* make sure the requested cpu hasn't gone down in the meantime */
+       if (unlikely(busiest_cpu != smp_processor_id() ||
+               !busiest_rq->active_balance)) {
+               goto out_unlock;
+       }
+       /* Is there any task to move? */
+       if (busiest_rq->nr_running <= 1)
+               goto out_unlock;
+       /* Task has migrated meanwhile, abort forced migration */
+       if (task_rq(p) != busiest_rq)
+               goto out_unlock;
+       /*
+        * This condition is "impossible", if it occurs
+        * we need to fix it. Originally reported by
+        * Bjorn Helgaas on a 128-cpu setup.
+        */
+       BUG_ON(busiest_rq == target_rq);
+
+       /* move a task from busiest_rq to target_rq */
+       double_lock_balance(busiest_rq, target_rq);
+
+       /* Search for an sd spanning us and the target CPU. */
+       rcu_read_lock();
+       for_each_domain(target_cpu, sd) {
+               if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
+                       break;
+       }
+
+       if (likely(sd)) {
+               struct lb_env env = {
+                       .sd             = sd,
+                       .dst_cpu        = target_cpu,
+                       .dst_rq         = target_rq,
+                       .src_cpu        = busiest_rq->cpu,
+                       .src_rq         = busiest_rq,
+                       .idle           = CPU_IDLE,
+               };
+
+               schedstat_inc(sd, alb_count);
+
+               if (move_specific_task(&env, p))
+                       schedstat_inc(sd, alb_pushed);
+               else
+                       schedstat_inc(sd, alb_failed);
+       }
+       rcu_read_unlock();
+       double_unlock_balance(busiest_rq, target_rq);
+out_unlock:
+       put_task_struct(p);
+       busiest_rq->active_balance = 0;
+       raw_spin_unlock_irq(&busiest_rq->lock);
+       return 0;
+}
+
+/*
+ * hmp_idle_pull_cpu_stop is run by cpu stopper and used to
+ * migrate a specific task from one runqueue to another.
+ * hmp_idle_pull uses this to push a currently running task
+ * off a runqueue to a faster CPU.
+ * Locking is slightly different than usual.
+ * Based on active_load_balance_stop_cpu and can potentially be merged.
+ */
+static int hmp_idle_pull_cpu_stop(void *data)
+{
+       struct rq *busiest_rq = data;
+       struct task_struct *p = busiest_rq->migrate_task;
+       int busiest_cpu = cpu_of(busiest_rq);
+       int target_cpu = busiest_rq->push_cpu;
+       struct rq *target_rq = cpu_rq(target_cpu);
+       struct sched_domain *sd;
+
+       raw_spin_lock_irq(&busiest_rq->lock);
+
+       /* make sure the requested cpu hasn't gone down in the meantime */
+       if (unlikely(busiest_cpu != smp_processor_id() ||
+               !busiest_rq->active_balance))
+               goto out_unlock;
+
+       /* Is there any task to move? */
+       if (busiest_rq->nr_running <= 1)
+               goto out_unlock;
+
+       /* Task has migrated meanwhile, abort forced migration */
+       if (task_rq(p) != busiest_rq)
+               goto out_unlock;
+
+       /*
+        * This condition is "impossible"; if it occurs
+        * we need to fix it. Originally reported by
+        * Bjorn Helgaas on a 128-cpu setup.
+        */
+       BUG_ON(busiest_rq == target_rq);
+
+       /* move a task from busiest_rq to target_rq */
+       double_lock_balance(busiest_rq, target_rq);
+
+       /* Search for an sd spanning us and the target CPU. */
+       rcu_read_lock();
+       for_each_domain(target_cpu, sd) {
+               if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
+                       break;
+       }
+       if (likely(sd)) {
+               struct lb_env env = {
+                       .sd             = sd,
+                       .dst_cpu        = target_cpu,
+                       .dst_rq         = target_rq,
+                       .src_cpu        = busiest_rq->cpu,
+                       .src_rq         = busiest_rq,
+                       .idle           = CPU_IDLE,
+               };
+
+               schedstat_inc(sd, alb_count);
+
+               if (move_specific_task(&env, p))
+                       schedstat_inc(sd, alb_pushed);
+               else
+                       schedstat_inc(sd, alb_failed);
+       }
+       rcu_read_unlock();
+       double_unlock_balance(busiest_rq, target_rq);
+out_unlock:
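+       /* drop the reference taken in hmp_idle_pull() */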
+       put_task_struct(p);
+       busiest_rq->active_balance = 0;
+       raw_spin_unlock_irq(&busiest_rq->lock);
+       return 0;
+}
+
+/*
+ * Move a task in a runnable state to another CPU.
+ *
+ * Modelled on 'active_load_balance_cpu_stop' with slight
+ * modifications to the locking and pre-transfer checks.  Note
+ * that rq->lock must be held before calling.
+ */
+static void hmp_migrate_runnable_task(struct rq *rq)
+{
+       struct sched_domain *sd;
+       int src_cpu = cpu_of(rq);
+       struct rq *src_rq = rq;
+       int dst_cpu = rq->push_cpu;
+       struct rq *dst_rq = cpu_rq(dst_cpu);
+       struct task_struct *p = rq->migrate_task;
+       /*
+        * One last check to make sure nobody else is playing
+        * with the source rq.
+        */
+       if (src_rq->active_balance)
+               goto out;
+
+       if (src_rq->nr_running <= 1)
+               goto out;
+
+       if (task_rq(p) != src_rq)
+               goto out;
+       /*
+        * As in the cpu-stopper variants, the source and destination
+        * runqueues must never be the same.
+        */
+       BUG_ON(src_rq == dst_rq);
+
+       double_lock_balance(src_rq, dst_rq);
+
+       rcu_read_lock();
+       for_each_domain(dst_cpu, sd) {
+               if (cpumask_test_cpu(src_cpu, sched_domain_span(sd)))
+                       break;
+       }
+
+       if (likely(sd)) {
+               struct lb_env env = {
+                       .sd             = sd,
+                       .dst_cpu        = dst_cpu,
+                       .dst_rq         = dst_rq,
+                       .src_cpu        = src_cpu,
+                       .src_rq         = src_rq,
+                       .idle           = CPU_IDLE,
+               };
+
+               schedstat_inc(sd, alb_count);
+
+               if (move_specific_task(&env, p))
+                       schedstat_inc(sd, alb_pushed);
+               else
+                       schedstat_inc(sd, alb_failed);
+       }
+
+       rcu_read_unlock();
+       double_unlock_balance(src_rq, dst_rq);
+out:
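+       /* drop the reference the caller took when setting rq->migrate_task */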
+       put_task_struct(p);
+}
+
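+/* serializes the hmp_force_up_migration() and hmp_idle_pull() scans */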
+static DEFINE_SPINLOCK(hmp_force_migration);
+
+/*
+ * hmp_force_up_migration checks runqueues for tasks that need to
+ * be actively migrated to a faster cpu.
+ */
+static void hmp_force_up_migration(int this_cpu)
+{
+       int cpu, target_cpu;
+       struct sched_entity *curr, *orig;
+       struct rq *target;
+       unsigned long flags;
+       unsigned int force, got_target;
+       struct task_struct *p;
+
+       if (!spin_trylock(&hmp_force_migration))
+               return;
+       for_each_online_cpu(cpu) {
+               force = 0;
+               got_target = 0;
+               target = cpu_rq(cpu);
+               raw_spin_lock_irqsave(&target->lock, flags);
+               curr = target->cfs.curr;
+               if (!curr) {
+                       raw_spin_unlock_irqrestore(&target->lock, flags);
+                       continue;
+               }
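+               /* curr may be a group entity; descend to the running task */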
+               if (!entity_is_task(curr)) {
+                       struct cfs_rq *cfs_rq;
+
+                       cfs_rq = group_cfs_rq(curr);
+                       while (cfs_rq) {
+                               curr = cfs_rq->curr;
+                               cfs_rq = group_cfs_rq(curr);
+                       }
+               }
+               orig = curr;
+               curr = hmp_get_heaviest_task(curr, 1);
+               p = task_of(curr);
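+               /* first, see whether the heaviest task should go up */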
+               if (hmp_up_migration(cpu, &target_cpu, curr)) {
+                       if (!target->active_balance) {
+                               get_task_struct(p);
+                               target->push_cpu = target_cpu;
+                               target->migrate_task = p;
+                               got_target = 1;
+                               trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_FORCE);
+                               hmp_next_up_delay(&p->se, target->push_cpu);
+                       }
+               }
+               if (!got_target && !target->active_balance) {
+                       /*
+                        * For now we just check the currently running task.
+                        * Selecting the lightest task for offloading will
+                        * require extensive book keeping.
+                        */
+                       curr = hmp_get_lightest_task(orig, 1);
+                       p = task_of(curr);
+                       target->push_cpu = hmp_offload_down(cpu, curr);
+                       if (target->push_cpu < NR_CPUS) {
+                               get_task_struct(p);
+                               target->migrate_task = p;
+                               got_target = 1;
+                               trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_OFFLOAD);
+                               hmp_next_down_delay(&p->se, target->push_cpu);
+                       }
+               }
+               /*
+                * We have a target with no active_balance.  If the task
+                * is not currently running move it, otherwise let the
+                * CPU stopper take care of it.
+                */
+               if (got_target && !target->active_balance) {
+                       if (!task_running(target, p)) {
+                               trace_sched_hmp_migrate_force_running(p, 0);
+                               hmp_migrate_runnable_task(target);
+                       } else {
+                               target->active_balance = 1;
+                               force = 1;
+                       }
+               }
+
+               raw_spin_unlock_irqrestore(&target->lock, flags);
+
+               if (force)
+                       stop_one_cpu_nowait(cpu_of(target),
+                               hmp_active_task_migration_cpu_stop,
+                               target, &target->active_balance_work);
+       }
+       spin_unlock(&hmp_force_migration);
+}
+/*
+ * hmp_idle_pull scans the runqueues of the next-slower HMP domain to
+ * see whether a heavy task should be pulled onto this faster CPU.
+ *
+ * Reuses the hmp_force_migration spinlock.
+ */
+static unsigned int hmp_idle_pull(int this_cpu)
+{
+       int cpu;
+       struct sched_entity *curr, *orig;
+       struct hmp_domain *hmp_domain = NULL;
+       struct rq *target = NULL, *rq;
+       unsigned long flags, ratio = 0;
+       unsigned int force = 0;
+       struct task_struct *p = NULL;
+
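+       /* only CPUs with a slower HMP domain below them can pull from it */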
+       if (!hmp_cpu_is_slowest(this_cpu))
+               hmp_domain = hmp_slower_domain(this_cpu);
+       if (!hmp_domain)
+               return 0;
+
+       if (!spin_trylock(&hmp_force_migration))
+               return 0;
+
+       /* first select a task */
+       for_each_cpu(cpu, &hmp_domain->cpus) {
+               rq = cpu_rq(cpu);
+               raw_spin_lock_irqsave(&rq->lock, flags);
+               curr = rq->cfs.curr;
+               if (!curr) {
+                       raw_spin_unlock_irqrestore(&rq->lock, flags);
+                       continue;
+               }
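+               /* descend through group entities to the currently running task */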
+               if (!entity_is_task(curr)) {
+                       struct cfs_rq *cfs_rq;
+
+                       cfs_rq = group_cfs_rq(curr);
+                       while (cfs_rq) {
+                               curr = cfs_rq->curr;
+                               if (!entity_is_task(curr))
+                                       cfs_rq = group_cfs_rq(curr);
+                               else
+                                       cfs_rq = NULL;
+                       }
+               }
+               orig = curr;
+               curr = hmp_get_heaviest_task(curr, 1);
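+               /* track the heaviest task above the up-migration threshold */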
+               if (curr->avg.load_avg_ratio > hmp_up_threshold &&
+                       curr->avg.load_avg_ratio > ratio) {
+                       p = task_of(curr);
+                       target = rq;
+                       ratio = curr->avg.load_avg_ratio;
+               }
+               raw_spin_unlock_irqrestore(&rq->lock, flags);
+       }
+
+       if (!p)
+               goto done;
+
+       /* now we have a candidate */
+       raw_spin_lock_irqsave(&target->lock, flags);
+       if (!target->active_balance && task_rq(p) == target) {
+               get_task_struct(p);
+               target->push_cpu = this_cpu;
+               target->migrate_task = p;
+               trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_IDLE_PULL);
+               hmp_next_up_delay(&p->se, target->push_cpu);
+               /*
+                * if the task isn't running move it right away.
+                * Otherwise setup the active_balance mechanic and let
+                * the CPU stopper do its job.
+                */
+               if (!task_running(target, p)) {
+                       trace_sched_hmp_migrate_idle_running(p, 0);
+                       hmp_migrate_runnable_task(target);
+               } else {
+                       target->active_balance = 1;
+                       force = 1;
+               }
+       }
+       raw_spin_unlock_irqrestore(&target->lock, flags);
+
+       if (force) {
+               stop_one_cpu_nowait(cpu_of(target),
+                       hmp_idle_pull_cpu_stop,
+                       target, &target->active_balance_work);
+       }
+done:
+       spin_unlock(&hmp_force_migration);
+       return force;
+}
+#else
+static void hmp_force_up_migration(int this_cpu) { }
+#endif /* CONFIG_SCHED_HMP */
+
 /*
  * run_rebalance_domains is triggered when needed from the scheduler tick.
  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
@@ -5720,6 +7296,8 @@ static void run_rebalance_domains(struct softirq_action *h)
        enum cpu_idle_type idle = this_rq->idle_balance ?
                                                CPU_IDLE : CPU_NOT_IDLE;
 
+       hmp_force_up_migration(this_cpu);
+
        rebalance_domains(this_cpu, idle);
 
        /*
@@ -5752,11 +7330,17 @@ void trigger_load_balance(struct rq *rq, int cpu)
 
 static void rq_online_fair(struct rq *rq)
 {
+#ifdef CONFIG_SCHED_HMP
+       hmp_online_cpu(rq->cpu);
+#endif
        update_sysctl();
 }
 
 static void rq_offline_fair(struct rq *rq)
 {
+#ifdef CONFIG_SCHED_HMP
+       hmp_offline_cpu(rq->cpu);
+#endif
        update_sysctl();
 
        /* Ensure any throttled groups are reachable by pick_next_task */
@@ -6224,6 +7808,139 @@ __init void init_sched_fair_class(void)
        zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
        cpu_notifier(sched_ilb_notifier, 0);
 #endif
+
+#ifdef CONFIG_SCHED_HMP
+       hmp_cpu_mask_setup();
+#endif
 #endif /* SMP */
 
 }
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
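+/*
+ * Convert the current frequency into a fixed-point scale factor in
+ * which 1024 represents the maximum frequency.  'max' is stored
+ * pre-shifted by SCHED_FREQSCALE_SHIFT (see cpufreq_policy_callback),
+ * so the plain division below already yields the scaled result.
+ * For example, a CPU capped at 1GHz and currently running at 500MHz
+ * produces a scale of roughly 512, i.e. half of its possible compute
+ * capacity.
+ */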
+static u32 cpufreq_calc_scale(u32 min, u32 max, u32 curr)
+{
+       u32 result = curr / max;
+       return result;
+}
+
+/* Called on CPU frequency transitions, once for each affected CPU.
+ * Only POSTCHANGE notifications are acted upon.
+ */
+static int cpufreq_callback(struct notifier_block *nb,
+                                       unsigned long val, void *data)
+{
+       struct cpufreq_freqs *freq = data;
+       int cpu = freq->cpu;
+       struct cpufreq_extents *extents;
+
+       if (freq->flags & CPUFREQ_CONST_LOOPS)
+               return NOTIFY_OK;
+
+       if (val != CPUFREQ_POSTCHANGE)
+               return NOTIFY_OK;
+
+       /* if dynamic load scale is disabled, set the load scale to 1.0 */
+       if (!hmp_data.freqinvar_load_scale_enabled) {
+               freq_scale[cpu].curr_scale = 1024;
+               return NOTIFY_OK;
+       }
+
+       extents = &freq_scale[cpu];
+       if (extents->flags & SCHED_LOAD_FREQINVAR_SINGLEFREQ) {
+               /* If our governor was recognised as a single-freq governor,
+                * use 1.0
+                */
+               extents->curr_scale = 1024;
+       } else {
+               extents->curr_scale = cpufreq_calc_scale(extents->min,
+                               extents->max, freq->new);
+       }
+
+       return NOTIFY_OK;
+}
+
+/* Called when the CPUFreq policy is changed, e.g. when the governor
+ * is switched.  Only called for the CPUs whose policy is actually
+ * changed from userspace.
+ */
+static int cpufreq_policy_callback(struct notifier_block *nb,
+                                      unsigned long event, void *data)
+{
+       struct cpufreq_policy *policy = data;
+       struct cpufreq_extents *extents;
+       int cpu, single_freq = 0;
+       static const char performance_governor[] = "performance";
+       static const char powersave_governor[] = "powersave";
+
+       if (event == CPUFREQ_START)
+               return 0;
+
+       if (event != CPUFREQ_INCOMPATIBLE)
+               return 0;
+
+       /* CPUFreq governors do not accurately report the range of
+        * CPU frequencies they will choose from.
+        * We recognise performance and powersave governors as
+        * single-frequency only.
+        */
+       if (!strncmp(policy->governor->name, performance_governor,
+                       strlen(performance_governor)) ||
+               !strncmp(policy->governor->name, powersave_governor,
+                               strlen(powersave_governor)))
+               single_freq = 1;
+
+       /* Make sure that all CPUs impacted by this policy are
+        * updated since we will only get a notification when the
+        * user explicitly changes the policy on a CPU.
+        */
+       for_each_cpu(cpu, policy->cpus) {
+               extents = &freq_scale[cpu];
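+               /* keep the limits pre-shifted so that cpufreq_calc_scale()
+                * can compute the fixed-point scale with a plain divide */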
+               extents->max = policy->max >> SCHED_FREQSCALE_SHIFT;
+               extents->min = policy->min >> SCHED_FREQSCALE_SHIFT;
+               if (!hmp_data.freqinvar_load_scale_enabled) {
+                       extents->curr_scale = 1024;
+               } else if (single_freq) {
+                       extents->flags |= SCHED_LOAD_FREQINVAR_SINGLEFREQ;
+                       extents->curr_scale = 1024;
+               } else {
+                       extents->flags &= ~SCHED_LOAD_FREQINVAR_SINGLEFREQ;
+                       extents->curr_scale = cpufreq_calc_scale(extents->min,
+                                       extents->max, policy->cur);
+               }
+       }
+
+       return 0;
+}
+
+static struct notifier_block cpufreq_notifier = {
+       .notifier_call  = cpufreq_callback,
+};
+static struct notifier_block cpufreq_policy_notifier = {
+       .notifier_call  = cpufreq_policy_callback,
+};
+
+static int __init register_sched_cpufreq_notifier(void)
+{
+       int cpu, ret;
+
+       /* init safe defaults since there are no policies at registration */
+       for (cpu = 0; cpu < CONFIG_NR_CPUS; cpu++) {
+               freq_scale[cpu].max = 1024;
+               freq_scale[cpu].min = 1024;
+               freq_scale[cpu].curr_scale = 1024;
+       }
+
+       pr_info("sched: registering cpufreq notifiers for scale-invariant loads\n");
+       ret = cpufreq_register_notifier(&cpufreq_policy_notifier,
+                       CPUFREQ_POLICY_NOTIFIER);
+
+       if (ret != -EINVAL)
+               ret = cpufreq_register_notifier(&cpufreq_notifier,
+                       CPUFREQ_TRANSITION_NOTIFIER);
+
+       return ret;
+}
+
+core_initcall(register_sched_cpufreq_notifier);
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */