sched/tune: add support to compute normalized energy
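
The hunks below add the machinery to scale an energy variation reported by the
EAS energy model into the range [0..SCHED_LOAD_SCALE]:

    normalized_nrg = (abs(energy_diff) << SCHED_LOAD_SHIFT) / (max_power - min_power)

where max_power and min_power are the platform's overall maximum and minimum
power consumption, collected once at late_initcall() time from the scheduler's
energy model data, with the division performed via a precomputed reciprocal
value. A table of Performance-Energy (P-E) space threshold gains is also added
so that schedule candidates can be filtered on the payoff between their
capacity and (normalized) energy deltas.
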
[firefly-linux-kernel-4.4.55.git] / kernel / sched / tune.c
index 3253a8732ba575363a35419995439b044be58838..f4fbbcd28373f4929b606f26a558436d46d17c75 100644 (file)
@@ -1,7 +1,9 @@
 #include <linux/cgroup.h>
 #include <linux/err.h>
+#include <linux/kernel.h>
 #include <linux/percpu.h>
 #include <linux/printk.h>
+#include <linux/reciprocal_div.h>
 #include <linux/rcupdate.h>
 #include <linux/slab.h>
 
@@ -9,6 +11,84 @@
 
 unsigned int sysctl_sched_cfs_boost __read_mostly;
 
+/*
+ * System energy normalization constants
+ */
+static struct target_nrg {
+       unsigned long min_power;
+       unsigned long max_power;
+       struct reciprocal_value rdiv;
+} schedtune_target_nrg;
+
+/* Performance Boost region (B) threshold params */
+static int perf_boost_idx;
+
+/* Performance Constraint region (C) threshold params */
+static int perf_constrain_idx;
+
+/*
+ * Performance-Energy (P-E) Space threshold constants
+ */
+struct threshold_params {
+       int nrg_gain;
+       int cap_gain;
+};
+
+/*
+ * System-specific P-E space threshold constants
+ */
+static struct threshold_params
+threshold_gains[] = {
+       { 0, 4 }, /* >=  0% */
+       { 0, 4 }, /* >= 10% */
+       { 1, 4 }, /* >= 20% */
+       { 2, 4 }, /* >= 30% */
+       { 3, 4 }, /* >= 40% */
+       { 4, 3 }, /* >= 50% */
+       { 4, 2 }, /* >= 60% */
+       { 4, 1 }, /* >= 70% */
+       { 4, 0 }, /* >= 80% */
+       { 4, 0 }  /* >= 90% */
+};
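+
+/*
+ * Rows are indexed by boost percentage divided by 10 (see
+ * sysctl_sched_cfs_boost_handler() below): e.g. a 30% boost selects
+ * threshold_gains[3] = {2, 4} for the Performance Boost region.
+ */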
+
+static int
+__schedtune_accept_deltas(int nrg_delta, int cap_delta,
+                         int perf_boost_idx, int perf_constrain_idx)
+{
+       int payoff = -INT_MAX;
+
+       /* Performance Boost (B) region */
+       if (nrg_delta > 0 && cap_delta > 0) {
+               /*
+                * Evaluate "Performance Boost" vs "Energy Increase"
+                * payoff criteria:
+                *    cap_delta / nrg_delta < cap_gain / nrg_gain
+                * which is:
+                *    nrg_delta * cap_gain > cap_delta * nrg_gain
+                */
+               payoff  = nrg_delta * threshold_gains[perf_boost_idx].cap_gain;
+               payoff -= cap_delta * threshold_gains[perf_boost_idx].nrg_gain;
+               return payoff;
+       }
+
+       /* Performance Constraint (C) region */
+       if (nrg_delta < 0 && cap_delta < 0) {
+               /*
+                * Evaluate "Performance Constraint" vs "Energy Decrease"
+                * payoff criteria:
+                *    cap_delta / nrg_delta < cap_gain / nrg_gain
+                * which, since nrg_delta is negative here, is:
+                *    cap_delta * nrg_gain > nrg_delta * cap_gain
+                */
+               payoff  = cap_delta * threshold_gains[perf_constrain_idx].nrg_gain;
+               payoff -= nrg_delta * threshold_gains[perf_constrain_idx].cap_gain;
+               return payoff;
+       }
+
+       /* Default: reject schedule candidate */
+       return payoff;
+}
+
 #ifdef CONFIG_CGROUP_SCHEDTUNE
 
 /*
@@ -26,6 +106,11 @@ struct schedtune {
        /* Boost value for tasks on that SchedTune CGroup */
        int boost;
 
+       /* Performance Boost (B) region threshold params */
+       int perf_boost_idx;
+
+       /* Performance Constraint (C) region threshold params */
+       int perf_constrain_idx;
 };
 
 static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
@@ -55,8 +140,37 @@ static inline struct schedtune *parent_st(struct schedtune *st)
 static struct schedtune
 root_schedtune = {
        .boost  = 0,
+       .perf_boost_idx = 0,
+       .perf_constrain_idx = 0,
 };
 
+int
+schedtune_accept_deltas(int nrg_delta, int cap_delta,
+                       struct task_struct *task)
+{
+       struct schedtune *ct;
+       int perf_boost_idx;
+       int perf_constrain_idx;
+
+       /* Optimal (O) region */
+       if (nrg_delta < 0 && cap_delta > 0)
+               return INT_MAX;
+
+       /* Suboptimal (S) region */
+       if (nrg_delta > 0 && cap_delta < 0)
+               return -INT_MAX;
+
+       /* Get task specific perf Boost/Constraints indexes */
+       rcu_read_lock();
+       ct = task_schedtune(task);
+       perf_boost_idx = ct->perf_boost_idx;
+       perf_constrain_idx = ct->perf_constrain_idx;
+       rcu_read_unlock();
+
+       return __schedtune_accept_deltas(nrg_delta, cap_delta,
+                       perf_boost_idx, perf_constrain_idx);
+}
+
 /*
  * Maximum number of boost groups to support
  * When per-task boosting is used we still allow only limited number of
@@ -396,6 +510,24 @@ struct cgroup_subsys schedtune_cgrp_subsys = {
        .early_init     = 1,
 };
 
+#else /* CONFIG_CGROUP_SCHEDTUNE */
+
+int
+schedtune_accept_deltas(int nrg_delta, int cap_delta,
+                       struct task_struct *task)
+{
+       /* Optimal (O) region */
+       if (nrg_delta < 0 && cap_delta > 0)
+               return INT_MAX;
+
+       /* Suboptimal (S) region */
+       if (nrg_delta > 0 && cap_delta < 0)
+               return -INT_MAX;
+
+       return __schedtune_accept_deltas(nrg_delta, cap_delta,
+                       perf_boost_idx, perf_constrain_idx);
+}
+
 #endif /* CONFIG_CGROUP_SCHEDTUNE */
 
 int
@@ -408,6 +540,195 @@ sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
        if (ret || !write)
                return ret;
 
+       /* Performance Boost (B) region threshold params */
+       perf_boost_idx  = sysctl_sched_cfs_boost;
+       perf_boost_idx /= 10;
+       /* A 100% boost must still select a valid threshold_gains[] entry */
+       if (perf_boost_idx >= ARRAY_SIZE(threshold_gains))
+               perf_boost_idx = ARRAY_SIZE(threshold_gains) - 1;
+
+       /* Performance Constraint (C) region threshold params */
+       perf_constrain_idx  = 100 - sysctl_sched_cfs_boost;
+       perf_constrain_idx /= 10;
+       if (perf_constrain_idx >= ARRAY_SIZE(threshold_gains))
+               perf_constrain_idx = ARRAY_SIZE(threshold_gains) - 1;
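+
+       /*
+        * For example, a 25% boost selects threshold_gains[2] for the
+        * boost region and threshold_gains[7] for the constraint region.
+        */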
+
+       return 0;
+}
+
+/*
+ * System energy normalization
+ * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE],
+ * corresponding to the specified energy variation.
+ */
+int
+schedtune_normalize_energy(int energy_diff)
+{
+       u32 normalized_nrg;
+
+#ifdef CONFIG_SCHED_DEBUG
+       int max_delta;
+
+       /* Check for boundaries */
+       max_delta  = schedtune_target_nrg.max_power;
+       max_delta -= schedtune_target_nrg.min_power;
+       WARN_ON(abs(energy_diff) >= max_delta);
+#endif
+
+       /* Do scaling using positive numbers to increase the range */
+       normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
+
+       /* Scale by energy magnitude */
+       normalized_nrg <<= SCHED_LOAD_SHIFT;
+
+       /* Normalize on max energy for target platform */
+       normalized_nrg = reciprocal_divide(
+                       normalized_nrg, schedtune_target_nrg.rdiv);
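+
+       /*
+        * E.g., assuming SCHED_LOAD_SHIFT is 10 and a platform energy span
+        * (max_power - min_power) of 4096, an energy_diff of 1024 is
+        * normalized to (1024 << 10) / 4096 = 256.
+        */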
+
+       return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
+}
+
+#ifdef CONFIG_SCHED_DEBUG
+static void
+schedtune_test_nrg(unsigned long delta_pwr)
+{
+       unsigned long test_delta_pwr;
+       unsigned long test_norm_pwr;
+       int idx;
+
+       /*
+        * Check the normalization constants using a few reference energy
+        * values obtained by successively halving the maximum energy delta
+        */
+       pr_info("schedtune: verify normalization constants...\n");
+       for (idx = 0; idx < 6; ++idx) {
+               test_delta_pwr = delta_pwr >> idx;
+
+               /* Normalize on max energy for target platform */
+               test_norm_pwr = reciprocal_divide(
+                                       test_delta_pwr << SCHED_LOAD_SHIFT,
+                                       schedtune_target_nrg.rdiv);
+
+               pr_info("schedtune: max_pwr/2^%d: %4lu => norm_pwr: %5lu\n",
+                       idx, test_delta_pwr, test_norm_pwr);
+       }
+}
+#else
+#define schedtune_test_nrg(delta_pwr)
+#endif
+
+/*
+ * Compute the min/max power consumption of a cluster and all its CPUs
+ */
+static void
+schedtune_add_cluster_nrg(
+               struct sched_domain *sd,
+               struct sched_group *sg,
+               struct target_nrg *ste)
+{
+       struct sched_domain *sd2;
+       struct sched_group *sg2;
+
+       struct cpumask *cluster_cpus;
+       char str[32];
+
+       unsigned long min_pwr;
+       unsigned long max_pwr;
+       int cpu;
+
+       /* Get Cluster energy using EM data for the first CPU */
+       cluster_cpus = sched_group_cpus(sg);
+       snprintf(str, 32, "CLUSTER[%*pbl]",
+                cpumask_pr_args(cluster_cpus));
+
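+       /*
+        * The deepest idle state gives the cluster's minimum power, while
+        * the highest capacity (OPP) state gives its maximum power.
+        */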
+       min_pwr = sg->sge->idle_states[sg->sge->nr_idle_states - 1].power;
+       max_pwr = sg->sge->cap_states[sg->sge->nr_cap_states - 1].power;
+       pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
+               str, min_pwr, max_pwr);
+
+       /*
+        * Keep track of this cluster's energy in the computation of the
+        * overall system energy
+        */
+       ste->min_power += min_pwr;
+       ste->max_power += max_pwr;
+
+       /* Get CPU energy using EM data for each CPU in the group */
+       for_each_cpu(cpu, cluster_cpus) {
+               /* Get a SD view for the specific CPU */
+               for_each_domain(cpu, sd2) {
+                       /* Get the CPU group */
+                       sg2 = sd2->groups;
+                       min_pwr = sg2->sge->idle_states[sg2->sge->nr_idle_states - 1].power;
+                       max_pwr = sg2->sge->cap_states[sg2->sge->nr_cap_states - 1].power;
+
+                       ste->min_power += min_pwr;
+                       ste->max_power += max_pwr;
+
+                       snprintf(str, 32, "CPU[%d]", cpu);
+                       pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
+                               str, min_pwr, max_pwr);
+
+                       /*
+                        * Assume we have EM data only at the CPU and
+                        * the upper CLUSTER level
+                        */
+                       BUG_ON(!cpumask_equal(
+                               sched_group_cpus(sg),
+                               sched_group_cpus(sd2->parent->groups)
+                               ));
+                       break;
+               }
+       }
+}
+
+/*
+ * Initialize the constants required to compute normalized energy.
+ * The values of these constants depend on the EM data for the specific
+ * target system and topology.
+ * Thus, this function is expected to be called by the code that binds
+ * the EM to the topology information.
+ */
+static int
+schedtune_init_late(void)
+{
+       struct target_nrg *ste = &schedtune_target_nrg;
+       unsigned long delta_pwr = 0;
+       struct sched_domain *sd;
+       struct sched_group *sg;
+
+       pr_info("schedtune: init normalization constants...\n");
+       ste->max_power = 0;
+       ste->min_power = 0;
+
+       rcu_read_lock();
+
+       /*
+        * When EAS is in use, we always have a pointer to the highest SD
+        * which provides EM data.
+        */
+       sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask)));
+       if (!sd) {
+               pr_info("schedtune: no energy model data\n");
+               goto nodata;
+       }
+
+       sg = sd->groups;
+       do {
+               schedtune_add_cluster_nrg(sd, sg, ste);
+       } while (sg = sg->next, sg != sd->groups);
+
+       rcu_read_unlock();
+
+       pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
+               "SYSTEM", ste->min_power, ste->max_power);
+
+       /* Compute normalization constants */
+       delta_pwr = ste->max_power - ste->min_power;
+       ste->rdiv = reciprocal_value(delta_pwr);
+       pr_info("schedtune: using normalization constants mul: %u sh1: %u sh2: %u\n",
+               ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2);
+
+       schedtune_test_nrg(delta_pwr);
        return 0;
+
+nodata:
+       rcu_read_unlock();
+       return -EINVAL;
 }
+late_initcall(schedtune_init_late);
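
For reference, a minimal sketch of how a scheduler path might combine the two
interfaces added above; the helper name and the "positive payoff means accept"
convention below are illustrative assumptions, not part of this patch:

    /* Hypothetical caller -- illustration only */
    static bool schedtune_accept_candidate(struct task_struct *p,
                                           int energy_delta, int cap_delta)
    {
            /* Scale the raw EM energy delta into [0..SCHED_LOAD_SCALE] */
            int nrg_delta = schedtune_normalize_energy(energy_delta);

            /* Filter the candidate on its P-E space payoff */
            return schedtune_accept_deltas(nrg_delta, cap_delta, p) > 0;
    }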