sched/fair: add tunable to force selection at cpu granularity
author Juri Lelli <juri.lelli@arm.com>
Fri, 29 Jul 2016 13:04:11 +0000 (14:04 +0100)
committer Amit Pundir <amit.pundir@linaro.org>
Wed, 14 Sep 2016 09:29:32 +0000 (14:59 +0530)
EAS assumes that clusters built from smaller-capacity cores are
more energy-efficient. This may not be true on non-big.LITTLE
devices, so EAS can make incorrect cluster selections when looking
for a CPU to wake. The "sched_is_big_little" sysctl hint can be
used to switch to cpu-based selection instead of cluster-based
selection.

This change also incorporates the sync-hint-enable patch:

EAS did not honour synchronous wakeup hints; a new sysctl is added
to ask EAS to use this information when selecting a CPU. The
control is called "sched_sync_hint_enable".
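
As a usage illustration only (not part of this patch), the following
minimal userspace sketch toggles both new controls to their defaults.
The /proc/sys/kernel/ paths follow from the kern_table entries added
in sysctl.c below; the file name and write_sysctl() helper are
hypothetical.

  /* toggle_eas_hints.c - illustrative only, needs root to write */
  #include <stdio.h>

  /* write a single integer to a sysctl file under /proc/sys */
  static int write_sysctl(const char *path, int val)
  {
      FILE *f = fopen(path, "w");

      if (!f)
          return -1;
      fprintf(f, "%d\n", val);
      return fclose(f);
  }

  int main(void)
  {
      /* 0 (default): cpu-granularity selection via find_best_target() */
      write_sysctl("/proc/sys/kernel/sched_is_big_little", 0);
      /* 1 (default): honour synchronous wakeup hints */
      write_sysctl("/proc/sys/kernel/sched_sync_hint_enable", 1);
      return 0;
  }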

Also contains:

EAS: sched/fair: for SMP bias toward idle core with capacity

For SMP devices, bias wakeups towards idle cores that still have
capacity rather than busy cpus that would need a higher OPP (see
the simplified sketch after the diff).

eas: favor idle cpus for boosted tasks

BUG: 29533997
BUG: 29512132
Change-Id: I0cc9a1b1b88fb52916f18bf2d25715bdc3634f9c
Signed-off-by: Juri Lelli <juri.lelli@arm.com>
Signed-off-by: Srinath Sridharan <srinathsr@google.com>
eas/sched/fair: Favoring busy cpus with low OPPs

BUG: 29533997
BUG: 29512132
Change-Id: I9305b3239698d64278db715a2e277ea0bb4ece79

Signed-off-by: Juri Lelli <juri.lelli@arm.com>
include/linux/sched/sysctl.h
kernel/sched/fair.c
kernel/sysctl.c

index 7d021393b0da0dcdef9741ab7e2e3fffbdee6e70..4883dcf3e1a9e0ddea0a42f85730f3d8a466a147 100644
@@ -39,6 +39,8 @@ extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
+extern unsigned int sysctl_sched_is_big_little;
+extern unsigned int sysctl_sched_sync_hint_enable;
 extern unsigned int sysctl_sched_cstate_aware;
 
 enum sched_tunable_scaling {
index 4742a17c7d53d803dd4673e3f8899a0e81e60d87..e2b6174db07d040327e9ac74772e061af04397df 100644
 unsigned int sysctl_sched_latency = 6000000ULL;
 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
 
+unsigned int sysctl_sched_is_big_little = 0;
+unsigned int sysctl_sched_sync_hint_enable = 1;
 unsigned int sysctl_sched_cstate_aware = 1;
+
 /*
  * The initial- and re-scaling of tunables is configurable
  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
@@ -5555,7 +5558,97 @@ done:
        return target;
 }
 
-static int energy_aware_wake_cpu(struct task_struct *p, int target)
+static inline int find_best_target(struct task_struct *p)
+{
+       int i, boosted;
+       int target_cpu = -1;
+       int target_capacity = 0;
+       int backup_capacity = 0;
+       int idle_cpu = -1;
+       int best_idle_cstate = INT_MAX;
+       int backup_cpu = -1;
+       unsigned long task_util_boosted, new_util;
+
+       /*
+        * Favor 1) busy cpu with most capacity at current OPP
+        *       2) idle_cpu with capacity at current OPP
+        *       3) busy cpu with capacity at higher OPP
+        */
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+       boosted = schedtune_task_boost(p);
+#else
+       boosted = 0;
+#endif
+       task_util_boosted = boosted_task_util(p);
+       for_each_cpu(i, tsk_cpus_allowed(p)) {
+               int cur_capacity = capacity_curr_of(i);
+               struct rq *rq = cpu_rq(i);
+               int idle_idx = idle_get_state_idx(rq);
+
+               /*
+                * p's blocked utilization is still accounted for on prev_cpu
+                * so prev_cpu will receive a negative bias due to the double
+                * accounting. However, the blocked utilization may be zero.
+                */
+               new_util = cpu_util(i) + task_util_boosted;
+
+               /*
+                * Ensure minimum capacity to grant the required boost.
+                * The target CPU can be already at a capacity level higher
+                * than the one required to boost the task.
+                */
+
+               if (new_util > capacity_orig_of(i))
+                       continue;
+
+               /*
+                * For boosted tasks we favor idle cpus unconditionally to
+                * improve latency.
+                */
+               if (idle_idx >= 0 && boosted) {
+                       if (idle_cpu < 0 ||
+                               (sysctl_sched_cstate_aware &&
+                                best_idle_cstate > idle_idx)) {
+                               best_idle_cstate = idle_idx;
+                               idle_cpu = i;
+                       }
+                       continue;
+               }
+
+               if (new_util < cur_capacity) {
+                       if (cpu_rq(i)->nr_running) {
+                               if (target_capacity == 0 ||
+                                       target_capacity > cur_capacity) {
+                                       /* busy CPU with most capacity at current OPP */
+                                       target_cpu = i;
+                                       target_capacity = cur_capacity;
+                               }
+                       } else if (!boosted) {
+                               if (idle_cpu < 0 ||
+                                       (sysctl_sched_cstate_aware &&
+                                               best_idle_cstate > idle_idx)) {
+                                       best_idle_cstate = idle_idx;
+                                       idle_cpu = i;
+                               }
+                       }
+               } else if (backup_capacity == 0 ||
+                               backup_capacity > cur_capacity) {
+                       /* first busy CPU with capacity at higher OPP */
+                       backup_capacity = cur_capacity;
+                       backup_cpu = i;
+               }
+       }
+
+       if (!boosted && target_cpu < 0) {
+               target_cpu = idle_cpu >= 0 ? idle_cpu : backup_cpu;
+       }
+
+       if (boosted && idle_cpu >= 0)
+               target_cpu = idle_cpu;
+       return target_cpu;
+}
+
+static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync)
 {
        struct sched_domain *sd;
        struct sched_group *sg, *sg_target;
@@ -5563,6 +5656,14 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target)
        int target_cpu = task_cpu(p);
        int i;
 
+       if (sysctl_sched_sync_hint_enable && sync) {
+               int cpu = smp_processor_id();
+               cpumask_t search_cpus;
+               cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask);
+               if (cpumask_test_cpu(cpu, &search_cpus))
+                       return cpu;
+       }
+
        sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p)));
 
        if (!sd)
@@ -5571,50 +5672,60 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target)
        sg = sd->groups;
        sg_target = sg;
 
-       /*
-        * Find group with sufficient capacity. We only get here if no cpu is
-        * overutilized. We may end up overutilizing a cpu by adding the task,
-        * but that should not be any worse than select_idle_sibling().
-        * load_balance() should sort it out later as we get above the tipping
-        * point.
-        */
-       do {
-               /* Assuming all cpus are the same in group */
-               int max_cap_cpu = group_first_cpu(sg);
+       if (sysctl_sched_is_big_little) {
 
                /*
-                * Assume smaller max capacity means more energy-efficient.
-                * Ideally we should query the energy model for the right
-                * answer but it easily ends up in an exhaustive search.
+                * Find group with sufficient capacity. We only get here if no cpu is
+                * overutilized. We may end up overutilizing a cpu by adding the task,
+                * but that should not be any worse than select_idle_sibling().
+                * load_balance() should sort it out later as we get above the tipping
+                * point.
                 */
-               if (capacity_of(max_cap_cpu) < target_max_cap &&
-                   task_fits_max(p, max_cap_cpu)) {
-                       sg_target = sg;
-                       target_max_cap = capacity_of(max_cap_cpu);
-               }
-       } while (sg = sg->next, sg != sd->groups);
+               do {
+                       /* Assuming all cpus are the same in group */
+                       int max_cap_cpu = group_first_cpu(sg);
 
-       /* Find cpu with sufficient capacity */
-       for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) {
-               /*
-                * p's blocked utilization is still accounted for on prev_cpu
-                * so prev_cpu will receive a negative bias due to the double
-                * accounting. However, the blocked utilization may be zero.
-                */
-               int new_util = cpu_util(i) + boosted_task_util(p);
+                       /*
+                        * Assume smaller max capacity means more energy-efficient.
+                        * Ideally we should query the energy model for the right
+                        * answer but it easily ends up in an exhaustive search.
+                        */
+                       if (capacity_of(max_cap_cpu) < target_max_cap &&
+                           task_fits_max(p, max_cap_cpu)) {
+                               sg_target = sg;
+                               target_max_cap = capacity_of(max_cap_cpu);
+                       }
+               } while (sg = sg->next, sg != sd->groups);
 
-               if (new_util > capacity_orig_of(i))
-                       continue;
+               /* Find cpu with sufficient capacity */
+               for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) {
+                       /*
+                        * p's blocked utilization is still accounted for on prev_cpu
+                        * so prev_cpu will receive a negative bias due to the double
+                        * accounting. However, the blocked utilization may be zero.
+                        */
+                       int new_util = cpu_util(i) + boosted_task_util(p);
 
-               if (new_util < capacity_curr_of(i)) {
-                       target_cpu = i;
-                       if (cpu_rq(i)->nr_running)
-                               break;
-               }
+                       if (new_util > capacity_orig_of(i))
+                               continue;
+
+                       if (new_util < capacity_curr_of(i)) {
+                               target_cpu = i;
+                               if (cpu_rq(i)->nr_running)
+                                       break;
+                       }
 
-               /* cpu has capacity at higher OPP, keep it as fallback */
-               if (target_cpu == task_cpu(p))
-                       target_cpu = i;
+                       /* cpu has capacity at higher OPP, keep it as fallback */
+                       if (target_cpu == task_cpu(p))
+                               target_cpu = i;
+               }
+       } else {
+               /*
+                * Find a cpu with sufficient capacity
+                */
+               int tmp_target = find_best_target(p);
+               if (tmp_target >= 0)
+                       target_cpu = tmp_target;
        }
 
        if (target_cpu != task_cpu(p)) {
@@ -5691,7 +5802,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 
        if (!sd) {
                if (energy_aware() && !cpu_rq(cpu)->rd->overutilized)
-                       new_cpu = energy_aware_wake_cpu(p, prev_cpu);
+                       new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync);
                else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
                        new_cpu = select_idle_sibling(p, new_cpu);
 
index fc204ae8487d53d6c9406ece2593142b9a7437ec..831d674a5566b03e449186a5d9b29d37de9e10fd 100644
@@ -304,6 +304,20 @@ static struct ctl_table kern_table[] = {
                .extra1         = &min_sched_granularity_ns,
                .extra2         = &max_sched_granularity_ns,
        },
+       {
+               .procname       = "sched_is_big_little",
+               .data           = &sysctl_sched_is_big_little,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+       {
+               .procname       = "sched_sync_hint_enable",
+               .data           = &sysctl_sched_sync_hint_enable,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
        {
                .procname       = "sched_cstate_aware",
                .data           = &sysctl_sched_cstate_aware,
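
For reference only (not part of the patch), here is a simplified,
standalone sketch of the selection order that find_best_target()
above implements: boosted tasks take any idle cpu (shallowest
c-state first); otherwise prefer a busy cpu with spare room at the
lowest current OPP, then an idle cpu, then a busy cpu that would
need a higher OPP. Names such as pick_cpu and struct cpu are
placeholders, and the sched_cstate_aware and schedtune lookups are
dropped for brevity.

  /* pick_cpu.c - simplified illustration, not the kernel code */
  #include <limits.h>
  #include <stdio.h>

  struct cpu {
      int util;        /* projected utilization with the waking task added */
      int cap_curr;    /* capacity at the current OPP */
      int cap_orig;    /* maximum capacity */
      int nr_running;  /* tasks already running on this cpu */
      int idle_idx;    /* idle-state index, -1 if the cpu is busy */
  };

  /* Return the preferred cpu index, or -1 if no cpu fits. */
  static int pick_cpu(const struct cpu *cpus, int n, int boosted)
  {
      int i, target = -1, target_cap = 0;
      int idle = -1, best_cstate = INT_MAX;
      int backup = -1, backup_cap = 0;

      for (i = 0; i < n; i++) {
          const struct cpu *c = &cpus[i];

          if (c->util > c->cap_orig)
              continue;   /* the task does not fit at all */

          /* boosted tasks take any idle cpu, shallowest c-state first */
          if (boosted && c->idle_idx >= 0) {
              if (idle < 0 || c->idle_idx < best_cstate) {
                  best_cstate = c->idle_idx;
                  idle = i;
              }
              continue;
          }

          if (c->util < c->cap_curr) {
              if (c->nr_running) {
                  /* 1) busy cpu with room at the lowest current OPP */
                  if (target_cap == 0 || c->cap_curr < target_cap) {
                      target = i;
                      target_cap = c->cap_curr;
                  }
              } else if (!boosted) {
                  /* 2) idle cpu, shallowest c-state first */
                  if (idle < 0 || c->idle_idx < best_cstate) {
                      best_cstate = c->idle_idx;
                      idle = i;
                  }
              }
          } else if (backup_cap == 0 || c->cap_curr < backup_cap) {
              /* 3) busy cpu that would need a higher OPP */
              backup_cap = c->cap_curr;
              backup = i;
          }
      }

      if (boosted && idle >= 0)
          return idle;
      if (!boosted && target < 0)
          return idle >= 0 ? idle : backup;
      return target;
  }

  int main(void)
  {
      /* two-cpu example: cpu0 busy with spare room, cpu1 idle */
      struct cpu cpus[] = {
          { .util = 300, .cap_curr = 512, .cap_orig = 1024,
            .nr_running = 1, .idle_idx = -1 },
          { .util = 300, .cap_curr = 512, .cap_orig = 1024,
            .nr_running = 0, .idle_idx = 1 },
      };

      /* non-boosted picks the busy cpu0, boosted picks the idle cpu1 */
      printf("non-boosted -> cpu%d\n", pick_cpu(cpus, 2, 0));
      printf("boosted     -> cpu%d\n", pick_cpu(cpus, 2, 1));
      return 0;
  }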