#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/printk.h>
#include <linux/reciprocal_div.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

#include <trace/events/sched.h>

unsigned int sysctl_sched_cfs_boost __read_mostly;

/*
 * System energy normalization constants
 */
static struct target_nrg {
	unsigned long min_power;
	unsigned long max_power;
	struct reciprocal_value rdiv;
} schedtune_target_nrg;

/* Performance Boost region (B) threshold params */
static int perf_boost_idx;

/* Performance Constraint region (C) threshold params */
static int perf_constrain_idx;

/*
 * Performance-Energy (P-E) Space thresholds constants
 */
struct threshold_params {
	int nrg_gain;
	int cap_gain;
};

/*
 * System specific P-E space thresholds constants
 */
static struct threshold_params
threshold_gains[] = {
	{ 0, 4 }, /* >= 10% */
	{ 1, 4 }, /* >= 20% */
	{ 2, 4 }, /* >= 30% */
	{ 3, 4 }, /* >= 40% */
	{ 4, 3 }, /* >= 50% */
	{ 4, 2 }, /* >= 60% */
	{ 4, 1 }, /* >= 70% */
	{ 4, 0 }  /* >= 80% */
};
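
/*
 * Note: each row covers a 10% wide boost range and the array is indexed
 * by boost / 10 (see sysctl_sched_cfs_boost_handler() below); e.g. a
 * boost of 25% selects index 2, i.e. { nrg_gain = 2, cap_gain = 4 }.
 */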

static int
__schedtune_accept_deltas(int nrg_delta, int cap_delta,
			  int perf_boost_idx, int perf_constrain_idx)
{
	int payoff = -INT_MAX;

	/* Performance Boost (B) region */
	if (nrg_delta > 0 && cap_delta > 0) {
		/*
		 * Evaluate "Performance Boost" vs "Energy Increase"
		 * payoff criteria:
		 *    cap_delta / nrg_delta < cap_gain / nrg_gain
		 * which is:
		 *    nrg_delta * cap_gain > cap_delta * nrg_gain
		 */
		payoff  = nrg_delta * threshold_gains[perf_boost_idx].cap_gain;
		payoff -= cap_delta * threshold_gains[perf_boost_idx].nrg_gain;
		return payoff;
	}

	/* Performance Constraint (C) region */
	if (nrg_delta < 0 && cap_delta < 0) {
		/*
		 * Evaluate "Performance Constraint" vs "Energy Saving"
		 * payoff criteria:
		 *    cap_delta / nrg_delta > cap_gain / nrg_gain
		 * which is:
		 *    cap_delta * nrg_gain > nrg_delta * cap_gain
		 */
		payoff  = cap_delta * threshold_gains[perf_constrain_idx].nrg_gain;
		payoff -= nrg_delta * threshold_gains[perf_constrain_idx].cap_gain;
		return payoff;
	}

	/* Default: reject schedule candidate */
	return payoff;
}
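
/*
 * Worked example (illustrative numbers, not from a real platform): with
 * a 25% boost, perf_boost_idx = 2 and threshold_gains[2] = { 2, 4 }.
 * A candidate with nrg_delta = 10 and cap_delta = 4 gives:
 *    payoff = 10 * 4 - 4 * 2 = 32 > 0
 * Indeed cap_delta / nrg_delta = 0.4 < cap_gain / nrg_gain = 2, so the
 * B region criterion above holds and a positive payoff is returned.
 */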

#ifdef CONFIG_CGROUP_SCHEDTUNE

/*
 * EAS scheduler tunables for task groups.
 */

/* SchedTune tunables for a group of tasks */
struct schedtune {
	/* SchedTune CGroup subsystem */
	struct cgroup_subsys_state css;

	/* Boost group allocated ID */
	int idx;

	/* Boost value for tasks on that SchedTune CGroup */
	int boost;

	/* Performance Boost (B) region threshold params */
	int perf_boost_idx;

	/* Performance Constraint (C) region threshold params */
	int perf_constrain_idx;
};

static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct schedtune, css) : NULL;
}

static inline struct schedtune *task_schedtune(struct task_struct *tsk)
{
	return css_st(task_css(tsk, schedtune_cgrp_id));
}

static inline struct schedtune *parent_st(struct schedtune *st)
{
	return css_st(st->css.parent);
}

/*
 * SchedTune root control group
 * The root control group is used to define a system-wide boost tuning,
 * which is applied to all tasks in the system.
 * Task specific boost tuning could be specified by creating and
 * configuring a child control group under the root one.
 * By default, system-wide boosting is disabled, i.e. no boosting is
 * applied to tasks that are not in a child control group.
 */
static struct schedtune
root_schedtune = {
	.boost	= 0,
	.perf_boost_idx = 0,
	.perf_constrain_idx = 0,
};

static int
schedtune_accept_deltas(int nrg_delta, int cap_delta,
			struct task_struct *task)
{
	struct schedtune *ct;
	int perf_boost_idx;
	int perf_constrain_idx;

	/* Optimal (O) region */
	if (nrg_delta < 0 && cap_delta > 0)
		return INT_MAX;

	/* Suboptimal (S) region */
	if (nrg_delta > 0 && cap_delta < 0)
		return -INT_MAX;

	/* Get task specific perf Boost/Constraints indexes */
	rcu_read_lock();
	ct = task_schedtune(task);
	perf_boost_idx = ct->perf_boost_idx;
	perf_constrain_idx = ct->perf_constrain_idx;
	rcu_read_unlock();

	return __schedtune_accept_deltas(nrg_delta, cap_delta,
			perf_boost_idx, perf_constrain_idx);
}
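
/*
 * Summary of the P-E space handled above: a candidate which saves
 * energy while increasing capacity (O region) always returns INT_MAX,
 * one which costs energy while reducing capacity (S region) always
 * returns -INT_MAX; only the ambiguous B and C regions require the
 * threshold-based payoff evaluation of __schedtune_accept_deltas().
 */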

/*
 * Maximum number of boost groups to support
 * When per-task boosting is used we still allow only a limited number of
 * boost groups, for two main reasons:
 * 1. on a real system we usually have only a few classes of workloads
 *    which make sense to boost with different values (e.g. background vs
 *    foreground tasks, interactive vs low-priority tasks)
 * 2. a limited number allows for a simpler and more memory/time efficient
 *    implementation, especially for the computation of the per-CPU boost
 *    value
 */
#define BOOSTGROUPS_COUNT 4

/* Array of configured boostgroups */
static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
	&root_schedtune,
	NULL,
};

/* SchedTune boost groups
 * Keep track of all the boost groups which impact a CPU, for example when
 * a CPU has two RUNNABLE tasks belonging to two different boost groups
 * and thus likely with different boost values.
 * Since on each system we expect only a limited number of boost groups,
 * here we use a simple array to keep track of the metrics required to
 * compute the maximum per-CPU boosting value.
 */
struct boost_groups {
	/* Maximum boost value for all RUNNABLE tasks on a CPU */
	unsigned int boost_max;
	struct {
		/* The boost for tasks on that boost group */
		unsigned int boost;
		/* Count of RUNNABLE tasks on that boost group */
		unsigned int tasks;
	} group[BOOSTGROUPS_COUNT];
};

/* Boost groups affecting each CPU in the system */
DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
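
/*
 * With this layout each CPU owns a private copy of boost_max plus one
 * { boost, tasks } pair per boost group: the enqueue/dequeue hot paths
 * below only ever touch their own CPU's entry, and they do so while
 * holding that CPU's RQ lock (see the NOTEs further down).
 */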

static void
schedtune_cpu_update(int cpu)
{
	struct boost_groups *bg;
	unsigned int boost_max;
	int idx;

	bg = &per_cpu(cpu_boost_groups, cpu);

	/* The root boost group is always active */
	boost_max = bg->group[0].boost;
	for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
		/*
		 * A boost group affects a CPU only if it has
		 * RUNNABLE tasks on that CPU
		 */
		if (bg->group[idx].tasks == 0)
			continue;
		boost_max = max(boost_max, bg->group[idx].boost);
	}

	bg->boost_max = boost_max;
}

static int
schedtune_boostgroup_update(int idx, int boost)
{
	struct boost_groups *bg;
	int cur_boost_max;
	int old_boost;
	int cpu;

	/* Update per CPU boost groups */
	for_each_possible_cpu(cpu) {
		bg = &per_cpu(cpu_boost_groups, cpu);

		/*
		 * Keep track of current boost values to compute the per CPU
		 * maximum only when it has been affected by the new value of
		 * the updated boost group
		 */
		cur_boost_max = bg->boost_max;
		old_boost = bg->group[idx].boost;

		/* Update the boost value of this boost group */
		bg->group[idx].boost = boost;

		/* Check if this update increases the current max */
		if (boost > cur_boost_max && bg->group[idx].tasks) {
			bg->boost_max = boost;
			continue;
		}

		/* Check if this update has decreased the current max */
		if (cur_boost_max == old_boost && old_boost > boost)
			schedtune_cpu_update(cpu);
	}

	return 0;
}
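
/*
 * Note on the two checks above: raising a group's boost above the
 * current max can update boost_max directly, with no full scan; only
 * lowering the boost of the group which was defining the max requires
 * a rescan via schedtune_cpu_update(). All other updates leave
 * boost_max unchanged.
 */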

static inline void
schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
{
	struct boost_groups *bg;
	int tasks;

	bg = &per_cpu(cpu_boost_groups, cpu);

	/* Update the boosted tasks count, preventing it from going negative */
	if (task_count < 0 && bg->group[idx].tasks <= -task_count)
		bg->group[idx].tasks = 0;
	else
		bg->group[idx].tasks += task_count;

	/* Boost group activation or deactivation on that RQ */
	tasks = bg->group[idx].tasks;
	if (tasks == 1 || tasks == 0)
		schedtune_cpu_update(cpu);
}
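
/*
 * The (tasks == 1 || tasks == 0) check above catches exactly the two
 * transitions which can change the per-CPU maximum: a boost group
 * becoming active (0 -> 1 RUNNABLE tasks) or inactive (1 -> 0).
 * Intermediate counts change neither the set of active groups nor
 * their boost values, so no rescan is needed for them.
 */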

/*
 * NOTE: This function must be called while holding the lock on the CPU RQ
 */
void schedtune_enqueue_task(struct task_struct *p, int cpu)
{
	struct schedtune *st;
	int idx;

	/*
	 * When a task is marked PF_EXITING by do_exit() it's going to be
	 * dequeued and enqueued multiple times in the exit path.
	 * Thus we avoid any further update, since we do not want to change
	 * CPU boosting while the task is exiting.
	 */
	if (p->flags & PF_EXITING)
		return;

	/* Get task boost group */
	rcu_read_lock();
	st = task_schedtune(p);
	idx = st->idx;
	rcu_read_unlock();

	schedtune_tasks_update(p, cpu, idx, 1);
}

/*
 * NOTE: This function must be called while holding the lock on the CPU RQ
 */
void schedtune_dequeue_task(struct task_struct *p, int cpu)
{
	struct schedtune *st;
	int idx;

	/*
	 * When a task is marked PF_EXITING by do_exit() it's going to be
	 * dequeued and enqueued multiple times in the exit path.
	 * Thus we avoid any further update, since we do not want to change
	 * CPU boosting while the task is exiting.
	 * The last dequeue will be done by the cgroup exit() callback.
	 */
	if (p->flags & PF_EXITING)
		return;

	/* Get task boost group */
	rcu_read_lock();
	st = task_schedtune(p);
	idx = st->idx;
	rcu_read_unlock();

	schedtune_tasks_update(p, cpu, idx, -1);
}

int schedtune_cpu_boost(int cpu)
{
	struct boost_groups *bg;

	bg = &per_cpu(cpu_boost_groups, cpu);
	return bg->boost_max;
}

int schedtune_task_boost(struct task_struct *p)
{
	struct schedtune *st;
	int task_boost;

	/* Get task boost value */
	rcu_read_lock();
	st = task_schedtune(p);
	task_boost = st->boost;
	rcu_read_unlock();

	return task_boost;
}

static u64
boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct schedtune *st = css_st(css);

	return st->boost;
}

static int
boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
	    u64 boost)
{
	struct schedtune *st = css_st(css);

	if (boost < 0 || boost > 100)
		return -EINVAL;

	st->boost = boost;
	if (css == &root_schedtune.css)
		sysctl_sched_cfs_boost = boost;

	/* Update CPU boost */
	schedtune_boostgroup_update(st->idx, st->boost);

	trace_sched_tune_config(st->boost);

	return 0;
}

static struct cftype files[] = {
	{
		.name = "boost",
		.read_u64 = boost_read,
		.write_u64 = boost_write,
	},
	{ }	/* terminate */
};
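
/*
 * Illustrative userspace usage (mount point and group name are
 * examples, not mandated by this file); legacy cftypes are exposed
 * with the subsystem name prefix, i.e. as "schedtune.boost":
 *
 *   mkdir /sys/fs/cgroup/stune/foreground
 *   echo 10 > /sys/fs/cgroup/stune/foreground/schedtune.boost
 *   echo $PID > /sys/fs/cgroup/stune/foreground/tasks
 */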

static int
schedtune_boostgroup_init(struct schedtune *st)
{
	struct boost_groups *bg;
	int cpu;

	/* Keep track of allocated boost groups */
	allocated_group[st->idx] = st;

	/* Initialize the per CPU boost groups */
	for_each_possible_cpu(cpu) {
		bg = &per_cpu(cpu_boost_groups, cpu);
		bg->group[st->idx].boost = 0;
		bg->group[st->idx].tasks = 0;
	}

	return 0;
}

static int
schedtune_init(void)
{
	struct boost_groups *bg;
	int cpu;

	/* Initialize the per CPU boost groups */
	for_each_possible_cpu(cpu) {
		bg = &per_cpu(cpu_boost_groups, cpu);
		memset(bg, 0, sizeof(struct boost_groups));
	}

	pr_info("schedtune: configured to support %d boost groups\n",
		BOOSTGROUPS_COUNT);
	return 0;
}

static struct cgroup_subsys_state *
schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct schedtune *st;
	int idx;

	/* The first call, with no parent, sets up the root boost group */
	if (!parent_css) {
		schedtune_init();
		return &root_schedtune.css;
	}

	/* Allow only single level hierarchies */
	if (parent_css != &root_schedtune.css) {
		pr_err("Nested SchedTune boosting groups not allowed\n");
		return ERR_PTR(-ENOMEM);
	}

	/* Allow only a limited number of boosting groups */
	for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx)
		if (!allocated_group[idx])
			break;
	if (idx == BOOSTGROUPS_COUNT) {
		pr_err("Trying to create more than %d SchedTune boosting groups\n",
		       BOOSTGROUPS_COUNT);
		return ERR_PTR(-ENOSPC);
	}

	st = kzalloc(sizeof(*st), GFP_KERNEL);
	if (!st)
		goto out;

	/* Initialize per CPUs boost group support */
	st->idx = idx;
	if (schedtune_boostgroup_init(st))
		goto release;

	return &st->css;

release:
	kfree(st);
out:
	return ERR_PTR(-ENOMEM);
}

static void
schedtune_boostgroup_release(struct schedtune *st)
{
	/* Reset this boost group */
	schedtune_boostgroup_update(st->idx, 0);

	/* Release the allocated boost group slot */
	allocated_group[st->idx] = NULL;
}

static void
schedtune_css_free(struct cgroup_subsys_state *css)
{
	struct schedtune *st = css_st(css);

	schedtune_boostgroup_release(st);
	kfree(st);
}

struct cgroup_subsys schedtune_cgrp_subsys = {
	.css_alloc	= schedtune_css_alloc,
	.css_free	= schedtune_css_free,
	.legacy_cftypes	= files,
	.early_init	= 1,
};

#else /* CONFIG_CGROUP_SCHEDTUNE */

static int
schedtune_accept_deltas(int nrg_delta, int cap_delta,
			struct task_struct *task)
{
	/* Optimal (O) region */
	if (nrg_delta < 0 && cap_delta > 0)
		return INT_MAX;

	/* Suboptimal (S) region */
	if (nrg_delta > 0 && cap_delta < 0)
		return -INT_MAX;

	return __schedtune_accept_deltas(nrg_delta, cap_delta,
			perf_boost_idx, perf_constrain_idx);
}

#endif /* CONFIG_CGROUP_SCHEDTUNE */

int
sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
			       void __user *buffer, size_t *lenp,
			       loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (ret || !write)
		return ret;

	/* Performance Boost (B) region threshold params */
	perf_boost_idx = sysctl_sched_cfs_boost;
	perf_boost_idx /= 10;

	/* Performance Constraint (C) region threshold params */
	perf_constrain_idx = 100 - sysctl_sched_cfs_boost;
	perf_constrain_idx /= 10;

	return 0;
}
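
/*
 * Worked example: with sysctl_sched_cfs_boost = 25 the handler above
 * computes perf_boost_idx = 25 / 10 = 2 and
 * perf_constrain_idx = (100 - 25) / 10 = 7, selecting the { 2, 4 } and
 * { 4, 0 } rows of threshold_gains[] respectively.
 */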

/*
 * System energy normalization
 * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE],
 * corresponding to the specified energy variation.
 */
int
schedtune_normalize_energy(int energy_diff)
{
	u32 normalized_nrg;
#ifdef CONFIG_SCHED_DEBUG
	int max_delta;

	/* Check for boundaries */
	max_delta  = schedtune_target_nrg.max_power;
	max_delta -= schedtune_target_nrg.min_power;
	WARN_ON(abs(energy_diff) >= max_delta);
#endif

	/* Do scaling using positive numbers to increase the range */
	normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;

	/* Scale by energy magnitude */
	normalized_nrg <<= SCHED_LOAD_SHIFT;

	/* Normalize on max energy for target platform */
	normalized_nrg = reciprocal_divide(
			normalized_nrg, schedtune_target_nrg.rdiv);

	return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
}
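
/*
 * Worked example (assuming SCHED_LOAD_SHIFT == 10, i.e.
 * SCHED_LOAD_SCALE == 1024): on a platform where
 * max_power - min_power == 4096, an energy_diff of -1024 is scaled to
 * 1024 << 10 == 1048576, divided by 4096 to give 256, and returned as
 * -256: a quarter of the normalization range, with the sign of the
 * original variation preserved.
 */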

#ifdef CONFIG_SCHED_DEBUG
static void
schedtune_test_nrg(unsigned long delta_pwr)
{
	unsigned long test_delta_pwr;
	unsigned long test_norm_pwr;
	int idx;

	/*
	 * Check normalization constants using some constant system
	 * energy values
	 */
	pr_info("schedtune: verify normalization constants...\n");
	for (idx = 0; idx < 6; ++idx) {
		test_delta_pwr = delta_pwr >> idx;

		/* Normalize on max energy for target platform */
		test_norm_pwr = reciprocal_divide(
			test_delta_pwr << SCHED_LOAD_SHIFT,
			schedtune_target_nrg.rdiv);

		pr_info("schedtune: max_pwr/2^%d: %4lu => norm_pwr: %5lu\n",
			idx, test_delta_pwr, test_norm_pwr);
	}
}
#else
#define schedtune_test_nrg(delta_pwr)
#endif

/*
 * Compute the min/max power consumption of a cluster and all its CPUs
 */
static void
schedtune_add_cluster_nrg(
		struct sched_domain *sd,
		struct sched_group *sg,
		struct target_nrg *ste)
{
	struct sched_domain *sd2;
	struct sched_group *sg2;

	struct cpumask *cluster_cpus;
	char str[32];

	unsigned long min_pwr;
	unsigned long max_pwr;
	int cpu;

	/* Get Cluster energy using EM data for the first CPU */
	cluster_cpus = sched_group_cpus(sg);
	snprintf(str, 32, "CLUSTER[%*pbl]",
		 cpumask_pr_args(cluster_cpus));

	min_pwr = sg->sge->idle_states[sg->sge->nr_idle_states - 1].power;
	max_pwr = sg->sge->cap_states[sg->sge->nr_cap_states - 1].power;
	pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
		str, min_pwr, max_pwr);

	/*
	 * Keep track of this cluster's energy in the computation of the
	 * overall system energy
	 */
	ste->min_power += min_pwr;
	ste->max_power += max_pwr;

	/* Get CPU energy using EM data for each CPU in the group */
	for_each_cpu(cpu, cluster_cpus) {
		/* Get a SD view for the specific CPU */
		for_each_domain(cpu, sd2) {
			/* Get the CPU group */
			sg2 = sd2->groups;
			min_pwr = sg2->sge->idle_states[sg2->sge->nr_idle_states - 1].power;
			max_pwr = sg2->sge->cap_states[sg2->sge->nr_cap_states - 1].power;

			ste->min_power += min_pwr;
			ste->max_power += max_pwr;

			snprintf(str, 32, "CPU[%d]", cpu);
			pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
				str, min_pwr, max_pwr);

			/*
			 * Assume we have EM data only at the CPU and
			 * the upper CLUSTER level
			 */
			BUG_ON(!cpumask_equal(
				sched_group_cpus(sg),
				sched_group_cpus(sd2->parent->groups)
				));

			break;
		}
	}
}
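
/*
 * Note: for each entity the deepest idle state provides the minimum
 * power and the highest capacity state the maximum power, so the sums
 * accumulated in *ste bound the power range of the whole system. This
 * [min_power..max_power] span is what schedtune_init_late() below
 * turns into the normalization constants used by
 * schedtune_normalize_energy().
 */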

/*
 * Initialize the constants required to compute normalized energy.
 * The values of these constants depend on the EM data for the specific
 * target system and topology.
 * Thus, this function is expected to be called by the code
 * that binds the EM to the topology information.
 */
static int
schedtune_init_late(void)
{
	struct target_nrg *ste = &schedtune_target_nrg;
	unsigned long delta_pwr = 0;
	struct sched_domain *sd;
	struct sched_group *sg;

	pr_info("schedtune: init normalization constants...\n");
	ste->max_power = 0;
	ste->min_power = 0;

	rcu_read_lock();

	/*
	 * When EAS is in use, we always have a pointer to the highest SD
	 * which provides EM data.
	 */
	sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask)));
	if (!sd) {
		pr_info("schedtune: no energy model data\n");
		goto nodata;
	}

	sg = sd->groups;
	do {
		schedtune_add_cluster_nrg(sd, sg, ste);
	} while (sg = sg->next, sg != sd->groups);

	rcu_read_unlock();

	pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
		"SYSTEM", ste->min_power, ste->max_power);

	/* Compute normalization constants */
	delta_pwr = ste->max_power - ste->min_power;
	ste->rdiv = reciprocal_value(delta_pwr);
	pr_info("schedtune: using normalization constants mul: %u sh1: %u sh2: %u\n",
		ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2);
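
	/*
	 * Note: reciprocal_value() precomputes a multiplier and two shift
	 * amounts so that the hot-path reciprocal_divide() call in
	 * schedtune_normalize_energy() can replace an integer division by
	 * delta_pwr with a cheaper multiply-and-shift sequence.
	 */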

	schedtune_test_nrg(delta_pwr);
	return 0;

nodata:
	rcu_read_unlock();
	return -EINVAL;
}
late_initcall(schedtune_init_late);