#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/printk.h>
#include <linux/rcupdate.h>
#include <linux/reciprocal_div.h>
#include <linux/slab.h>

#include <trace/events/sched.h>

#include "sched.h"
#include "tune.h"
unsigned int sysctl_sched_cfs_boost __read_mostly;

extern struct target_nrg schedtune_target_nrg;

/* Performance Boost region (B) threshold params */
static int perf_boost_idx;

/* Performance Constraint region (C) threshold params */
static int perf_constrain_idx;
/*
 * Performance-Energy (P-E) Space thresholds constants
 */
struct threshold_params {
        int nrg_gain;
        int cap_gain;
};

/*
 * System specific P-E space thresholds constants
 */
static struct threshold_params
threshold_gains[] = {
        { 0, 4 }, /* >= 10% */
        { 1, 4 }, /* >= 20% */
        { 2, 4 }, /* >= 30% */
        { 3, 4 }, /* >= 40% */
        { 4, 3 }, /* >= 50% */
        { 4, 2 }, /* >= 60% */
        { 4, 1 }, /* >= 70% */
        { 4, 0 }, /* >= 80% */
};
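/*
 * Note: a threshold row is selected from the current boost percentage,
 * boost / 10 for the Boost (B) region and (100 - boost) / 10 for the
 * Constraint (C) region (see sysctl_sched_cfs_boost_handler() below).
 * Each row supplies the nrg_gain/cap_gain weights used by
 * __schedtune_accept_deltas() to weigh an energy delta against a
 * capacity delta.
 */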
static int
__schedtune_accept_deltas(int nrg_delta, int cap_delta,
                          int perf_boost_idx, int perf_constrain_idx)
{
        int payoff = -INT_MAX;

        /* Performance Boost (B) region */
        if (nrg_delta > 0 && cap_delta > 0) {
                /*
                 * Evaluate "Performance Boost" vs "Energy Increase"
                 * payoff criteria:
                 *    cap_delta / nrg_delta < cap_gain / nrg_gain
                 * which is:
                 *    nrg_delta * cap_gain > cap_delta * nrg_gain
                 */
                payoff = nrg_delta * threshold_gains[perf_boost_idx].cap_gain;
                payoff -= cap_delta * threshold_gains[perf_boost_idx].nrg_gain;
                trace_sched_tune_filter(
                                nrg_delta, cap_delta,
                                threshold_gains[perf_boost_idx].nrg_gain,
                                threshold_gains[perf_boost_idx].cap_gain,
                                payoff, 8);
                return payoff;
        }

        /* Performance Constraint (C) region */
        if (nrg_delta < 0 && cap_delta < 0) {
                /*
                 * Evaluate "Energy Savings" vs "Performance Decrease"
                 * payoff criteria:
                 *    cap_delta / nrg_delta > cap_gain / nrg_gain
                 * which is:
                 *    cap_delta * nrg_gain > nrg_delta * cap_gain
                 */
                payoff = cap_delta * threshold_gains[perf_constrain_idx].nrg_gain;
                payoff -= nrg_delta * threshold_gains[perf_constrain_idx].cap_gain;
                trace_sched_tune_filter(
                                nrg_delta, cap_delta,
                                threshold_gains[perf_constrain_idx].nrg_gain,
                                threshold_gains[perf_constrain_idx].cap_gain,
                                payoff, 6);
                return payoff;
        }

        /* Default: reject schedule candidate */
        return payoff;
}
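/*
 * Worked example (illustrative): with threshold_gains[perf_boost_idx] equal
 * to { .nrg_gain = 1, .cap_gain = 4 } (the ">= 20%" row above), a candidate
 * with nrg_delta = 10 and cap_delta = 30 gives
 *
 *     payoff = 10 * 4 - 30 * 1 = 10
 *
 * which is positive, unlike the default reject value of -INT_MAX.
 */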
#ifdef CONFIG_CGROUP_SCHEDTUNE

/*
 * EAS scheduler tunables for task groups.
 */

/* SchedTune tunables for a group of tasks */
struct schedtune {
        /* SchedTune CGroup subsystem */
        struct cgroup_subsys_state css;
        /* Boost group allocated ID */
        int idx;
        /* Boost value for tasks on that SchedTune CGroup */
        int boost;
        /* Performance Boost (B) region threshold params */
        int perf_boost_idx;
        /* Performance Constraint (C) region threshold params */
        int perf_constrain_idx;
};
static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
{
        return css ? container_of(css, struct schedtune, css) : NULL;
}

static inline struct schedtune *task_schedtune(struct task_struct *tsk)
{
        return css_st(task_css(tsk, schedtune_cgrp_id));
}

static inline struct schedtune *parent_st(struct schedtune *st)
{
        return css_st(st->css.parent);
}
/*
 * SchedTune root control group
 * The root control group is used to define the system-wide boost value,
 * which is applied to all tasks in the system.
 * Task specific boost tuning can be specified by creating and
 * configuring a child control group under the root one.
 * By default, system-wide boosting is disabled, i.e. no boosting is applied
 * to tasks which are not in a child control group.
 */
static struct schedtune
root_schedtune = {
        .boost  = 0,
        .perf_boost_idx = 0,
        .perf_constrain_idx = 0,
};
static int
schedtune_accept_deltas(int nrg_delta, int cap_delta,
                        struct task_struct *task)
{
        struct schedtune *ct;
        int perf_boost_idx;
        int perf_constrain_idx;

        /* Optimal (O) region */
        if (nrg_delta < 0 && cap_delta > 0) {
                trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
                return INT_MAX;
        }

        /* Suboptimal (S) region */
        if (nrg_delta > 0 && cap_delta < 0) {
                trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
                return -INT_MAX;
        }

        /* Get task specific perf Boost/Constraints indexes */
        rcu_read_lock();
        ct = task_schedtune(task);
        perf_boost_idx = ct->perf_boost_idx;
        perf_constrain_idx = ct->perf_constrain_idx;
        rcu_read_unlock();

        return __schedtune_accept_deltas(nrg_delta, cap_delta,
                        perf_boost_idx, perf_constrain_idx);
}
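/*
 * Summary of the Performance-Energy space filtering above:
 * - Optimal (O) region: energy decreases while capacity increases, the
 *   candidate is accepted without consulting the thresholds.
 * - Suboptimal (S) region: energy increases while capacity decreases, the
 *   candidate is rejected without consulting the thresholds.
 * - Boost (B) and Constraint (C) regions: both deltas have the same sign,
 *   so the decision is delegated to __schedtune_accept_deltas() using the
 *   boost group specific threshold indexes.
 */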
/*
 * Maximum number of boost groups to support
 * When per-task boosting is used we still allow only a limited number of
 * boost groups for two main reasons:
 * 1. on a real system we usually have only a few classes of workloads which
 *    make sense to boost with different values (e.g. background vs foreground
 *    tasks, interactive vs low-priority tasks)
 * 2. a limited number allows for a simpler and more memory/time efficient
 *    implementation especially for the computation of the per-CPU boost
 *    value
 */
#define BOOSTGROUPS_COUNT 4
/* Array of configured boostgroups */
static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
        &root_schedtune,
        NULL,
};
/* SchedTune boost groups
 * Keep track of all the boost groups which impact a CPU, for example when a
 * CPU has two RUNNABLE tasks belonging to two different boost groups and thus
 * likely with different boost values.
 * Since on each system we expect only a limited number of boost groups, here
 * we use a simple array to keep track of the metrics required to compute the
 * maximum per-CPU boosting value.
 */
struct boost_groups {
        /* Maximum boost value for all RUNNABLE tasks on a CPU */
        int boost_max;
        struct {
                /* The boost for tasks on that boost group */
                int boost;
                /* Count of RUNNABLE tasks on that boost group */
                unsigned tasks;
        } group[BOOSTGROUPS_COUNT];
};

/* Boost groups affecting each CPU in the system */
DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
static void
schedtune_cpu_update(int cpu)
{
        struct boost_groups *bg;
        int boost_max;
        int idx;

        bg = &per_cpu(cpu_boost_groups, cpu);

        /* The root boost group is always active */
        boost_max = bg->group[0].boost;
        for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
                /*
                 * A boost group affects a CPU only if it has
                 * RUNNABLE tasks on that CPU
                 */
                if (bg->group[idx].tasks == 0)
                        continue;
                boost_max = max(boost_max, bg->group[idx].boost);
        }

        /*
         * Ensure boost_max is non-negative when all cgroup boost values
         * are negative. This avoids under-accounting of CPU capacity,
         * which may cause task stacking and frequency spikes.
         */
        boost_max = max(boost_max, 0);
        bg->boost_max = boost_max;
}
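/*
 * Example (illustrative): if the root group has boost 0, boost group 1 has
 * boost 10 with two RUNNABLE tasks on this CPU and boost group 2 has boost
 * 50 with no RUNNABLE tasks, then boost_max becomes 10: group 2 is ignored
 * because it has no runnable tasks here, and the result is already
 * non-negative so the final clamp has no effect.
 */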
static int
schedtune_boostgroup_update(int idx, int boost)
{
        struct boost_groups *bg;
        int cur_boost_max;
        int old_boost;
        int cpu;

        /* Update per CPU boost groups */
        for_each_possible_cpu(cpu) {
                bg = &per_cpu(cpu_boost_groups, cpu);

                /*
                 * Keep track of current boost values to compute the per CPU
                 * maximum only when it has been affected by the new value of
                 * the updated boost group
                 */
                cur_boost_max = bg->boost_max;
                old_boost = bg->group[idx].boost;

                /* Update the boost value of this boost group */
                bg->group[idx].boost = boost;

                /* Check if this update increases the current max */
                if (boost > cur_boost_max && bg->group[idx].tasks) {
                        bg->boost_max = boost;
                        trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max);
                        continue;
                }

                /* Check if this update has decreased the current max */
                if (cur_boost_max == old_boost && old_boost > boost) {
                        schedtune_cpu_update(cpu);
                        trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max);
                        continue;
                }

                trace_sched_tune_boostgroup_update(cpu, 0, bg->boost_max);
        }

        return 0;
}
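/*
 * The three trace cases above correspond to:
 *  1: the new boost value raises this CPU's boost_max directly (the group
 *     has RUNNABLE tasks and the value exceeds the current maximum);
 * -1: the updated group was defining the current maximum and its value
 *     decreased, so boost_max is recomputed via schedtune_cpu_update();
 *  0: the update cannot change boost_max, which is left untouched.
 */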
static void
schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
{
        struct boost_groups *bg;
        int tasks;

        bg = &per_cpu(cpu_boost_groups, cpu);

        /* Update the boosted tasks count while avoiding making it negative */
        if (task_count < 0 && bg->group[idx].tasks <= -task_count)
                bg->group[idx].tasks = 0;
        else
                bg->group[idx].tasks += task_count;

        /* Boost group activation or deactivation on that RQ */
        tasks = bg->group[idx].tasks;
        if (tasks == 1 || tasks == 0)
                schedtune_cpu_update(cpu);

        trace_sched_tune_tasks_update(p, cpu, tasks, idx,
                        bg->group[idx].boost, bg->boost_max);
}
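/*
 * schedtune_cpu_update() is invoked only on activation (tasks == 1, the
 * group starts affecting this CPU) or deactivation (tasks == 0, the group
 * stops affecting it); intermediate counts cannot change boost_max.
 */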
/*
 * NOTE: This function must be called while holding the lock on the CPU RQ
 */
void schedtune_enqueue_task(struct task_struct *p, int cpu)
{
        struct schedtune *st;
        int idx;
        /*
         * When a task is marked PF_EXITING by do_exit() it's going to be
         * dequeued and enqueued multiple times in the exit path.
         * Thus we avoid any further update, since we do not want to change
         * CPU boosting while the task is exiting.
         */
        if (p->flags & PF_EXITING)
                return;
        /* Get task boost group */
        rcu_read_lock();
        st = task_schedtune(p);
        idx = st->idx;
        rcu_read_unlock();
        schedtune_tasks_update(p, cpu, idx, 1);
}
/*
 * NOTE: This function must be called while holding the lock on the CPU RQ
 */
void schedtune_dequeue_task(struct task_struct *p, int cpu)
{
        struct schedtune *st;
        int idx;
        /*
         * When a task is marked PF_EXITING by do_exit() it's going to be
         * dequeued and enqueued multiple times in the exit path.
         * Thus we avoid any further update, since we do not want to change
         * CPU boosting while the task is exiting.
         * The last dequeue will be done by the cgroup exit() callback.
         */
        if (p->flags & PF_EXITING)
                return;
        /* Get task boost group */
        rcu_read_lock();
        st = task_schedtune(p);
        idx = st->idx;
        rcu_read_unlock();
        schedtune_tasks_update(p, cpu, idx, -1);
}
int schedtune_cpu_boost(int cpu)
{
        struct boost_groups *bg;

        bg = &per_cpu(cpu_boost_groups, cpu);
        return bg->boost_max;
}

int schedtune_task_boost(struct task_struct *p)
{
        struct schedtune *st;
        int task_boost;

        /* Get task boost value */
        rcu_read_lock();
        st = task_schedtune(p);
        task_boost = st->boost;
        rcu_read_unlock();

        return task_boost;
}
static s64
boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
        struct schedtune *st = css_st(css);

        return st->boost;
}
static int
boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
            s64 boost)
{
        struct schedtune *st = css_st(css);
        unsigned threshold_idx;

        if (boost < -100 || boost > 100)
                return -EINVAL;

        st->boost = boost;
        if (css == &root_schedtune.css)
                sysctl_sched_cfs_boost = boost;

        /* Update CPU boost */
        schedtune_boostgroup_update(st->idx, st->boost);

        trace_sched_tune_config(st->boost);
        return 0;
}
static struct cftype files[] = {
        {
                .name = "boost",
                .read_s64 = boost_read,
                .write_s64 = boost_write,
        },
        { }     /* terminate */
};
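/*
 * Usage example (illustrative, assuming the schedtune controller is mounted
 * at /sys/fs/cgroup/schedtune and a child group "foreground" exists):
 *
 *   echo 10 > /sys/fs/cgroup/schedtune/foreground/schedtune.boost
 *
 * Accepted values are in the range [-100, 100]; writes to the root group
 * also update the system-wide sysctl_sched_cfs_boost.
 */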
static int
schedtune_boostgroup_init(struct schedtune *st)
{
        struct boost_groups *bg;
        int cpu;

        /* Keep track of allocated boost groups */
        allocated_group[st->idx] = st;

        /* Initialize the per CPU boost groups */
        for_each_possible_cpu(cpu) {
                bg = &per_cpu(cpu_boost_groups, cpu);
                bg->group[st->idx].boost = 0;
                bg->group[st->idx].tasks = 0;
        }

        return 0;
}
static int
schedtune_init(void)
{
        struct boost_groups *bg;
        int cpu;

        /* Initialize the per CPU boost groups */
        for_each_possible_cpu(cpu) {
                bg = &per_cpu(cpu_boost_groups, cpu);
                memset(bg, 0, sizeof(struct boost_groups));
        }

        pr_info(" schedtune configured to support %d boost groups\n",
                BOOSTGROUPS_COUNT);
        return 0;
}
static struct cgroup_subsys_state *
schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct schedtune *st;
        int idx;

        if (!parent_css) {
                schedtune_init();
                return &root_schedtune.css;
        }

        /* Allow only single level hierarchies */
        if (parent_css != &root_schedtune.css) {
                pr_err("Nested SchedTune boosting groups not allowed\n");
                return ERR_PTR(-ENOMEM);
        }

        /* Allow only a limited number of boosting groups */
        for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx)
                if (!allocated_group[idx])
                        break;
        if (idx == BOOSTGROUPS_COUNT) {
                pr_err("Trying to create more than %d SchedTune boosting groups\n",
                       BOOSTGROUPS_COUNT);
                return ERR_PTR(-ENOSPC);
        }

        st = kzalloc(sizeof(*st), GFP_KERNEL);
        if (!st)
                goto out;

        /* Initialize per CPUs boost group support */
        st->idx = idx;
        if (schedtune_boostgroup_init(st))
                goto release;

        return &st->css;

release:
        kfree(st);
out:
        return ERR_PTR(-ENOMEM);
}
static void
schedtune_boostgroup_release(struct schedtune *st)
{
        /* Reset this boost group */
        schedtune_boostgroup_update(st->idx, 0);

        /* Keep track of allocated boost groups */
        allocated_group[st->idx] = NULL;
}

static void
schedtune_css_free(struct cgroup_subsys_state *css)
{
        struct schedtune *st = css_st(css);

        schedtune_boostgroup_release(st);
        kfree(st);
}
struct cgroup_subsys schedtune_cgrp_subsys = {
        .css_alloc      = schedtune_css_alloc,
        .css_free       = schedtune_css_free,
        .legacy_cftypes = files,
};

#else /* CONFIG_CGROUP_SCHEDTUNE */
static int
schedtune_accept_deltas(int nrg_delta, int cap_delta,
                        struct task_struct *task)
{
        /* Optimal (O) region */
        if (nrg_delta < 0 && cap_delta > 0) {
                trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
                return INT_MAX;
        }

        /* Suboptimal (S) region */
        if (nrg_delta > 0 && cap_delta < 0) {
                trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
                return -INT_MAX;
        }

        return __schedtune_accept_deltas(nrg_delta, cap_delta,
                        perf_boost_idx, perf_constrain_idx);
}

#endif /* CONFIG_CGROUP_SCHEDTUNE */
int
sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
                               void __user *buffer, size_t *lenp,
                               loff_t *ppos)
{
        int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

        if (ret || !write)
                return ret;

        /* Performance Boost (B) region threshold params */
        perf_boost_idx = sysctl_sched_cfs_boost;
        perf_boost_idx /= 10;

        /* Performance Constraint (C) region threshold params */
        perf_constrain_idx = 100 - sysctl_sched_cfs_boost;
        perf_constrain_idx /= 10;

        return 0;
}
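/*
 * Example (illustrative): writing 25 to the sysctl gives
 *   perf_boost_idx     = 25 / 10         = 2
 *   perf_constrain_idx = (100 - 25) / 10 = 7
 * selecting threshold_gains[2] for the B region and threshold_gains[7]
 * for the C region.
 */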
#ifdef CONFIG_SCHED_DEBUG
static void
schedtune_test_nrg(unsigned long delta_pwr)
{
        unsigned long test_delta_pwr;
        unsigned long test_norm_pwr;
        int idx;

        /*
         * Check normalization constants using some constant system
         * power values
         */
        pr_info("schedtune: verify normalization constants...\n");
        for (idx = 0; idx < 6; ++idx) {
                test_delta_pwr = delta_pwr >> idx;

                /* Normalize on max energy for target platform */
                test_norm_pwr = reciprocal_divide(
                        test_delta_pwr << SCHED_LOAD_SHIFT,
                        schedtune_target_nrg.rdiv);

                pr_info("schedtune: max_pwr/2^%d: %4lu => norm_pwr: %5lu\n",
                        idx, test_delta_pwr, test_norm_pwr);
        }
}
#else
#define schedtune_test_nrg(delta_pwr)
#endif
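/*
 * The debug check above exercises the normalization used by schedtune:
 * an absolute energy delta is scaled as
 *
 *     norm_pwr = (delta_pwr << SCHED_LOAD_SHIFT) / (max_power - min_power)
 *
 * with the division implemented through the precomputed reciprocal
 * schedtune_target_nrg.rdiv (set up in schedtune_init_late() below).
 */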
/*
 * Compute the min/max power consumption of a cluster and all its CPUs
 */
static void
schedtune_add_cluster_nrg(
                struct sched_domain *sd,
                struct sched_group *sg,
                struct target_nrg *ste)
{
        struct sched_domain *sd2;
        struct sched_group *sg2;

        struct cpumask *cluster_cpus;
        char str[32];

        unsigned long min_pwr;
        unsigned long max_pwr;
        int cpu;

        /* Get Cluster energy using EM data for the first CPU */
        cluster_cpus = sched_group_cpus(sg);
        snprintf(str, 32, "CLUSTER[%*pbl]",
                 cpumask_pr_args(cluster_cpus));

        min_pwr = sg->sge->idle_states[sg->sge->nr_idle_states - 1].power;
        max_pwr = sg->sge->cap_states[sg->sge->nr_cap_states - 1].power;
        pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
                str, min_pwr, max_pwr);

        /*
         * Keep track of this cluster's energy in the computation of the
         * overall system energy
         */
        ste->min_power += min_pwr;
        ste->max_power += max_pwr;

        /* Get CPU energy using EM data for each CPU in the group */
        for_each_cpu(cpu, cluster_cpus) {
                /* Get a SD view for the specific CPU */
                for_each_domain(cpu, sd2) {
                        /* Get the CPU group */
                        sg2 = sd2->groups;
                        min_pwr = sg2->sge->idle_states[sg2->sge->nr_idle_states - 1].power;
                        max_pwr = sg2->sge->cap_states[sg2->sge->nr_cap_states - 1].power;

                        ste->min_power += min_pwr;
                        ste->max_power += max_pwr;

                        snprintf(str, 32, "CPU[%d]", cpu);
                        pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
                                str, min_pwr, max_pwr);

                        /*
                         * Assume we have EM data only at the CPU and
                         * the upper CLUSTER level
                         */
                        BUG_ON(!cpumask_equal(
                                sched_group_cpus(sg),
                                sched_group_cpus(sd2->parent->groups)
                                ));
                        break;
                }
        }
}
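/*
 * For each cluster, min_power accumulates the power of the last (deepest)
 * idle state and max_power the power of the last (highest) capacity state,
 * first for the cluster-level energy model data and then for every CPU in
 * the cluster. The resulting system-wide totals are the bounds later used
 * to normalize energy deltas.
 */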
/*
 * Initialize the constants required to compute normalized energy.
 * The values of these constants depend on the EM data for the specific
 * target system and topology.
 * Thus, this function is expected to be called by the code
 * that binds the EM to the topology information.
 */
static int
schedtune_init_late(void)
{
        struct target_nrg *ste = &schedtune_target_nrg;
        unsigned long delta_pwr = 0;
        struct sched_domain *sd;
        struct sched_group *sg;

        pr_info("schedtune: init normalization constants...\n");

        rcu_read_lock();

        /*
         * When EAS is in use, we always have a pointer to the highest SD
         * which provides EM data.
         */
        sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask)));
        if (!sd) {
                pr_info("schedtune: no energy model data\n");
                rcu_read_unlock();
                return -EINVAL;
        }

        sg = sd->groups;
        do {
                schedtune_add_cluster_nrg(sd, sg, ste);
        } while (sg = sg->next, sg != sd->groups);
        rcu_read_unlock();

        pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
                "SYSTEM", ste->min_power, ste->max_power);

        /* Compute normalization constants */
        delta_pwr = ste->max_power - ste->min_power;
        ste->rdiv = reciprocal_value(delta_pwr);
        pr_info("schedtune: using normalization constants mul: %u sh1: %u sh2: %u\n",
                ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2);

        schedtune_test_nrg(delta_pwr);

        return 0;
}
late_initcall(schedtune_init_late);
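/*
 * Example (illustrative): if the energy model reports min_power = 50 and
 * max_power = 1050, then delta_pwr = 1000 and, assuming SCHED_LOAD_SHIFT
 * is 10, an energy delta of 250 normalizes to roughly
 * (250 << 10) / 1000 ~= 256.
 */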