1 #include <linux/cgroup.h>
3 #include <linux/kernel.h>
4 #include <linux/percpu.h>
5 #include <linux/printk.h>
6 #include <linux/rcupdate.h>
7 #include <linux/slab.h>
9 #include <trace/events/sched.h>
#ifdef CONFIG_CGROUP_SCHEDTUNE
/* Set once cgroup support has been fully initialized (see
 * schedtune_init_cgroups()); the enqueue/dequeue/attach fast paths
 * bail out until then. */
static bool schedtune_initialized = false;

/* System-wide boost value, range-checked to [-100..100] by the sysctl
 * handler and by boost_write(). */
unsigned int sysctl_sched_cfs_boost __read_mostly;

/* Energy normalization constants, defined by the topology/EM binding code. */
extern struct target_nrg schedtune_target_nrg;

/* Performance Boost region (B) threshold params */
static int perf_boost_idx;

/* Performance Constraint region (C) threshold params */
static int perf_constrain_idx;
/*
 * Performance-Energy (P-E) Space thresholds constants
 */
struct threshold_params {
/*
 * System specific P-E space thresholds constants
 * NOTE(review): the struct members and most of the gains table are on
 * lines not visible in this chunk; the visible entry suggests
 * { nrg_gain, cap_gain } pairs indexed by boost decile.
 */
static struct threshold_params
	{ 5, 0 } /* <= 100% */
/*
 * Compute a payoff for a candidate scheduling decision from its energy
 * delta and capacity delta, using the gains table row selected by the
 * boost (B region) or constraint (C region) index.  A positive payoff
 * represents the accept condition.
 * NOTE(review): the return type, the gain_idx declaration and the early
 * return paths are on lines not visible in this chunk.
 */
__schedtune_accept_deltas(int nrg_delta, int cap_delta,
			  int perf_boost_idx, int perf_constrain_idx)
	int payoff = -INT_MAX;

	/* Performance Boost (B) region */
	if (nrg_delta >= 0 && cap_delta > 0)
		gain_idx = perf_boost_idx;
	/* Performance Constraint (C) region */
	else if (nrg_delta < 0 && cap_delta <= 0)
		gain_idx = perf_constrain_idx;

	/* Default: reject schedule candidate */

	/*
	 * Evaluate "Performance Boost" vs "Energy Increase"
	 *
	 * - Performance Boost (B) region
	 *
	 *   Condition: nrg_delta > 0 && cap_delta > 0
	 *
	 *     cap_gain / nrg_gain  < cap_delta / nrg_delta =
	 *     cap_gain * nrg_delta < cap_delta * nrg_gain
	 *   Note that since both nrg_gain and nrg_delta are positive, the
	 *   inequality does not change. Thus:
	 *
	 *     payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta)
	 *
	 * - Performance Constraint (C) region
	 *
	 *   Condition: nrg_delta < 0 && cap_delta < 0
	 *
	 *     cap_gain / nrg_gain  > cap_delta / nrg_delta =
	 *     cap_gain * nrg_delta < cap_delta * nrg_gain
	 *   Note that since nrg_gain > 0 while nrg_delta < 0, the
	 *   inequality change. Thus:
	 *
	 *     payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta)
	 *
	 * This means that, in case of same positive defined {cap,nrg}_gain
	 * for both the B and C regions, we can use the same payoff formula
	 * where a positive value represents the accept condition.
	 */
	payoff = cap_delta * threshold_gains[gain_idx].nrg_gain;
	payoff -= nrg_delta * threshold_gains[gain_idx].cap_gain;
#ifdef CONFIG_CGROUP_SCHEDTUNE

/*
 * EAS scheduler tunables for task groups.
 */

/* SchedTune tunables for a group of tasks */
	/* SchedTune CGroup subsystem */
	struct cgroup_subsys_state css;

	/* Boost group allocated ID */

	/* Boost value for tasks on that SchedTune CGroup */

	/* Performance Boost (B) region threshold params */

	/* Performance Constraint (C) region threshold params */
	int perf_constrain_idx;

	/*
	 * Hint to bias scheduling of tasks on that SchedTune CGroup
	 * towards idle CPUs.
	 * NOTE(review): the "struct schedtune {" opener and the idx, boost
	 * and prefer_idle members referenced elsewhere in this file are on
	 * lines not visible in this chunk.
	 */
134 static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
136 return css ? container_of(css, struct schedtune, css) : NULL;
139 static inline struct schedtune *task_schedtune(struct task_struct *tsk)
141 return css_st(task_css(tsk, schedtune_cgrp_id));
144 static inline struct schedtune *parent_st(struct schedtune *st)
146 return css_st(st->css.parent);
/*
 * SchedTune root control group
 * The root control group is used to define a system-wide boosting tuning,
 * which is applied to all tasks in the system.
 * Task specific boost tuning could be specified by creating and
 * configuring a child control group under the root one.
 * By default, system-wide boosting is disabled, i.e. no boosting is applied
 * to tasks which are not into a child control group.
 * NOTE(review): the initializer is only partially visible in this chunk.
 */
static struct schedtune
	.perf_constrain_idx = 0,
/*
 * Accept/reject a candidate schedule for @task: trivially accept the
 * Optimal region (less energy, more capacity), trivially reject the
 * Suboptimal region (more energy, less capacity), otherwise evaluate the
 * payoff with the task group's boost/constraint thresholds.
 * NOTE(review): the return type, early return statements, the local
 * perf_boost_idx declaration and any RCU read-side protection around
 * task_schedtune() are on lines not visible in this chunk.
 */
schedtune_accept_deltas(int nrg_delta, int cap_delta,
			struct task_struct *task)
	struct schedtune *ct;
	int perf_constrain_idx;

	/* Optimal (O) region */
	if (nrg_delta < 0 && cap_delta > 0) {
		trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);

	/* Suboptimal (S) region */
	if (nrg_delta > 0 && cap_delta < 0) {
		trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);

	/* Get task specific perf Boost/Constraints indexes */
	ct = task_schedtune(task);
	perf_boost_idx = ct->perf_boost_idx;
	perf_constrain_idx = ct->perf_constrain_idx;

	return __schedtune_accept_deltas(nrg_delta, cap_delta,
					 perf_boost_idx, perf_constrain_idx);
/*
 * Maximum number of boost groups to support
 * When per-task boosting is used we still allow only limited number of
 * boost groups for two main reasons:
 * 1. on a real system we usually have only few classes of workloads which
 *    make sense to boost with different values (e.g. background vs foreground
 *    tasks, interactive vs low-priority tasks)
 * 2. a limited number allows for a simpler and more memory/time efficient
 *    implementation especially for the computation of the per-CPU boost
 *    value
 */
#define BOOSTGROUPS_COUNT 4

/* Array of configured boostgroups; slot 0 is reserved for the root group,
 * remaining slots are claimed by schedtune_css_alloc(). */
static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
/* SchedTune boost groups
 * Keep track of all the boost groups which impact on CPU, for example when a
 * CPU has two RUNNABLE tasks belonging to two different boost groups and thus
 * likely with different boost values.
 * Since on each system we expect only a limited number of boost groups, here
 * we use a simple array to keep track of the metrics required to compute the
 * maximum per-CPU boosting value.
 */
struct boost_groups {
	/* Maximum boost value for all RUNNABLE tasks on a CPU */
		/* The boost for tasks on that boost group */
		/* Count of RUNNABLE tasks on that boost group */
	} group[BOOSTGROUPS_COUNT];
	/* CPU's boost group locking */

/* Boost groups affecting each CPU in the system */
DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
/*
 * Recompute the cached per-CPU boost_max as the maximum boost among the
 * root group and every boost group with RUNNABLE tasks on @cpu.
 * NOTE(review): the return type, local declarations (boost_max, idx) and
 * the "continue" for empty groups are on lines not visible in this chunk.
 */
schedtune_cpu_update(int cpu)
	struct boost_groups *bg;

	bg = &per_cpu(cpu_boost_groups, cpu);

	/* The root boost group is always active */
	boost_max = bg->group[0].boost;
	for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
		/*
		 * A boost group affects a CPU only if it has
		 * RUNNABLE tasks on that CPU
		 */
		if (bg->group[idx].tasks == 0)

		boost_max = max(boost_max, bg->group[idx].boost);
	/*
	 * Ensures boost_max is non-negative when all cgroup boost values
	 * are negative. Avoids under-accounting of cpu capacity which may
	 * cause task stacking and frequency spikes.
	 */
	boost_max = max(boost_max, 0);
	bg->boost_max = boost_max;
/*
 * Propagate a new boost value for boost group @idx to every CPU's cached
 * state, recomputing the per-CPU maximum only when this group could have
 * affected it.
 * NOTE(review): the return type and local declarations (cpu, cur_boost_max,
 * old_boost) are on lines not visible in this chunk.
 */
schedtune_boostgroup_update(int idx, int boost)
	struct boost_groups *bg;

	/* Update per CPU boost groups */
	for_each_possible_cpu(cpu) {
		bg = &per_cpu(cpu_boost_groups, cpu);

		/*
		 * Keep track of current boost values to compute the per CPU
		 * maximum only when it has been affected by the new value of
		 * the updated boost group
		 */
		cur_boost_max = bg->boost_max;
		old_boost = bg->group[idx].boost;

		/* Update the boost value of this boost group */
		bg->group[idx].boost = boost;

		/* Check if this update increases current max */
		if (boost > cur_boost_max && bg->group[idx].tasks) {
			bg->boost_max = boost;
			trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max);

		/* Check if this update has decreased current max */
		if (cur_boost_max == old_boost && old_boost > boost) {
			schedtune_cpu_update(cpu);
			trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max);

		trace_sched_tune_boostgroup_update(cpu, 0, bg->boost_max);
/* Deltas applied to a boost group's RUNNABLE-task counter. */
#define ENQUEUE_TASK 1
#define DEQUEUE_TASK -1

/*
 * Adjust the RUNNABLE-task count of boost group @idx on @cpu by
 * @task_count (ENQUEUE_TASK or DEQUEUE_TASK), clamping at zero, and
 * refresh the CPU's boost_max when the group becomes active (1 task)
 * or inactive (0 tasks).
 */
schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
	int tasks = bg->group[idx].tasks + task_count;

	/* Update boosted tasks count while avoiding to make it negative */
	bg->group[idx].tasks = max(0, tasks);

	trace_sched_tune_tasks_update(p, cpu, tasks, idx,
				      bg->group[idx].boost, bg->boost_max);

	/* Boost group activation or deactivation on that RQ */
	if (tasks == 1 || tasks == 0)
		schedtune_cpu_update(cpu);
/*
 * Account @p as RUNNABLE in its boost group on @cpu.
 * NOTE: This function must be called while holding the lock on the CPU RQ
 * NOTE(review): the early returns, the idx assignment from st, and any RCU
 * read-side protection around task_schedtune() are on lines not visible in
 * this chunk.
 */
void schedtune_enqueue_task(struct task_struct *p, int cpu)
	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
	unsigned long irq_flags;
	struct schedtune *st;

	/*
	 * NOTE(review): "!unlikely(x)" is functionally "!x" but places the
	 * branch hint on the wrong condition; unlikely(!schedtune_initialized)
	 * expresses the apparent intent — confirm against upstream.
	 */
	if (!unlikely(schedtune_initialized))

	/*
	 * When a task is marked PF_EXITING by do_exit() it's going to be
	 * dequeued and enqueued multiple times in the exit path.
	 * Thus we avoid any further update, since we do not want to change
	 * CPU boosting while the task is exiting.
	 */
	if (p->flags & PF_EXITING)

	/*
	 * Boost group accounting is protected by a per-cpu lock and requires
	 * interrupt to be disabled to avoid race conditions for example on
	 * do_exit()::cgroup_exit() and task migration.
	 */
	raw_spin_lock_irqsave(&bg->lock, irq_flags);

	st = task_schedtune(p);

	schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK);

	raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
/*
 * cgroup attach-permission hook: SchedTune accepts every task migration.
 * NOTE(review): the return statement is on a line not visible in this chunk.
 */
int schedtune_allow_attach(struct cgroup_taskset *tset)
	/* We always allow tasks to be moved between existing CGroups */
/*
 * Move each task in @tset from its source boost group to the destination
 * group's accounting, fixing up the per-CPU RUNNABLE counters so boosting
 * follows the task.  Runs under the task's RQ lock plus the per-CPU boost
 * group lock.
 * NOTE(review): several lines (locals rq/cpu/tasks, the "continue" paths
 * after the unlocks, and the final return) are not visible in this chunk.
 */
int schedtune_can_attach(struct cgroup_taskset *tset)
	struct task_struct *task;
	struct cgroup_subsys_state *css;
	struct boost_groups *bg;
	unsigned long irq_flags;

	int src_bg; /* Source boost group index */
	int dst_bg; /* Destination boost group index */

	/*
	 * NOTE(review): "!unlikely(x)" places the branch hint on the wrong
	 * condition; unlikely(!schedtune_initialized) reads as intended.
	 */
	if (!unlikely(schedtune_initialized))

	cgroup_taskset_for_each(task, css, tset) {

		/*
		 * Lock the CPU's RQ the task is enqueued to avoid race
		 * conditions with migration code while the task is being
		 * moved to a different boost group.
		 */
		rq = lock_rq_of(task, &irq_flags);

			unlock_rq_of(rq, task, &irq_flags);

		/*
		 * Boost group accounting is protected by a per-cpu lock and
		 * requires interrupt to be disabled to avoid race conditions
		 * on exit and migration paths.
		 */
		bg = &per_cpu(cpu_boost_groups, cpu);
		raw_spin_lock(&bg->lock);

		dst_bg = css_st(css)->idx;
		src_bg = task_schedtune(task)->idx;

		/*
		 * Current task is not changing boostgroup, which can
		 * happen when the new hierarchy is in use.
		 */
		if (unlikely(dst_bg == src_bg)) {
			raw_spin_unlock(&bg->lock);
			unlock_rq_of(rq, task, &irq_flags);

		/*
		 * This is the case of a RUNNABLE task which is switching its
		 * current boost group.
		 */

		/* Move task from src to dst boost group */
		tasks = bg->group[src_bg].tasks - 1;
		bg->group[src_bg].tasks = max(0, tasks);
		bg->group[dst_bg].tasks += 1;

		raw_spin_unlock(&bg->lock);
		unlock_rq_of(rq, task, &irq_flags);

		/* Update CPU boost group */
		if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1)
			schedtune_cpu_update(task_cpu(task));
/* cgroup attach-rollback hook: intentionally unimplemented (see below). */
void schedtune_cancel_attach(struct cgroup_taskset *tset)
	/*
	 * This can happen only if SchedTune controller is mounted with
	 * other hierarchies and one of them fails. Since usually SchedTune is
	 * mounted on its own hierarchy, for the time being we do not implement
	 * a proper rollback mechanism.
	 */
	WARN(1, "SchedTune cancel attach not implemented");
/*
 * Remove @p from its boost group's RUNNABLE accounting on @cpu.
 * NOTE: This function must be called while holding the lock on the CPU RQ
 * NOTE(review): the early returns, the idx assignment from st, and any RCU
 * protection around task_schedtune() are on lines not visible in this chunk.
 */
void schedtune_dequeue_task(struct task_struct *p, int cpu)
	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
	unsigned long irq_flags;
	struct schedtune *st;

	/* NOTE(review): same misplaced-hint pattern as in enqueue;
	 * unlikely(!schedtune_initialized) reads as intended. */
	if (!unlikely(schedtune_initialized))

	/*
	 * When a task is marked PF_EXITING by do_exit() it's going to be
	 * dequeued and enqueued multiple times in the exit path.
	 * Thus we avoid any further update, since we do not want to change
	 * CPU boosting while the task is exiting.
	 * The last dequeue is already enforced by the do_exit() code path
	 * via schedtune_exit_task().
	 */
	if (p->flags & PF_EXITING)

	/*
	 * Boost group accounting is protected by a per-cpu lock and requires
	 * interrupt to be disabled to avoid race conditions on exit and
	 * migration paths.
	 */
	raw_spin_lock_irqsave(&bg->lock, irq_flags);

	st = task_schedtune(p);

	schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK);

	raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
/*
 * Final boost-group dequeue for an exiting task, invoked from the
 * do_exit() path; performs the last DEQUEUE_TASK accounting under the
 * task's RQ lock.
 * NOTE(review): the early return, the cpu/idx locals and the per-cpu
 * boost-group locking are on lines not visible in this chunk.
 */
void schedtune_exit_task(struct task_struct *tsk)
	struct schedtune *st;
	unsigned long irq_flags;

	if (!unlikely(schedtune_initialized))

	rq = lock_rq_of(tsk, &irq_flags);

	st = task_schedtune(tsk);

	schedtune_tasks_update(tsk, cpu, idx, DEQUEUE_TASK);

	unlock_rq_of(rq, tsk, &irq_flags);
522 int schedtune_cpu_boost(int cpu)
524 struct boost_groups *bg;
526 bg = &per_cpu(cpu_boost_groups, cpu);
527 return bg->boost_max;
/*
 * Return the boost value of the schedtune group @p belongs to.
 * NOTE(review): the task_boost local declaration, the return statement and
 * any RCU protection around task_schedtune() are on lines not visible in
 * this chunk.
 */
int schedtune_task_boost(struct task_struct *p)
	struct schedtune *st;

	/* Get task boost value */
	st = task_schedtune(p);
	task_boost = st->boost;
/*
 * Return the prefer_idle hint of the schedtune group @p belongs to.
 * NOTE(review): the prefer_idle local declaration, the return statement and
 * any RCU protection around task_schedtune() are on lines not visible in
 * this chunk.
 */
int schedtune_prefer_idle(struct task_struct *p)
	struct schedtune *st;

	/* Get prefer_idle value */
	st = task_schedtune(p);
	prefer_idle = st->prefer_idle;
/*
 * cftype read handler for "prefer_idle" (registered as .read_u64 below).
 * NOTE(review): the "static u64" return-type line is not visible here.
 */
prefer_idle_read(struct cgroup_subsys_state *css, struct cftype *cft)
	struct schedtune *st = css_st(css);

	return st->prefer_idle;
/*
 * cftype write handler for "prefer_idle" (registered as .write_u64 below).
 * NOTE(review): the return-type line, the u64 prefer_idle parameter line
 * and the return statement are not visible here.
 */
prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft,
	struct schedtune *st = css_st(css);
	st->prefer_idle = prefer_idle;
/*
 * cftype read handler for "boost" (registered as .read_s64 below).
 * NOTE(review): the return-type line and the "return st->boost" line are
 * not visible here.
 */
boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
	struct schedtune *st = css_st(css);
/*
 * cftype write handler for "boost": validates the new value to
 * [-100..100], derives the B/C threshold indexes from the boost decile,
 * mirrors the value into the global sysctl when writing the root group,
 * then propagates the change to the per-CPU boost groups.
 * NOTE(review): the return-type line, the s64 boost parameter, the
 * boost_pct local, the -EINVAL return and closing braces are on lines not
 * visible in this chunk.
 */
boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
	struct schedtune *st = css_st(css);
	unsigned threshold_idx;

	if (boost < -100 || boost > 100)

	/*
	 * Update threshold params for Performance Boost (B)
	 * and Performance Constraint (C) regions.
	 * The current implementation uses the same cuts for both.
	 */
	threshold_idx = clamp(boost_pct, 0, 99) / 10;
	st->perf_boost_idx = threshold_idx;
	st->perf_constrain_idx = threshold_idx;

	/* The root group's boost also drives the system-wide defaults */
	if (css == &root_schedtune.css) {
		sysctl_sched_cfs_boost = boost;
		perf_boost_idx = threshold_idx;
		perf_constrain_idx = threshold_idx;

	/* Update CPU boost */
	schedtune_boostgroup_update(st->idx, st->boost);

	trace_sched_tune_config(st->boost);
/*
 * SchedTune cgroup attribute files: "boost" (signed) and "prefer_idle"
 * (unsigned).
 * NOTE(review): the ".name = \"boost\"" line, entry braces and the
 * terminating empty entry are on lines not visible in this chunk.
 */
static struct cftype files[] = {
		.read_s64 = boost_read,
		.write_s64 = boost_write,
		.name = "prefer_idle",
		.read_u64 = prefer_idle_read,
		.write_u64 = prefer_idle_write,
/*
 * Register a newly allocated boost group: record it in allocated_group[]
 * and zero its boost/task counters on every CPU.
 * NOTE(review): the return type, the cpu local and the return statement
 * are on lines not visible in this chunk (the css_alloc caller checks a
 * returned status).
 */
schedtune_boostgroup_init(struct schedtune *st)
	struct boost_groups *bg;

	/* Keep track of allocated boost groups */
	allocated_group[st->idx] = st;

	/* Initialize the per CPU boost groups */
	for_each_possible_cpu(cpu) {
		bg = &per_cpu(cpu_boost_groups, cpu);
		bg->group[st->idx].boost = 0;
		bg->group[st->idx].tasks = 0;
/*
 * cgroup css_alloc hook: hand back the static root css for the root
 * request, reject nested hierarchies, then claim a free boost-group slot
 * and allocate/initialize a new schedtune group.
 * NOTE(review): the parent_css NULL check, idx declaration/assignment,
 * allocation-failure gotos and cleanup labels are on lines not visible in
 * this chunk.
 */
static struct cgroup_subsys_state *
schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
	struct schedtune *st;

		return &root_schedtune.css;

	/* Allow only single level hierarchies */
	if (parent_css != &root_schedtune.css) {
		pr_err("Nested SchedTune boosting groups not allowed\n");
		/* NOTE(review): -EINVAL would describe "not allowed" better
		 * than -ENOMEM — confirm against upstream before changing. */
		return ERR_PTR(-ENOMEM);

	/* Allow only a limited number of boosting groups */
	for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx)
		if (!allocated_group[idx])
	if (idx == BOOSTGROUPS_COUNT) {
		pr_err("Trying to create more than %d SchedTune boosting groups\n",
		return ERR_PTR(-ENOSPC);

	st = kzalloc(sizeof(*st), GFP_KERNEL);

	/* Initialize per CPUs boost group support */
	if (schedtune_boostgroup_init(st))

	return ERR_PTR(-ENOMEM);
/*
 * Unregister a boost group: zero its boost on every CPU and free its
 * allocated_group[] slot for reuse.
 * NOTE(review): the return type line is not visible in this chunk.
 */
schedtune_boostgroup_release(struct schedtune *st)
	/* Reset this boost group */
	schedtune_boostgroup_update(st->idx, 0);

	/* Keep track of allocated boost groups */
	allocated_group[st->idx] = NULL;
/*
 * cgroup css_free hook: release the group's boost-group slot.
 * NOTE(review): the return-type line and the kfree(st) pairing the
 * kzalloc() in schedtune_css_alloc() are on lines not visible here —
 * verify the allocation is actually freed.
 */
schedtune_css_free(struct cgroup_subsys_state *css)
	struct schedtune *st = css_st(css);

	schedtune_boostgroup_release(st);
/* SchedTune cgroup subsystem registration (legacy cftypes interface). */
struct cgroup_subsys schedtune_cgrp_subsys = {
	.css_alloc = schedtune_css_alloc,
	.css_free = schedtune_css_free,
	.allow_attach = schedtune_allow_attach,
	.can_attach = schedtune_can_attach,
	.cancel_attach = schedtune_cancel_attach,
	.legacy_cftypes = files,
/*
 * Zero every CPU's boost-group state and mark the subsystem initialized,
 * unblocking the enqueue/dequeue/attach fast paths.
 * NOTE(review): the return type, cpu local and the pr_info argument
 * (presumably BOOSTGROUPS_COUNT) are on lines not visible in this chunk.
 */
schedtune_init_cgroups(void)
	struct boost_groups *bg;

	/* Initialize the per CPU boost groups */
	for_each_possible_cpu(cpu) {
		bg = &per_cpu(cpu_boost_groups, cpu);
		memset(bg, 0, sizeof(struct boost_groups));

	pr_info("schedtune: configured to support %d boost groups\n",

	schedtune_initialized = true;
#else /* CONFIG_CGROUP_SCHEDTUNE */

/*
 * Non-cgroup variant: same O/S region filtering as the cgroup version,
 * but always uses the global perf_boost_idx/perf_constrain_idx.
 * NOTE(review): the return type and the early return statements are on
 * lines not visible in this chunk.
 */
schedtune_accept_deltas(int nrg_delta, int cap_delta,
			struct task_struct *task)
	/* Optimal (O) region */
	if (nrg_delta < 0 && cap_delta > 0) {
		trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);

	/* Suboptimal (S) region */
	if (nrg_delta > 0 && cap_delta < 0) {
		trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);

	return __schedtune_accept_deltas(nrg_delta, cap_delta,
					 perf_boost_idx, perf_constrain_idx);

#endif /* CONFIG_CGROUP_SCHEDTUNE */
/*
 * sysctl handler for sched_cfs_boost: after proc_dointvec_minmax() has
 * parsed/stored the value, validate the [-100..100] range and refresh the
 * global B/C threshold indexes.
 * NOTE(review): the return type, the ppos parameter line, the boost_pct
 * local, the error/early returns and the check of "ret" are on lines not
 * visible in this chunk — verify ret is checked before using the value.
 */
sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
			       void __user *buffer, size_t *lenp,
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	unsigned threshold_idx;

	if (sysctl_sched_cfs_boost < -100 || sysctl_sched_cfs_boost > 100)
	boost_pct = sysctl_sched_cfs_boost;

	/*
	 * Update threshold params for Performance Boost (B)
	 * and Performance Constraint (C) regions.
	 * The current implementation uses the same cuts for both.
	 */
	threshold_idx = clamp(boost_pct, 0, 99) / 10;
	perf_boost_idx = threshold_idx;
	perf_constrain_idx = threshold_idx;
#ifdef CONFIG_SCHED_DEBUG
/*
 * Debug-only self test: print normalized energy for a few fractions of
 * the platform's max power delta, to eyeball the reciprocal-divide
 * normalization constants.
 * NOTE(review): the return type, idx local and the #else line pairing the
 * stub macro below are on lines not visible in this chunk.
 */
schedtune_test_nrg(unsigned long delta_pwr)
	unsigned long test_delta_pwr;
	unsigned long test_norm_pwr;

	/*
	 * Check normalization constants using some constant system
	 * power values.
	 */
	pr_info("schedtune: verify normalization constants...\n");
	for (idx = 0; idx < 6; ++idx) {
		test_delta_pwr = delta_pwr >> idx;

		/* Normalize on max energy for target platform */
		test_norm_pwr = reciprocal_divide(
			test_delta_pwr << SCHED_LOAD_SHIFT,
			schedtune_target_nrg.rdiv);

		pr_info("schedtune: max_pwr/2^%d: %4lu => norm_pwr: %5lu\n",
			idx, test_delta_pwr, test_norm_pwr);
/* Stub for !CONFIG_SCHED_DEBUG builds */
#define schedtune_test_nrg(delta_pwr)
/*
 * Compute the min/max power consumption of a cluster and all its CPUs
 * from the energy-model data, accumulating the totals into @ste.
 * NOTE(review): the return type, the "str" buffer declaration (sized 32,
 * per the snprintf calls), the cpu local, the sg2 assignment inside the
 * domain walk and loop braces are on lines not visible in this chunk.
 */
schedtune_add_cluster_nrg(
		struct sched_domain *sd,
		struct sched_group *sg,
		struct target_nrg *ste)
	struct sched_domain *sd2;
	struct sched_group *sg2;

	struct cpumask *cluster_cpus;

	unsigned long min_pwr;
	unsigned long max_pwr;

	/* Get Cluster energy using EM data for the first CPU */
	cluster_cpus = sched_group_cpus(sg);
	snprintf(str, 32, "CLUSTER[%*pbl]",
		 cpumask_pr_args(cluster_cpus));

	/* Deepest idle state ~ min power; highest cap state ~ max power */
	min_pwr = sg->sge->idle_states[sg->sge->nr_idle_states - 1].power;
	max_pwr = sg->sge->cap_states[sg->sge->nr_cap_states - 1].power;
	pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
		str, min_pwr, max_pwr);

	/*
	 * Keep track of this cluster's energy in the computation of the
	 * overall system energy
	 */
	ste->min_power += min_pwr;
	ste->max_power += max_pwr;

	/* Get CPU energy using EM data for each CPU in the group */
	for_each_cpu(cpu, cluster_cpus) {
		/* Get a SD view for the specific CPU */
		for_each_domain(cpu, sd2) {
			/* Get the CPU group */
			min_pwr = sg2->sge->idle_states[sg2->sge->nr_idle_states - 1].power;
			max_pwr = sg2->sge->cap_states[sg2->sge->nr_cap_states - 1].power;

			ste->min_power += min_pwr;
			ste->max_power += max_pwr;

			snprintf(str, 32, "CPU[%d]", cpu);
			pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
				str, min_pwr, max_pwr);

			/*
			 * Assume we have EM data only at the CPU and
			 * the upper CLUSTER level
			 */
			BUG_ON(!cpumask_equal(
				sched_group_cpus(sg),
				sched_group_cpus(sd2->parent->groups)
/*
 * Initialize the constants required to compute normalized energy.
 * The values of these constants depends on the EM data for the specific
 * target system and topology.
 * Thus, this function is expected to be called by the code
 * that bind the EM to the topology information.
 * NOTE(review): the function signature, RCU read-side locking around the
 * sd_ea dereference, the "sg = sd->groups" seed of the do/while walk and
 * the error-path returns are on lines not visible in this chunk.
 */
	struct target_nrg *ste = &schedtune_target_nrg;
	unsigned long delta_pwr = 0;
	struct sched_domain *sd;
	struct sched_group *sg;

	pr_info("schedtune: init normalization constants...\n");

	/*
	 * When EAS is in use, we always have a pointer to the highest SD
	 * which provides EM data.
	 */
	sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask)));
		pr_info("schedtune: no energy model data\n");

		schedtune_add_cluster_nrg(sd, sg, ste);
	/* Comma operator: advance to next group, stop after full circle */
	} while (sg = sg->next, sg != sd->groups);

	pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
		"SYSTEM", ste->min_power, ste->max_power);

	/* Compute normalization constants */
	delta_pwr = ste->max_power - ste->min_power;
	ste->rdiv = reciprocal_value(delta_pwr);
	pr_info("schedtune: using normalization constants mul: %u sh1: %u sh2: %u\n",
		ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2);

	schedtune_test_nrg(delta_pwr);

#ifdef CONFIG_CGROUP_SCHEDTUNE
	schedtune_init_cgroups();
	pr_info("schedtune: configured to support global boosting only\n");

postcore_initcall(schedtune_init);