/*
 * Copyright (c) 2012 NVIDIA CORPORATION.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */

#include <linux/kernel.h>
#include <linux/cpuquiet.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/cpufreq.h>
#include <linux/pm_qos.h>
#include <linux/jiffies.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/tick.h>
#include <asm/cputime.h>

/* possible verdicts of the speed balance check */
typedef enum {
	CPU_SPEED_BALANCED,
	CPU_SPEED_BIASED,
	CPU_SPEED_SKEWED,
} CPU_SPEED_BALANCE;

/* governor state machine states */
typedef enum {
	IDLE,
	DOWN,
	UP,
} BALANCED_STATE;

struct idle_info {
	u64 idle_current;
	u64 idle_last;
	u64 last_timestamp;
	u64 timestamp;
};

static DEFINE_PER_CPU(struct idle_info, idleinfo);
static DEFINE_PER_CPU(unsigned int, cpu_load);

static struct timer_list load_timer;
static bool load_timer_active;

/* configurable parameters */
static unsigned int balance_level = 60;
static unsigned int idle_bottom_freq;
static unsigned int idle_top_freq;
static unsigned long up_delay;
static unsigned long down_delay;
static unsigned long last_change_time;
static unsigned int load_sample_rate = 20; /* msec */
static struct workqueue_struct *balanced_wq;
static struct delayed_work balanced_work;
static BALANCED_STATE balanced_state;
static struct kobject *balanced_kobject;
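
/*
 * Periodic load sampling: for each online CPU, compute the fraction of
 * the last sample window spent idle (via get_cpu_idle_time_us()) and
 * store load = 100 - idle% in the per-cpu cpu_load variable. The timer
 * re-arms itself every load_sample_rate msec while it is active.
 */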
static void calculate_load_timer(unsigned long data)
{
	int i;
	u64 idle_time, elapsed_time;

	if (!load_timer_active)
		return;

	for_each_online_cpu(i) {
		struct idle_info *iinfo = &per_cpu(idleinfo, i);
		unsigned int *load = &per_cpu(cpu_load, i);

		iinfo->idle_last = iinfo->idle_current;
		iinfo->last_timestamp = iinfo->timestamp;
		iinfo->idle_current =
			get_cpu_idle_time_us(i, &iinfo->timestamp);
		elapsed_time = iinfo->timestamp - iinfo->last_timestamp;

		idle_time = iinfo->idle_current - iinfo->idle_last;
		idle_time *= 100;
		do_div(idle_time, elapsed_time);
		*load = 100 - idle_time;
	}
	mod_timer(&load_timer, jiffies + msecs_to_jiffies(load_sample_rate));
}
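
/*
 * Prime the per-cpu idle baselines before arming the timer so the first
 * sample window has valid deltas; the initial 100 msec window is longer
 * than the default load_sample_rate.
 */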
static void start_load_timer(void)
{
	int i;

	if (load_timer_active)
		return;

	load_timer_active = true;

	for_each_online_cpu(i) {
		struct idle_info *iinfo = &per_cpu(idleinfo, i);

		iinfo->idle_current =
			get_cpu_idle_time_us(i, &iinfo->timestamp);
	}

	mod_timer(&load_timer, jiffies + msecs_to_jiffies(100));
}

static void stop_load_timer(void)
{
	if (!load_timer_active)
		return;

	load_timer_active = false;
	del_timer(&load_timer);
}
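
/*
 * Hotplug candidate selection: the slowest-CPU scan skips CPU0 (the
 * (i > 0) test below), so the boot CPU is never picked for offlining.
 */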
static unsigned int get_slowest_cpu_n(void)
{
	unsigned int cpu = nr_cpu_ids;
	unsigned long minload = ULONG_MAX;
	int i;

	for_each_online_cpu(i) {
		unsigned int *load = &per_cpu(cpu_load, i);

		if ((i > 0) && (minload > *load)) {
			cpu = i;
			minload = *load;
		}
	}

	return cpu;
}

static unsigned int cpu_highest_speed(void)
{
	unsigned int maxload = 0;
	int i;

	for_each_online_cpu(i) {
		unsigned int *load = &per_cpu(cpu_load, i);

		maxload = max(maxload, *load);
	}

	return maxload;
}

static unsigned int count_slow_cpus(unsigned int limit)
{
	unsigned int cnt = 0;
	int i;

	for_each_online_cpu(i) {
		unsigned int *load = &per_cpu(cpu_load, i);

		if (*load <= limit)
			cnt++;
	}

	return cnt;
}
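
/*
 * Runnable-thread profiles: each table holds the average-runnable-thread
 * thresholds for keeping 1, 2, 3 or 4 cores online. Thresholds are fixed
 * point with NR_FSHIFT fractional bits, so with NR_FSHIFT == 2 a value
 * of 5 means 5/4 = 1.25 runnable threads. core_bias selects the active
 * profile from userspace.
 */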
#define NR_FSHIFT	2

static unsigned int rt_profile_sel;
static unsigned int core_bias; /* dummy variable exposed to userspace */

/* threshold values below are assumed from the stock Tegra profile tables */
static unsigned int rt_profile_default[] = {
/*	1,  2,  3,  4 - on-line cpus target */
	5,  9, 10, UINT_MAX
};

static unsigned int rt_profile_1[] = {
/*	1,  2,  3,  4 - on-line cpus target */
	8,  9, 10, UINT_MAX
};

static unsigned int rt_profile_2[] = {
/*	1,  2,  3,  4 - on-line cpus target */
	5, 13, 14, UINT_MAX
};

static unsigned int rt_profile_disable[] = {
/*	1,  2,  3,  4 - on-line cpus target */
	0,  0,  0, UINT_MAX
};

static unsigned int *rt_profiles[] = {
	rt_profile_default,
	rt_profile_1,
	rt_profile_2,
	rt_profile_disable,
};

static unsigned int nr_run_hysteresis = 2; /* 0.5 thread */
static unsigned int nr_run_last;

struct runnables_avg_sample {
	u64 previous_integral;
	unsigned int avg;
	bool integral_sampled;
	u64 prev_timestamp;
};

static DEFINE_PER_CPU(struct runnables_avg_sample, avg_nr_sample);
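
/*
 * Differentiate the scheduler's nr_running integral over elapsed wall
 * time to get each CPU's average number of runnable threads for the
 * last window, then sum across online CPUs. The first sample only seeds
 * previous_integral; wrap-around of the integral is handled explicitly.
 */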
static unsigned int get_avg_nr_runnables(void)
{
	unsigned int i, sum = 0;
	struct runnables_avg_sample *sample;
	u64 integral, old_integral, delta_integral, delta_time, cur_time;

	for_each_online_cpu(i) {
		sample = &per_cpu(avg_nr_sample, i);
		integral = nr_running_integral(i);
		old_integral = sample->previous_integral;
		sample->previous_integral = integral;
		cur_time = ktime_to_ns(ktime_get());
		delta_time = cur_time - sample->prev_timestamp;
		sample->prev_timestamp = cur_time;

		if (!sample->integral_sampled) {
			sample->integral_sampled = true;
			/* First sample only initializes previous_integral,
			 * skip the avg calculation
			 */
			continue;
		}

		if (integral < old_integral) {
			/* the integral wrapped around */
			delta_integral = (ULLONG_MAX - old_integral) +
				integral;
		} else {
			delta_integral = integral - old_integral;
		}

		/* Calculate average for the previous sample window */
		do_div(delta_integral, delta_time);
		sample->avg = delta_integral;
		sum += sample->avg;
	}

	return sum;
}
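
/*
 * Classify the current load picture; the verdict is consumed by
 * balanced_work_func(): BALANCED onlines one more core, SKEWED offlines
 * the slowest secondary core, BIASED leaves the topology alone.
 */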
static CPU_SPEED_BALANCE balanced_speed_balance(void)
{
	unsigned long highest_speed = cpu_highest_speed();
	unsigned long balanced_speed = highest_speed * balance_level / 100;
	unsigned long skewed_speed = balanced_speed / 2;
	unsigned int nr_cpus = num_online_cpus();
	unsigned int max_cpus = pm_qos_request(PM_QOS_MAX_ONLINE_CPUS) ? : 4;
	unsigned int avg_nr_run = get_avg_nr_runnables();
	unsigned int nr_run;
	unsigned int *current_profile = rt_profiles[rt_profile_sel];

	/* balanced: load on all CPUs is above balanced_speed
	 *           (balance_level percent of the highest current load)
	 * biased:   load on at least one CPU is below balanced_speed
	 * skewed:   load on at least two CPUs is below skewed_speed
	 *           (half of balanced_speed)
	 */
	for (nr_run = 1; nr_run < ARRAY_SIZE(rt_profile_default); nr_run++) {
		unsigned int nr_threshold = current_profile[nr_run - 1];

		if (nr_run_last <= nr_run)
			nr_threshold += nr_run_hysteresis;
		if (avg_nr_run <= (nr_threshold << (FSHIFT - NR_FSHIFT)))
			break;
	}
	nr_run_last = nr_run;

	if (count_slow_cpus(skewed_speed) >= 2 || nr_cpus > max_cpus ||
	    nr_run < nr_cpus)
		return CPU_SPEED_SKEWED;

	if (count_slow_cpus(balanced_speed) >= 1 || nr_cpus == max_cpus ||
	    nr_run <= nr_cpus)
		return CPU_SPEED_BIASED;

	return CPU_SPEED_BALANCED;
}
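
/*
 * Worker for the governor state machine:
 *
 *   IDLE - frequency is between the idle thresholds, nothing to do
 *   DOWN - frequency dropped below idle_bottom_freq, offline the slowest
 *          secondary core
 *   UP   - frequency rose above idle_top_freq, online or offline a core
 *          depending on the speed balance verdict
 *
 * Core removals are rate-limited: nothing is offlined within down_delay
 * jiffies of the last topology change.
 */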
static void balanced_work_func(struct work_struct *work)
{
	bool up = false;
	unsigned int cpu = nr_cpu_ids;
	unsigned long now = jiffies;
	CPU_SPEED_BALANCE balance;

	switch (balanced_state) {
	case IDLE:
		break;
	case DOWN:
		cpu = get_slowest_cpu_n();
		if (cpu < nr_cpu_ids) {
			up = false;
			queue_delayed_work(balanced_wq,
					   &balanced_work, up_delay);
		} else
			stop_load_timer();
		break;
	case UP:
		balance = balanced_speed_balance();
		switch (balance) {
		/* cpu speed is up and balanced - one more on-line */
		case CPU_SPEED_BALANCED:
			cpu = cpumask_next_zero(0, cpu_online_mask);
			if (cpu < nr_cpu_ids)
				up = true;
			break;
		/* cpu speed is up, but skewed - remove one core */
		case CPU_SPEED_SKEWED:
			cpu = get_slowest_cpu_n();
			if (cpu < nr_cpu_ids)
				up = false;
			break;
		/* cpu speed is up, but under-utilized - do nothing */
		case CPU_SPEED_BIASED:
		default:
			break;
		}
		queue_delayed_work(
			balanced_wq, &balanced_work, up_delay);
		break;
	default:
		pr_err("%s: invalid cpuquiet balanced governor state %d\n",
		       __func__, balanced_state);
	}

	if (!up && ((now - last_change_time) < down_delay))
		cpu = nr_cpu_ids;

	if (cpu < nr_cpu_ids) {
		last_change_time = now;
		if (up)
			cpuquiet_wake_cpu(cpu, false);
		else
			cpuquiet_quiesence_cpu(cpu, false);
	}
}
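
/*
 * cpufreq transition notifier: drives the state machine. Crossing
 * idle_top_freq pushes toward UP, crossing idle_bottom_freq pushes
 * toward DOWN; leaving IDLE also starts the load sampling timer.
 */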
static int balanced_cpufreq_transition(struct notifier_block *nb,
				       unsigned long state, void *data)
{
	struct cpufreq_freqs *freqs = data;
	unsigned long cpu_freq;

	if (state == CPUFREQ_POSTCHANGE || state == CPUFREQ_RESUMECHANGE) {
		cpu_freq = freqs->new;

		switch (balanced_state) {
		case IDLE:
			if (cpu_freq >= idle_top_freq) {
				balanced_state = UP;
				queue_delayed_work(
					balanced_wq, &balanced_work, up_delay);
				start_load_timer();
			} else if (cpu_freq <= idle_bottom_freq) {
				balanced_state = DOWN;
				queue_delayed_work(
					balanced_wq, &balanced_work,
					down_delay);
				start_load_timer();
			}
			break;
		case DOWN:
			if (cpu_freq >= idle_top_freq) {
				balanced_state = UP;
				queue_delayed_work(
					balanced_wq, &balanced_work, up_delay);
			}
			break;
		case UP:
			if (cpu_freq <= idle_bottom_freq) {
				balanced_state = DOWN;
				queue_delayed_work(balanced_wq,
						   &balanced_work, down_delay);
			}
			break;
		default:
			pr_err("%s: invalid cpuquiet balanced governor state %d\n",
			       __func__, balanced_state);
			break;
		}
	}

	return NOTIFY_OK;
}

static struct notifier_block balanced_cpufreq_nb = {
	.notifier_call = balanced_cpufreq_transition,
};
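
/*
 * Sysfs store callback for up_delay/down_delay: userspace writes
 * milliseconds, but the delays are consumed as jiffies, so convert the
 * stored value in place.
 */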
static void delay_callback(struct cpuquiet_attribute *attr)
{
	unsigned long val;

	if (attr) {
		val = *((unsigned long *)(attr->param));
		*((unsigned long *)(attr->param)) = msecs_to_jiffies(val);
	}
}

static void core_bias_callback(struct cpuquiet_attribute *attr)
{
	unsigned int val;

	if (attr) {
		val = *((unsigned int *)(attr->param));
		if (val < ARRAY_SIZE(rt_profiles))
			rt_profile_sel = val;
		else
			/* revert the change due to invalid range */
			core_bias = rt_profile_sel;
	}
}

CPQ_BASIC_ATTRIBUTE(balance_level, 0644, uint);
CPQ_BASIC_ATTRIBUTE(idle_bottom_freq, 0644, uint);
CPQ_BASIC_ATTRIBUTE(idle_top_freq, 0644, uint);
CPQ_BASIC_ATTRIBUTE(load_sample_rate, 0644, uint);
CPQ_ATTRIBUTE(core_bias, 0644, uint, core_bias_callback);
CPQ_ATTRIBUTE(up_delay, 0644, ulong, delay_callback);
CPQ_ATTRIBUTE(down_delay, 0644, ulong, delay_callback);
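
/*
 * These attributes appear under the cpuquiet sysfs hierarchy (typically
 * /sys/devices/system/cpu/cpuquiet/balanced/). For example, writing 0-3
 * to core_bias selects one of the four runnable-thread profiles above;
 * out-of-range writes are reverted by core_bias_callback().
 */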
static struct attribute *balanced_attributes[] = {
	&balance_level_attr.attr,
	&idle_bottom_freq_attr.attr,
	&idle_top_freq_attr.attr,
	&up_delay_attr.attr,
	&down_delay_attr.attr,
	&load_sample_rate_attr.attr,
	&core_bias_attr.attr,
	NULL,
};

static const struct sysfs_ops balanced_sysfs_ops = {
	.show = cpuquiet_auto_sysfs_show,
	.store = cpuquiet_auto_sysfs_store,
};

static struct kobj_type ktype_balanced = {
	.sysfs_ops = &balanced_sysfs_ops,
	.default_attrs = balanced_attributes,
};

static int balanced_sysfs(void)
{
	int err;

	balanced_kobject = kzalloc(sizeof(*balanced_kobject), GFP_KERNEL);
	if (!balanced_kobject)
		return -ENOMEM;

	err = cpuquiet_kobject_init(balanced_kobject, &ktype_balanced,
				    "balanced");
	if (err)
		kfree(balanced_kobject);

	return err;
}

static void balanced_stop(void)
{
	/*
	 * First unregister the notifiers. This ensures the governor state
	 * can't be modified by a cpufreq transition.
	 */
	cpufreq_unregister_notifier(&balanced_cpufreq_nb,
				    CPUFREQ_TRANSITION_NOTIFIER);

	/* now we can force the governor to be idle */
	balanced_state = IDLE;
	cancel_delayed_work_sync(&balanced_work);
	destroy_workqueue(balanced_wq);
	del_timer(&load_timer);

	kobject_put(balanced_kobject);
}
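
/*
 * balanced_start() derives idle_top_freq/idle_bottom_freq from the
 * middle of the cpufreq frequency table of policy 0, so the idle
 * thresholds follow the platform's actual frequency steps instead of
 * hard-coded values.
 */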
static int balanced_start(void)
{
	int err, count;
	struct cpufreq_frequency_table *table;
	struct cpufreq_freqs initial_freq;

	err = balanced_sysfs();
	if (err)
		return err;

	balanced_wq = alloc_workqueue("cpuquiet-balanced",
			WQ_UNBOUND | WQ_RESCUER | WQ_FREEZABLE, 1);
	if (!balanced_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&balanced_work, balanced_work_func);

	up_delay = msecs_to_jiffies(100);
	down_delay = msecs_to_jiffies(2000);

	table = cpufreq_frequency_get_table(0);
	if (!table)
		return -EINVAL;

	for (count = 0; table[count].frequency != CPUFREQ_TABLE_END; count++)
		;

	if (count < 4)
		return -EINVAL;

	idle_top_freq = table[(count / 2) - 1].frequency;
	idle_bottom_freq = table[(count / 2) - 2].frequency;

	cpufreq_register_notifier(&balanced_cpufreq_nb,
				  CPUFREQ_TRANSITION_NOTIFIER);

	init_timer(&load_timer);
	load_timer.function = calculate_load_timer;

	/* FIXME: kick-start the state machine by faking a freq notification */
	initial_freq.new = cpufreq_get(0);
	if (initial_freq.new != 0)
		balanced_cpufreq_transition(NULL, CPUFREQ_RESUMECHANGE,
					    &initial_freq);

	return 0;
}

struct cpuquiet_governor balanced_governor = {
	.name	= "balanced",
	.start	= balanced_start,
	.stop	= balanced_stop,
	.owner	= THIS_MODULE,
};

static int __init init_balanced(void)
{
	return cpuquiet_register_governor(&balanced_governor);
}

static void __exit exit_balanced(void)
{
	cpuquiet_unregister_governor(&balanced_governor);
}

MODULE_LICENSE("GPL");
#ifdef CONFIG_CPUQUIET_DEFAULT_GOV_BALANCED
fs_initcall(init_balanced);
#else
module_init(init_balanced);
#endif
module_exit(exit_balanced);