/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 *
 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 * Interactivity improvements by Mike Galbraith
 * (C) 2007 Mike Galbraith <efault@gmx.de>
 *
 * Various enhancements by Dmitry Adamushko.
 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
 *
 * Group scheduling enhancements by Srivatsa Vaddagiri
 * Copyright IBM Corporation, 2007
 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
 *
 * Scaled math optimizations by Thomas Gleixner
 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
 *
 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 */
#include <linux/latencytop.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>

#include <trace/events/sched.h>
#include <linux/sysfs.h>
#include <linux/vmalloc.h>
#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
/* Include cpufreq header to add a notifier so that cpu frequency
 * scaling can track the current CPU frequency
 */
#include <linux/cpufreq.h>
#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
#include "sched.h"

/*
 * Targeted preemption latency for CPU-bound tasks:
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 *
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches (cs) field)
 */
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;
/*
 * The initial- and re-scaling of tunables is configurable
 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
 *
 * Options are:
 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
 * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
 */
enum sched_tunable_scaling sysctl_sched_tunable_scaling
	= SCHED_TUNABLESCALING_LOG;
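/*
 * Editor's illustration (not in the original source): with 8 online CPUs
 * the three scaling modes give
 *   NONE:   factor = 1              -> sysctl_sched_latency =  6ms
 *   LOG:    factor = 1+ilog2(8) = 4 -> sysctl_sched_latency = 24ms
 *   LINEAR: factor = 8              -> sysctl_sched_latency = 48ms
 * The same factor is applied to the other normalized tunables below.
 */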
/*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;

/*
 * sched_nr_latency is kept at
 * sysctl_sched_latency / sysctl_sched_min_granularity
 */
static unsigned int sched_nr_latency = 8;

/*
 * After fork, child runs first. If set to 0 (default) then
 * parent will (try to) run first.
 */
unsigned int sysctl_sched_child_runs_first __read_mostly;

/*
 * SCHED_OTHER wake-up granularity.
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
/*
 * The exponential sliding window over which load is averaged for shares
 * distribution and balancing.
 *
 * default: 10ms
 */
unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;

#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 * each time a cfs_rq requests quota.
 *
 * Note: in the case that the slice exceeds the runtime remaining (either due
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
 * default: 5 msec, units: microseconds
 */
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
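/*
 * Editor's illustration (not in the original source): with a group quota of
 * 20ms per 100ms period, a busy cfs_rq pulls runtime from the global pool in
 * 5ms slices, so at most four such requests drain the pool in one period
 * before the group is throttled.
 */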
/*
 * Increase the granularity value when there are more CPUs,
 * because with more CPUs the 'effective latency' as visible
 * to users decreases. But the relationship is not linear,
 * so pick a second-best guess by going with the log2 of the
 * number of CPUs.
 *
 * This idea comes from the SD scheduler of Con Kolivas:
 */
static int get_update_sysctl_factor(void)
{
	unsigned int cpus = min_t(int, num_online_cpus(), 8);
	unsigned int factor;

	switch (sysctl_sched_tunable_scaling) {
	case SCHED_TUNABLESCALING_NONE:
		factor = 1;
		break;
	case SCHED_TUNABLESCALING_LINEAR:
		factor = cpus;
		break;
	case SCHED_TUNABLESCALING_LOG:
	default:
		factor = 1 + ilog2(cpus);
		break;
	}

	return factor;
}
static void update_sysctl(void)
{
	unsigned int factor = get_update_sysctl_factor();

#define SET_SYSCTL(name) \
	(sysctl_##name = (factor) * normalized_sysctl_##name)
	SET_SYSCTL(sched_min_granularity);
	SET_SYSCTL(sched_latency);
	SET_SYSCTL(sched_wakeup_granularity);
#undef SET_SYSCTL
}

void sched_init_granularity(void)
{
	update_sysctl();
}
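/*
 * Editor's illustration (not in the original source): on a quad-core box
 * with the default LOG scaling, factor = 1 + ilog2(4) = 3, so boot-time
 * update_sysctl() yields
 *   sysctl_sched_min_granularity    = 3 * 0.75ms = 2.25ms
 *   sysctl_sched_latency            = 3 * 6ms    = 18ms
 *   sysctl_sched_wakeup_granularity = 3 * 1ms    = 3ms
 */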
#if BITS_PER_LONG == 32
# define WMULT_CONST	(~0UL)
#else
# define WMULT_CONST	(1UL << 32)
#endif

#define WMULT_SHIFT	32

/*
 * Shift right and round:
 */
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
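/*
 * Editor's illustration (not in the original source): SRR() rounds to
 * nearest instead of truncating, e.g. SRR(1007, 3) = (1007 + 4) >> 3 = 126,
 * whereas a plain 1007 >> 3 would truncate 125.875 down to 125.
 */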
/*
 * delta *= weight / lw
 */
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
		struct load_weight *lw)
{
	u64 tmp;

	/*
	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
	 * 2^SCHED_LOAD_RESOLUTION.
	 */
	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
		tmp = (u64)delta_exec * scale_load_down(weight);
	else
		tmp = (u64)delta_exec;

	if (!lw->inv_weight) {
		unsigned long w = scale_load_down(lw->weight);

		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
			lw->inv_weight = 1;
		else if (unlikely(!w))
			lw->inv_weight = WMULT_CONST;
		else
			lw->inv_weight = WMULT_CONST / w;
	}

	/*
	 * Check whether we'd overflow the 64-bit multiplication:
	 */
	if (unlikely(tmp > WMULT_CONST))
		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
			WMULT_SHIFT/2);
	else
		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);

	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}
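/*
 * Editor's illustration (not in the original source): for a nice-0 task
 * (weight 1024) on a queue whose total weight is 3072, inv_weight is
 * precomputed as 2^32/3072 and
 *   delta' = (delta * 1024 * inv_weight) >> 32 ~= delta / 3
 * i.e. the task is credited a third of the elapsed wall time, with the
 * division replaced by a cached inverse multiply.
 */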
const struct sched_class fair_sched_class;

/**************************************************************
 * CFS operations on generic schedulable entities:
 */

#ifdef CONFIG_FAIR_GROUP_SCHED

/* cpu runqueue to which this cfs_rq is attached */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return cfs_rq->rq;
}

/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se)	(!se->my_q)

static inline struct task_struct *task_of(struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!entity_is_task(se));
#endif
	return container_of(se, struct task_struct, se);
}

/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return p->se.cfs_rq;
}

/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	return se->cfs_rq;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return grp->my_q;
}

static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
				       int force_update);
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (!cfs_rq->on_list) {
		/*
		 * Ensure we either appear before our parent (if already
		 * enqueued) or force our parent to appear after us when it is
		 * enqueued. The fact that we always enqueue bottom-up
		 * reduces this to two cases.
		 */
		if (cfs_rq->tg->parent &&
		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
				&rq_of(cfs_rq)->leaf_cfs_rq_list);
		} else {
			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				&rq_of(cfs_rq)->leaf_cfs_rq_list);
		}

		cfs_rq->on_list = 1;
		/* We should have no load, but we need to update last_decay. */
		update_cfs_rq_blocked_load(cfs_rq, 0);
	}
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (cfs_rq->on_list) {
		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
		cfs_rq->on_list = 0;
	}
}

/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group? */
static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	if (se->cfs_rq == pse->cfs_rq)
		return 1;

	return 0;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return se->parent;
}

/* return depth at which a sched entity is present in the hierarchy */
static inline int depth_se(struct sched_entity *se)
{
	int depth = 0;

	for_each_sched_entity(se)
		depth++;

	return depth;
}

static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
	int se_depth, pse_depth;

	/*
	 * preemption test can be made between sibling entities who are in the
	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
	 * both tasks until we find their ancestors who are siblings of a
	 * common parent.
	 */

	/* First walk up until both entities are at same depth */
	se_depth = depth_se(*se);
	pse_depth = depth_se(*pse);

	while (se_depth > pse_depth) {
		se_depth--;
		*se = parent_entity(*se);
	}

	while (pse_depth > se_depth) {
		pse_depth--;
		*pse = parent_entity(*pse);
	}

	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}
#else	/* !CONFIG_FAIR_GROUP_SCHED */

static inline struct task_struct *task_of(struct sched_entity *se)
{
	return container_of(se, struct task_struct, se);
}

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return container_of(cfs_rq, struct rq, cfs);
}

#define entity_is_task(se)	1

#define for_each_sched_entity(se) \
		for (; se; se = NULL)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return &task_rq(p)->cfs;
}

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	struct task_struct *p = task_of(se);
	struct rq *rq = task_rq(p);

	return &rq->cfs;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return NULL;
}

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

#define for_each_leaf_cfs_rq(rq, cfs_rq) \
		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)

static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	return 1;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return NULL;
}

static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}

#endif	/* CONFIG_FAIR_GROUP_SCHED */

static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);

/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 */
static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - max_vruntime);
	if (delta > 0)
		max_vruntime = vruntime;

	return max_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline int entity_before(struct sched_entity *a,
				struct sched_entity *b)
{
	return (s64)(a->vruntime - b->vruntime) < 0;
}
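/*
 * Editor's illustration (not in the original source): the signed-delta
 * comparisons above remain correct across u64 wraparound. If a->vruntime
 * has wrapped around to 2 while b->vruntime is still ULLONG_MAX - 3, then
 * (s64)(a->vruntime - b->vruntime) = 6 > 0, so a is correctly ordered
 * after b, as long as the two values differ by less than 2^63.
 */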
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
	u64 vruntime = cfs_rq->min_vruntime;

	if (cfs_rq->curr)
		vruntime = cfs_rq->curr->vruntime;

	if (cfs_rq->rb_leftmost) {
		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
						   struct sched_entity,
						   run_node);

		if (!cfs_rq->curr)
			vruntime = se->vruntime;
		else
			vruntime = min_vruntime(vruntime, se->vruntime);
	}

	/* ensure we never gain time by being placed backwards. */
	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
	smp_wmb();
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
}

/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	int leftmost = 1;

	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We don't care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (entity_before(se, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = 0;
		}
	}

	/*
	 * Maintain a cache of leftmost tree entries (it is frequently
	 * used):
	 */
	if (leftmost)
		cfs_rq->rb_leftmost = &se->run_node;

	rb_link_node(&se->run_node, parent, link);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->rb_leftmost == &se->run_node) {
		struct rb_node *next_node;

		next_node = rb_next(&se->run_node);
		cfs_rq->rb_leftmost = next_node;
	}

	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *left = cfs_rq->rb_leftmost;

	if (!left)
		return NULL;

	return rb_entry(left, struct sched_entity, run_node);
}

static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
	struct rb_node *next = rb_next(&se->run_node);

	if (!next)
		return NULL;

	return rb_entry(next, struct sched_entity, run_node);
}

#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);

	if (!last)
		return NULL;

	return rb_entry(last, struct sched_entity, run_node);
}
/**************************************************************
 * Scheduling class statistics methods:
 */

int sched_proc_update_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	int factor = get_update_sysctl_factor();

	if (ret || !write)
		return ret;

	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
					sysctl_sched_min_granularity);

#define WRT_SYSCTL(name) \
	(normalized_sysctl_##name = sysctl_##name / (factor))
	WRT_SYSCTL(sched_min_granularity);
	WRT_SYSCTL(sched_latency);
	WRT_SYSCTL(sched_wakeup_granularity);
#undef WRT_SYSCTL

	return 0;
}
#endif

/*
 * delta /= w
 */
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);

	return delta;
}
/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (sched_nr_latency) we have to stretch
 * this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 */
static u64 __sched_period(unsigned long nr_running)
{
	u64 period = sysctl_sched_latency;
	unsigned long nr_latency = sched_nr_latency;

	if (unlikely(nr_running > nr_latency)) {
		period = sysctl_sched_min_granularity;
		period *= nr_running;
	}

	return period;
}
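/*
 * Editor's illustration (not in the original source), using the unscaled
 * defaults l = 6ms and nl = 8: with 5 runnable tasks the period stays 6ms,
 * so every task runs once within 6ms; with 16 tasks the period stretches to
 * 16 * 0.75ms = 12ms so that no slice drops below the minimum granularity.
 */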
/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*P[w/rw]
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

	for_each_sched_entity(se) {
		struct load_weight *load;
		struct load_weight lw;

		cfs_rq = cfs_rq_of(se);
		load = &cfs_rq->load;

		if (unlikely(!se->on_rq)) {
			lw = cfs_rq->load;

			update_load_add(&lw, se->load.weight);
			load = &lw;
		}
		slice = calc_delta_mine(slice, se->load.weight, load);
	}
	return slice;
}

/*
 * We calculate the vruntime slice of a to-be-inserted task.
 *
 * vs = s/w
 */
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
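/*
 * Editor's illustration (not in the original source): with a 6ms period
 * and two runnable tasks, one at nice 0 (weight 1024) and one at nice -5
 * (weight 3121), the wall-time slices come out proportional to weight:
 *   nice  0: 6ms * 1024/4145 ~= 1.5ms
 *   nice -5: 6ms * 3121/4145 ~= 4.5ms
 * sched_vslice() then converts the slice back into vruntime units.
 */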
/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
	      unsigned long delta_exec)
{
	unsigned long delta_exec_weighted;

	schedstat_set(curr->statistics.exec_max,
		      max((u64)delta_exec, curr->statistics.exec_max));

	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq, exec_clock, delta_exec);
	delta_exec_weighted = calc_delta_fair(delta_exec, curr);

	curr->vruntime += delta_exec_weighted;
	update_min_vruntime(cfs_rq);
}

static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_of(cfs_rq)->clock_task;
	unsigned long delta_exec;

	if (unlikely(!curr))
		return;

	/*
	 * Get the amount of time the current task was running
	 * since the last time we changed load (this cannot
	 * overflow on 32 bits):
	 */
	delta_exec = (unsigned long)(now - curr->exec_start);
	if (!delta_exec)
		return;

	__update_curr(cfs_rq, curr, delta_exec);
	curr->exec_start = now;

	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cpuacct_charge(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}

	account_cfs_rq_runtime(cfs_rq, delta_exec);
}
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
}

/*
 * Task is being enqueued - update stats:
 */
static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Are we enqueueing a waiting task? (for current tasks
	 * a dequeue/enqueue event is a NOP)
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_start(cfs_rq, se);
}

static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
			rq_of(cfs_rq)->clock - se->statistics.wait_start));
	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
			rq_of(cfs_rq)->clock - se->statistics.wait_start);
#ifdef CONFIG_SCHEDSTATS
	if (entity_is_task(se)) {
		trace_sched_stat_wait(task_of(se),
			rq_of(cfs_rq)->clock - se->statistics.wait_start);
	}
#endif
	schedstat_set(se->statistics.wait_start, 0);
}

static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Mark the end of the wait period if dequeueing a
	 * waiting task:
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_end(cfs_rq, se);
}

/*
 * We are picking a new current task - update its stats:
 */
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * We are starting a new run period:
	 */
	se->exec_start = rq_of(cfs_rq)->clock_task;
}
/**************************************************
 * Scheduling class queueing methods:
 */

#ifdef CONFIG_NUMA_BALANCING
/*
 * numa task sample period in ms
 */
unsigned int sysctl_numa_balancing_scan_period_min = 100;
unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;

/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;

/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
static void task_numa_placement(struct task_struct *p)
{
	int seq;

	if (!p->mm)	/* for example, ksmd faulting in a user's mm */
		return;
	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
	if (p->numa_scan_seq == seq)
		return;
	p->numa_scan_seq = seq;

	/* FIXME: Scheduling placement policy hints go here */
}

/*
 * Got a PROT_NONE fault for a page on @node.
 */
void task_numa_fault(int node, int pages, bool migrated)
{
	struct task_struct *p = current;

	if (!sched_feat_numa(NUMA))
		return;

	/* FIXME: Allocate task-specific structure for placement policy here */

	/*
	 * If pages are properly placed (did not migrate) then scan slower.
	 * This is reset periodically in case of phase changes.
	 */
	if (!migrated)
		p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
			p->numa_scan_period + jiffies_to_msecs(10));

	task_numa_placement(p);
}

static void reset_ptenuma_scan(struct task_struct *p)
{
	ACCESS_ONCE(p->mm->numa_scan_seq)++;
	p->mm->numa_scan_offset = 0;
}
/*
 * The expensive part of numa migration is done from task_work context.
 * Triggered from task_tick_numa().
 */
void task_numa_work(struct callback_head *work)
{
	unsigned long migrate, next_scan, now = jiffies;
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	struct vm_area_struct *vma;
	unsigned long start, end;
	long pages;

	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));

	work->next = work; /* protect against double add */
	/*
	 * Who cares about NUMA placement when they're dying.
	 *
	 * NOTE: make sure not to dereference p->mm before this check,
	 * exit_task_work() happens _after_ exit_mm() so we could be called
	 * without p->mm even though we still had it when we enqueued this
	 * work.
	 */
	if (p->flags & PF_EXITING)
		return;

	/*
	 * We do not care about task placement until a task runs on a node
	 * other than the first one used by the address space. This is
	 * largely because migrations are driven by what CPU the task
	 * is running on. If it's never scheduled on another node, it'll
	 * not migrate so why bother trapping the fault.
	 */
	if (mm->first_nid == NUMA_PTE_SCAN_INIT)
		mm->first_nid = numa_node_id();
	if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
		/* Are we running on a new node yet? */
		if (numa_node_id() == mm->first_nid &&
		    !sched_feat_numa(NUMA_FORCE))
			return;

		mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
	}

	/*
	 * Reset the scan period if enough time has gone by. Objective is that
	 * scanning will be reduced if pages are properly placed. As tasks
	 * can enter different phases this needs to be re-examined. Lacking
	 * proper tracking of reference behaviour, this blunt hammer is used.
	 */
	migrate = mm->numa_next_reset;
	if (time_after(now, migrate)) {
		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
		next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
		xchg(&mm->numa_next_reset, next_scan);
	}

	/*
	 * Enforce maximal scan/migration frequency.
	 */
	migrate = mm->numa_next_scan;
	if (time_before(now, migrate))
		return;

	if (p->numa_scan_period == 0)
		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;

	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
		return;

	/*
	 * Do not set pte_numa if the current running node is rate-limited.
	 * This loses statistics on the fault but if we are unwilling to
	 * migrate to this node, it is less likely we can do useful work.
	 */
	if (migrate_ratelimited(numa_node_id()))
		return;

	start = mm->numa_scan_offset;
	pages = sysctl_numa_balancing_scan_size;
	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
	if (!pages)
		return;
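	/*
	 * Editor's illustration (not in the original source): with the
	 * default scan size of 256MB and 4KB pages (PAGE_SHIFT = 12),
	 * pages = 256 << (20 - 12) = 65536 PTEs are examined per pass.
	 */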
	down_read(&mm->mmap_sem);
	vma = find_vma(mm, start);
	if (!vma) {
		reset_ptenuma_scan(p);
		start = 0;
		vma = mm->mmap;
	}
	for (; vma; vma = vma->vm_next) {
		if (!vma_migratable(vma))
			continue;

		/* Skip small VMAs. They are not likely to be of relevance */
		if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
			continue;

		do {
			start = max(start, vma->vm_start);
			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
			end = min(end, vma->vm_end);
			pages -= change_prot_numa(vma, start, end);

			start = end;
			if (pages <= 0)
				goto out;
		} while (end != vma->vm_end);
	}

out:
	/*
	 * It is possible to reach the end of the VMA list but the last few
	 * VMAs are not guaranteed to be vma_migratable. If they are not, we
	 * would find the !migratable VMA on the next scan but not reset the
	 * scanner to the start, so check it now.
	 */
	if (vma)
		mm->numa_scan_offset = start;
	else
		reset_ptenuma_scan(p);
	up_read(&mm->mmap_sem);
}
/*
 * Drive the periodic memory faults.
 */
void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
	struct callback_head *work = &curr->numa_work;
	u64 period, now;

	/*
	 * We don't care about NUMA placement if we don't have memory.
	 */
	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
		return;

	/*
	 * Using runtime rather than walltime has the dual advantage that
	 * we (mostly) drive the selection from busy threads and that the
	 * task needs to have done some actual work before we bother with
	 * NUMA placement.
	 */
	now = curr->se.sum_exec_runtime;
	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;

	if (now - curr->node_stamp > period) {
		if (!curr->node_stamp)
			curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
		curr->node_stamp = now;

		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
			task_work_add(curr, work, true);
		}
	}
}
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
#endif /* CONFIG_NUMA_BALANCING */
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_add(&cfs_rq->load, se->load.weight);
	if (!parent_entity(se))
		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
	if (entity_is_task(se))
		list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
#endif
	cfs_rq->nr_running++;
}

static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_sub(&cfs_rq->load, se->load.weight);
	if (!parent_entity(se))
		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
	if (entity_is_task(se))
		list_del_init(&se->group_node);
	cfs_rq->nr_running--;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
# ifdef CONFIG_SMP
static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
{
	long tg_weight;

	/*
	 * Use this CPU's actual weight instead of the last load_contribution
	 * to gain a more accurate current total weight. See
	 * update_cfs_rq_load_contribution().
	 */
	tg_weight = atomic64_read(&tg->load_avg);
	tg_weight -= cfs_rq->tg_load_contrib;
	tg_weight += cfs_rq->load.weight;

	return tg_weight;
}

static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
	long tg_weight, load, shares;

	tg_weight = calc_tg_weight(tg, cfs_rq);
	load = cfs_rq->load.weight;

	shares = (tg->shares * load);
	if (tg_weight)
		shares /= tg_weight;

	if (shares < MIN_SHARES)
		shares = MIN_SHARES;
	if (shares > tg->shares)
		shares = tg->shares;

	return shares;
}
# else /* CONFIG_SMP */
static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
	return tg->shares;
}
# endif /* CONFIG_SMP */
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
			    unsigned long weight)
{
	if (se->on_rq) {
		/* commit outstanding execution time */
		if (cfs_rq->curr == se)
			update_curr(cfs_rq);
		account_entity_dequeue(cfs_rq, se);
	}

	update_load_set(&se->load, weight);

	if (se->on_rq)
		account_entity_enqueue(cfs_rq, se);
}

static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);

static void update_cfs_shares(struct cfs_rq *cfs_rq)
{
	struct task_group *tg;
	struct sched_entity *se;
	long shares;

	tg = cfs_rq->tg;
	se = tg->se[cpu_of(rq_of(cfs_rq))];
	if (!se || throttled_hierarchy(cfs_rq))
		return;
#ifndef CONFIG_SMP
	if (likely(se->load.weight == tg->shares))
		return;
#endif
	shares = calc_cfs_shares(cfs_rq, tg);

	reweight_entity(cfs_rq_of(se), se, shares);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
/*
 * We choose a half-life close to 1 scheduling period.
 * Note: The tables below are dependent on this value.
 */
#define LOAD_AVG_PERIOD 32
#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
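/*
 * Editor's illustration (not in the original source): the half-life choice
 * fixes y via y^32 = 0.5, i.e. y ~= 0.97857. An always-runnable entity
 * accrues 1024 per ~1ms period, so the geometric series saturates near
 *   1024 / (1 - y) ~= 47788
 * analytically; with the integer truncation baked into the tables below the
 * achievable maximum is 47742 (LOAD_AVG_MAX), reached after 345 full
 * periods (LOAD_AVG_MAX_N).
 */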
/* Precomputed fixed inverse multiplies for multiplication by y^n */
static const u32 runnable_avg_yN_inv[] = {
	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
	0x85aac367, 0x82cd8698,
};

/*
 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
 * over-estimates when re-combining.
 */
static const u32 runnable_avg_yN_sum[] = {
	    0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
	 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
};
/*
 * Approximate:
 *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
 */
static __always_inline u64 decay_load(u64 val, u64 n)
{
	unsigned int local_n;

	if (!n)
		return val;
	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
		return 0;

	/* after bounds checking we can collapse to 32-bit */
	local_n = n;

	/*
	 * As y^PERIOD = 1/2, we can combine
	 *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
	 * with a look-up table which covers y^n (n < PERIOD)
	 *
	 * to achieve constant time decay_load.
	 */
	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
		val >>= local_n / LOAD_AVG_PERIOD;
		local_n %= LOAD_AVG_PERIOD;
	}

	val *= runnable_avg_yN_inv[local_n];
	/* We don't use SRR here since we always want to round down. */
	return val >> 32;
}
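/*
 * Editor's illustration (not in the original source): decay_load(1024, 36)
 * splits n = 36 into 32 + 4: the shift halves val to 512, then the table
 * entry for y^4 (0xeac0c6e6 ~= 0.9170 * 2^32) gives (512 * 0.9170) ~= 469,
 * i.e. 1024 * y^36, without any 64-bit division.
 */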
/*
 * For updates fully spanning n periods, the contribution to runnable
 * average will be: \Sum 1024*y^n
 *
 * We can compute this reasonably efficiently by combining:
 *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n < PERIOD}
 */
static u32 __compute_runnable_contrib(u64 n)
{
	u32 contrib = 0;

	if (likely(n <= LOAD_AVG_PERIOD))
		return runnable_avg_yN_sum[n];
	else if (unlikely(n >= LOAD_AVG_MAX_N))
		return LOAD_AVG_MAX;

	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
	do {
		contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
		contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];

		n -= LOAD_AVG_PERIOD;
	} while (n > LOAD_AVG_PERIOD);

	contrib = decay_load(contrib, n);
	return contrib + runnable_avg_yN_sum[n];
}
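/*
 * Editor's illustration (not in the original source): for n = 40 periods
 * the loop runs once: contrib = 0/2 + runnable_avg_yN_sum[32] = 23371 and
 * n becomes 8. Then decay_load(23371, 8) ~= 19652, and adding
 * runnable_avg_yN_sum[8] = 7437 yields ~27089, i.e. \Sum 1024*y^k for
 * k = 1..40 computed with one table walk instead of 40 multiplies.
 */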
#ifdef CONFIG_SCHED_HMP
#define HMP_VARIABLE_SCALE_SHIFT 16ULL
struct hmp_global_attr {
	struct attribute attr;
	ssize_t (*show)(struct kobject *kobj,
			struct attribute *attr, char *buf);
	ssize_t (*store)(struct kobject *a, struct attribute *b,
			const char *c, size_t count);
	int (*to_sysfs)(int);
	int (*from_sysfs)(int);
	ssize_t (*to_sysfs_text)(char *buf, int buf_size);
};

#define HMP_DATA_SYSFS_MAX 8

struct hmp_data_struct {
#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
	int freqinvar_load_scale_enabled;
#endif
	int multiplier; /* used to scale the time delta */
	struct attribute_group attr_group;
	struct attribute *attributes[HMP_DATA_SYSFS_MAX + 1];
	struct hmp_global_attr attr[HMP_DATA_SYSFS_MAX];
} hmp_data;
static u64 hmp_variable_scale_convert(u64 delta);
#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
/* Frequency-Invariant Load Modification:
 * Loads are calculated as in PJT's patch however we also scale the current
 * contribution in line with the frequency of the CPU that the task was
 * executing on.
 * In this version, we use a simple linear scale derived from the maximum
 * frequency reported by CPUFreq. As an example:
 *
 * Consider that we ran a task for 100% of the previous interval.
 *
 * Our CPU was under asynchronous frequency control through one of the
 * CPUFreq governors.
 *
 * The CPUFreq governor reports that it is able to scale the CPU between
 * 500MHz and 1GHz.
 *
 * During the period, the CPU was running at 1GHz.
 *
 * In this case, our load contribution for that period is calculated as
 * 1 * (number_of_active_microseconds)
 *
 * This results in our task being able to accumulate maximum load as normal.
 *
 * Consider now that our CPU was executing at 500MHz.
 *
 * We now scale the load contribution such that it is calculated as
 * 0.5 * (number_of_active_microseconds)
 *
 * Our task can only record 50% maximum load during this period.
 *
 * This represents the task consuming 50% of the CPU's *possible* compute
 * capacity. However the task did consume 100% of the CPU's *available*
 * compute capacity which is the value seen by the CPUFreq governor and
 * user-side CPU Utilization tools.
 *
 * Restricting tracked load to be scaled by the CPU's frequency accurately
 * represents the consumption of possible compute capacity and allows the
 * HMP migration's simple threshold migration strategy to interact more
 * predictably with CPUFreq's asynchronous compute capacity changes.
 */
#define SCHED_FREQSCALE_SHIFT 10
struct cpufreq_extents {
	u32 curr_scale;
	u32 min;
	u32 max;
	u32 flags;
};
/* Flag set when the governor in use only allows one frequency.
 * Disables scaling.
 */
#define SCHED_LOAD_FREQINVAR_SINGLEFREQ 0x01

static struct cpufreq_extents freq_scale[CONFIG_NR_CPUS];
#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
#endif /* CONFIG_SCHED_HMP */
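/*
 * Editor's illustration (not in the original source): with
 * SCHED_FREQSCALE_SHIFT = 10 the scale factor is expressed in 1024ths.
 * Running at 500MHz on a CPU whose maximum is 1GHz corresponds to
 * curr_scale ~= (500 << 10) / 1000 = 512, so an accrued delta is scaled
 * as (delta * 512) >> 10 = delta / 2, matching the 0.5x example above.
 */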
/* We can represent the historical contribution to runnable average as the
 * coefficients of a geometric series. To do this we sub-divide our runnable
 * history into segments of approximately 1ms (1024us); label the segment that
 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
 *
 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
 *       p0            p1           p2
 *      (now)       (~1ms ago)  (~2ms ago)
 *
 * Let u_i denote the fraction of p_i that the entity was runnable.
 *
 * We then designate the fractions u_i as our co-efficients, yielding the
 * following representation of historical load:
 *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
 *
 * We choose y based on the width of a reasonable scheduling period, fixing:
 *   y^32 = 0.5
 *
 * This means that the contribution to load ~32ms ago (u_32) will be weighted
 * approximately half as much as the contribution to load within the last ms
 * (u_0).
 *
 * When a period "rolls over" and we have new u_0`, multiplying the previous
 * sum again by y is sufficient to update:
 *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
 *            = u_0 + u_1*y + u_2*y^2 + ...  [re-labeling u_i --> u_{i+1}]
 */
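/*
 * Editor's illustration (not in the original source): if the decayed sum of
 * the series so far is 1000 and the entity was runnable for all of the new
 * period (u_0` = 1024), the rolled-over average becomes
 *   load_avg = 1024 + y * 1000 ~= 1024 + 979 = 2003
 * using y ~= 0.97857 from y^32 = 0.5.
 */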
static __always_inline int __update_entity_runnable_avg(u64 now,
							struct sched_avg *sa,
							int runnable,
							int running,
							int cpu)
{
	u64 delta, periods;
	u32 runnable_contrib;
	int delta_w, decayed = 0;
#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
	u64 scaled_delta;
	u32 scaled_runnable_contrib;
	int scaled_delta_w;
	u32 curr_scale = 1024;
#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */

	delta = now - sa->last_runnable_update;
#ifdef CONFIG_SCHED_HMP
	delta = hmp_variable_scale_convert(delta);
#endif
	/*
	 * This should only happen when time goes backwards, which it
	 * unfortunately does during sched clock init when we swap over to TSC.
	 */
	if ((s64)delta < 0) {
		sa->last_runnable_update = now;
		return 0;
	}

	/*
	 * Use 1024ns as the unit of measurement since it's a reasonable
	 * approximation of 1us and fast to compute.
	 */
	delta >>= 10;
	if (!delta)
		return 0;
	sa->last_runnable_update = now;

#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
	/* retrieve scale factor for load */
	if (hmp_data.freqinvar_load_scale_enabled)
		curr_scale = freq_scale[cpu].curr_scale;
#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */

	/* delta_w is the amount already accumulated against our next period */
	delta_w = sa->runnable_avg_period % 1024;
	if (delta + delta_w >= 1024) {
		/* period roll-over */
		decayed = 1;

		/*
		 * Now that we know we're crossing a period boundary, figure
		 * out how much from delta we need to complete the current
		 * period and accrue it.
		 */
		delta_w = 1024 - delta_w;
		/* scale runnable time if necessary */
#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
		scaled_delta_w = (delta_w * curr_scale)
				>> SCHED_FREQSCALE_SHIFT;
		if (runnable)
			sa->runnable_avg_sum += scaled_delta_w;
		if (running)
			sa->usage_avg_sum += scaled_delta_w;
#else
		if (runnable)
			sa->runnable_avg_sum += delta_w;
		if (running)
			sa->usage_avg_sum += delta_w;
#endif /* #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
		sa->runnable_avg_period += delta_w;

		delta -= delta_w;

		/* Figure out how many additional periods this update spans */
		periods = delta / 1024;
		delta %= 1024;
		/* decay the load we have accumulated so far */
		sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
						  periods + 1);
		sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
						     periods + 1);
		sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1);
		/* add the contribution from this period */
		/* Efficiently calculate \Sum (1..n_period) 1024*y^i */
		runnable_contrib = __compute_runnable_contrib(periods);
		/* Apply load scaling if necessary.
		 * Note that multiplying the whole series is the same as
		 * multiplying all terms.
		 */
#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
		scaled_runnable_contrib = (runnable_contrib * curr_scale)
				>> SCHED_FREQSCALE_SHIFT;
		if (runnable)
			sa->runnable_avg_sum += scaled_runnable_contrib;
		if (running)
			sa->usage_avg_sum += scaled_runnable_contrib;
#else
		if (runnable)
			sa->runnable_avg_sum += runnable_contrib;
		if (running)
			sa->usage_avg_sum += runnable_contrib;
#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
		sa->runnable_avg_period += runnable_contrib;
	}

	/* Remainder of delta accrued against u_0` */
	/* scale if necessary */
#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
	scaled_delta = ((delta * curr_scale) >> SCHED_FREQSCALE_SHIFT);
	if (runnable)
		sa->runnable_avg_sum += scaled_delta;
	if (running)
		sa->usage_avg_sum += scaled_delta;
#else
	if (runnable)
		sa->runnable_avg_sum += delta;
	if (running)
		sa->usage_avg_sum += delta;
#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
	sa->runnable_avg_period += delta;

	return decayed;
}
/* Synchronize an entity's decay with its parenting cfs_rq. */
static inline u64 __synchronize_entity_decay(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
	u64 decays = atomic64_read(&cfs_rq->decay_counter);

	decays -= se->avg.decay_count;
	if (!decays)
		return 0;

	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
	se->avg.decay_count = 0;

	return decays;
}

#ifdef CONFIG_FAIR_GROUP_SCHED
static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
						 int force_update)
{
	struct task_group *tg = cfs_rq->tg;
	s64 tg_contrib;

	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
	tg_contrib -= cfs_rq->tg_load_contrib;

	if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
		atomic64_add(tg_contrib, &tg->load_avg);
		cfs_rq->tg_load_contrib += tg_contrib;
	}
}

/*
 * Aggregate cfs_rq runnable averages into an equivalent task_group
 * representation for computing load contributions.
 */
static inline void __update_tg_runnable_avg(struct sched_avg *sa,
						  struct cfs_rq *cfs_rq)
{
	struct task_group *tg = cfs_rq->tg;
	long contrib, usage_contrib;

	/* The fraction of a cpu used by this cfs_rq */
	contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
			  sa->runnable_avg_period + 1);
	contrib -= cfs_rq->tg_runnable_contrib;

	usage_contrib = div_u64(sa->usage_avg_sum << NICE_0_SHIFT,
				sa->runnable_avg_period + 1);
	usage_contrib -= cfs_rq->tg_usage_contrib;

	/*
	 * contrib/usage at this point represent deltas, only update if they
	 * are substantive.
	 */
	if ((abs(contrib) > cfs_rq->tg_runnable_contrib / 64) ||
	    (abs(usage_contrib) > cfs_rq->tg_usage_contrib / 64)) {
		atomic_add(contrib, &tg->runnable_avg);
		cfs_rq->tg_runnable_contrib += contrib;

		atomic_add(usage_contrib, &tg->usage_avg);
		cfs_rq->tg_usage_contrib += usage_contrib;
	}
}
static inline void __update_group_entity_contrib(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq = group_cfs_rq(se);
	struct task_group *tg = cfs_rq->tg;
	int runnable_avg;
	u64 contrib;

	contrib = cfs_rq->tg_load_contrib * tg->shares;
	se->avg.load_avg_contrib = div64_u64(contrib,
					     atomic64_read(&tg->load_avg) + 1);

	/*
	 * For group entities we need to compute a correction term in the case
	 * that they are consuming <1 cpu so that we would contribute the same
	 * load as a task of equal weight.
	 *
	 * Explicitly co-ordinating this measurement would be expensive, but
	 * fortunately the sum of each cpus contribution forms a usable
	 * lower-bound on the true value.
	 *
	 * Consider the aggregate of 2 contributions. Either they are disjoint
	 * (and the sum represents the true value) or they overlap and we are
	 * understating by the aggregate of their overlap.
	 *
	 * Extending this to N cpus, for a given overlap, the maximum amount we
	 * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
	 * cpus that overlap for this interval and w_i is the interval width.
	 *
	 * On a small machine the first term is well-bounded, which bounds the
	 * total error since w_i is a subset of the period. Whereas on a
	 * larger machine, while this first term can be larger, if w_i is of
	 * consequential size n_i*w_i quickly converges to our upper bound of
	 * 1-cpu.
	 */
	runnable_avg = atomic_read(&tg->runnable_avg);
	if (runnable_avg < NICE_0_LOAD) {
		se->avg.load_avg_contrib *= runnable_avg;
		se->avg.load_avg_contrib >>= NICE_0_SHIFT;
	}
}
#else
static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
						 int force_update) {}
static inline void __update_tg_runnable_avg(struct sched_avg *sa,
						  struct cfs_rq *cfs_rq) {}
static inline void __update_group_entity_contrib(struct sched_entity *se) {}
#endif
static inline void __update_task_entity_contrib(struct sched_entity *se)
{
	u32 contrib;

	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
	contrib /= (se->avg.runnable_avg_period + 1);
	se->avg.load_avg_contrib = scale_load(contrib);
	trace_sched_task_load_contrib(task_of(se), se->avg.load_avg_contrib);
	contrib = se->avg.runnable_avg_sum * scale_load_down(NICE_0_LOAD);
	contrib /= (se->avg.runnable_avg_period + 1);
	se->avg.load_avg_ratio = scale_load(contrib);
	trace_sched_task_runnable_ratio(task_of(se), se->avg.load_avg_ratio);
}

/* Compute the current contribution to load_avg by se, return any delta */
static long __update_entity_load_avg_contrib(struct sched_entity *se, long *ratio)
{
	long old_contrib = se->avg.load_avg_contrib;
	long old_ratio = se->avg.load_avg_ratio;

	if (entity_is_task(se)) {
		__update_task_entity_contrib(se);
	} else {
		__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
		__update_group_entity_contrib(se);
	}

	if (ratio)
		*ratio = se->avg.load_avg_ratio - old_ratio;
	return se->avg.load_avg_contrib - old_contrib;
}

static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
						 long load_contrib)
{
	if (likely(load_contrib < cfs_rq->blocked_load_avg))
		cfs_rq->blocked_load_avg -= load_contrib;
	else
		cfs_rq->blocked_load_avg = 0;
}
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);

/* Update a sched_entity's runnable average */
static inline void update_entity_load_avg(struct sched_entity *se,
					  int update_cfs_rq)
{
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
	long contrib_delta, ratio_delta;
	u64 now;
	int cpu = -1;	/* not used in normal case */

#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
	cpu = cfs_rq->rq->cpu;
#endif
	/*
	 * For a group entity we need to use their owned cfs_rq_clock_task() in
	 * case they are the parent of a throttled hierarchy.
	 */
	if (entity_is_task(se))
		now = cfs_rq_clock_task(cfs_rq);
	else
		now = cfs_rq_clock_task(group_cfs_rq(se));

	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq,
					  cfs_rq->curr == se, cpu))
		return;

	contrib_delta = __update_entity_load_avg_contrib(se, &ratio_delta);

	if (!update_cfs_rq)
		return;

	if (se->on_rq) {
		cfs_rq->runnable_load_avg += contrib_delta;
		rq_of(cfs_rq)->avg.load_avg_ratio += ratio_delta;
	} else {
		subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
	}
}

/*
 * Decay the load contributed by all blocked children and account this so that
 * their contribution may be appropriately discounted when they wake up.
 */
static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
{
	u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
	u64 decays;

	decays = now - cfs_rq->last_decay;
	if (!decays && !force_update)
		return;

	if (atomic64_read(&cfs_rq->removed_load)) {
		u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
		subtract_blocked_load_contrib(cfs_rq, removed_load);
	}

	if (decays) {
		cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
						      decays);
		atomic64_add(decays, &cfs_rq->decay_counter);
		cfs_rq->last_decay = now;
	}

	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
}
static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
{
	int cpu = -1;	/* not used in normal case */

#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
	cpu = rq->cpu;
#endif
	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable,
				     runnable, cpu);
	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
	trace_sched_rq_runnable_ratio(cpu_of(rq), rq->avg.load_avg_ratio);
	trace_sched_rq_runnable_load(cpu_of(rq), rq->cfs.runnable_load_avg);
	trace_sched_rq_nr_running(cpu_of(rq), rq->nr_running, rq->nr_iowait.counter);
}
/* Add the load generated by se into cfs_rq's child load-average */
static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
						  struct sched_entity *se,
						  int wakeup)
{
	/*
	 * We track migrations using entity decay_count <= 0, on a wake-up
	 * migration we use a negative decay count to track the remote decays
	 * accumulated while sleeping.
	 */
	if (unlikely(se->avg.decay_count <= 0)) {
		se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
		if (se->avg.decay_count) {
			/*
			 * In a wake-up migration we have to approximate the
			 * time sleeping. This is because we can't synchronize
			 * clock_task between the two cpus, and it is not
			 * guaranteed to be read-safe. Instead, we can
			 * approximate this using our carried decays, which are
			 * explicitly atomically readable.
			 */
			se->avg.last_runnable_update -= (-se->avg.decay_count)
							<< 20;
			update_entity_load_avg(se, 0);
			/* Indicate that we're now synchronized and on-rq */
			se->avg.decay_count = 0;
		}
		wakeup = 0;
	} else {
		__synchronize_entity_decay(se);
	}

	/* migrated tasks did not contribute to our blocked load */
	if (wakeup) {
		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
		update_entity_load_avg(se, 0);
	}

	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
	rq_of(cfs_rq)->avg.load_avg_ratio += se->avg.load_avg_ratio;

	/* we force update consideration on load-balancer moves */
	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
}

/*
 * Remove se's load from this cfs_rq child load-average, if the entity is
 * transitioning to a blocked state we track its projected decay using
 * blocked_load_avg.
 */
static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
						  struct sched_entity *se,
						  int sleep)
{
	update_entity_load_avg(se, 1);
	/* we force update consideration on load-balancer moves */
	update_cfs_rq_blocked_load(cfs_rq, !sleep);

	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
	rq_of(cfs_rq)->avg.load_avg_ratio -= se->avg.load_avg_ratio;

	if (sleep) {
		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
}
/*
 * Update the rq's load with the elapsed running time before entering
 * idle. If the last scheduled task is not a CFS task, idle_enter will
 * be the only way to update the runnable statistic.
 */
void idle_enter_fair(struct rq *this_rq)
{
	update_rq_runnable_avg(this_rq, 1);
}

/*
 * Update the rq's load with the elapsed idle time before a task is
 * scheduled. If the newly scheduled task is not a CFS task, idle_exit will
 * be the only way to update the runnable statistic.
 */
void idle_exit_fair(struct rq *this_rq)
{
	update_rq_runnable_avg(this_rq, 0);
}

#else
static inline void update_entity_load_avg(struct sched_entity *se,
					  int update_cfs_rq) {}
static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
					   struct sched_entity *se,
					   int wakeup) {}
static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
					   struct sched_entity *se,
					   int sleep) {}
static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
					      int force_update) {}
#endif
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
	struct task_struct *tsk = NULL;

	if (entity_is_task(se))
		tsk = task_of(se);

	if (se->statistics.sleep_start) {
		u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->statistics.sleep_max))
			se->statistics.sleep_max = delta;

		se->statistics.sleep_start = 0;
		se->statistics.sum_sleep_runtime += delta;

		if (tsk) {
			account_scheduler_latency(tsk, delta >> 10, 1);
			trace_sched_stat_sleep(tsk, delta);
		}
	}
	if (se->statistics.block_start) {
		u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->statistics.block_max))
			se->statistics.block_max = delta;

		se->statistics.block_start = 0;
		se->statistics.sum_sleep_runtime += delta;

		if (tsk) {
			if (tsk->in_iowait) {
				se->statistics.iowait_sum += delta;
				se->statistics.iowait_count++;
				trace_sched_stat_iowait(tsk, delta);
			}

			trace_sched_stat_blocked(tsk, delta);

			/*
			 * Blocking time is in units of nanosecs, so shift by
			 * 20 to get a milliseconds-range estimation of the
			 * amount of time that the task spent sleeping:
			 */
			if (unlikely(prof_on == SLEEP_PROFILING)) {
				profile_hits(SLEEP_PROFILING,
						(void *)get_wchan(tsk),
						delta >> 20);
			}
			account_scheduler_latency(tsk, delta >> 10, 0);
		}
	}
#endif
}

static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	s64 d = se->vruntime - cfs_rq->min_vruntime;

	if (d < 0)
		d = -d;

	if (d > 3*sysctl_sched_latency)
		schedstat_inc(cfs_rq, nr_spread_over);
#endif
}
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
	u64 vruntime = cfs_rq->min_vruntime;

	/*
	 * The 'current' period is already promised to the current tasks,
	 * however the extra weight of the new task will slow them down a
	 * little, place the new task so that it fits in the slot that
	 * stays open at the end.
	 */
	if (initial && sched_feat(START_DEBIT))
		vruntime += sched_vslice(cfs_rq, se);

	/* sleeps up to a single latency don't count. */
	if (!initial) {
		unsigned long thresh = sysctl_sched_latency;

		/*
		 * Halve their sleep time's effect, to allow
		 * for a gentler effect of sleepers:
		 */
		if (sched_feat(GENTLE_FAIR_SLEEPERS))
			thresh >>= 1;

		vruntime -= thresh;
	}

	/* ensure we never gain time by being placed backwards. */
	se->vruntime = max_vruntime(se->vruntime, vruntime);
}
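/*
 * Editor's illustration (not in the original source): with min_vruntime at
 * 1000ms, a freshly forked task under START_DEBIT is placed one vslice
 * behind (e.g. 1000ms + 3ms), so it queues behind the current tasks, while
 * a task waking from a long sleep is placed half a latency ahead (1000ms -
 * 3ms under GENTLE_FAIR_SLEEPERS with 6ms latency), giving it a modest
 * wake-up boost without letting it hoard credit from its sleep.
 */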
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);

static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	/*
	 * Update the normalized vruntime before updating min_vruntime
	 * through calling update_curr().
	 */
	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
		se->vruntime += cfs_rq->min_vruntime;

	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);
	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
	account_entity_enqueue(cfs_rq, se);
	update_cfs_shares(cfs_rq);

	if (flags & ENQUEUE_WAKEUP) {
		place_entity(cfs_rq, se, 0);
		enqueue_sleeper(cfs_rq, se);
	}

	update_stats_enqueue(cfs_rq, se);
	check_spread(cfs_rq, se);
	if (se != cfs_rq->curr)
		__enqueue_entity(cfs_rq, se);
	se->on_rq = 1;

	if (cfs_rq->nr_running == 1) {
		list_add_leaf_cfs_rq(cfs_rq);
		check_enqueue_throttle(cfs_rq);
	}
}
static void __clear_buddies_last(struct sched_entity *se)
{
	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);
		if (cfs_rq->last == se)
			cfs_rq->last = NULL;
		else
			break;
	}
}

static void __clear_buddies_next(struct sched_entity *se)
{
	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);
		if (cfs_rq->next == se)
			cfs_rq->next = NULL;
		else
			break;
	}
}

static void __clear_buddies_skip(struct sched_entity *se)
{
	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);
		if (cfs_rq->skip == se)
			cfs_rq->skip = NULL;
		else
			break;
	}
}

static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->last == se)
		__clear_buddies_last(se);

	if (cfs_rq->next == se)
		__clear_buddies_next(se);

	if (cfs_rq->skip == se)
		__clear_buddies_skip(se);
}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);

static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);
	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);

	update_stats_dequeue(cfs_rq, se);
	if (flags & DEQUEUE_SLEEP) {
#ifdef CONFIG_SCHEDSTATS
		if (entity_is_task(se)) {
			struct task_struct *tsk = task_of(se);

			if (tsk->state & TASK_INTERRUPTIBLE)
				se->statistics.sleep_start = rq_of(cfs_rq)->clock;
			if (tsk->state & TASK_UNINTERRUPTIBLE)
				se->statistics.block_start = rq_of(cfs_rq)->clock;
		}
#endif
	}

	clear_buddies(cfs_rq, se);

	if (se != cfs_rq->curr)
		__dequeue_entity(cfs_rq, se);
	se->on_rq = 0;
	account_entity_dequeue(cfs_rq, se);

	/*
	 * Normalize the entity after updating the min_vruntime because the
	 * update can refer to the ->curr item and we need to reflect this
	 * movement in our normalized position.
	 */
	if (!(flags & DEQUEUE_SLEEP))
		se->vruntime -= cfs_rq->min_vruntime;

	/* return excess runtime on last dequeue */
	return_cfs_rq_runtime(cfs_rq);

	update_min_vruntime(cfs_rq);
	update_cfs_shares(cfs_rq);
}
/*
 * Preempt the current task with a newly woken task if needed:
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	unsigned long ideal_runtime, delta_exec;
	struct sched_entity *se;
	s64 delta;

	ideal_runtime = sched_slice(cfs_rq, curr);
	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
	if (delta_exec > ideal_runtime) {
		resched_task(rq_of(cfs_rq)->curr);
		/*
		 * The current task ran long enough, ensure it doesn't get
		 * re-elected due to buddy favours.
		 */
		clear_buddies(cfs_rq, curr);
		return;
	}

	/*
	 * Ensure that a task that missed wakeup preemption by a
	 * narrow margin doesn't have to wait for a full slice.
	 * This also mitigates buddy induced latencies under load.
	 */
	if (delta_exec < sysctl_sched_min_granularity)
		return;

	se = __pick_first_entity(cfs_rq);
	delta = curr->vruntime - se->vruntime;

	if (delta < 0)
		return;

	if (delta > ideal_runtime)
		resched_task(rq_of(cfs_rq)->curr);
}
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/* 'current' is not kept within the tree. */
	if (se->on_rq) {
		/*
		 * Any task has to be enqueued before it gets to execute on
		 * a CPU. So account for the time it spent waiting on the
		 * runqueue.
		 */
		update_stats_wait_end(cfs_rq, se);
		__dequeue_entity(cfs_rq, se);
		update_entity_load_avg(se, 1);
	}

	update_stats_curr_start(cfs_rq, se);
	cfs_rq->curr = se;
#ifdef CONFIG_SCHEDSTATS
	/*
	 * Track our maximum slice length, if the CPU's load is at
	 * least twice that of our own weight (i.e. don't track it
	 * when there are only lesser-weight tasks around):
	 */
	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
		se->statistics.slice_max = max(se->statistics.slice_max,
			se->sum_exec_runtime - se->prev_sum_exec_runtime);
	}
#endif
	se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);

/*
 * Pick the next process, keeping these things in mind, in this order:
 * 1) keep things fair between processes/task groups
 * 2) pick the "next" process, since someone really wants that to run
 * 3) pick the "last" process, for cache locality
 * 4) do not run the "skip" process, if something else is available
 */
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
	struct sched_entity *se = __pick_first_entity(cfs_rq);
	struct sched_entity *left = se;

	/*
	 * Avoid running the skip buddy, if running something else can
	 * be done without getting too unfair.
	 */
	if (cfs_rq->skip == se) {
		struct sched_entity *second = __pick_next_entity(se);
		if (second && wakeup_preempt_entity(second, left) < 1)
			se = second;
	}

	/*
	 * Prefer last buddy, try to return the CPU to a preempted task.
	 */
	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
		se = cfs_rq->last;

	/*
	 * Someone really wants this to run. If it's not unfair, run it.
	 */
	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
		se = cfs_rq->next;

	clear_buddies(cfs_rq, se);

	return se;
}
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);

static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
	/*
	 * If still on the runqueue then deactivate_task()
	 * was not called and update_curr() has to be done:
	 */
	if (prev->on_rq)
		update_curr(cfs_rq);

	/* throttle cfs_rqs exceeding runtime */
	check_cfs_rq_runtime(cfs_rq);

	check_spread(cfs_rq, prev);
	if (prev->on_rq) {
		update_stats_wait_start(cfs_rq, prev);
		/* Put 'current' back into the tree. */
		__enqueue_entity(cfs_rq, prev);
		/* in !on_rq case, update occurred at dequeue */
		update_entity_load_avg(prev, 1);
	}
	cfs_rq->curr = NULL;
}

static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

	/*
	 * Ensure that runnable average is periodically updated.
	 */
	update_entity_load_avg(curr, 1);
	update_cfs_rq_blocked_load(cfs_rq, 1);
	update_cfs_shares(cfs_rq);

#ifdef CONFIG_SCHED_HRTICK
	/*
	 * queued ticks are scheduled to match the slice, so don't bother
	 * validating it and just reschedule.
	 */
	if (queued) {
		resched_task(rq_of(cfs_rq)->curr);
		return;
	}
	/*
	 * don't let the period tick interfere with the hrtick preemption
	 */
	if (!sched_feat(DOUBLE_TICK) &&
			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
		return;
#endif

	if (cfs_rq->nr_running > 1)
		check_preempt_tick(cfs_rq, curr);
}
2200 /**************************************************
2201 * CFS bandwidth control machinery
2204 #ifdef CONFIG_CFS_BANDWIDTH
2206 #ifdef HAVE_JUMP_LABEL
2207 static struct static_key __cfs_bandwidth_used;
2209 static inline bool cfs_bandwidth_used(void)
2211 return static_key_false(&__cfs_bandwidth_used);
2214 void account_cfs_bandwidth_used(int enabled, int was_enabled)
2216 /* only need to count groups transitioning between enabled/!enabled */
2217 if (enabled && !was_enabled)
2218 static_key_slow_inc(&__cfs_bandwidth_used);
2219 else if (!enabled && was_enabled)
2220 static_key_slow_dec(&__cfs_bandwidth_used);
2222 #else /* HAVE_JUMP_LABEL */
2223 static bool cfs_bandwidth_used(void)
2228 void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
2229 #endif /* HAVE_JUMP_LABEL */
2232 * default period for cfs group bandwidth.
2233 * default: 0.1s, units: nanoseconds
2235 static inline u64 default_cfs_period(void)
2237 return 100000000ULL;
2240 static inline u64 sched_cfs_bandwidth_slice(void)
2242 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
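/*
 * Illustrative example (added sketch, assuming the cpu controller is
 * mounted at /sys/fs/cgroup/cpu): a group limited to half a CPU maps
 * onto these values as
 *
 *   echo 100000 > /sys/fs/cgroup/cpu/grp/cpu.cfs_period_us   (period = 100ms)
 *   echo  50000 > /sys/fs/cgroup/cpu/grp/cpu.cfs_quota_us    (quota  = 50ms)
 *
 * and each local cfs_rq then draws runtime from the global pool in
 * slices of sched_cfs_bandwidth_slice() (5ms with the default sysctl).
 */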
2246 * Replenish runtime according to assigned quota and update expiration time.
2247 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
2248 * additional synchronization around rq->lock.
2250 * requires cfs_b->lock
2252 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
2256 if (cfs_b->quota == RUNTIME_INF)
2259 now = sched_clock_cpu(smp_processor_id());
2260 cfs_b->runtime = cfs_b->quota;
2261 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
2264 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2266 return &tg->cfs_bandwidth;
2269 /* rq->clock_task normalized against any time this cfs_rq has spent throttled */
2270 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2272 if (unlikely(cfs_rq->throttle_count))
2273 return cfs_rq->throttled_clock_task;
2275 return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
2278 /* returns 0 on failure to allocate runtime */
2279 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2281 struct task_group *tg = cfs_rq->tg;
2282 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
2283 u64 amount = 0, min_amount, expires;
2285 /* note: this is a positive sum as runtime_remaining <= 0 */
2286 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
2288 raw_spin_lock(&cfs_b->lock);
2289 if (cfs_b->quota == RUNTIME_INF)
2290 amount = min_amount;
2293 * If the bandwidth pool has become inactive, then at least one
2294 * period must have elapsed since the last consumption.
2295 * Refresh the global state and ensure the bandwidth timer becomes
2296 * active.
2298 if (!cfs_b->timer_active) {
2299 __refill_cfs_bandwidth_runtime(cfs_b);
2300 __start_cfs_bandwidth(cfs_b);
2303 if (cfs_b->runtime > 0) {
2304 amount = min(cfs_b->runtime, min_amount);
2305 cfs_b->runtime -= amount;
2309 expires = cfs_b->runtime_expires;
2310 raw_spin_unlock(&cfs_b->lock);
2312 cfs_rq->runtime_remaining += amount;
2314 * we may have advanced our local expiration to account for allowed
2315 * spread between our sched_clock and the one on which runtime was
2316 * issued.
2318 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
2319 cfs_rq->runtime_expires = expires;
2321 return cfs_rq->runtime_remaining > 0;
2325 * Note: This depends on the synchronization provided by sched_clock and the
2326 * fact that rq->clock snapshots this value.
2328 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2330 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2331 struct rq *rq = rq_of(cfs_rq);
2333 /* if the deadline is ahead of our clock, nothing to do */
2334 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
2337 if (cfs_rq->runtime_remaining < 0)
2341 * If the local deadline has passed we have to consider the
2342 * possibility that our sched_clock is 'fast' and the global deadline
2343 * has not truly expired.
2345 * Fortunately we can determine whether this is the case by checking
2346 * whether the global deadline has advanced.
2349 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
2350 /* extend local deadline, drift is bounded above by 2 ticks */
2351 cfs_rq->runtime_expires += TICK_NSEC;
2353 /* global deadline is ahead, expiration has passed */
2354 cfs_rq->runtime_remaining = 0;
2358 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2359 unsigned long delta_exec)
2361 /* dock delta_exec before expiring quota (as it could span periods) */
2362 cfs_rq->runtime_remaining -= delta_exec;
2363 expire_cfs_rq_runtime(cfs_rq);
2365 if (likely(cfs_rq->runtime_remaining > 0))
2369 * if we're unable to extend our runtime we resched so that the active
2370 * hierarchy can be throttled
2372 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
2373 resched_task(rq_of(cfs_rq)->curr);
2376 static __always_inline
2377 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
2379 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
2382 __account_cfs_rq_runtime(cfs_rq, delta_exec);
2385 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2387 return cfs_bandwidth_used() && cfs_rq->throttled;
2390 /* check whether cfs_rq, or any parent, is throttled */
2391 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2393 return cfs_bandwidth_used() && cfs_rq->throttle_count;
2397 * Ensure that neither of the group entities corresponding to src_cpu or
2398 * dest_cpu is a member of a throttled hierarchy when performing group
2399 * load-balance operations.
2401 static inline int throttled_lb_pair(struct task_group *tg,
2402 int src_cpu, int dest_cpu)
2404 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
2406 src_cfs_rq = tg->cfs_rq[src_cpu];
2407 dest_cfs_rq = tg->cfs_rq[dest_cpu];
2409 return throttled_hierarchy(src_cfs_rq) ||
2410 throttled_hierarchy(dest_cfs_rq);
2413 /* updated child weight may affect parent so we have to do this bottom up */
2414 static int tg_unthrottle_up(struct task_group *tg, void *data)
2416 struct rq *rq = data;
2417 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2419 cfs_rq->throttle_count--;
2421 if (!cfs_rq->throttle_count) {
2422 /* adjust cfs_rq_clock_task() */
2423 cfs_rq->throttled_clock_task_time += rq->clock_task -
2424 cfs_rq->throttled_clock_task;
2431 static int tg_throttle_down(struct task_group *tg, void *data)
2433 struct rq *rq = data;
2434 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2436 /* group is entering throttled state, stop time */
2437 if (!cfs_rq->throttle_count)
2438 cfs_rq->throttled_clock_task = rq->clock_task;
2439 cfs_rq->throttle_count++;
2444 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
2446 struct rq *rq = rq_of(cfs_rq);
2447 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2448 struct sched_entity *se;
2449 long task_delta, dequeue = 1;
2451 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2453 /* freeze hierarchy runnable averages while throttled */
2455 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
2458 task_delta = cfs_rq->h_nr_running;
2459 for_each_sched_entity(se) {
2460 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
2461 /* throttled entity or throttle-on-deactivate */
2466 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
2467 qcfs_rq->h_nr_running -= task_delta;
2469 if (qcfs_rq->load.weight)
2474 rq->nr_running -= task_delta;
2476 cfs_rq->throttled = 1;
2477 cfs_rq->throttled_clock = rq->clock;
2478 raw_spin_lock(&cfs_b->lock);
2479 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
2480 raw_spin_unlock(&cfs_b->lock);
2483 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
2485 struct rq *rq = rq_of(cfs_rq);
2486 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2487 struct sched_entity *se;
2491 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2493 cfs_rq->throttled = 0;
2494 raw_spin_lock(&cfs_b->lock);
2495 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
2496 list_del_rcu(&cfs_rq->throttled_list);
2497 raw_spin_unlock(&cfs_b->lock);
2499 update_rq_clock(rq);
2500 /* update hierarchical throttle state */
2501 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
2503 if (!cfs_rq->load.weight)
2506 task_delta = cfs_rq->h_nr_running;
2507 for_each_sched_entity(se) {
2511 cfs_rq = cfs_rq_of(se);
2513 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
2514 cfs_rq->h_nr_running += task_delta;
2516 if (cfs_rq_throttled(cfs_rq))
2521 rq->nr_running += task_delta;
2523 /* determine whether we need to wake up potentially idle cpu */
2524 if (rq->curr == rq->idle && rq->cfs.nr_running)
2525 resched_task(rq->curr);
2528 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
2529 u64 remaining, u64 expires)
2531 struct cfs_rq *cfs_rq;
2532 u64 runtime = remaining;
2535 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
2537 struct rq *rq = rq_of(cfs_rq);
2539 raw_spin_lock(&rq->lock);
2540 if (!cfs_rq_throttled(cfs_rq))
2543 runtime = -cfs_rq->runtime_remaining + 1;
2544 if (runtime > remaining)
2545 runtime = remaining;
2546 remaining -= runtime;
2548 cfs_rq->runtime_remaining += runtime;
2549 cfs_rq->runtime_expires = expires;
2551 /* we check whether we're throttled above */
2552 if (cfs_rq->runtime_remaining > 0)
2553 unthrottle_cfs_rq(cfs_rq);
2556 raw_spin_unlock(&rq->lock);
2567 * Responsible for refilling a task_group's bandwidth and unthrottling its
2568 * cfs_rqs as appropriate. If there has been no activity within the last
2569 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
2570 * used to track this state.
2572 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
2574 u64 runtime, runtime_expires;
2575 int idle = 1, throttled;
2577 raw_spin_lock(&cfs_b->lock);
2578 /* no need to continue the timer with no bandwidth constraint */
2579 if (cfs_b->quota == RUNTIME_INF)
2582 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2583 /* idle depends on !throttled (for the case of a large deficit) */
2584 idle = cfs_b->idle && !throttled;
2585 cfs_b->nr_periods += overrun;
2587 /* if we're going inactive then everything else can be deferred */
2591 __refill_cfs_bandwidth_runtime(cfs_b);
2594 /* mark as potentially idle for the upcoming period */
2599 /* account preceding periods in which throttling occurred */
2600 cfs_b->nr_throttled += overrun;
2603 * There are throttled entities so we must first use the new bandwidth
2604 * to unthrottle them before making it generally available. This
2605 * ensures that all existing debts will be paid before a new cfs_rq is
2606 * allowed to run.
2608 runtime = cfs_b->runtime;
2609 runtime_expires = cfs_b->runtime_expires;
2613 * This check is repeated as we are holding onto the new bandwidth
2614 * while we unthrottle. This can potentially race with an unthrottled
2615 * group trying to acquire new bandwidth from the global pool.
2617 while (throttled && runtime > 0) {
2618 raw_spin_unlock(&cfs_b->lock);
2619 /* we can't nest cfs_b->lock while distributing bandwidth */
2620 runtime = distribute_cfs_runtime(cfs_b, runtime,
2622 raw_spin_lock(&cfs_b->lock);
2624 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2627 /* return (any) remaining runtime */
2628 cfs_b->runtime = runtime;
2630 * While we are ensured activity in the period following an
2631 * unthrottle, this also covers the case in which the new bandwidth is
2632 * insufficient to cover the existing bandwidth deficit. (Forcing the
2633 * timer to remain active while there are any throttled entities.)
2638 cfs_b->timer_active = 0;
2639 raw_spin_unlock(&cfs_b->lock);
2644 /* a cfs_rq won't donate quota below this amount */
2645 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
2646 /* minimum remaining period time to redistribute slack quota */
2647 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
2648 /* how long we wait to gather additional slack before distributing */
2649 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
2651 /* are we near the end of the current quota period? */
2652 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
2654 struct hrtimer *refresh_timer = &cfs_b->period_timer;
2657 /* if the call-back is running, a quota refresh is already occurring */
2658 if (hrtimer_callback_running(refresh_timer))
2661 /* is a quota refresh about to occur? */
2662 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
2663 if (remaining < min_expire)
2669 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
2671 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
2673 /* if there's a quota refresh soon don't bother with slack */
2674 if (runtime_refresh_within(cfs_b, min_left))
2677 start_bandwidth_timer(&cfs_b->slack_timer,
2678 ns_to_ktime(cfs_bandwidth_slack_period));
2681 /* we know any runtime found here is valid as update_curr() precedes return */
2682 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2684 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2685 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
2687 if (slack_runtime <= 0)
2690 raw_spin_lock(&cfs_b->lock);
2691 if (cfs_b->quota != RUNTIME_INF &&
2692 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
2693 cfs_b->runtime += slack_runtime;
2695 /* we are under rq->lock, defer unthrottling using a timer */
2696 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
2697 !list_empty(&cfs_b->throttled_cfs_rq))
2698 start_cfs_slack_bandwidth(cfs_b);
2700 raw_spin_unlock(&cfs_b->lock);
2702 /* even if it's not valid for return we don't want to try again */
2703 cfs_rq->runtime_remaining -= slack_runtime;
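/*
 * Worked example (added sketch): a cfs_rq left with 3ms of local
 * runtime donates slack_runtime = 3ms - min_cfs_rq_runtime = 2ms back
 * to the global pool above and keeps the remaining 1ms for itself.
 */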
2706 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2708 if (!cfs_bandwidth_used())
2711 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
2714 __return_cfs_rq_runtime(cfs_rq);
2718 * This is done with a timer (instead of inline with bandwidth return) since
2719 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
2721 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
2723 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
2726 /* confirm we're still not at a refresh boundary */
2727 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
2730 raw_spin_lock(&cfs_b->lock);
2731 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
2732 runtime = cfs_b->runtime;
2735 expires = cfs_b->runtime_expires;
2736 raw_spin_unlock(&cfs_b->lock);
2741 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
2743 raw_spin_lock(&cfs_b->lock);
2744 if (expires == cfs_b->runtime_expires)
2745 cfs_b->runtime = runtime;
2746 raw_spin_unlock(&cfs_b->lock);
2750 * When a group wakes up we want to make sure that its quota is not already
2751 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
2752 * runtime, as update_curr() throttling cannot trigger until it's on-rq.
2754 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
2756 if (!cfs_bandwidth_used())
2759 /* an active group must be handled by the update_curr()->put() path */
2760 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
2763 /* ensure the group is not already throttled */
2764 if (cfs_rq_throttled(cfs_rq))
2767 /* update runtime allocation */
2768 account_cfs_rq_runtime(cfs_rq, 0);
2769 if (cfs_rq->runtime_remaining <= 0)
2770 throttle_cfs_rq(cfs_rq);
2773 /* conditionally throttle active cfs_rq's from put_prev_entity() */
2774 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2776 if (!cfs_bandwidth_used())
2779 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
2783 * it's possible for a throttled entity to be forced into a running
2784 * state (e.g. set_curr_task); in this case we're finished.
2786 if (cfs_rq_throttled(cfs_rq))
2789 throttle_cfs_rq(cfs_rq);
2792 static inline u64 default_cfs_period(void);
2793 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
2794 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
2796 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
2798 struct cfs_bandwidth *cfs_b =
2799 container_of(timer, struct cfs_bandwidth, slack_timer);
2800 do_sched_cfs_slack_timer(cfs_b);
2802 return HRTIMER_NORESTART;
2805 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
2807 struct cfs_bandwidth *cfs_b =
2808 container_of(timer, struct cfs_bandwidth, period_timer);
2814 now = hrtimer_cb_get_time(timer);
2815 overrun = hrtimer_forward(timer, now, cfs_b->period);
2820 idle = do_sched_cfs_period_timer(cfs_b, overrun);
2823 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2826 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2828 raw_spin_lock_init(&cfs_b->lock);
2830 cfs_b->quota = RUNTIME_INF;
2831 cfs_b->period = ns_to_ktime(default_cfs_period());
2833 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2834 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2835 cfs_b->period_timer.function = sched_cfs_period_timer;
2836 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2837 cfs_b->slack_timer.function = sched_cfs_slack_timer;
2840 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2842 cfs_rq->runtime_enabled = 0;
2843 INIT_LIST_HEAD(&cfs_rq->throttled_list);
2846 /* requires cfs_b->lock, may release to reprogram timer */
2847 void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2850 * The timer may be active because we're trying to set a new bandwidth
2851 * period or because we're racing with the tear-down path
2852 * (timer_active==0 becomes visible before the hrtimer call-back
2853 * terminates). In either case we ensure that it's re-programmed
2855 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2856 raw_spin_unlock(&cfs_b->lock);
2857 /* ensure cfs_b->lock is available while we wait */
2858 hrtimer_cancel(&cfs_b->period_timer);
2860 raw_spin_lock(&cfs_b->lock);
2861 /* if someone else restarted the timer then we're done */
2862 if (cfs_b->timer_active)
2866 cfs_b->timer_active = 1;
2867 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2870 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2872 hrtimer_cancel(&cfs_b->period_timer);
2873 hrtimer_cancel(&cfs_b->slack_timer);
2876 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
2878 struct cfs_rq *cfs_rq;
2880 for_each_leaf_cfs_rq(rq, cfs_rq) {
2881 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2883 if (!cfs_rq->runtime_enabled)
2887 * clock_task is not advancing so we just need to make sure
2888 * there's some valid quota amount
2890 cfs_rq->runtime_remaining = cfs_b->quota;
2891 if (cfs_rq_throttled(cfs_rq))
2892 unthrottle_cfs_rq(cfs_rq);
2896 #else /* CONFIG_CFS_BANDWIDTH */
2897 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2899 return rq_of(cfs_rq)->clock_task;
2902 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2903 unsigned long delta_exec) {}
2904 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2905 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2906 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2908 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2913 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2918 static inline int throttled_lb_pair(struct task_group *tg,
2919 int src_cpu, int dest_cpu)
2924 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2926 #ifdef CONFIG_FAIR_GROUP_SCHED
2927 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2930 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2934 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2935 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
2937 #endif /* CONFIG_CFS_BANDWIDTH */
2939 /**************************************************
2940 * CFS operations on tasks:
2943 #ifdef CONFIG_SCHED_HRTICK
2944 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
2946 struct sched_entity *se = &p->se;
2947 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2949 WARN_ON(task_rq(p) != rq);
2951 if (cfs_rq->nr_running > 1) {
2952 u64 slice = sched_slice(cfs_rq, se);
2953 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
2954 s64 delta = slice - ran;
2963 * Don't schedule slices shorter than 10000ns, that just
2964 * doesn't make sense. Rely on vruntime for fairness.
2967 delta = max_t(s64, 10000LL, delta);
2969 hrtick_start(rq, delta);
2974 * called from enqueue/dequeue and updates the hrtick when the
2975 * current task is from our class and nr_running is low enough
2976 * to matter.
2978 static void hrtick_update(struct rq *rq)
2980 struct task_struct *curr = rq->curr;
2982 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
2985 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
2986 hrtick_start_fair(rq, curr);
2988 #else /* !CONFIG_SCHED_HRTICK */
2990 hrtick_start_fair(struct rq *rq, struct task_struct *p)
2994 static inline void hrtick_update(struct rq *rq)
3000 * The enqueue_task method is called before nr_running is
3001 * increased. Here we update the fair scheduling stats and
3002 * then put the task into the rbtree:
3005 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3007 struct cfs_rq *cfs_rq;
3008 struct sched_entity *se = &p->se;
3010 for_each_sched_entity(se) {
3013 cfs_rq = cfs_rq_of(se);
3014 enqueue_entity(cfs_rq, se, flags);
3017 * end evaluation on encountering a throttled cfs_rq
3019 * note: in the case of encountering a throttled cfs_rq we will
3020 * post the final h_nr_running increment below.
3022 if (cfs_rq_throttled(cfs_rq))
3024 cfs_rq->h_nr_running++;
3026 flags = ENQUEUE_WAKEUP;
3029 for_each_sched_entity(se) {
3030 cfs_rq = cfs_rq_of(se);
3031 cfs_rq->h_nr_running++;
3033 if (cfs_rq_throttled(cfs_rq))
3036 update_cfs_shares(cfs_rq);
3037 update_entity_load_avg(se, 1);
3041 update_rq_runnable_avg(rq, rq->nr_running);
3047 static void set_next_buddy(struct sched_entity *se);
3050 * The dequeue_task method is called before nr_running is
3051 * decreased. We remove the task from the rbtree and
3052 * update the fair scheduling stats:
3054 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3056 struct cfs_rq *cfs_rq;
3057 struct sched_entity *se = &p->se;
3058 int task_sleep = flags & DEQUEUE_SLEEP;
3060 for_each_sched_entity(se) {
3061 cfs_rq = cfs_rq_of(se);
3062 dequeue_entity(cfs_rq, se, flags);
3065 * end evaluation on encountering a throttled cfs_rq
3067 * note: in the case of encountering a throttled cfs_rq we will
3068 * post the final h_nr_running decrement below.
3070 if (cfs_rq_throttled(cfs_rq))
3072 cfs_rq->h_nr_running--;
3074 /* Don't dequeue parent if it has other entities besides us */
3075 if (cfs_rq->load.weight) {
3077 * Bias pick_next to pick a task from this cfs_rq, as
3078 * p is sleeping when it is within its sched_slice.
3080 if (task_sleep && parent_entity(se))
3081 set_next_buddy(parent_entity(se));
3083 /* avoid re-evaluating load for this entity */
3084 se = parent_entity(se);
3087 flags |= DEQUEUE_SLEEP;
3090 for_each_sched_entity(se) {
3091 cfs_rq = cfs_rq_of(se);
3092 cfs_rq->h_nr_running--;
3094 if (cfs_rq_throttled(cfs_rq))
3097 update_cfs_shares(cfs_rq);
3098 update_entity_load_avg(se, 1);
3103 update_rq_runnable_avg(rq, 1);
3109 /* Used instead of source_load when we know the type == 0 */
3110 static unsigned long weighted_cpuload(const int cpu)
3112 return cpu_rq(cpu)->load.weight;
3116 * Return a low guess at the load of a migration-source cpu weighted
3117 * according to the scheduling class and "nice" value.
3119 * We want to under-estimate the load of migration sources, to
3120 * balance conservatively.
3122 static unsigned long source_load(int cpu, int type)
3124 struct rq *rq = cpu_rq(cpu);
3125 unsigned long total = weighted_cpuload(cpu);
3127 if (type == 0 || !sched_feat(LB_BIAS))
3130 return min(rq->cpu_load[type-1], total);
3134 * Return a high guess at the load of a migration-target cpu weighted
3135 * according to the scheduling class and "nice" value.
3137 static unsigned long target_load(int cpu, int type)
3139 struct rq *rq = cpu_rq(cpu);
3140 unsigned long total = weighted_cpuload(cpu);
3142 if (type == 0 || !sched_feat(LB_BIAS))
3145 return max(rq->cpu_load[type-1], total);
3148 static unsigned long power_of(int cpu)
3150 return cpu_rq(cpu)->cpu_power;
3153 static unsigned long cpu_avg_load_per_task(int cpu)
3155 struct rq *rq = cpu_rq(cpu);
3156 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
3159 return rq->load.weight / nr_running;
3165 static void task_waking_fair(struct task_struct *p)
3167 struct sched_entity *se = &p->se;
3168 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3171 #ifndef CONFIG_64BIT
3172 u64 min_vruntime_copy;
3175 min_vruntime_copy = cfs_rq->min_vruntime_copy;
3177 min_vruntime = cfs_rq->min_vruntime;
3178 } while (min_vruntime != min_vruntime_copy);
3180 min_vruntime = cfs_rq->min_vruntime;
3183 se->vruntime -= min_vruntime;
3186 #ifdef CONFIG_FAIR_GROUP_SCHED
3188 * effective_load() calculates the load change as seen from the root_task_group
3190 * Adding load to a group doesn't make a group heavier, but can cause movement
3191 * of group shares between cpus. Assuming the shares were perfectly aligned one
3192 * can calculate the shift in shares.
3194 * Calculate the effective load difference if @wl is added (subtracted) to @tg
3195 * on this @cpu and results in a total addition (subtraction) of @wg to the
3196 * total group weight.
3198 * Given a runqueue weight distribution (rw_i) we can compute a shares
3199 * distribution (s_i) using:
3201 * s_i = rw_i / \Sum rw_j (1)
3203 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
3204 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
3205 * shares distribution (s_i):
3207 * rw_i = { 2, 4, 1, 0 }
3208 * s_i = { 2/7, 4/7, 1/7, 0 }
3210 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
3211 * task used to run on and the CPU the waker is running on), we need to
3212 * compute the effect of waking a task on either CPU and, in case of a sync
3213 * wakeup, compute the effect of the current task going to sleep.
3215 * So for a change of @wl to the local @cpu with an overall group weight change
3216 * of @wl we can compute the new shares distribution (s'_i) using:
3218 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
3220 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
3221 * differences in waking a task to CPU 0. The additional task changes the
3222 * weight and shares distributions like:
3224 * rw'_i = { 3, 4, 1, 0 }
3225 * s'_i = { 3/8, 4/8, 1/8, 0 }
3227 * We can then compute the difference in effective weight by using:
3229 * dw_i = S * (s'_i - s_i) (3)
3231 * Where 'S' is the group weight as seen by its parent.
3233 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
3234 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
3235 * 4/7) times the weight of the group.
3237 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3239 struct sched_entity *se = tg->se[cpu];
3241 if (!tg->parent) /* the trivial, non-cgroup case */
3244 for_each_sched_entity(se) {
3250 * W = @wg + \Sum rw_j
3252 W = wg + calc_tg_weight(tg, se->my_q);
3257 w = se->my_q->load.weight + wl;
3260 * wl = S * s'_i; see (2)
3263 wl = (w * tg->shares) / W;
3268 * Per the above, wl is the new se->load.weight value; since
3269 * those are clipped to [MIN_SHARES, ...) do so now. See
3270 * calc_cfs_shares().
3272 if (wl < MIN_SHARES)
3276 * wl = dw_i = S * (s'_i - s_i); see (3)
3278 wl -= se->load.weight;
3281 * Recursively apply this logic to all parent groups to compute
3282 * the final effective load change on the root group. Since
3283 * only the @tg group gets extra weight, all parent groups can
3284 * only redistribute existing shares. @wl is the shift in shares
3285 * resulting from this level per the above.
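/*
 * Illustrative sketch (added example, never called by the scheduler):
 * an integer-arithmetic version of the worked example in the comment
 * above effective_load(), with rw_i = { 2, 4, 1, 0 } and one extra
 * task of the same unit weight waking on CPU 0. All identifiers are
 * local to the example.
 */
static long __maybe_unused effective_load_example(void)
{
	long S = 1024;			/* group weight seen by the parent */
	long rw0 = 2, sum_rw = 7;	/* rw_0 and \Sum rw_j, in task weights */
	long wl = 1, wg = 1;		/* weight added on CPU 0 / to the group */
	long s_old = S * rw0 / sum_rw;			/* S * 2/7, see (1) */
	long s_new = S * (rw0 + wl) / (sum_rw + wg);	/* S * 3/8, see (2) */

	return s_new - s_old;		/* dw_0 ~= S * 5/56, see (3) */
}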
3294 static inline unsigned long effective_load(struct task_group *tg, int cpu,
3295 unsigned long wl, unsigned long wg)
3302 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
3304 s64 this_load, load;
3305 int idx, this_cpu, prev_cpu;
3306 unsigned long tl_per_task;
3307 struct task_group *tg;
3308 unsigned long weight;
3312 this_cpu = smp_processor_id();
3313 prev_cpu = task_cpu(p);
3314 load = source_load(prev_cpu, idx);
3315 this_load = target_load(this_cpu, idx);
3318 * If sync wakeup then subtract the (maximum possible)
3319 * effect of the currently running task from the load
3320 * of the current CPU:
3323 tg = task_group(current);
3324 weight = current->se.load.weight;
3326 this_load += effective_load(tg, this_cpu, -weight, -weight);
3327 load += effective_load(tg, prev_cpu, 0, -weight);
3331 weight = p->se.load.weight;
3334 * In low-load situations, where prev_cpu is idle and this_cpu is idle
3335 * due to the sync cause above having dropped this_load to 0, we'll
3336 * always have an imbalance, but there's really nothing you can do
3337 * about that, so that's good too.
3339 * Otherwise check if either cpus are near enough in load to allow this
3340 * task to be woken on this_cpu.
3342 if (this_load > 0) {
3343 s64 this_eff_load, prev_eff_load;
3345 this_eff_load = 100;
3346 this_eff_load *= power_of(prev_cpu);
3347 this_eff_load *= this_load +
3348 effective_load(tg, this_cpu, weight, weight);
3350 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
3351 prev_eff_load *= power_of(this_cpu);
3352 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
3354 balanced = this_eff_load <= prev_eff_load;
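/*
 * Worked example (added sketch, assuming the common imbalance_pct of
 * 125): with equal cpu_power on both CPUs, prev_eff_load is scaled by
 * 112 against 100 for this_eff_load, so the wakeup still counts as
 * balanced while this_cpu's post-wakeup load is up to ~12% above
 * prev_cpu's.
 */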
3359 * If the currently running task will sleep within
3360 * a reasonable amount of time then attract this newly
3361 * woken task:
3363 if (sync && balanced)
3366 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
3367 tl_per_task = cpu_avg_load_per_task(this_cpu);
3370 (this_load <= load &&
3371 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
3373 * This domain has SD_WAKE_AFFINE and
3374 * p is cache cold in this domain, and
3375 * there is no bad imbalance.
3377 schedstat_inc(sd, ttwu_move_affine);
3378 schedstat_inc(p, se.statistics.nr_wakeups_affine);
3386 * find_idlest_group finds and returns the least busy CPU group within the
3387 * domain.
3389 static struct sched_group *
3390 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
3391 int this_cpu, int load_idx)
3393 struct sched_group *idlest = NULL, *group = sd->groups;
3394 unsigned long min_load = ULONG_MAX, this_load = 0;
3395 int imbalance = 100 + (sd->imbalance_pct-100)/2;
3398 unsigned long load, avg_load;
3402 /* Skip over this group if it has no CPUs allowed */
3403 if (!cpumask_intersects(sched_group_cpus(group),
3404 tsk_cpus_allowed(p)))
3407 local_group = cpumask_test_cpu(this_cpu,
3408 sched_group_cpus(group));
3410 /* Tally up the load of all CPUs in the group */
3413 for_each_cpu(i, sched_group_cpus(group)) {
3414 /* Bias balancing toward cpus of our domain */
3416 load = source_load(i, load_idx);
3418 load = target_load(i, load_idx);
3423 /* Adjust by relative CPU power of the group */
3424 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
3427 this_load = avg_load;
3428 } else if (avg_load < min_load) {
3429 min_load = avg_load;
3432 } while (group = group->next, group != sd->groups);
3434 if (!idlest || 100*this_load < imbalance*min_load)
3440 * find_idlest_cpu - find the idlest cpu among the cpus in group.
3443 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
3445 unsigned long load, min_load = ULONG_MAX;
3449 /* Traverse only the allowed CPUs */
3450 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
3451 load = weighted_cpuload(i);
3453 if (load < min_load || (load == min_load && i == this_cpu)) {
3463 * Try and locate an idle CPU in the sched_domain.
3465 static int select_idle_sibling(struct task_struct *p, int target)
3467 struct sched_domain *sd;
3468 struct sched_group *sg;
3469 int i = task_cpu(p);
3471 if (idle_cpu(target))
3475 * If the previous cpu is cache affine and idle, don't be stupid.
3477 if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
3481 * Otherwise, iterate the domains and find an eligible idle cpu.
3483 sd = rcu_dereference(per_cpu(sd_llc, target));
3484 for_each_lower_domain(sd) {
3487 if (!cpumask_intersects(sched_group_cpus(sg),
3488 tsk_cpus_allowed(p)))
3491 for_each_cpu(i, sched_group_cpus(sg)) {
3492 if (i == target || !idle_cpu(i))
3496 target = cpumask_first_and(sched_group_cpus(sg),
3497 tsk_cpus_allowed(p));
3501 } while (sg != sd->groups);
3507 #ifdef CONFIG_SCHED_HMP
3509 * Heterogenous multiprocessor (HMP) optimizations
3511 * The cpu types are distinguished using a list of hmp_domains
3512 * which each represent one cpu type using a cpumask.
3513 * The list is assumed ordered by compute capacity with the
3514 * fastest domain first.
3516 DEFINE_PER_CPU(struct hmp_domain *, hmp_cpu_domain);
3517 static const int hmp_max_tasks = 5;
3519 extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list);
3521 /* Setup hmp_domains */
3522 static int __init hmp_cpu_mask_setup(void)
3525 struct hmp_domain *domain;
3526 struct list_head *pos;
3529 pr_debug("Initializing HMP scheduler:\n");
3531 /* Initialize hmp_domains using platform code */
3532 arch_get_hmp_domains(&hmp_domains);
3533 if (list_empty(&hmp_domains)) {
3534 pr_debug("HMP domain list is empty!\n");
3538 /* Print hmp_domains */
3540 list_for_each(pos, &hmp_domains) {
3541 domain = list_entry(pos, struct hmp_domain, hmp_domains);
3542 cpulist_scnprintf(buf, 64, &domain->possible_cpus);
3543 pr_debug(" HMP domain %d: %s\n", dc, buf);
3545 for_each_cpu_mask(cpu, domain->possible_cpus) {
3546 per_cpu(hmp_cpu_domain, cpu) = domain;
3554 static struct hmp_domain *hmp_get_hmp_domain_for_cpu(int cpu)
3556 struct hmp_domain *domain;
3557 struct list_head *pos;
3559 list_for_each(pos, &hmp_domains) {
3560 domain = list_entry(pos, struct hmp_domain, hmp_domains);
3561 if (cpumask_test_cpu(cpu, &domain->possible_cpus))
3567 static void hmp_online_cpu(int cpu)
3569 struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu);
3572 cpumask_set_cpu(cpu, &domain->cpus);
3575 static void hmp_offline_cpu(int cpu)
3577 struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu);
3580 cpumask_clear_cpu(cpu, &domain->cpus);
3583 * Needed to determine heaviest tasks etc.
3585 static inline unsigned int hmp_cpu_is_fastest(int cpu);
3586 static inline unsigned int hmp_cpu_is_slowest(int cpu);
3587 static inline struct hmp_domain *hmp_slower_domain(int cpu);
3588 static inline struct hmp_domain *hmp_faster_domain(int cpu);
3590 /* must hold the runqueue lock for the queue se is currently on */
3591 static struct sched_entity *hmp_get_heaviest_task(
3592 struct sched_entity *se, int migrate_up)
3594 int num_tasks = hmp_max_tasks;
3595 struct sched_entity *max_se = se;
3596 unsigned long int max_ratio = se->avg.load_avg_ratio;
3597 const struct cpumask *hmp_target_mask = NULL;
3600 struct hmp_domain *hmp;
3601 if (hmp_cpu_is_fastest(cpu_of(se->cfs_rq->rq)))
3604 hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq));
3605 hmp_target_mask = &hmp->cpus;
3607 /* The currently running task is not on the runqueue */
3608 se = __pick_first_entity(cfs_rq_of(se));
3610 while (num_tasks && se) {
3611 if (entity_is_task(se) &&
3612 (se->avg.load_avg_ratio > max_ratio &&
3614 cpumask_intersects(hmp_target_mask,
3615 tsk_cpus_allowed(task_of(se))))) {
3617 max_ratio = se->avg.load_avg_ratio;
3619 se = __pick_next_entity(se);
3625 static struct sched_entity *hmp_get_lightest_task(
3626 struct sched_entity *se, int migrate_down)
3628 int num_tasks = hmp_max_tasks;
3629 struct sched_entity *min_se = se;
3630 unsigned long int min_ratio = se->avg.load_avg_ratio;
3631 const struct cpumask *hmp_target_mask = NULL;
3634 struct hmp_domain *hmp;
3635 if (hmp_cpu_is_slowest(cpu_of(se->cfs_rq->rq)))
3637 hmp = hmp_slower_domain(cpu_of(se->cfs_rq->rq));
3638 hmp_target_mask = &hmp->cpus;
3640 /* The currently running task is not on the runqueue */
3641 se = __pick_first_entity(cfs_rq_of(se));
3643 while (num_tasks && se) {
3644 if (entity_is_task(se) &&
3645 (se->avg.load_avg_ratio < min_ratio &&
3647 cpumask_intersects(hmp_target_mask,
3648 tsk_cpus_allowed(task_of(se))))) {
3650 min_ratio = se->avg.load_avg_ratio;
3652 se = __pick_next_entity(se);
3659 * Migration thresholds should be in the range [0..1023]
3660 * hmp_up_threshold: min. load required for migrating tasks to a faster cpu
3661 * hmp_down_threshold: max. load allowed for tasks migrating to a slower cpu
3663 * hmp_up_prio: only up-migrate tasks with high priority (<hmp_up_prio)
3664 * hmp_next_up_threshold: Delay before next up migration (1024 ~= 1 ms)
3665 * hmp_next_down_threshold: Delay before next down migration (1024 ~= 1 ms)
3667 * Small Task Packing:
3668 * We can choose to fill the littlest CPUs in an HMP system rather than
3669 * the typical spreading mechanic. This behavior is controllable using
3671 * hmp_packing_enabled: runtime control over pack/spread
3672 * hmp_full_threshold: Consider a CPU with this much unweighted load full
3674 unsigned int hmp_up_threshold = 700;
3675 unsigned int hmp_down_threshold = 512;
3676 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
3677 unsigned int hmp_up_prio = NICE_TO_PRIO(CONFIG_SCHED_HMP_PRIO_FILTER_VAL);
3679 unsigned int hmp_next_up_threshold = 4096;
3680 unsigned int hmp_next_down_threshold = 4096;
3682 #ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
3683 unsigned int hmp_packing_enabled = 1;
3684 #ifndef CONFIG_ARCH_VEXPRESS_TC2
3685 unsigned int hmp_full_threshold = (NICE_0_LOAD * 9) / 8;
3687 /* TC2 has a sharp consumption curve @ around 800MHz, so
3688  * we aim to spread the load around that frequency. */
3689 unsigned int hmp_full_threshold = 650; /* 80% of the 800MHz freq * NICE_0_LOAD */
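/*
 * Worked example (added sketch): with the defaults above, a task whose
 * load_avg_ratio climbs past 700 on a little CPU becomes a candidate
 * for up migration; once on a big CPU it is only considered for down
 * migration after its ratio drops below 512, and either move must also
 * wait out the hmp_next_{up,down}_threshold delay (4096 ~= 4ms).
 */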
3693 static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se);
3694 static unsigned int hmp_down_migration(int cpu, struct sched_entity *se);
3695 static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd,
3696 int *min_cpu, struct cpumask *affinity);
3698 static inline struct hmp_domain *hmp_smallest_domain(void)
3700 return list_entry(hmp_domains.prev, struct hmp_domain, hmp_domains);
3703 /* Check if cpu is in fastest hmp_domain */
3704 static inline unsigned int hmp_cpu_is_fastest(int cpu)
3706 struct list_head *pos;
3708 pos = &hmp_cpu_domain(cpu)->hmp_domains;
3709 return pos == hmp_domains.next;
3712 /* Check if cpu is in slowest hmp_domain */
3713 static inline unsigned int hmp_cpu_is_slowest(int cpu)
3715 struct list_head *pos;
3717 pos = &hmp_cpu_domain(cpu)->hmp_domains;
3718 return list_is_last(pos, &hmp_domains);
3721 /* Next (slower) hmp_domain relative to cpu */
3722 static inline struct hmp_domain *hmp_slower_domain(int cpu)
3724 struct list_head *pos;
3726 pos = &hmp_cpu_domain(cpu)->hmp_domains;
3727 return list_entry(pos->next, struct hmp_domain, hmp_domains);
3730 /* Previous (faster) hmp_domain relative to cpu */
3731 static inline struct hmp_domain *hmp_faster_domain(int cpu)
3733 struct list_head *pos;
3735 pos = &hmp_cpu_domain(cpu)->hmp_domains;
3736 return list_entry(pos->prev, struct hmp_domain, hmp_domains);
3740 * Selects a cpu in previous (faster) hmp_domain
3742 static inline unsigned int hmp_select_faster_cpu(struct task_struct *tsk,
3745 int lowest_cpu = NR_CPUS;
3746 __always_unused int lowest_ratio;
3747 struct hmp_domain *hmp;
3749 if (hmp_cpu_is_fastest(cpu))
3750 hmp = hmp_cpu_domain(cpu);
3752 hmp = hmp_faster_domain(cpu);
3754 lowest_ratio = hmp_domain_min_load(hmp, &lowest_cpu,
3755 tsk_cpus_allowed(tsk));
3761 * Selects a cpu in next (slower) hmp_domain
3762 * Note that cpumask_any_and() returns the first cpu in the cpumask
3764 static inline unsigned int hmp_select_slower_cpu(struct task_struct *tsk,
3767 int lowest_cpu = NR_CPUS;
3768 struct hmp_domain *hmp;
3769 __always_unused int lowest_ratio;
3771 if (hmp_cpu_is_slowest(cpu))
3772 hmp = hmp_cpu_domain(cpu);
3774 hmp = hmp_slower_domain(cpu);
3776 lowest_ratio = hmp_domain_min_load(hmp, &lowest_cpu,
3777 tsk_cpus_allowed(tsk));
3781 #ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
3783 * Select the 'best' candidate little CPU to wake up on.
3784 * Implements a packing strategy which examines CPUs in
3785 * logical CPU order, and selects the first which will
3786 * have at least 10% capacity available, according to
3787 * both tracked load of the runqueue and the task.
3789 static inline unsigned int hmp_best_little_cpu(struct task_struct *tsk,
3792 unsigned long estimated_load;
3793 struct hmp_domain *hmp;
3794 struct sched_avg *avg;
3795 struct cpumask allowed_hmp_cpus;
3797 if (!hmp_packing_enabled ||
3798 tsk->se.avg.load_avg_ratio > ((NICE_0_LOAD * 90)/100))
3799 return hmp_select_slower_cpu(tsk, cpu);
3801 if (hmp_cpu_is_slowest(cpu))
3802 hmp = hmp_cpu_domain(cpu);
3804 hmp = hmp_slower_domain(cpu);
3806 /* respect affinity */
3807 cpumask_and(&allowed_hmp_cpus, &hmp->cpus,
3808 tsk_cpus_allowed(tsk));
3810 for_each_cpu_mask(tmp_cpu, allowed_hmp_cpus) {
3811 avg = &cpu_rq(tmp_cpu)->avg;
3812 /* estimate new rq load if we add this task */
3813 estimated_load = avg->load_avg_ratio +
3814 tsk->se.avg.load_avg_ratio;
3815 if (estimated_load <= hmp_full_threshold) {
3820 /* if no match was found, the task uses the initial value */
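/*
 * Worked example (added sketch, using the TC2 threshold of 650): a
 * little rq with a tracked load_avg_ratio of 400 accepts a waking task
 * with a ratio of 200, since the estimated load of 600 stays within
 * hmp_full_threshold; a task with a ratio of 300 would overshoot it
 * and the search moves on to the next allowed little CPU.
 */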
3824 static inline void hmp_next_up_delay(struct sched_entity *se, int cpu)
3826 /* hack - always use clock from first online CPU */
3827 u64 now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task;
3828 se->avg.hmp_last_up_migration = now;
3829 se->avg.hmp_last_down_migration = 0;
3830 cpu_rq(cpu)->avg.hmp_last_up_migration = now;
3831 cpu_rq(cpu)->avg.hmp_last_down_migration = 0;
3834 static inline void hmp_next_down_delay(struct sched_entity *se, int cpu)
3836 /* hack - always use clock from first online CPU */
3837 u64 now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task;
3838 se->avg.hmp_last_down_migration = now;
3839 se->avg.hmp_last_up_migration = 0;
3840 cpu_rq(cpu)->avg.hmp_last_down_migration = now;
3841 cpu_rq(cpu)->avg.hmp_last_up_migration = 0;
3845 * Heterogenous multiprocessor (HMP) optimizations
3847 * These functions allow changing the growth rate of the load_avg_ratio;
3848 * by default it goes from 0 to 0.5 in LOAD_AVG_PERIOD = 32ms.
3849 * This can now be changed with /sys/kernel/hmp/load_avg_period_ms.
3851 * These functions also allow changing the up and down thresholds of HMP
3852 * using /sys/kernel/hmp/{up,down}_threshold.
3853 * Both must be between 0 and 1023. The threshold that is compared
3854 * to the load_avg_ratio is up_threshold/1024 and down_threshold/1024.
3856 * For instance, if load_avg_period = 64 and up_threshold = 512, an idle
3857 * task with a load of 0 will reach the threshold after 64ms of busy looping.
3859 * Changing load_avg_period_ms has the same effect as changing the
3860 * default scaling factor Y=1002/1024 in the load_avg_ratio computation to
3861 * (1002/1024.0)^(LOAD_AVG_PERIOD/load_avg_period_ms), but the latter
3862 * could trigger overflows.
3863 * For instance, with Y = 1023/1024 in __update_task_entity_contrib()
3864 * "contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);"
3865 * could overflow for a weight > 2^12 even if the load_avg_contrib
3866 * should still be a 32-bit result. This would not happen when multiplying
3867 * the delta time by 1/22 and setting load_avg_period_ms = 706.
3871 * By scaling the delta time it ends up increasing or decreasing the
3872 * growth rate of the per-entity load_avg_ratio.
3873 * The scale factor hmp_data.multiplier is a fixed point
3874 * number: (32-HMP_VARIABLE_SCALE_SHIFT).HMP_VARIABLE_SCALE_SHIFT
3876 static inline u64 hmp_variable_scale_convert(u64 delta)
3878 #ifdef CONFIG_HMP_VARIABLE_SCALE
3879 u64 high = delta >> 32ULL;
3880 u64 low = delta & 0xffffffffULL;
3881 low *= hmp_data.multiplier;
3882 high *= hmp_data.multiplier;
3883 return (low >> HMP_VARIABLE_SCALE_SHIFT)
3884 + (high << (32ULL - HMP_VARIABLE_SCALE_SHIFT));
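/*
 * Worked example (added sketch, assuming HMP_VARIABLE_SCALE_SHIFT is
 * 16): writing load_avg_period_ms = 64 gives multiplier =
 * (32 << 16) / 64, i.e. 0.5 in fixed point, so every delta is halved
 * and the load_avg_ratio ramp from 0 to 0.5 stretches from 32ms to
 * 64ms.
 */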
3890 static ssize_t hmp_show(struct kobject *kobj,
3891 struct attribute *attr, char *buf)
3893 struct hmp_global_attr *hmp_attr =
3894 container_of(attr, struct hmp_global_attr, attr);
3897 if (hmp_attr->to_sysfs_text != NULL)
3898 return hmp_attr->to_sysfs_text(buf, PAGE_SIZE);
3900 temp = *(hmp_attr->value);
3901 if (hmp_attr->to_sysfs != NULL)
3902 temp = hmp_attr->to_sysfs(temp);
3904 return (ssize_t)sprintf(buf, "%d\n", temp);
3907 static ssize_t hmp_store(struct kobject *a, struct attribute *attr,
3908 const char *buf, size_t count)
3911 ssize_t ret = count;
3912 struct hmp_global_attr *hmp_attr =
3913 container_of(attr, struct hmp_global_attr, attr);
3914 char *str = vmalloc(count + 1);
3917 memcpy(str, buf, count);
3919 if (sscanf(str, "%d", &temp) < 1)
3922 if (hmp_attr->from_sysfs != NULL)
3923 temp = hmp_attr->from_sysfs(temp);
3927 *(hmp_attr->value) = temp;
3933 static ssize_t hmp_print_domains(char *outbuf, int outbufsize)
3936 const char nospace[] = "%s", space[] = " %s";
3937 const char *fmt = nospace;
3938 struct hmp_domain *domain;
3939 struct list_head *pos;
3941 list_for_each(pos, &hmp_domains) {
3942 domain = list_entry(pos, struct hmp_domain, hmp_domains);
3943 if (cpumask_scnprintf(buf, 64, &domain->possible_cpus)) {
3944 outpos += sprintf(outbuf+outpos, fmt, buf);
3948 strcat(outbuf, "\n");
3952 #ifdef CONFIG_HMP_VARIABLE_SCALE
3953 static int hmp_period_tofrom_sysfs(int value)
3955 return (LOAD_AVG_PERIOD << HMP_VARIABLE_SCALE_SHIFT) / value;
3958 /* max value for threshold is 1024 */
3959 static int hmp_theshold_from_sysfs(int value)
3965 #if defined(CONFIG_SCHED_HMP_LITTLE_PACKING) || \
3966 defined(CONFIG_HMP_FREQUENCY_INVARIANT_SCALE)
3967 /* toggle control is only 0,1 off/on */
3968 static int hmp_toggle_from_sysfs(int value)
3970 if (value < 0 || value > 1)
3975 #ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
3976 /* packing value must be non-negative */
3977 static int hmp_packing_from_sysfs(int value)
3984 static void hmp_attr_add(
3987 int (*to_sysfs)(int),
3988 int (*from_sysfs)(int),
3989 ssize_t (*to_sysfs_text)(char *, int),
3993 while (hmp_data.attributes[i] != NULL) {
3995 if (i >= HMP_DATA_SYSFS_MAX)
3999 hmp_data.attr[i].attr.mode = mode;
4001 hmp_data.attr[i].attr.mode = 0644;
4002 hmp_data.attr[i].show = hmp_show;
4003 hmp_data.attr[i].store = hmp_store;
4004 hmp_data.attr[i].attr.name = name;
4005 hmp_data.attr[i].value = value;
4006 hmp_data.attr[i].to_sysfs = to_sysfs;
4007 hmp_data.attr[i].from_sysfs = from_sysfs;
4008 hmp_data.attr[i].to_sysfs_text = to_sysfs_text;
4009 hmp_data.attributes[i] = &hmp_data.attr[i].attr;
4010 hmp_data.attributes[i + 1] = NULL;
4013 static int hmp_attr_init(void)
4016 memset(&hmp_data, 0, sizeof(hmp_data));
4017 hmp_attr_add("hmp_domains",
4023 hmp_attr_add("up_threshold",
4026 hmp_theshold_from_sysfs,
4029 hmp_attr_add("down_threshold",
4030 &hmp_down_threshold,
4032 hmp_theshold_from_sysfs,
4035 #ifdef CONFIG_HMP_VARIABLE_SCALE
4036 /* by default load_avg_period_ms == LOAD_AVG_PERIOD,
4037  * i.e. multiplier == 1.0 in fixed point (no scaling)
4038  */
4039 hmp_data.multiplier = hmp_period_tofrom_sysfs(LOAD_AVG_PERIOD);
4040 hmp_attr_add("load_avg_period_ms",
4041 &hmp_data.multiplier,
4042 hmp_period_tofrom_sysfs,
4043 hmp_period_tofrom_sysfs,
4047 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
4048 /* default frequency-invariant scaling ON */
4049 hmp_data.freqinvar_load_scale_enabled = 1;
4050 hmp_attr_add("frequency_invariant_load_scale",
4051 &hmp_data.freqinvar_load_scale_enabled,
4053 hmp_toggle_from_sysfs,
4057 #ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
4058 hmp_attr_add("packing_enable",
4059 &hmp_packing_enabled,
4061 hmp_toggle_from_sysfs,
4064 hmp_attr_add("packing_limit",
4065 &hmp_full_threshold,
4067 hmp_packing_from_sysfs,
4071 hmp_data.attr_group.name = "hmp";
4072 hmp_data.attr_group.attrs = hmp_data.attributes;
4073 ret = sysfs_create_group(kernel_kobj,
4074 &hmp_data.attr_group);
4077 late_initcall(hmp_attr_init);
4079 * return the load of the lowest-loaded CPU in a given HMP domain
4080 * min_cpu optionally points to an int to receive the CPU.
4081 * affinity optionally points to a cpumask containing the
4082 * CPUs to be considered. note:
4083 * + min_cpu = NR_CPUS only if no CPUs are in the set of
4084 * affinity && hmp_domain cpus
4085 * + min_cpu will always otherwise equal one of the CPUs in
4086 * that set
4087 * + when more than one CPU has the same load, the one which
4088 * is least-recently-disturbed by an HMP migration will be
4089 * selected
4090 * + if all CPUs are equally loaded or idle and the times are
4091 * all the same, the first in the set will be used
4092 * + if affinity is not set, cpu_online_mask is used
4094 static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd,
4095 int *min_cpu, struct cpumask *affinity)
4098 int min_cpu_runnable_temp = NR_CPUS;
4099 u64 min_target_last_migration = ULLONG_MAX;
4100 u64 curr_last_migration;
4101 unsigned long min_runnable_load = INT_MAX;
4102 unsigned long contrib;
4103 struct sched_avg *avg;
4104 struct cpumask temp_cpumask;
4106 * only look at CPUs allowed if specified,
4107 * otherwise look at all online CPUs in the
4108 * hmp_domain
4110 cpumask_and(&temp_cpumask, &hmpd->cpus, affinity ? affinity : cpu_online_mask);
4112 for_each_cpu_mask(cpu, temp_cpumask) {
4113 avg = &cpu_rq(cpu)->avg;
4114 /* used for both up and down migration */
4115 curr_last_migration = avg->hmp_last_up_migration ?
4116 avg->hmp_last_up_migration : avg->hmp_last_down_migration;
4118 contrib = avg->load_avg_ratio;
4120 * Consider a runqueue completely busy if there is any load
4121 * on it. Definitely not the best for overall fairness, but
4122 * does well in typical Android use cases.
4127 if ((contrib < min_runnable_load) ||
4128 (contrib == min_runnable_load &&
4129 curr_last_migration < min_target_last_migration)) {
4131 * if the load is the same, target the CPU with
4132 * the longest time since a migration.
4133 * This is to spread migration load between
4134 * members of a domain more evenly when the
4135 * domain is fully loaded
4137 min_runnable_load = contrib;
4138 min_cpu_runnable_temp = cpu;
4139 min_target_last_migration = curr_last_migration;
4144 *min_cpu = min_cpu_runnable_temp;
4146 return min_runnable_load;
4150 * Calculate the task starvation
4151 * This is the ratio of actually running time vs. runnable time.
4152 * If the two are equal the task is getting the cpu time it needs or
4153 * it is alone on the cpu and the cpu is fully utilized.
4155 static inline unsigned int hmp_task_starvation(struct sched_entity *se)
4159 starvation = se->avg.usage_avg_sum * scale_load_down(NICE_0_LOAD);
4160 starvation /= (se->avg.runnable_avg_sum + 1);
4162 return scale_load(starvation);
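/*
 * Worked example (added sketch): a task that ran for 750 of its last
 * 1000 units of runnable time scores (750 * 1024) / 1001 ~= 767, just
 * under the 768 cut-off in hmp_offload_down() below; waiting for 25%
 * or more of its runnable time therefore marks a task as starving.
 */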
4165 static inline unsigned int hmp_offload_down(int cpu, struct sched_entity *se)
4168 int dest_cpu = NR_CPUS;
4170 if (hmp_cpu_is_slowest(cpu))
4173 /* Is there an idle CPU in the current domain? */
4174 min_usage = hmp_domain_min_load(hmp_cpu_domain(cpu), NULL, NULL);
4175 if (min_usage == 0) {
4176 trace_sched_hmp_offload_abort(cpu, min_usage, "load");
4180 /* Is the task alone on the cpu? */
4181 if (cpu_rq(cpu)->cfs.h_nr_running < 2) {
4182 trace_sched_hmp_offload_abort(cpu,
4183 cpu_rq(cpu)->cfs.h_nr_running, "nr_running");
4187 /* Is the task actually starving? */
4188 /* a task waiting for >= 25% of its runnable time is considered starving */
4189 if (hmp_task_starvation(se) > 768) {
4190 trace_sched_hmp_offload_abort(cpu, hmp_task_starvation(se),
4195 /* Does the slower domain have any idle CPUs? */
4196 min_usage = hmp_domain_min_load(hmp_slower_domain(cpu), &dest_cpu,
4197 tsk_cpus_allowed(task_of(se)));
4199 if (min_usage == 0) {
4200 trace_sched_hmp_offload_succeed(cpu, dest_cpu);
4203 trace_sched_hmp_offload_abort(cpu, min_usage, "slowdomain");
4206 #endif /* CONFIG_SCHED_HMP */
4209 * sched_balance_self: balance the current task (running on cpu) in domains
4210 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
4211 * SD_BALANCE_EXEC.
4213 * Balance, i.e. select the least loaded group.
4215 * Returns the target CPU number, or the same CPU if no balancing is needed.
4217 * preempt must be disabled.
4220 select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
4222 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
4223 int cpu = smp_processor_id();
4224 int prev_cpu = task_cpu(p);
4226 int want_affine = 0;
4227 int sync = wake_flags & WF_SYNC;
4229 if (p->nr_cpus_allowed == 1)
4232 #ifdef CONFIG_SCHED_HMP
4233 /* always put non-kernel forking tasks on a big domain */
4234 if (p->mm && (sd_flag & SD_BALANCE_FORK)) {
4235 new_cpu = hmp_select_faster_cpu(p, prev_cpu);
4236 if (new_cpu != NR_CPUS) {
4237 hmp_next_up_delay(&p->se, new_cpu);
4240 /* failed to perform HMP fork balance, use normal balance */
4245 if (sd_flag & SD_BALANCE_WAKE) {
4246 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
4252 for_each_domain(cpu, tmp) {
4253 if (!(tmp->flags & SD_LOAD_BALANCE))
4257 * If both cpu and prev_cpu are part of this domain,
4258 * cpu is a valid SD_WAKE_AFFINE target.
4260 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
4261 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
4266 if (tmp->flags & sd_flag)
4271 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
4274 new_cpu = select_idle_sibling(p, prev_cpu);
4279 int load_idx = sd->forkexec_idx;
4280 struct sched_group *group;
4283 if (!(sd->flags & sd_flag)) {
4288 if (sd_flag & SD_BALANCE_WAKE)
4289 load_idx = sd->wake_idx;
4291 group = find_idlest_group(sd, p, cpu, load_idx);
4297 new_cpu = find_idlest_cpu(group, p, cpu);
4298 if (new_cpu == -1 || new_cpu == cpu) {
4299 /* Now try balancing at a lower domain level of cpu */
4304 /* Now try balancing at a lower domain level of new_cpu */
4306 weight = sd->span_weight;
4308 for_each_domain(cpu, tmp) {
4309 if (weight <= tmp->span_weight)
4311 if (tmp->flags & sd_flag)
4314 /* while loop will break here if sd == NULL */
4319 #ifdef CONFIG_SCHED_HMP
4320 prev_cpu = task_cpu(p);
4322 if (hmp_up_migration(prev_cpu, &new_cpu, &p->se)) {
4323 hmp_next_up_delay(&p->se, new_cpu);
4324 trace_sched_hmp_migrate(p, new_cpu, HMP_MIGRATE_WAKEUP);
4327 if (hmp_down_migration(prev_cpu, &p->se)) {
4328 #ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
4329 new_cpu = hmp_best_little_cpu(p, prev_cpu);
4331 new_cpu = hmp_select_slower_cpu(p, prev_cpu);
4333 if (new_cpu != prev_cpu) {
4334 hmp_next_down_delay(&p->se, new_cpu);
4335 trace_sched_hmp_migrate(p, new_cpu, HMP_MIGRATE_WAKEUP);
4339 /* Make sure that the task stays in its previous hmp domain */
4340 if (!cpumask_test_cpu(new_cpu, &hmp_cpu_domain(prev_cpu)->cpus))
4348 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
4349 * removed when useful for applications beyond shares distribution (e.g.
4350 * load-balance).
4352 #ifdef CONFIG_FAIR_GROUP_SCHED
4354 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
4355 * cfs_rq_of(p) references at time of call are still valid and identify the
4356 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
4357 * other assumptions, including the state of rq->lock, should be made.
4360 migrate_task_rq_fair(struct task_struct *p, int next_cpu)
4362 struct sched_entity *se = &p->se;
4363 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4366 * Load tracking: accumulate removed load so that it can be processed
4367 * when we next update owning cfs_rq under rq->lock. Tasks contribute
4368 * to blocked load iff they have a positive decay-count. It can never
4369 * be negative here since on-rq tasks have decay-count == 0.
4371 if (se->avg.decay_count) {
4372 se->avg.decay_count = -__synchronize_entity_decay(se);
4373 atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
4377 #endif /* CONFIG_SMP */
4379 static unsigned long
4380 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
4382 unsigned long gran = sysctl_sched_wakeup_granularity;
4385 * Since it's curr running now, convert the gran from real-time
4386 * to virtual-time in its units.
4388 * By using 'se' instead of 'curr' we penalize light tasks, so
4389 * they get preempted easier. That is, if 'se' < 'curr' then
4390 * the resulting gran will be larger, therefore penalizing the
4391 * lighter, if otoh 'se' > 'curr' then the resulting gran will
4392 * be smaller, again penalizing the lighter task.
4394 * This is especially important for buddies when the leftmost
4395 * task is higher priority than the buddy.
4397 return calc_delta_fair(gran, se);
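/*
 * Worked example (added sketch): with the default 1ms wakeup
 * granularity and NICE_0_LOAD = 1024, calc_delta_fair() leaves gran at
 * 1ms for a nice-0 'se', stretches it to 2ms for a weight-512
 * (lighter) task, and shrinks it to 0.5ms for a weight-2048 (heavier)
 * one.
 */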
/*
 * Should 'se' preempt 'curr'? Returns -1 if 'se' is behind 'curr' in
 * virtual time, 0 if it is ahead but within the wakeup granularity,
 * and 1 if it is ahead by more than the granularity.
 */
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
{
	s64 gran, vdiff = curr->vruntime - se->vruntime;

	if (vdiff <= 0)
		return -1;

	gran = wakeup_gran(curr, se);
	if (vdiff > gran)
		return 1;

	return 0;
}
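/*
 * Example of the three-way result: with a 1ms (virtual) granularity,
 * vdiff = curr->vruntime - se->vruntime of -0.5ms returns -1 ('se' is
 * not even ahead), +0.5ms returns 0 (ahead, but within the
 * granularity), and +2ms returns 1 ('se' should preempt 'curr').
 */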
static void set_last_buddy(struct sched_entity *se)
{
	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
		return;

	for_each_sched_entity(se)
		cfs_rq_of(se)->last = se;
}

static void set_next_buddy(struct sched_entity *se)
{
	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
		return;

	for_each_sched_entity(se)
		cfs_rq_of(se)->next = se;
}

static void set_skip_buddy(struct sched_entity *se)
{
	for_each_sched_entity(se)
		cfs_rq_of(se)->skip = se;
}
/*
 * Preempt the current task with a newly woken task if needed:
 */
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
	struct task_struct *curr = rq->curr;
	struct sched_entity *se = &curr->se, *pse = &p->se;
	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
	int scale = cfs_rq->nr_running >= sched_nr_latency;
	int next_buddy_marked = 0;

	if (unlikely(se == pse))
		return;

	/*
	 * This is possible from callers such as move_task(), in which we
	 * unconditionally check_preempt_curr() after an enqueue (which may
	 * have led to a throttle). This both saves work and prevents false
	 * next-buddy nomination below.
	 */
	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
		return;

	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
		set_next_buddy(pse);
		next_buddy_marked = 1;
	}

	/*
	 * We can come here with TIF_NEED_RESCHED already set from new task
	 * wake up path.
	 *
	 * Note: this also catches the edge-case of curr being in a throttled
	 * group (e.g. via set_curr_task), since update_curr() (in the
	 * enqueue of curr) will have resulted in resched being set. This
	 * prevents us from potentially nominating it as a false LAST_BUDDY
	 * below.
	 */
	if (test_tsk_need_resched(curr))
		return;

	/* Idle tasks are by definition preempted by non-idle tasks. */
	if (unlikely(curr->policy == SCHED_IDLE) &&
	    likely(p->policy != SCHED_IDLE))
		goto preempt;

	/*
	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
	 * is driven by the tick):
	 */
	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
		return;

	find_matching_se(&se, &pse);
	update_curr(cfs_rq_of(se));
	if (wakeup_preempt_entity(se, pse) == 1) {
		/*
		 * Bias pick_next to pick the sched entity that is
		 * triggering this preemption.
		 */
		if (!next_buddy_marked)
			set_next_buddy(pse);
		goto preempt;
	}

	return;

preempt:
	resched_task(curr);
	/*
	 * Only set the backward buddy when the current task is still
	 * on the rq. This can happen when a wakeup gets interleaved
	 * with schedule on the ->pre_schedule() or idle_balance()
	 * point, either of which can drop the rq lock.
	 *
	 * Also, during early boot the idle thread is in the fair class;
	 * for obvious reasons it's a bad idea to schedule back to it.
	 */
	if (unlikely(!se->on_rq || curr == rq->idle))
		return;

	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
		set_last_buddy(se);
}
static struct task_struct *pick_next_task_fair(struct rq *rq)
{
	struct task_struct *p;
	struct cfs_rq *cfs_rq = &rq->cfs;
	struct sched_entity *se;

	if (!cfs_rq->nr_running)
		return NULL;

	do {
		se = pick_next_entity(cfs_rq);
		set_next_entity(cfs_rq, se);
		cfs_rq = group_cfs_rq(se);
	} while (cfs_rq);

	p = task_of(se);
	if (hrtick_enabled(rq))
		hrtick_start_fair(rq, p);

	return p;
}
/*
 * Account for a descheduled task:
 */
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
{
	struct sched_entity *se = &prev->se;
	struct cfs_rq *cfs_rq;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		put_prev_entity(cfs_rq, se);
	}
}
/*
 * sched_yield() is very simple.
 *
 * The magic of dealing with the ->skip buddy is in pick_next_entity.
 */
static void yield_task_fair(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
	struct sched_entity *se = &curr->se;

	/*
	 * Are we the only task in the tree?
	 */
	if (unlikely(rq->nr_running == 1))
		return;

	clear_buddies(cfs_rq, se);

	if (curr->policy != SCHED_BATCH) {
		update_rq_clock(rq);
		/*
		 * Update run-time statistics of the 'current'.
		 */
		update_curr(cfs_rq);
		/*
		 * Tell update_rq_clock() that we've just updated,
		 * so we don't do microscopic update in schedule()
		 * and double the fastpath cost.
		 */
		rq->skip_clock_update = 1;
	}

	set_skip_buddy(se);
}

static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
{
	struct sched_entity *se = &p->se;

	/* throttled hierarchies are not runnable */
	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
		return false;

	/* Tell the scheduler that we'd really like pse to run next. */
	set_next_buddy(se);

	yield_task_fair(rq);

	return true;
}
/**************************************************
 * Fair scheduling class load-balancing methods.
 *
 * BASICS
 *
 * The purpose of load-balancing is to achieve the same basic fairness the
 * per-cpu scheduler provides, namely provide a proportional amount of compute
 * time to each task. This is expressed in the following equation:
 *
 *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
 *
 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
 * W_i,0 is defined as:
 *
 *   W_i,0 = \Sum_j w_i,j                                             (2)
 *
 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
 * is derived from the nice value as per prio_to_weight[].
 *
 * The weight average is an exponential decay average of the instantaneous
 * weight:
 *
 *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
 *
 * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
 * can also include other factors [XXX].
 *
 * To achieve this balance we define a measure of imbalance which follows
 * directly from (1):
 *
 *   imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j }    (4)
 *
 * We then move tasks around to minimize the imbalance. In the continuous
 * function space it is obvious this converges, in the discrete case we get
 * a few fun cases generally called infeasible weight scenarios.
 *
 * [XXX expand on:
 *     - infeasible weights;
 *     - local vs global optima in the discrete case. ]
 *
 *
 * SCHED DOMAINS
 *
 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
 * for all i,j solution, we create a tree of cpus that follows the hardware
 * topology where each level pairs two lower groups (or better). This results
 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
 * tree to only the first of the previous level and we decrease the frequency
 * of load-balance at each level inversely proportional to the number of cpus
 * in the groups.
 *
 * This yields:
 *
 *     log_2 n     1     n
 *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
 *     i = 0      2^i   2^i
 *                                 `- size of each group
 *                |     |          `- number of cpus doing load-balance
 *                |     `- freq
 *                `- sum over all levels
 *
 * Coupled with a limit on how many tasks we can migrate every balance pass,
 * this makes (5) the runtime complexity of the balancer.
 *
 * An important property here is that each CPU is still (indirectly) connected
 * to every other cpu in at most O(log n) steps:
 *
 * The adjacency matrix of the resulting graph is given by:
 *
 *             log_2 n
 *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
 *             k = 0
 *
 * And you'll find that:
 *
 *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
 *
 * Showing there's indeed a path between every cpu in at most O(log n) steps.
 * The task movement gives a factor of O(m), giving a convergence complexity
 * of:
 *
 *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
 *
 *
 * WORK CONSERVING
 *
 * In order to avoid CPUs going idle while there's still work to do, new idle
 * balancing is more aggressive and has the newly idle cpu iterate up the
 * domain tree itself instead of relying on other CPUs to bring it work.
 *
 * This adds some complexity to both (5) and (8) but it reduces the total idle
 * time.
 *
 * [XXX more?]
 *
 *
 * CGROUPS
 *
 * Cgroups make a horror show out of (2), instead of a simple sum we get:
 *
 *                                s_k,i
 *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
 *                                 S_k
 *
 * Where
 *
 *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
 *
 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
 *
 * The big problem is S_k, it's a global sum needed to compute a local (W_i)
 * property.
 *
 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
 *      rewrite all of this once again.]
 */
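/*
 * Illustrative numbers for (9) and (10): take a single cgroup k with
 * weight w_k = 1024 and two cpus. If cpu 0 runs two of its nice-0
 * tasks and cpu 1 runs one, then s_k,0 = 2048, s_k,1 = 1024 and
 * S_k = 3072, so (9) gives W_0,0 = 1024 * 2048/3072 ~= 683 and
 * W_1,0 ~= 341: the group's weight is split across cpus in proportion
 * to where its runnable tasks sit, which is exactly why the global
 * sum S_k is needed to compute the per-cpu weight.
 */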
static unsigned long __read_mostly max_load_balance_interval = HZ/10;

#define LBF_ALL_PINNED	0x01
#define LBF_NEED_BREAK	0x02
#define LBF_SOME_PINNED	0x04

struct lb_env {
	struct sched_domain	*sd;

	struct rq		*src_rq;
	int			src_cpu;

	int			dst_cpu;
	struct rq		*dst_rq;

	struct cpumask		*dst_grpmask;
	int			new_dst_cpu;
	enum cpu_idle_type	idle;
	long			imbalance;
	/* The set of CPUs under consideration for load-balancing */
	struct cpumask		*cpus;

	unsigned int		flags;

	unsigned int		loop;
	unsigned int		loop_break;
	unsigned int		loop_max;
};
/*
 * move_task - move a task from one runqueue to another runqueue.
 * Both runqueues must be locked.
 */
static void move_task(struct task_struct *p, struct lb_env *env)
{
	deactivate_task(env->src_rq, p, 0);
	set_task_cpu(p, env->dst_cpu);
	activate_task(env->dst_rq, p, 0);
	check_preempt_curr(env->dst_rq, p, 0);
}
/*
 * Is this task likely cache-hot:
 */
static int
task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
{
	s64 delta;

	if (p->sched_class != &fair_sched_class)
		return 0;

	if (unlikely(p->policy == SCHED_IDLE))
		return 0;

	/*
	 * Buddy candidates are cache hot:
	 */
	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
			(&p->se == cfs_rq_of(&p->se)->next ||
			 &p->se == cfs_rq_of(&p->se)->last))
		return 1;

	if (sysctl_sched_migration_cost == -1)
		return 1;
	if (sysctl_sched_migration_cost == 0)
		return 0;

	delta = now - p->se.exec_start;

	return delta < (s64)sysctl_sched_migration_cost;
}
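/*
 * Numeric example: with the default sysctl_sched_migration_cost of
 * 0.5ms (500000ns), a fair task that last started executing 0.2ms ago
 * is considered cache hot, while one that ran 0.7ms ago is not.
 * Writing -1 to the sysctl marks every task hot, 0 marks none.
 */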
/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 */
static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
	int tsk_cache_hot = 0;
	/*
	 * We do not migrate tasks that are:
	 * 1) throttled_lb_pair, or
	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
	 * 3) running (obviously), or
	 * 4) are cache-hot on their current CPU.
	 */
	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
		return 0;

	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
		int cpu;

		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);

		/*
		 * Remember if this task can be migrated to any other cpu in
		 * our sched_group. We may want to revisit it if we couldn't
		 * meet load balance goals by pulling other tasks on src_cpu.
		 *
		 * Also avoid computing new_dst_cpu if we have already computed
		 * one in the current iteration.
		 */
		if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
			return 0;

		/* Prevent re-selecting dst_cpu via env's cpus */
		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
			if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
				env->flags |= LBF_SOME_PINNED;
				env->new_dst_cpu = cpu;
				break;
			}
		}

		return 0;
	}

	/* Record that we found at least one task that could run on dst_cpu */
	env->flags &= ~LBF_ALL_PINNED;

	if (task_running(env->src_rq, p)) {
		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
		return 0;
	}

	/*
	 * Aggressive migration if:
	 * 1) task is cache cold, or
	 * 2) too many balance attempts have failed.
	 */
	tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
	if (!tsk_cache_hot ||
		env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
		if (tsk_cache_hot) {
			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
			schedstat_inc(p, se.statistics.nr_forced_migrations);
		}
#endif
		return 1;
	}

	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
	return 0;
}
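/*
 * Example of the LBF_* bookkeeping above: if every task scanned on
 * src_cpu is pinned away from dst_cpu, LBF_ALL_PINNED survives and the
 * caller gives up on this runqueue; if some task could at least run on
 * another cpu of dst_cpu's group, LBF_SOME_PINNED plus new_dst_cpu let
 * load_balance() retry the pull with that alternate destination.
 */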
/*
 * move_one_task tries to move exactly one task from busiest to this_rq, as
 * part of active balancing operations within "domain".
 * Returns 1 if successful and 0 otherwise.
 *
 * Called with both runqueues locked.
 */
static int move_one_task(struct lb_env *env)
{
	struct task_struct *p, *n;

	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
		if (!can_migrate_task(p, env))
			continue;

		move_task(p, env);
		/*
		 * Right now, this is only the second place move_task()
		 * is called, so we can safely collect move_task()
		 * stats here rather than inside move_task().
		 */
		schedstat_inc(env->sd, lb_gained[env->idle]);
		return 1;
	}
	return 0;
}
static unsigned long task_h_load(struct task_struct *p);

static const unsigned int sched_nr_migrate_break = 32;

/*
 * move_tasks tries to move up to imbalance weighted load from busiest to
 * this_rq, as part of a balancing operation within domain "sd".
 * Returns 1 if successful and 0 otherwise.
 *
 * Called with both runqueues locked.
 */
static int move_tasks(struct lb_env *env)
{
	struct list_head *tasks = &env->src_rq->cfs_tasks;
	struct task_struct *p;
	unsigned long load;
	int pulled = 0;

	if (env->imbalance <= 0)
		return 0;

	while (!list_empty(tasks)) {
		p = list_first_entry(tasks, struct task_struct, se.group_node);

		env->loop++;
		/* We've more or less seen every task there is, call it quits */
		if (env->loop > env->loop_max)
			break;

		/* take a breather every nr_migrate tasks */
		if (env->loop > env->loop_break) {
			env->loop_break += sched_nr_migrate_break;
			env->flags |= LBF_NEED_BREAK;
			break;
		}

		if (!can_migrate_task(p, env))
			goto next;

		load = task_h_load(p);

		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
			goto next;

		if ((load / 2) > env->imbalance)
			goto next;

		move_task(p, env);
		pulled++;
		env->imbalance -= load;

#ifdef CONFIG_PREEMPT
		/*
		 * NEWIDLE balancing is a source of latency, so preemptible
		 * kernels will stop after the first task is pulled to minimize
		 * the critical section.
		 */
		if (env->idle == CPU_NEWLY_IDLE)
			break;
#endif

		/*
		 * We only want to steal up to the prescribed amount of
		 * weighted load.
		 */
		if (env->imbalance <= 0)
			break;

		continue;
next:
		list_move_tail(&p->se.group_node, tasks);
	}

	/*
	 * Right now, this is one of only two places move_task() is called,
	 * so we can safely collect move_task() stats here rather than
	 * inside move_task().
	 */
	schedstat_add(env->sd, lb_gained[env->idle], pulled);

	return pulled;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * update tg->load_weight by folding this cpu's load_avg
 */
static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
{
	struct sched_entity *se = tg->se[cpu];
	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];

	/* throttled entities do not contribute to load */
	if (throttled_hierarchy(cfs_rq))
		return;

	update_cfs_rq_blocked_load(cfs_rq, 1);

	if (se) {
		update_entity_load_avg(se, 1);
		/*
		 * We pivot on our runnable average having decayed to zero for
		 * list removal. This generally implies that all our children
		 * have also been removed (modulo rounding error or bandwidth
		 * control); however, such cases are rare and we can fix these
		 * at enqueue.
		 *
		 * TODO: fix up out-of-order children on enqueue.
		 */
		if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
			list_del_leaf_cfs_rq(cfs_rq);
	} else {
		struct rq *rq = rq_of(cfs_rq);
		update_rq_runnable_avg(rq, rq->nr_running);
	}
}
static void update_blocked_averages(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct cfs_rq *cfs_rq;
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);
	update_rq_clock(rq);
	/*
	 * Iterates the task_group tree in a bottom up fashion, see
	 * list_add_leaf_cfs_rq() for details.
	 */
	for_each_leaf_cfs_rq(rq, cfs_rq) {
		/*
		 * Note: We may want to consider periodically releasing
		 * rq->lock around these updates so that creating many task
		 * groups does not result in continually extending hold time.
		 */
		__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
	}

	raw_spin_unlock_irqrestore(&rq->lock, flags);
}
/*
 * Compute the cpu's hierarchical load factor for each task group.
 * This needs to be done in a top-down fashion because the load of a child
 * group is a fraction of its parent's load.
 */
static int tg_load_down(struct task_group *tg, void *data)
{
	unsigned long load;
	long cpu = (long)data;

	if (!tg->parent) {
		load = cpu_rq(cpu)->load.weight;
	} else {
		load = tg->parent->cfs_rq[cpu]->h_load;
		load *= tg->se[cpu]->load.weight;
		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
	}

	tg->cfs_rq[cpu]->h_load = load;

	return 0;
}

static void update_h_load(long cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long now = jiffies;

	if (rq->h_load_throttle == now)
		return;

	rq->h_load_throttle = now;

	rcu_read_lock();
	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
	rcu_read_unlock();
}
static unsigned long task_h_load(struct task_struct *p)
{
	struct cfs_rq *cfs_rq = task_cfs_rq(p);
	unsigned long load;

	load = p->se.load.weight;
	load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);

	return load;
}
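/*
 * Worked example: if the root rq has load.weight 2048 and a group's
 * entity weighs 512, tg_load_down() gives that group's cfs_rq an
 * h_load of 2048 * 512/2048 = 512. A task of weight 1024 on that group
 * queue, whose own load.weight totals 2048, then gets
 * task_h_load = 1024 * 512/2048 = 256: half of the group's
 * hierarchical share (the +1 terms only guard against divide-by-zero).
 */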
#else
static inline void update_blocked_averages(int cpu)
{
}

static inline void update_h_load(long cpu)
{
}

static unsigned long task_h_load(struct task_struct *p)
{
	return p->se.load.weight;
}
#endif
/********** Helpers for find_busiest_group ************************/
/*
 * sd_lb_stats - Structure to store the statistics of a sched_domain
 *		 during load balancing.
 */
struct sd_lb_stats {
	struct sched_group *busiest; /* Busiest group in this sd */
	struct sched_group *this;    /* Local group in this sd */
	unsigned long total_load;    /* Total load of all groups in sd */
	unsigned long total_pwr;     /* Total power of all groups in sd */
	unsigned long avg_load;	     /* Average load across all groups in sd */

	/** Statistics of this group */
	unsigned long this_load;
	unsigned long this_load_per_task;
	unsigned long this_nr_running;
	unsigned long this_has_capacity;
	unsigned int  this_idle_cpus;

	/* Statistics of the busiest group */
	unsigned int  busiest_idle_cpus;
	unsigned long max_load;
	unsigned long busiest_load_per_task;
	unsigned long busiest_nr_running;
	unsigned long busiest_group_capacity;
	unsigned long busiest_has_capacity;
	unsigned int  busiest_group_weight;

	int group_imb; /* Is there imbalance in this sd */
};

/*
 * sg_lb_stats - stats of a sched_group required for load_balancing
 */
struct sg_lb_stats {
	unsigned long avg_load; /* Avg load across the CPUs of the group */
	unsigned long group_load; /* Total load over the CPUs of the group */
	unsigned long sum_nr_running; /* Nr tasks running in the group */
	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
	unsigned long group_capacity;
	unsigned long idle_cpus;
	unsigned long group_weight;
	int group_imb; /* Is there an imbalance in the group? */
	int group_has_capacity; /* Is there extra capacity in the group? */
};
/**
 * get_sd_load_idx - Obtain the load index for a given sched domain.
 * @sd: The sched_domain whose load_idx is to be obtained.
 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
 */
static inline int get_sd_load_idx(struct sched_domain *sd,
					enum cpu_idle_type idle)
{
	int load_idx;

	switch (idle) {
	case CPU_NOT_IDLE:
		load_idx = sd->busy_idx;
		break;

	case CPU_NEWLY_IDLE:
		load_idx = sd->newidle_idx;
		break;
	default:
		load_idx = sd->idle_idx;
		break;
	}

	return load_idx;
}
static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
{
	return SCHED_POWER_SCALE;
}

unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
{
	return default_scale_freq_power(sd, cpu);
}

static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
{
	unsigned long weight = sd->span_weight;
	unsigned long smt_gain = sd->smt_gain;

	smt_gain /= weight;

	return smt_gain;
}

unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
{
	return default_scale_smt_power(sd, cpu);
}
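/*
 * Example: default_scale_smt_power() divides sd->smt_gain (1178 by
 * default, i.e. ~1.15 * SCHED_POWER_SCALE) across the siblings, so a
 * 2-thread SMT core reports ~589 per thread: two hardware threads are
 * assumed to be worth ~15% more than one full core, not two cores.
 */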
static unsigned long scale_rt_power(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	u64 total, available, age_stamp, avg;

	/*
	 * Since we're reading these variables without serialization make sure
	 * we read them once before doing sanity checks on them.
	 */
	age_stamp = ACCESS_ONCE(rq->age_stamp);
	avg = ACCESS_ONCE(rq->rt_avg);

	total = sched_avg_period() + (rq->clock - age_stamp);

	if (unlikely(total < avg)) {
		/* Ensures that power won't end up being negative */
		available = 0;
	} else {
		available = total - avg;
	}

	if (unlikely((s64)total < SCHED_POWER_SCALE))
		total = SCHED_POWER_SCALE;

	total >>= SCHED_POWER_SHIFT;

	return div_u64(available, total);
}
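/*
 * Numeric example: if RT (and irq) activity consumed 25% of the
 * averaging period, available/total is 0.75 and scale_rt_power()
 * returns ~768 out of SCHED_POWER_SCALE (1024), shrinking the
 * CFS-visible capacity of the cpu accordingly.
 */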
static void update_cpu_power(struct sched_domain *sd, int cpu)
{
	unsigned long weight = sd->span_weight;
	unsigned long power = SCHED_POWER_SCALE;
	struct sched_group *sdg = sd->groups;

	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
		if (sched_feat(ARCH_POWER))
			power *= arch_scale_smt_power(sd, cpu);
		else
			power *= default_scale_smt_power(sd, cpu);

		power >>= SCHED_POWER_SHIFT;
	}

	sdg->sgp->power_orig = power;

	if (sched_feat(ARCH_POWER))
		power *= arch_scale_freq_power(sd, cpu);
	else
		power *= default_scale_freq_power(sd, cpu);

	power >>= SCHED_POWER_SHIFT;

	power *= scale_rt_power(cpu);
	power >>= SCHED_POWER_SHIFT;

	if (!power)
		power = 1;

	cpu_rq(cpu)->cpu_power = power;
	sdg->sgp->power = power;
}
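/*
 * Putting the scalings together: an SMT sibling might start at 1024,
 * drop to ~589 after the smt factor, stay there under the default
 * (unit) frequency factor, and end at ~442 if scale_rt_power()
 * reports 25% RT pressure; power_orig keeps the ~589 value from
 * before the rt/freq scaling for later capacity checks.
 */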
void update_group_power(struct sched_domain *sd, int cpu)
{
	struct sched_domain *child = sd->child;
	struct sched_group *group, *sdg = sd->groups;
	unsigned long power;
	unsigned long interval;

	interval = msecs_to_jiffies(sd->balance_interval);
	interval = clamp(interval, 1UL, max_load_balance_interval);
	sdg->sgp->next_update = jiffies + interval;

	if (!child) {
		update_cpu_power(sd, cpu);
		return;
	}

	power = 0;

	if (child->flags & SD_OVERLAP) {
		/*
		 * SD_OVERLAP domains cannot assume that child groups
		 * span the current group.
		 */
		for_each_cpu(cpu, sched_group_cpus(sdg))
			power += power_of(cpu);
	} else {
		/*
		 * !SD_OVERLAP domains can assume that child groups
		 * span the current group.
		 */
		group = child->groups;
		do {
			power += group->sgp->power;
			group = group->next;
		} while (group != child->groups);
	}

	sdg->sgp->power_orig = sdg->sgp->power = power;
}
/*
 * Try and fix up capacity for tiny siblings, this is needed when
 * things like SD_ASYM_PACKING need f_b_g to select another sibling
 * which on its own isn't powerful enough.
 *
 * See update_sd_pick_busiest() and check_asym_packing().
 */
static inline int
fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
{
	/*
	 * Only siblings can have significantly less than SCHED_POWER_SCALE
	 */
	if (!(sd->flags & SD_SHARE_CPUPOWER))
		return 0;

	/*
	 * If ~90% of the cpu_power is still there, we're good.
	 */
	if (group->sgp->power * 32 > group->sgp->power_orig * 29)
		return 1;

	return 0;
}
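/*
 * The 32/29 test reads as power > ~90.6% of power_orig: e.g. with
 * power_orig = 589 and power = 540, 540*32 = 17280 > 589*29 = 17081,
 * so the sibling still counts as one full unit of capacity.
 */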
/**
 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
 * @env: The load balancing environment.
 * @group: sched_group whose statistics are to be updated.
 * @load_idx: Load index of sched_domain of this_cpu for load calc.
 * @local_group: Does group contain this_cpu.
 * @balance: Should we balance.
 * @sgs: variable to hold the statistics for this group.
 */
static inline void update_sg_lb_stats(struct lb_env *env,
			struct sched_group *group, int load_idx,
			int local_group, int *balance, struct sg_lb_stats *sgs)
{
	unsigned long nr_running, max_nr_running, min_nr_running;
	unsigned long load, max_cpu_load, min_cpu_load;
	unsigned int balance_cpu = -1, first_idle_cpu = 0;
	unsigned long avg_load_per_task = 0;
	int i;

	if (local_group)
		balance_cpu = group_balance_cpu(group);

	/* Tally up the load of all CPUs in the group */
	max_cpu_load = 0;
	min_cpu_load = ~0UL;
	max_nr_running = 0;
	min_nr_running = ~0UL;

	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
		struct rq *rq = cpu_rq(i);

		nr_running = rq->nr_running;

		/* Bias balancing toward cpus of our domain */
		if (local_group) {
			if (idle_cpu(i) && !first_idle_cpu &&
					cpumask_test_cpu(i, sched_group_mask(group))) {
				first_idle_cpu = 1;
				balance_cpu = i;
			}

			load = target_load(i, load_idx);
		} else {
			load = source_load(i, load_idx);
			if (load > max_cpu_load)
				max_cpu_load = load;
			if (min_cpu_load > load)
				min_cpu_load = load;

			if (nr_running > max_nr_running)
				max_nr_running = nr_running;
			if (min_nr_running > nr_running)
				min_nr_running = nr_running;
		}

		sgs->group_load += load;
		sgs->sum_nr_running += nr_running;
		sgs->sum_weighted_load += weighted_cpuload(i);
		if (idle_cpu(i))
			sgs->idle_cpus++;
	}

	/*
	 * First idle cpu or the first cpu (busiest) in this sched group
	 * is eligible for doing load balancing at this and above
	 * domains. In the newly idle case, we will allow all the cpus
	 * to do the newly idle load balance.
	 */
	if (local_group) {
		if (env->idle != CPU_NEWLY_IDLE) {
			if (balance_cpu != env->dst_cpu) {
				*balance = 0;
				return;
			}
			update_group_power(env->sd, env->dst_cpu);
		} else if (time_after_eq(jiffies, group->sgp->next_update))
			update_group_power(env->sd, env->dst_cpu);
	}

	/* Adjust by relative CPU power of the group */
	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;

	/*
	 * Consider the group unbalanced when the imbalance is larger
	 * than the average weight of a task.
	 *
	 * APZ: with cgroup the avg task weight can vary wildly and
	 *      might not be a suitable number - should we keep a
	 *      normalized nr_running number somewhere that negates
	 *      the hierarchy?
	 */
	if (sgs->sum_nr_running)
		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;

	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
	    (max_nr_running - min_nr_running) > 1)
		sgs->group_imb = 1;

	sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
						SCHED_POWER_SCALE);
	if (!sgs->group_capacity)
		sgs->group_capacity = fix_small_capacity(env->sd, group);
	sgs->group_weight = group->group_weight;

	if (sgs->group_capacity > sgs->sum_nr_running)
		sgs->group_has_capacity = 1;
}
/**
 * update_sd_pick_busiest - return 1 on busiest group
 * @env: The load balancing environment.
 * @sds: sched_domain statistics
 * @sg: sched_group candidate to be checked for being the busiest
 * @sgs: sched_group statistics
 *
 * Determine if @sg is a busier group than the previously selected
 * busiest group.
 */
static bool update_sd_pick_busiest(struct lb_env *env,
				   struct sd_lb_stats *sds,
				   struct sched_group *sg,
				   struct sg_lb_stats *sgs)
{
	if (sgs->avg_load <= sds->max_load)
		return false;

	if (sgs->sum_nr_running > sgs->group_capacity)
		return true;

	if (sgs->group_imb)
		return true;

	/*
	 * ASYM_PACKING needs to move all the work to the lowest
	 * numbered CPUs in the group, therefore mark all groups
	 * higher than ourself as busy.
	 */
	if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
	    env->dst_cpu < group_first_cpu(sg)) {
		if (!sds->busiest)
			return true;

		if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
			return true;
	}

	return false;
}
/**
 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
 * @env: The load balancing environment.
 * @balance: Should we balance.
 * @sds: variable to hold the statistics for this sched_domain.
 */
static inline void update_sd_lb_stats(struct lb_env *env,
					int *balance, struct sd_lb_stats *sds)
{
	struct sched_domain *child = env->sd->child;
	struct sched_group *sg = env->sd->groups;
	struct sg_lb_stats sgs;
	int load_idx, prefer_sibling = 0;

	if (child && child->flags & SD_PREFER_SIBLING)
		prefer_sibling = 1;

	load_idx = get_sd_load_idx(env->sd, env->idle);

	do {
		int local_group;

		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
		memset(&sgs, 0, sizeof(sgs));
		update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);

		if (local_group && !(*balance))
			return;

		sds->total_load += sgs.group_load;
		sds->total_pwr += sg->sgp->power;

		/*
		 * In case the child domain prefers tasks go to siblings
		 * first, lower the sg capacity to one so that we'll try
		 * and move all the excess tasks away. We lower the capacity
		 * of a group only if the local group has the capacity to fit
		 * these excess tasks, i.e. nr_running < group_capacity. The
		 * extra check prevents the case where you always pull from the
		 * heaviest group when it is already under-utilized (possible
		 * when a large-weight task outweighs the tasks on the system).
		 */
		if (prefer_sibling && !local_group && sds->this_has_capacity)
			sgs.group_capacity = min(sgs.group_capacity, 1UL);

		if (local_group) {
			sds->this_load = sgs.avg_load;
			sds->this = sg;
			sds->this_nr_running = sgs.sum_nr_running;
			sds->this_load_per_task = sgs.sum_weighted_load;
			sds->this_has_capacity = sgs.group_has_capacity;
			sds->this_idle_cpus = sgs.idle_cpus;
		} else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
			sds->max_load = sgs.avg_load;
			sds->busiest = sg;
			sds->busiest_nr_running = sgs.sum_nr_running;
			sds->busiest_idle_cpus = sgs.idle_cpus;
			sds->busiest_group_capacity = sgs.group_capacity;
			sds->busiest_load_per_task = sgs.sum_weighted_load;
			sds->busiest_has_capacity = sgs.group_has_capacity;
			sds->busiest_group_weight = sgs.group_weight;
			sds->group_imb = sgs.group_imb;
		}

		sg = sg->next;
	} while (sg != env->sd->groups);
}
/**
 * check_asym_packing - Check to see if the group is packed into the
 *			sched domain.
 *
 * This is primarily intended to be used at the sibling level. Some
 * cores like POWER7 prefer to use lower numbered SMT threads. In the
 * case of POWER7, it can move to lower SMT modes only when higher
 * threads are idle. When in lower SMT modes, the threads will
 * perform better since they share less core resources. Hence when we
 * have idle threads, we want them to be the higher ones.
 *
 * This packing function is run on idle threads. It checks to see if
 * the busiest CPU in this domain (core in the P7 case) has a higher
 * CPU number than the packing function is being run on. Here we are
 * assuming a lower CPU number will be equivalent to a lower SMT thread
 * number.
 *
 * Returns 1 when packing is required and a task should be moved to
 * this CPU. The amount of the imbalance is returned in *imbalance.
 *
 * @env: The load balancing environment.
 * @sds: Statistics of the sched_domain which is to be packed
 */
static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
{
	int busiest_cpu;

	if (!(env->sd->flags & SD_ASYM_PACKING))
		return 0;

	if (!sds->busiest)
		return 0;

	busiest_cpu = group_first_cpu(sds->busiest);
	if (env->dst_cpu > busiest_cpu)
		return 0;

	env->imbalance = DIV_ROUND_CLOSEST(
		sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);

	return 1;
}
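/*
 * Example: on an SD_ASYM_PACKING domain (e.g. POWER7 SMT), if the
 * busiest group's first cpu is 4 and we are balancing on cpu 0, the
 * check passes and env->imbalance is set to the busiest group's whole
 * scaled load, pulling work down to the lowest-numbered thread; were
 * we cpu 6 instead, dst_cpu > busiest_cpu and nothing is forced.
 */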
/**
 * fix_small_imbalance - Calculate the minor imbalance that exists
 *			 amongst the groups of a sched_domain, during
 *			 load balancing.
 * @env: The load balancing environment.
 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
 */
static inline
void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
	unsigned long tmp, pwr_now = 0, pwr_move = 0;
	unsigned int imbn = 2;
	unsigned long scaled_busy_load_per_task;

	if (sds->this_nr_running) {
		sds->this_load_per_task /= sds->this_nr_running;
		if (sds->busiest_load_per_task >
				sds->this_load_per_task)
			imbn = 1;
	} else {
		sds->this_load_per_task =
			cpu_avg_load_per_task(env->dst_cpu);
	}

	scaled_busy_load_per_task = sds->busiest_load_per_task
					 * SCHED_POWER_SCALE;
	scaled_busy_load_per_task /= sds->busiest->sgp->power;

	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
			(scaled_busy_load_per_task * imbn)) {
		env->imbalance = sds->busiest_load_per_task;
		return;
	}

	/*
	 * OK, we don't have enough imbalance to justify moving tasks,
	 * however we may be able to increase total CPU power used by
	 * moving them.
	 */

	pwr_now += sds->busiest->sgp->power *
			min(sds->busiest_load_per_task, sds->max_load);
	pwr_now += sds->this->sgp->power *
			min(sds->this_load_per_task, sds->this_load);
	pwr_now /= SCHED_POWER_SCALE;

	/* Amount of load we'd subtract */
	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
		sds->busiest->sgp->power;
	if (sds->max_load > tmp)
		pwr_move += sds->busiest->sgp->power *
			min(sds->busiest_load_per_task, sds->max_load - tmp);

	/* Amount of load we'd add */
	if (sds->max_load * sds->busiest->sgp->power <
		sds->busiest_load_per_task * SCHED_POWER_SCALE)
		tmp = (sds->max_load * sds->busiest->sgp->power) /
			sds->this->sgp->power;
	else
		tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
			sds->this->sgp->power;
	pwr_move += sds->this->sgp->power *
			min(sds->this_load_per_task, sds->this_load + tmp);
	pwr_move /= SCHED_POWER_SCALE;

	/* Move if we gain throughput */
	if (pwr_move > pwr_now)
		env->imbalance = sds->busiest_load_per_task;
}
/**
 * calculate_imbalance - Calculate the amount of imbalance present within the
 *			 groups of a given sched_domain during load balance.
 * @env: load balance environment
 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
 */
static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
	unsigned long max_pull, load_above_capacity = ~0UL;

	sds->busiest_load_per_task /= sds->busiest_nr_running;
	if (sds->group_imb) {
		sds->busiest_load_per_task =
			min(sds->busiest_load_per_task, sds->avg_load);
	}

	/*
	 * In the presence of smp nice balancing, certain scenarios can have
	 * max load less than avg load (as we skip the groups at or below
	 * its cpu_power, while calculating max_load..)
	 */
	if (sds->max_load < sds->avg_load) {
		env->imbalance = 0;
		return fix_small_imbalance(env, sds);
	}

	if (!sds->group_imb) {
		/*
		 * Don't want to pull so many tasks that a group would go idle.
		 */
		load_above_capacity = (sds->busiest_nr_running -
						sds->busiest_group_capacity);

		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);

		load_above_capacity /= sds->busiest->sgp->power;
	}

	/*
	 * We're trying to get all the cpus to the average_load, so we don't
	 * want to push ourselves above the average load, nor do we wish to
	 * reduce the max loaded cpu below the average load. At the same time,
	 * we also don't want to reduce the group load below the group capacity
	 * (so that we can implement power-savings policies etc). Thus we look
	 * for the minimum possible imbalance.
	 * Be careful of negative numbers as they'll appear as very large values
	 * with unsigned longs.
	 */
	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);

	/* How much load to actually move to equalise the imbalance */
	env->imbalance = min(max_pull * sds->busiest->sgp->power,
		(sds->avg_load - sds->this_load) * sds->this->sgp->power)
			/ SCHED_POWER_SCALE;

	/*
	 * if *imbalance is less than the average load per runnable task
	 * there is no guarantee that any tasks will be moved, so we'll have
	 * a think about bumping its value to force at least one task to be
	 * moved.
	 */
	if (env->imbalance < sds->busiest_load_per_task)
		return fix_small_imbalance(env, sds);
}
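/*
 * Worked example for the max_pull clamp: with avg_load = 1000,
 * max_load = 1400, this_load = 600 and both groups at full power
 * (1024), load_above_capacity permitting, max_pull = 400 and the
 * imbalance is min(400, 1000 - 600) * 1024/1024 = 400: enough to pull
 * the busiest group down to the average without pushing us above it.
 */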
/******* find_busiest_group() helpers end here *********************/

/**
 * find_busiest_group - Returns the busiest group within the sched_domain
 * if there is an imbalance. If there isn't an imbalance, and
 * the user has opted for power-savings, it returns a group whose
 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
 * such a group exists.
 *
 * Also calculates the amount of weighted load which should be moved
 * to restore balance.
 *
 * @env: The load balancing environment.
 * @balance: Pointer to a variable indicating if this_cpu
 *	is the appropriate cpu to perform load balancing at this_level.
 *
 * Returns:	- the busiest group if imbalance exists.
 *		- If no imbalance and user has opted for power-savings balance,
 *		  return the least loaded group whose CPUs can be
 *		  put to idle by rebalancing its tasks onto our group.
 */
static struct sched_group *
find_busiest_group(struct lb_env *env, int *balance)
{
	struct sd_lb_stats sds;

	memset(&sds, 0, sizeof(sds));

	/*
	 * Compute the various statistics relevant for load balancing at
	 * this level.
	 */
	update_sd_lb_stats(env, balance, &sds);

	/*
	 * this_cpu is not the appropriate cpu to perform load balancing at
	 * this level.
	 */
	if (!(*balance))
		goto ret;

	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
	    check_asym_packing(env, &sds))
		return sds.busiest;

	/* There is no busy sibling group to pull tasks from */
	if (!sds.busiest || sds.busiest_nr_running == 0)
		goto out_balanced;

	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;

	/*
	 * If the busiest group is imbalanced the below checks don't
	 * work because they assume all things are equal, which typically
	 * isn't true due to cpus_allowed constraints and the like.
	 */
	if (sds.group_imb)
		goto force_balance;

	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
	if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
			!sds.busiest_has_capacity)
		goto force_balance;

	/*
	 * If the local group is more busy than the selected busiest group
	 * don't try and pull any tasks.
	 */
	if (sds.this_load >= sds.max_load)
		goto out_balanced;

	/*
	 * Don't pull any tasks if this group is already above the domain
	 * average load.
	 */
	if (sds.this_load >= sds.avg_load)
		goto out_balanced;

	if (env->idle == CPU_IDLE) {
		/*
		 * This cpu is idle. If the busiest group doesn't
		 * have more tasks than the number of available cpus and
		 * there is no imbalance between this and the busiest group
		 * wrt idle cpus, it is balanced.
		 */
		if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
		    sds.busiest_nr_running <= sds.busiest_group_weight)
			goto out_balanced;
	} else {
		/*
		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
		 * imbalance_pct to be conservative.
		 */
		if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
			goto out_balanced;
	}

force_balance:
	/* Looks like there is an imbalance. Compute it */
	calculate_imbalance(env, &sds);
	return sds.busiest;

out_balanced:
ret:
	env->imbalance = 0;
	return NULL;
}
/*
 * find_busiest_queue - find the busiest runqueue among the cpus in group.
 */
static struct rq *find_busiest_queue(struct lb_env *env,
				     struct sched_group *group)
{
	struct rq *busiest = NULL, *rq;
	unsigned long max_load = 0;
	int i;

	for_each_cpu(i, sched_group_cpus(group)) {
		unsigned long power = power_of(i);
		unsigned long capacity = DIV_ROUND_CLOSEST(power,
							   SCHED_POWER_SCALE);
		unsigned long wl;

		if (!capacity)
			capacity = fix_small_capacity(env->sd, group);

		if (!cpumask_test_cpu(i, env->cpus))
			continue;

		rq = cpu_rq(i);
		wl = weighted_cpuload(i);

		/*
		 * When comparing with imbalance, use weighted_cpuload()
		 * which is not scaled with the cpu power.
		 */
		if (capacity && rq->nr_running == 1 && wl > env->imbalance)
			continue;

		/*
		 * For the load comparisons with the other cpus, consider
		 * the weighted_cpuload() scaled with the cpu power, so that
		 * the load can be moved away from the cpu that is potentially
		 * running at a lower capacity.
		 */
		wl = (wl * SCHED_POWER_SCALE) / power;

		if (wl > max_load) {
			max_load = wl;
			busiest = rq;
		}
	}

	return busiest;
}
/*
 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
 * so long as it is large enough.
 */
#define MAX_PINNED_INTERVAL	512

/* Working cpumask for load_balance and load_balance_newidle. */
DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);

static int need_active_balance(struct lb_env *env)
{
	struct sched_domain *sd = env->sd;

	if (env->idle == CPU_NEWLY_IDLE) {

		/*
		 * ASYM_PACKING needs to force migrate tasks from busy but
		 * higher numbered CPUs in order to pack all tasks in the
		 * lowest numbered CPUs.
		 */
		if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
			return 1;
	}

	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
static int active_load_balance_cpu_stop(void *data);

/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */
static int load_balance(int this_cpu, struct rq *this_rq,
			struct sched_domain *sd, enum cpu_idle_type idle,
			int *balance)
{
	int ld_moved, cur_ld_moved, active_balance = 0;
	struct sched_group *group;
	struct rq *busiest;
	unsigned long flags;
	struct cpumask *cpus = __get_cpu_var(load_balance_mask);

	struct lb_env env = {
		.sd		= sd,
		.dst_cpu	= this_cpu,
		.dst_rq		= this_rq,
		.dst_grpmask	= sched_group_cpus(sd->groups),
		.idle		= idle,
		.loop_break	= sched_nr_migrate_break,
		.cpus		= cpus,
	};

	/*
	 * For NEWLY_IDLE load_balancing, we don't need to consider
	 * other cpus in our group
	 */
	if (idle == CPU_NEWLY_IDLE)
		env.dst_grpmask = NULL;

	cpumask_copy(cpus, cpu_active_mask);

	schedstat_inc(sd, lb_count[idle]);

redo:
	group = find_busiest_group(&env, balance);

	if (*balance == 0)
		goto out_balanced;

	if (!group) {
		schedstat_inc(sd, lb_nobusyg[idle]);
		goto out_balanced;
	}

	busiest = find_busiest_queue(&env, group);
	if (!busiest) {
		schedstat_inc(sd, lb_nobusyq[idle]);
		goto out_balanced;
	}

	BUG_ON(busiest == env.dst_rq);

	schedstat_add(sd, lb_imbalance[idle], env.imbalance);

	ld_moved = 0;
	if (busiest->nr_running > 1) {
		/*
		 * Attempt to move tasks. If find_busiest_group has found
		 * an imbalance but busiest->nr_running <= 1, the group is
		 * still unbalanced. ld_moved simply stays zero, so it is
		 * correctly treated as an imbalance.
		 */
		env.flags |= LBF_ALL_PINNED;
		env.src_cpu   = busiest->cpu;
		env.src_rq    = busiest;
		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);

		update_h_load(env.src_cpu);
more_balance:
		local_irq_save(flags);
		double_rq_lock(env.dst_rq, busiest);

		/*
		 * cur_ld_moved - load moved in current iteration
		 * ld_moved    - cumulative load moved across iterations
		 */
		cur_ld_moved = move_tasks(&env);
		ld_moved += cur_ld_moved;
		double_rq_unlock(env.dst_rq, busiest);
		local_irq_restore(flags);

		/*
		 * some other cpu did the load balance for us.
		 */
		if (cur_ld_moved && env.dst_cpu != smp_processor_id())
			resched_cpu(env.dst_cpu);

		if (env.flags & LBF_NEED_BREAK) {
			env.flags &= ~LBF_NEED_BREAK;
			goto more_balance;
		}

		/*
		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
		 * us and move them to an alternate dst_cpu in our sched_group
		 * where they can run. The upper limit on how many times we
		 * iterate on same src_cpu is dependent on number of cpus in our
		 * sched_group.
		 *
		 * This changes load balance semantics a bit on who can move
		 * load to a given_cpu. In addition to the given_cpu itself
		 * (or an ilb_cpu acting on its behalf where given_cpu is
		 * nohz-idle), we now have balance_cpu in a position to move
		 * load to given_cpu. In rare situations, this may cause
		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
		 * _independently_ and at _same_ time to move some load to
		 * given_cpu) causing excess load to be moved to given_cpu.
		 * This however should not happen so much in practice and
		 * moreover subsequent load balance cycles should correct the
		 * excess load moved.
		 */
		if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {

			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
			env.dst_cpu	 = env.new_dst_cpu;
			env.flags	&= ~LBF_SOME_PINNED;
			env.loop	 = 0;
			env.loop_break	 = sched_nr_migrate_break;

			/* Prevent re-selecting dst_cpu via env's cpus */
			cpumask_clear_cpu(env.dst_cpu, env.cpus);

			/*
			 * Go back to "more_balance" rather than "redo" since we
			 * need to continue with same src_cpu.
			 */
			goto more_balance;
		}

		/* All tasks on this runqueue were pinned by CPU affinity */
		if (unlikely(env.flags & LBF_ALL_PINNED)) {
			cpumask_clear_cpu(cpu_of(busiest), cpus);
			if (!cpumask_empty(cpus)) {
				env.loop = 0;
				env.loop_break = sched_nr_migrate_break;
				goto redo;
			}
			goto out_balanced;
		}
	}

	if (!ld_moved) {
		schedstat_inc(sd, lb_failed[idle]);
		/*
		 * Increment the failure counter only on periodic balance.
		 * We do not want newidle balance, which can be very
		 * frequent, pollute the failure counter causing
		 * excessive cache_hot migrations and active balances.
		 */
		if (idle != CPU_NEWLY_IDLE)
			sd->nr_balance_failed++;

		if (need_active_balance(&env)) {
			raw_spin_lock_irqsave(&busiest->lock, flags);

			/* don't kick the active_load_balance_cpu_stop,
			 * if the curr task on busiest cpu can't be
			 * moved to this_cpu
			 */
			if (!cpumask_test_cpu(this_cpu,
					tsk_cpus_allowed(busiest->curr))) {
				raw_spin_unlock_irqrestore(&busiest->lock,
							    flags);
				env.flags |= LBF_ALL_PINNED;
				goto out_one_pinned;
			}

			/*
			 * ->active_balance synchronizes accesses to
			 * ->active_balance_work. Once set, it's cleared
			 * only after active load balance is finished.
			 */
			if (!busiest->active_balance) {
				busiest->active_balance = 1;
				busiest->push_cpu = this_cpu;
				active_balance = 1;
			}
			raw_spin_unlock_irqrestore(&busiest->lock, flags);

			if (active_balance) {
				stop_one_cpu_nowait(cpu_of(busiest),
					active_load_balance_cpu_stop, busiest,
					&busiest->active_balance_work);
			}

			/*
			 * We've kicked active balancing, reset the failure
			 * counter.
			 */
			sd->nr_balance_failed = sd->cache_nice_tries+1;
		}
	} else
		sd->nr_balance_failed = 0;

	if (likely(!active_balance)) {
		/* We were unbalanced, so reset the balancing interval */
		sd->balance_interval = sd->min_interval;
	} else {
		/*
		 * If we've begun active balancing, start to back off. This
		 * case may not be covered by the all_pinned logic if there
		 * is only 1 task on the busy runqueue (because we don't call
		 * move_tasks).
		 */
		if (sd->balance_interval < sd->max_interval)
			sd->balance_interval *= 2;
	}

	goto out;

out_balanced:
	schedstat_inc(sd, lb_balanced[idle]);

	sd->nr_balance_failed = 0;

out_one_pinned:
	/* tune up the balancing interval */
	if (((env.flags & LBF_ALL_PINNED) &&
			sd->balance_interval < MAX_PINNED_INTERVAL) ||
			(sd->balance_interval < sd->max_interval))
		sd->balance_interval *= 2;

	ld_moved = 0;
out:
	return ld_moved;
}
#ifdef CONFIG_SCHED_HMP
static unsigned int hmp_idle_pull(int this_cpu);
#endif

/*
 * idle_balance is called by schedule() if this_cpu is about to become
 * idle. Attempts to pull tasks from other CPUs.
 */
void idle_balance(int this_cpu, struct rq *this_rq)
{
	struct sched_domain *sd;
	int pulled_task = 0;
	unsigned long next_balance = jiffies + HZ;

	this_rq->idle_stamp = this_rq->clock;

	if (this_rq->avg_idle < sysctl_sched_migration_cost)
		return;

	/*
	 * Drop the rq->lock, but keep IRQ/preempt disabled.
	 */
	raw_spin_unlock(&this_rq->lock);

	update_blocked_averages(this_cpu);
	rcu_read_lock();
	for_each_domain(this_cpu, sd) {
		unsigned long interval;
		int balance = 1;

		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;

		if (sd->flags & SD_BALANCE_NEWIDLE) {
			/* If we've pulled tasks over stop searching: */
			pulled_task = load_balance(this_cpu, this_rq,
						   sd, CPU_NEWLY_IDLE, &balance);
		}

		interval = msecs_to_jiffies(sd->balance_interval);
		if (time_after(next_balance, sd->last_balance + interval))
			next_balance = sd->last_balance + interval;
		if (pulled_task) {
			this_rq->idle_stamp = 0;
			break;
		}
	}
	rcu_read_unlock();
#ifdef CONFIG_SCHED_HMP
	if (!pulled_task)
		pulled_task = hmp_idle_pull(this_cpu);
#endif
	raw_spin_lock(&this_rq->lock);

	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
		/*
		 * We are going idle. next_balance may be set based on
		 * a busy processor. So reset next_balance.
		 */
		this_rq->next_balance = next_balance;
	}
}
/*
 * active_load_balance_cpu_stop is run by the cpu stopper. It pushes
 * running tasks off the busiest CPU onto idle CPUs. It requires at
 * least 1 task to be running on each physical CPU where possible, and
 * avoids physical / logical imbalances.
 */
static int active_load_balance_cpu_stop(void *data)
{
	struct rq *busiest_rq = data;
	int busiest_cpu = cpu_of(busiest_rq);
	int target_cpu = busiest_rq->push_cpu;
	struct rq *target_rq = cpu_rq(target_cpu);
	struct sched_domain *sd;

	raw_spin_lock_irq(&busiest_rq->lock);

	/* make sure the requested cpu hasn't gone down in the meantime */
	if (unlikely(busiest_cpu != smp_processor_id() ||
		     !busiest_rq->active_balance))
		goto out_unlock;

	/* Is there any task to move? */
	if (busiest_rq->nr_running <= 1)
		goto out_unlock;

	/*
	 * This condition is "impossible", if it occurs
	 * we need to fix it. Originally reported by
	 * Bjorn Helgaas on a 128-cpu setup.
	 */
	BUG_ON(busiest_rq == target_rq);

	/* move a task from busiest_rq to target_rq */
	double_lock_balance(busiest_rq, target_rq);

	/* Search for an sd spanning us and the target CPU. */
	rcu_read_lock();
	for_each_domain(target_cpu, sd) {
		if ((sd->flags & SD_LOAD_BALANCE) &&
		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
			break;
	}

	if (likely(sd)) {
		struct lb_env env = {
			.sd		= sd,
			.dst_cpu	= target_cpu,
			.dst_rq		= target_rq,
			.src_cpu	= busiest_rq->cpu,
			.src_rq		= busiest_rq,
			.idle		= CPU_IDLE,
		};

		schedstat_inc(sd, alb_count);

		if (move_one_task(&env))
			schedstat_inc(sd, alb_pushed);
		else
			schedstat_inc(sd, alb_failed);
	}
	rcu_read_unlock();
	double_unlock_balance(busiest_rq, target_rq);
out_unlock:
	busiest_rq->active_balance = 0;
	raw_spin_unlock_irq(&busiest_rq->lock);
	return 0;
}
#ifdef CONFIG_NO_HZ_COMMON
/*
 * idle load balancing details
 * - When one of the busy CPUs notices that there may be an idle rebalancing
 *   needed, it will kick the idle load balancer, which then does idle
 *   load balancing for all the idle CPUs.
 */
static struct {
	cpumask_var_t idle_cpus_mask;
	atomic_t nr_cpus;
	unsigned long next_balance;	/* in jiffy units */
} nohz ____cacheline_aligned;

#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
/*
 * Decide if the tasks on the busy CPUs in the
 * littlest domain would benefit from an idle balance
 */
static int hmp_packing_ilb_needed(int cpu)
{
	struct hmp_domain *hmp;
	/* always allow ilb on non-slowest domain */
	if (!hmp_cpu_is_slowest(cpu))
		return 1;

	hmp = hmp_cpu_domain(cpu);
	for_each_cpu_and(cpu, &hmp->cpus, nohz.idle_cpus_mask) {
		/* only idle balance if a CPU is loaded over threshold */
		if (cpu_rq(cpu)->avg.load_avg_ratio > hmp_full_threshold)
			return 1;
	}
	return 0;
}
#endif

static inline int find_new_ilb(int call_cpu)
{
	int ilb = cpumask_first(nohz.idle_cpus_mask);
#ifdef CONFIG_SCHED_HMP
	int ilb_needed = 1;

	/* restrict nohz balancing to occur in the same hmp domain */
	ilb = cpumask_first_and(nohz.idle_cpus_mask,
			&((struct hmp_domain *)hmp_cpu_domain(call_cpu))->cpus);

#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
	if (ilb < nr_cpu_ids)
		ilb_needed = hmp_packing_ilb_needed(ilb);
#endif

	if (ilb_needed && ilb < nr_cpu_ids && idle_cpu(ilb))
		return ilb;
#else
	if (ilb < nr_cpu_ids && idle_cpu(ilb))
		return ilb;
#endif

	return nr_cpu_ids;
}
/*
 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
 * CPU (if there is one).
 */
static void nohz_balancer_kick(int cpu)
{
	int ilb_cpu;

	nohz.next_balance++;

	ilb_cpu = find_new_ilb(cpu);

	if (ilb_cpu >= nr_cpu_ids)
		return;

	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
		return;
	/*
	 * Use smp_send_reschedule() instead of resched_cpu().
	 * This way we generate a sched IPI on the target cpu which
	 * is idle. And the softirq performing nohz idle load balance
	 * will be run before returning from the IPI.
	 */
	smp_send_reschedule(ilb_cpu);
	return;
}
static inline void nohz_balance_exit_idle(int cpu)
{
	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
		atomic_dec(&nohz.nr_cpus);
		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
	}
}

static inline void set_cpu_sd_state_busy(void)
{
	struct sched_domain *sd;
	int cpu = smp_processor_id();

	rcu_read_lock();
	sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);

	if (!sd || !sd->nohz_idle)
		goto unlock;
	sd->nohz_idle = 0;

	for (; sd; sd = sd->parent)
		atomic_inc(&sd->groups->sgp->nr_busy_cpus);
unlock:
	rcu_read_unlock();
}

void set_cpu_sd_state_idle(void)
{
	struct sched_domain *sd;
	int cpu = smp_processor_id();

	rcu_read_lock();
	sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);

	if (!sd || sd->nohz_idle)
		goto unlock;
	sd->nohz_idle = 1;

	for (; sd; sd = sd->parent)
		atomic_dec(&sd->groups->sgp->nr_busy_cpus);
unlock:
	rcu_read_unlock();
}

/*
 * This routine will record that the cpu is going idle with tick stopped.
 * This info will be used in performing idle load balancing in the future.
 */
void nohz_balance_enter_idle(int cpu)
{
	/*
	 * If this cpu is going down, then nothing needs to be done.
	 */
	if (!cpu_active(cpu))
		return;

	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
		return;

	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
	atomic_inc(&nohz.nr_cpus);
	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}

static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
					unsigned long action, void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DYING:
		nohz_balance_exit_idle(smp_processor_id());
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}
#endif
static DEFINE_SPINLOCK(balancing);

/*
 * Scale the max load_balance interval with the number of CPUs in the system.
 * This trades load-balance latency on larger machines for less cross talk.
 */
void update_max_interval(void)
{
	max_load_balance_interval = HZ*num_online_cpus()/10;
}
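/*
 * Example: with HZ = 1000 and 8 cpus online, max_load_balance_interval
 * becomes 1000*8/10 = 800 jiffies (800ms); balance intervals clamped
 * against it elsewhere can therefore stretch further on larger machines.
 */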
/*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
 *
 * Balancing parameters are set up in init_sched_domains.
 */
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
	int balance = 1;
	struct rq *rq = cpu_rq(cpu);
	unsigned long interval;
	struct sched_domain *sd;
	/* Earliest time when we have to do rebalance again */
	unsigned long next_balance = jiffies + 60*HZ;
	int update_next_balance = 0;
	int need_serialize;

	update_blocked_averages(cpu);

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;

		interval = sd->balance_interval;
		if (idle != CPU_IDLE)
			interval *= sd->busy_factor;

		/* scale ms to jiffies */
		interval = msecs_to_jiffies(interval);
		interval = clamp(interval, 1UL, max_load_balance_interval);

		need_serialize = sd->flags & SD_SERIALIZE;

		if (need_serialize) {
			if (!spin_trylock(&balancing))
				goto out;
		}

		if (time_after_eq(jiffies, sd->last_balance + interval)) {
			if (load_balance(cpu, rq, sd, idle, &balance)) {
				/*
				 * The LBF_SOME_PINNED logic could have changed
				 * env->dst_cpu, so we can't know our idle
				 * state even if we migrated tasks. Update it.
				 */
				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
			}
			sd->last_balance = jiffies;
		}
		if (need_serialize)
			spin_unlock(&balancing);
out:
		if (time_after(next_balance, sd->last_balance + interval)) {
			next_balance = sd->last_balance + interval;
			update_next_balance = 1;
		}

		/*
		 * Stop the load balance at this level. There is another
		 * CPU in our sched group which is doing load balancing more
		 * actively.
		 */
		if (!balance)
			break;
	}
	rcu_read_unlock();

	/*
	 * next_balance will be updated only when there is a need.
	 * When the cpu is attached to a null domain for ex, it will not be
	 * updated.
	 */
	if (likely(update_next_balance))
		rq->next_balance = next_balance;
}
#ifdef CONFIG_NO_HZ_COMMON
/*
 * In the CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
 * rebalancing for all the cpus for whom scheduler ticks are stopped.
 */
static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
{
	struct rq *this_rq = cpu_rq(this_cpu);
	struct rq *rq;
	int balance_cpu;

	if (idle != CPU_IDLE ||
	    !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
		goto end;

	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
			continue;

		/*
		 * If this cpu gets work to do, stop the load balancing
		 * work being done for other cpus. Next load
		 * balancing owner will pick it up.
		 */
		if (need_resched())
			break;

		rq = cpu_rq(balance_cpu);

		raw_spin_lock_irq(&rq->lock);
		update_rq_clock(rq);
		update_idle_cpu_load(rq);
		raw_spin_unlock_irq(&rq->lock);

		rebalance_domains(balance_cpu, CPU_IDLE);

		if (time_after(this_rq->next_balance, rq->next_balance))
			this_rq->next_balance = rq->next_balance;
	}
	nohz.next_balance = this_rq->next_balance;
end:
	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
}
/*
 * Current heuristic for kicking the idle load balancer in the presence
 * of an idle cpu in the system:
 *   - This rq has more than one task.
 *   - At any scheduler domain level, this cpu's scheduler group has multiple
 *     busy cpus exceeding the group's power.
 *   - For SD_ASYM_PACKING, if the lower numbered cpus in the scheduler
 *     domain span are idle.
 */
static inline int nohz_kick_needed(struct rq *rq, int cpu)
{
	unsigned long now = jiffies;
	struct sched_domain *sd;

	if (unlikely(idle_cpu(cpu)))
		return 0;

	/*
	 * We may have been in ticked or tickless idle mode recently. At the
	 * first busy tick after returning from idle, we will update the busy
	 * stats.
	 */
	set_cpu_sd_state_busy();
	nohz_balance_exit_idle(cpu);

	/*
	 * None are in tickless mode and hence no need for NOHZ idle load
	 * balancing.
	 */
	if (likely(!atomic_read(&nohz.nr_cpus)))
		return 0;

	if (time_before(now, nohz.next_balance))
		return 0;

#ifdef CONFIG_SCHED_HMP
	/*
	 * Bail out if there are no nohz CPUs in our
	 * HMP domain, since we will move tasks between
	 * domains through wakeup and force balancing
	 * as necessary based upon task load.
	 */
	if (cpumask_first_and(nohz.idle_cpus_mask,
			&((struct hmp_domain *)hmp_cpu_domain(cpu))->cpus) >= nr_cpu_ids)
		return 0;
#endif

	if (rq->nr_running >= 2)
		goto need_kick;

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		struct sched_group *sg = sd->groups;
		struct sched_group_power *sgp = sg->sgp;
		int nr_busy = atomic_read(&sgp->nr_busy_cpus);

		if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
			goto need_kick_unlock;

		if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
		    && (cpumask_first_and(nohz.idle_cpus_mask,
					  sched_domain_span(sd)) < cpu))
			goto need_kick_unlock;

		if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
			break;
	}
	rcu_read_unlock();
	return 0;

need_kick_unlock:
	rcu_read_unlock();
need_kick:
	return 1;
}
#else
static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
#endif
6667 #ifdef CONFIG_SCHED_HMP
6668 /* Check if task should migrate to a faster cpu */
6669 static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se)
6671 struct task_struct *p = task_of(se);
6672 int temp_target_cpu;
6675 if (hmp_cpu_is_fastest(cpu))
6678 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
6679 /* Filter by task priority */
6680 if (p->prio >= hmp_up_prio)
6683 if (se->avg.load_avg_ratio < hmp_up_threshold)
6686 /* Let the task load settle before doing another up migration */
6687 /* hack - always use clock from first online CPU */
6688 now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task;
6689 if (((now - se->avg.hmp_last_up_migration) >> 10)
6690 < hmp_next_up_threshold)
6693 /* hmp_domain_min_load only returns 0 for an
6694 * idle CPU or 1023 for any partly-busy one.
6695 * Be explicit about requirement for an idle CPU.
6697 if (hmp_domain_min_load(hmp_faster_domain(cpu), &temp_target_cpu,
6698 tsk_cpus_allowed(p)) == 0 && temp_target_cpu != NR_CPUS) {
6700 *target_cpu = temp_target_cpu;
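/*
 * Illustrative sketch (not part of this file): the settle checks above
 * compare a clock_task delta, shifted right by 10, against the
 * hmp_next_*_threshold tunables. Shifting a nanosecond delta right by 10
 * divides it by 1024, so the thresholds are effectively in units of
 * roughly one microsecond. demo_load_settled() is a hypothetical name.
 */
static inline int demo_load_settled(u64 now_ns, u64 last_ns, u64 threshold)
{
	/* same arithmetic as the hmp_up/down_migration settle checks */
	return ((now_ns - last_ns) >> 10) >= threshold;
}
/* e.g. a 2.048ms delta: (2048000 >> 10) == 2000 units of ~1 usec */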
6706 /* Check if task should migrate to a slower cpu */
6707 static unsigned int hmp_down_migration(int cpu, struct sched_entity *se)
6709 struct task_struct *p = task_of(se);
6712 if (hmp_cpu_is_slowest(cpu)) {
6713 #ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
6714 if (hmp_packing_enabled)
6721 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
6722 /* Filter by task priority */
6723 if ((p->prio >= hmp_up_prio) &&
6724 cpumask_intersects(&hmp_slower_domain(cpu)->cpus,
6725 tsk_cpus_allowed(p))) {
6730 /* Let the task load settle before doing another down migration */
6731 /* hack - always use clock from first online CPU */
6732 now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task;
6733 if (((now - se->avg.hmp_last_down_migration) >> 10)
6734 < hmp_next_down_threshold)
6737 if (cpumask_intersects(&hmp_slower_domain(cpu)->cpus,
6738 tsk_cpus_allowed(p))
6739 && se->avg.load_avg_ratio < hmp_down_threshold) {
6746 * hmp_can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
6747 * Ideally this function should be merged with can_migrate_task() to avoid
 * duplicating code.
6750 static int hmp_can_migrate_task(struct task_struct *p, struct lb_env *env)
6752 int tsk_cache_hot = 0;
6755 * We do not migrate tasks that are:
6756 * 1) running (obviously), or
6757 * 2) cannot be migrated to this CPU due to cpus_allowed
6759 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
6760 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
6763 env->flags &= ~LBF_ALL_PINNED;
6765 if (task_running(env->src_rq, p)) {
6766 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
6771 * Aggressive migration if:
6772 * 1) task is cache cold, or
6773 * 2) too many balance attempts have failed.
6776 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
6777 if (!tsk_cache_hot ||
6778 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
6779 #ifdef CONFIG_SCHEDSTATS
6780 if (tsk_cache_hot) {
6781 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
6782 schedstat_inc(p, se.statistics.nr_forced_migrations);
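/*
 * Sketch of the migration policy above (illustrative, hypothetical name):
 * a task may be moved if it is cache cold, or unconditionally once the
 * domain has failed to balance more often than its cache_nice_tries
 * allowance - repeated failure overrides cache hotness.
 */
static inline int demo_may_migrate(int cache_hot, unsigned int nr_failed,
				   unsigned int cache_nice_tries)
{
	return !cache_hot || nr_failed > cache_nice_tries;
}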
6792 * move_specific_task tries to move a specific task.
6793 * Returns 1 if successful and 0 otherwise.
6794 * Called with both runqueues locked.
6796 static int move_specific_task(struct lb_env *env, struct task_struct *pm)
6798 struct task_struct *p, *n;
6800 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
6801 if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
6805 if (!hmp_can_migrate_task(p, env))
6807 /* Check if we found the right task */
6813 * Right now, this is only the third place move_task()
6814 * is called, so we can safely collect move_task()
6815 * stats here rather than inside move_task().
6817 schedstat_inc(env->sd, lb_gained[env->idle]);
6824 * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to
6825 * migrate a specific task from one runqueue to another.
6826 * hmp_force_up_migration uses this to push a currently running task
 * off a runqueue to a faster CPU.
6828 * Based on active_load_balance_stop_cpu and can potentially be merged.
6830 static int hmp_active_task_migration_cpu_stop(void *data)
6832 struct rq *busiest_rq = data;
6833 struct task_struct *p = busiest_rq->migrate_task;
6834 int busiest_cpu = cpu_of(busiest_rq);
6835 int target_cpu = busiest_rq->push_cpu;
6836 struct rq *target_rq = cpu_rq(target_cpu);
6837 struct sched_domain *sd;
6839 raw_spin_lock_irq(&busiest_rq->lock);
6840 /* make sure the requested cpu hasn't gone down in the meantime */
6841 if (unlikely(busiest_cpu != smp_processor_id() ||
6842 !busiest_rq->active_balance)) {
6845 /* Is there any task to move? */
6846 if (busiest_rq->nr_running <= 1)
6848 /* Task has migrated meanwhile, abort forced migration */
6849 if (task_rq(p) != busiest_rq)
6852 * This condition is "impossible"; if it occurs
6853 * we need to fix it. Originally reported by
6854 * Bjorn Helgaas on a 128-cpu setup.
6856 BUG_ON(busiest_rq == target_rq);
6858 /* move a task from busiest_rq to target_rq */
6859 double_lock_balance(busiest_rq, target_rq);
6861 /* Search for an sd spanning us and the target CPU. */
6863 for_each_domain(target_cpu, sd) {
6864 if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
6869 struct lb_env env = {
6871 .dst_cpu = target_cpu,
6872 .dst_rq = target_rq,
6873 .src_cpu = busiest_rq->cpu,
6874 .src_rq = busiest_rq,
6878 schedstat_inc(sd, alb_count);
6880 if (move_specific_task(&env, p))
6881 schedstat_inc(sd, alb_pushed);
6883 schedstat_inc(sd, alb_failed);
6886 double_unlock_balance(busiest_rq, target_rq);
6889 busiest_rq->active_balance = 0;
6890 raw_spin_unlock_irq(&busiest_rq->lock);
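/*
 * Illustrative sketch (not from this file): double_lock_balance() must
 * take two rq locks without risking an ABBA deadlock, so the second lock
 * is only trylocked, or both are (re)acquired in a fixed order. The
 * stand-alone demo below shows the fixed-order idea using hypothetical
 * spinlock pointers; it is a sketch, not the kernel's implementation.
 */
static void demo_lock_pair(raw_spinlock_t *a, raw_spinlock_t *b)
{
	/* always lock the lower address first so all CPUs agree on order */
	if (a < b) {
		raw_spin_lock(a);
		raw_spin_lock(b);
	} else {
		raw_spin_lock(b);
		raw_spin_lock(a);
	}
}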
6895 * hmp_idle_pull_cpu_stop is run by cpu stopper and used to
6896 * migrate a specific task from one runqueue to another.
6897 * hmp_idle_pull uses this to push a currently running task
6898 * off a runqueue to a faster CPU.
6899 * Locking is slightly different than usual.
6900 * Based on active_load_balance_stop_cpu and can potentially be merged.
6902 static int hmp_idle_pull_cpu_stop(void *data)
6904 struct rq *busiest_rq = data;
6905 struct task_struct *p = busiest_rq->migrate_task;
6906 int busiest_cpu = cpu_of(busiest_rq);
6907 int target_cpu = busiest_rq->push_cpu;
6908 struct rq *target_rq = cpu_rq(target_cpu);
6909 struct sched_domain *sd;
6911 raw_spin_lock_irq(&busiest_rq->lock);
6913 /* make sure the requested cpu hasn't gone down in the meantime */
6914 if (unlikely(busiest_cpu != smp_processor_id() ||
6915 !busiest_rq->active_balance))
6918 /* Is there any task to move? */
6919 if (busiest_rq->nr_running <= 1)
6922 /* Task has migrated meanwhile, abort forced migration */
6923 if (task_rq(p) != busiest_rq)
6927 * This condition is "impossible"; if it occurs
6928 * we need to fix it. Originally reported by
6929 * Bjorn Helgaas on a 128-cpu setup.
6931 BUG_ON(busiest_rq == target_rq);
6933 /* move a task from busiest_rq to target_rq */
6934 double_lock_balance(busiest_rq, target_rq);
6936 /* Search for an sd spanning us and the target CPU. */
6938 for_each_domain(target_cpu, sd) {
6939 if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
6943 struct lb_env env = {
6945 .dst_cpu = target_cpu,
6946 .dst_rq = target_rq,
6947 .src_cpu = busiest_rq->cpu,
6948 .src_rq = busiest_rq,
6952 schedstat_inc(sd, alb_count);
6954 if (move_specific_task(&env, p))
6955 schedstat_inc(sd, alb_pushed);
6957 schedstat_inc(sd, alb_failed);
6960 double_unlock_balance(busiest_rq, target_rq);
6963 busiest_rq->active_balance = 0;
6964 raw_spin_unlock_irq(&busiest_rq->lock);
6969 * Move task in a runnable state to another CPU.
6971 * Tailored on 'active_load_balance_stop_cpu' with slight
6972 * modification to locking and pre-transfer checks. Note
6973 * rq->lock must be held before calling.
6975 static void hmp_migrate_runnable_task(struct rq *rq)
6977 struct sched_domain *sd;
6978 int src_cpu = cpu_of(rq);
6979 struct rq *src_rq = rq;
6980 int dst_cpu = rq->push_cpu;
6981 struct rq *dst_rq = cpu_rq(dst_cpu);
6982 struct task_struct *p = rq->migrate_task;
6984 * One last check to make sure nobody else is playing
6985 * with the source rq.
6987 if (src_rq->active_balance)
6990 if (src_rq->nr_running <= 1)
6993 if (task_rq(p) != src_rq)
6996 * Not sure if this applies here but one can never
 * be too careful.
6999 BUG_ON(src_rq == dst_rq);
7001 double_lock_balance(src_rq, dst_rq);
7004 for_each_domain(dst_cpu, sd) {
7005 if (cpumask_test_cpu(src_cpu, sched_domain_span(sd)))
7010 struct lb_env env = {
7019 schedstat_inc(sd, alb_count);
7021 if (move_specific_task(&env, p))
7022 schedstat_inc(sd, alb_pushed);
7024 schedstat_inc(sd, alb_failed);
7028 double_unlock_balance(src_rq, dst_rq);
7031 static DEFINE_SPINLOCK(hmp_force_migration);
7034 * hmp_force_up_migration checks runqueues for tasks that need to
7035 * be actively migrated to a faster cpu.
7037 static void hmp_force_up_migration(int this_cpu)
7039 int cpu, target_cpu;
7040 struct sched_entity *curr, *orig;
7042 unsigned long flags;
7043 unsigned int force, got_target;
7044 struct task_struct *p;
7046 if (!spin_trylock(&hmp_force_migration))
7048 for_each_online_cpu(cpu) {
7051 target = cpu_rq(cpu);
7052 raw_spin_lock_irqsave(&target->lock, flags);
7053 curr = target->cfs.curr;
7055 raw_spin_unlock_irqrestore(&target->lock, flags);
7058 if (!entity_is_task(curr)) {
7059 struct cfs_rq *cfs_rq;
7061 cfs_rq = group_cfs_rq(curr);
7063 curr = cfs_rq->curr;
7064 cfs_rq = group_cfs_rq(curr);
7068 curr = hmp_get_heaviest_task(curr, 1);
7070 if (hmp_up_migration(cpu, &target_cpu, curr)) {
7071 if (!target->active_balance) {
7073 target->push_cpu = target_cpu;
7074 target->migrate_task = p;
7076 trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_FORCE);
7077 hmp_next_up_delay(&p->se, target->push_cpu);
7080 if (!got_target && !target->active_balance) {
7082 * For now we just check the currently running task.
7083 * Selecting the lightest task for offloading will
7084 * require extensive bookkeeping.
7086 curr = hmp_get_lightest_task(orig, 1);
7088 target->push_cpu = hmp_offload_down(cpu, curr);
7089 if (target->push_cpu < NR_CPUS) {
7091 target->migrate_task = p;
7093 trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_OFFLOAD);
7094 hmp_next_down_delay(&p->se, target->push_cpu);
7098 * We have a target with no active_balance. If the task
7099 * is not currently running, move it; otherwise let the
7100 * CPU stopper take care of it.
7102 if (got_target && !target->active_balance) {
7103 if (!task_running(target, p)) {
7104 trace_sched_hmp_migrate_force_running(p, 0);
7105 hmp_migrate_runnable_task(target);
7107 target->active_balance = 1;
7112 raw_spin_unlock_irqrestore(&target->lock, flags);
7115 stop_one_cpu_nowait(cpu_of(target),
7116 hmp_active_task_migration_cpu_stop,
7117 target, &target->active_balance_work);
7119 spin_unlock(&hmp_force_migration);
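/*
 * Sketch of the guard pattern above (illustrative only): both
 * hmp_force_up_migration() and hmp_idle_pull() take hmp_force_migration
 * with spin_trylock(), so concurrent callers simply skip the scan
 * instead of serializing behind it. demo_try_scan() is a hypothetical
 * name for illustration.
 */
static void demo_try_scan(spinlock_t *guard)
{
	if (!spin_trylock(guard))
		return;		/* someone else is already scanning */
	/* ... scan runqueues and pick migration targets ... */
	spin_unlock(guard);
}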
7122 * hmp_idle_pull looks at little domain runqueues to see
7123 * if a task should be pulled.
7125 * Reuses hmp_force_migration spinlock.
7128 static unsigned int hmp_idle_pull(int this_cpu)
7131 struct sched_entity *curr, *orig;
7132 struct hmp_domain *hmp_domain = NULL;
7133 struct rq *target = NULL, *rq;
7134 unsigned long flags, ratio = 0;
7135 unsigned int force = 0;
7136 struct task_struct *p = NULL;
7138 if (!hmp_cpu_is_slowest(this_cpu))
7139 hmp_domain = hmp_slower_domain(this_cpu);
7143 if (!spin_trylock(&hmp_force_migration))
7146 /* first select a task */
7147 for_each_cpu(cpu, &hmp_domain->cpus) {
7149 raw_spin_lock_irqsave(&rq->lock, flags);
7150 curr = rq->cfs.curr;
7152 raw_spin_unlock_irqrestore(&rq->lock, flags);
7155 if (!entity_is_task(curr)) {
7156 struct cfs_rq *cfs_rq;
7158 cfs_rq = group_cfs_rq(curr);
7160 curr = cfs_rq->curr;
7161 if (!entity_is_task(curr))
7162 cfs_rq = group_cfs_rq(curr);
7168 curr = hmp_get_heaviest_task(curr, 1);
7169 if (curr->avg.load_avg_ratio > hmp_up_threshold &&
7170 curr->avg.load_avg_ratio > ratio) {
7173 ratio = curr->avg.load_avg_ratio;
7175 raw_spin_unlock_irqrestore(&rq->lock, flags);
7181 /* now we have a candidate */
7182 raw_spin_lock_irqsave(&target->lock, flags);
7183 if (!target->active_balance && task_rq(p) == target) {
7185 target->push_cpu = this_cpu;
7186 target->migrate_task = p;
7187 trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_IDLE_PULL);
7188 hmp_next_up_delay(&p->se, target->push_cpu);
7190 * If the task isn't running, move it right away.
7191 * Otherwise set up the active_balance mechanism and let
7192 * the CPU stopper do its job.
7194 if (!task_running(target, p)) {
7195 trace_sched_hmp_migrate_idle_running(p, 0);
7196 hmp_migrate_runnable_task(target);
7198 target->active_balance = 1;
7202 raw_spin_unlock_irqrestore(&target->lock, flags);
7205 stop_one_cpu_nowait(cpu_of(target),
7206 hmp_idle_pull_cpu_stop,
7207 target, &target->active_balance_work);
7210 spin_unlock(&hmp_force_migration);
7214 static void hmp_force_up_migration(int this_cpu) { }
7215 #endif /* CONFIG_SCHED_HMP */
7218 * run_rebalance_domains is triggered when needed from the scheduler tick.
7219 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
7221 static void run_rebalance_domains(struct softirq_action *h)
7223 int this_cpu = smp_processor_id();
7224 struct rq *this_rq = cpu_rq(this_cpu);
7225 enum cpu_idle_type idle = this_rq->idle_balance ?
7226 CPU_IDLE : CPU_NOT_IDLE;
7228 hmp_force_up_migration(this_cpu);
7230 rebalance_domains(this_cpu, idle);
7233 * If this cpu has a pending nohz_balance_kick, then do the
7234 * balancing on behalf of the other idle cpus whose ticks are
 * stopped.
7237 nohz_idle_balance(this_cpu, idle);
7240 static inline int on_null_domain(int cpu)
7242 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
7246 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
7248 void trigger_load_balance(struct rq *rq, int cpu)
7250 /* Don't need to rebalance while attached to NULL domain */
7251 if (time_after_eq(jiffies, rq->next_balance) &&
7252 likely(!on_null_domain(cpu)))
7253 raise_softirq(SCHED_SOFTIRQ);
7254 #ifdef CONFIG_NO_HZ_COMMON
7255 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
7256 nohz_balancer_kick(cpu);
7260 static void rq_online_fair(struct rq *rq)
7262 #ifdef CONFIG_SCHED_HMP
7263 hmp_online_cpu(rq->cpu);
7268 static void rq_offline_fair(struct rq *rq)
7270 #ifdef CONFIG_SCHED_HMP
7271 hmp_offline_cpu(rq->cpu);
7275 /* Ensure any throttled groups are reachable by pick_next_task */
7276 unthrottle_offline_cfs_rqs(rq);
7279 #endif /* CONFIG_SMP */
7282 * scheduler tick hitting a task of our scheduling class:
7284 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
7286 struct cfs_rq *cfs_rq;
7287 struct sched_entity *se = &curr->se;
7289 for_each_sched_entity(se) {
7290 cfs_rq = cfs_rq_of(se);
7291 entity_tick(cfs_rq, se, queued);
7294 if (sched_feat_numa(NUMA))
7295 task_tick_numa(rq, curr);
7297 update_rq_runnable_avg(rq, 1);
7301 * called on fork with the child task as argument from the parent's context
7302 * - child not yet on the tasklist
7303 * - preemption disabled
7305 static void task_fork_fair(struct task_struct *p)
7307 struct cfs_rq *cfs_rq;
7308 struct sched_entity *se = &p->se, *curr;
7309 int this_cpu = smp_processor_id();
7310 struct rq *rq = this_rq();
7311 unsigned long flags;
7313 raw_spin_lock_irqsave(&rq->lock, flags);
7315 update_rq_clock(rq);
7317 cfs_rq = task_cfs_rq(current);
7318 curr = cfs_rq->curr;
7321 * Not only the cpu but also the task_group of the parent might have
7322 * been changed after parent->se.parent,cfs_rq were copied to
7323 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
7324 * of child point to valid ones.
7327 __set_task_cpu(p, this_cpu);
7330 update_curr(cfs_rq);
7333 se->vruntime = curr->vruntime;
7334 place_entity(cfs_rq, se, 1);
7336 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
7338 * Upon rescheduling, sched_class::put_prev_task() will place
7339 * 'current' within the tree based on its new key value.
7341 swap(curr->vruntime, se->vruntime);
7342 resched_task(rq->curr);
7345 se->vruntime -= cfs_rq->min_vruntime;
7347 raw_spin_unlock_irqrestore(&rq->lock, flags);
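/*
 * Worked example (illustrative, not part of this file): task_fork_fair()
 * stores the child's vruntime relative to its cfs_rq's min_vruntime, so
 * the value stays meaningful if the child is placed on a different CPU;
 * enqueue_entity() later adds the destination's min_vruntime back in.
 * demo_normalized_vruntime() is a hypothetical name.
 */
static u64 demo_normalized_vruntime(u64 vruntime, u64 src_min, u64 dst_min)
{
	u64 rel = vruntime - src_min;	/* what task_fork_fair() leaves behind */
	return rel + dst_min;		/* what enqueue on the new rq restores */
}
/* e.g. vruntime 105ms on a rq with min 100ms lands 5ms past any dst_min */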
7351 * Priority of the task has changed. Check to see if we preempt
 * the current task.
7355 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
7361 * Reschedule if we are currently running on this runqueue and
7362 * our priority decreased, or if we are not currently running on
7363 * this runqueue and our priority is higher than the current's
7365 if (rq->curr == p) {
7366 if (p->prio > oldprio)
7367 resched_task(rq->curr);
7369 check_preempt_curr(rq, p, 0);
7372 static void switched_from_fair(struct rq *rq, struct task_struct *p)
7374 struct sched_entity *se = &p->se;
7375 struct cfs_rq *cfs_rq = cfs_rq_of(se);
7378 * Ensure the task's vruntime is normalized, so that when it's
7379 * switched back to the fair class the enqueue_entity(.flags=0) will
7380 * do the right thing.
7382 * If it was on_rq, then the dequeue_entity(.flags=0) will already
7383 * have normalized the vruntime, if it was !on_rq, then only when
7384 * the task is sleeping will it still have non-normalized vruntime.
7386 if (!se->on_rq && p->state != TASK_RUNNING) {
7388 * Fix up our vruntime so that the current sleep doesn't
7389 * cause 'unlimited' sleep bonus.
7391 place_entity(cfs_rq, se, 0);
7392 se->vruntime -= cfs_rq->min_vruntime;
7395 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
7397 * Remove our load from contribution when we leave sched_fair
7398 * and ensure we don't carry in an old decay_count if we
 * switch back.
7401 if (p->se.avg.decay_count) {
7402 struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
7403 __synchronize_entity_decay(&p->se);
7404 subtract_blocked_load_contrib(cfs_rq,
7405 p->se.avg.load_avg_contrib);
7411 * We switched to the sched_fair class.
7413 static void switched_to_fair(struct rq *rq, struct task_struct *p)
7419 * We were most likely switched from sched_rt, so
7420 * kick off the schedule if running, otherwise just see
7421 * if we can still preempt the current task.
7424 resched_task(rq->curr);
7426 check_preempt_curr(rq, p, 0);
7429 /* Account for a task changing its policy or group.
7431 * This routine is mostly called to set cfs_rq->curr field when a task
7432 * migrates between groups/classes.
7434 static void set_curr_task_fair(struct rq *rq)
7436 struct sched_entity *se = &rq->curr->se;
7438 for_each_sched_entity(se) {
7439 struct cfs_rq *cfs_rq = cfs_rq_of(se);
7441 set_next_entity(cfs_rq, se);
7442 /* ensure bandwidth has been allocated on our new cfs_rq */
7443 account_cfs_rq_runtime(cfs_rq, 0);
7447 void init_cfs_rq(struct cfs_rq *cfs_rq)
7449 cfs_rq->tasks_timeline = RB_ROOT;
7450 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7451 #ifndef CONFIG_64BIT
7452 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
7454 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
7455 atomic64_set(&cfs_rq->decay_counter, 1);
7456 atomic64_set(&cfs_rq->removed_load, 0);
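/*
 * Illustrative note (not part of this file): min_vruntime starts at
 * (u64)(-(1LL << 20)), i.e. about 1ms worth of nanoseconds before u64
 * wraparound, so the wrap happens soon after boot and any comparison
 * not using signed deltas breaks early. The sketch below shows the
 * wrap-safe comparison; demo_vruntime_before() is a hypothetical name.
 */
static inline int demo_vruntime_before(u64 a, u64 b)
{
	/* same idea as entity_before(): signed difference survives wrap */
	return (s64)(a - b) < 0;
}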
7460 #ifdef CONFIG_FAIR_GROUP_SCHED
7461 static void task_move_group_fair(struct task_struct *p, int on_rq)
7463 struct cfs_rq *cfs_rq;
7465 * If the task was not on the rq at the time of this cgroup movement
7466 * it must have been asleep, sleeping tasks keep their ->vruntime
7467 * absolute on their old rq until wakeup (needed for the fair sleeper
7468 * bonus in place_entity()).
7470 * If it was on the rq, we've just 'preempted' it, which does convert
7471 * ->vruntime to a relative base.
7473 * Make sure both cases convert their relative position when migrating
7474 * to another cgroup's rq. This does somewhat interfere with the
7475 * fair sleeper stuff for the first placement, but who cares.
7478 * When !on_rq, vruntime of the task has usually NOT been normalized.
7479 * But there are some cases where it has already been normalized:
7481 * - Moving a forked child which is waiting for being woken up by
7482 * wake_up_new_task().
7483 * - Moving a task which has been woken up by try_to_wake_up() and
7484 * waiting for actually being woken up by sched_ttwu_pending().
7486 * To prevent boost or penalty in the new cfs_rq caused by delta
7487 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
7489 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
7493 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
7494 set_task_rq(p, task_cpu(p));
7496 cfs_rq = cfs_rq_of(&p->se);
7497 p->se.vruntime += cfs_rq->min_vruntime;
7500 * migrate_task_rq_fair() will have removed our previous
7501 * contribution, but we must synchronize for ongoing future
7504 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
7505 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
7510 void free_fair_sched_group(struct task_group *tg)
7514 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
7516 for_each_possible_cpu(i) {
7518 kfree(tg->cfs_rq[i]);
7527 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7529 struct cfs_rq *cfs_rq;
7530 struct sched_entity *se;
7533 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
7536 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
7540 tg->shares = NICE_0_LOAD;
7542 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
7544 for_each_possible_cpu(i) {
7545 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
7546 GFP_KERNEL, cpu_to_node(i));
7550 se = kzalloc_node(sizeof(struct sched_entity),
7551 GFP_KERNEL, cpu_to_node(i));
7555 init_cfs_rq(cfs_rq);
7556 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
7567 void unregister_fair_sched_group(struct task_group *tg, int cpu)
7569 struct rq *rq = cpu_rq(cpu);
7570 unsigned long flags;
7573 * Only empty task groups can be destroyed, so we can speculatively
7574 * check on_list without danger of it being re-added.
7576 if (!tg->cfs_rq[cpu]->on_list)
7579 raw_spin_lock_irqsave(&rq->lock, flags);
7580 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
7581 raw_spin_unlock_irqrestore(&rq->lock, flags);
7584 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7585 struct sched_entity *se, int cpu,
7586 struct sched_entity *parent)
7588 struct rq *rq = cpu_rq(cpu);
7592 init_cfs_rq_runtime(cfs_rq);
7594 tg->cfs_rq[cpu] = cfs_rq;
7597 /* se could be NULL for root_task_group */
7602 se->cfs_rq = &rq->cfs;
7604 se->cfs_rq = parent->my_q;
7607 update_load_set(&se->load, 0);
7608 se->parent = parent;
7611 static DEFINE_MUTEX(shares_mutex);
7613 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7616 unsigned long flags;
7619 * We can't change the weight of the root cgroup.
7624 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
7626 mutex_lock(&shares_mutex);
7627 if (tg->shares == shares)
7630 tg->shares = shares;
7631 for_each_possible_cpu(i) {
7632 struct rq *rq = cpu_rq(i);
7633 struct sched_entity *se;
7636 /* Propagate contribution to hierarchy */
7637 raw_spin_lock_irqsave(&rq->lock, flags);
7638 for_each_sched_entity(se)
7639 update_cfs_shares(group_cfs_rq(se));
7640 raw_spin_unlock_irqrestore(&rq->lock, flags);
7644 mutex_unlock(&shares_mutex);
7647 #else /* CONFIG_FAIR_GROUP_SCHED */
7649 void free_fair_sched_group(struct task_group *tg) { }
7651 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7656 void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
7658 #endif /* CONFIG_FAIR_GROUP_SCHED */
7661 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
7663 struct sched_entity *se = &task->se;
7664 unsigned int rr_interval = 0;
7667 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
 * idle runqueue:
7670 if (rq->cfs.load.weight)
7671 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
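/*
 * Worked example (illustrative): sched_slice() hands each entity a share
 * of the scheduling period proportional to its load weight, roughly
 * period * se->load.weight / cfs_rq->load.weight. demo_slice_ns() is a
 * hypothetical simplification of that arithmetic.
 */
static u64 demo_slice_ns(u64 period_ns, unsigned long se_weight,
			 unsigned long rq_weight)
{
	return period_ns * se_weight / rq_weight;
}
/*
 * Two nice-0 tasks (weight 1024 each) in a 6ms period:
 * demo_slice_ns(6000000, 1024, 2048) == 3000000ns, i.e. 3ms apiece.
 */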
7677 * All the scheduling class methods:
7679 const struct sched_class fair_sched_class = {
7680 .next = &idle_sched_class,
7681 .enqueue_task = enqueue_task_fair,
7682 .dequeue_task = dequeue_task_fair,
7683 .yield_task = yield_task_fair,
7684 .yield_to_task = yield_to_task_fair,
7686 .check_preempt_curr = check_preempt_wakeup,
7688 .pick_next_task = pick_next_task_fair,
7689 .put_prev_task = put_prev_task_fair,
7692 .select_task_rq = select_task_rq_fair,
7693 #ifdef CONFIG_FAIR_GROUP_SCHED
7694 .migrate_task_rq = migrate_task_rq_fair,
7696 .rq_online = rq_online_fair,
7697 .rq_offline = rq_offline_fair,
7699 .task_waking = task_waking_fair,
7702 .set_curr_task = set_curr_task_fair,
7703 .task_tick = task_tick_fair,
7704 .task_fork = task_fork_fair,
7706 .prio_changed = prio_changed_fair,
7707 .switched_from = switched_from_fair,
7708 .switched_to = switched_to_fair,
7710 .get_rr_interval = get_rr_interval_fair,
7712 #ifdef CONFIG_FAIR_GROUP_SCHED
7713 .task_move_group = task_move_group_fair,
7717 #ifdef CONFIG_SCHED_DEBUG
7718 void print_cfs_stats(struct seq_file *m, int cpu)
7720 struct cfs_rq *cfs_rq;
7723 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
7724 print_cfs_rq(m, cpu, cfs_rq);
7729 __init void init_sched_fair_class(void)
7732 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
7734 #ifdef CONFIG_NO_HZ_COMMON
7735 nohz.next_balance = jiffies;
7736 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
7737 cpu_notifier(sched_ilb_notifier, 0);
7740 #ifdef CONFIG_SCHED_HMP
7741 hmp_cpu_mask_setup();
7747 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
7748 static u32 cpufreq_calc_scale(u32 min, u32 max, u32 curr)
7750 u32 result = curr / max;
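/*
 * Worked example (illustrative, hypothetical name): the policy notifier
 * below stores extents->max as policy->max >> SCHED_FREQSCALE_SHIFT, so
 * the plain division above is effectively (curr << 10) / policy->max,
 * i.e. the current frequency expressed in 1/1024ths of the maximum.
 */
static u32 demo_freq_scale(u32 policy_max_khz, u32 curr_khz)
{
	u32 max = policy_max_khz >> 10;	/* mirrors SCHED_FREQSCALE_SHIFT */
	return curr_khz / max;		/* scale in 1/1024 units */
}
/* demo_freq_scale(2000000, 1000000) == 512, i.e. half of full scale */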
7754 /* Called when the CPU Frequency is changed.
7755 * Once for each CPU.
7757 static int cpufreq_callback(struct notifier_block *nb,
7758 unsigned long val, void *data)
7760 struct cpufreq_freqs *freq = data;
7761 int cpu = freq->cpu;
7762 struct cpufreq_extents *extents;
7764 if (freq->flags & CPUFREQ_CONST_LOOPS)
7767 if (val != CPUFREQ_POSTCHANGE)
7770 /* if dynamic load scale is disabled, set the load scale to 1.0 */
7771 if (!hmp_data.freqinvar_load_scale_enabled) {
7772 freq_scale[cpu].curr_scale = 1024;
7776 extents = &freq_scale[cpu];
7777 if (extents->flags & SCHED_LOAD_FREQINVAR_SINGLEFREQ) {
7778 /* If our governor was recognised as a single-freq governor,
 * use 1.0 (a scale of 1024). */
7781 extents->curr_scale = 1024;
7783 extents->curr_scale = cpufreq_calc_scale(extents->min,
7784 extents->max, freq->new);
7790 /* Called when the CPUFreq governor is changed.
7791 * Only called for the CPUs which are actually changed by the
 * governor.
7794 static int cpufreq_policy_callback(struct notifier_block *nb,
7795 unsigned long event, void *data)
7797 struct cpufreq_policy *policy = data;
7798 struct cpufreq_extents *extents;
7799 int cpu, singleFreq = 0;
7800 static const char performance_governor[] = "performance";
7801 static const char powersave_governor[] = "powersave";
7803 if (event == CPUFREQ_START)
7806 if (event != CPUFREQ_INCOMPATIBLE)
7809 /* CPUFreq governors do not accurately report the range of
7810 * CPU Frequencies they will choose from.
7811 * We recognise performance and powersave governors as
7812 * single-frequency only.
7814 if (!strncmp(policy->governor->name, performance_governor,
7815 strlen(performance_governor)) ||
7816 !strncmp(policy->governor->name, powersave_governor,
7817 strlen(powersave_governor)))
7820 /* Make sure that all CPUs impacted by this policy are
7821 * updated since we will only get a notification when the
7822 * user explicitly changes the policy on a CPU.
7824 for_each_cpu(cpu, policy->cpus) {
7825 extents = &freq_scale[cpu];
7826 extents->max = policy->max >> SCHED_FREQSCALE_SHIFT;
7827 extents->min = policy->min >> SCHED_FREQSCALE_SHIFT;
7828 if (!hmp_data.freqinvar_load_scale_enabled) {
7829 extents->curr_scale = 1024;
7830 } else if (singleFreq) {
7831 extents->flags |= SCHED_LOAD_FREQINVAR_SINGLEFREQ;
7832 extents->curr_scale = 1024;
7834 extents->flags &= ~SCHED_LOAD_FREQINVAR_SINGLEFREQ;
7835 extents->curr_scale = cpufreq_calc_scale(extents->min,
7836 extents->max, policy->cur);
7843 static struct notifier_block cpufreq_notifier = {
7844 .notifier_call = cpufreq_callback,
7846 static struct notifier_block cpufreq_policy_notifier = {
7847 .notifier_call = cpufreq_policy_callback,
7850 static int __init register_sched_cpufreq_notifier(void)
7854 /* init safe defaults since there are no policies at registration */
7855 for (ret = 0; ret < CONFIG_NR_CPUS; ret++) {
7857 freq_scale[ret].max = 1024;
7858 freq_scale[ret].min = 1024;
7859 freq_scale[ret].curr_scale = 1024;
7862 pr_info("sched: registering cpufreq notifiers for scale-invariant loads\n");
7863 ret = cpufreq_register_notifier(&cpufreq_policy_notifier,
7864 CPUFREQ_POLICY_NOTIFIER);
7867 ret = cpufreq_register_notifier(&cpufreq_notifier,
7868 CPUFREQ_TRANSITION_NOTIFIER);
7873 core_initcall(register_sched_cpufreq_notifier);
7874 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */