4 * Kernel scheduler and related syscalls
6 * Copyright (C) 1991-2002 Linus Torvalds
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin
22 #include <linux/module.h>
23 #include <linux/nmi.h>
24 #include <linux/init.h>
25 #include <linux/uaccess.h>
26 #include <linux/highmem.h>
27 #include <linux/smp_lock.h>
28 #include <asm/mmu_context.h>
29 #include <linux/interrupt.h>
30 #include <linux/capability.h>
31 #include <linux/completion.h>
32 #include <linux/kernel_stat.h>
33 #include <linux/debug_locks.h>
34 #include <linux/security.h>
35 #include <linux/notifier.h>
36 #include <linux/profile.h>
37 #include <linux/freezer.h>
38 #include <linux/vmalloc.h>
39 #include <linux/blkdev.h>
40 #include <linux/delay.h>
41 #include <linux/smp.h>
42 #include <linux/threads.h>
43 #include <linux/timer.h>
44 #include <linux/rcupdate.h>
45 #include <linux/cpu.h>
46 #include <linux/cpuset.h>
47 #include <linux/percpu.h>
48 #include <linux/kthread.h>
49 #include <linux/seq_file.h>
50 #include <linux/syscalls.h>
51 #include <linux/times.h>
52 #include <linux/tsacct_kern.h>
53 #include <linux/kprobes.h>
54 #include <linux/delayacct.h>
55 #include <linux/reciprocal_div.h>
56 #include <linux/unistd.h>
61 * Scheduler clock - returns current time in nanosec units.
62 * This is default implementation.
63 * Architectures and sub-architectures can override this.
65 unsigned long long __attribute__((weak)) sched_clock(void)
67 return (unsigned long long)jiffies * (1000000000 / HZ);
71 * Convert user-nice values [ -20 ... 0 ... 19 ]
72 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
75 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
76 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
77 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
80 * 'User priority' is the nice value converted to something we
81 * can work with better when scaling various scheduler parameters,
82 * it's a [ 0 ... 39 ] range.
84 #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
85 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
86 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
89 * Some helpers for converting nanosecond timing to jiffy resolution
91 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
92 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
94 #define NICE_0_LOAD SCHED_LOAD_SCALE
95 #define NICE_0_SHIFT SCHED_LOAD_SHIFT
98 * These are the 'tuning knobs' of the scheduler:
100 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
101 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
102 * Timeslices get refilled after they expire.
104 #define MIN_TIMESLICE max(5 * HZ / 1000, 1)
105 #define DEF_TIMESLICE (100 * HZ / 1000)
109 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
110 * Since cpu_power is a 'constant', we can use a reciprocal divide.
112 static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
114 return reciprocal_divide(load, sg->reciprocal_cpu_power);
118 * Each time a sched group cpu_power is changed,
119 * we must compute its reciprocal value
121 static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
123 sg->__cpu_power += val;
124 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
128 #define SCALE_PRIO(x, prio) \
129 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
132 * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
133 * to time slice values: [800ms ... 100ms ... 5ms]
135 static unsigned int static_prio_timeslice(int static_prio)
137 if (static_prio == NICE_TO_PRIO(19))
140 if (static_prio < NICE_TO_PRIO(0))
141 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
143 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
146 static inline int rt_policy(int policy)
148 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
153 static inline int task_has_rt_policy(struct task_struct *p)
155 return rt_policy(p->policy);
159 * This is the priority-queue data structure of the RT scheduling class:
161 struct rt_prio_array {
162 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
163 struct list_head queue[MAX_RT_PRIO];
167 struct load_weight load;
168 u64 load_update_start, load_update_last;
169 unsigned long delta_fair, delta_exec, delta_stat;
172 /* CFS-related fields in a runqueue */
174 struct load_weight load;
175 unsigned long nr_running;
181 unsigned long wait_runtime_overruns, wait_runtime_underruns;
183 struct rb_root tasks_timeline;
184 struct rb_node *rb_leftmost;
185 struct rb_node *rb_load_balance_curr;
186 #ifdef CONFIG_FAIR_GROUP_SCHED
187 /* 'curr' points to currently running entity on this cfs_rq.
188 * It is set to NULL otherwise (i.e when none are currently running).
190 struct sched_entity *curr;
191 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
193 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
194 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
195 * (like users, containers etc.)
197 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
198 * list is used during load balance.
200 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
204 /* Real-Time classes' related field in a runqueue: */
206 struct rt_prio_array active;
207 int rt_load_balance_idx;
208 struct list_head *rt_load_balance_head, *rt_load_balance_curr;
212 * This is the main, per-CPU runqueue data structure.
214 * Locking rule: those places that want to lock multiple runqueues
215 * (such as the load balancing or the thread migration code), lock
216 * acquire operations must be ordered by ascending &runqueue.
219 spinlock_t lock; /* runqueue lock */
222 * nr_running and cpu_load should be in the same cacheline because
223 * remote CPUs use both these fields when doing load calculation.
225 unsigned long nr_running;
226 #define CPU_LOAD_IDX_MAX 5
227 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
228 unsigned char idle_at_tick;
230 unsigned char in_nohz_recently;
232 struct load_stat ls; /* capture load from *all* tasks on this cpu */
233 unsigned long nr_load_updates;
237 #ifdef CONFIG_FAIR_GROUP_SCHED
238 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
243 * This is part of a global counter where only the total sum
244 * over all CPUs matters. A task can increase this counter on
245 * one CPU and if it got migrated afterwards it may decrease
246 * it on another CPU. Always updated under the runqueue lock:
248 unsigned long nr_uninterruptible;
250 struct task_struct *curr, *idle;
251 unsigned long next_balance;
252 struct mm_struct *prev_mm;
254 u64 clock, prev_clock_raw;
257 unsigned int clock_warps, clock_overflows;
258 unsigned int clock_unstable_events;
260 struct sched_class *load_balance_class;
265 struct sched_domain *sd;
267 /* For active balancing */
270 int cpu; /* cpu of this runqueue */
272 struct task_struct *migration_thread;
273 struct list_head migration_queue;
276 #ifdef CONFIG_SCHEDSTATS
278 struct sched_info rq_sched_info;
280 /* sys_sched_yield() stats */
281 unsigned long yld_exp_empty;
282 unsigned long yld_act_empty;
283 unsigned long yld_both_empty;
284 unsigned long yld_cnt;
286 /* schedule() stats */
287 unsigned long sched_switch;
288 unsigned long sched_cnt;
289 unsigned long sched_goidle;
291 /* try_to_wake_up() stats */
292 unsigned long ttwu_cnt;
293 unsigned long ttwu_local;
295 struct lock_class_key rq_lock_key;
298 static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
299 static DEFINE_MUTEX(sched_hotcpu_mutex);
301 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
303 rq->curr->sched_class->check_preempt_curr(rq, p);
306 static inline int cpu_of(struct rq *rq)
316 * Per-runqueue clock, as finegrained as the platform can give us:
318 static unsigned long long __rq_clock(struct rq *rq)
320 u64 prev_raw = rq->prev_clock_raw;
321 u64 now = sched_clock();
322 s64 delta = now - prev_raw;
323 u64 clock = rq->clock;
326 * Protect against sched_clock() occasionally going backwards:
328 if (unlikely(delta < 0)) {
333 * Catch too large forward jumps too:
335 if (unlikely(delta > 2*TICK_NSEC)) {
337 rq->clock_overflows++;
339 if (unlikely(delta > rq->clock_max_delta))
340 rq->clock_max_delta = delta;
345 rq->prev_clock_raw = now;
351 static inline unsigned long long rq_clock(struct rq *rq)
353 int this_cpu = smp_processor_id();
355 if (this_cpu == cpu_of(rq))
356 return __rq_clock(rq);
362 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
363 * See detach_destroy_domains: synchronize_sched for details.
365 * The domain tree of any CPU may only be accessed from within
366 * preempt-disabled sections.
368 #define for_each_domain(cpu, __sd) \
369 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
371 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
372 #define this_rq() (&__get_cpu_var(runqueues))
373 #define task_rq(p) cpu_rq(task_cpu(p))
374 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
376 #ifdef CONFIG_FAIR_GROUP_SCHED
377 /* Change a task's ->cfs_rq if it moves across CPUs */
378 static inline void set_task_cfs_rq(struct task_struct *p)
380 p->se.cfs_rq = &task_rq(p)->cfs;
383 static inline void set_task_cfs_rq(struct task_struct *p)
388 #ifndef prepare_arch_switch
389 # define prepare_arch_switch(next) do { } while (0)
391 #ifndef finish_arch_switch
392 # define finish_arch_switch(prev) do { } while (0)
395 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
396 static inline int task_running(struct rq *rq, struct task_struct *p)
398 return rq->curr == p;
401 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
405 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
407 #ifdef CONFIG_DEBUG_SPINLOCK
408 /* this is a valid case when another task releases the spinlock */
409 rq->lock.owner = current;
412 * If we are tracking spinlock dependencies then we have to
413 * fix up the runqueue lock - which gets 'carried over' from
416 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
418 spin_unlock_irq(&rq->lock);
421 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
422 static inline int task_running(struct rq *rq, struct task_struct *p)
427 return rq->curr == p;
431 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
435 * We can optimise this out completely for !SMP, because the
436 * SMP rebalancing from interrupt is the only thing that cares
441 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
442 spin_unlock_irq(&rq->lock);
444 spin_unlock(&rq->lock);
448 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
452 * After ->oncpu is cleared, the task can be moved to a different CPU.
453 * We must ensure this doesn't happen until the switch is completely
459 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
463 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
466 * __task_rq_lock - lock the runqueue a given task resides on.
467 * Must be called interrupts disabled.
469 static inline struct rq *__task_rq_lock(struct task_struct *p)
476 spin_lock(&rq->lock);
477 if (unlikely(rq != task_rq(p))) {
478 spin_unlock(&rq->lock);
479 goto repeat_lock_task;
485 * task_rq_lock - lock the runqueue a given task resides on and disable
486 * interrupts. Note the ordering: we can safely lookup the task_rq without
487 * explicitly disabling preemption.
489 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
495 local_irq_save(*flags);
497 spin_lock(&rq->lock);
498 if (unlikely(rq != task_rq(p))) {
499 spin_unlock_irqrestore(&rq->lock, *flags);
500 goto repeat_lock_task;
505 static inline void __task_rq_unlock(struct rq *rq)
508 spin_unlock(&rq->lock);
511 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
514 spin_unlock_irqrestore(&rq->lock, *flags);
518 * this_rq_lock - lock this runqueue and disable interrupts.
520 static inline struct rq *this_rq_lock(void)
527 spin_lock(&rq->lock);
533 * CPU frequency is/was unstable - start new by setting prev_clock_raw:
535 void sched_clock_unstable_event(void)
540 rq = task_rq_lock(current, &flags);
541 rq->prev_clock_raw = sched_clock();
542 rq->clock_unstable_events++;
543 task_rq_unlock(rq, &flags);
547 * resched_task - mark a task 'to be rescheduled now'.
549 * On UP this means the setting of the need_resched flag, on SMP it
550 * might also involve a cross-CPU call to trigger the scheduler on
555 #ifndef tsk_is_polling
556 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
559 static void resched_task(struct task_struct *p)
563 assert_spin_locked(&task_rq(p)->lock);
565 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
568 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
571 if (cpu == smp_processor_id())
574 /* NEED_RESCHED must be visible before we test polling */
576 if (!tsk_is_polling(p))
577 smp_send_reschedule(cpu);
580 static void resched_cpu(int cpu)
582 struct rq *rq = cpu_rq(cpu);
585 if (!spin_trylock_irqsave(&rq->lock, flags))
587 resched_task(cpu_curr(cpu));
588 spin_unlock_irqrestore(&rq->lock, flags);
591 static inline void resched_task(struct task_struct *p)
593 assert_spin_locked(&task_rq(p)->lock);
594 set_tsk_need_resched(p);
598 static u64 div64_likely32(u64 divident, unsigned long divisor)
600 #if BITS_PER_LONG == 32
601 if (likely(divident <= 0xffffffffULL))
602 return (u32)divident / divisor;
603 do_div(divident, divisor);
607 return divident / divisor;
611 #if BITS_PER_LONG == 32
612 # define WMULT_CONST (~0UL)
614 # define WMULT_CONST (1UL << 32)
617 #define WMULT_SHIFT 32
619 static inline unsigned long
620 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
621 struct load_weight *lw)
625 if (unlikely(!lw->inv_weight))
626 lw->inv_weight = WMULT_CONST / lw->weight;
628 tmp = (u64)delta_exec * weight;
630 * Check whether we'd overflow the 64-bit multiplication:
632 if (unlikely(tmp > WMULT_CONST)) {
633 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
636 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
639 return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
642 static inline unsigned long
643 calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
645 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
648 static void update_load_add(struct load_weight *lw, unsigned long inc)
654 static void update_load_sub(struct load_weight *lw, unsigned long dec)
660 static void __update_curr_load(struct rq *rq, struct load_stat *ls)
662 if (rq->curr != rq->idle && ls->load.weight) {
663 ls->delta_exec += ls->delta_stat;
664 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
670 * Update delta_exec, delta_fair fields for rq.
672 * delta_fair clock advances at a rate inversely proportional to
673 * total load (rq->ls.load.weight) on the runqueue, while
674 * delta_exec advances at the same rate as wall-clock (provided
677 * delta_exec / delta_fair is a measure of the (smoothed) load on this
678 * runqueue over any given interval. This (smoothed) load is used
679 * during load balance.
681 * This function is called /before/ updating rq->ls.load
682 * and when switching tasks.
684 static void update_curr_load(struct rq *rq, u64 now)
686 struct load_stat *ls = &rq->ls;
689 start = ls->load_update_start;
690 ls->load_update_start = now;
691 ls->delta_stat += now - start;
693 * Stagger updates to ls->delta_fair. Very frequent updates
696 if (ls->delta_stat >= sysctl_sched_stat_granularity)
697 __update_curr_load(rq, ls);
701 * To aid in avoiding the subversion of "niceness" due to uneven distribution
702 * of tasks with abnormal "nice" values across CPUs the contribution that
703 * each task makes to its run queue's load is weighted according to its
704 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
705 * scaled version of the new time slice allocation that they receive on time
710 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
711 * If static_prio_timeslice() is ever changed to break this assumption then
712 * this code will need modification
714 #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
715 #define load_weight(lp) \
716 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
717 #define PRIO_TO_LOAD_WEIGHT(prio) \
718 load_weight(static_prio_timeslice(prio))
719 #define RTPRIO_TO_LOAD_WEIGHT(rp) \
720 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
/*
 * Load weight and matching inverse weight given to SCHED_IDLE tasks.
 *
 * WMULT_IDLEPRIO is 2^32 / WEIGHT_IDLEPRIO == 2^31.  The constant has to
 * be unsigned: 2^31 is not representable in a 32-bit signed int, so
 * (1 << 31) overflows (undefined behaviour) and in practice produces a
 * negative value that sign-extends when stored in a wider unsigned field.
 */
#define WEIGHT_IDLEPRIO		2
#define WMULT_IDLEPRIO		(1U << 31)
726 * Nice levels are multiplicative, with a gentle 10% change for every
727 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
728 * nice 1, it will get ~10% less CPU time than another CPU-bound task
729 * that remained on nice 0.
731 * The "10% effect" is relative and cumulative: from _any_ nice level,
732 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
733 * it's +10% CPU usage.
735 static const int prio_to_weight[40] = {
736 /* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
737 /* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280,
738 /* 0 */ NICE_0_LOAD /* 1024 */,
739 /* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137,
740 /* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
/*
 * Precalculated inverse (2^32 / x) of every prio_to_weight[] entry, so
 * that a division by a task's weight can be carried out as a
 * multiplication followed by a shift.  Indexed like prio_to_weight[],
 * i.e. by (static_prio - MAX_RT_PRIO) == (nice + 20).
 *
 * Because the weights strictly decrease with the nice level, this table
 * must strictly increase.  Sanity anchors:
 *   nice -4 (weight 2500): 2^32 / 2500 = 1717986
 *   nice  0 (weight 1024): 2^32 / 1024 = 4194304
 *   nice  1 (weight  819): 2^32 /  819 = 5244160
 */
static const u32 prio_to_wmult[40] = {
	/* -20 */     48356,     60446,     75558,     94446,    118058,
	/* -15 */    147573,    184467,    230589,    288233,    360285,
	/* -10 */    450347,    562979,    703746,    879575,   1099582,
	/*  -5 */   1374389,   1717986,   2147483,   2684354,   3355443,
	/*   0 */   4194304,   5244160,   6557201,   8196502,  10250518,
	/*   5 */  12782640,  16025997,  19976592,  24970740,  31350126,
	/*  10 */  39045157,  49367440,  61356675,  76695844,  95443717,
	/*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
755 inc_load(struct rq *rq, const struct task_struct *p, u64 now)
757 update_curr_load(rq, now);
758 update_load_add(&rq->ls.load, p->se.load.weight);
762 dec_load(struct rq *rq, const struct task_struct *p, u64 now)
764 update_curr_load(rq, now);
765 update_load_sub(&rq->ls.load, p->se.load.weight);
768 static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
771 inc_load(rq, p, now);
774 static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
777 dec_load(rq, p, now);
780 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
783 * runqueue iterator, to support SMP load-balancing between different
784 * scheduling classes, without having to expose their internal data
785 * structures to the load-balancing proper:
789 struct task_struct *(*start)(void *);
790 struct task_struct *(*next)(void *);
793 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
794 unsigned long max_nr_move, unsigned long max_load_move,
795 struct sched_domain *sd, enum cpu_idle_type idle,
796 int *all_pinned, unsigned long *load_moved,
797 int this_best_prio, int best_prio, int best_prio_seen,
798 struct rq_iterator *iterator);
800 #include "sched_stats.h"
801 #include "sched_rt.c"
802 #include "sched_fair.c"
803 #include "sched_idletask.c"
804 #ifdef CONFIG_SCHED_DEBUG
805 # include "sched_debug.c"
808 #define sched_class_highest (&rt_sched_class)
810 static void set_load_weight(struct task_struct *p)
812 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
813 p->se.wait_runtime = 0;
815 if (task_has_rt_policy(p)) {
816 p->se.load.weight = prio_to_weight[0] * 2;
817 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
822 * SCHED_IDLE tasks get minimal weight:
824 if (p->policy == SCHED_IDLE) {
825 p->se.load.weight = WEIGHT_IDLEPRIO;
826 p->se.load.inv_weight = WMULT_IDLEPRIO;
830 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
831 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
835 enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
837 sched_info_queued(p);
838 p->sched_class->enqueue_task(rq, p, wakeup, now);
843 dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
845 p->sched_class->dequeue_task(rq, p, sleep, now);
850 * __normal_prio - return the priority that is based on the static prio
852 static inline int __normal_prio(struct task_struct *p)
854 return p->static_prio;
858 * Calculate the expected normal priority: i.e. priority
859 * without taking RT-inheritance into account. Might be
860 * boosted by interactivity modifiers. Changes upon fork,
861 * setprio syscalls, and whenever the interactivity
862 * estimator recalculates.
864 static inline int normal_prio(struct task_struct *p)
868 if (task_has_rt_policy(p))
869 prio = MAX_RT_PRIO-1 - p->rt_priority;
871 prio = __normal_prio(p);
876 * Calculate the current priority, i.e. the priority
877 * taken into account by the scheduler. This value might
878 * be boosted by RT tasks, or might be boosted by
879 * interactivity modifiers. Will be RT if the task got
880 * RT-boosted. If not then it returns p->normal_prio.
882 static int effective_prio(struct task_struct *p)
884 p->normal_prio = normal_prio(p);
886 * If we are RT tasks or we were boosted to RT priority,
887 * keep the priority unchanged. Otherwise, update priority
888 * to the normal priority:
890 if (!rt_prio(p->prio))
891 return p->normal_prio;
896 * activate_task - move a task to the runqueue.
898 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
900 u64 now = rq_clock(rq);
902 if (p->state == TASK_UNINTERRUPTIBLE)
903 rq->nr_uninterruptible--;
905 enqueue_task(rq, p, wakeup, now);
906 inc_nr_running(p, rq, now);
/*
 * activate_idle_task - put an idle task on the runqueue.
 *
 * Exactly activate_task() with @wakeup == 0; note that the argument
 * order here is (task, runqueue), the reverse of activate_task().
 *
 * NOTE(review): the historical description says the task is moved to
 * the _front_ of the runqueue, but nothing in this path asks for front
 * placement - confirm against the enqueue_task implementations.
 */
static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
{
	activate_task(rq, p, 0);
}
924 * deactivate_task - remove a task from the runqueue.
926 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
928 u64 now = rq_clock(rq);
930 if (p->state == TASK_UNINTERRUPTIBLE)
931 rq->nr_uninterruptible++;
933 dequeue_task(rq, p, sleep, now);
934 dec_nr_running(p, rq, now);
/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Returns non-zero when @p is the ->curr task of the runqueue belonging
 * to the CPU that @p is assigned to.
 */
inline int task_curr(const struct task_struct *p)
{
	const int cpu = task_cpu(p);

	return cpu_curr(cpu) == p;
}
946 /* Used instead of source_load when we know the type == 0 */
947 unsigned long weighted_cpuload(const int cpu)
949 return cpu_rq(cpu)->ls.load.weight;
952 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
955 task_thread_info(p)->cpu = cpu;
962 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
964 int old_cpu = task_cpu(p);
965 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
966 u64 clock_offset, fair_clock_offset;
968 clock_offset = old_rq->clock - new_rq->clock;
969 fair_clock_offset = old_rq->cfs.fair_clock -
970 new_rq->cfs.fair_clock;
971 if (p->se.wait_start)
972 p->se.wait_start -= clock_offset;
973 if (p->se.wait_start_fair)
974 p->se.wait_start_fair -= fair_clock_offset;
975 if (p->se.sleep_start)
976 p->se.sleep_start -= clock_offset;
977 if (p->se.block_start)
978 p->se.block_start -= clock_offset;
979 if (p->se.sleep_start_fair)
980 p->se.sleep_start_fair -= fair_clock_offset;
982 __set_task_cpu(p, new_cpu);
985 struct migration_req {
986 struct list_head list;
988 struct task_struct *task;
991 struct completion done;
995 * The task's runqueue lock must be held.
996 * Returns true if you have to wait for migration thread.
999 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1001 struct rq *rq = task_rq(p);
1004 * If the task is not on a runqueue (and not running), then
1005 * it is sufficient to simply update the task's cpu field.
1007 if (!p->se.on_rq && !task_running(rq, p)) {
1008 set_task_cpu(p, dest_cpu);
1012 init_completion(&req->done);
1014 req->dest_cpu = dest_cpu;
1015 list_add(&req->list, &rq->migration_queue);
1021 * wait_task_inactive - wait for a thread to unschedule.
1023 * The caller must ensure that the task *will* unschedule sometime soon,
1024 * else this function might spin for a *long* time. This function can't
1025 * be called with interrupts off, or it may introduce deadlock with
1026 * smp_call_function() if an IPI is sent by the same process we are
1027 * waiting to become inactive.
1029 void wait_task_inactive(struct task_struct *p)
1031 unsigned long flags;
1037 * We do the initial early heuristics without holding
1038 * any task-queue locks at all. We'll only try to get
1039 * the runqueue lock when things look like they will
1045 * If the task is actively running on another CPU
1046 * still, just relax and busy-wait without holding
1049 * NOTE! Since we don't hold any locks, it's not
1050 * even sure that "rq" stays as the right runqueue!
1051 * But we don't care, since "task_running()" will
1052 * return false if the runqueue has changed and p
1053 * is actually now running somewhere else!
1055 while (task_running(rq, p))
1059 * Ok, time to look more closely! We need the rq
1060 * lock now, to be *sure*. If we're wrong, we'll
1061 * just go back and repeat.
1063 rq = task_rq_lock(p, &flags);
1064 running = task_running(rq, p);
1065 on_rq = p->se.on_rq;
1066 task_rq_unlock(rq, &flags);
1069 * Was it really running after all now that we
1070 * checked with the proper locks actually held?
1072 * Oops. Go back and try again..
1074 if (unlikely(running)) {
1080 * It's not enough that it's not actively running,
1081 * it must be off the runqueue _entirely_, and not
1084 * So if it was still runnable (but just not actively
1085 * running right now), it's preempted, and we should
1086 * yield - it could be a while.
1088 if (unlikely(on_rq)) {
1094 * Ahh, all good. It wasn't running, and it wasn't
1095 * runnable, which means that it will never become
1096 * running in the future either. We're all done!
1101 * kick_process - kick a running thread to enter/exit the kernel
1102 * @p: the to-be-kicked thread
1104 * Cause a process which is running on another CPU to enter
1105 * kernel-mode, without any delay. (to get signals handled.)
1107 * NOTE: this function doesn't have to take the runqueue lock,
1108 * because all it wants to ensure is that the remote task enters
1109 * the kernel. If the IPI races and the task has been migrated
1110 * to another CPU then no harm is done and the purpose has been
1113 void kick_process(struct task_struct *p)
1119 if ((cpu != smp_processor_id()) && task_curr(p))
1120 smp_send_reschedule(cpu);
1125 * Return a low guess at the load of a migration-source cpu weighted
1126 * according to the scheduling class and "nice" value.
1128 * We want to under-estimate the load of migration sources, to
1129 * balance conservatively.
1131 static inline unsigned long source_load(int cpu, int type)
1133 struct rq *rq = cpu_rq(cpu);
1134 unsigned long total = weighted_cpuload(cpu);
1139 return min(rq->cpu_load[type-1], total);
1143 * Return a high guess at the load of a migration-target cpu weighted
1144 * according to the scheduling class and "nice" value.
1146 static inline unsigned long target_load(int cpu, int type)
1148 struct rq *rq = cpu_rq(cpu);
1149 unsigned long total = weighted_cpuload(cpu);
1154 return max(rq->cpu_load[type-1], total);
1158 * Return the average load per task on the cpu's run queue
1160 static inline unsigned long cpu_avg_load_per_task(int cpu)
1162 struct rq *rq = cpu_rq(cpu);
1163 unsigned long total = weighted_cpuload(cpu);
1164 unsigned long n = rq->nr_running;
1166 return n ? total / n : SCHED_LOAD_SCALE;
1170 * find_idlest_group finds and returns the least busy CPU group within the
1173 static struct sched_group *
1174 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1176 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1177 unsigned long min_load = ULONG_MAX, this_load = 0;
1178 int load_idx = sd->forkexec_idx;
1179 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1182 unsigned long load, avg_load;
1186 /* Skip over this group if it has no CPUs allowed */
1187 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1190 local_group = cpu_isset(this_cpu, group->cpumask);
1192 /* Tally up the load of all CPUs in the group */
1195 for_each_cpu_mask(i, group->cpumask) {
1196 /* Bias balancing toward cpus of our domain */
1198 load = source_load(i, load_idx);
1200 load = target_load(i, load_idx);
1205 /* Adjust by relative CPU power of the group */
1206 avg_load = sg_div_cpu_power(group,
1207 avg_load * SCHED_LOAD_SCALE);
1210 this_load = avg_load;
1212 } else if (avg_load < min_load) {
1213 min_load = avg_load;
1217 group = group->next;
1218 } while (group != sd->groups);
1220 if (!idlest || 100*this_load < imbalance*min_load)
1226 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1229 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1232 unsigned long load, min_load = ULONG_MAX;
1236 /* Traverse only the allowed CPUs */
1237 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1239 for_each_cpu_mask(i, tmp) {
1240 load = weighted_cpuload(i);
1242 if (load < min_load || (load == min_load && i == this_cpu)) {
1252 * sched_balance_self: balance the current task (running on cpu) in domains
1253 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1256 * Balance, ie. select the least loaded group.
1258 * Returns the target CPU number, or the same CPU if no balancing is needed.
1260 * preempt must be disabled.
1262 static int sched_balance_self(int cpu, int flag)
1264 struct task_struct *t = current;
1265 struct sched_domain *tmp, *sd = NULL;
1267 for_each_domain(cpu, tmp) {
1269 * If power savings logic is enabled for a domain, stop there.
1271 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1273 if (tmp->flags & flag)
1279 struct sched_group *group;
1280 int new_cpu, weight;
1282 if (!(sd->flags & flag)) {
1288 group = find_idlest_group(sd, t, cpu);
1294 new_cpu = find_idlest_cpu(group, t, cpu);
1295 if (new_cpu == -1 || new_cpu == cpu) {
1296 /* Now try balancing at a lower domain level of cpu */
1301 /* Now try balancing at a lower domain level of new_cpu */
1304 weight = cpus_weight(span);
1305 for_each_domain(cpu, tmp) {
1306 if (weight <= cpus_weight(tmp->span))
1308 if (tmp->flags & flag)
1311 /* while loop will break here if sd == NULL */
1317 #endif /* CONFIG_SMP */
1320 * wake_idle() will wake a task on an idle cpu if task->cpu is
1321 * not idle and an idle cpu is available. The span of cpus to
1322 * search starts with cpus closest then further out as needed,
1323 * so we always favor a closer, idle cpu.
1325 * Returns the CPU we should wake onto.
1327 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1328 static int wake_idle(int cpu, struct task_struct *p)
1331 struct sched_domain *sd;
1335 * If it is idle, then it is the best cpu to run this task.
1337 * This cpu is also the best, if it has more than one task already.
1338 * Siblings must also be busy (in most cases) as they didn't already
1339 * pickup the extra load from this cpu and hence we need not check
1340 * sibling runqueue info. This will avoid the checks and cache miss
1341 * penalties associated with that.
1343 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
1346 for_each_domain(cpu, sd) {
1347 if (sd->flags & SD_WAKE_IDLE) {
1348 cpus_and(tmp, sd->span, p->cpus_allowed);
1349 for_each_cpu_mask(i, tmp) {
1360 static inline int wake_idle(int cpu, struct task_struct *p)
1367 * try_to_wake_up - wake up a thread
1368 * @p: the to-be-woken-up thread
1369 * @state: the mask of task states that can be woken
1370 * @sync: do a synchronous wakeup?
1372 * Put it on the run-queue if it's not already there. The "current"
1373 * thread is always on the run-queue (except when the actual
1374 * re-schedule is in progress), and as such you're allowed to do
1375 * the simpler "current->state = TASK_RUNNING" to mark yourself
1376 * runnable without the overhead of this.
1378 * returns failure only if the task is already active.
1380 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1382 int cpu, this_cpu, success = 0;
1383 unsigned long flags;
1387 struct sched_domain *sd, *this_sd = NULL;
1388 unsigned long load, this_load;
1392 rq = task_rq_lock(p, &flags);
1393 old_state = p->state;
1394 if (!(old_state & state))
1401 this_cpu = smp_processor_id();
1404 if (unlikely(task_running(rq, p)))
1409 schedstat_inc(rq, ttwu_cnt);
1410 if (cpu == this_cpu) {
1411 schedstat_inc(rq, ttwu_local);
1415 for_each_domain(this_cpu, sd) {
1416 if (cpu_isset(cpu, sd->span)) {
1417 schedstat_inc(sd, ttwu_wake_remote);
1423 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1427 * Check for affine wakeup and passive balancing possibilities.
1430 int idx = this_sd->wake_idx;
1431 unsigned int imbalance;
1433 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1435 load = source_load(cpu, idx);
1436 this_load = target_load(this_cpu, idx);
1438 new_cpu = this_cpu; /* Wake to this CPU if we can */
1440 if (this_sd->flags & SD_WAKE_AFFINE) {
1441 unsigned long tl = this_load;
1442 unsigned long tl_per_task;
1444 tl_per_task = cpu_avg_load_per_task(this_cpu);
1447 * If sync wakeup then subtract the (maximum possible)
1448 * effect of the currently running task from the load
1449 * of the current CPU:
1452 tl -= current->se.load.weight;
1455 tl + target_load(cpu, idx) <= tl_per_task) ||
1456 100*(tl + p->se.load.weight) <= imbalance*load) {
1458 * This domain has SD_WAKE_AFFINE and
1459 * p is cache cold in this domain, and
1460 * there is no bad imbalance.
1462 schedstat_inc(this_sd, ttwu_move_affine);
1468 * Start passive balancing when half the imbalance_pct
1471 if (this_sd->flags & SD_WAKE_BALANCE) {
1472 if (imbalance*this_load <= 100*load) {
1473 schedstat_inc(this_sd, ttwu_move_balance);
1479 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1481 new_cpu = wake_idle(new_cpu, p);
1482 if (new_cpu != cpu) {
1483 set_task_cpu(p, new_cpu);
1484 task_rq_unlock(rq, &flags);
1485 /* might preempt at this point */
1486 rq = task_rq_lock(p, &flags);
1487 old_state = p->state;
1488 if (!(old_state & state))
1493 this_cpu = smp_processor_id();
1498 #endif /* CONFIG_SMP */
1499 activate_task(rq, p, 1);
1501 * Sync wakeups (i.e. those types of wakeups where the waker
1502 * has indicated that it will leave the CPU in short order)
1503 * don't trigger a preemption, if the woken up task will run on
1504 * this cpu. (in this case the 'I will reschedule' promise of
1505 * the waker guarantees that the freshly woken up task is going
1506 * to be considered on this CPU.)
1508 if (!sync || cpu != this_cpu)
1509 check_preempt_curr(rq, p);
1513 p->state = TASK_RUNNING;
1515 task_rq_unlock(rq, &flags);
/*
 * wake_up_process - wake a task from any of the stopped, traced,
 * interruptible or uninterruptible states.
 * @p: the task to wake
 *
 * Thin wrapper around try_to_wake_up() with a fixed state mask and the
 * sync argument set to 0 (not a synchronous wakeup). The return value is
 * whatever try_to_wake_up() returns.
 */
1520 int fastcall wake_up_process(struct task_struct *p)
1522 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1523 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1525 EXPORT_SYMBOL(wake_up_process);
/*
 * wake_up_state - wake a task only if its state matches a caller-chosen mask.
 * @p: the task to wake
 * @state: mask of task states that may be woken, passed through unchanged
 *
 * Same as wake_up_process() except that the caller supplies the state mask.
 * The sync argument to try_to_wake_up() is 0.
 */
1527 int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1529 return try_to_wake_up(p, state, 0);
1533 * Perform scheduler related setup for a newly forked process p.
1534 * p is forked by current.
1536 * __sched_fork() is basic setup used by init_idle() too:
1538 static void __sched_fork(struct task_struct *p)
1540 p->se.wait_start_fair = 0;
1541 p->se.wait_start = 0;
1542 p->se.exec_start = 0;
1543 p->se.sum_exec_runtime = 0;
1544 p->se.delta_exec = 0;
1545 p->se.delta_fair_run = 0;
1546 p->se.delta_fair_sleep = 0;
1547 p->se.wait_runtime = 0;
1548 p->se.sum_wait_runtime = 0;
1549 p->se.sum_sleep_runtime = 0;
1550 p->se.sleep_start = 0;
1551 p->se.sleep_start_fair = 0;
1552 p->se.block_start = 0;
1553 p->se.sleep_max = 0;
1554 p->se.block_max = 0;
1557 p->se.wait_runtime_overruns = 0;
1558 p->se.wait_runtime_underruns = 0;
1560 INIT_LIST_HEAD(&p->run_list);
1564 * We mark the process as running here, but have not actually
1565 * inserted it onto the runqueue yet. This guarantees that
1566 * nobody will actually run it, and a signal or other external
1567 * event cannot wake it up and insert it on the runqueue either.
1569 p->state = TASK_RUNNING;
1573 * fork()/clone()-time setup:
1575 void sched_fork(struct task_struct *p, int clone_flags)
1577 int cpu = get_cpu();
1582 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1584 __set_task_cpu(p, cpu);
1587 * Make sure we do not leak PI boosting priority to the child:
1589 p->prio = current->normal_prio;
1591 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1592 if (likely(sched_info_on()))
1593 memset(&p->sched_info, 0, sizeof(p->sched_info));
1595 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1598 #ifdef CONFIG_PREEMPT
1599 /* Want to start with kernel preemption disabled. */
1600 task_thread_info(p)->preempt_count = 1;
1606 * After fork, child runs first. (default) If set to 0 then
1607 * parent will (try to) run first.
1609 unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1612 * wake_up_new_task - wake up a newly created task for the first time.
1614 * This function will do some initial scheduler statistics housekeeping
1615 * that must be done for every newly created context, then puts the task
1616 * on the runqueue and wakes it.
1618 void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1620 unsigned long flags;
1624 rq = task_rq_lock(p, &flags);
1625 BUG_ON(p->state != TASK_RUNNING);
1626 this_cpu = smp_processor_id(); /* parent's CPU */
1628 p->prio = effective_prio(p);
1630 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1631 task_cpu(p) != this_cpu || !current->se.on_rq) {
1632 activate_task(rq, p, 0);
1635 * Let the scheduling class do new task startup
1636 * management (if any):
1638 p->sched_class->task_new(rq, p);
1640 check_preempt_curr(rq, p);
1641 task_rq_unlock(rq, &flags);
1645 * prepare_task_switch - prepare to switch tasks
1646 * @rq: the runqueue preparing to switch
1647 * @next: the task we are going to switch to.
1649 * This is called with the rq lock held and interrupts off. It must
1650 * be paired with a subsequent finish_task_switch after the context switch.
1653 * prepare_task_switch sets up locking and calls architecture specific hooks.
1656 static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
1658 prepare_lock_switch(rq, next);
1659 prepare_arch_switch(next);
1663 * finish_task_switch - clean up after a task-switch
1664 * @rq: runqueue associated with task-switch
1665 * @prev: the thread we just switched away from.
1667 * finish_task_switch must be called after the context switch, paired
1668 * with a prepare_task_switch call before the context switch.
1669 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1670 * and do any other architecture-specific cleanup actions.
1672 * Note that we may have delayed dropping an mm in context_switch(). If
1673 * so, we finish that here outside of the runqueue lock. (Doing it
1674 * with the lock held can cause deadlocks; see schedule() for
1677 static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1678 __releases(rq->lock)
1680 struct mm_struct *mm = rq->prev_mm;
1686 * A task struct has one reference for the use as "current".
1687 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
1688 * schedule one last time. The schedule call will never return, and
1689 * the scheduled task must drop that reference.
1690 * The test for TASK_DEAD must occur while the runqueue locks are
1691 * still held, otherwise prev could be scheduled on another cpu, die
1692 * there before we look at prev->state, and then the reference would
1694 * Manfred Spraul <manfred@colorfullife.com>
1696 prev_state = prev->state;
1697 finish_arch_switch(prev);
1698 finish_lock_switch(rq, prev);
1701 if (unlikely(prev_state == TASK_DEAD)) {
1703 * Remove function-return probe instances associated with this
1704 * task and put them back on the free list.
1706 kprobe_flush_task(prev);
1707 put_task_struct(prev);
1712 * schedule_tail - first thing a freshly forked thread must call.
1713 * @prev: the thread we just switched away from.
1715 asmlinkage void schedule_tail(struct task_struct *prev)
1716 __releases(rq->lock)
1718 struct rq *rq = this_rq();
1720 finish_task_switch(rq, prev);
1721 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
1722 /* In this case, finish_task_switch does not reenable preemption */
1725 if (current->set_child_tid)
1726 put_user(current->pid, current->set_child_tid);
1730 * context_switch - switch to the new MM and the new
1731 * thread's register state.
1734 context_switch(struct rq *rq, struct task_struct *prev,
1735 struct task_struct *next)
1737 struct mm_struct *mm, *oldmm;
1739 prepare_task_switch(rq, next);
1741 oldmm = prev->active_mm;
1743 * For paravirt, this is coupled with an exit in switch_to to
1744 * combine the page table reload and the switch backend into
1747 arch_enter_lazy_cpu_mode();
1749 if (unlikely(!mm)) {
1750 next->active_mm = oldmm;
1751 atomic_inc(&oldmm->mm_count);
1752 enter_lazy_tlb(oldmm, next);
1754 switch_mm(oldmm, mm, next);
1756 if (unlikely(!prev->mm)) {
1757 prev->active_mm = NULL;
1758 rq->prev_mm = oldmm;
1761 * Since the runqueue lock will be released by the next
1762 * task (which is an invalid locking op but in the case
1763 * of the scheduler it's an obvious special-case), so we
1764 * do an early lockdep release here:
1766 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
1767 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1770 /* Here we just switch the register state and the stack. */
1771 switch_to(prev, next, prev);
1775 * this_rq must be evaluated again because prev may have moved
1776 * CPUs since it called schedule(), thus the 'rq' on its stack
1777 * frame will be invalid.
1779 finish_task_switch(this_rq(), prev);
1783 * nr_running, nr_uninterruptible and nr_context_switches:
1785 * externally visible scheduler statistics: current number of runnable
1786 * threads, current number of uninterruptible-sleeping threads, total
1787 * number of context switches performed since bootup.
/*
 * nr_running - add up rq->nr_running over the online CPUs.
 *
 * Only online CPUs are visited here, whereas nr_uninterruptible(),
 * nr_context_switches() and nr_iowait() walk every possible CPU.
 * The runqueues are read without taking their locks.
 */
1789 unsigned long nr_running(void)
1791 unsigned long i, sum = 0;
1793 for_each_online_cpu(i)
1794 sum += cpu_rq(i)->nr_running;
/*
 * nr_uninterruptible - add up rq->nr_uninterruptible over all possible CPUs.
 *
 * The total is accumulated in an unsigned long and then reinterpreted as a
 * signed long to detect a sum that has gone negative.
 * NOTE(review): the statement guarded by the final check, and the return,
 * are not visible in this extract.
 */
1799 unsigned long nr_uninterruptible(void)
1801 unsigned long i, sum = 0;
1803 for_each_possible_cpu(i)
1804 sum += cpu_rq(i)->nr_uninterruptible;
1807 * Since we read the counters lockless, it might be slightly
1808 * inaccurate. Do not allow it to go below zero though:
1810 if (unlikely((long)sum < 0))
/*
 * nr_context_switches - add up rq->nr_switches over all possible CPUs.
 *
 * The accumulator is unsigned long long, wider than (or equal to) the
 * unsigned long used by the other counters in this group.
 */
1816 unsigned long long nr_context_switches(void)
1819 unsigned long long sum = 0;
1821 for_each_possible_cpu(i)
1822 sum += cpu_rq(i)->nr_switches;
/*
 * nr_iowait - add up rq->nr_iowait over all possible CPUs.
 *
 * Unlike the neighbouring counters, nr_iowait is an atomic_t and is read
 * with atomic_read().
 */
1827 unsigned long nr_iowait(void)
1829 unsigned long i, sum = 0;
1831 for_each_possible_cpu(i)
1832 sum += atomic_read(&cpu_rq(i)->nr_iowait);
/*
 * nr_active - runnable plus uninterruptible task count over the online CPUs.
 *
 * Both per-runqueue counters are summed in a single pass. The
 * uninterruptible total is clamped to zero if it reads as negative when
 * viewed as a signed long; the running total is not clamped.
 *
 * Returns running + uninterruptible.
 */
1837 unsigned long nr_active(void)
1839 unsigned long i, running = 0, uninterruptible = 0;
1841 for_each_online_cpu(i) {
1842 running += cpu_rq(i)->nr_running;
1843 uninterruptible += cpu_rq(i)->nr_uninterruptible;
/* Same negative-sum guard as in nr_uninterruptible(). */
1846 if (unlikely((long)uninterruptible < 0))
1847 uninterruptible = 0;
1849 return running + uninterruptible;
1853 * Update rq->cpu_load[] statistics. This function is usually called every
1854 * scheduler tick (TICK_NSEC).
1856 static void update_cpu_load(struct rq *this_rq)
1858 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1859 unsigned long total_load = this_rq->ls.load.weight;
1860 unsigned long this_load = total_load;
1861 struct load_stat *ls = &this_rq->ls;
1862 u64 now = __rq_clock(this_rq);
1865 this_rq->nr_load_updates++;
1866 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1869 /* Update delta_fair/delta_exec fields first */
1870 update_curr_load(this_rq, now);
1872 fair_delta64 = ls->delta_fair + 1;
1875 exec_delta64 = ls->delta_exec + 1;
1878 sample_interval64 = now - ls->load_update_last;
1879 ls->load_update_last = now;
1881 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1882 sample_interval64 = TICK_NSEC;
1884 if (exec_delta64 > sample_interval64)
1885 exec_delta64 = sample_interval64;
1887 idle_delta64 = sample_interval64 - exec_delta64;
1889 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1890 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1892 this_load = (unsigned long)tmp64;
1896 /* Update our load: */
1897 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1898 unsigned long old_load, new_load;
1900 /* scale is effectively 1 << i now, and >> i divides by scale */
1902 old_load = this_rq->cpu_load[i];
1903 new_load = this_load;
1905 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
1912 * double_rq_lock - safely lock two runqueues
1914 * Note this does not disable interrupts like task_rq_lock,
1915 * you need to do so manually before calling.
1917 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1918 __acquires(rq1->lock)
1919 __acquires(rq2->lock)
1921 BUG_ON(!irqs_disabled());
1923 spin_lock(&rq1->lock);
1924 __acquire(rq2->lock); /* Fake it out ;) */
1927 spin_lock(&rq1->lock);
1928 spin_lock(&rq2->lock);
1930 spin_lock(&rq2->lock);
1931 spin_lock(&rq1->lock);
1937 * double_rq_unlock - safely unlock two runqueues
1939 * Note this does not restore interrupts like task_rq_unlock,
1940 * you need to do so manually after calling.
1942 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1943 __releases(rq1->lock)
1944 __releases(rq2->lock)
1946 spin_unlock(&rq1->lock);
1948 spin_unlock(&rq2->lock);
1950 __release(rq2->lock);
1954 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1956 static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
1957 __releases(this_rq->lock)
1958 __acquires(busiest->lock)
1959 __acquires(this_rq->lock)
1961 if (unlikely(!irqs_disabled())) {
1962 /* printk() doesn't work well under rq->lock */
1963 spin_unlock(&this_rq->lock);
1966 if (unlikely(!spin_trylock(&busiest->lock))) {
1967 if (busiest < this_rq) {
1968 spin_unlock(&this_rq->lock);
1969 spin_lock(&busiest->lock);
1970 spin_lock(&this_rq->lock);
1972 spin_lock(&busiest->lock);
1977 * If dest_cpu is allowed for this process, migrate the task to it.
1978 * This is accomplished by forcing the cpus_allowed mask to only
1979 * allow dest_cpu, which will force the task onto dest_cpu. Then
1980 * the cpus_allowed mask is restored.
1982 static void sched_migrate_task(struct task_struct *p, int dest_cpu)
1984 struct migration_req req;
1985 unsigned long flags;
1988 rq = task_rq_lock(p, &flags);
1989 if (!cpu_isset(dest_cpu, p->cpus_allowed)
1990 || unlikely(cpu_is_offline(dest_cpu)))
1993 /* force the process onto the specified CPU */
1994 if (migrate_task(p, dest_cpu, &req)) {
1995 /* Need to wait for migration thread (might exit: take ref). */
1996 struct task_struct *mt = rq->migration_thread;
1998 get_task_struct(mt);
1999 task_rq_unlock(rq, &flags);
2000 wake_up_process(mt);
2001 put_task_struct(mt);
2002 wait_for_completion(&req.done);
2007 task_rq_unlock(rq, &flags);
2011 * sched_exec - execve() is a valuable balancing opportunity, because at
2012 * this point the task has the smallest effective memory and cache footprint.
/*
 * Picks a CPU for the calling task via sched_balance_self() using the
 * SD_BALANCE_EXEC flag, and migrates current there if it differs from the
 * CPU we are running on.
 */
2014 void sched_exec(void)
/* get_cpu() returns the current CPU id. */
/* NOTE(review): the matching put_cpu() is not visible in this extract. */
2016 int new_cpu, this_cpu = get_cpu();
2017 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
/* Nothing to do when the chosen CPU is the one we are already on. */
2019 if (new_cpu != this_cpu)
2020 sched_migrate_task(current, new_cpu);
2024 * pull_task - move a task from a remote runqueue to the local runqueue.
2025 * Both runqueues must be locked.
/*
 * Implementation: three calls in a fixed order - deactivate_task() on the
 * source runqueue, set_task_cpu() to retarget p at this_cpu, then
 * activate_task() on the local runqueue. Both activate/deactivate calls
 * pass 0 as their last argument.
 */
2027 static void pull_task(struct rq *src_rq, struct task_struct *p,
2028 struct rq *this_rq, int this_cpu)
2030 deactivate_task(src_rq, p, 0);
2031 set_task_cpu(p, this_cpu);
2032 activate_task(this_rq, p, 0);
2034 * Note that idle threads have a prio of MAX_PRIO, for this test
2035 * to be always true for them.
/* NOTE(review): presumably lets p preempt this_rq's current task - confirm against check_preempt_curr(). */
2037 check_preempt_curr(this_rq, p);
2041 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2044 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2045 struct sched_domain *sd, enum cpu_idle_type idle,
2049 * We do not migrate tasks that are:
2050 * 1) running (obviously), or
2051 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2052 * 3) are cache-hot on their current CPU.
2054 if (!cpu_isset(this_cpu, p->cpus_allowed))
2058 if (task_running(rq, p))
2062 * Aggressive migration if too many balance attempts have failed:
2064 if (sd->nr_balance_failed > sd->cache_nice_tries)
2070 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2071 unsigned long max_nr_move, unsigned long max_load_move,
2072 struct sched_domain *sd, enum cpu_idle_type idle,
2073 int *all_pinned, unsigned long *load_moved,
2074 int this_best_prio, int best_prio, int best_prio_seen,
2075 struct rq_iterator *iterator)
2077 int pulled = 0, pinned = 0, skip_for_load;
2078 struct task_struct *p;
2079 long rem_load_move = max_load_move;
2081 if (max_nr_move == 0 || max_load_move == 0)
2087 * Start the load-balancing iterator:
2089 p = iterator->start(iterator->arg);
2094 * To help distribute high priority tasks across CPUs we don't
2095 * skip a task if it will be the highest priority task (i.e. smallest
2096 * prio value) on its new queue regardless of its load weight
2098 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2099 SCHED_LOAD_SCALE_FUZZ;
2100 if (skip_for_load && p->prio < this_best_prio)
2101 skip_for_load = !best_prio_seen && p->prio == best_prio;
2102 if (skip_for_load ||
2103 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2105 best_prio_seen |= p->prio == best_prio;
2106 p = iterator->next(iterator->arg);
2110 pull_task(busiest, p, this_rq, this_cpu);
2112 rem_load_move -= p->se.load.weight;
2115 * We only want to steal up to the prescribed number of tasks
2116 * and the prescribed amount of weighted load.
2118 if (pulled < max_nr_move && rem_load_move > 0) {
2119 if (p->prio < this_best_prio)
2120 this_best_prio = p->prio;
2121 p = iterator->next(iterator->arg);
2126 * Right now, this is the only place pull_task() is called,
2127 * so we can safely collect pull_task() stats here rather than
2128 * inside pull_task().
2130 schedstat_add(sd, lb_gained[idle], pulled);
2133 *all_pinned = pinned;
2134 *load_moved = max_load_move - rem_load_move;
2139 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2140 * load from busiest to this_rq, as part of a balancing operation within
2141 * "domain". Returns the number of tasks moved.
2143 * Called with both runqueues locked.
/*
 * Implementation: walks the scheduling classes starting at
 * sched_class_highest and following class->next, calling each class's
 * load_balance() hook with whatever budget is still left.
 *
 * Two budgets are tracked:
 *   max_nr_move   - tasks still allowed to move (unsigned, decremented by
 *                   the count each class reports)
 *   rem_load_move - weighted load still allowed to move; kept as a signed
 *                   long so that a class moving more than the remainder
 *                   shows up as <= 0 in the loop condition rather than
 *                   wrapping around
 *
 * The walk stops when there is no next class, max_nr_move reaches zero, or
 * rem_load_move is no longer positive.
 */
2145 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2146 unsigned long max_nr_move, unsigned long max_load_move,
2147 struct sched_domain *sd, enum cpu_idle_type idle,
2150 struct sched_class *class = sched_class_highest;
2151 unsigned long load_moved, total_nr_moved = 0, nr_moved;
2152 long rem_load_move = max_load_move;
/* load_moved is an out-parameter filled in by the class hook. */
2155 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2156 max_nr_move, (unsigned long)rem_load_move,
2157 sd, idle, all_pinned, &load_moved);
2158 total_nr_moved += nr_moved;
2159 max_nr_move -= nr_moved;
2160 rem_load_move -= load_moved;
2161 class = class->next;
2162 } while (class && max_nr_move && rem_load_move > 0);
2164 return total_nr_moved;
2168 * find_busiest_group finds and returns the busiest CPU group within the
2169 * domain. It calculates and returns the amount of weighted load which
2170 * should be moved to restore balance via the imbalance parameter.
2172 static struct sched_group *
2173 find_busiest_group(struct sched_domain *sd, int this_cpu,
2174 unsigned long *imbalance, enum cpu_idle_type idle,
2175 int *sd_idle, cpumask_t *cpus, int *balance)
2177 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2178 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
2179 unsigned long max_pull;
2180 unsigned long busiest_load_per_task, busiest_nr_running;
2181 unsigned long this_load_per_task, this_nr_running;
2183 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2184 int power_savings_balance = 1;
2185 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2186 unsigned long min_nr_running = ULONG_MAX;
2187 struct sched_group *group_min = NULL, *group_leader = NULL;
2190 max_load = this_load = total_load = total_pwr = 0;
2191 busiest_load_per_task = busiest_nr_running = 0;
2192 this_load_per_task = this_nr_running = 0;
2193 if (idle == CPU_NOT_IDLE)
2194 load_idx = sd->busy_idx;
2195 else if (idle == CPU_NEWLY_IDLE)
2196 load_idx = sd->newidle_idx;
2198 load_idx = sd->idle_idx;
2201 unsigned long load, group_capacity;
2204 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2205 unsigned long sum_nr_running, sum_weighted_load;
2207 local_group = cpu_isset(this_cpu, group->cpumask);
2210 balance_cpu = first_cpu(group->cpumask);
2212 /* Tally up the load of all CPUs in the group */
2213 sum_weighted_load = sum_nr_running = avg_load = 0;
2215 for_each_cpu_mask(i, group->cpumask) {
2218 if (!cpu_isset(i, *cpus))
2223 if (*sd_idle && !idle_cpu(i))
2226 /* Bias balancing toward cpus of our domain */
2228 if (idle_cpu(i) && !first_idle_cpu) {
2233 load = target_load(i, load_idx);
2235 load = source_load(i, load_idx);
2238 sum_nr_running += rq->nr_running;
2239 sum_weighted_load += weighted_cpuload(i);
2243 * First idle cpu or the first cpu(busiest) in this sched group
2244 * is eligible for doing load balancing at this and above
2247 if (local_group && balance_cpu != this_cpu && balance) {
2252 total_load += avg_load;
2253 total_pwr += group->__cpu_power;
2255 /* Adjust by relative CPU power of the group */
2256 avg_load = sg_div_cpu_power(group,
2257 avg_load * SCHED_LOAD_SCALE);
2259 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
2262 this_load = avg_load;
2264 this_nr_running = sum_nr_running;
2265 this_load_per_task = sum_weighted_load;
2266 } else if (avg_load > max_load &&
2267 sum_nr_running > group_capacity) {
2268 max_load = avg_load;
2270 busiest_nr_running = sum_nr_running;
2271 busiest_load_per_task = sum_weighted_load;
2274 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2276 * Busy processors will not participate in power savings
2279 if (idle == CPU_NOT_IDLE ||
2280 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2284 * If the local group is idle or completely loaded
2285 * no need to do power savings balance at this domain
2287 if (local_group && (this_nr_running >= group_capacity ||
2289 power_savings_balance = 0;
2292 * If a group is already running at full capacity or idle,
2293 * don't include that group in power savings calculations
2295 if (!power_savings_balance || sum_nr_running >= group_capacity
2300 * Calculate the group which has the least non-idle load.
2301 * This is the group from where we need to pick up the load
2304 if ((sum_nr_running < min_nr_running) ||
2305 (sum_nr_running == min_nr_running &&
2306 first_cpu(group->cpumask) <
2307 first_cpu(group_min->cpumask))) {
2309 min_nr_running = sum_nr_running;
2310 min_load_per_task = sum_weighted_load /
2315 * Calculate the group which is almost near its
2316 * capacity but still has some space to pick up some load
2317 * from other group and save more power
2319 if (sum_nr_running <= group_capacity - 1) {
2320 if (sum_nr_running > leader_nr_running ||
2321 (sum_nr_running == leader_nr_running &&
2322 first_cpu(group->cpumask) >
2323 first_cpu(group_leader->cpumask))) {
2324 group_leader = group;
2325 leader_nr_running = sum_nr_running;
2330 group = group->next;
2331 } while (group != sd->groups);
2333 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
2336 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2338 if (this_load >= avg_load ||
2339 100*max_load <= sd->imbalance_pct*this_load)
2342 busiest_load_per_task /= busiest_nr_running;
2344 * We're trying to get all the cpus to the average_load, so we don't
2345 * want to push ourselves above the average load, nor do we wish to
2346 * reduce the max loaded cpu below the average load, as either of these
2347 * actions would just result in more rebalancing later, and ping-pong
2348 * tasks around. Thus we look for the minimum possible imbalance.
2349 * Negative imbalances (*we* are more loaded than anyone else) will
2350 * be counted as no imbalance for these purposes -- we can't fix that
2351 * by pulling tasks to us. Be careful of negative numbers as they'll
2352 * appear as very large values with unsigned longs.
2354 if (max_load <= busiest_load_per_task)
2358 * In the presence of smp nice balancing, certain scenarios can have
2359 * max load less than avg load (as we skip the groups at or below
2360 * their cpu_power while calculating max_load).
2362 if (max_load < avg_load) {
2364 goto small_imbalance;
2367 /* Don't want to pull so many tasks that a group would go idle */
2368 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2370 /* How much load to actually move to equalise the imbalance */
2371 *imbalance = min(max_pull * busiest->__cpu_power,
2372 (avg_load - this_load) * this->__cpu_power)
2376 * if *imbalance is less than the average load per runnable task
2377 * there is no guarantee that any tasks will be moved so we'll have
2378 * a think about bumping its value to force at least one task to be
2381 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
2382 unsigned long tmp, pwr_now, pwr_move;
2386 pwr_move = pwr_now = 0;
2388 if (this_nr_running) {
2389 this_load_per_task /= this_nr_running;
2390 if (busiest_load_per_task > this_load_per_task)
2393 this_load_per_task = SCHED_LOAD_SCALE;
2395 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2396 busiest_load_per_task * imbn) {
2397 *imbalance = busiest_load_per_task;
2402 * OK, we don't have enough imbalance to justify moving tasks,
2403 * however we may be able to increase total CPU power used by
2407 pwr_now += busiest->__cpu_power *
2408 min(busiest_load_per_task, max_load);
2409 pwr_now += this->__cpu_power *
2410 min(this_load_per_task, this_load);
2411 pwr_now /= SCHED_LOAD_SCALE;
2413 /* Amount of load we'd subtract */
2414 tmp = sg_div_cpu_power(busiest,
2415 busiest_load_per_task * SCHED_LOAD_SCALE);
2417 pwr_move += busiest->__cpu_power *
2418 min(busiest_load_per_task, max_load - tmp);
2420 /* Amount of load we'd add */
2421 if (max_load * busiest->__cpu_power <
2422 busiest_load_per_task * SCHED_LOAD_SCALE)
2423 tmp = sg_div_cpu_power(this,
2424 max_load * busiest->__cpu_power);
2426 tmp = sg_div_cpu_power(this,
2427 busiest_load_per_task * SCHED_LOAD_SCALE);
2428 pwr_move += this->__cpu_power *
2429 min(this_load_per_task, this_load + tmp);
2430 pwr_move /= SCHED_LOAD_SCALE;
2432 /* Move if we gain throughput */
2433 if (pwr_move <= pwr_now)
2436 *imbalance = busiest_load_per_task;
2442 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2443 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2446 if (this == group_leader && group_leader != group_min) {
2447 *imbalance = min_load_per_task;
2457 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2460 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2461 unsigned long imbalance, cpumask_t *cpus)
2463 struct rq *busiest = NULL, *rq;
2464 unsigned long max_load = 0;
2467 for_each_cpu_mask(i, group->cpumask) {
2470 if (!cpu_isset(i, *cpus))
2474 wl = weighted_cpuload(i);
2476 if (rq->nr_running == 1 && wl > imbalance)
2479 if (wl > max_load) {
2489 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
2490 * so long as it is large enough.
2492 #define MAX_PINNED_INTERVAL 512
/*
 * minus_1_or_zero - saturating decrement.
 * @n: value to decrement
 *
 * Returns n - 1, or 0 when n is already 0, so the unsigned argument never
 * wraps around. The callers visible in this file pass busiest->nr_running
 * to compute the task budget for move_tasks().
 */
2494 static inline unsigned long minus_1_or_zero(unsigned long n)
2496 return n > 0 ? n - 1 : 0;
2500 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2501 * tasks if there is an imbalance.
2503 static int load_balance(int this_cpu, struct rq *this_rq,
2504 struct sched_domain *sd, enum cpu_idle_type idle,
2507 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2508 struct sched_group *group;
2509 unsigned long imbalance;
2511 cpumask_t cpus = CPU_MASK_ALL;
2512 unsigned long flags;
2515 * When power savings policy is enabled for the parent domain, idle
2516 * sibling can pick up load irrespective of busy siblings. In this case,
2517 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2518 * portraying it as CPU_NOT_IDLE.
2520 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2521 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2524 schedstat_inc(sd, lb_cnt[idle]);
2527 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2534 schedstat_inc(sd, lb_nobusyg[idle]);
2538 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
2540 schedstat_inc(sd, lb_nobusyq[idle]);
2544 BUG_ON(busiest == this_rq);
2546 schedstat_add(sd, lb_imbalance[idle], imbalance);
2549 if (busiest->nr_running > 1) {
2551 * Attempt to move tasks. If find_busiest_group has found
2552 * an imbalance but busiest->nr_running <= 1, the group is
2553 * still unbalanced. nr_moved simply stays zero, so it is
2554 * correctly treated as an imbalance.
2556 local_irq_save(flags);
2557 double_rq_lock(this_rq, busiest);
2558 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2559 minus_1_or_zero(busiest->nr_running),
2560 imbalance, sd, idle, &all_pinned);
2561 double_rq_unlock(this_rq, busiest);
2562 local_irq_restore(flags);
2565 * some other cpu did the load balance for us.
2567 if (nr_moved && this_cpu != smp_processor_id())
2568 resched_cpu(this_cpu);
2570 /* All tasks on this runqueue were pinned by CPU affinity */
2571 if (unlikely(all_pinned)) {
2572 cpu_clear(cpu_of(busiest), cpus);
2573 if (!cpus_empty(cpus))
2580 schedstat_inc(sd, lb_failed[idle]);
2581 sd->nr_balance_failed++;
2583 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2585 spin_lock_irqsave(&busiest->lock, flags);
2587 /* don't kick the migration_thread, if the curr
2588 * task on busiest cpu can't be moved to this_cpu
2590 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2591 spin_unlock_irqrestore(&busiest->lock, flags);
2593 goto out_one_pinned;
2596 if (!busiest->active_balance) {
2597 busiest->active_balance = 1;
2598 busiest->push_cpu = this_cpu;
2601 spin_unlock_irqrestore(&busiest->lock, flags);
2603 wake_up_process(busiest->migration_thread);
2606 * We've kicked active balancing, reset the failure
2609 sd->nr_balance_failed = sd->cache_nice_tries+1;
2612 sd->nr_balance_failed = 0;
2614 if (likely(!active_balance)) {
2615 /* We were unbalanced, so reset the balancing interval */
2616 sd->balance_interval = sd->min_interval;
2619 * If we've begun active balancing, start to back off. This
2620 * case may not be covered by the all_pinned logic if there
2621 * is only 1 task on the busy runqueue (because we don't call
2624 if (sd->balance_interval < sd->max_interval)
2625 sd->balance_interval *= 2;
2628 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2629 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2634 schedstat_inc(sd, lb_balanced[idle]);
2636 sd->nr_balance_failed = 0;
2639 /* tune up the balancing interval */
2640 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2641 (sd->balance_interval < sd->max_interval))
2642 sd->balance_interval *= 2;
2644 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2645 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2651 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2652 * tasks if there is an imbalance.
2654 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
2655 * this_rq is locked.
/*
 * load_balance_newidle - pull work towards a CPU that is about to go idle.
 *
 * Looks up the busiest group and runqueue in sd for the CPU_NEWLY_IDLE
 * case and, if that runqueue has more than one runnable task, calls
 * move_tasks() to migrate load onto this_rq. The value returned is what
 * idle_balance() stores in pulled_task.
 * NOTE(review): short lines (braces, labels, returns, some declarations)
 * are absent from this listing, so the exit paths are not visible here.
 */
2658 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2660 struct sched_group *group;
2661 struct rq *busiest = NULL;
2662 unsigned long imbalance;
/* Candidate CPUs; the busiest CPU is cleared from this mask further down. */
2665 cpumask_t cpus = CPU_MASK_ALL;
2668 * When power savings policy is enabled for the parent domain, idle
2669 * sibling can pick up load irrespective of busy siblings. In this case,
2670 * let the state of idle sibling percolate up as IDLE, instead of
2671 * portraying it as CPU_NOT_IDLE.
2673 if (sd->flags & SD_SHARE_CPUPOWER &&
2674 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2677 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
2679 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2680 &sd_idle, &cpus, NULL);
2682 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
2686 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
2689 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
/* Pulling from our own runqueue would be a logic error. */
2693 BUG_ON(busiest == this_rq);
2695 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
2698 if (busiest->nr_running > 1) {
2699 /* Attempt to move tasks */
2700 double_lock_balance(this_rq, busiest);
2701 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2702 minus_1_or_zero(busiest->nr_running),
2703 imbalance, sd, CPU_NEWLY_IDLE, NULL);
/* Only busiest->lock is released here; this_rq->lock is not dropped. */
2704 spin_unlock(&busiest->lock);
2707 cpu_clear(cpu_of(busiest), cpus);
2708 if (!cpus_empty(cpus))
2714 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
2715 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2716 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2719 sd->nr_balance_failed = 0;
2724 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
2725 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2726 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2728 sd->nr_balance_failed = 0;
2734 * idle_balance is called by schedule() if this_cpu is about to become
2735 * idle. Attempts to pull tasks from other CPUs.
/*
 * Walks the sched domains of this_cpu. Domains flagged SD_BALANCE_NEWIDLE
 * are handed to load_balance_newidle(); for every balanced domain the
 * earliest "last_balance + interval" is tracked and may be stored in
 * this_rq->next_balance at the end.
 */
2737 static void idle_balance(int this_cpu, struct rq *this_rq)
2739 struct sched_domain *sd;
/*
 * Starts non-zero (-1): the final "pulled_task ||" test is therefore true
 * unless load_balance_newidle() actually ran and returned 0.
 */
2740 int pulled_task = -1;
/* Upper bound for the next balance: HZ jiffies (one second) from now. */
2741 unsigned long next_balance = jiffies + HZ;
2743 for_each_domain(this_cpu, sd) {
2744 unsigned long interval;
2746 if (!(sd->flags & SD_LOAD_BALANCE))
2749 if (sd->flags & SD_BALANCE_NEWIDLE)
2750 /* If we've pulled tasks over stop searching: */
2751 pulled_task = load_balance_newidle(this_cpu,
/* balance_interval is kept in milliseconds; compare in jiffies. */
2754 interval = msecs_to_jiffies(sd->balance_interval);
2755 if (time_after(next_balance, sd->last_balance + interval))
2756 next_balance = sd->last_balance + interval;
2760 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
2762 * We are going idle. next_balance may be set based on
2763 * a busy processor. So reset next_balance.
2765 this_rq->next_balance = next_balance;
2770 * active_load_balance is run by migration threads. It pushes running tasks
2771 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2772 * running on each physical CPU where possible, and avoids physical /
2773 * logical imbalances.
2775 * Called with busiest_rq locked.
/*
 * @busiest_rq:  runqueue to push a task away from (caller holds its lock,
 *               per the header comment)
 * @busiest_cpu: CPU number of busiest_rq
 *
 * The destination is busiest_rq->push_cpu, which load_balance() sets to
 * the requesting CPU when it flags active_balance.
 */
2777 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2779 int target_cpu = busiest_rq->push_cpu;
2780 struct sched_domain *sd;
2781 struct rq *target_rq;
2783 /* Is there any task to move? */
2784 if (busiest_rq->nr_running <= 1)
2787 target_rq = cpu_rq(target_cpu);
2790 * This condition is "impossible", if it occurs
2791 * we need to fix it. Originally reported by
2792 * Bjorn Helgaas on a 128-cpu setup.
2794 BUG_ON(busiest_rq == target_rq);
2796 /* move a task from busiest_rq to target_rq */
2797 double_lock_balance(busiest_rq, target_rq);
2799 /* Search for an sd spanning us and the target CPU. */
2800 for_each_domain(target_cpu, sd) {
2801 if ((sd->flags & SD_LOAD_BALANCE) &&
2802 cpu_isset(busiest_cpu, sd->span))
2807 schedstat_inc(sd, alb_cnt);
/*
 * Fourth argument is 1 - presumably the maximum number of tasks to move
 * (TODO confirm against move_tasks()). The target is balanced as CPU_IDLE.
 */
2809 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
2810 RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
2812 schedstat_inc(sd, alb_pushed);
2814 schedstat_inc(sd, alb_failed);
/* Drop only the lock taken above; busiest_rq->lock stays with the caller. */
2816 spin_unlock(&target_rq->lock);
/*
 * State shared by the idle-load-balancing (ilb) code below.
 * load_balancer: CPU number of the current ilb owner, -1 when there is none.
 * cpu_mask:      CPUs registered via select_nohz_load_balancer(); starts
 *                empty (CPU_MASK_NONE).
 */
2821 atomic_t load_balancer;
2823 } nohz ____cacheline_aligned = {
2824 .load_balancer = ATOMIC_INIT(-1),
2825 .cpu_mask = CPU_MASK_NONE,
2829 * This routine will try to nominate the ilb (idle load balancing)
2830 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
2831 * load balancing on behalf of all those cpus. If all the cpus in the system
2832 * go into this tickless mode, then there will be no ilb owner (as there is
2833 * no need for one) and all the cpus will sleep till the next wakeup event
2836 * For the ilb owner, tick is not stopped. And this tick will be used
2837 * for idle load balancing. ilb owner will still be part of
2840 * While stopping the tick, this cpu will become the ilb owner if there
2841 * is no other owner. And will be the owner till that cpu becomes busy
2842 * or if all cpus in the system stop their ticks at which point
2843 * there is no need for ilb owner.
2845 * When the ilb owner becomes busy, it nominates another owner, during the
2846 * next busy scheduler_tick()
/*
 * select_nohz_load_balancer - add/remove this CPU in nohz.cpu_mask and
 * settle who owns idle load balancing.
 * @stop_tick: presumably non-zero when the caller is about to stop this
 *             CPU's tick; the tests on it are not visible in this listing
 *             (TODO confirm).
 *
 * Ownership transitions use atomic_cmpxchg() on nohz.load_balancer and
 * compare the returned old value, so losing a race against another CPU is
 * detected. NOTE(review): return statements are not visible here.
 */
2848 int select_nohz_load_balancer(int stop_tick)
2850 int cpu = smp_processor_id();
2853 cpu_set(cpu, nohz.cpu_mask);
/* Remembered so trigger_load_balance() can react once this CPU is busy. */
2854 cpu_rq(cpu)->in_nohz_recently = 1;
2857 * If we are going offline and still the leader, give up!
2859 if (cpu_is_offline(cpu) &&
2860 atomic_read(&nohz.load_balancer) == cpu) {
2861 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2866 /* time for ilb owner also to sleep */
2867 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
2868 if (atomic_read(&nohz.load_balancer) == cpu)
2869 atomic_set(&nohz.load_balancer, -1);
2873 if (atomic_read(&nohz.load_balancer) == -1) {
2874 /* make me the ilb owner */
2875 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
2877 } else if (atomic_read(&nohz.load_balancer) == cpu)
/* From here on: this CPU is taken back out of the nohz set. */
2880 if (!cpu_isset(cpu, nohz.cpu_mask))
2883 cpu_clear(cpu, nohz.cpu_mask);
2885 if (atomic_read(&nohz.load_balancer) == cpu)
2886 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
/*
 * Taken with spin_trylock() in rebalance_domains() around load_balance()
 * for domains flagged SD_SERIALIZE.
 */
2893 static DEFINE_SPINLOCK(balancing);
2896 * It checks each scheduling domain to see if it is due to be balanced,
2897 * and initiates a balancing operation if so.
2899 * Balancing parameters are set up in arch_init_sched_domains.
/*
 * rebalance_domains - run load_balance() on every due domain of @cpu.
 * @cpu:  CPU whose sched domains are walked
 * @idle: idle state handed to load_balance(); any value other than
 *        CPU_IDLE stretches the interval by sd->busy_factor
 *
 * On the way, the earliest "last_balance + interval" is collected and
 * written to rq->next_balance.
 */
2901 static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
2904 struct rq *rq = cpu_rq(cpu);
2905 unsigned long interval;
2906 struct sched_domain *sd;
2907 /* Earliest time when we have to do rebalance again */
2908 unsigned long next_balance = jiffies + 60*HZ;
2910 for_each_domain(cpu, sd) {
2911 if (!(sd->flags & SD_LOAD_BALANCE))
2914 interval = sd->balance_interval;
/* CPUs that are not idle balance less often. */
2915 if (idle != CPU_IDLE)
2916 interval *= sd->busy_factor;
2918 /* scale ms to jiffies */
2919 interval = msecs_to_jiffies(interval);
2920 if (unlikely(!interval))
/* Cap the interval at HZ*NR_CPUS/10 jiffies. */
2922 if (interval > HZ*NR_CPUS/10)
2923 interval = HZ*NR_CPUS/10;
/* Serialized domains: try the global lock without spinning on it. */
2926 if (sd->flags & SD_SERIALIZE) {
2927 if (!spin_trylock(&balancing))
2931 if (time_after_eq(jiffies, sd->last_balance + interval)) {
2932 if (load_balance(cpu, rq, sd, idle, &balance)) {
2934 * We've pulled tasks over so either we're no
2935 * longer idle, or one of our SMT siblings is
/* Remaining (higher) domains are balanced as a busy CPU. */
2938 idle = CPU_NOT_IDLE;
2940 sd->last_balance = jiffies;
2942 if (sd->flags & SD_SERIALIZE)
2943 spin_unlock(&balancing);
2945 if (time_after(next_balance, sd->last_balance + interval))
2946 next_balance = sd->last_balance + interval;
2949 * Stop the load balance at this level. There is another
2950 * CPU in our sched group which is doing load balancing more
2956 rq->next_balance = next_balance;
2960 * run_rebalance_domains is triggered when needed from the scheduler tick.
2961 * In CONFIG_NO_HZ case, the idle load balance owner will do the
2962 * rebalancing for all the cpus for whom scheduler ticks are stopped.
/*
 * run_rebalance_domains - rebalance this CPU's domains and, when this CPU
 * was idle at the tick and is the nohz.load_balancer owner, rebalance on
 * behalf of every other CPU in nohz.cpu_mask.
 * @h: softirq action descriptor; not referenced in the visible body.
 *
 * After each remote rebalance, this_rq->next_balance is pulled forward to
 * the earliest next_balance among the CPUs served, so this CPU's tick keeps
 * driving their balancing.
 */
2964 static void run_rebalance_domains(struct softirq_action *h)
2966 int this_cpu = smp_processor_id();
2967 struct rq *this_rq = cpu_rq(this_cpu);
2968 enum cpu_idle_type idle = this_rq->idle_at_tick ?
2969 CPU_IDLE : CPU_NOT_IDLE;
2971 rebalance_domains(this_cpu, idle);
2975 * If this cpu is the owner for idle load balancing, then do the
2976 * balancing on behalf of the other idle cpus whose ticks are
2979 if (this_rq->idle_at_tick &&
2980 atomic_read(&nohz.load_balancer) == this_cpu) {
/* Private copy of the mask, so clearing ourselves stays local. */
2981 cpumask_t cpus = nohz.cpu_mask;
2985 cpu_clear(this_cpu, cpus);
2986 for_each_cpu_mask(balance_cpu, cpus) {
2988 * If this cpu gets work to do, stop the load balancing
2989 * work being done for other cpus. Next load
2990 * balancing owner will pick it up.
/*
 * The CPUs in the mask are idle with their ticks stopped, so balance them
 * with the cpu_idle_type CPU_IDLE. This call used to pass SCHED_IDLE, which
 * is a scheduling-policy constant and not a cpu_idle_type; since
 * rebalance_domains() tests "idle != CPU_IDLE", those CPUs were balanced as
 * if busy (interval multiplied by sd->busy_factor).
 */
2995 rebalance_domains(balance_cpu, CPU_IDLE);
2997 rq = cpu_rq(balance_cpu);
2998 if (time_after(this_rq->next_balance, rq->next_balance))
2999 this_rq->next_balance = rq->next_balance;
3006 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3008 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3009 * idle load balancing owner or decide to stop the periodic load balancing,
3010 * if the whole system is idle.
/*
 * trigger_load_balance - raise SCHED_SOFTIRQ once jiffies has reached
 * rq->next_balance.
 * @rq:  runqueue of @cpu
 * @cpu: CPU taking the tick (scheduler_tick() passes smp_processor_id())
 *
 * The nohz.* checks before the final test maintain idle-load-balance
 * ownership. NOTE(review): the statements guarded by several of the
 * conditions below are not visible in this listing.
 */
3012 static inline void trigger_load_balance(struct rq *rq, int cpu)
3016 * If we were in the nohz mode recently and busy at the current
3017 * scheduler tick, then check if we need to nominate new idle
3020 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3021 rq->in_nohz_recently = 0;
/* We owned ilb but are busy now: leave the nohz set and release it. */
3023 if (atomic_read(&nohz.load_balancer) == cpu) {
3024 cpu_clear(cpu, nohz.cpu_mask);
3025 atomic_set(&nohz.load_balancer, -1);
3028 if (atomic_read(&nohz.load_balancer) == -1) {
3030 * simple selection for now: Nominate the
3031 * first cpu in the nohz list to be the next
3034 * TBD: Traverse the sched domains and nominate
3035 * the nearest cpu in the nohz.cpu_mask.
3037 int ilb = first_cpu(nohz.cpu_mask);
3045 * If this cpu is idle and doing idle load balancing for all the
3046 * cpus with ticks stopped, is it time for that to stop?
3048 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3049 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3055 * If this cpu is idle and the idle load balancing is done by
3056 * someone else, then no need raise the SCHED_SOFTIRQ
3058 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3059 cpu_isset(cpu, nohz.cpu_mask))
/* time_after_eq() keeps the comparison correct across jiffies wrap. */
3062 if (time_after_eq(jiffies, rq->next_balance))
3063 raise_softirq(SCHED_SOFTIRQ);
3066 #else /* CONFIG_SMP */
3069 * on UP we do not need to balance between CPUs:
/*
 * Uniprocessor variant, same name and parameters as the SMP version so
 * schedule() can call it unconditionally. Body is not visible in this
 * listing.
 */
3071 static inline void idle_balance(int cpu, struct rq *rq)
3075 /* Avoid "used but not defined" warning on UP */
/*
 * Uniprocessor definition of balance_tasks(); per the note above it exists
 * so the symbol is defined on UP builds. NOTE(review): the body is not
 * visible in this listing.
 */
3076 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3077 unsigned long max_nr_move, unsigned long max_load_move,
3078 struct sched_domain *sd, enum cpu_idle_type idle,
3079 int *all_pinned, unsigned long *load_moved,
3080 int this_best_prio, int best_prio, int best_prio_seen,
3081 struct rq_iterator *iterator)
/*
 * Per-CPU kernel statistics. The account_*_time() functions below update
 * its cpustat member through kstat_this_cpu.
 */
3090 DEFINE_PER_CPU(struct kernel_stat, kstat);
3092 EXPORT_PER_CPU_SYMBOL(kstat);
3095 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3096 * that have not yet been banked in case the task is currently running.
/*
 * @p: task to sample.
 *
 * The sample is taken under the task's runqueue lock (task_rq_lock /
 * task_rq_unlock). NOTE(review): the local declarations, the statement
 * guarded by the delta check and the return are not visible here.
 */
3098 unsigned long long task_sched_runtime(struct task_struct *p)
3100 unsigned long flags;
3104 rq = task_rq_lock(p, &flags);
3105 ns = p->se.sum_exec_runtime;
/* Only a task that is currently running has un-banked time. */
3106 if (rq->curr == p) {
3107 delta_exec = rq_clock(rq) - p->se.exec_start;
/* Signed test: a non-positive delta is not used. */
3108 if ((s64)delta_exec > 0)
3111 task_rq_unlock(rq, &flags);
3117 * Account user cpu time to a process.
3118 * @p: the process that the cpu time gets accounted to
3120 * @cputime: the cpu time spent in user space since the last update
/*
 * Adds @cputime to p->utime and to one per-CPU cpustat bucket of the CPU
 * this runs on (kstat_this_cpu).
 */
3122 void account_user_time(struct task_struct *p, cputime_t cputime)
3124 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3127 p->utime = cputime_add(p->utime, cputime);
3129 /* Add user time to cpustat. */
3130 tmp = cputime_to_cputime64(cputime);
/*
 * Tasks with a positive nice value are charged to the "nice" bucket; the
 * "user" bucket is the alternative (the else line is not shown here).
 */
3131 if (TASK_NICE(p) > 0)
3132 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3134 cpustat->user = cputime64_add(cpustat->user, tmp);
3138 * Account system cpu time to a process.
3139 * @p: the process that the cpu time gets accounted to
3140 * @hardirq_offset: the offset to subtract from hardirq_count()
3141 * @cputime: the cpu time spent in kernel space since the last update
/*
 * Adds cputime to p->stime, then charges it to exactly one cpustat bucket.
 * The tests run in this order: hard irq, soft irq, system (any task other
 * than the idle task), iowait (idle task with nr_iowait > 0), idle.
 */
3143 void account_system_time(struct task_struct *p, int hardirq_offset,
3146 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3147 struct rq *rq = this_rq();
3150 p->stime = cputime_add(p->stime, cputime);
3152 /* Add system time to cpustat. */
3153 tmp = cputime_to_cputime64(cputime);
/* Non-zero once the caller-supplied offset is subtracted => in hard irq. */
3154 if (hardirq_count() - hardirq_offset)
3155 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3156 else if (softirq_count())
3157 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3158 else if (p != rq->idle)
3159 cpustat->system = cputime64_add(cpustat->system, tmp);
3160 else if (atomic_read(&rq->nr_iowait) > 0)
3161 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3163 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3164 /* Account for system time used */
3165 acct_update_integrals(p);
3169 * Account for involuntary wait time.
3170 * @p: the process from which the cpu time has been stolen
3171 * @steal: the cpu time spent in involuntary wait
/*
 * When @p is this CPU's idle task the time is added to p->stime and
 * charged to iowait or idle; the steal bucket is used for other tasks
 * (the else lines are not shown in this listing).
 */
3173 void account_steal_time(struct task_struct *p, cputime_t steal)
3175 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3176 cputime64_t tmp = cputime_to_cputime64(steal);
3177 struct rq *rq = this_rq();
3179 if (p == rq->idle) {
3180 p->stime = cputime_add(p->stime, steal);
/* Idle with I/O outstanding on this runqueue counts as iowait. */
3181 if (atomic_read(&rq->nr_iowait) > 0)
3182 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3184 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3186 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3190 * This function gets called by the timer code, with HZ frequency.
3191 * We call it with interrupts disabled.
3193 * It also gets called by the fork code, when changing the parent's
/*
 * Per-tick work for the local CPU: under rq->lock, give the current task's
 * scheduling class its task_tick() callback and refresh the CPU load; then
 * record whether the CPU is idle and let trigger_load_balance() decide
 * about raising the balancing softirq.
 */
3196 void scheduler_tick(void)
3198 int cpu = smp_processor_id();
3199 struct rq *rq = cpu_rq(cpu);
/* Sampled before rq->lock is taken. */
3200 struct task_struct *curr = rq->curr;
3202 spin_lock(&rq->lock);
3203 if (curr != rq->idle) /* FIXME: needed? */
3204 curr->sched_class->task_tick(rq, curr);
3205 update_cpu_load(rq);
3206 spin_unlock(&rq->lock);
/* Done after the lock is dropped; consumed by the load-balance code. */
3209 rq->idle_at_tick = idle_cpu(cpu);
3210 trigger_load_balance(rq, cpu);
3214 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
/*
 * Debug build (CONFIG_PREEMPT && CONFIG_DEBUG_PREEMPT) version: adds @val
 * to preempt_count() with sanity warnings before and after.
 */
3216 void fastcall add_preempt_count(int val)
/* A negative count before the add means an earlier underflow. */
3221 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3223 preempt_count() += val;
3225 * Spinlock count overflowing soon?
3227 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3230 EXPORT_SYMBOL(add_preempt_count);
/*
 * Debug build counterpart of add_preempt_count(): subtracts @val from
 * preempt_count() after checking that the subtraction cannot underflow.
 */
3232 void fastcall sub_preempt_count(int val)
/* Subtracting more than is currently held would underflow. */
3237 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3240 * Is the spinlock portion underflowing?
3242 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3243 !(preempt_count() & PREEMPT_MASK)))
3246 preempt_count() -= val;
3248 EXPORT_SYMBOL(sub_preempt_count);
3253 * Print scheduling while atomic bug:
/*
 * @prev: task that called schedule() while atomic.
 *
 * Logs the task's comm, the current preempt_count() and its pid, then
 * dumps the locks it holds; irq trace events are printed only when
 * interrupts are disabled.
 */
3255 static noinline void __schedule_bug(struct task_struct *prev)
3257 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3258 prev->comm, preempt_count(), prev->pid);
3259 debug_show_held_locks(prev);
3260 if (irqs_disabled())
3261 print_irqtrace_events(prev);
3266 * Various schedule()-time debugging checks and statistics:
/*
 * @prev: task entering schedule().
 *
 * Reports scheduling-while-atomic (unless prev is exiting), records a
 * SCHED_PROFILING hit for the caller's return address and bumps the
 * runqueue's sched_cnt statistic.
 */
3268 static inline void schedule_debug(struct task_struct *prev)
3271 * Test if we are atomic. Since do_exit() needs to call into
3272 * schedule() atomically, we ignore that path for now.
3273 * Otherwise, whine if we are scheduling when we should not be.
3275 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3276 __schedule_bug(prev);
3278 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3280 schedstat_inc(this_rq(), sched_cnt);
3284 * Pick up the highest-prio task:
/*
 * @rq:   runqueue to pick from
 * @prev: task being switched away from (not referenced in the visible lines)
 * @now:  timestamp forwarded to the class pick_next_task() hooks
 *
 * Fast path asks the fair class directly; otherwise the classes are tried
 * starting at sched_class_highest and following ->next.
 */
3286 static inline struct task_struct *
3287 pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3289 struct sched_class *class;
3290 struct task_struct *p;
3293 * Optimization: we know that if all tasks are in
3294 * the fair class we can call that function directly:
/* Equal counts mean every runnable task is on the cfs runqueue. */
3296 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3297 p = fair_sched_class.pick_next_task(rq, now);
3302 class = sched_class_highest;
3304 p = class->pick_next_task(rq, now);
3308 * Will never be NULL as the idle class always
3309 * returns a non-NULL p:
3311 class = class->next;
3316 * schedule() is the main scheduler function.
/*
 * Core reschedule path: under rq->lock, optionally deactivate the previous
 * task, try idle balancing when the runqueue is empty, pick the next task
 * and context-switch to it if it differs from prev.
 * NOTE(review): several declarations, assignments and braces are absent
 * from this listing.
 */
3318 asmlinkage void __sched schedule(void)
3320 struct task_struct *prev, *next;
3328 cpu = smp_processor_id();
/* Default to the involuntary switch counter; see the override below. */
3332 switch_count = &prev->nivcsw;
3334 release_kernel_lock(prev);
3335 need_resched_nonpreemptible:
3337 schedule_debug(prev);
3339 spin_lock_irq(&rq->lock);
3340 clear_tsk_need_resched(prev);
/*
 * prev is not TASK_RUNNING (state != 0) and this is not a kernel
 * preemption (PREEMPT_ACTIVE clear): a voluntary switch.
 */
3342 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
/* An interruptible sleeper with a pending signal is made runnable again. */
3343 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3344 unlikely(signal_pending(prev)))) {
3345 prev->state = TASK_RUNNING;
3347 deactivate_task(rq, prev, 1);
3349 switch_count = &prev->nvcsw;
/* Nothing runnable here: try to pull work before going idle. */
3352 if (unlikely(!rq->nr_running))
3353 idle_balance(cpu, rq);
3355 now = __rq_clock(rq);
3356 prev->sched_class->put_prev_task(rq, prev, now);
3357 next = pick_next_task(rq, prev, now);
3359 sched_info_switch(prev, next);
3361 if (likely(prev != next)) {
3366 context_switch(rq, prev, next); /* unlocks the rq */
3368 spin_unlock_irq(&rq->lock);
/* A negative return from reacquire_kernel_lock() restarts scheduling. */
3370 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3371 cpu = smp_processor_id();
3373 goto need_resched_nonpreemptible;
3375 preempt_enable_no_resched();
3376 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3379 EXPORT_SYMBOL(schedule);
3381 #ifdef CONFIG_PREEMPT
3383 * this is the entry point to schedule() from in-kernel preemption
3384 * off of preempt_enable. Kernel preemptions off return from interrupt
3385 * occur there and call schedule directly.
/*
 * Kernel-preemption entry point. Does nothing when preemption is disabled
 * (non-zero preempt_count) or interrupts are off. Otherwise PREEMPT_ACTIVE
 * is added to the preempt count around the reschedule; schedule() tests
 * that flag before treating a non-running prev->state as a voluntary sleep.
 * NOTE(review): the schedule() call, #endif lines and the retry jump are
 * not visible in this listing.
 */
3387 asmlinkage void __sched preempt_schedule(void)
3389 struct thread_info *ti = current_thread_info();
3390 #ifdef CONFIG_PREEMPT_BKL
3391 struct task_struct *task = current;
3392 int saved_lock_depth;
3395 * If there is a non-zero preempt_count or interrupts are disabled,
3396 * we do not want to preempt the current task. Just return..
3398 if (likely(ti->preempt_count || irqs_disabled()))
3402 add_preempt_count(PREEMPT_ACTIVE);
3404 * We keep the big kernel semaphore locked, but we
3405 * clear ->lock_depth so that schedule() doesnt
3406 * auto-release the semaphore:
3408 #ifdef CONFIG_PREEMPT_BKL
3409 saved_lock_depth = task->lock_depth;
3410 task->lock_depth = -1;
3413 #ifdef CONFIG_PREEMPT_BKL
/* Put back the depth saved before the reschedule. */
3414 task->lock_depth = saved_lock_depth;
3416 sub_preempt_count(PREEMPT_ACTIVE);
3418 /* we could miss a preemption opportunity between schedule and now */
3420 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3423 EXPORT_SYMBOL(preempt_schedule);
3426 * this is the entry point to schedule() from kernel preemption
3427 * off of irq context.
3428 * Note, that this is called and return with irqs disabled. This will
3429 * protect us against recursive calling from irq.
/*
 * Variant of preempt_schedule() for the interrupt-return path. Callers
 * must arrive with preempt_count == 0 and interrupts disabled (enforced by
 * the BUG_ON); interrupts are disabled again before returning.
 * NOTE(review): the lines between setting lock_depth and
 * local_irq_disable() are not visible in this listing.
 */
3431 asmlinkage void __sched preempt_schedule_irq(void)
3433 struct thread_info *ti = current_thread_info();
3434 #ifdef CONFIG_PREEMPT_BKL
3435 struct task_struct *task = current;
3436 int saved_lock_depth;
3438 /* Catch callers which need to be fixed */
3439 BUG_ON(ti->preempt_count || !irqs_disabled());
3442 add_preempt_count(PREEMPT_ACTIVE);
3444 * We keep the big kernel semaphore locked, but we
3445 * clear ->lock_depth so that schedule() doesnt
3446 * auto-release the semaphore:
3448 #ifdef CONFIG_PREEMPT_BKL
3449 saved_lock_depth = task->lock_depth;
3450 task->lock_depth = -1;
/* Restore the "irqs disabled" state this function promises on return. */
3454 local_irq_disable();
3455 #ifdef CONFIG_PREEMPT_BKL
3456 task->lock_depth = saved_lock_depth;
3458 sub_preempt_count(PREEMPT_ACTIVE);
3460 /* we could miss a preemption opportunity between schedule and now */
3462 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3466 #endif /* CONFIG_PREEMPT */
/*
 * Wake function that forwards to try_to_wake_up() for the task stored in
 * curr->private. Only mode and sync are passed on; the remaining
 * parameter (declared on a line not shown here) is not forwarded.
 * Returns try_to_wake_up()'s result.
 */
3468 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3471 return try_to_wake_up(curr->private, mode, sync);
3473 EXPORT_SYMBOL(default_wake_function);
3476 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3477 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3478 * number) then we wake all the non-exclusive tasks and one exclusive task.
3480 * There are circumstances in which we can try to wake a task which has already
3481 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3482 * zero in this (rare) case, and we handle it by continuing to scan the queue.
/*
 * @q:            wait queue to scan
 * @mode:         passed through to each entry's wake function
 * @nr_exclusive: number of exclusive waiters to wake; 0 wakes everything
 * @sync, @key:   passed through to each entry's wake function
 *
 * Does no locking itself; the callers in this file take q->lock around it.
 */
3484 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3485 int nr_exclusive, int sync, void *key)
3487 struct list_head *tmp, *next;
/* _safe iteration: the next pointer is saved before the callback runs. */
3489 list_for_each_safe(tmp, next, &q->task_list) {
3490 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
/* Read before calling func - presumably because func may drop the entry. */
3491 unsigned flags = curr->flags;
/*
 * The countdown only advances for successful wakeups of exclusive entries.
 * Starting from 0 the pre-decrement goes negative and never reaches zero,
 * so the scan is not cut short.
 */
3493 if (curr->func(curr, mode, sync, key) &&
3494 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3500 * __wake_up - wake up threads blocked on a waitqueue.
3502 * @mode: which threads
3503 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3504 * @key: is directly passed to the wakeup function
/*
 * Locked wrapper around __wake_up_common(): takes q->lock with interrupts
 * saved/disabled and performs a non-sync wakeup (sync argument 0).
 */
3506 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3507 int nr_exclusive, void *key)
3509 unsigned long flags;
3511 spin_lock_irqsave(&q->lock, flags);
3512 __wake_up_common(q, mode, nr_exclusive, 0, key);
3513 spin_unlock_irqrestore(&q->lock, flags);
3515 EXPORT_SYMBOL(__wake_up);
3518 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
/*
 * Takes no lock itself (the caller already holds q->lock). Fixed
 * arguments: nr_exclusive = 1, sync = 0, key = NULL.
 */
3520 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3522 __wake_up_common(q, mode, 1, 0, NULL);
3526 * __wake_up_sync - wake up threads blocked on a waitqueue.
3528 * @mode: which threads
3529 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3531 * The sync wakeup differs in that the waker knows that it will schedule
3532 * away soon, so while the target thread will be woken up, it will not
3533 * be migrated to another CPU - ie. the two threads are 'synchronized'
3534 * with each other. This can prevent needless bouncing between CPUs.
3536 * On UP it can prevent extra preemption.
/*
 * Same locking as __wake_up(), but passes the local "sync" value to
 * __wake_up_common() and always uses a NULL key.
 * NOTE(review): the declaration/initialisation of sync and the statement
 * guarded by the nr_exclusive test are not visible in this listing.
 */
3539 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3541 unsigned long flags;
3547 if (unlikely(!nr_exclusive))
3550 spin_lock_irqsave(&q->lock, flags);
3551 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3552 spin_unlock_irqrestore(&q->lock, flags);
3554 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
/*
 * Signals completion @x: under x->wait.lock, wakes waiters sleeping in
 * either TASK_UNINTERRUPTIBLE or TASK_INTERRUPTIBLE state.
 * NOTE(review): the update of x->done and the remaining wake-up arguments
 * are on lines not visible in this listing.
 */
3556 void fastcall complete(struct completion *x)
3558 unsigned long flags;
3560 spin_lock_irqsave(&x->wait.lock, flags);
3562 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3564 spin_unlock_irqrestore(&x->wait.lock, flags);
3566 EXPORT_SYMBOL(complete);
/*
 * Like complete(), but raises x->done by UINT_MAX/2 in one step before
 * waking waiters in either sleep state.
 */
3568 void fastcall complete_all(struct completion *x)
3570 unsigned long flags;
3572 spin_lock_irqsave(&x->wait.lock, flags);
/*
 * Presumably large enough that every current and later waiter sees a
 * non-zero count - TODO confirm against the wait side, whose checks of
 * x->done are not visible in this listing.
 */
3573 x->done += UINT_MAX/2;
3574 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3576 spin_unlock_irqrestore(&x->wait.lock, flags);
3578 EXPORT_SYMBOL(complete_all);
/*
 * Blocks the caller, uninterruptibly, on completion @x.
 * NOTE(review): the tests of x->done, the loop structure and the
 * schedule() call are on lines not visible in this listing.
 */
3580 void fastcall __sched wait_for_completion(struct completion *x)
3584 spin_lock_irq(&x->wait.lock);
3586 DECLARE_WAITQUEUE(wait, current);
/* Queued at the tail as an exclusive waiter. */
3588 wait.flags |= WQ_FLAG_EXCLUSIVE;
3589 __add_wait_queue_tail(&x->wait, &wait);
/* State is set while the lock is still held, before it is dropped to sleep. */
3591 __set_current_state(TASK_UNINTERRUPTIBLE);
3592 spin_unlock_irq(&x->wait.lock);
3594 spin_lock_irq(&x->wait.lock);
3596 __remove_wait_queue(&x->wait, &wait);
3599 spin_unlock_irq(&x->wait.lock);
3601 EXPORT_SYMBOL(wait_for_completion);
/*
 * Uninterruptible wait on @x bounded by @timeout jiffies. The remaining
 * time reported by schedule_timeout() is written back into timeout on each
 * sleep. NOTE(review): the x->done tests, the timeout check and the return
 * are not visible in this listing; the two __remove_wait_queue() calls
 * belong to different exit paths.
 */
3603 unsigned long fastcall __sched
3604 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3608 spin_lock_irq(&x->wait.lock);
3610 DECLARE_WAITQUEUE(wait, current);
3612 wait.flags |= WQ_FLAG_EXCLUSIVE;
3613 __add_wait_queue_tail(&x->wait, &wait);
3615 __set_current_state(TASK_UNINTERRUPTIBLE);
/* The lock is dropped for the duration of the sleep and retaken after. */
3616 spin_unlock_irq(&x->wait.lock);
3617 timeout = schedule_timeout(timeout);
3618 spin_lock_irq(&x->wait.lock);
3620 __remove_wait_queue(&x->wait, &wait);
3624 __remove_wait_queue(&x->wait, &wait);
3628 spin_unlock_irq(&x->wait.lock);
3631 EXPORT_SYMBOL(wait_for_completion_timeout);
/*
 * Interruptible wait on @x: sleeps in TASK_INTERRUPTIBLE and checks
 * signal_pending() before each sleep, dequeueing itself when a signal is
 * pending. NOTE(review): the return value assignments, the x->done tests
 * and the schedule() call are not visible in this listing.
 */
3633 int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3639 spin_lock_irq(&x->wait.lock);
3641 DECLARE_WAITQUEUE(wait, current);
3643 wait.flags |= WQ_FLAG_EXCLUSIVE;
3644 __add_wait_queue_tail(&x->wait, &wait);
/* Checked with x->wait.lock held, before going to sleep. */
3646 if (signal_pending(current)) {
3648 __remove_wait_queue(&x->wait, &wait);
3651 __set_current_state(TASK_INTERRUPTIBLE);
3652 spin_unlock_irq(&x->wait.lock);
3654 spin_lock_irq(&x->wait.lock);
3656 __remove_wait_queue(&x->wait, &wait);
3660 spin_unlock_irq(&x->wait.lock);
3664 EXPORT_SYMBOL(wait_for_completion_interruptible);
/*
 * Interruptible wait on @x bounded by @timeout jiffies.
 *
 * NOTE(review): on a pending signal -ERESTARTSYS is stored into the
 * unsigned long timeout, and the declared return type is unsigned long as
 * well. If that value is what gets returned (the return statement is not
 * visible in this listing), a caller has to convert the result to a signed
 * type to recognise the error.
 */
3666 unsigned long fastcall __sched
3667 wait_for_completion_interruptible_timeout(struct completion *x,
3668 unsigned long timeout)
3672 spin_lock_irq(&x->wait.lock);
3674 DECLARE_WAITQUEUE(wait, current);
3676 wait.flags |= WQ_FLAG_EXCLUSIVE;
3677 __add_wait_queue_tail(&x->wait, &wait);
3679 if (signal_pending(current)) {
/* Negative errno wraps to a very large unsigned value here. */
3680 timeout = -ERESTARTSYS;
3681 __remove_wait_queue(&x->wait, &wait);
3684 __set_current_state(TASK_INTERRUPTIBLE);
3685 spin_unlock_irq(&x->wait.lock);
3686 timeout = schedule_timeout(timeout);
3687 spin_lock_irq(&x->wait.lock);
3689 __remove_wait_queue(&x->wait, &wait);
3693 __remove_wait_queue(&x->wait, &wait);
3697 spin_unlock_irq(&x->wait.lock);
3700 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
/*
 * First half of the sleep_on() family: queue @wait on @q.
 * The lock is taken with irqsave but released with a plain spin_unlock(),
 * so the saved interrupt state stays in *flags and is only restored by the
 * matching sleep_on_tail() call.
 */
3703 sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
3705 spin_lock_irqsave(&q->lock, *flags);
3706 __add_wait_queue(q, wait);
3707 spin_unlock(&q->lock);
/*
 * Second half of the sleep_on() family: dequeue @wait from @q and restore
 * the interrupt state that sleep_on_head() saved into *flags.
 */
3711 sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
3713 spin_lock_irq(&q->lock);
3714 __remove_wait_queue(q, wait);
3715 spin_unlock_irqrestore(&q->lock, *flags);
/*
 * Marks the caller TASK_INTERRUPTIBLE, queues it on @q via sleep_on_head()
 * and removes it again via sleep_on_tail().
 * NOTE(review): the declaration of wait and the call that actually sleeps
 * between head and tail are not visible in this listing.
 */
3718 void __sched interruptible_sleep_on(wait_queue_head_t *q)
3720 unsigned long flags;
3723 init_waitqueue_entry(&wait, current);
/* State is set before the entry is put on the queue. */
3725 current->state = TASK_INTERRUPTIBLE;
3727 sleep_on_head(q, &wait, &flags);
3729 sleep_on_tail(q, &wait, &flags);
3731 EXPORT_SYMBOL(interruptible_sleep_on);
/*
 * As interruptible_sleep_on(), but sleeps through schedule_timeout() and
 * keeps its result in timeout. NOTE(review): the return statement is not
 * visible in this listing.
 */
3734 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3736 unsigned long flags;
3739 init_waitqueue_entry(&wait, current);
3741 current->state = TASK_INTERRUPTIBLE;
3743 sleep_on_head(q, &wait, &flags);
3744 timeout = schedule_timeout(timeout);
3745 sleep_on_tail(q, &wait, &flags);
3749 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
/*
 * Uninterruptible counterpart of interruptible_sleep_on(): identical
 * sequence, but the task state is TASK_UNINTERRUPTIBLE.
 * NOTE(review): the sleeping call between head and tail is not visible in
 * this listing.
 */
3751 void __sched sleep_on(wait_queue_head_t *q)
3753 unsigned long flags;
3756 init_waitqueue_entry(&wait, current);
3758 current->state = TASK_UNINTERRUPTIBLE;
3760 sleep_on_head(q, &wait, &flags);
3762 sleep_on_tail(q, &wait, &flags);
3764 EXPORT_SYMBOL(sleep_on);
/*
 * Uninterruptible sleep on @q bounded by @timeout; the value handed back
 * by schedule_timeout() is kept in timeout. NOTE(review): the return
 * statement is not visible in this listing.
 */
3766 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3768 unsigned long flags;
3771 init_waitqueue_entry(&wait, current);
3773 current->state = TASK_UNINTERRUPTIBLE;
3775 sleep_on_head(q, &wait, &flags);
3776 timeout = schedule_timeout(timeout);
3777 sleep_on_tail(q, &wait, &flags);
3781 EXPORT_SYMBOL(sleep_on_timeout);
3783 #ifdef CONFIG_RT_MUTEXES
3786 * rt_mutex_setprio - set the current priority of a task
3788 * @prio: prio value (kernel-internal form)
3790 * This function changes the 'effective' priority of a task. It does
3791 * not touch ->normal_prio like __setscheduler().
3793 * Used by the rt_mutex code to implement priority inheritance logic.
/*
 * @p:    task whose effective priority is changed
 * @prio: new priority; must lie in [0, MAX_PRIO] (BUG_ON otherwise)
 *
 * Runs under the task's runqueue lock. The task is dequeued and later
 * re-enqueued around the change, and its sched_class is set to either the
 * rt or the fair class. NOTE(review): the conditions guarding the
 * dequeue/enqueue and the class choice, and the assignment of p->prio, are
 * on lines not visible in this listing.
 */
3795 void rt_mutex_setprio(struct task_struct *p, int prio)
3797 unsigned long flags;
3802 BUG_ON(prio < 0 || prio > MAX_PRIO);
3804 rq = task_rq_lock(p, &flags);
3808 on_rq = p->se.on_rq;
3810 dequeue_task(rq, p, 0, now);
3813 p->sched_class = &rt_sched_class;
3815 p->sched_class = &fair_sched_class;
3820 enqueue_task(rq, p, 0, now);
3822 * Reschedule if we are currently running on this runqueue and
3823 * our priority decreased, or if we are not currently running on
3824 * this runqueue and our priority is higher than the current's
3826 if (task_running(rq, p)) {
/* A numerically larger prio than before triggers the resched here. */
3827 if (p->prio > oldprio)
3828 resched_task(rq->curr);
3830 check_preempt_curr(rq, p);
3833 task_rq_unlock(rq, &flags);
/*
 * @p:    task to renice
 * @nice: requested nice value; values outside [-20, 19], or equal to the
 *        current one, fail the first test below
 *
 * Under the task's runqueue lock: RT-policy tasks only get static_prio
 * updated; other tasks are dequeued, have static_prio and prio recomputed,
 * are re-enqueued, and their CPU is rescheduled when the priority change
 * calls for it. NOTE(review): the early-exit statements and the on_rq
 * conditions are on lines not visible in this listing.
 */
3838 void set_user_nice(struct task_struct *p, long nice)
3840 int old_prio, delta, on_rq;
3841 unsigned long flags;
3845 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3848 * We have to be careful, if called from sys_setpriority(),
3849 * the task might be in the middle of scheduling on another CPU.
3851 rq = task_rq_lock(p, &flags);
3854 * The RT priorities are set via sched_setscheduler(), but we still
3855 * allow the 'normal' nice value to be set - but as expected
3856 * it wont have any effect on scheduling until the task is
3857 * SCHED_FIFO/SCHED_RR:
3859 if (task_has_rt_policy(p)) {
3860 p->static_prio = NICE_TO_PRIO(nice);
3863 on_rq = p->se.on_rq;
/* Load accounting is removed together with the task ... */
3865 dequeue_task(rq, p, 0, now);
3866 dec_load(rq, p, now);
3869 p->static_prio = NICE_TO_PRIO(nice);
3872 p->prio = effective_prio(p);
/* Negative delta: the new prio value is numerically lower than the old. */
3873 delta = p->prio - old_prio;
/* ... and added back with the task after the priority change. */
3876 enqueue_task(rq, p, 0, now);
3877 inc_load(rq, p, now);
3879 * If the task increased its priority or is running and
3880 * lowered its priority, then reschedule its CPU:
3882 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3883 resched_task(rq->curr);
3886 task_rq_unlock(rq, &flags);
3888 EXPORT_SYMBOL(set_user_nice);
3891 * can_nice - check if a task can reduce its nice value
/*
 * @p:    task whose RLIMIT_NICE limit is consulted
 * @nice: nice value being requested
 *
 * Returns non-zero when the request is within the task's RLIMIT_NICE soft
 * limit or the caller has CAP_SYS_NICE.
 */
3895 int can_nice(const struct task_struct *p, const int nice)
3897 /* convert nice value [19,-20] to rlimit style value [1,40] */
/* 20 - nice: nice 19 maps to 1, nice -20 maps to 40. */
3898 int nice_rlim = 20 - nice;
3900 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3901 capable(CAP_SYS_NICE));
3904 #ifdef __ARCH_WANT_SYS_NICE
3907 * sys_nice - change the priority of the current process.
3908 * @increment: priority increment
3910 * sys_setpriority is a more generic, but much slower function that
3911 * does similar things.
/*
 * System call: adjusts the calling task's nice value by @increment.
 * A negative increment (raising priority) must pass can_nice(); the
 * security hook is consulted before set_user_nice() applies the value.
 * NOTE(review): the clamping statements, error returns and declarations
 * are on lines not visible in this listing.
 */
3913 asmlinkage long sys_nice(int increment)
3918 * Setpriority might change our priority at the same moment.
3919 * We don't have to worry. Conceptually one call occurs first
3920 * and we have a single winner.
3922 if (increment < -40)
/* New value is relative to the nice level derived from static_prio. */
3927 nice = PRIO_TO_NICE(current->static_prio) + increment;
3933 if (increment < 0 && !can_nice(current, nice))
3936 retval = security_task_setnice(current, nice);
3940 set_user_nice(current, nice);
3947 * task_prio - return the priority value of a given task.
3948 * @p: the task in question.
3950 * This is the priority value as seen by users in /proc.
3951 * RT tasks are offset by -200. Normal tasks are centered
3952 * around 0, value goes from -16 to +15.
/* Returns p->prio rebased so that MAX_RT_PRIO maps to 0. */
3954 int task_prio(const struct task_struct *p)
3956 return p->prio - MAX_RT_PRIO;
3960 * task_nice - return the nice value of a given task.
3961 * @p: the task in question.
/* Function form of the TASK_NICE() macro; exported GPL-only. */
3963 int task_nice(const struct task_struct *p)
3965 return TASK_NICE(p);
3967 EXPORT_SYMBOL_GPL(task_nice);
3970 * idle_cpu - is a given cpu idle currently?
3971 * @cpu: the processor in question.
/*
 * Non-zero when the task currently running on @cpu is that runqueue's idle
 * task. No lock is taken, so the answer is a snapshot.
 */
3973 int idle_cpu(int cpu)
3975 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
3979 * idle_task - return the idle task for a given cpu.
3980 * @cpu: the processor in question.
/* Returns the idle pointer stored in @cpu's runqueue. */
3982 struct task_struct *idle_task(int cpu)
3984 return cpu_rq(cpu)->idle;
3988 * find_process_by_pid - find a process with a matching PID value.
3989 * @pid: the pid in question.
/*
 * A pid of 0 means the calling task (current); any other value is looked
 * up with find_task_by_pid() and that result is returned as is.
 */
3991 static inline struct task_struct *find_process_by_pid(pid_t pid)
3993 return pid ? find_task_by_pid(pid) : current;
3996 /* Actually do priority change: must hold rq lock. */
/*
 * @rq:     runqueue of @p (caller holds its lock, per the note above)
 * @p:      task being changed; must not be queued (BUG_ON below)
 * @policy: new scheduling policy
 * @prio:   new RT priority, stored in p->rt_priority
 *
 * Picks the fair or rt sched_class from p->policy and recomputes
 * normal_prio and prio. NOTE(review): the assignment of p->policy and the
 * case labels of the switch are on lines not visible in this listing.
 */
3998 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4000 BUG_ON(p->se.on_rq);
4003 switch (p->policy) {
4007 p->sched_class = &fair_sched_class;
4011 p->sched_class = &rt_sched_class;
4015 p->rt_priority = prio;
4016 p->normal_prio = normal_prio(p);
4017 /* we are holding p->pi_lock already */
/* Effective priority still honours any rt_mutex priority boost. */
4018 p->prio = rt_mutex_getprio(p);
4023 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4024 * @p: the task in question.
4025 * @policy: new policy.
4026 * @param: structure containing the new RT priority.
4028 * NOTE that the task may be already dead.
4030 int sched_setscheduler(struct task_struct *p, int policy,
4031 struct sched_param *param)
4033 int retval, oldprio, oldpolicy = -1, on_rq;
4034 unsigned long flags;
4037 /* may grab non-irq protected spin_locks */
4038 BUG_ON(in_interrupt());
4040 /* double check policy once rq lock held */
4042 policy = oldpolicy = p->policy;
4043 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4044 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4045 policy != SCHED_IDLE)
4048 * Valid priorities for SCHED_FIFO and SCHED_RR are
4049 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4050 * SCHED_BATCH and SCHED_IDLE is 0.
4052 if (param->sched_priority < 0 ||
4053 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4054 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4056 if (rt_policy(policy) != (param->sched_priority != 0))
4060 * Allow unprivileged RT tasks to decrease priority:
4062 if (!capable(CAP_SYS_NICE)) {
4063 if (rt_policy(policy)) {
4064 unsigned long rlim_rtprio;
4066 if (!lock_task_sighand(p, &flags))
4068 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4069 unlock_task_sighand(p, &flags);
4071 /* can't set/change the rt policy */
4072 if (policy != p->policy && !rlim_rtprio)
4075 /* can't increase priority */
4076 if (param->sched_priority > p->rt_priority &&
4077 param->sched_priority > rlim_rtprio)
4081 * Like positive nice levels, don't allow tasks to
4082 * move out of SCHED_IDLE either:
4084 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4087 /* can't change other user's priorities */
4088 if ((current->euid != p->euid) &&
4089 (current->euid != p->uid))
4093 retval = security_task_setscheduler(p, policy, param);
4097 * make sure no PI-waiters arrive (or leave) while we are
4098 * changing the priority of the task:
4100 spin_lock_irqsave(&p->pi_lock, flags);
4102 * To be able to change p->policy safely, the appropriate
4103 * runqueue lock must be held.
4105 rq = __task_rq_lock(p);
4106 /* recheck policy now with rq lock held */
4107 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4108 policy = oldpolicy = -1;
4109 __task_rq_unlock(rq);
4110 spin_unlock_irqrestore(&p->pi_lock, flags);
4113 on_rq = p->se.on_rq;
4115 deactivate_task(rq, p, 0);
4117 __setscheduler(rq, p, policy, param->sched_priority);
4119 activate_task(rq, p, 0);
4121 * Reschedule if we are currently running on this runqueue and
4122 * our priority decreased, or if we are not currently running on
4123 * this runqueue and our priority is higher than the current's
4125 if (task_running(rq, p)) {
4126 if (p->prio > oldprio)
4127 resched_task(rq->curr);
4129 check_preempt_curr(rq, p);
4132 __task_rq_unlock(rq);
4133 spin_unlock_irqrestore(&p->pi_lock, flags);
4135 rt_mutex_adjust_pi(p);
4139 EXPORT_SYMBOL_GPL(sched_setscheduler);
4142 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4144 struct sched_param lparam;
4145 struct task_struct *p;
4148 if (!param || pid < 0)
4150 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4155 p = find_process_by_pid(pid);
4157 retval = sched_setscheduler(p, policy, &lparam);
4164 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4165 * @pid: the pid in question.
4166 * @policy: new policy.
4167 * @param: structure containing the new RT priority.
4169 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4170 struct sched_param __user *param)
4172 /* negative values for policy are not valid */
4176 return do_sched_setscheduler(pid, policy, param);
4180 * sys_sched_setparam - set/change the RT priority of a thread
4181 * @pid: the pid in question.
4182 * @param: structure containing the new RT priority.
4184 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4186 return do_sched_setscheduler(pid, -1, param);
4190 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4191 * @pid: the pid in question.
4193 asmlinkage long sys_sched_getscheduler(pid_t pid)
4195 struct task_struct *p;
4196 int retval = -EINVAL;
4202 read_lock(&tasklist_lock);
4203 p = find_process_by_pid(pid);
4205 retval = security_task_getscheduler(p);
4209 read_unlock(&tasklist_lock);
4216 * sys_sched_getparam - get the RT priority of a thread
4217 * @pid: the pid in question.
4218 * @param: structure containing the RT priority.
4220 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4222 struct sched_param lp;
4223 struct task_struct *p;
4224 int retval = -EINVAL;
4226 if (!param || pid < 0)
4229 read_lock(&tasklist_lock);
4230 p = find_process_by_pid(pid);
4235 retval = security_task_getscheduler(p);
4239 lp.sched_priority = p->rt_priority;
4240 read_unlock(&tasklist_lock);
4243 * This one might sleep, we cannot do it with a spinlock held ...
4245 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4251 read_unlock(&tasklist_lock);
4255 long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4257 cpumask_t cpus_allowed;
4258 struct task_struct *p;
4261 mutex_lock(&sched_hotcpu_mutex);
4262 read_lock(&tasklist_lock);
4264 p = find_process_by_pid(pid);
4266 read_unlock(&tasklist_lock);
4267 mutex_unlock(&sched_hotcpu_mutex);
4272 * It is not safe to call set_cpus_allowed with the
4273 * tasklist_lock held. We will bump the task_struct's
4274 * usage count and then drop tasklist_lock.
4277 read_unlock(&tasklist_lock);
4280 if ((current->euid != p->euid) && (current->euid != p->uid) &&
4281 !capable(CAP_SYS_NICE))
4284 retval = security_task_setscheduler(p, 0, NULL);
4288 cpus_allowed = cpuset_cpus_allowed(p);
4289 cpus_and(new_mask, new_mask, cpus_allowed);
4290 retval = set_cpus_allowed(p, new_mask);
4294 mutex_unlock(&sched_hotcpu_mutex);
4298 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4299 cpumask_t *new_mask)
4301 if (len < sizeof(cpumask_t)) {
4302 memset(new_mask, 0, sizeof(cpumask_t));
4303 } else if (len > sizeof(cpumask_t)) {
4304 len = sizeof(cpumask_t);
4306 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4310 * sys_sched_setaffinity - set the cpu affinity of a process
4311 * @pid: pid of the process
4312 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4313 * @user_mask_ptr: user-space pointer to the new cpu mask
4315 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4316 unsigned long __user *user_mask_ptr)
4321 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4325 return sched_setaffinity(pid, new_mask);
4329 * Represents all CPUs present in the system.
4330 * In systems capable of hotplug, this map could dynamically grow
4331 * as new CPUs are detected in the system via any platform-specific
4332 * method, such as ACPI.
4335 cpumask_t cpu_present_map __read_mostly;
4336 EXPORT_SYMBOL(cpu_present_map);
4339 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
4340 EXPORT_SYMBOL(cpu_online_map);
4342 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
4343 EXPORT_SYMBOL(cpu_possible_map);
4346 long sched_getaffinity(pid_t pid, cpumask_t *mask)
4348 struct task_struct *p;
4351 mutex_lock(&sched_hotcpu_mutex);
4352 read_lock(&tasklist_lock);
4355 p = find_process_by_pid(pid);
4359 retval = security_task_getscheduler(p);
4363 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
4366 read_unlock(&tasklist_lock);
4367 mutex_unlock(&sched_hotcpu_mutex);
4375 * sys_sched_getaffinity - get the cpu affinity of a process
4376 * @pid: pid of the process
4377 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4378 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4380 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4381 unsigned long __user *user_mask_ptr)
4386 if (len < sizeof(cpumask_t))
4389 ret = sched_getaffinity(pid, &mask);
4393 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4396 return sizeof(cpumask_t);
4400 * sys_sched_yield - yield the current processor to other threads.
4402 * This function yields the current CPU to other tasks. If there are no
4403 * other threads running on this CPU then this function will return.
4405 asmlinkage long sys_sched_yield(void)
4407 struct rq *rq = this_rq_lock();
4409 schedstat_inc(rq, yld_cnt);
4410 if (unlikely(rq->nr_running == 1))
4411 schedstat_inc(rq, yld_act_empty);
4413 current->sched_class->yield_task(rq, current);
4416 * Since we are going to call schedule() anyway, there's
4417 * no need to preempt or enable interrupts:
4419 __release(rq->lock);
4420 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4421 _raw_spin_unlock(&rq->lock);
4422 preempt_enable_no_resched();
4429 static void __cond_resched(void)
4431 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4432 __might_sleep(__FILE__, __LINE__);
4435 * The BKL might be reacquired before we have dropped
4436 * PREEMPT_ACTIVE, which could trigger a second
4437 * cond_resched() call.
4440 add_preempt_count(PREEMPT_ACTIVE);
4442 sub_preempt_count(PREEMPT_ACTIVE);
4443 } while (need_resched());
4446 int __sched cond_resched(void)
4448 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4449 system_state == SYSTEM_RUNNING) {
4455 EXPORT_SYMBOL(cond_resched);
4458 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4459 * call schedule, and on return reacquire the lock.
4461 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4462 * operations here to prevent schedule() from being called twice (once via
4463 * spin_unlock(), once by hand).
4465 int cond_resched_lock(spinlock_t *lock)
4469 if (need_lockbreak(lock)) {
4475 if (need_resched() && system_state == SYSTEM_RUNNING) {
4476 spin_release(&lock->dep_map, 1, _THIS_IP_);
4477 _raw_spin_unlock(lock);
4478 preempt_enable_no_resched();
4485 EXPORT_SYMBOL(cond_resched_lock);
4487 int __sched cond_resched_softirq(void)
4489 BUG_ON(!in_softirq());
4491 if (need_resched() && system_state == SYSTEM_RUNNING) {
4499 EXPORT_SYMBOL(cond_resched_softirq);
4502 * yield - yield the current processor to other threads.
4504 * This is a shortcut for kernel-space yielding - it marks the
4505 * thread runnable and calls sys_sched_yield().
4507 void __sched yield(void)
4509 set_current_state(TASK_RUNNING);
4512 EXPORT_SYMBOL(yield);
4515 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4516 * that process accounting knows that this is a task in IO wait state.
4518 * But don't do that if it is a deliberate, throttling IO wait (this task
4519 * has set its backing_dev_info: the queue against which it should throttle)
4521 void __sched io_schedule(void)
4523 struct rq *rq = &__raw_get_cpu_var(runqueues);
4525 delayacct_blkio_start();
4526 atomic_inc(&rq->nr_iowait);
4528 atomic_dec(&rq->nr_iowait);
4529 delayacct_blkio_end();
4531 EXPORT_SYMBOL(io_schedule);
4533 long __sched io_schedule_timeout(long timeout)
4535 struct rq *rq = &__raw_get_cpu_var(runqueues);
4538 delayacct_blkio_start();
4539 atomic_inc(&rq->nr_iowait);
4540 ret = schedule_timeout(timeout);
4541 atomic_dec(&rq->nr_iowait);
4542 delayacct_blkio_end();
4547 * sys_sched_get_priority_max - return maximum RT priority.
4548 * @policy: scheduling class.
4550 * this syscall returns the maximum rt_priority that can be used
4551 * by a given scheduling class.
4553 asmlinkage long sys_sched_get_priority_max(int policy)
4560 ret = MAX_USER_RT_PRIO-1;
4572 * sys_sched_get_priority_min - return minimum RT priority.
4573 * @policy: scheduling class.
4575 * this syscall returns the minimum rt_priority that can be used
4576 * by a given scheduling class.
4578 asmlinkage long sys_sched_get_priority_min(int policy)
4596 * sys_sched_rr_get_interval - return the default timeslice of a process.
4597 * @pid: pid of the process.
4598 * @interval: userspace pointer to the timeslice value.
4600 * this syscall writes the default timeslice value of a given process
4601 * into the user-space timespec buffer. A value of '0' means infinity.
4604 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4606 struct task_struct *p;
4607 int retval = -EINVAL;
4614 read_lock(&tasklist_lock);
4615 p = find_process_by_pid(pid);
4619 retval = security_task_getscheduler(p);
4623 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4624 0 : static_prio_timeslice(p->static_prio), &t);
4625 read_unlock(&tasklist_lock);
4626 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4630 read_unlock(&tasklist_lock);
4634 static const char stat_nam[] = "RSDTtZX";
4636 static void show_task(struct task_struct *p)
4638 unsigned long free = 0;
4641 state = p->state ? __ffs(p->state) + 1 : 0;
4642 printk("%-13.13s %c", p->comm,
4643 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4644 #if (BITS_PER_LONG == 32)
4645 if (state == TASK_RUNNING)
4646 printk(" running ");
4648 printk(" %08lX ", thread_saved_pc(p));
4650 if (state == TASK_RUNNING)
4651 printk(" running task ");
4653 printk(" %016lx ", thread_saved_pc(p));
4655 #ifdef CONFIG_DEBUG_STACK_USAGE
4657 unsigned long *n = end_of_stack(p);
4660 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4663 printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
4665 printk(" (L-TLB)\n");
4667 printk(" (NOTLB)\n");
4669 if (state != TASK_RUNNING)
4670 show_stack(p, NULL);
4673 void show_state_filter(unsigned long state_filter)
4675 struct task_struct *g, *p;
4677 #if (BITS_PER_LONG == 32)
4680 printk(" task PC stack pid father child younger older\n");
4684 printk(" task PC stack pid father child younger older\n");
4686 read_lock(&tasklist_lock);
4687 do_each_thread(g, p) {
4689 * reset the NMI-timeout, listing all files on a slow
4690 * console might take a lot of time:
4692 touch_nmi_watchdog();
4693 if (!state_filter || (p->state & state_filter))
4695 } while_each_thread(g, p);
4697 touch_all_softlockup_watchdogs();
4699 #ifdef CONFIG_SCHED_DEBUG
4700 sysrq_sched_debug_show();
4702 read_unlock(&tasklist_lock);
4704 * Only show locks if all tasks are dumped:
4706 if (state_filter == -1)
4707 debug_show_all_locks();
4710 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4712 idle->sched_class = &idle_sched_class;
4716 * init_idle - set up an idle thread for a given CPU
4717 * @idle: task in question
4718 * @cpu: cpu the idle task belongs to
4720 * NOTE: this function does not set the idle thread's NEED_RESCHED
4721 * flag, to make booting more robust.
4723 void __cpuinit init_idle(struct task_struct *idle, int cpu)
4725 struct rq *rq = cpu_rq(cpu);
4726 unsigned long flags;
4729 idle->se.exec_start = sched_clock();
4731 idle->prio = idle->normal_prio = MAX_PRIO;
4732 idle->cpus_allowed = cpumask_of_cpu(cpu);
4733 __set_task_cpu(idle, cpu);
4735 spin_lock_irqsave(&rq->lock, flags);
4736 rq->curr = rq->idle = idle;
4737 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4740 spin_unlock_irqrestore(&rq->lock, flags);
4742 /* Set the preempt count _outside_ the spinlocks! */
4743 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4744 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4746 task_thread_info(idle)->preempt_count = 0;
4749 * The idle tasks have their own, simple scheduling class:
4751 idle->sched_class = &idle_sched_class;
4755 * In a system that switches off the HZ timer nohz_cpu_mask
4756 * indicates which cpus entered this state. This is used
4757 * in the rcu update to wait only for active cpus. For system
4758 * which do not switch off the HZ timer nohz_cpu_mask should
4759 * always be CPU_MASK_NONE.
4761 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4764 * Increase the granularity value when there are more CPUs,
4765 * because with more CPUs the 'effective latency' as visible
4766 * to users decreases. But the relationship is not linear,
4767 * so pick a second-best guess by going with the log2 of the
4770 * This idea comes from the SD scheduler of Con Kolivas:
4772 static inline void sched_init_granularity(void)
4774 unsigned int factor = 1 + ilog2(num_online_cpus());
4775 const unsigned long gran_limit = 10000000;
4777 sysctl_sched_granularity *= factor;
4778 if (sysctl_sched_granularity > gran_limit)
4779 sysctl_sched_granularity = gran_limit;
4781 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4782 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4787 * This is how migration works:
4789 * 1) we queue a struct migration_req structure in the source CPU's
4790 * runqueue and wake up that CPU's migration thread.
4791 * 2) we down() the locked semaphore => thread blocks.
4792 * 3) migration thread wakes up (implicitly it forces the migrated
4793 * thread off the CPU)
4794 * 4) it gets the migration request and checks whether the migrated
4795 * task is still in the wrong runqueue.
4796 * 5) if it's in the wrong runqueue then the migration thread removes
4797 * it and puts it into the right queue.
4798 * 6) migration thread up()s the semaphore.
4799 * 7) we wake up and the migration is done.
4803 * Change a given task's CPU affinity. Migrate the thread to a
4804 * proper CPU and schedule it away if the CPU it's executing on
4805 * is removed from the allowed bitmask.
4807 * NOTE: the caller must have a valid reference to the task, the
4808 * task must not exit() & deallocate itself prematurely. The
4809 * call is not atomic; no spinlocks may be held.
4811 int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
4813 struct migration_req req;
4814 unsigned long flags;
4818 rq = task_rq_lock(p, &flags);
4819 if (!cpus_intersects(new_mask, cpu_online_map)) {
4824 p->cpus_allowed = new_mask;
4825 /* Can the task run on the task's current CPU? If so, we're done */
4826 if (cpu_isset(task_cpu(p), new_mask))
4829 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4830 /* Need help from migration thread: drop lock and wait. */
4831 task_rq_unlock(rq, &flags);
4832 wake_up_process(rq->migration_thread);
4833 wait_for_completion(&req.done);
4834 tlb_migrate_finish(p->mm);
4838 task_rq_unlock(rq, &flags);
4842 EXPORT_SYMBOL_GPL(set_cpus_allowed);
4845 * Move (not current) task off this cpu, onto dest cpu. We're doing
4846 * this because either it can't run here any more (set_cpus_allowed()
4847 * away from this CPU, or CPU going down), or because we're
4848 * attempting to rebalance this task on exec (sched_exec).
4850 * So we race with normal scheduler movements, but that's OK, as long
4851 * as the task is no longer on this CPU.
4853 * Returns non-zero if task was successfully migrated.
4855 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4857 struct rq *rq_dest, *rq_src;
4860 if (unlikely(cpu_is_offline(dest_cpu)))
4863 rq_src = cpu_rq(src_cpu);
4864 rq_dest = cpu_rq(dest_cpu);
4866 double_rq_lock(rq_src, rq_dest);
4867 /* Already moved. */
4868 if (task_cpu(p) != src_cpu)
4870 /* Affinity changed (again). */
4871 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4874 on_rq = p->se.on_rq;
4876 deactivate_task(rq_src, p, 0);
4877 set_task_cpu(p, dest_cpu);
4879 activate_task(rq_dest, p, 0);
4880 check_preempt_curr(rq_dest, p);
4884 double_rq_unlock(rq_src, rq_dest);
4889 * migration_thread - this is a highprio system thread that performs
4890 * thread migration by bumping thread off CPU then 'pushing' onto
4893 static int migration_thread(void *data)
4895 int cpu = (long)data;
4899 BUG_ON(rq->migration_thread != current);
4901 set_current_state(TASK_INTERRUPTIBLE);
4902 while (!kthread_should_stop()) {
4903 struct migration_req *req;
4904 struct list_head *head;
4908 spin_lock_irq(&rq->lock);
4910 if (cpu_is_offline(cpu)) {
4911 spin_unlock_irq(&rq->lock);
4915 if (rq->active_balance) {
4916 active_load_balance(rq, cpu);
4917 rq->active_balance = 0;
4920 head = &rq->migration_queue;
4922 if (list_empty(head)) {
4923 spin_unlock_irq(&rq->lock);
4925 set_current_state(TASK_INTERRUPTIBLE);
4928 req = list_entry(head->next, struct migration_req, list);
4929 list_del_init(head->next);
4931 spin_unlock(&rq->lock);
4932 __migrate_task(req->task, cpu, req->dest_cpu);
4935 complete(&req->done);
4937 __set_current_state(TASK_RUNNING);
4941 /* Wait for kthread_stop */
4942 set_current_state(TASK_INTERRUPTIBLE);
4943 while (!kthread_should_stop()) {
4945 set_current_state(TASK_INTERRUPTIBLE);
4947 __set_current_state(TASK_RUNNING);
4951 #ifdef CONFIG_HOTPLUG_CPU
4953 * Figure out where task on dead CPU should go, use force if necessary.
4954 * NOTE: interrupts should be disabled by the caller
4956 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
4958 unsigned long flags;
4965 mask = node_to_cpumask(cpu_to_node(dead_cpu));
4966 cpus_and(mask, mask, p->cpus_allowed);
4967 dest_cpu = any_online_cpu(mask);
4969 /* On any allowed CPU? */
4970 if (dest_cpu == NR_CPUS)
4971 dest_cpu = any_online_cpu(p->cpus_allowed);
4973 /* No more Mr. Nice Guy. */
4974 if (dest_cpu == NR_CPUS) {
4975 rq = task_rq_lock(p, &flags);
4976 cpus_setall(p->cpus_allowed);
4977 dest_cpu = any_online_cpu(p->cpus_allowed);
4978 task_rq_unlock(rq, &flags);
4981 * Don't tell them about moving exiting tasks or
4982 * kernel threads (both mm NULL), since they never
4985 if (p->mm && printk_ratelimit())
4986 printk(KERN_INFO "process %d (%s) no "
4987 "longer affine to cpu%d\n",
4988 p->pid, p->comm, dead_cpu);
4990 if (!__migrate_task(p, dead_cpu, dest_cpu))
4995 * While a dead CPU has no uninterruptible tasks queued at this point,
4996 * it might still have a nonzero ->nr_uninterruptible counter, because
4997 * for performance reasons the counter is not strictly tracking tasks to
4998 * their home CPUs. So we just add the counter to another CPU's counter,
4999 * to keep the global sum constant after CPU-down:
5001 static void migrate_nr_uninterruptible(struct rq *rq_src)
5003 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
5004 unsigned long flags;
5006 local_irq_save(flags);
5007 double_rq_lock(rq_src, rq_dest);
5008 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5009 rq_src->nr_uninterruptible = 0;
5010 double_rq_unlock(rq_src, rq_dest);
5011 local_irq_restore(flags);
5014 /* Run through task list and migrate tasks from the dead cpu. */
5015 static void migrate_live_tasks(int src_cpu)
5017 struct task_struct *p, *t;
5019 write_lock_irq(&tasklist_lock);
5021 do_each_thread(t, p) {
5025 if (task_cpu(p) == src_cpu)
5026 move_task_off_dead_cpu(src_cpu, p);
5027 } while_each_thread(t, p);
5029 write_unlock_irq(&tasklist_lock);
5033 * Schedules idle task to be the next runnable task on current CPU.
5034 * It does so by boosting its priority to highest possible and adding it to
5035 * the _front_ of the runqueue. Used by CPU offline code.
5037 void sched_idle_next(void)
5039 int this_cpu = smp_processor_id();
5040 struct rq *rq = cpu_rq(this_cpu);
5041 struct task_struct *p = rq->idle;
5042 unsigned long flags;
5044 /* cpu has to be offline */
5045 BUG_ON(cpu_online(this_cpu));
5048 * Strictly not necessary since rest of the CPUs are stopped by now
5049 * and interrupts disabled on the current cpu.
5051 spin_lock_irqsave(&rq->lock, flags);
5053 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5055 /* Add idle task to the _front_ of its priority queue: */
5056 activate_idle_task(p, rq);
5058 spin_unlock_irqrestore(&rq->lock, flags);
5062 * Ensures that the idle task is using init_mm right before its cpu goes
5065 void idle_task_exit(void)
5067 struct mm_struct *mm = current->active_mm;
5069 BUG_ON(cpu_online(smp_processor_id()));
5072 switch_mm(mm, &init_mm, current);
5076 /* called under rq->lock with disabled interrupts */
5077 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5079 struct rq *rq = cpu_rq(dead_cpu);
5081 /* Must be exiting, otherwise would be on tasklist. */
5082 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
5084 /* Cannot have done final schedule yet: would have vanished. */
5085 BUG_ON(p->state == TASK_DEAD);
5090 * Drop lock around migration; if someone else moves it,
5091 * that's OK. No task can be added to this CPU, so iteration is
5093 * NOTE: interrupts should be left disabled --dev@
5095 spin_unlock(&rq->lock);
5096 move_task_off_dead_cpu(dead_cpu, p);
5097 spin_lock(&rq->lock);
5102 /* release_task() removes task from tasklist, so we won't find dead tasks. */
5103 static void migrate_dead_tasks(unsigned int dead_cpu)
5105 struct rq *rq = cpu_rq(dead_cpu);
5106 struct task_struct *next;
5109 if (!rq->nr_running)
5111 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5114 migrate_dead(dead_cpu, next);
5117 #endif /* CONFIG_HOTPLUG_CPU */
5120 * migration_call - callback that gets triggered when a CPU is added.
5121 * Here we can start up the necessary migration thread for the new CPU.
5123 static int __cpuinit
5124 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5126 struct task_struct *p;
5127 int cpu = (long)hcpu;
5128 unsigned long flags;
5132 case CPU_LOCK_ACQUIRE:
5133 mutex_lock(&sched_hotcpu_mutex);
5136 case CPU_UP_PREPARE:
5137 case CPU_UP_PREPARE_FROZEN:
5138 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5141 p->flags |= PF_NOFREEZE;
5142 kthread_bind(p, cpu);
5143 /* Must be high prio: stop_machine expects to yield to it. */
5144 rq = task_rq_lock(p, &flags);
5145 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5146 task_rq_unlock(rq, &flags);
5147 cpu_rq(cpu)->migration_thread = p;
5151 case CPU_ONLINE_FROZEN:
5152 /* Strictly unnecessary, as first user will wake it. */
5153 wake_up_process(cpu_rq(cpu)->migration_thread);
5156 #ifdef CONFIG_HOTPLUG_CPU
5157 case CPU_UP_CANCELED:
5158 case CPU_UP_CANCELED_FROZEN:
5159 if (!cpu_rq(cpu)->migration_thread)
5161 /* Unbind it from offline cpu so it can run. Fall thru. */
5162 kthread_bind(cpu_rq(cpu)->migration_thread,
5163 any_online_cpu(cpu_online_map));
5164 kthread_stop(cpu_rq(cpu)->migration_thread);
5165 cpu_rq(cpu)->migration_thread = NULL;
5169 case CPU_DEAD_FROZEN:
5170 migrate_live_tasks(cpu);
5172 kthread_stop(rq->migration_thread);
5173 rq->migration_thread = NULL;
5174 /* Idle task back to normal (off runqueue, low prio) */
5175 rq = task_rq_lock(rq->idle, &flags);
5176 deactivate_task(rq, rq->idle, 0);
5177 rq->idle->static_prio = MAX_PRIO;
5178 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5179 rq->idle->sched_class = &idle_sched_class;
5180 migrate_dead_tasks(cpu);
5181 task_rq_unlock(rq, &flags);
5182 migrate_nr_uninterruptible(rq);
5183 BUG_ON(rq->nr_running != 0);
5185 /* No need to migrate the tasks: it was best-effort if
5186 * they didn't take sched_hotcpu_mutex. Just wake up
5187 * the requestors. */
5188 spin_lock_irq(&rq->lock);
5189 while (!list_empty(&rq->migration_queue)) {
5190 struct migration_req *req;
5192 req = list_entry(rq->migration_queue.next,
5193 struct migration_req, list);
5194 list_del_init(&req->list);
5195 complete(&req->done);
5197 spin_unlock_irq(&rq->lock);
5200 case CPU_LOCK_RELEASE:
5201 mutex_unlock(&sched_hotcpu_mutex);
5207 /* Register at highest priority so that task migration (migrate_live_tasks)
5208 * happens before everything else.
5210 static struct notifier_block __cpuinitdata migration_notifier = {
5211 .notifier_call = migration_call,
5215 int __init migration_init(void)
5217 void *cpu = (void *)(long)smp_processor_id();
5220 /* Start one for the boot CPU: */
5221 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5222 BUG_ON(err == NOTIFY_BAD);
5223 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5224 register_cpu_notifier(&migration_notifier);
5232 /* Number of possible processor ids */
5233 int nr_cpu_ids __read_mostly = NR_CPUS;
5234 EXPORT_SYMBOL(nr_cpu_ids);
5236 #undef SCHED_DOMAIN_DEBUG
5237 #ifdef SCHED_DOMAIN_DEBUG
5238 static void sched_domain_debug(struct sched_domain *sd, int cpu)
5243 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5247 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5252 struct sched_group *group = sd->groups;
5253 cpumask_t groupmask;
5255 cpumask_scnprintf(str, NR_CPUS, sd->span);
5256 cpus_clear(groupmask);
5259 for (i = 0; i < level + 1; i++)
5261 printk("domain %d: ", level);
5263 if (!(sd->flags & SD_LOAD_BALANCE)) {
5264 printk("does not load-balance\n");
5266 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5271 printk("span %s\n", str);
5273 if (!cpu_isset(cpu, sd->span))
5274 printk(KERN_ERR "ERROR: domain->span does not contain "
5276 if (!cpu_isset(cpu, group->cpumask))
5277 printk(KERN_ERR "ERROR: domain->groups does not contain"
5281 for (i = 0; i < level + 2; i++)
5287 printk(KERN_ERR "ERROR: group is NULL\n");
5291 if (!group->__cpu_power) {
5293 printk(KERN_ERR "ERROR: domain->cpu_power not "
5297 if (!cpus_weight(group->cpumask)) {
5299 printk(KERN_ERR "ERROR: empty group\n");
5302 if (cpus_intersects(groupmask, group->cpumask)) {
5304 printk(KERN_ERR "ERROR: repeated CPUs\n");
5307 cpus_or(groupmask, groupmask, group->cpumask);
5309 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5312 group = group->next;
5313 } while (group != sd->groups);
5316 if (!cpus_equal(sd->span, groupmask))
5317 printk(KERN_ERR "ERROR: groups don't span "
5325 if (!cpus_subset(groupmask, sd->span))
5326 printk(KERN_ERR "ERROR: parent span is not a superset "
5327 "of domain->span\n");
5332 # define sched_domain_debug(sd, cpu) do { } while (0)
5335 static int sd_degenerate(struct sched_domain *sd)
5337 if (cpus_weight(sd->span) == 1)
5340 /* Following flags need at least 2 groups */
5341 if (sd->flags & (SD_LOAD_BALANCE |
5342 SD_BALANCE_NEWIDLE |
5346 SD_SHARE_PKG_RESOURCES)) {
5347 if (sd->groups != sd->groups->next)
5351 /* Following flags don't use groups */
5352 if (sd->flags & (SD_WAKE_IDLE |
5361 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5363 unsigned long cflags = sd->flags, pflags = parent->flags;
5365 if (sd_degenerate(parent))
5368 if (!cpus_equal(sd->span, parent->span))
5371 /* Does parent contain flags not in child? */
5372 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5373 if (cflags & SD_WAKE_AFFINE)
5374 pflags &= ~SD_WAKE_BALANCE;
5375 /* Flags needing groups don't count if only 1 group in parent */
5376 if (parent->groups == parent->groups->next) {
5377 pflags &= ~(SD_LOAD_BALANCE |
5378 SD_BALANCE_NEWIDLE |
5382 SD_SHARE_PKG_RESOURCES);
5384 if (~cflags & pflags)
5391 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5392 * hold the hotplug lock.
5394 static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5396 struct rq *rq = cpu_rq(cpu);
5397 struct sched_domain *tmp;
5399 /* Remove the sched domains which do not contribute to scheduling. */
5400 for (tmp = sd; tmp; tmp = tmp->parent) {
5401 struct sched_domain *parent = tmp->parent;
5404 if (sd_parent_degenerate(tmp, parent)) {
5405 tmp->parent = parent->parent;
5407 parent->parent->child = tmp;
5411 if (sd && sd_degenerate(sd)) {
5417 sched_domain_debug(sd, cpu);
5419 rcu_assign_pointer(rq->sd, sd);
5422 /* cpus with isolated domains */
5423 static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
5425 /* Setup the mask of cpus configured for isolated domains */
5426 static int __init isolated_cpu_setup(char *str)
5428 int ints[NR_CPUS], i;
5430 str = get_options(str, ARRAY_SIZE(ints), ints);
5431 cpus_clear(cpu_isolated_map);
5432 for (i = 1; i <= ints[0]; i++)
5433 if (ints[i] < NR_CPUS)
5434 cpu_set(ints[i], cpu_isolated_map);
5438 __setup ("isolcpus=", isolated_cpu_setup);
5441 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5442 * to a function which identifies what group (along with sched group) a CPU
5443 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
5444 * (due to the fact that we keep track of groups covered with a cpumask_t).
5446 * init_sched_build_groups will build a circular linked list of the groups
5447 * covered by the given span, and will set each group's ->cpumask correctly,
5448 * and ->cpu_power to 0.
/*
 * @span: cpus whose groups are to be built
 * @cpu_map: passed through unchanged to group_fn
 * @group_fn: maps a cpu to its group number; it is called with a real
 *            sched_group pointer slot in the outer loop and with NULL in
 *            the inner loop, where only the group number is compared
 *
 * Outer loop: for every cpu i of the span the group number and group are
 * fetched, the group cpumask and __cpu_power are reset, and the inner loop
 * adds every cpu j of the span with the same group number to both the
 * group cpumask and the local covered mask.
 *
 * NOTE(review): the statements following the cpu_isset() and the group
 * comparison tests, and the code that links groups through first/last,
 * are on lines missing from this listing (presumably continue statements
 * and the ->next chaining) - confirm against the complete file.
 */
5451 init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5452 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5453 struct sched_group **sg))
5455 struct sched_group *first = NULL, *last = NULL;
5456 cpumask_t covered = CPU_MASK_NONE;
5459 for_each_cpu_mask(i, span) {
5460 struct sched_group *sg;
5461 int group = group_fn(i, cpu_map, &sg);
5464 if (cpu_isset(i, covered))
5467 sg->cpumask = CPU_MASK_NONE;
5468 sg->__cpu_power = 0;
5470 for_each_cpu_mask(j, span) {
5471 if (group_fn(j, cpu_map, NULL) != group)
5474 cpu_set(j, covered);
5475 cpu_set(j, sg->cpumask);
/*
 * Loop bound used by sched_domain_node_span() when collecting nodes, and
 * the multiplier in the build_sched_domains() test that decides whether an
 * allnodes domain is set up.
 */
5486 #define SD_NODES_PER_DOMAIN 16
5491 * find_next_best_node - find the next node to include in a sched_domain
5492 * @node: node whose sched_domain we're building
5493 * @used_nodes: nodes already in the sched_domain
5495 * Find the next node to include in a given scheduling domain. Simply
5496 * finds the closest node not already in the @used_nodes map.
5498 * Should use nodemask_t.
/*
 * Scans all MAX_NUMNODES node ids, starting at node and wrapping around.
 * Nodes for which nr_cpus_node() is zero and nodes whose bit is already set
 * in used_nodes are tested for first; for the others node_distance() from
 * node is compared against the running minimum min_val. The bit of
 * best_node is set in used_nodes before the function ends.
 *
 * NOTE(review): the initial value of min_val, the statements after the two
 * filter tests, the body of the val < min_val branch and the return
 * statement are on lines missing from this listing. best_node starts at 0,
 * which is what would be marked if no candidate were found - confirm
 * callers tolerate that.
 */
5500 static int find_next_best_node(int node, unsigned long *used_nodes)
5502 int i, n, val, min_val, best_node = 0;
5506 for (i = 0; i < MAX_NUMNODES; i++) {
5507 /* Start at @node */
5508 n = (node + i) % MAX_NUMNODES;
5510 if (!nr_cpus_node(n))
5513 /* Skip already used nodes */
5514 if (test_bit(n, used_nodes))
5517 /* Simple min distance search */
5518 val = node_distance(node, n);
5520 if (val < min_val) {
5526 set_bit(best_node, used_nodes);
5531 * sched_domain_node_span - get a cpumask for a node's sched_domain
5532 * @node: node whose cpumask we're constructing
5533 * The span is built from at most SD_NODES_PER_DOMAIN nodes (there is no size argument).
5535 * Given a node, construct a good cpumask for its sched_domain to span. It
5536 * should be one that prevents unnecessary balancing, but also spreads tasks
/*
 * Builds the cpumask for the sched domain of node: the cpus of node itself
 * are or-ed into span and node is marked in the local used_nodes bitmap;
 * then, SD_NODES_PER_DOMAIN - 1 times, find_next_best_node() picks a
 * further node and its cpus are or-ed in as well.
 *
 * NOTE(review): span is or-ed into itself before any visible assignment;
 * its initialisation (and the return statement) are presumably on lines
 * missing from this listing.
 */
5539 static cpumask_t sched_domain_node_span(int node)
5541 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
5542 cpumask_t span, nodemask;
5546 bitmap_zero(used_nodes, MAX_NUMNODES);
5548 nodemask = node_to_cpumask(node);
5549 cpus_or(span, span, nodemask);
5550 set_bit(node, used_nodes);
5552 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5553 int next_node = find_next_best_node(node, used_nodes);
5555 nodemask = node_to_cpumask(next_node);
5556 cpus_or(span, span, nodemask);
/*
 * Power-savings switches, both off by default. sched_power_savings_store()
 * sets them to 0 or 1 and the sysfs show handlers print them.
 */
5563 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5566 * SMT sched-domains:
/* Per-cpu domain and group storage for the SMT (sibling) level. */
5568 #ifdef CONFIG_SCHED_SMT
5569 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
5570 static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
/*
 * group_fn for the SMT level: hands back the sched_group_cpus instance of
 * cpu itself through sg.
 * NOTE(review): init_sched_build_groups() also calls its group_fn with a
 * NULL sg; a guard and the return value are presumably on the lines
 * missing around the store below - confirm.
 */
5572 static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
5573 struct sched_group **sg)
5576 *sg = &per_cpu(sched_group_cpus, cpu);
5582 * multi-core sched-domains:
/* Per-cpu domain and group storage for the multi-core level. */
5584 #ifdef CONFIG_SCHED_MC
5585 static DEFINE_PER_CPU(struct sched_domain, core_domains);
5586 static DEFINE_PER_CPU(struct sched_group, sched_group_core);
/*
 * group_fn for the multi-core level, in two build variants.
 *
 * With both MC and SMT configured, the group is keyed by the first cpu of
 * (cpu_sibling_map[cpu] & *cpu_map), so all siblings share the
 * sched_group_core instance of that first cpu.
 */
5589 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
5590 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5591 struct sched_group **sg)
5594 cpumask_t mask = cpu_sibling_map[cpu];
5595 cpus_and(mask, mask, *cpu_map);
5596 group = first_cpu(mask);
5598 *sg = &per_cpu(sched_group_core, group);
/* With MC only, every cpu uses its own sched_group_core instance. */
5601 #elif defined(CONFIG_SCHED_MC)
5602 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5603 struct sched_group **sg)
5606 *sg = &per_cpu(sched_group_core, cpu);
/* Per-cpu domain and group storage for the physical-package level. */
5611 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5612 static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
/*
 * group_fn for the physical level. The group is keyed by the first cpu of
 * the core-group map (MC builds) or of the sibling map (SMT-only builds),
 * each restricted to *cpu_map; sg receives the sched_group_phys instance
 * of that key cpu.
 * NOTE(review): the branch for builds with neither option, any NULL check
 * on sg and the return statement are on lines missing from this listing.
 */
5614 static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
5615 struct sched_group **sg)
5618 #ifdef CONFIG_SCHED_MC
5619 cpumask_t mask = cpu_coregroup_map(cpu);
5620 cpus_and(mask, mask, *cpu_map);
5621 group = first_cpu(mask);
5622 #elif defined(CONFIG_SCHED_SMT)
5623 cpumask_t mask = cpu_sibling_map[cpu];
5624 cpus_and(mask, mask, *cpu_map);
5625 group = first_cpu(mask);
5630 *sg = &per_cpu(sched_group_phys, group);
5636 * The init_sched_build_groups can't handle what we want to do with node
5637 * groups, so roll our own. Now each node has its own list of groups which
5638 * gets dynamically allocated.
/* Per-cpu node-level domains. */
5640 static DEFINE_PER_CPU(struct sched_domain, node_domains);
/*
 * Per-cpu slot holding the kzalloc-ed array of per-node group list heads;
 * build_sched_domains() stores it under the first cpu of the map it
 * builds and free_sched_groups() releases it and resets the slot to NULL.
 */
5641 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
/* Per-cpu domain and group storage for the all-nodes level. */
5643 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
5644 static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
/*
 * group_fn for the all-nodes level: the group is keyed by the first cpu of
 * (cpus of the node that cpu belongs to) & *cpu_map, and sg receives the
 * sched_group_allnodes instance of that key cpu.
 * NOTE(review): any NULL check on sg and the return statement are on lines
 * missing from this listing.
 */
5646 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
5647 struct sched_group **sg)
5649 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
5652 cpus_and(nodemask, nodemask, *cpu_map);
5653 group = first_cpu(nodemask);
5656 *sg = &per_cpu(sched_group_allnodes, group);
/*
 * Accumulates cpu power into the groups reachable from group_head. For
 * each cpu j of a group, the physical domain of j is looked up and the
 * __cpu_power of its first group is added to the group being processed.
 * The test against first_cpu() makes sure a physical group contributes
 * only through one of its cpus.
 *
 * NOTE(review): the loop construct that advances sg through the list
 * (it ends on the sg != group_head test), the statement inside the
 * first_cpu() branch and any NULL check on group_head are on lines
 * missing from this listing; build_sched_domains() calls this with
 * sched_group_nodes[i] entries that it may have set to NULL.
 */
5660 static void init_numa_sched_groups_power(struct sched_group *group_head)
5662 struct sched_group *sg = group_head;
5668 for_each_cpu_mask(j, sg->cpumask) {
5669 struct sched_domain *sd;
5671 sd = &per_cpu(phys_domains, j);
5672 if (j != first_cpu(sd->groups->cpumask)) {
5674 * Only add "power" once for each
5680 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
5683 if (sg != group_head)
/*
 * free_sched_groups - release the per-node group lists of cpu_map.
 *
 * For every cpu of the map that has an entry in sched_group_nodes_bycpu[],
 * each node with at least one cpu in the map has its group list visited,
 * after which the pointer array itself is kfree-d and the per-cpu slot is
 * reset to NULL.
 *
 * NOTE(review): the list walk that frees the individual groups (between
 * the cpus_empty() test and the oldsg comparison) is on lines missing from
 * this listing.
 */
5689 /* Free memory allocated for various sched_group structures */
5690 static void free_sched_groups(const cpumask_t *cpu_map)
5694 for_each_cpu_mask(cpu, *cpu_map) {
5695 struct sched_group **sched_group_nodes
5696 = sched_group_nodes_bycpu[cpu];
5698 if (!sched_group_nodes)
5701 for (i = 0; i < MAX_NUMNODES; i++) {
5702 cpumask_t nodemask = node_to_cpumask(i);
5703 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5705 cpus_and(nodemask, nodemask, *cpu_map);
5706 if (cpus_empty(nodemask))
5716 if (oldsg != sched_group_nodes[i])
5719 kfree(sched_group_nodes);
5720 sched_group_nodes_bycpu[cpu] = NULL;
/*
 * Second definition with the same signature; presumably the variant for
 * builds without per-node groups. The preprocessor lines selecting between
 * the two, and this body, are not visible here.
 */
5724 static void free_sched_groups(const cpumask_t *cpu_map)
5730 * Initialize sched groups cpu_power.
5732 * cpu_power indicates the capacity of sched group, which is used while
5733 * distributing the load between different sched groups in a sched domain.
5734 * Typically cpu_power for all the groups in a sched domain will be same unless
5735 * there are asymmetries in the topology. If there are asymmetries, group
5736 * having more cpu_power will pickup more load compared to the group having
5739 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
5740 * the maximum number of tasks a group can handle in the presence of other idle
5741 * or lightly loaded groups in the same sched domain.
/*
 * init_sched_groups_power - compute __cpu_power of the first group of sd.
 * @cpu: cpu the domain belongs to; work is only done for the cpu that is
 *       first in the cpumask of sd->groups
 * @sd: domain whose first group is initialised (a warning is raised when
 *      sd or sd->groups is NULL)
 *
 * The power is reset to 0 and then either set to one SCHED_LOAD_SCALE
 * (no child domain, or the non-powersavings case where the child shares
 * cpu power or package resources) or to the sum of the __cpu_power of all
 * groups of the child domain.
 *
 * NOTE(review): the assignment of child and the early returns are on lines
 * missing from this listing.
 */
5743 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5745 struct sched_domain *child;
5746 struct sched_group *group;
5748 WARN_ON(!sd || !sd->groups);
5750 if (cpu != first_cpu(sd->groups->cpumask))
5755 sd->groups->__cpu_power = 0;
5758 * For perf policy, if the groups in child domain share resources
5759 * (for example cores sharing some portions of the cache hierarchy
5760 * or SMT), then set this domain groups cpu_power such that each group
5761 * can handle only one task, when there are other idle groups in the
5762 * same sched domain.
5764 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
5766 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
5767 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
5772 * add cpu_power of each child group to this groups cpu_power
5774 group = child->groups;
5776 sg_inc_cpu_power(sd->groups, group->__cpu_power);
5777 group = group->next;
5778 } while (group != child->groups);
5782 * Build sched domains for a given set of cpus and attach the sched domains
5783 * to the individual cpus
/*
 * build_sched_domains - construct and attach the domain hierarchy.
 * @cpu_map: cpus to build domains for
 *
 * Phases visible below, in order:
 *  1. allocate the per-node array of group list heads and record it in
 *     sched_group_nodes_bycpu[] under the first cpu of the map;
 *  2. per cpu, initialise the domain levels: allnodes (only when the map
 *     holds more cpus than SD_NODES_PER_DOMAIN times the cpus of the cpu
 *     node), node, physical, multi-core and SMT sibling;
 *  3. build the groups of each level with init_sched_build_groups(), and
 *     the node groups by hand with kmalloc_node();
 *  4. compute group power, lowest level first;
 *  5. attach the lowest configured level to each cpu.
 *
 * The result is consumed as an error code by arch_init_sched_domains() and
 * partition_sched_domains().
 *
 * NOTE(review): many lines are missing from this listing (parent/child
 * linking, most preprocessor guards, the error label and every return), so
 * the free_sched_groups() call at the end is only presumably the
 * allocation-failure path.
 * NOTE(review): the second node-group allocation failure message prints
 * the loop counter j, while the node under consideration in that loop is
 * n = (i + j) % MAX_NUMNODES - check which value is meant.
 */
5785 static int build_sched_domains(const cpumask_t *cpu_map)
5789 struct sched_group **sched_group_nodes = NULL;
5790 int sd_allnodes = 0;
5793 * Allocate the per-node list of sched groups
5795 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
5797 if (!sched_group_nodes) {
5798 printk(KERN_WARNING "Can not alloc sched group node list\n");
5801 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5805 * Set up domains for cpus specified by the cpu_map.
5807 for_each_cpu_mask(i, *cpu_map) {
5808 struct sched_domain *sd = NULL, *p;
5809 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
5811 cpus_and(nodemask, nodemask, *cpu_map);
/* allnodes level: only for maps larger than one node-level span */
5814 if (cpus_weight(*cpu_map) >
5815 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
5816 sd = &per_cpu(allnodes_domains, i);
5817 *sd = SD_ALLNODES_INIT;
5818 sd->span = *cpu_map;
5819 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
/* node level: span of nearby nodes, restricted to the map */
5825 sd = &per_cpu(node_domains, i);
5827 sd->span = sched_domain_node_span(cpu_to_node(i));
5831 cpus_and(sd->span, sd->span, *cpu_map);
/* physical level: the cpus of this node that are in the map */
5835 sd = &per_cpu(phys_domains, i);
5837 sd->span = nodemask;
5841 cpu_to_phys_group(i, cpu_map, &sd->groups);
5843 #ifdef CONFIG_SCHED_MC
5845 sd = &per_cpu(core_domains, i);
5847 sd->span = cpu_coregroup_map(i);
5848 cpus_and(sd->span, sd->span, *cpu_map);
5851 cpu_to_core_group(i, cpu_map, &sd->groups);
5854 #ifdef CONFIG_SCHED_SMT
5856 sd = &per_cpu(cpu_domains, i);
5857 *sd = SD_SIBLING_INIT;
5858 sd->span = cpu_sibling_map[i];
5859 cpus_and(sd->span, sd->span, *cpu_map);
5862 cpu_to_cpu_group(i, cpu_map, &sd->groups);
5866 #ifdef CONFIG_SCHED_SMT
5867 /* Set up CPU (sibling) groups */
5868 for_each_cpu_mask(i, *cpu_map) {
5869 cpumask_t this_sibling_map = cpu_sibling_map[i];
5870 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
5871 if (i != first_cpu(this_sibling_map))
5874 init_sched_build_groups(this_sibling_map, cpu_map,
5879 #ifdef CONFIG_SCHED_MC
5880 /* Set up multi-core groups */
5881 for_each_cpu_mask(i, *cpu_map) {
5882 cpumask_t this_core_map = cpu_coregroup_map(i);
5883 cpus_and(this_core_map, this_core_map, *cpu_map);
5884 if (i != first_cpu(this_core_map))
5886 init_sched_build_groups(this_core_map, cpu_map,
5887 &cpu_to_core_group);
5891 /* Set up physical groups */
5892 for (i = 0; i < MAX_NUMNODES; i++) {
5893 cpumask_t nodemask = node_to_cpumask(i);
5895 cpus_and(nodemask, nodemask, *cpu_map);
5896 if (cpus_empty(nodemask))
5899 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
5903 /* Set up node groups */
5905 init_sched_build_groups(*cpu_map, cpu_map,
5906 &cpu_to_allnodes_group);
5908 for (i = 0; i < MAX_NUMNODES; i++) {
5909 /* Set up node groups */
5910 struct sched_group *sg, *prev;
5911 cpumask_t nodemask = node_to_cpumask(i);
5912 cpumask_t domainspan;
5913 cpumask_t covered = CPU_MASK_NONE;
5916 cpus_and(nodemask, nodemask, *cpu_map);
5917 if (cpus_empty(nodemask)) {
5918 sched_group_nodes[i] = NULL;
5922 domainspan = sched_domain_node_span(i);
5923 cpus_and(domainspan, domainspan, *cpu_map);
/* first group of the list: the cpus of node i itself */
5925 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
5927 printk(KERN_WARNING "Can not alloc domain group for "
5931 sched_group_nodes[i] = sg;
5932 for_each_cpu_mask(j, nodemask) {
5933 struct sched_domain *sd;
5935 sd = &per_cpu(node_domains, j);
5938 sg->__cpu_power = 0;
5939 sg->cpumask = nodemask;
5941 cpus_or(covered, covered, nodemask);
/* further groups: one per other node with uncovered cpus in domainspan */
5944 for (j = 0; j < MAX_NUMNODES; j++) {
5945 cpumask_t tmp, notcovered;
5946 int n = (i + j) % MAX_NUMNODES;
5948 cpus_complement(notcovered, covered);
5949 cpus_and(tmp, notcovered, *cpu_map);
5950 cpus_and(tmp, tmp, domainspan);
5951 if (cpus_empty(tmp))
5954 nodemask = node_to_cpumask(n);
5955 cpus_and(tmp, tmp, nodemask);
5956 if (cpus_empty(tmp))
5959 sg = kmalloc_node(sizeof(struct sched_group),
5963 "Can not alloc domain group for node %d\n", j);
5966 sg->__cpu_power = 0;
5968 sg->next = prev->next;
5969 cpus_or(covered, covered, tmp);
5976 /* Calculate CPU power for physical packages and nodes */
5977 #ifdef CONFIG_SCHED_SMT
5978 for_each_cpu_mask(i, *cpu_map) {
5979 struct sched_domain *sd = &per_cpu(cpu_domains, i);
5981 init_sched_groups_power(i, sd);
5984 #ifdef CONFIG_SCHED_MC
5985 for_each_cpu_mask(i, *cpu_map) {
5986 struct sched_domain *sd = &per_cpu(core_domains, i);
5988 init_sched_groups_power(i, sd);
5992 for_each_cpu_mask(i, *cpu_map) {
5993 struct sched_domain *sd = &per_cpu(phys_domains, i);
5995 init_sched_groups_power(i, sd);
5999 for (i = 0; i < MAX_NUMNODES; i++)
6000 init_numa_sched_groups_power(sched_group_nodes[i]);
6003 struct sched_group *sg;
6005 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6006 init_numa_sched_groups_power(sg);
6010 /* Attach the domains */
6011 for_each_cpu_mask(i, *cpu_map) {
6012 struct sched_domain *sd;
6013 #ifdef CONFIG_SCHED_SMT
6014 sd = &per_cpu(cpu_domains, i);
6015 #elif defined(CONFIG_SCHED_MC)
6016 sd = &per_cpu(core_domains, i);
6018 sd = &per_cpu(phys_domains, i);
6020 cpu_attach_domain(sd, i);
6027 free_sched_groups(cpu_map);
6032 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
/*
 * Builds sched domains for cpu_map minus the cpus in cpu_isolated_map and
 * keeps the result of build_sched_domains() in err (the return statement
 * itself is on a line missing from this listing).
 */
6034 static int arch_init_sched_domains(const cpumask_t *cpu_map)
6036 cpumask_t cpu_default_map;
6040 * Setup mask for cpus without special case scheduling requirements.
6041 * For now this just excludes isolated cpus, but could be used to
6042 * exclude other special cases in the future.
6044 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6046 err = build_sched_domains(&cpu_default_map);
/* Releases the group memory of cpu_map by calling free_sched_groups(). */
6051 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
6053 free_sched_groups(cpu_map);
6057 * Detach sched domains from a group of cpus specified in cpu_map
6058 * These cpus will now be attached to the NULL domain
/*
 * Order matters here: every cpu of the map is first switched to the NULL
 * domain, synchronize_sched() then waits before the group memory is
 * released through arch_destroy_sched_domains().
 */
6060 static void detach_destroy_domains(const cpumask_t *cpu_map)
6064 for_each_cpu_mask(i, *cpu_map)
6065 cpu_attach_domain(NULL, i);
6066 synchronize_sched();
6067 arch_destroy_sched_domains(cpu_map);
6071 * Partition sched domains as specified by the cpumasks below.
6072 * This attaches all cpus from the cpumasks to the NULL domain,
6073 * waits for a RCU quiescent period, recalculates sched
6074 * domain information and then attaches them back to the
6075 * correct sched domains
6076 * Call with hotplug lock held
/*
 * partition_sched_domains - rebuild domains for two cpu partitions.
 * @partition1: first cpu set; modified in place (restricted to online cpus)
 * @partition2: second cpu set; modified in place the same way
 *
 * The union of both restricted sets is detached and destroyed, then each
 * non-empty partition is rebuilt with build_sched_domains(); the second
 * one only if the first build left err at zero.
 * NOTE(review): the initial value of err and the return statement are on
 * lines missing from this listing.
 */
6078 int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6080 cpumask_t change_map;
6083 cpus_and(*partition1, *partition1, cpu_online_map);
6084 cpus_and(*partition2, *partition2, cpu_online_map);
6085 cpus_or(change_map, *partition1, *partition2);
6087 /* Detach sched domains from all of the affected cpus */
6088 detach_destroy_domains(&change_map);
6089 if (!cpus_empty(*partition1))
6090 err = build_sched_domains(partition1);
6091 if (!err && !cpus_empty(*partition2))
6092 err = build_sched_domains(partition2);
/*
 * Only built with MC or SMT support. Under sched_hotcpu_mutex, tears down
 * the domains of all online cpus and builds them again; err holds the
 * result of arch_init_sched_domains().
 */
6097 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6098 int arch_reinit_sched_domains(void)
6102 mutex_lock(&sched_hotcpu_mutex);
6103 detach_destroy_domains(&cpu_online_map);
6104 err = arch_init_sched_domains(&cpu_online_map);
6105 mutex_unlock(&sched_hotcpu_mutex);
/*
 * sched_power_savings_store - common sysfs store helper.
 * @buf: user input; only its first character is examined and it must be
 *       the digit 0 or 1
 * @count: length of the input, returned on success
 * @smt: 1 from the SMT attribute, 0 from the MC attribute; presumably
 *       selects which of the two flags is written (the selecting lines are
 *       missing from this listing)
 *
 * After the flag update the domains are rebuilt through
 * arch_reinit_sched_domains(); its non-zero result is returned instead of
 * count.
 */
6110 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6114 if (buf[0] != '0' && buf[0] != '1')
6118 sched_smt_power_savings = (buf[0] == '1');
6120 sched_mc_power_savings = (buf[0] == '1');
6122 ret = arch_reinit_sched_domains();
6124 return ret ? ret : count;
/*
 * Creates the power-savings attribute files under cls->kset.kobj: the SMT
 * one in SMT builds and the MC one in MC builds. The MC file is only
 * created when no error occurred so far and mc_capable() holds.
 * NOTE(review): the condition guarding the SMT file and the return
 * statement are on lines missing from this listing.
 */
6127 int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6131 #ifdef CONFIG_SCHED_SMT
6133 err = sysfs_create_file(&cls->kset.kobj,
6134 &attr_sched_smt_power_savings.attr);
6136 #ifdef CONFIG_SCHED_MC
6137 if (!err && mc_capable())
6138 err = sysfs_create_file(&cls->kset.kobj,
6139 &attr_sched_mc_power_savings.attr);
/*
 * sysfs attribute sched_mc_power_savings (mode 0644): show prints the flag
 * followed by a newline, store forwards to sched_power_savings_store()
 * with smt = 0.
 */
6145 #ifdef CONFIG_SCHED_MC
6146 static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6148 return sprintf(page, "%u\n", sched_mc_power_savings);
6150 static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6151 const char *buf, size_t count)
6153 return sched_power_savings_store(buf, count, 0);
6155 SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6156 sched_mc_power_savings_store);
/*
 * sysfs attribute sched_smt_power_savings (mode 0644): show prints the
 * flag followed by a newline, store forwards to
 * sched_power_savings_store() with smt = 1.
 */
6159 #ifdef CONFIG_SCHED_SMT
6160 static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6162 return sprintf(page, "%u\n", sched_smt_power_savings);
6164 static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6165 const char *buf, size_t count)
6167 return sched_power_savings_store(buf, count, 1);
6169 SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6170 sched_smt_power_savings_store);
6174 * Force a reinitialization of the sched domains hierarchy. The domains
6175 * and groups cannot be updated in place without racing with the balancing
6176 * code, so we temporarily attach all running cpus to the NULL domain
6177 * which will prevent rebalancing while the sched domains are recalculated.
/*
 * update_sched_domains - cpu hotplug notifier (registered by
 * sched_init_smp()).
 * @nfb: notifier block, not referenced in the visible lines
 * @action: hotplug event
 * @hcpu: event payload, not referenced in the visible lines
 *
 * The UP_PREPARE and DOWN_PREPARE events (and their FROZEN forms) tear the
 * domains of all online cpus down. The second group of events leads to
 * arch_init_sched_domains() for the online cpus.
 *
 * NOTE(review): the switch statement, some case labels, the default branch
 * and the return values are on lines missing from this listing.
 */
6179 static int update_sched_domains(struct notifier_block *nfb,
6180 unsigned long action, void *hcpu)
6183 case CPU_UP_PREPARE:
6184 case CPU_UP_PREPARE_FROZEN:
6185 case CPU_DOWN_PREPARE:
6186 case CPU_DOWN_PREPARE_FROZEN:
6187 detach_destroy_domains(&cpu_online_map);
6190 case CPU_UP_CANCELED:
6191 case CPU_UP_CANCELED_FROZEN:
6192 case CPU_DOWN_FAILED:
6193 case CPU_DOWN_FAILED_FROZEN:
6195 case CPU_ONLINE_FROZEN:
6197 case CPU_DEAD_FROZEN:
6199 * Fall through and re-initialise the domains.
6206 /* The hotplug lock is already held by cpu_up/cpu_down */
6207 arch_init_sched_domains(&cpu_online_map);
/*
 * sched_init_smp - SMP variant.
 *
 * Under sched_hotcpu_mutex: builds the domains for the online cpus and
 * computes the set of possible cpus that are not isolated, falling back to
 * the current cpu when that set is empty. Afterwards the hotplug notifier
 * update_sched_domains() is registered, the calling task is restricted to
 * the non-isolated cpus and sched_init_granularity() is run.
 * NOTE(review): the reaction to a set_cpus_allowed() failure is on a line
 * missing from this listing.
 */
6212 void __init sched_init_smp(void)
6214 cpumask_t non_isolated_cpus;
6216 mutex_lock(&sched_hotcpu_mutex);
6217 arch_init_sched_domains(&cpu_online_map);
6218 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6219 if (cpus_empty(non_isolated_cpus))
6220 cpu_set(smp_processor_id(), non_isolated_cpus);
6221 mutex_unlock(&sched_hotcpu_mutex);
6222 /* XXX: Theoretical race here - CPU may be hotplugged now */
6223 hotcpu_notifier(update_sched_domains, 0);
6225 /* Move init over to a non-isolated CPU */
6226 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6228 sched_init_granularity();
/* Variant for the other side of CONFIG_SMP: only the granularity setup. */
6231 void __init sched_init_smp(void)
6233 sched_init_granularity();
6235 #endif /* CONFIG_SMP */
/*
 * in_sched_functions - classify a text address.
 * @addr: address to test
 *
 * Returns non-zero when in_lock_functions() accepts addr or when addr lies
 * in the half-open range from __sched_text_start to __sched_text_end.
 */
6237 int in_sched_functions(unsigned long addr)
6239 /* Linker adds these: start and end of __sched functions */
6240 extern char __sched_text_start[], __sched_text_end[];
6242 return in_lock_functions(addr) ||
6243 (addr >= (unsigned long)__sched_text_start
6244 && addr < (unsigned long)__sched_text_end);
/*
 * Resets a cfs_rq: empty rbtree for tasks_timeline and fair_clock = 1.
 * NOTE(review): rq is presumably used by the CONFIG_FAIR_GROUP_SCHED part,
 * whose body is on lines missing from this listing.
 */
6247 static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6249 cfs_rq->tasks_timeline = RB_ROOT;
6250 cfs_rq->fair_clock = 1;
6251 #ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * sched_init - scheduler initialisation.
 *
 * Visible steps, in order:
 *  - chain the scheduling classes: rt, then fair, then idle, then NULL;
 *  - for every possible cpu, set up the runqueue: lock and lockdep class,
 *    cfs_rq, load-update timestamps (taken from one sched_clock() reading),
 *    cpu_load[] history, balancing state, migration queue, nr_iowait and
 *    the rt priority array (all queues empty, bitmap cleared, one delimiter
 *    bit set at MAX_RT_PRIO);
 *  - set the load weight of init_task;
 *  - derive nr_cpu_ids from highest_cpu and register run_rebalance_domains
 *    for SCHED_SOFTIRQ;
 *  - initialise the pi_waiters list of init_task (RT mutex builds);
 *  - take a reference on init_mm and enter lazy TLB mode;
 *  - turn the current task into the idle thread of this cpu, then switch
 *    its sched_class to the fair class for early boot.
 *
 * NOTE(review): the rq assignment, the update of highest_cpu and several
 * preprocessor guards are on lines missing from this listing.
 */
6256 void __init sched_init(void)
6258 u64 now = sched_clock();
6259 int highest_cpu = 0;
6263 * Link up the scheduling class hierarchy:
6265 rt_sched_class.next = &fair_sched_class;
6266 fair_sched_class.next = &idle_sched_class;
6267 idle_sched_class.next = NULL;
6269 for_each_possible_cpu(i) {
6270 struct rt_prio_array *array;
6274 spin_lock_init(&rq->lock);
6275 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
6278 init_cfs_rq(&rq->cfs, rq);
6279 #ifdef CONFIG_FAIR_GROUP_SCHED
6280 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6281 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6283 rq->ls.load_update_last = now;
6284 rq->ls.load_update_start = now;
6286 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6287 rq->cpu_load[j] = 0;
6290 rq->active_balance = 0;
6291 rq->next_balance = jiffies;
6294 rq->migration_thread = NULL;
6295 INIT_LIST_HEAD(&rq->migration_queue);
6297 atomic_set(&rq->nr_iowait, 0);
6299 array = &rq->rt.active;
6300 for (j = 0; j < MAX_RT_PRIO; j++) {
6301 INIT_LIST_HEAD(array->queue + j);
6302 __clear_bit(j, array->bitmap);
6305 /* delimiter for bitsearch: */
6306 __set_bit(MAX_RT_PRIO, array->bitmap);
6309 set_load_weight(&init_task);
6312 nr_cpu_ids = highest_cpu + 1;
6313 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6316 #ifdef CONFIG_RT_MUTEXES
6317 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6321 * The boot idle thread does lazy MMU switching as well:
6323 atomic_inc(&init_mm.mm_count);
6324 enter_lazy_tlb(&init_mm, current);
6327 * Make us the idle thread. Technically, schedule() should not be
6328 * called from this thread, however somewhere below it might be,
6329 * but because we are the idle thread, we just pick up running again
6330 * when this runqueue becomes "idle".
6332 init_idle(current, smp_processor_id());
6334 * During early bootup we pretend to be a normal task:
6336 current->sched_class = &fair_sched_class;
/*
 * __might_sleep - debug check, built with CONFIG_DEBUG_SPINLOCK_SLEEP.
 * @file: source file name printed in the report
 * @line: source line printed in the report
 *
 * A report is produced only when the caller is atomic or has interrupts
 * disabled, the system state is SYSTEM_RUNNING and no oops is in progress.
 * prev_jiffy remembers the time of the last report; the time_before() test
 * presumably suppresses further reports within HZ jiffies (the statement
 * it guards is on a line missing from this listing). The report consists
 * of the BUG message, the atomic/irq state, the held locks and, with
 * interrupts disabled, the irq trace events.
 */
6339 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6340 void __might_sleep(char *file, int line)
6343 static unsigned long prev_jiffy; /* ratelimiting */
6345 if ((in_atomic() || irqs_disabled()) &&
6346 system_state == SYSTEM_RUNNING && !oops_in_progress) {
6347 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6349 prev_jiffy = jiffies;
6350 printk(KERN_ERR "BUG: sleeping function called from invalid"
6351 " context at %s:%d\n", file, line);
6352 printk("in_atomic():%d, irqs_disabled():%d\n",
6353 in_atomic(), irqs_disabled());
6354 debug_show_held_locks(current);
6355 if (irqs_disabled())
6356 print_irqtrace_events(current);
6361 EXPORT_SYMBOL(__might_sleep);
/*
 * normalize_rt_tasks - built with CONFIG_MAGIC_SYSRQ.
 *
 * Iterates over every thread with tasklist_lock read-held. Per task:
 *  - the scheduling-entity timestamps and wait_runtime are zeroed, as are
 *    fair_clock and clock of the runqueue the task is on;
 *  - a task with negative nice and an mm is reniced to 0;
 *  - with pi_lock and the runqueue lock held, and skipping the migration
 *    thread, the task is switched to SCHED_NORMAL; it is deactivated
 *    before and reactivated after the switch, followed by a reschedule of
 *    the current task of that runqueue.
 *
 * NOTE(review): the conditions selecting which tasks take which path (for
 * example around on_rq and the renice step) and the jump label used for
 * the migration thread are on lines missing from this listing.
 */
6364 #ifdef CONFIG_MAGIC_SYSRQ
6365 void normalize_rt_tasks(void)
6367 struct task_struct *g, *p;
6368 unsigned long flags;
6372 read_lock_irq(&tasklist_lock);
6373 do_each_thread(g, p) {
6375 p->se.wait_runtime = 0;
6376 p->se.wait_start_fair = 0;
6377 p->se.wait_start = 0;
6378 p->se.exec_start = 0;
6379 p->se.sleep_start = 0;
6380 p->se.sleep_start_fair = 0;
6381 p->se.block_start = 0;
6382 task_rq(p)->cfs.fair_clock = 0;
6383 task_rq(p)->clock = 0;
6387 * Renice negative nice level userspace
6390 if (TASK_NICE(p) < 0 && p->mm)
6391 set_user_nice(p, 0);
6395 spin_lock_irqsave(&p->pi_lock, flags);
6396 rq = __task_rq_lock(p);
6399 * Do not touch the migration thread:
6401 if (p == rq->migration_thread)
6405 on_rq = p->se.on_rq;
6407 deactivate_task(task_rq(p), p, 0);
6408 __setscheduler(rq, p, SCHED_NORMAL, 0);
6410 activate_task(task_rq(p), p, 0);
6411 resched_task(rq->curr);
6416 __task_rq_unlock(rq);
6417 spin_unlock_irqrestore(&p->pi_lock, flags);
6418 } while_each_thread(g, p);
6420 read_unlock_irq(&tasklist_lock);
6423 #endif /* CONFIG_MAGIC_SYSRQ */
6427 * These functions are only useful for the IA64 MCA handling.
6429 * They can only be called when the whole system has been
6430 * stopped - every CPU needs to be quiescent, and no scheduling
6431 * activity can take place. Using them for anything else would
6432 * be a serious bug, and as a result, they aren't even visible
6433 * under any other configuration.
6437 * curr_task - return the current task for a given cpu.
6438 * @cpu: the processor in question.
6440 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
/* Thin accessor: returns whatever cpu_curr() yields for the given cpu. */
6442 struct task_struct *curr_task(int cpu)
6444 return cpu_curr(cpu);
6448 * set_curr_task - set the current task for a given cpu.
6449 * @cpu: the processor in question.
6450 * @p: the task pointer to set.
6452 * Description: This function must only be used when non-maskable interrupts
6453 * are serviced on a separate stack. It allows the architecture to switch the
6454 * notion of the current task on a cpu in a non-blocking manner. This function
6455 * must be called with all CPUs synchronized and interrupts disabled, and
6456 * the caller must save the original value of the current task (see
6457 * curr_task() above) and restore that value before reenabling interrupts and
6458 * re-starting the system.
6460 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6462 void set_curr_task(int cpu, struct task_struct *p)