kernel/sched.c

   1 /*
   2  *  kernel/sched.c
   3  *
   4  *  Kernel scheduler and related syscalls
   5  *
   6  *  Copyright (C) 1991-2002  Linus Torvalds
   7  *
   8  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
   9  *              make semaphores SMP safe
  10  *  1998-11-19  Implemented schedule_timeout() and related stuff
  11  *              by Andrea Arcangeli
  12  *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
  13  *              hybrid priority-list and round-robin design with
  14  *              an array-switch method of distributing timeslices
  15  *              and per-CPU runqueues.  Cleanups and useful suggestions
  16  *              by Davide Libenzi, preemptible kernel bits by Robert Love.
  17  *  2003-09-03  Interactivity tuning by Con Kolivas.
  18  *  2004-04-02  Scheduler domains code by Nick Piggin
  19  */
  20
  21 #include <linux/mm.h>
  22 #include <linux/module.h>
  23 #include <linux/nmi.h>
  24 #include <linux/init.h>
  25 #include <linux/uaccess.h>
  26 #include <linux/highmem.h>
  27 #include <linux/smp_lock.h>
  28 #include <asm/mmu_context.h>
  29 #include <linux/interrupt.h>
  30 #include <linux/capability.h>
  31 #include <linux/completion.h>
  32 #include <linux/kernel_stat.h>
  33 #include <linux/debug_locks.h>
  34 #include <linux/security.h>
  35 #include <linux/notifier.h>
  36 #include <linux/profile.h>
  37 #include <linux/freezer.h>
  38 #include <linux/vmalloc.h>
  39 #include <linux/blkdev.h>
  40 #include <linux/delay.h>
  41 #include <linux/smp.h>
  42 #include <linux/threads.h>
  43 #include <linux/timer.h>
  44 #include <linux/rcupdate.h>
  45 #include <linux/cpu.h>
  46 #include <linux/cpuset.h>
  47 #include <linux/percpu.h>
  48 #include <linux/kthread.h>
  49 #include <linux/seq_file.h>
  50 #include <linux/syscalls.h>
  51 #include <linux/times.h>
  52 #include <linux/tsacct_kern.h>
  53 #include <linux/kprobes.h>
  54 #include <linux/delayacct.h>
  55 #include <linux/reciprocal_div.h>
  56 #include <linux/unistd.h>
  57
  58 #include <asm/tlb.h>
  59
  60 /*
  61  * Scheduler clock - returns current time in nanosec units.
  62  * This is default implementation.
  63  * Architectures and sub-architectures can override this.
  64  */
  65 unsigned long long __attribute__((weak)) sched_clock(void)
  66 {
  67         return (unsigned long long)jiffies * (1000000000 / HZ);
  68 }
  69
  70 /*
  71  * Convert user-nice values [ -20 ... 0 ... 19 ]
  72  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
  73  * and back.
  74  */
  75 #define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
  76 #define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
  77 #define TASK_NICE(p)            PRIO_TO_NICE((p)->static_prio)
  78
/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters,
 * it's a [ 0 ... 39 ] range.
 *
 * USER_PRIO() takes a priority value, TASK_USER_PRIO() takes a task
 * pointer and uses its static_prio, and MAX_USER_PRIO is the user
 * priority that corresponds to MAX_PRIO.
 */
#define USER_PRIO(p)            ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p)       USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO           (USER_PRIO(MAX_PRIO))
  87
/*
 * Some helpers for converting nanosecond timing to jiffy resolution.
 * Both use the integer quotient (1000000000 / HZ) as the length of one
 * jiffy in nanoseconds, and NS_TO_JIFFIES() divides by it with the usual
 * truncation of the argument's type.
 */
#define NS_TO_JIFFIES(TIME)     ((TIME) / (1000000000 / HZ))
#define JIFFIES_TO_NS(TIME)     ((TIME) * (1000000000 / HZ))

/* The load weight of a nice-0 task is the load-balancer scale unit. */
#define NICE_0_LOAD             SCHED_LOAD_SCALE
#define NICE_0_SHIFT            SCHED_LOAD_SHIFT
  96
/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
 * Timeslices get refilled after they expire.
 *
 * Both values below are expressed in jiffies.
 */
#define MIN_TIMESLICE           max(5 * HZ / 1000, 1)
#define DEF_TIMESLICE           (100 * HZ / 1000)
 106
 107 #ifdef CONFIG_SMP
/*
 * sg_div_cpu_power - divide a load by a sched group's cpu_power.
 * @sg:   the sched group whose cpu_power is the divisor
 * @load: the load to divide
 *
 * Computes load / sg->__cpu_power.  Since cpu_power is a 'constant',
 * the division is done with the precomputed reciprocal stored in
 * sg->reciprocal_cpu_power instead of a real divide.
 */
static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
{
        return reciprocal_divide(load, sg->reciprocal_cpu_power);
}
 116
 117 /*
 118  * Each time a sched group cpu_power is changed,
 119  * we must compute its reciprocal value
 120  */
 121 static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
 122 {
 123         sg->__cpu_power += val;
 124         sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
 125 }
 126 #endif
 127
 128 #define SCALE_PRIO(x, prio) \
 129         max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
 130
 131 /*
 132  * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
 133  * to time slice values: [800ms ... 100ms ... 5ms]
 134  */
 135 static unsigned int static_prio_timeslice(int static_prio)
 136 {
 137         if (static_prio == NICE_TO_PRIO(19))
 138                 return 1;
 139
 140         if (static_prio < NICE_TO_PRIO(0))
 141                 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
 142         else
 143                 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
 144 }
 145
 146 static inline int rt_policy(int policy)
 147 {
 148         if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
 149                 return 1;
 150         return 0;
 151 }
 152
/*
 * task_has_rt_policy - does task @p use a real-time scheduling policy?
 *
 * Returns the result of rt_policy() applied to p->policy.
 */
static inline int task_has_rt_policy(struct task_struct *p)
{
        return rt_policy(p->policy);
}
 157
/*
 * This is the priority-queue data structure of the RT scheduling class:
 * a bitmap with one bit per RT priority plus one delimiter bit, and one
 * list head per RT priority.
 */
struct rt_prio_array {
        DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
        struct list_head queue[MAX_RT_PRIO];
};
 165
 166 struct load_stat {
 167         struct load_weight load;
 168         u64 load_update_start, load_update_last;
 169         unsigned long delta_fair, delta_exec, delta_stat;
 170 };
 171
 172 /* CFS-related fields in a runqueue */
 173 struct cfs_rq {
 174         struct load_weight load;
 175         unsigned long nr_running;
 176
 177         s64 fair_clock;
 178         u64 exec_clock;
 179         s64 wait_runtime;
 180         u64 sleeper_bonus;
 181         unsigned long wait_runtime_overruns, wait_runtime_underruns;
 182
 183         struct rb_root tasks_timeline;
 184         struct rb_node *rb_leftmost;
 185         struct rb_node *rb_load_balance_curr;
 186 #ifdef CONFIG_FAIR_GROUP_SCHED
 187         /* 'curr' points to currently running entity on this cfs_rq.
 188          * It is set to NULL otherwise (i.e when none are currently running).
 189          */
 190         struct sched_entity *curr;
 191         struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */
 192
 193         /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
 194          * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
 195          * (like users, containers etc.)
 196          *
 197          * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
 198          * list is used during load balance.
 199          */
 200         struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
 201 #endif
 202 };
 203
 204 /* Real-Time classes' related field in a runqueue: */
 205 struct rt_rq {
 206         struct rt_prio_array active;
 207         int rt_load_balance_idx;
 208         struct list_head *rt_load_balance_head, *rt_load_balance_curr;
 209 };
 210
 211 /*
 212  * This is the main, per-CPU runqueue data structure.
 213  *
 214  * Locking rule: those places that want to lock multiple runqueues
 215  * (such as the load balancing or the thread migration code), lock
 216  * acquire operations must be ordered by ascending &runqueue.
 217  */
 218 struct rq {
 219         spinlock_t lock;        /* runqueue lock */
 220
 221         /*
 222          * nr_running and cpu_load should be in the same cacheline because
 223          * remote CPUs use both these fields when doing load calculation.
 224          */
 225         unsigned long nr_running;
 226         #define CPU_LOAD_IDX_MAX 5
 227         unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 228         unsigned char idle_at_tick;
 229 #ifdef CONFIG_NO_HZ
 230         unsigned char in_nohz_recently;
 231 #endif
 232         struct load_stat ls;    /* capture load from *all* tasks on this cpu */
 233         unsigned long nr_load_updates;
 234         u64 nr_switches;
 235
 236         struct cfs_rq cfs;
 237 #ifdef CONFIG_FAIR_GROUP_SCHED
 238         struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
 239 #endif
 240         struct rt_rq  rt;
 241
 242         /*
 243          * This is part of a global counter where only the total sum
 244          * over all CPUs matters. A task can increase this counter on
 245          * one CPU and if it got migrated afterwards it may decrease
 246          * it on another CPU. Always updated under the runqueue lock:
 247          */
 248         unsigned long nr_uninterruptible;
 249
 250         struct task_struct *curr, *idle;
 251         unsigned long next_balance;
 252         struct mm_struct *prev_mm;
 253
 254         u64 clock, prev_clock_raw;
 255         s64 clock_max_delta;
 256
 257         unsigned int clock_warps, clock_overflows;
 258         unsigned int clock_unstable_events;
 259
 260         struct sched_class *load_balance_class;
 261
 262         atomic_t nr_iowait;
 263
 264 #ifdef CONFIG_SMP
 265         struct sched_domain *sd;
 266
 267         /* For active balancing */
 268         int active_balance;
 269         int push_cpu;
 270         int cpu;                /* cpu of this runqueue */
 271
 272         struct task_struct *migration_thread;
 273         struct list_head migration_queue;
 274 #endif
 275
 276 #ifdef CONFIG_SCHEDSTATS
 277         /* latency stats */
 278         struct sched_info rq_sched_info;
 279
 280         /* sys_sched_yield() stats */
 281         unsigned long yld_exp_empty;
 282         unsigned long yld_act_empty;
 283         unsigned long yld_both_empty;
 284         unsigned long yld_cnt;
 285
 286         /* schedule() stats */
 287         unsigned long sched_switch;
 288         unsigned long sched_cnt;
 289         unsigned long sched_goidle;
 290
 291         /* try_to_wake_up() stats */
 292         unsigned long ttwu_cnt;
 293         unsigned long ttwu_local;
 294 #endif
 295         struct lock_class_key rq_lock_key;
 296 };
 297
/* One runqueue per CPU; reached through cpu_rq(), this_rq() and task_rq(). */
static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
/* NOTE(review): not used in this part of the file; presumably serializes CPU hotplug handling - confirm. */
static DEFINE_MUTEX(sched_hotcpu_mutex);
 300
 301 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
 302 {
 303         rq->curr->sched_class->check_preempt_curr(rq, p);
 304 }
 305
 306 static inline int cpu_of(struct rq *rq)
 307 {
 308 #ifdef CONFIG_SMP
 309         return rq->cpu;
 310 #else
 311         return 0;
 312 #endif
 313 }
 314
 315 /*
 316  * Per-runqueue clock, as finegrained as the platform can give us:
 317  */
 318 static unsigned long long __rq_clock(struct rq *rq)
 319 {
 320         u64 prev_raw = rq->prev_clock_raw;
 321         u64 now = sched_clock();
 322         s64 delta = now - prev_raw;
 323         u64 clock = rq->clock;
 324
 325         /*
 326          * Protect against sched_clock() occasionally going backwards:
 327          */
 328         if (unlikely(delta < 0)) {
 329                 clock++;
 330                 rq->clock_warps++;
 331         } else {
 332                 /*
 333                  * Catch too large forward jumps too:
 334                  */
 335                 if (unlikely(delta > 2*TICK_NSEC)) {
 336                         clock++;
 337                         rq->clock_overflows++;
 338                 } else {
 339                         if (unlikely(delta > rq->clock_max_delta))
 340                                 rq->clock_max_delta = delta;
 341                         clock += delta;
 342                 }
 343         }
 344
 345         rq->prev_clock_raw = now;
 346         rq->clock = clock;
 347
 348         return clock;
 349 }
 350
 351 static inline unsigned long long rq_clock(struct rq *rq)
 352 {
 353         int this_cpu = smp_processor_id();
 354
 355         if (this_cpu == cpu_of(rq))
 356                 return __rq_clock(rq);
 357
 358         return rq->clock;
 359 }
 360
/*
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 * See detach_destroy_domains: synchronize_sched for details.
 *
 * The domain tree of any CPU may only be accessed from within
 * preempt-disabled sections.
 *
 * for_each_domain() walks from the base domain of @cpu up through the
 * ->parent links until it runs off the top.
 */
#define for_each_domain(cpu, __sd) \
        for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

/*
 * Runqueue accessors: by CPU number, for the current CPU, for the CPU a
 * task is assigned to, and the task currently running on a given CPU.
 */
#define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
#define this_rq()               (&__get_cpu_var(runqueues))
#define task_rq(p)              cpu_rq(task_cpu(p))
#define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
 375
 376 #ifdef CONFIG_FAIR_GROUP_SCHED
 377 /* Change a task's ->cfs_rq if it moves across CPUs */
 378 static inline void set_task_cfs_rq(struct task_struct *p)
 379 {
 380         p->se.cfs_rq = &task_rq(p)->cfs;
 381 }
 382 #else
 383 static inline void set_task_cfs_rq(struct task_struct *p)
 384 {
 385 }
 386 #endif
 387
/*
 * Context-switch hooks.  Each one defaults to a no-op here unless a
 * definition already exists by the time this point is reached.
 */
#ifndef prepare_arch_switch
# define prepare_arch_switch(next)      do { } while (0)
#endif
#ifndef finish_arch_switch
# define finish_arch_switch(prev)       do { } while (0)
#endif
 394
 395 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
 396 static inline int task_running(struct rq *rq, struct task_struct *p)
 397 {
 398         return rq->curr == p;
 399 }
 400
/*
 * prepare_lock_switch - intentionally empty when __ARCH_WANT_UNLOCKED_CTXSW
 * is not defined: nothing is done with the runqueue lock before the
 * switch in this variant.
 */
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}
 404
/*
 * finish_lock_switch - release the runqueue lock after a context switch
 * (variant used when __ARCH_WANT_UNLOCKED_CTXSW is not defined).
 *
 * Fixes up the lock's debugging state for the task that is now
 * current, then unlocks rq->lock and enables interrupts.  @prev is not
 * used by this variant.
 */
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_DEBUG_SPINLOCK
        /* this is a valid case when another task releases the spinlock */
        rq->lock.owner = current;
#endif
        /*
         * If we are tracking spinlock dependencies then we have to
         * fix up the runqueue lock - which gets 'carried over' from
         * prev into current:
         */
        spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);

        spin_unlock_irq(&rq->lock);
}
 420
 421 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
 422 static inline int task_running(struct rq *rq, struct task_struct *p)
 423 {
 424 #ifdef CONFIG_SMP
 425         return p->oncpu;
 426 #else
 427         return rq->curr == p;
 428 #endif
 429 }
 430
 431 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 432 {
 433 #ifdef CONFIG_SMP
 434         /*
 435          * We can optimise this out completely for !SMP, because the
 436          * SMP rebalancing from interrupt is the only thing that cares
 437          * here.
 438          */
 439         next->oncpu = 1;
 440 #endif
 441 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 442         spin_unlock_irq(&rq->lock);
 443 #else
 444         spin_unlock(&rq->lock);
 445 #endif
 446 }
 447
/*
 * finish_lock_switch - finish a context switch away from @prev
 * (variant used when __ARCH_WANT_UNLOCKED_CTXSW is defined).
 *
 * The runqueue lock was already dropped in prepare_lock_switch(); here
 * @prev is marked as no longer on a CPU (SMP only) and interrupts are
 * enabled unless the architecture runs the switch with them enabled.
 * @rq is not used by this variant.
 */
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
        /*
         * After ->oncpu is cleared, the task can be moved to a different CPU.
         * We must ensure this doesn't happen until the switch is completely
         * finished.
         */
        smp_wmb();
        prev->oncpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
        local_irq_enable();
#endif
}
 463 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 464
 465 /*
 466  * __task_rq_lock - lock the runqueue a given task resides on.
 467  * Must be called interrupts disabled.
 468  */
 469 static inline struct rq *__task_rq_lock(struct task_struct *p)
 470         __acquires(rq->lock)
 471 {
 472         struct rq *rq;
 473
 474 repeat_lock_task:
 475         rq = task_rq(p);
 476         spin_lock(&rq->lock);
 477         if (unlikely(rq != task_rq(p))) {
 478                 spin_unlock(&rq->lock);
 479                 goto repeat_lock_task;
 480         }
 481         return rq;
 482 }
 483
 484 /*
 485  * task_rq_lock - lock the runqueue a given task resides on and disable
 486  * interrupts.  Note the ordering: we can safely lookup the task_rq without
 487  * explicitly disabling preemption.
 488  */
 489 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
 490         __acquires(rq->lock)
 491 {
 492         struct rq *rq;
 493
 494 repeat_lock_task:
 495         local_irq_save(*flags);
 496         rq = task_rq(p);
 497         spin_lock(&rq->lock);
 498         if (unlikely(rq != task_rq(p))) {
 499                 spin_unlock_irqrestore(&rq->lock, *flags);
 500                 goto repeat_lock_task;
 501         }
 502         return rq;
 503 }
 504
/*
 * __task_rq_unlock - release a runqueue lock; the interrupt state is
 * left untouched.
 */
static inline void __task_rq_unlock(struct rq *rq)
        __releases(rq->lock)
{
        spin_unlock(&rq->lock);
}
 510
/*
 * task_rq_unlock - release a runqueue lock and restore the interrupt
 * state saved in *@flags.
 */
static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
        __releases(rq->lock)
{
        spin_unlock_irqrestore(&rq->lock, *flags);
}
 516
/*
 * this_rq_lock - lock this runqueue and disable interrupts.
 *
 * Interrupts are disabled unconditionally (no state is saved) before
 * this CPU's runqueue is looked up and locked.  Returns the locked
 * runqueue.
 */
static inline struct rq *this_rq_lock(void)
        __acquires(rq->lock)
{
        struct rq *rq;

        local_irq_disable();
        rq = this_rq();
        spin_lock(&rq->lock);

        return rq;
}
 531
 532 /*
 533  * CPU frequency is/was unstable - start new by setting prev_clock_raw:
 534  */
 535 void sched_clock_unstable_event(void)
 536 {
 537         unsigned long flags;
 538         struct rq *rq;
 539
 540         rq = task_rq_lock(current, &flags);
 541         rq->prev_clock_raw = sched_clock();
 542         rq->clock_unstable_events++;
 543         task_rq_unlock(rq, &flags);
 544 }
 545
 546 /*
 547  * resched_task - mark a task 'to be rescheduled now'.
 548  *
 549  * On UP this means the setting of the need_resched flag, on SMP it
 550  * might also involve a cross-CPU call to trigger the scheduler on
 551  * the target CPU.
 552  */
 553 #ifdef CONFIG_SMP
 554
/*
 * Default tsk_is_polling(): tests the task's TIF_POLLING_NRFLAG thread
 * flag.  Only used if no definition exists already.
 */
#ifndef tsk_is_polling
#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
#endif
 558
 559 static void resched_task(struct task_struct *p)
 560 {
 561         int cpu;
 562
 563         assert_spin_locked(&task_rq(p)->lock);
 564
 565         if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
 566                 return;
 567
 568         set_tsk_thread_flag(p, TIF_NEED_RESCHED);
 569
 570         cpu = task_cpu(p);
 571         if (cpu == smp_processor_id())
 572                 return;
 573
 574         /* NEED_RESCHED must be visible before we test polling */
 575         smp_mb();
 576         if (!tsk_is_polling(p))
 577                 smp_send_reschedule(cpu);
 578 }
 579
 580 static void resched_cpu(int cpu)
 581 {
 582         struct rq *rq = cpu_rq(cpu);
 583         unsigned long flags;
 584
 585         if (!spin_trylock_irqsave(&rq->lock, flags))
 586                 return;
 587         resched_task(cpu_curr(cpu));
 588         spin_unlock_irqrestore(&rq->lock, flags);
 589 }
 590 #else
/*
 * resched_task (UP) - flag task @p for rescheduling.
 *
 * The lock of the task's runqueue must be held; without SMP, setting
 * the need_resched flag is all there is to do.
 */
static inline void resched_task(struct task_struct *p)
{
        assert_spin_locked(&task_rq(p)->lock);
        set_tsk_need_resched(p);
}
 596 #endif
 597
 598 static u64 div64_likely32(u64 divident, unsigned long divisor)
 599 {
 600 #if BITS_PER_LONG == 32
 601         if (likely(divident <= 0xffffffffULL))
 602                 return (u32)divident / divisor;
 603         do_div(divident, divisor);
 604
 605         return divident;
 606 #else
 607         return divident / divisor;
 608 #endif
 609 }
 610
/*
 * WMULT_CONST is the numerator used by calc_delta_mine() to compute an
 * inverse weight (WMULT_CONST / weight): 2^32 where an unsigned long can
 * hold it, otherwise the largest unsigned long.  WMULT_SHIFT is the
 * matching right shift applied after multiplying by an inverse weight.
 */
#if BITS_PER_LONG == 32
# define WMULT_CONST    (~0UL)
#else
# define WMULT_CONST    (1UL << 32)
#endif

#define WMULT_SHIFT     32
 618
 619 static inline unsigned long
 620 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 621                 struct load_weight *lw)
 622 {
 623         u64 tmp;
 624
 625         if (unlikely(!lw->inv_weight))
 626                 lw->inv_weight = WMULT_CONST / lw->weight;
 627
 628         tmp = (u64)delta_exec * weight;
 629         /*
 630          * Check whether we'd overflow the 64-bit multiplication:
 631          */
 632         if (unlikely(tmp > WMULT_CONST)) {
 633                 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
 634                                 >> (WMULT_SHIFT/2);
 635         } else {
 636                 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
 637         }
 638
 639         return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
 640 }
 641
/*
 * calc_delta_fair - calc_delta_mine() with the weight of a nice-0 task
 * (NICE_0_LOAD) as the scaling weight.
 */
static inline unsigned long
calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
{
        return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
}
 647
 648 static void update_load_add(struct load_weight *lw, unsigned long inc)
 649 {
 650         lw->weight += inc;
 651         lw->inv_weight = 0;
 652 }
 653
 654 static void update_load_sub(struct load_weight *lw, unsigned long dec)
 655 {
 656         lw->weight -= dec;
 657         lw->inv_weight = 0;
 658 }
 659
 660 static void __update_curr_load(struct rq *rq, struct load_stat *ls)
 661 {
 662         if (rq->curr != rq->idle && ls->load.weight) {
 663                 ls->delta_exec += ls->delta_stat;
 664                 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
 665                 ls->delta_stat = 0;
 666         }
 667 }
 668
 669 /*
 670  * Update delta_exec, delta_fair fields for rq.
 671  *
 672  * delta_fair clock advances at a rate inversely proportional to
 673  * total load (rq->ls.load.weight) on the runqueue, while
 674  * delta_exec advances at the same rate as wall-clock (provided
 675  * cpu is not idle).
 676  *
 677  * delta_exec / delta_fair is a measure of the (smoothened) load on this
 678  * runqueue over any given interval. This (smoothened) load is used
 679  * during load balance.
 680  *
 681  * This function is called /before/ updating rq->ls.load
 682  * and when switching tasks.
 683  */
 684 static void update_curr_load(struct rq *rq, u64 now)
 685 {
 686         struct load_stat *ls = &rq->ls;
 687         u64 start;
 688
 689         start = ls->load_update_start;
 690         ls->load_update_start = now;
 691         ls->delta_stat += now - start;
 692         /*
 693          * Stagger updates to ls->delta_fair. Very frequent updates
 694          * can be expensive.
 695          */
 696         if (ls->delta_stat >= sysctl_sched_stat_granularity)
 697                 __update_curr_load(rq, ls);
 698 }
 699
 700 /*
 701  * To aid in avoiding the subversion of "niceness" due to uneven distribution
 702  * of tasks with abnormal "nice" values across CPUs the contribution that
 703  * each task makes to its run queue's load is weighted according to its
 704  * scheduling class and "nice" value.  For SCHED_NORMAL tasks this is just a
 705  * scaled version of the new time slice allocation that they receive on time
 706  * slice expiry etc.
 707  */
 708
/*
 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
 * If static_prio_timeslice() is ever changed to break this assumption then
 * this code will need modification
 *
 * load_weight() scales a timeslice so that the nice-0 timeslice maps to
 * SCHED_LOAD_SCALE; the two *_TO_LOAD_WEIGHT() helpers build on it.
 */
#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
#define load_weight(lp) \
        (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
#define PRIO_TO_LOAD_WEIGHT(prio) \
        load_weight(static_prio_timeslice(prio))
#define RTPRIO_TO_LOAD_WEIGHT(rp) \
        (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))

/* Weight and inverse weight given to SCHED_IDLE tasks by set_load_weight() */
#define WEIGHT_IDLEPRIO         2
#define WMULT_IDLEPRIO          (1 << 31)
 724
 725 /*
 726  * Nice levels are multiplicative, with a gentle 10% change for every
 727  * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 728  * nice 1, it will get ~10% less CPU time than another CPU-bound task
 729  * that remained on nice 0.
 730  *
 731  * The "10% effect" is relative and cumulative: from _any_ nice level,
 732  * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 733  * it's +10% CPU usage.
 734  */
 735 static const int prio_to_weight[40] = {
 736 /* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
 737 /* -10 */  9537,  7629,  6103,  4883,  3906,  3125,  2500,  2000,  1600,  1280,
 738 /*   0 */  NICE_0_LOAD /* 1024 */,
 739 /*   1 */          819,   655,   524,   419,   336,   268,   215,   172,   137,
 740 /*  10 */   110,    87,    70,    56,    45,    36,    29,    23,    18,    15,
 741 };
 742
 743 static const u32 prio_to_wmult[40] = {
 744         48356,   60446,   75558,   94446,  118058,  147573,
 745         184467,  230589,  288233,  360285,  450347,
 746         562979,  703746,  879575, 1099582, 1374389,
 747         717986, 2147483, 2684354, 3355443, 4194304,
 748         244160, 6557201, 8196502, 10250518, 12782640,
 749         16025997, 19976592, 24970740, 31350126, 39045157,
 750         49367440, 61356675, 76695844, 95443717, 119304647,
 751         148102320, 186737708, 238609294, 286331153,
 752 };
 753
/*
 * inc_load - add the load weight of task @p to the load of @rq.
 *
 * The load statistics are brought up to date for @now first, i.e.
 * before rq->ls.load changes.
 */
static inline void
inc_load(struct rq *rq, const struct task_struct *p, u64 now)
{
        update_curr_load(rq, now);
        update_load_add(&rq->ls.load, p->se.load.weight);
}
 760
/*
 * dec_load - remove the load weight of task @p from the load of @rq.
 *
 * The load statistics are brought up to date for @now first, i.e.
 * before rq->ls.load changes.
 */
static inline void
dec_load(struct rq *rq, const struct task_struct *p, u64 now)
{
        update_curr_load(rq, now);
        update_load_sub(&rq->ls.load, p->se.load.weight);
}
 767
/*
 * inc_nr_running - account one more runnable task on @rq: bump the task
 * count and add the load weight of @p.
 */
static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
{
        rq->nr_running++;
        inc_load(rq, p, now);
}
 773
/*
 * dec_nr_running - account one runnable task less on @rq: drop the task
 * count and remove the load weight of @p.
 */
static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
{
        rq->nr_running--;
        dec_load(rq, p, now);
}
 779
/* Forward declaration; activate_task() is defined further down in this file. */
static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
 781
/*
 * runqueue iterator, to support SMP load-balancing between different
 * scheduling classes, without having to expose their internal data
 * structures to the load-balancing proper:
 *
 * @arg is the opaque argument handed to both callbacks; @start and
 * @next each return a task.
 * NOTE(review): presumably @start returns the first candidate and @next
 * the following ones, with NULL ending the walk - confirm against the
 * users of the iterator.
 */
struct rq_iterator {
        void *arg;
        struct task_struct *(*start)(void *);
        struct task_struct *(*next)(void *);
};
 792
/*
 * Forward declaration of balance_tasks(); it takes a struct rq_iterator
 * to walk the tasks of the busiest runqueue.  The definition is not part
 * of this section of the file.
 */
static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
                      unsigned long max_nr_move, unsigned long max_load_move,
                      struct sched_domain *sd, enum cpu_idle_type idle,
                      int *all_pinned, unsigned long *load_moved,
                      int this_best_prio, int best_prio, int best_prio_seen,
                      struct rq_iterator *iterator);
 799
 800 #include "sched_stats.h"
 801 #include "sched_rt.c"
 802 #include "sched_fair.c"
 803 #include "sched_idletask.c"
 804 #ifdef CONFIG_SCHED_DEBUG
 805 # include "sched_debug.c"
 806 #endif
 807
/* The highest-priority scheduling class is the real-time class. */
#define sched_class_highest (&rt_sched_class)
 809
 810 static void set_load_weight(struct task_struct *p)
 811 {
 812         task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
 813         p->se.wait_runtime = 0;
 814
 815         if (task_has_rt_policy(p)) {
 816                 p->se.load.weight = prio_to_weight[0] * 2;
 817                 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
 818                 return;
 819         }
 820
 821         /*
 822          * SCHED_IDLE tasks get minimal weight:
 823          */
 824         if (p->policy == SCHED_IDLE) {
 825                 p->se.load.weight = WEIGHT_IDLEPRIO;
 826                 p->se.load.inv_weight = WMULT_IDLEPRIO;
 827                 return;
 828         }
 829
 830         p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
 831         p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
 832 }
 833
/*
 * enqueue_task - hand task @p to its scheduling class for queueing on
 * @rq and mark it as being on a runqueue (p->se.on_rq = 1).
 */
static void
enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
{
        sched_info_queued(p);
        p->sched_class->enqueue_task(rq, p, wakeup, now);
        p->se.on_rq = 1;
}
 841
/*
 * dequeue_task - have the scheduling class of task @p remove it from
 * @rq and mark it as no longer being on a runqueue (p->se.on_rq = 0).
 */
static void
dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
{
        p->sched_class->dequeue_task(rq, p, sleep, now);
        p->se.on_rq = 0;
}
 848
/*
 * __normal_prio - return the priority that is based on the static prio.
 * In this implementation that is the static priority itself.
 */
static inline int __normal_prio(struct task_struct *p)
{
        return p->static_prio;
}
 856
 857 /*
 858  * Calculate the expected normal priority: i.e. priority
 859  * without taking RT-inheritance into account. Might be
 860  * boosted by interactivity modifiers. Changes upon fork,
 861  * setprio syscalls, and whenever the interactivity
 862  * estimator recalculates.
 863  */
 864 static inline int normal_prio(struct task_struct *p)
 865 {
 866         int prio;
 867
 868         if (task_has_rt_policy(p))
 869                 prio = MAX_RT_PRIO-1 - p->rt_priority;
 870         else
 871                 prio = __normal_prio(p);
 872         return prio;
 873 }
 874
 875 /*
 876  * Calculate the current priority, i.e. the priority
 877  * taken into account by the scheduler. This value might
 878  * be boosted by RT tasks, or might be boosted by
 879  * interactivity modifiers. Will be RT if the task got
 880  * RT-boosted. If not then it returns p->normal_prio.
 881  */
 882 static int effective_prio(struct task_struct *p)
 883 {
 884         p->normal_prio = normal_prio(p);
 885         /*
 886          * If we are RT tasks or we were boosted to RT priority,
 887          * keep the priority unchanged. Otherwise, update priority
 888          * to the normal priority:
 889          */
 890         if (!rt_prio(p->prio))
 891                 return p->normal_prio;
 892         return p->prio;
 893 }
 894
/*
 * activate_task - move a task to the runqueue.
 * @rq:     the runqueue to add the task to
 * @p:      the task
 * @wakeup: passed through to the scheduling class' enqueue hook
 *
 * A task that was in TASK_UNINTERRUPTIBLE state no longer counts as
 * uninterruptible on this runqueue.
 */
static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
{
        u64 now = rq_clock(rq);

        if (p->state == TASK_UNINTERRUPTIBLE)
                rq->nr_uninterruptible--;

        enqueue_task(rq, p, wakeup, now);
        inc_nr_running(p, rq, now);
}
 908
 909 /*
 910  * activate_idle_task - move idle task to the _front_ of runqueue.
 911  */
 912 static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
 913 {
 914         u64 now = rq_clock(rq);
 915
 916         if (p->state == TASK_UNINTERRUPTIBLE)
 917                 rq->nr_uninterruptible--;
 918
 919         enqueue_task(rq, p, 0, now);
 920         inc_nr_running(p, rq, now);
 921 }
 922
/*
 * deactivate_task - remove a task from the runqueue.
 * @rq:    the runqueue to remove the task from
 * @p:     the task
 * @sleep: passed through to the scheduling class' dequeue hook
 *
 * A task that is in TASK_UNINTERRUPTIBLE state is counted as
 * uninterruptible on this runqueue.
 */
static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
{
        u64 now = rq_clock(rq);

        if (p->state == TASK_UNINTERRUPTIBLE)
                rq->nr_uninterruptible++;

        dequeue_task(rq, p, sleep, now);
        dec_nr_running(p, rq, now);
}
 936
 937 /**
 938  * task_curr - is this task currently executing on a CPU?
 939  * @p: the task in question.
 940  */
 941 inline int task_curr(const struct task_struct *p)
 942 {
 943         return cpu_curr(task_cpu(p)) == p;
 944 }
 945
 946 /* Used instead of source_load when we know the type == 0 */
 947 unsigned long weighted_cpuload(const int cpu)
 948 {
 949         return cpu_rq(cpu)->ls.load.weight;
 950 }
 951
/*
 * __set_task_cpu - record @cpu as the CPU of task @p.
 *
 * On CONFIG_SMP builds this stores @cpu in the task's thread_info and
 * then calls set_task_cfs_rq().  Without CONFIG_SMP it compiles to
 * nothing.  No timestamps are adjusted here; see set_task_cpu() for
 * the variant that does that.
 */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_SMP
        task_thread_info(p)->cpu = cpu;
        set_task_cfs_rq(p);
#endif
}
 959
 960 #ifdef CONFIG_SMP
 961
 962 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 963 {
 964         int old_cpu = task_cpu(p);
 965         struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
 966         u64 clock_offset, fair_clock_offset;
 967
 968         clock_offset = old_rq->clock - new_rq->clock;
 969         fair_clock_offset = old_rq->cfs.fair_clock -
 970                                                  new_rq->cfs.fair_clock;
 971         if (p->se.wait_start)
 972                 p->se.wait_start -= clock_offset;
 973         if (p->se.wait_start_fair)
 974                 p->se.wait_start_fair -= fair_clock_offset;
 975         if (p->se.sleep_start)
 976                 p->se.sleep_start -= clock_offset;
 977         if (p->se.block_start)
 978                 p->se.block_start -= clock_offset;
 979         if (p->se.sleep_start_fair)
 980                 p->se.sleep_start_fair -= fair_clock_offset;
 981
 982         __set_task_cpu(p, new_cpu);
 983 }
 984
/*
 * One pending request to move a task to another CPU.  migrate_task()
 * fills it in and links it onto the runqueue's migration_queue.
 */
struct migration_req {
        /* Link in rq->migration_queue. */
        struct list_head list;

        /* The task to move and the CPU it should end up on. */
        struct task_struct *task;
        int dest_cpu;

        /*
         * Initialised by migrate_task().  Presumably signalled by
         * whoever services the request - confirm against the consumer.
         */
        struct completion done;
};
 993
 994 /*
 995  * The task's runqueue lock must be held.
 996  * Returns true if you have to wait for migration thread.
 997  */
 998 static int
 999 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1000 {
1001         struct rq *rq = task_rq(p);
1002
1003         /*
1004          * If the task is not on a runqueue (and not running), then
1005          * it is sufficient to simply update the task's cpu field.
1006          */
1007         if (!p->se.on_rq && !task_running(rq, p)) {
1008                 set_task_cpu(p, dest_cpu);
1009                 return 0;
1010         }
1011
1012         init_completion(&req->done);
1013         req->task = p;
1014         req->dest_cpu = dest_cpu;
1015         list_add(&req->list, &rq->migration_queue);
1016
1017         return 1;
1018 }
1019
1020 /*
1021  * wait_task_inactive - wait for a thread to unschedule.
1022  *
1023  * The caller must ensure that the task *will* unschedule sometime soon,
1024  * else this function might spin for a *long* time. This function can't
1025  * be called with interrupts off, or it may introduce deadlock with
1026  * smp_call_function() if an IPI is sent by the same process we are
1027  * waiting to become inactive.
1028  */
1029 void wait_task_inactive(struct task_struct *p)
1030 {
1031         unsigned long flags;
1032         int running, on_rq;
1033         struct rq *rq;
1034
1035 repeat:
1036         /*
1037          * We do the initial early heuristics without holding
1038          * any task-queue locks at all. We'll only try to get
1039          * the runqueue lock when things look like they will
1040          * work out!
1041          */
1042         rq = task_rq(p);
1043
1044         /*
1045          * If the task is actively running on another CPU
1046          * still, just relax and busy-wait without holding
1047          * any locks.
1048          *
1049          * NOTE! Since we don't hold any locks, it's not
1050          * even sure that "rq" stays as the right runqueue!
1051          * But we don't care, since "task_running()" will
1052          * return false if the runqueue has changed and p
1053          * is actually now running somewhere else!
1054          */
1055         while (task_running(rq, p))
1056                 cpu_relax();
1057
1058         /*
1059          * Ok, time to look more closely! We need the rq
1060          * lock now, to be *sure*. If we're wrong, we'll
1061          * just go back and repeat.
1062          */
1063         rq = task_rq_lock(p, &flags);
1064         running = task_running(rq, p);
1065         on_rq = p->se.on_rq;
1066         task_rq_unlock(rq, &flags);
1067
1068         /*
1069          * Was it really running after all now that we
1070          * checked with the proper locks actually held?
1071          *
1072          * Oops. Go back and try again..
1073          */
1074         if (unlikely(running)) {
1075                 cpu_relax();
1076                 goto repeat;
1077         }
1078
1079         /*
1080          * It's not enough that it's not actively running,
1081          * it must be off the runqueue _entirely_, and not
1082          * preempted!
1083          *
1084          * So if it wa still runnable (but just not actively
1085          * running right now), it's preempted, and we should
1086          * yield - it could be a while.
1087          */
1088         if (unlikely(on_rq)) {
1089                 yield();
1090                 goto repeat;
1091         }
1092
1093         /*
1094          * Ahh, all good. It wasn't running, and it wasn't
1095          * runnable, which means that it will never become
1096          * running in the future either. We're all done!
1097          */
1098 }
1099
/**
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * Nothing is sent when @p's CPU is the local one or when @p is not
 * the task currently executing there.
 *
 * NOTE: the runqueue lock is deliberately not taken.  All that matters
 * is that the remote task enters the kernel; if the IPI races with a
 * migration of the task to another CPU, no harm is done and the
 * purpose has been achieved as well.
 */
void kick_process(struct task_struct *p)
{
        int cpu;

        preempt_disable();
        cpu = task_cpu(p);
        if (cpu != smp_processor_id()) {
                if (task_curr(p))
                        smp_send_reschedule(cpu);
        }
        preempt_enable();
}
1123
1124 /*
1125  * Return a low guess at the load of a migration-source cpu weighted
1126  * according to the scheduling class and "nice" value.
1127  *
1128  * We want to under-estimate the load of migration sources, to
1129  * balance conservatively.
1130  */
1131 static inline unsigned long source_load(int cpu, int type)
1132 {
1133         struct rq *rq = cpu_rq(cpu);
1134         unsigned long total = weighted_cpuload(cpu);
1135
1136         if (type == 0)
1137                 return total;
1138
1139         return min(rq->cpu_load[type-1], total);
1140 }
1141
1142 /*
1143  * Return a high guess at the load of a migration-target cpu weighted
1144  * according to the scheduling class and "nice" value.
1145  */
1146 static inline unsigned long target_load(int cpu, int type)
1147 {
1148         struct rq *rq = cpu_rq(cpu);
1149         unsigned long total = weighted_cpuload(cpu);
1150
1151         if (type == 0)
1152                 return total;
1153
1154         return max(rq->cpu_load[type-1], total);
1155 }
1156
1157 /*
1158  * Return the average load per task on the cpu's run queue
1159  */
1160 static inline unsigned long cpu_avg_load_per_task(int cpu)
1161 {
1162         struct rq *rq = cpu_rq(cpu);
1163         unsigned long total = weighted_cpuload(cpu);
1164         unsigned long n = rq->nr_running;
1165
1166         return n ? total / n : SCHED_LOAD_SCALE;
1167 }
1168
1169 /*
1170  * find_idlest_group finds and returns the least busy CPU group within the
1171  * domain.
1172  */
1173 static struct sched_group *
1174 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1175 {
1176         struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1177         unsigned long min_load = ULONG_MAX, this_load = 0;
1178         int load_idx = sd->forkexec_idx;
1179         int imbalance = 100 + (sd->imbalance_pct-100)/2;
1180
1181         do {
1182                 unsigned long load, avg_load;
1183                 int local_group;
1184                 int i;
1185
1186                 /* Skip over this group if it has no CPUs allowed */
1187                 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1188                         goto nextgroup;
1189
1190                 local_group = cpu_isset(this_cpu, group->cpumask);
1191
1192                 /* Tally up the load of all CPUs in the group */
1193                 avg_load = 0;
1194
1195                 for_each_cpu_mask(i, group->cpumask) {
1196                         /* Bias balancing toward cpus of our domain */
1197                         if (local_group)
1198                                 load = source_load(i, load_idx);
1199                         else
1200                                 load = target_load(i, load_idx);
1201
1202                         avg_load += load;
1203                 }
1204
1205                 /* Adjust by relative CPU power of the group */
1206                 avg_load = sg_div_cpu_power(group,
1207                                 avg_load * SCHED_LOAD_SCALE);
1208
1209                 if (local_group) {
1210                         this_load = avg_load;
1211                         this = group;
1212                 } else if (avg_load < min_load) {
1213                         min_load = avg_load;
1214                         idlest = group;
1215                 }
1216 nextgroup:
1217                 group = group->next;
1218         } while (group != sd->groups);
1219
1220         if (!idlest || 100*this_load < imbalance*min_load)
1221                 return NULL;
1222         return idlest;
1223 }
1224
1225 /*
1226  * find_idlest_cpu - find the idlest cpu among the cpus in group.
1227  */
1228 static int
1229 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1230 {
1231         cpumask_t tmp;
1232         unsigned long load, min_load = ULONG_MAX;
1233         int idlest = -1;
1234         int i;
1235
1236         /* Traverse only the allowed CPUs */
1237         cpus_and(tmp, group->cpumask, p->cpus_allowed);
1238
1239         for_each_cpu_mask(i, tmp) {
1240                 load = weighted_cpuload(i);
1241
1242                 if (load < min_load || (load == min_load && i == this_cpu)) {
1243                         min_load = load;
1244                         idlest = i;
1245                 }
1246         }
1247
1248         return idlest;
1249 }
1250
1251 /*
1252  * sched_balance_self: balance the current task (running on cpu) in domains
1253  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1254  * SD_BALANCE_EXEC.
1255  *
1256  * Balance, ie. select the least loaded group.
1257  *
1258  * Returns the target CPU number, or the same CPU if no balancing is needed.
1259  *
1260  * preempt must be disabled.
1261  */
1262 static int sched_balance_self(int cpu, int flag)
1263 {
1264         struct task_struct *t = current;
1265         struct sched_domain *tmp, *sd = NULL;
1266
1267         for_each_domain(cpu, tmp) {
1268                 /*
1269                  * If power savings logic is enabled for a domain, stop there.
1270                  */
1271                 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1272                         break;
1273                 if (tmp->flags & flag)
1274                         sd = tmp;
1275         }
1276
1277         while (sd) {
1278                 cpumask_t span;
1279                 struct sched_group *group;
1280                 int new_cpu, weight;
1281
1282                 if (!(sd->flags & flag)) {
1283                         sd = sd->child;
1284                         continue;
1285                 }
1286
1287                 span = sd->span;
1288                 group = find_idlest_group(sd, t, cpu);
1289                 if (!group) {
1290                         sd = sd->child;
1291                         continue;
1292                 }
1293
1294                 new_cpu = find_idlest_cpu(group, t, cpu);
1295                 if (new_cpu == -1 || new_cpu == cpu) {
1296                         /* Now try balancing at a lower domain level of cpu */
1297                         sd = sd->child;
1298                         continue;
1299                 }
1300
1301                 /* Now try balancing at a lower domain level of new_cpu */
1302                 cpu = new_cpu;
1303                 sd = NULL;
1304                 weight = cpus_weight(span);
1305                 for_each_domain(cpu, tmp) {
1306                         if (weight <= cpus_weight(tmp->span))
1307                                 break;
1308                         if (tmp->flags & flag)
1309                                 sd = tmp;
1310                 }
1311                 /* while loop will break here if sd == NULL */
1312         }
1313
1314         return cpu;
1315 }
1316
1317 #endif /* CONFIG_SMP */
1318
/*
 * wake_idle() will wake a task on an idle cpu if task->cpu is
 * not idle and an idle cpu is available.  The span of cpus to
 * search starts with cpus closest then further out as needed,
 * so we always favor a closer, idle cpu.
 *
 * Returns the CPU we should wake onto.  Without
 * ARCH_HAS_SCHED_WAKE_IDLE this is always @cpu.
 */
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
static int wake_idle(int cpu, struct task_struct *p)
{
        cpumask_t tmp;
        struct sched_domain *sd;
        int i;

        /* If it is idle, then it is the best cpu to run this task. */
        if (idle_cpu(cpu))
                return cpu;

        /*
         * This cpu is also the best if it has more than one task
         * already.  Its siblings must (in most cases) be busy too, as
         * they did not already pick up the extra load from this cpu, so
         * there is no need to check sibling runqueue info.  That avoids
         * the checks and the cache miss penalties associated with them.
         */
        if (cpu_rq(cpu)->nr_running > 1)
                return cpu;

        for_each_domain(cpu, sd) {
                /* Stop at the first domain that does not allow this. */
                if (!(sd->flags & SD_WAKE_IDLE))
                        break;

                cpus_and(tmp, sd->span, p->cpus_allowed);
                for_each_cpu_mask(i, tmp) {
                        if (idle_cpu(i))
                                return i;
                }
        }
        return cpu;
}
#else
static inline int wake_idle(int cpu, struct task_struct *p)
{
        return cpu;
}
#endif
1365
/**
 * try_to_wake_up - wake up a thread
 * @p: the to-be-woken-up thread
 * @state: the mask of task states that can be woken
 * @sync: do a synchronous wakeup?
 *
 * Put it on the run-queue if it's not already there. The "current"
 * thread is always on the run-queue (except when the actual
 * re-schedule is in progress), and as such you're allowed to do
 * the simpler "current->state = TASK_RUNNING" to mark yourself
 * runnable without the overhead of this.
 *
 * Returns 1 if this call put @p on a runqueue.  Returns 0 if @p's
 * state did not match @state (nothing is changed) or if @p was already
 * on a runqueue (only its state is reset to TASK_RUNNING).
 */
static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
{
        int cpu, this_cpu, success = 0;
        unsigned long flags;
        long old_state;
        struct rq *rq;
#ifdef CONFIG_SMP
        struct sched_domain *sd, *this_sd = NULL;
        unsigned long load, this_load;
        int new_cpu;
#endif

        rq = task_rq_lock(p, &flags);
        old_state = p->state;
        /* Not in any of the states the caller asked to wake. */
        if (!(old_state & state))
                goto out;

        /* Already queued: only the state needs resetting. */
        if (p->se.on_rq)
                goto out_running;

        cpu = task_cpu(p);
        this_cpu = smp_processor_id();

#ifdef CONFIG_SMP
        /* Still running on its CPU: activate it where it is. */
        if (unlikely(task_running(rq, p)))
                goto out_activate;

        new_cpu = cpu;

        schedstat_inc(rq, ttwu_cnt);
        /* Waker and wakee share a CPU: no cross-CPU decision to make. */
        if (cpu == this_cpu) {
                schedstat_inc(rq, ttwu_local);
                goto out_set_cpu;
        }

        /* Find the first domain of this_cpu whose span also covers cpu. */
        for_each_domain(this_cpu, sd) {
                if (cpu_isset(cpu, sd->span)) {
                        schedstat_inc(sd, ttwu_wake_remote);
                        this_sd = sd;
                        break;
                }
        }

        /* p is not allowed on this_cpu: new_cpu stays at cpu. */
        if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
                goto out_set_cpu;

        /*
         * Check for affine wakeup and passive balancing possibilities.
         */
        if (this_sd) {
                int idx = this_sd->wake_idx;
                unsigned int imbalance;

                imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;

                load = source_load(cpu, idx);
                this_load = target_load(this_cpu, idx);

                new_cpu = this_cpu; /* Wake to this CPU if we can */

                if (this_sd->flags & SD_WAKE_AFFINE) {
                        unsigned long tl = this_load;
                        unsigned long tl_per_task;

                        tl_per_task = cpu_avg_load_per_task(this_cpu);

                        /*
                         * If sync wakeup then subtract the (maximum possible)
                         * effect of the currently running task from the load
                         * of the current CPU:
                         */
                        if (sync)
                                tl -= current->se.load.weight;

                        if ((tl <= load &&
                                tl + target_load(cpu, idx) <= tl_per_task) ||
                               100*(tl + p->se.load.weight) <= imbalance*load) {
                                /*
                                 * This domain has SD_WAKE_AFFINE and
                                 * p is cache cold in this domain, and
                                 * there is no bad imbalance.
                                 */
                                schedstat_inc(this_sd, ttwu_move_affine);
                                goto out_set_cpu;
                        }
                }

                /*
                 * Start passive balancing when half the imbalance_pct
                 * limit is reached.
                 */
                if (this_sd->flags & SD_WAKE_BALANCE) {
                        if (imbalance*this_load <= 100*load) {
                                schedstat_inc(this_sd, ttwu_move_balance);
                                goto out_set_cpu;
                        }
                }
        }

        new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
out_set_cpu:
        /* Give wake_idle() a chance to pick an idle CPU instead. */
        new_cpu = wake_idle(new_cpu, p);
        if (new_cpu != cpu) {
                set_task_cpu(p, new_cpu);
                task_rq_unlock(rq, &flags);
                /* might preempt at this point */
                rq = task_rq_lock(p, &flags);
                /*
                 * The lock was dropped in between, so redo the checks
                 * made at the top of the function before going on.
                 */
                old_state = p->state;
                if (!(old_state & state))
                        goto out;
                if (p->se.on_rq)
                        goto out_running;

                this_cpu = smp_processor_id();
                cpu = task_cpu(p);
        }

out_activate:
#endif /* CONFIG_SMP */
        activate_task(rq, p, 1);
        /*
         * Sync wakeups (i.e. those types of wakeups where the waker
         * has indicated that it will leave the CPU in short order)
         * don't trigger a preemption, if the woken up task will run on
         * this cpu. (in this case the 'I will reschedule' promise of
         * the waker guarantees that the freshly woken up task is going
         * to be considered on this CPU.)
         */
        if (!sync || cpu != this_cpu)
                check_preempt_curr(rq, p);
        success = 1;

out_running:
        p->state = TASK_RUNNING;
out:
        task_rq_unlock(rq, &flags);

        return success;
}
1519
1520 int fastcall wake_up_process(struct task_struct *p)
1521 {
1522         return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1523                                  TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1524 }
1525 EXPORT_SYMBOL(wake_up_process);
1526
/*
 * wake_up_state - wake @p if its state matches the @state mask.
 *
 * Non-sync wakeup; returns what try_to_wake_up() returns.
 */
int fastcall wake_up_state(struct task_struct *p, unsigned int state)
{
        return try_to_wake_up(p, state, 0);
}
1531
1532 /*
1533  * Perform scheduler related setup for a newly forked process p.
1534  * p is forked by current.
1535  *
1536  * __sched_fork() is basic setup used by init_idle() too:
1537  */
1538 static void __sched_fork(struct task_struct *p)
1539 {
1540         p->se.wait_start_fair           = 0;
1541         p->se.wait_start                = 0;
1542         p->se.exec_start                = 0;
1543         p->se.sum_exec_runtime          = 0;
1544         p->se.delta_exec                = 0;
1545         p->se.delta_fair_run            = 0;
1546         p->se.delta_fair_sleep          = 0;
1547         p->se.wait_runtime              = 0;
1548         p->se.sum_wait_runtime          = 0;
1549         p->se.sum_sleep_runtime         = 0;
1550         p->se.sleep_start               = 0;
1551         p->se.sleep_start_fair          = 0;
1552         p->se.block_start               = 0;
1553         p->se.sleep_max                 = 0;
1554         p->se.block_max                 = 0;
1555         p->se.exec_max                  = 0;
1556         p->se.wait_max                  = 0;
1557         p->se.wait_runtime_overruns     = 0;
1558         p->se.wait_runtime_underruns    = 0;
1559
1560         INIT_LIST_HEAD(&p->run_list);
1561         p->se.on_rq = 0;
1562
1563         /*
1564          * We mark the process as running here, but have not actually
1565          * inserted it onto the runqueue yet. This guarantees that
1566          * nobody will actually run it, and a signal or other external
1567          * event cannot wake it up and insert it on the runqueue either.
1568          */
1569         p->state = TASK_RUNNING;
1570 }
1571
1572 /*
1573  * fork()/clone()-time setup:
1574  */
1575 void sched_fork(struct task_struct *p, int clone_flags)
1576 {
1577         int cpu = get_cpu();
1578
1579         __sched_fork(p);
1580
1581 #ifdef CONFIG_SMP
1582         cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1583 #endif
1584         __set_task_cpu(p, cpu);
1585
1586         /*
1587          * Make sure we do not leak PI boosting priority to the child:
1588          */
1589         p->prio = current->normal_prio;
1590
1591 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1592         if (likely(sched_info_on()))
1593                 memset(&p->sched_info, 0, sizeof(p->sched_info));
1594 #endif
1595 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1596         p->oncpu = 0;
1597 #endif
1598 #ifdef CONFIG_PREEMPT
1599         /* Want to start with kernel preemption disabled. */
1600         task_thread_info(p)->preempt_count = 1;
1601 #endif
1602         put_cpu();
1603 }
1604
/*
 * Fork-time ordering knob.  Non-zero (the default is 1): after fork,
 * the child runs first.  If set to 0 then the parent will (try to)
 * run first.
 */
unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1610
1611 /*
1612  * wake_up_new_task - wake up a newly created task for the first time.
1613  *
1614  * This function will do some initial scheduler statistics housekeeping
1615  * that must be done for every newly created context, then puts the task
1616  * on the runqueue and wakes it.
1617  */
1618 void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1619 {
1620         unsigned long flags;
1621         struct rq *rq;
1622         int this_cpu;
1623
1624         rq = task_rq_lock(p, &flags);
1625         BUG_ON(p->state != TASK_RUNNING);
1626         this_cpu = smp_processor_id(); /* parent's CPU */
1627
1628         p->prio = effective_prio(p);
1629
1630         if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1631                         task_cpu(p) != this_cpu || !current->se.on_rq) {
1632                 activate_task(rq, p, 0);
1633         } else {
1634                 /*
1635                  * Let the scheduling class do new task startup
1636                  * management (if any):
1637                  */
1638                 p->sched_class->task_new(rq, p);
1639         }
1640         check_preempt_curr(rq, p);
1641         task_rq_unlock(rq, &flags);
1642 }
1643
/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @next: the task we are going to switch to.
 *
 * This is called with the rq lock held and interrupts off. It must
 * be paired with a subsequent finish_task_switch after the context
 * switch.
 *
 * prepare_task_switch sets up locking and calls architecture specific
 * hooks.
 */
static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
{
        /* Locking side first, then the architecture hook. */
        prepare_lock_switch(rq, next);
        prepare_arch_switch(next);
}
1661
/**
 * finish_task_switch - clean up after a task-switch
 * @rq: runqueue associated with task-switch
 * @prev: the thread we just switched away from.
 *
 * finish_task_switch must be called after the context switch, paired
 * with a prepare_task_switch call before the context switch.
 * finish_task_switch will reconcile locking set up by prepare_task_switch,
 * and do any other architecture-specific cleanup actions.
 *
 * Note that we may have delayed dropping an mm in context_switch(). If
 * so, we finish that here outside of the runqueue lock.  (Doing it
 * with the lock held can cause deadlocks; see schedule() for
 * details.)
 */
static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
        __releases(rq->lock)
{
        /* Pick up the mm that context_switch() may have parked for us. */
        struct mm_struct *mm = rq->prev_mm;
        long prev_state;

        rq->prev_mm = NULL;

        /*
         * A task struct has one reference for the use as "current".
         * If a task dies, then it sets TASK_DEAD in tsk->state and calls
         * schedule one last time. The schedule call will never return, and
         * the scheduled task must drop that reference.
         * The test for TASK_DEAD must occur while the runqueue locks are
         * still held, otherwise prev could be scheduled on another cpu, die
         * there before we look at prev->state, and then the reference would
         * be dropped twice.
         *              Manfred Spraul <manfred@colorfullife.com>
         */
        prev_state = prev->state;
        finish_arch_switch(prev);
        finish_lock_switch(rq, prev);
        /* Deferred mm drop, done only after finish_lock_switch(). */
        if (mm)
                mmdrop(mm);
        /* Acts on the state sampled above, not on a fresh read. */
        if (unlikely(prev_state == TASK_DEAD)) {
                /*
                 * Remove function-return probe instances associated with this
                 * task and put them back on the free list.
                 */
                kprobe_flush_task(prev);
                put_task_struct(prev);
        }
}
1710
/**
 * schedule_tail - first thing a freshly forked thread must call.
 * @prev: the thread we just switched away from.
 *
 * Completes the task switch on this CPU's runqueue and then, if
 * current->set_child_tid is set, stores current->pid there.
 */
asmlinkage void schedule_tail(struct task_struct *prev)
        __releases(rq->lock)
{
        struct rq *rq = this_rq();

        finish_task_switch(rq, prev);
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
        /* In this case, finish_task_switch does not reenable preemption */
        preempt_enable();
#endif
        /* The result of put_user() is not checked. */
        if (current->set_child_tid)
                put_user(current->pid, current->set_child_tid);
}
1728
/*
 * context_switch - switch to the new MM and the new
 * thread's register state.
 *
 * @rq:   the runqueue the switch happens on
 * @prev: the task being switched away from
 * @next: the task being switched to
 *
 * Calls prepare_task_switch() before, and finish_task_switch() after,
 * the actual switch_to().
 */
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
               struct task_struct *next)
{
        struct mm_struct *mm, *oldmm;

        prepare_task_switch(rq, next);
        mm = next->mm;
        oldmm = prev->active_mm;
        /*
         * For paravirt, this is coupled with an exit in switch_to to
         * combine the page table reload and the switch backend into
         * one hypercall.
         */
        arch_enter_lazy_cpu_mode();

        /*
         * next has no mm of its own: let it borrow prev's active mm,
         * taking an mm_count reference on it.  Otherwise switch to
         * next's mm.
         */
        if (unlikely(!mm)) {
                next->active_mm = oldmm;
                atomic_inc(&oldmm->mm_count);
                enter_lazy_tlb(oldmm, next);
        } else
                switch_mm(oldmm, mm, next);

        /*
         * prev has no mm of its own either: it stops using oldmm here.
         * Park oldmm in rq->prev_mm so that finish_task_switch() can
         * mmdrop() it later.
         */
        if (unlikely(!prev->mm)) {
                prev->active_mm = NULL;
                rq->prev_mm = oldmm;
        }
        /*
         * Since the runqueue lock will be released by the next
         * task (which is an invalid locking op but in the case
         * of the scheduler it's an obvious special-case), so we
         * do an early lockdep release here:
         */
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
        spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

        /* Here we just switch the register state and the stack. */
        switch_to(prev, next, prev);

        barrier();
        /*
         * this_rq must be evaluated again because prev may have moved
         * CPUs since it called schedule(), thus the 'rq' on its stack
         * frame will be invalid.
         */
        finish_task_switch(this_rq(), prev);
}
1781
1782 /*
1783  * nr_running, nr_uninterruptible and nr_context_switches:
1784  *
1785  * externally visible scheduler statistics: current number of runnable
1786  * threads, current number of uninterruptible-sleeping threads, total
1787  * number of context switches performed since bootup.
1788  */
1789 unsigned long nr_running(void)
1790 {
1791         unsigned long i, sum = 0;
1792
1793         for_each_online_cpu(i)
1794                 sum += cpu_rq(i)->nr_running;
1795
1796         return sum;
1797 }
1798
1799 unsigned long nr_uninterruptible(void)
1800 {
1801         unsigned long i, sum = 0;
1802
1803         for_each_possible_cpu(i)
1804                 sum += cpu_rq(i)->nr_uninterruptible;
1805
1806         /*
1807          * Since we read the counters lockless, it might be slightly
1808          * inaccurate. Do not allow it to go below zero though:
1809          */
1810         if (unlikely((long)sum < 0))
1811                 sum = 0;
1812
1813         return sum;
1814 }
1815
1816 unsigned long long nr_context_switches(void)
1817 {
1818         int i;
1819         unsigned long long sum = 0;
1820
1821         for_each_possible_cpu(i)
1822                 sum += cpu_rq(i)->nr_switches;
1823
1824         return sum;
1825 }
1826
1827 unsigned long nr_iowait(void)
1828 {
1829         unsigned long i, sum = 0;
1830
1831         for_each_possible_cpu(i)
1832                 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1833
1834         return sum;
1835 }
1836
1837 unsigned long nr_active(void)
1838 {
1839         unsigned long i, running = 0, uninterruptible = 0;
1840
1841         for_each_online_cpu(i) {
1842                 running += cpu_rq(i)->nr_running;
1843                 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1844         }
1845
1846         if (unlikely((long)uninterruptible < 0))
1847                 uninterruptible = 0;
1848
1849         return running + uninterruptible;
1850 }
1851
1852 /*
1853  * Update rq->cpu_load[] statistics. This function is usually called every
1854  * scheduler tick (TICK_NSEC).
1855  */
1856 static void update_cpu_load(struct rq *this_rq)
1857 {
1858         u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1859         unsigned long total_load = this_rq->ls.load.weight;
1860         unsigned long this_load =  total_load;
1861         struct load_stat *ls = &this_rq->ls;
1862         u64 now = __rq_clock(this_rq);
1863         int i, scale;
1864
1865         this_rq->nr_load_updates++;
1866         if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1867                 goto do_avg;
1868
1869         /* Update delta_fair/delta_exec fields first */
1870         update_curr_load(this_rq, now);
1871
1872         fair_delta64 = ls->delta_fair + 1;
1873         ls->delta_fair = 0;
1874
1875         exec_delta64 = ls->delta_exec + 1;
1876         ls->delta_exec = 0;
1877
1878         sample_interval64 = now - ls->load_update_last;
1879         ls->load_update_last = now;
1880
1881         if ((s64)sample_interval64 < (s64)TICK_NSEC)
1882                 sample_interval64 = TICK_NSEC;
1883
1884         if (exec_delta64 > sample_interval64)
1885                 exec_delta64 = sample_interval64;
1886
1887         idle_delta64 = sample_interval64 - exec_delta64;
1888
1889         tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1890         tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1891
1892         this_load = (unsigned long)tmp64;
1893
1894 do_avg:
1895
1896         /* Update our load: */
1897         for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1898                 unsigned long old_load, new_load;
1899
1900                 /* scale is effectively 1 << i now, and >> i divides by scale */
1901
1902                 old_load = this_rq->cpu_load[i];
1903                 new_load = this_load;
1904
1905                 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
1906         }
1907 }
1908
1909 #ifdef CONFIG_SMP
1910
1911 /*
1912  * double_rq_lock - safely lock two runqueues
1913  *
1914  * Note this does not disable interrupts like task_rq_lock,
1915  * you need to do so manually before calling.
1916  */
1917 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1918         __acquires(rq1->lock)
1919         __acquires(rq2->lock)
1920 {
1921         BUG_ON(!irqs_disabled());
1922         if (rq1 == rq2) {
1923                 spin_lock(&rq1->lock);
1924                 __acquire(rq2->lock);   /* Fake it out ;) */
1925         } else {
1926                 if (rq1 < rq2) {
1927                         spin_lock(&rq1->lock);
1928                         spin_lock(&rq2->lock);
1929                 } else {
1930                         spin_lock(&rq2->lock);
1931                         spin_lock(&rq1->lock);
1932                 }
1933         }
1934 }
1935
1936 /*
1937  * double_rq_unlock - safely unlock two runqueues
1938  *
1939  * Note this does not restore interrupts like task_rq_unlock,
1940  * you need to do so manually after calling.
1941  */
1942 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1943         __releases(rq1->lock)
1944         __releases(rq2->lock)
1945 {
1946         spin_unlock(&rq1->lock);
1947         if (rq1 != rq2)
1948                 spin_unlock(&rq2->lock);
1949         else
1950                 __release(rq2->lock);
1951 }
1952
1953 /*
1954  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1955  */
1956 static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
1957         __releases(this_rq->lock)
1958         __acquires(busiest->lock)
1959         __acquires(this_rq->lock)
1960 {
1961         if (unlikely(!irqs_disabled())) {
1962                 /* printk() doesn't work good under rq->lock */
1963                 spin_unlock(&this_rq->lock);
1964                 BUG_ON(1);
1965         }
1966         if (unlikely(!spin_trylock(&busiest->lock))) {
1967                 if (busiest < this_rq) {
1968                         spin_unlock(&this_rq->lock);
1969                         spin_lock(&busiest->lock);
1970                         spin_lock(&this_rq->lock);
1971                 } else
1972                         spin_lock(&busiest->lock);
1973         }
1974 }
1975
1976 /*
1977  * If dest_cpu is allowed for this process, migrate the task to it.
1978  * This is accomplished by forcing the cpu_allowed mask to only
1979  * allow dest_cpu, which will force the cpu onto dest_cpu.  Then
1980  * the cpu_allowed mask is restored.
1981  */
1982 static void sched_migrate_task(struct task_struct *p, int dest_cpu)
1983 {
1984         struct migration_req req;
1985         unsigned long flags;
1986         struct rq *rq;
1987
1988         rq = task_rq_lock(p, &flags);
1989         if (!cpu_isset(dest_cpu, p->cpus_allowed)
1990             || unlikely(cpu_is_offline(dest_cpu)))
1991                 goto out;
1992
1993         /* force the process onto the specified CPU */
1994         if (migrate_task(p, dest_cpu, &req)) {
1995                 /* Need to wait for migration thread (might exit: take ref). */
1996                 struct task_struct *mt = rq->migration_thread;
1997
1998                 get_task_struct(mt);
1999                 task_rq_unlock(rq, &flags);
2000                 wake_up_process(mt);
2001                 put_task_struct(mt);
2002                 wait_for_completion(&req.done);
2003
2004                 return;
2005         }
2006 out:
2007         task_rq_unlock(rq, &flags);
2008 }
2009
2010 /*
2011  * sched_exec - execve() is a valuable balancing opportunity, because at
2012  * this point the task has the smallest effective memory and cache footprint.
2013  */
2014 void sched_exec(void)
2015 {
2016         int new_cpu, this_cpu = get_cpu();
2017         new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2018         put_cpu();
2019         if (new_cpu != this_cpu)
2020                 sched_migrate_task(current, new_cpu);
2021 }
2022
/*
 * pull_task - move a task from a remote runqueue to the local runqueue.
 * Both runqueues must be locked.
 *
 * The task is deactivated on src_rq, its CPU is set to this_cpu, and it
 * is activated on this_rq. Finally check_preempt_curr() is called for
 * this_rq with the newly arrived task.
 */
static void pull_task(struct rq *src_rq, struct task_struct *p,
                      struct rq *this_rq, int this_cpu)
{
        deactivate_task(src_rq, p, 0);
        set_task_cpu(p, this_cpu);
        activate_task(this_rq, p, 0);
        /*
         * Note that idle threads have a prio of MAX_PRIO, for this test
         * to be always true for them.
         */
        check_preempt_curr(this_rq, p);
}
2039
2040 /*
2041  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2042  */
2043 static
2044 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2045                      struct sched_domain *sd, enum cpu_idle_type idle,
2046                      int *all_pinned)
2047 {
2048         /*
2049          * We do not migrate tasks that are:
2050          * 1) running (obviously), or
2051          * 2) cannot be migrated to this CPU due to cpus_allowed, or
2052          * 3) are cache-hot on their current CPU.
2053          */
2054         if (!cpu_isset(this_cpu, p->cpus_allowed))
2055                 return 0;
2056         *all_pinned = 0;
2057
2058         if (task_running(rq, p))
2059                 return 0;
2060
2061         /*
2062          * Aggressive migration if too many balance attempts have failed:
2063          */
2064         if (sd->nr_balance_failed > sd->cache_nice_tries)
2065                 return 1;
2066
2067         return 1;
2068 }
2069
/*
 * balance_tasks - pull tasks handed out by 'iterator' from busiest to
 * this_rq.
 *
 * Tasks are fetched with iterator->start()/iterator->next() until the
 * iterator returns NULL, max_nr_move tasks have been pulled, or the
 * remaining load budget (max_load_move minus the load weight of the
 * tasks pulled so far) is no longer positive.
 *
 * On return *load_moved holds the weighted load that was moved. If
 * all_pinned is non-NULL, *all_pinned receives the local 'pinned' flag:
 * it is 0 when max_nr_move or max_load_move is 0, otherwise it starts
 * at 1 and is cleared by can_migrate_task() for any examined task that
 * is allowed on this_cpu.
 *
 * Returns the number of tasks pulled.
 */
static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
                      unsigned long max_nr_move, unsigned long max_load_move,
                      struct sched_domain *sd, enum cpu_idle_type idle,
                      int *all_pinned, unsigned long *load_moved,
                      int this_best_prio, int best_prio, int best_prio_seen,
                      struct rq_iterator *iterator)
{
        int pulled = 0, pinned = 0, skip_for_load;
        struct task_struct *p;
        long rem_load_move = max_load_move;

        /* Nothing may be moved: report zero tasks and zero load. */
        if (max_nr_move == 0 || max_load_move == 0)
                goto out;

        pinned = 1;

        /*
         * Start the load-balancing iterator:
         */
        p = iterator->start(iterator->arg);
next:
        if (!p)
                goto out;
        /*
         * To help distribute high priority tasks across CPUs we don't
         * skip a task if it will be the highest priority task (i.e. smallest
         * prio value) on its new queue regardless of its load weight
         */
        skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
                                                         SCHED_LOAD_SCALE_FUZZ;
        if (skip_for_load && p->prio < this_best_prio)
                skip_for_load = !best_prio_seen && p->prio == best_prio;
        if (skip_for_load ||
            !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {

                /* Remember that a task of priority best_prio was passed over. */
                best_prio_seen |= p->prio == best_prio;
                p = iterator->next(iterator->arg);
                goto next;
        }

        pull_task(busiest, p, this_rq, this_cpu);
        pulled++;
        rem_load_move -= p->se.load.weight;

        /*
         * We only want to steal up to the prescribed number of tasks
         * and the prescribed amount of weighted load.
         */
        if (pulled < max_nr_move && rem_load_move > 0) {
                if (p->prio < this_best_prio)
                        this_best_prio = p->prio;
                p = iterator->next(iterator->arg);
                goto next;
        }
out:
        /*
         * Right now, this is the only place pull_task() is called,
         * so we can safely collect pull_task() stats here rather than
         * inside pull_task().
         */
        schedstat_add(sd, lb_gained[idle], pulled);

        if (all_pinned)
                *all_pinned = pinned;
        *load_moved = max_load_move - rem_load_move;
        return pulled;
}
2137
2138 /*
2139  * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2140  * load from busiest to this_rq, as part of a balancing operation within
2141  * "domain". Returns the number of tasks moved.
2142  *
2143  * Called with both runqueues locked.
2144  */
2145 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2146                       unsigned long max_nr_move, unsigned long max_load_move,
2147                       struct sched_domain *sd, enum cpu_idle_type idle,
2148                       int *all_pinned)
2149 {
2150         struct sched_class *class = sched_class_highest;
2151         unsigned long load_moved, total_nr_moved = 0, nr_moved;
2152         long rem_load_move = max_load_move;
2153
2154         do {
2155                 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2156                                 max_nr_move, (unsigned long)rem_load_move,
2157                                 sd, idle, all_pinned, &load_moved);
2158                 total_nr_moved += nr_moved;
2159                 max_nr_move -= nr_moved;
2160                 rem_load_move -= load_moved;
2161                 class = class->next;
2162         } while (class && max_nr_move && rem_load_move > 0);
2163
2164         return total_nr_moved;
2165 }
2166
/*
 * find_busiest_group finds and returns the busiest CPU group within the
 * domain. It calculates and returns the amount of weighted load which
 * should be moved to restore balance via the imbalance parameter.
 *
 * sd:        domain whose circular list of groups (sd->groups) is scanned.
 * this_cpu:  the group containing this cpu is treated as the local group.
 * imbalance: output, weighted load to move; set to 0 when NULL is
 *            returned through the 'ret' label.
 * idle:      selects which load index of sd is used (busy_idx,
 *            newidle_idx or idle_idx).
 * sd_idle:   in/out, must be non-NULL; cleared when a scanned cpu is
 *            found not to be idle.
 * cpus:      only cpus set in this mask are taken into account.
 * balance:   optional (may be NULL); set to 0, and NULL returned, when
 *            this_cpu is not the cpu chosen to balance on behalf of the
 *            local group.
 *
 * Returns the group to pull load from, or NULL if no move is warranted.
 */
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
                   unsigned long *imbalance, enum cpu_idle_type idle,
                   int *sd_idle, cpumask_t *cpus, int *balance)
{
        struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
        unsigned long max_load, avg_load, total_load, this_load, total_pwr;
        unsigned long max_pull;
        unsigned long busiest_load_per_task, busiest_nr_running;
        unsigned long this_load_per_task, this_nr_running;
        int load_idx;
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
        int power_savings_balance = 1;
        unsigned long leader_nr_running = 0, min_load_per_task = 0;
        unsigned long min_nr_running = ULONG_MAX;
        struct sched_group *group_min = NULL, *group_leader = NULL;
#endif

        max_load = this_load = total_load = total_pwr = 0;
        busiest_load_per_task = busiest_nr_running = 0;
        this_load_per_task = this_nr_running = 0;
        /* Pick the load index that sd configures for this idle state */
        if (idle == CPU_NOT_IDLE)
                load_idx = sd->busy_idx;
        else if (idle == CPU_NEWLY_IDLE)
                load_idx = sd->newidle_idx;
        else
                load_idx = sd->idle_idx;

        /* Walk every group of the domain once */
        do {
                unsigned long load, group_capacity;
                int local_group;
                int i;
                unsigned int balance_cpu = -1, first_idle_cpu = 0;
                unsigned long sum_nr_running, sum_weighted_load;

                local_group = cpu_isset(this_cpu, group->cpumask);

                /*
                 * For the local group, balance_cpu defaults to the group's
                 * first cpu; the loop below replaces it with the first idle
                 * cpu found in the group, if any.
                 */
                if (local_group)
                        balance_cpu = first_cpu(group->cpumask);

                /* Tally up the load of all CPUs in the group */
                sum_weighted_load = sum_nr_running = avg_load = 0;

                for_each_cpu_mask(i, group->cpumask) {
                        struct rq *rq;

                        if (!cpu_isset(i, *cpus))
                                continue;

                        rq = cpu_rq(i);

                        if (*sd_idle && !idle_cpu(i))
                                *sd_idle = 0;

                        /* Bias balancing toward cpus of our domain */
                        if (local_group) {
                                if (idle_cpu(i) && !first_idle_cpu) {
                                        first_idle_cpu = 1;
                                        balance_cpu = i;
                                }

                                load = target_load(i, load_idx);
                        } else
                                load = source_load(i, load_idx);

                        avg_load += load;
                        sum_nr_running += rq->nr_running;
                        sum_weighted_load += weighted_cpuload(i);
                }

                /*
                 * First idle cpu or the first cpu(busiest) in this sched group
                 * is eligible for doing load balancing at this and above
                 * domains.
                 */
                if (local_group && balance_cpu != this_cpu && balance) {
                        *balance = 0;
                        goto ret;
                }

                total_load += avg_load;
                total_pwr += group->__cpu_power;

                /* Adjust by relative CPU power of the group */
                avg_load = sg_div_cpu_power(group,
                                avg_load * SCHED_LOAD_SCALE);

                group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;

                /*
                 * Record the local group's figures, or remember this group
                 * as busiest if it is the most loaded one seen so far and
                 * runs more tasks than its capacity. The *_load_per_task
                 * variables hold the summed weighted load for now; they are
                 * divided by the task count later.
                 */
                if (local_group) {
                        this_load = avg_load;
                        this = group;
                        this_nr_running = sum_nr_running;
                        this_load_per_task = sum_weighted_load;
                } else if (avg_load > max_load &&
                           sum_nr_running > group_capacity) {
                        max_load = avg_load;
                        busiest = group;
                        busiest_nr_running = sum_nr_running;
                        busiest_load_per_task = sum_weighted_load;
                }

#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
                /*
                 * Busy processors will not participate in power savings
                 * balance.
                 */
                if (idle == CPU_NOT_IDLE ||
                                !(sd->flags & SD_POWERSAVINGS_BALANCE))
                        goto group_next;

                /*
                 * If the local group is idle or completely loaded
                 * no need to do power savings balance at this domain
                 */
                if (local_group && (this_nr_running >= group_capacity ||
                                    !this_nr_running))
                        power_savings_balance = 0;

                /*
                 * If a group is already running at full capacity or idle,
                 * don't include that group in power savings calculations
                 */
                if (!power_savings_balance || sum_nr_running >= group_capacity
                    || !sum_nr_running)
                        goto group_next;

                /*
                 * Calculate the group which has the least non-idle load.
                 * This is the group from where we need to pick up the load
                 * for saving power
                 *
                 * (group_min is only dereferenced when the counts are equal,
                 * which cannot happen while min_nr_running is still
                 * ULONG_MAX, i.e. before group_min has been set.)
                 */
                if ((sum_nr_running < min_nr_running) ||
                    (sum_nr_running == min_nr_running &&
                     first_cpu(group->cpumask) <
                     first_cpu(group_min->cpumask))) {
                        group_min = group;
                        min_nr_running = sum_nr_running;
                        min_load_per_task = sum_weighted_load /
                                                sum_nr_running;
                }

                /*
                 * Calculate the group which is almost near its
                 * capacity but still has some space to pick up some load
                 * from other group and save more power
                 */
                if (sum_nr_running <= group_capacity - 1) {
                        if (sum_nr_running > leader_nr_running ||
                            (sum_nr_running == leader_nr_running &&
                             first_cpu(group->cpumask) >
                              first_cpu(group_leader->cpumask))) {
                                group_leader = group;
                                leader_nr_running = sum_nr_running;
                        }
                }
group_next:
#endif
                group = group->next;
        } while (group != sd->groups);

        if (!busiest || this_load >= max_load || busiest_nr_running == 0)
                goto out_balanced;

        /* Domain-wide average load, scaled by the total cpu power */
        avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;

        if (this_load >= avg_load ||
                        100*max_load <= sd->imbalance_pct*this_load)
                goto out_balanced;

        /* Turn the summed weighted load into a per-task average */
        busiest_load_per_task /= busiest_nr_running;
        /*
         * We're trying to get all the cpus to the average_load, so we don't
         * want to push ourselves above the average load, nor do we wish to
         * reduce the max loaded cpu below the average load, as either of these
         * actions would just result in more rebalancing later, and ping-pong
         * tasks around. Thus we look for the minimum possible imbalance.
         * Negative imbalances (*we* are more loaded than anyone else) will
         * be counted as no imbalance for these purposes -- we can't fix that
         * by pulling tasks to us.  Be careful of negative numbers as they'll
         * appear as very large values with unsigned longs.
         */
        if (max_load <= busiest_load_per_task)
                goto out_balanced;

        /*
         * In the presence of smp nice balancing, certain scenarios can have
         * max load less than avg load(as we skip the groups at or below
         * its cpu_power, while calculating max_load..)
         */
        if (max_load < avg_load) {
                *imbalance = 0;
                goto small_imbalance;
        }

        /* Don't want to pull so many tasks that a group would go idle */
        max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);

        /* How much load to actually move to equalise the imbalance */
        *imbalance = min(max_pull * busiest->__cpu_power,
                                (avg_load - this_load) * this->__cpu_power)
                        / SCHED_LOAD_SCALE;

        /*
         * if *imbalance is less than the average load per runnable task
         * there is no guarantee that any tasks will be moved so we'll have
         * a think about bumping its value to force at least one task to be
         * moved
         */
        if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
                unsigned long tmp, pwr_now, pwr_move;
                unsigned int imbn;

small_imbalance:
                pwr_move = pwr_now = 0;
                imbn = 2;
                if (this_nr_running) {
                        this_load_per_task /= this_nr_running;
                        if (busiest_load_per_task > this_load_per_task)
                                imbn = 1;
                } else
                        this_load_per_task = SCHED_LOAD_SCALE;

                if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
                                        busiest_load_per_task * imbn) {
                        *imbalance = busiest_load_per_task;
                        return busiest;
                }

                /*
                 * OK, we don't have enough imbalance to justify moving tasks,
                 * however we may be able to increase total CPU power used by
                 * moving them.
                 */

                pwr_now += busiest->__cpu_power *
                                min(busiest_load_per_task, max_load);
                pwr_now += this->__cpu_power *
                                min(this_load_per_task, this_load);
                pwr_now /= SCHED_LOAD_SCALE;

                /* Amount of load we'd subtract */
                tmp = sg_div_cpu_power(busiest,
                                busiest_load_per_task * SCHED_LOAD_SCALE);
                if (max_load > tmp)
                        pwr_move += busiest->__cpu_power *
                                min(busiest_load_per_task, max_load - tmp);

                /* Amount of load we'd add */
                if (max_load * busiest->__cpu_power <
                                busiest_load_per_task * SCHED_LOAD_SCALE)
                        tmp = sg_div_cpu_power(this,
                                        max_load * busiest->__cpu_power);
                else
                        tmp = sg_div_cpu_power(this,
                                busiest_load_per_task * SCHED_LOAD_SCALE);
                pwr_move += this->__cpu_power *
                                min(this_load_per_task, this_load + tmp);
                pwr_move /= SCHED_LOAD_SCALE;

                /* Move if we gain throughput */
                if (pwr_move <= pwr_now)
                        goto out_balanced;

                *imbalance = busiest_load_per_task;
        }

        return busiest;

out_balanced:
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
        if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
                goto ret;

        /*
         * Power savings balance: if the local group is the leader, hand
         * back the least loaded group with one task's worth of load.
         */
        if (this == group_leader && group_leader != group_min) {
                *imbalance = min_load_per_task;
                return group_min;
        }
#endif
ret:
        *imbalance = 0;
        return NULL;
}
2455
2456 /*
2457  * find_busiest_queue - find the busiest runqueue among the cpus in group.
2458  */
2459 static struct rq *
2460 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2461                    unsigned long imbalance, cpumask_t *cpus)
2462 {
2463         struct rq *busiest = NULL, *rq;
2464         unsigned long max_load = 0;
2465         int i;
2466
2467         for_each_cpu_mask(i, group->cpumask) {
2468                 unsigned long wl;
2469
2470                 if (!cpu_isset(i, *cpus))
2471                         continue;
2472
2473                 rq = cpu_rq(i);
2474                 wl = weighted_cpuload(i);
2475
2476                 if (rq->nr_running == 1 && wl > imbalance)
2477                         continue;
2478
2479                 if (wl > max_load) {
2480                         max_load = wl;
2481                         busiest = rq;
2482                 }
2483         }
2484
2485         return busiest;
2486 }
2487
/*
 * Max backoff if we encounter pinned tasks. The value is fairly
 * arbitrary; it only needs to be large enough.
 */
#define MAX_PINNED_INTERVAL     512
2493
/* Return n - 1, or 0 if n is already 0 (never wraps around). */
static inline unsigned long minus_1_or_zero(unsigned long n)
{
        if (n == 0)
                return 0;

        return n - 1;
}
2498
/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 *
 * this_cpu/this_rq: the cpu being balanced and its runqueue.
 * sd:               the domain to balance within.
 * idle:             idle state of this_cpu, passed on to the helpers and
 *                   used to index the schedstat counters.
 * balance:          must be non-NULL; find_busiest_group() sets it to 0
 *                   when this cpu should not do the balancing, in which
 *                   case the domain is treated as balanced.
 *
 * Returns the number of tasks moved, 0 if the domain is balanced, or -1
 * when nothing was moved, sd_idle is not set, sd has SD_SHARE_CPUPOWER
 * and the parent domain does not have SD_POWERSAVINGS_BALANCE.
 *
 * sd->nr_balance_failed and sd->balance_interval are updated as a side
 * effect, and active balancing may be kicked off on the busiest runqueue.
 */
static int load_balance(int this_cpu, struct rq *this_rq,
                        struct sched_domain *sd, enum cpu_idle_type idle,
                        int *balance)
{
        int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
        struct sched_group *group;
        unsigned long imbalance;
        struct rq *busiest;
        cpumask_t cpus = CPU_MASK_ALL;
        unsigned long flags;

        /*
         * When power savings policy is enabled for the parent domain, idle
         * sibling can pick up load irrespective of busy siblings. In this case,
         * let the state of idle sibling percolate up as CPU_IDLE, instead of
         * portraying it as CPU_NOT_IDLE.
         */
        if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                sd_idle = 1;

        schedstat_inc(sd, lb_cnt[idle]);

        /*
         * Re-entered with the busiest cpu removed from 'cpus' when all of
         * its tasks turned out to be pinned.
         */
redo:
        group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
                                   &cpus, balance);

        if (*balance == 0)
                goto out_balanced;

        if (!group) {
                schedstat_inc(sd, lb_nobusyg[idle]);
                goto out_balanced;
        }

        busiest = find_busiest_queue(group, idle, imbalance, &cpus);
        if (!busiest) {
                schedstat_inc(sd, lb_nobusyq[idle]);
                goto out_balanced;
        }

        BUG_ON(busiest == this_rq);

        schedstat_add(sd, lb_imbalance[idle], imbalance);

        nr_moved = 0;
        if (busiest->nr_running > 1) {
                /*
                 * Attempt to move tasks. If find_busiest_group has found
                 * an imbalance but busiest->nr_running <= 1, the group is
                 * still unbalanced. nr_moved simply stays zero, so it is
                 * correctly treated as an imbalance.
                 */
                local_irq_save(flags);
                double_rq_lock(this_rq, busiest);
                nr_moved = move_tasks(this_rq, this_cpu, busiest,
                                      minus_1_or_zero(busiest->nr_running),
                                      imbalance, sd, idle, &all_pinned);
                double_rq_unlock(this_rq, busiest);
                local_irq_restore(flags);

                /*
                 * some other cpu did the load balance for us.
                 */
                if (nr_moved && this_cpu != smp_processor_id())
                        resched_cpu(this_cpu);

                /* All tasks on this runqueue were pinned by CPU affinity */
                if (unlikely(all_pinned)) {
                        cpu_clear(cpu_of(busiest), cpus);
                        if (!cpus_empty(cpus))
                                goto redo;
                        goto out_balanced;
                }
        }

        if (!nr_moved) {
                schedstat_inc(sd, lb_failed[idle]);
                sd->nr_balance_failed++;

                /* Too many failed attempts: try active balancing */
                if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {

                        spin_lock_irqsave(&busiest->lock, flags);

                        /* don't kick the migration_thread, if the curr
                         * task on busiest cpu can't be moved to this_cpu
                         */
                        if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
                                spin_unlock_irqrestore(&busiest->lock, flags);
                                all_pinned = 1;
                                goto out_one_pinned;
                        }

                        /* Request a push to this_cpu unless one is pending */
                        if (!busiest->active_balance) {
                                busiest->active_balance = 1;
                                busiest->push_cpu = this_cpu;
                                active_balance = 1;
                        }
                        spin_unlock_irqrestore(&busiest->lock, flags);
                        if (active_balance)
                                wake_up_process(busiest->migration_thread);

                        /*
                         * We've kicked active balancing, reset the failure
                         * counter.
                         */
                        sd->nr_balance_failed = sd->cache_nice_tries+1;
                }
        } else
                sd->nr_balance_failed = 0;

        if (likely(!active_balance)) {
                /* We were unbalanced, so reset the balancing interval */
                sd->balance_interval = sd->min_interval;
        } else {
                /*
                 * If we've begun active balancing, start to back off. This
                 * case may not be covered by the all_pinned logic if there
                 * is only 1 task on the busy runqueue (because we don't call
                 * move_tasks).
                 */
                if (sd->balance_interval < sd->max_interval)
                        sd->balance_interval *= 2;
        }

        if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                return -1;
        return nr_moved;

out_balanced:
        schedstat_inc(sd, lb_balanced[idle]);

        sd->nr_balance_failed = 0;

out_one_pinned:
        /* tune up the balancing interval */
        if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
                        (sd->balance_interval < sd->max_interval))
                sd->balance_interval *= 2;

        if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                return -1;
        return 0;
}
2649
2650 /*
2651  * Check this_cpu to ensure it is balanced within domain. Attempt to move
2652  * tasks if there is an imbalance.
2653  *
2654  * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
2655  * this_rq is locked.
2656  */
2657 static int
2658 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2659 {
2660         struct sched_group *group;
2661         struct rq *busiest = NULL;
2662         unsigned long imbalance;
2663         int nr_moved = 0;
2664         int sd_idle = 0;
2665         cpumask_t cpus = CPU_MASK_ALL;
2666
2667         /*
2668          * When power savings policy is enabled for the parent domain, idle
2669          * sibling can pick up load irrespective of busy siblings. In this case,
2670          * let the state of idle sibling percolate up as IDLE, instead of
2671          * portraying it as CPU_NOT_IDLE.
2672          */
2673         if (sd->flags & SD_SHARE_CPUPOWER &&
2674             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2675                 sd_idle = 1;
2676
2677         schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
2678 redo:
2679         group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2680                                    &sd_idle, &cpus, NULL);
2681         if (!group) {
2682                 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
2683                 goto out_balanced;
2684         }
2685
2686         busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
2687                                 &cpus);
2688         if (!busiest) {
2689                 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
2690                 goto out_balanced;
2691         }
2692
2693         BUG_ON(busiest == this_rq);
2694
2695         schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
2696
2697         nr_moved = 0;
2698         if (busiest->nr_running > 1) {
2699                 /* Attempt to move tasks */
2700                 double_lock_balance(this_rq, busiest);
2701                 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2702                                         minus_1_or_zero(busiest->nr_running),
2703                                         imbalance, sd, CPU_NEWLY_IDLE, NULL);
2704                 spin_unlock(&busiest->lock);
2705
2706                 if (!nr_moved) {
2707                         cpu_clear(cpu_of(busiest), cpus);
2708                         if (!cpus_empty(cpus))
2709                                 goto redo;
2710                 }
2711         }
2712
2713         if (!nr_moved) {
2714                 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
2715                 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2716                     !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2717                         return -1;
2718         } else
2719                 sd->nr_balance_failed = 0;
2720
2721         return nr_moved;
2722
2723 out_balanced:
2724         schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
2725         if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2726             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2727                 return -1;
2728         sd->nr_balance_failed = 0;
2729
2730         return 0;
2731 }
2732
2733 /*
2734  * idle_balance is called by schedule() if this_cpu is about to become
2735  * idle. Attempts to pull tasks from other CPUs.
2736  */
2737 static void idle_balance(int this_cpu, struct rq *this_rq)
2738 {
2739         struct sched_domain *sd;
2740         int pulled_task = -1;
2741         unsigned long next_balance = jiffies + HZ;
2742
2743         for_each_domain(this_cpu, sd) {
2744                 unsigned long interval;
2745
2746                 if (!(sd->flags & SD_LOAD_BALANCE))
2747                         continue;
2748
2749                 if (sd->flags & SD_BALANCE_NEWIDLE)
2750                         /* If we've pulled tasks over stop searching: */
2751                         pulled_task = load_balance_newidle(this_cpu,
2752                                                                 this_rq, sd);
2753
2754                 interval = msecs_to_jiffies(sd->balance_interval);
2755                 if (time_after(next_balance, sd->last_balance + interval))
2756                         next_balance = sd->last_balance + interval;
2757                 if (pulled_task)
2758                         break;
2759         }
2760         if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
2761                 /*
2762                  * We are going idle. next_balance may be set based on
2763                  * a busy processor. So reset next_balance.
2764                  */
2765                 this_rq->next_balance = next_balance;
2766         }
2767 }
2768
2769 /*
2770  * active_load_balance is run by migration threads. It pushes running tasks
2771  * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2772  * running on each physical CPU where possible, and avoids physical /
2773  * logical imbalances.
2774  *
2775  * Called with busiest_rq locked.
2776  */
2777 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2778 {
2779         int target_cpu = busiest_rq->push_cpu;
2780         struct sched_domain *sd;
2781         struct rq *target_rq;
2782
2783         /* Is there any task to move? */
2784         if (busiest_rq->nr_running <= 1)
2785                 return;
2786
2787         target_rq = cpu_rq(target_cpu);
2788
2789         /*
2790          * This condition is "impossible", if it occurs
2791          * we need to fix it.  Originally reported by
2792          * Bjorn Helgaas on a 128-cpu setup.
2793          */
2794         BUG_ON(busiest_rq == target_rq);
2795
2796         /* move a task from busiest_rq to target_rq */
2797         double_lock_balance(busiest_rq, target_rq);
2798
2799         /* Search for an sd spanning us and the target CPU. */
2800         for_each_domain(target_cpu, sd) {
2801                 if ((sd->flags & SD_LOAD_BALANCE) &&
2802                     cpu_isset(busiest_cpu, sd->span))
2803                                 break;
2804         }
2805
2806         if (likely(sd)) {
2807                 schedstat_inc(sd, alb_cnt);
2808
2809                 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
2810                                RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
2811                                NULL))
2812                         schedstat_inc(sd, alb_pushed);
2813                 else
2814                         schedstat_inc(sd, alb_failed);
2815         }
2816         spin_unlock(&target_rq->lock);
2817 }
2818
2819 #ifdef CONFIG_NO_HZ
/*
 * State shared by the idle load balancing (ilb) code:
 *
 *  load_balancer - id of the cpu currently owning idle load balancing,
 *                  or -1 while there is no owner (the initial state).
 *  cpu_mask      - cpus that have stopped their tick; bits are set and
 *                  cleared in select_nohz_load_balancer(), and the owner's
 *                  bit is also cleared in trigger_load_balance().
 */
static struct {
        atomic_t load_balancer;
        cpumask_t  cpu_mask;
} nohz ____cacheline_aligned = {
        .load_balancer = ATOMIC_INIT(-1),
        .cpu_mask = CPU_MASK_NONE,
};
2827
2828 /*
2829  * This routine will try to nominate the ilb (idle load balancing)
2830  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
2831  * load balancing on behalf of all those cpus. If all the cpus in the system
2832  * go into this tickless mode, then there will be no ilb owner (as there is
2833  * no need for one) and all the cpus will sleep till the next wakeup event
2834  * arrives...
2835  *
2836  * For the ilb owner, tick is not stopped. And this tick will be used
2837  * for idle load balancing. ilb owner will still be part of
2838  * nohz.cpu_mask..
2839  *
2840  * While stopping the tick, this cpu will become the ilb owner if there
2841  * is no other owner. And will be the owner till that cpu becomes busy
2842  * or if all cpus in the system stop their ticks at which point
2843  * there is no need for ilb owner.
2844  *
2845  * When the ilb owner becomes busy, it nominates another owner, during the
2846  * next busy scheduler_tick()
2847  */
2848 int select_nohz_load_balancer(int stop_tick)
2849 {
2850         int cpu = smp_processor_id();
2851
2852         if (stop_tick) {
2853                 cpu_set(cpu, nohz.cpu_mask);
2854                 cpu_rq(cpu)->in_nohz_recently = 1;
2855
2856                 /*
2857                  * If we are going offline and still the leader, give up!
2858                  */
2859                 if (cpu_is_offline(cpu) &&
2860                     atomic_read(&nohz.load_balancer) == cpu) {
2861                         if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2862                                 BUG();
2863                         return 0;
2864                 }
2865
2866                 /* time for ilb owner also to sleep */
2867                 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
2868                         if (atomic_read(&nohz.load_balancer) == cpu)
2869                                 atomic_set(&nohz.load_balancer, -1);
2870                         return 0;
2871                 }
2872
2873                 if (atomic_read(&nohz.load_balancer) == -1) {
2874                         /* make me the ilb owner */
2875                         if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
2876                                 return 1;
2877                 } else if (atomic_read(&nohz.load_balancer) == cpu)
2878                         return 1;
2879         } else {
2880                 if (!cpu_isset(cpu, nohz.cpu_mask))
2881                         return 0;
2882
2883                 cpu_clear(cpu, nohz.cpu_mask);
2884
2885                 if (atomic_read(&nohz.load_balancer) == cpu)
2886                         if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2887                                 BUG();
2888         }
2889         return 0;
2890 }
2891 #endif
2892
2893 static DEFINE_SPINLOCK(balancing);
2894
2895 /*
2896  * It checks each scheduling domain to see if it is due to be balanced,
2897  * and initiates a balancing operation if so.
2898  *
2899  * Balancing parameters are set up in arch_init_sched_domains.
2900  */
2901 static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
2902 {
2903         int balance = 1;
2904         struct rq *rq = cpu_rq(cpu);
2905         unsigned long interval;
2906         struct sched_domain *sd;
2907         /* Earliest time when we have to do rebalance again */
2908         unsigned long next_balance = jiffies + 60*HZ;
2909
2910         for_each_domain(cpu, sd) {
2911                 if (!(sd->flags & SD_LOAD_BALANCE))
2912                         continue;
2913
2914                 interval = sd->balance_interval;
2915                 if (idle != CPU_IDLE)
2916                         interval *= sd->busy_factor;
2917
2918                 /* scale ms to jiffies */
2919                 interval = msecs_to_jiffies(interval);
2920                 if (unlikely(!interval))
2921                         interval = 1;
2922                 if (interval > HZ*NR_CPUS/10)
2923                         interval = HZ*NR_CPUS/10;
2924
2925
2926                 if (sd->flags & SD_SERIALIZE) {
2927                         if (!spin_trylock(&balancing))
2928                                 goto out;
2929                 }
2930
2931                 if (time_after_eq(jiffies, sd->last_balance + interval)) {
2932                         if (load_balance(cpu, rq, sd, idle, &balance)) {
2933                                 /*
2934                                  * We've pulled tasks over so either we're no
2935                                  * longer idle, or one of our SMT siblings is
2936                                  * not idle.
2937                                  */
2938                                 idle = CPU_NOT_IDLE;
2939                         }
2940                         sd->last_balance = jiffies;
2941                 }
2942                 if (sd->flags & SD_SERIALIZE)
2943                         spin_unlock(&balancing);
2944 out:
2945                 if (time_after(next_balance, sd->last_balance + interval))
2946                         next_balance = sd->last_balance + interval;
2947
2948                 /*
2949                  * Stop the load balance at this level. There is another
2950                  * CPU in our sched group which is doing load balancing more
2951                  * actively.
2952                  */
2953                 if (!balance)
2954                         break;
2955         }
2956         rq->next_balance = next_balance;
2957 }
2958
2959 /*
2960  * run_rebalance_domains is triggered when needed from the scheduler tick.
2961  * In CONFIG_NO_HZ case, the idle load balance owner will do the
2962  * rebalancing for all the cpus for whom scheduler ticks are stopped.
2963  */
2964 static void run_rebalance_domains(struct softirq_action *h)
2965 {
2966         int this_cpu = smp_processor_id();
2967         struct rq *this_rq = cpu_rq(this_cpu);
2968         enum cpu_idle_type idle = this_rq->idle_at_tick ?
2969                                                 CPU_IDLE : CPU_NOT_IDLE;
2970
2971         rebalance_domains(this_cpu, idle);
2972
2973 #ifdef CONFIG_NO_HZ
2974         /*
2975          * If this cpu is the owner for idle load balancing, then do the
2976          * balancing on behalf of the other idle cpus whose ticks are
2977          * stopped.
2978          */
2979         if (this_rq->idle_at_tick &&
2980             atomic_read(&nohz.load_balancer) == this_cpu) {
2981                 cpumask_t cpus = nohz.cpu_mask;
2982                 struct rq *rq;
2983                 int balance_cpu;
2984
2985                 cpu_clear(this_cpu, cpus);
2986                 for_each_cpu_mask(balance_cpu, cpus) {
2987                         /*
2988                          * If this cpu gets work to do, stop the load balancing
2989                          * work being done for other cpus. Next load
2990                          * balancing owner will pick it up.
2991                          */
2992                         if (need_resched())
2993                                 break;
2994
2995                         rebalance_domains(balance_cpu, SCHED_IDLE);
2996
2997                         rq = cpu_rq(balance_cpu);
2998                         if (time_after(this_rq->next_balance, rq->next_balance))
2999                                 this_rq->next_balance = rq->next_balance;
3000                 }
3001         }
3002 #endif
3003 }
3004
/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 *
 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
 * idle load balancing owner or decide to stop the periodic load balancing,
 * if the whole system is idle.
 *
 * @rq:  runqueue of @cpu; its in_nohz_recently, idle_at_tick and
 *       next_balance fields are consulted.
 * @cpu: the cpu the check is made for.
 */
static inline void trigger_load_balance(struct rq *rq, int cpu)
{
#ifdef CONFIG_NO_HZ
        /*
         * If we were in the nohz mode recently and busy at the current
         * scheduler tick, then check if we need to nominate new idle
         * load balancer.
         */
        if (rq->in_nohz_recently && !rq->idle_at_tick) {
                rq->in_nohz_recently = 0;

                /* A busy cpu gives up ownership and leaves the nohz mask. */
                if (atomic_read(&nohz.load_balancer) == cpu) {
                        cpu_clear(cpu, nohz.cpu_mask);
                        atomic_set(&nohz.load_balancer, -1);
                }

                if (atomic_read(&nohz.load_balancer) == -1) {
                        /*
                         * simple selection for now: Nominate the
                         * first cpu in the nohz list to be the next
                         * ilb owner.
                         *
                         * TBD: Traverse the sched domains and nominate
                         * the nearest cpu in the nohz.cpu_mask.
                         */
                        int ilb = first_cpu(nohz.cpu_mask);

                        /*
                         * NR_CPUS means the mask is empty. Otherwise kick
                         * the chosen cpu; presumably it then claims
                         * ownership on its way back into tickless idle --
                         * TODO confirm against the nohz idle path.
                         */
                        if (ilb != NR_CPUS)
                                resched_cpu(ilb);
                }
        }

        /*
         * If this cpu is idle and doing idle load balancing for all the
         * cpus with ticks stopped, is it time for that to stop?
         */
        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
            cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
                resched_cpu(cpu);
                return;
        }

        /*
         * If this cpu is idle and the idle load balancing is done by
         * someone else, then there is no need to raise the SCHED_SOFTIRQ
         */
        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
            cpu_isset(cpu, nohz.cpu_mask))
                return;
#endif
        /* Periodic balancing is due once jiffies reaches rq->next_balance. */
        if (time_after_eq(jiffies, rq->next_balance))
                raise_softirq(SCHED_SOFTIRQ);
}
3065
3066 #else   /* CONFIG_SMP */
3067
/*
 * on UP we do not need to balance between CPUs, so this variant of
 * idle_balance() is an empty stub that ignores both of its arguments.
 */
static inline void idle_balance(int cpu, struct rq *rq)
{
}
3074
/*
 * Avoid "used but not defined" warning on UP.
 *
 * Stub version of balance_tasks() for the non-SMP build: it reports that
 * no load was moved by storing 0 in *load_moved and returns 0. All other
 * parameters are ignored.
 */
static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
                      unsigned long max_nr_move, unsigned long max_load_move,
                      struct sched_domain *sd, enum cpu_idle_type idle,
                      int *all_pinned, unsigned long *load_moved,
                      int this_best_prio, int best_prio, int best_prio_seen,
                      struct rq_iterator *iterator)
{
        *load_moved = 0;

        return 0;
}
3087
3088 #endif
3089
/*
 * Per-cpu kernel statistics, exported for use outside this file.
 * NOTE(review): presumably this is the object behind kstat_this_cpu, whose
 * cpustat member the account_*_time() functions update -- confirm against
 * the kernel_stat header.
 */
DEFINE_PER_CPU(struct kernel_stat, kstat);

EXPORT_PER_CPU_SYMBOL(kstat);
3093
3094 /*
3095  * Return p->sum_exec_runtime plus any more ns on the sched_clock
3096  * that have not yet been banked in case the task is currently running.
3097  */
3098 unsigned long long task_sched_runtime(struct task_struct *p)
3099 {
3100         unsigned long flags;
3101         u64 ns, delta_exec;
3102         struct rq *rq;
3103
3104         rq = task_rq_lock(p, &flags);
3105         ns = p->se.sum_exec_runtime;
3106         if (rq->curr == p) {
3107                 delta_exec = rq_clock(rq) - p->se.exec_start;
3108                 if ((s64)delta_exec > 0)
3109                         ns += delta_exec;
3110         }
3111         task_rq_unlock(rq, &flags);
3112
3113         return ns;
3114 }
3115
3116 /*
3117  * Account user cpu time to a process.
3118  * @p: the process that the cpu time gets accounted to
3119  * @hardirq_offset: the offset to subtract from hardirq_count()
3120  * @cputime: the cpu time spent in user space since the last update
3121  */
3122 void account_user_time(struct task_struct *p, cputime_t cputime)
3123 {
3124         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3125         cputime64_t tmp;
3126
3127         p->utime = cputime_add(p->utime, cputime);
3128
3129         /* Add user time to cpustat. */
3130         tmp = cputime_to_cputime64(cputime);
3131         if (TASK_NICE(p) > 0)
3132                 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3133         else
3134                 cpustat->user = cputime64_add(cpustat->user, tmp);
3135 }
3136
3137 /*
3138  * Account system cpu time to a process.
3139  * @p: the process that the cpu time gets accounted to
3140  * @hardirq_offset: the offset to subtract from hardirq_count()
3141  * @cputime: the cpu time spent in kernel space since the last update
3142  */
3143 void account_system_time(struct task_struct *p, int hardirq_offset,
3144                          cputime_t cputime)
3145 {
3146         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3147         struct rq *rq = this_rq();
3148         cputime64_t tmp;
3149
3150         p->stime = cputime_add(p->stime, cputime);
3151
3152         /* Add system time to cpustat. */
3153         tmp = cputime_to_cputime64(cputime);
3154         if (hardirq_count() - hardirq_offset)
3155                 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3156         else if (softirq_count())
3157                 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3158         else if (p != rq->idle)
3159                 cpustat->system = cputime64_add(cpustat->system, tmp);
3160         else if (atomic_read(&rq->nr_iowait) > 0)
3161                 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3162         else
3163                 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3164         /* Account for system time used */
3165         acct_update_integrals(p);
3166 }
3167
3168 /*
3169  * Account for involuntary wait time.
3170  * @p: the process from which the cpu time has been stolen
3171  * @steal: the cpu time spent in involuntary wait
3172  */
3173 void account_steal_time(struct task_struct *p, cputime_t steal)
3174 {
3175         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3176         cputime64_t tmp = cputime_to_cputime64(steal);
3177         struct rq *rq = this_rq();
3178
3179         if (p == rq->idle) {
3180                 p->stime = cputime_add(p->stime, steal);
3181                 if (atomic_read(&rq->nr_iowait) > 0)
3182                         cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3183                 else
3184                         cpustat->idle = cputime64_add(cpustat->idle, tmp);
3185         } else
3186                 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3187 }
3188
/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 *
 * It also gets called by the fork code, when changing the parent's
 * timeslices.
 * NOTE(review): no such fork-side caller is visible in this part of the
 * file -- confirm that this sentence is still accurate.
 *
 * Under the runqueue lock it lets the scheduling class of the current task
 * process the tick (skipped for the idle task) and updates the cpu load.
 * On SMP it then records whether the cpu was idle at this tick and checks
 * whether load balancing has to be triggered.
 */
void scheduler_tick(void)
{
        int cpu = smp_processor_id();
        struct rq *rq = cpu_rq(cpu);
        /* Note: rq->curr is sampled before rq->lock is taken. */
        struct task_struct *curr = rq->curr;

        spin_lock(&rq->lock);
        if (curr != rq->idle) /* FIXME: needed? */
                curr->sched_class->task_tick(rq, curr);
        update_cpu_load(rq);
        spin_unlock(&rq->lock);

#ifdef CONFIG_SMP
        /* idle_at_tick is what trigger_load_balance() bases its checks on. */
        rq->idle_at_tick = idle_cpu(cpu);
        trigger_load_balance(rq, cpu);
#endif
}
3213
3214 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3215
/*
 * Debugging version of add_preempt_count(): add val to the preempt count,
 * with sanity checks before and after the update.
 *
 * If the count is already negative a warning is issued through
 * DEBUG_LOCKS_WARN_ON() and the count is left unchanged.
 */
void fastcall add_preempt_count(int val)
{
        /*
         * Underflow?
         */
        if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
                return;
        preempt_count() += val;
        /*
         * Spinlock count overflowing soon?
         * (warn once the PREEMPT_MASK portion is within 10 of its maximum)
         */
        DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
                                PREEMPT_MASK - 10);
}
EXPORT_SYMBOL(add_preempt_count);
3231
/*
 * Debugging version of sub_preempt_count(): subtract val from the preempt
 * count, unless one of the sanity checks fires. In that case a warning is
 * issued through DEBUG_LOCKS_WARN_ON() and the count is left unchanged.
 */
void fastcall sub_preempt_count(int val)
{
        /*
         * Underflow?
         * (more is being subtracted than the count currently holds)
         */
        if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
                return;
        /*
         * Is the spinlock portion underflowing?
         * (val is below PREEMPT_MASK but the PREEMPT_MASK bits are all 0)
         */
        if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
                        !(preempt_count() & PREEMPT_MASK)))
                return;

        preempt_count() -= val;
}
EXPORT_SYMBOL(sub_preempt_count);
3249
3250 #endif
3251
/*
 * Print scheduling while atomic bug:
 *
 * Logs the name, preempt count and pid of prev, shows the locks it holds,
 * prints the irq trace events if interrupts are currently disabled, and
 * finally dumps the stack.
 */
static noinline void __schedule_bug(struct task_struct *prev)
{
        printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
                prev->comm, preempt_count(), prev->pid);
        debug_show_held_locks(prev);
        if (irqs_disabled())
                print_irqtrace_events(prev);
        dump_stack();
}
3264
3265 /*
3266  * Various schedule()-time debugging checks and statistics:
3267  */
3268 static inline void schedule_debug(struct task_struct *prev)
3269 {
3270         /*
3271          * Test if we are atomic.  Since do_exit() needs to call into
3272          * schedule() atomically, we ignore that path for now.
3273          * Otherwise, whine if we are scheduling when we should not be.
3274          */
3275         if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3276                 __schedule_bug(prev);
3277
3278         profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3279
3280         schedstat_inc(this_rq(), sched_cnt);
3281 }
3282
3283 /*
3284  * Pick up the highest-prio task:
3285  */
3286 static inline struct task_struct *
3287 pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3288 {
3289         struct sched_class *class;
3290         struct task_struct *p;
3291
3292         /*
3293          * Optimization: we know that if all tasks are in
3294          * the fair class we can call that function directly:
3295          */
3296         if (likely(rq->nr_running == rq->cfs.nr_running)) {
3297                 p = fair_sched_class.pick_next_task(rq, now);
3298                 if (likely(p))
3299                         return p;
3300         }
3301
3302         class = sched_class_highest;
3303         for ( ; ; ) {
3304                 p = class->pick_next_task(rq, now);
3305                 if (p)
3306                         return p;
3307                 /*
3308                  * Will never be NULL as the idle class always
3309                  * returns a non-NULL p:
3310                  */
3311                 class = class->next;
3312         }
3313 }
3314
/*
 * schedule() is the main scheduler function.
 *
 * It runs with preemption disabled and, for the central part, with the
 * runqueue lock held and interrupts off: the previous task is taken off
 * the runqueue if it is going to sleep, the next task is picked, and a
 * context switch is performed if the two differ. The whole sequence is
 * repeated for as long as TIF_NEED_RESCHED is found set at the end.
 */
asmlinkage void __sched schedule(void)
{
        struct task_struct *prev, *next;
        long *switch_count;
        struct rq *rq;
        u64 now;
        int cpu;

need_resched:
        preempt_disable();
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
        rcu_qsctr_inc(cpu);
        prev = rq->curr;
        /* Counted in nivcsw unless the task turns out to be going to sleep. */
        switch_count = &prev->nivcsw;

        release_kernel_lock(prev);
need_resched_nonpreemptible:

        schedule_debug(prev);

        spin_lock_irq(&rq->lock);
        clear_tsk_need_resched(prev);

        /*
         * A non-running state outside of a PREEMPT_ACTIVE entry: take the
         * task off the runqueue, unless it sleeps interruptibly and already
         * has a signal pending, in which case it is made runnable again.
         * In both cases the switch is counted in nvcsw.
         */
        if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
                if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
                                unlikely(signal_pending(prev)))) {
                        prev->state = TASK_RUNNING;
                } else {
                        deactivate_task(rq, prev, 1);
                }
                switch_count = &prev->nvcsw;
        }

        /* Nothing left to run here: try to pull work from other cpus. */
        if (unlikely(!rq->nr_running))
                idle_balance(cpu, rq);

        now = __rq_clock(rq);
        prev->sched_class->put_prev_task(rq, prev, now);
        next = pick_next_task(rq, prev, now);

        sched_info_switch(prev, next);

        if (likely(prev != next)) {
                rq->nr_switches++;
                rq->curr = next;
                ++*switch_count;

                context_switch(rq, prev, next); /* unlocks the rq */
        } else
                spin_unlock_irq(&rq->lock);

        /*
         * NOTE(review): on this retry path only cpu and rq are refreshed;
         * prev and switch_count keep the values from the previous pass --
         * confirm that this is intended.
         */
        if (unlikely(reacquire_kernel_lock(current) < 0)) {
                cpu = smp_processor_id();
                rq = cpu_rq(cpu);
                goto need_resched_nonpreemptible;
        }
        preempt_enable_no_resched();
        if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
                goto need_resched;
}
EXPORT_SYMBOL(schedule);
3380
3381 #ifdef CONFIG_PREEMPT
3382 /*
3383  * this is the entry point to schedule() from in-kernel preemption
3384  * off of preempt_enable.  Kernel preemptions off return from interrupt
3385  * occur there and call schedule directly.
3386  */
3387 asmlinkage void __sched preempt_schedule(void)
3388 {
3389         struct thread_info *ti = current_thread_info();
3390 #ifdef CONFIG_PREEMPT_BKL
3391         struct task_struct *task = current;
3392         int saved_lock_depth;
3393 #endif
3394         /*
3395          * If there is a non-zero preempt_count or interrupts are disabled,
3396          * we do not want to preempt the current task.  Just return..
3397          */
3398         if (likely(ti->preempt_count || irqs_disabled()))
3399                 return;
3400
3401 need_resched:
3402         add_preempt_count(PREEMPT_ACTIVE);
3403         /*
3404          * We keep the big kernel semaphore locked, but we
3405          * clear ->lock_depth so that schedule() doesnt
3406          * auto-release the semaphore:
3407          */
3408 #ifdef CONFIG_PREEMPT_BKL
3409         saved_lock_depth = task->lock_depth;
3410         task->lock_depth = -1;
3411 #endif
3412         schedule();
3413 #ifdef CONFIG_PREEMPT_BKL
3414         task->lock_depth = saved_lock_depth;
3415 #endif
3416         sub_preempt_count(PREEMPT_ACTIVE);
3417
3418         /* we could miss a preemption opportunity between schedule and now */
3419         barrier();
3420         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3421                 goto need_resched;
3422 }
3423 EXPORT_SYMBOL(preempt_schedule);
3424
3425 /*
3426  * this is the entry point to schedule() from kernel preemption
3427  * off of irq context.
3428  * Note, that this is called and return with irqs disabled. This will
3429  * protect us against recursive calling from irq.
3430  */
3431 asmlinkage void __sched preempt_schedule_irq(void)
3432 {
3433         struct thread_info *ti = current_thread_info();
3434 #ifdef CONFIG_PREEMPT_BKL
3435         struct task_struct *task = current;
3436         int saved_lock_depth;
3437 #endif
3438         /* Catch callers which need to be fixed */
3439         BUG_ON(ti->preempt_count || !irqs_disabled());
3440
3441 need_resched:
3442         add_preempt_count(PREEMPT_ACTIVE);
3443         /*
3444          * We keep the big kernel semaphore locked, but we
3445          * clear ->lock_depth so that schedule() doesnt
3446          * auto-release the semaphore:
3447          */
3448 #ifdef CONFIG_PREEMPT_BKL
3449         saved_lock_depth = task->lock_depth;
3450         task->lock_depth = -1;
3451 #endif
3452         local_irq_enable();
3453         schedule();
3454         local_irq_disable();
3455 #ifdef CONFIG_PREEMPT_BKL
3456         task->lock_depth = saved_lock_depth;
3457 #endif
3458         sub_preempt_count(PREEMPT_ACTIVE);
3459
3460         /* we could miss a preemption opportunity between schedule and now */
3461         barrier();
3462         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3463                 goto need_resched;
3464 }
3465
3466 #endif /* CONFIG_PREEMPT */
3467
3468 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3469                           void *key)
3470 {
3471         return try_to_wake_up(curr->private, mode, sync);
3472 }
3473 EXPORT_SYMBOL(default_wake_function);
3474
3475 /*
3476  * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
3477  * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
3478  * number) then we wake all the non-exclusive tasks and one exclusive task.
3479  *
3480  * There are circumstances in which we can try to wake a task which has already
3481  * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
3482  * zero in this (rare) case, and we handle it by continuing to scan the queue.
3483  */
3484 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3485                              int nr_exclusive, int sync, void *key)
3486 {
3487         struct list_head *tmp, *next;
3488
3489         list_for_each_safe(tmp, next, &q->task_list) {
3490                 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3491                 unsigned flags = curr->flags;
3492
3493                 if (curr->func(curr, mode, sync, key) &&
3494                                 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3495                         break;
3496         }
3497 }
3498
3499 /**
3500  * __wake_up - wake up threads blocked on a waitqueue.
3501  * @q: the waitqueue
3502  * @mode: which threads
3503  * @nr_exclusive: how many wake-one or wake-many threads to wake up
3504  * @key: is directly passed to the wakeup function
3505  */
3506 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3507                         int nr_exclusive, void *key)
3508 {
3509         unsigned long flags;
3510
3511         spin_lock_irqsave(&q->lock, flags);
3512         __wake_up_common(q, mode, nr_exclusive, 0, key);
3513         spin_unlock_irqrestore(&q->lock, flags);
3514 }
3515 EXPORT_SYMBOL(__wake_up);
3516
3517 /*
3518  * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3519  */
3520 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3521 {
3522         __wake_up_common(q, mode, 1, 0, NULL);
3523 }
3524
3525 /**
3526  * __wake_up_sync - wake up threads blocked on a waitqueue.
3527  * @q: the waitqueue
3528  * @mode: which threads
3529  * @nr_exclusive: how many wake-one or wake-many threads to wake up
3530  *
3531  * The sync wakeup differs that the waker knows that it will schedule
3532  * away soon, so while the target thread will be woken up, it will not
3533  * be migrated to another CPU - ie. the two threads are 'synchronized'
3534  * with each other. This can prevent needless bouncing between CPUs.
3535  *
3536  * On UP it can prevent extra preemption.
3537  */
3538 void fastcall
3539 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3540 {
3541         unsigned long flags;
3542         int sync = 1;
3543
3544         if (unlikely(!q))
3545                 return;
3546
3547         if (unlikely(!nr_exclusive))
3548                 sync = 0;
3549
3550         spin_lock_irqsave(&q->lock, flags);
3551         __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3552         spin_unlock_irqrestore(&q->lock, flags);
3553 }
3554 EXPORT_SYMBOL_GPL(__wake_up_sync);      /* For internal use only */
3555
3556 void fastcall complete(struct completion *x)
3557 {
3558         unsigned long flags;
3559
3560         spin_lock_irqsave(&x->wait.lock, flags);
3561         x->done++;
3562         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3563                          1, 0, NULL);
3564         spin_unlock_irqrestore(&x->wait.lock, flags);
3565 }
3566 EXPORT_SYMBOL(complete);
3567
3568 void fastcall complete_all(struct completion *x)
3569 {
3570         unsigned long flags;
3571
3572         spin_lock_irqsave(&x->wait.lock, flags);
3573         x->done += UINT_MAX/2;
3574         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3575                          0, 0, NULL);
3576         spin_unlock_irqrestore(&x->wait.lock, flags);
3577 }
3578 EXPORT_SYMBOL(complete_all);
3579
3580 void fastcall __sched wait_for_completion(struct completion *x)
3581 {
3582         might_sleep();
3583
3584         spin_lock_irq(&x->wait.lock);
3585         if (!x->done) {
3586                 DECLARE_WAITQUEUE(wait, current);
3587
3588                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3589                 __add_wait_queue_tail(&x->wait, &wait);
3590                 do {
3591                         __set_current_state(TASK_UNINTERRUPTIBLE);
3592                         spin_unlock_irq(&x->wait.lock);
3593                         schedule();
3594                         spin_lock_irq(&x->wait.lock);
3595                 } while (!x->done);
3596                 __remove_wait_queue(&x->wait, &wait);
3597         }
3598         x->done--;
3599         spin_unlock_irq(&x->wait.lock);
3600 }
3601 EXPORT_SYMBOL(wait_for_completion);
3602
3603 unsigned long fastcall __sched
3604 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3605 {
3606         might_sleep();
3607
3608         spin_lock_irq(&x->wait.lock);
3609         if (!x->done) {
3610                 DECLARE_WAITQUEUE(wait, current);
3611
3612                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3613                 __add_wait_queue_tail(&x->wait, &wait);
3614                 do {
3615                         __set_current_state(TASK_UNINTERRUPTIBLE);
3616                         spin_unlock_irq(&x->wait.lock);
3617                         timeout = schedule_timeout(timeout);
3618                         spin_lock_irq(&x->wait.lock);
3619                         if (!timeout) {
3620                                 __remove_wait_queue(&x->wait, &wait);
3621                                 goto out;
3622                         }
3623                 } while (!x->done);
3624                 __remove_wait_queue(&x->wait, &wait);
3625         }
3626         x->done--;
3627 out:
3628         spin_unlock_irq(&x->wait.lock);
3629         return timeout;
3630 }
3631 EXPORT_SYMBOL(wait_for_completion_timeout);
3632
3633 int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3634 {
3635         int ret = 0;
3636
3637         might_sleep();
3638
3639         spin_lock_irq(&x->wait.lock);
3640         if (!x->done) {
3641                 DECLARE_WAITQUEUE(wait, current);
3642
3643                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3644                 __add_wait_queue_tail(&x->wait, &wait);
3645                 do {
3646                         if (signal_pending(current)) {
3647                                 ret = -ERESTARTSYS;
3648                                 __remove_wait_queue(&x->wait, &wait);
3649                                 goto out;
3650                         }
3651                         __set_current_state(TASK_INTERRUPTIBLE);
3652                         spin_unlock_irq(&x->wait.lock);
3653                         schedule();
3654                         spin_lock_irq(&x->wait.lock);
3655                 } while (!x->done);
3656                 __remove_wait_queue(&x->wait, &wait);
3657         }
3658         x->done--;
3659 out:
3660         spin_unlock_irq(&x->wait.lock);
3661
3662         return ret;
3663 }
3664 EXPORT_SYMBOL(wait_for_completion_interruptible);
3665
3666 unsigned long fastcall __sched
3667 wait_for_completion_interruptible_timeout(struct completion *x,
3668                                           unsigned long timeout)
3669 {
3670         might_sleep();
3671
3672         spin_lock_irq(&x->wait.lock);
3673         if (!x->done) {
3674                 DECLARE_WAITQUEUE(wait, current);
3675
3676                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3677                 __add_wait_queue_tail(&x->wait, &wait);
3678                 do {
3679                         if (signal_pending(current)) {
3680                                 timeout = -ERESTARTSYS;
3681                                 __remove_wait_queue(&x->wait, &wait);
3682                                 goto out;
3683                         }
3684                         __set_current_state(TASK_INTERRUPTIBLE);
3685                         spin_unlock_irq(&x->wait.lock);
3686                         timeout = schedule_timeout(timeout);
3687                         spin_lock_irq(&x->wait.lock);
3688                         if (!timeout) {
3689                                 __remove_wait_queue(&x->wait, &wait);
3690                                 goto out;
3691                         }
3692                 } while (!x->done);
3693                 __remove_wait_queue(&x->wait, &wait);
3694         }
3695         x->done--;
3696 out:
3697         spin_unlock_irq(&x->wait.lock);
3698         return timeout;
3699 }
3700 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3701
3702 static inline void
3703 sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
3704 {
3705         spin_lock_irqsave(&q->lock, *flags);
3706         __add_wait_queue(q, wait);
3707         spin_unlock(&q->lock);
3708 }
3709
3710 static inline void
3711 sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
3712 {
3713         spin_lock_irq(&q->lock);
3714         __remove_wait_queue(q, wait);
3715         spin_unlock_irqrestore(&q->lock, *flags);
3716 }
3717
3718 void __sched interruptible_sleep_on(wait_queue_head_t *q)
3719 {
3720         unsigned long flags;
3721         wait_queue_t wait;
3722
3723         init_waitqueue_entry(&wait, current);
3724
3725         current->state = TASK_INTERRUPTIBLE;
3726
3727         sleep_on_head(q, &wait, &flags);
3728         schedule();
3729         sleep_on_tail(q, &wait, &flags);
3730 }
3731 EXPORT_SYMBOL(interruptible_sleep_on);
3732
3733 long __sched
3734 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3735 {
3736         unsigned long flags;
3737         wait_queue_t wait;
3738
3739         init_waitqueue_entry(&wait, current);
3740
3741         current->state = TASK_INTERRUPTIBLE;
3742
3743         sleep_on_head(q, &wait, &flags);
3744         timeout = schedule_timeout(timeout);
3745         sleep_on_tail(q, &wait, &flags);
3746
3747         return timeout;
3748 }
3749 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3750
3751 void __sched sleep_on(wait_queue_head_t *q)
3752 {
3753         unsigned long flags;
3754         wait_queue_t wait;
3755
3756         init_waitqueue_entry(&wait, current);
3757
3758         current->state = TASK_UNINTERRUPTIBLE;
3759
3760         sleep_on_head(q, &wait, &flags);
3761         schedule();
3762         sleep_on_tail(q, &wait, &flags);
3763 }
3764 EXPORT_SYMBOL(sleep_on);
3765
3766 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3767 {
3768         unsigned long flags;
3769         wait_queue_t wait;
3770
3771         init_waitqueue_entry(&wait, current);
3772
3773         current->state = TASK_UNINTERRUPTIBLE;
3774
3775         sleep_on_head(q, &wait, &flags);
3776         timeout = schedule_timeout(timeout);
3777         sleep_on_tail(q, &wait, &flags);
3778
3779         return timeout;
3780 }
3781 EXPORT_SYMBOL(sleep_on_timeout);
3782
3783 #ifdef CONFIG_RT_MUTEXES
3784
3785 /*
3786  * rt_mutex_setprio - set the current priority of a task
3787  * @p: task
3788  * @prio: prio value (kernel-internal form)
3789  *
3790  * This function changes the 'effective' priority of a task. It does
3791  * not touch ->normal_prio like __setscheduler().
3792  *
3793  * Used by the rt_mutex code to implement priority inheritance logic.
3794  */
3795 void rt_mutex_setprio(struct task_struct *p, int prio)
3796 {
3797         unsigned long flags;
3798         int oldprio, on_rq;
3799         struct rq *rq;
3800         u64 now;
3801
3802         BUG_ON(prio < 0 || prio > MAX_PRIO);
3803
3804         rq = task_rq_lock(p, &flags);
3805         now = rq_clock(rq);
3806
3807         oldprio = p->prio;
3808         on_rq = p->se.on_rq;
3809         if (on_rq)
3810                 dequeue_task(rq, p, 0, now);
3811
3812         if (rt_prio(prio))
3813                 p->sched_class = &rt_sched_class;
3814         else
3815                 p->sched_class = &fair_sched_class;
3816
3817         p->prio = prio;
3818
3819         if (on_rq) {
3820                 enqueue_task(rq, p, 0, now);
3821                 /*
3822                  * Reschedule if we are currently running on this runqueue and
3823                  * our priority decreased, or if we are not currently running on
3824                  * this runqueue and our priority is higher than the current's
3825                  */
3826                 if (task_running(rq, p)) {
3827                         if (p->prio > oldprio)
3828                                 resched_task(rq->curr);
3829                 } else {
3830                         check_preempt_curr(rq, p);
3831                 }
3832         }
3833         task_rq_unlock(rq, &flags);
3834 }
3835
3836 #endif
3837
3838 void set_user_nice(struct task_struct *p, long nice)
3839 {
3840         int old_prio, delta, on_rq;
3841         unsigned long flags;
3842         struct rq *rq;
3843         u64 now;
3844
3845         if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3846                 return;
3847         /*
3848          * We have to be careful, if called from sys_setpriority(),
3849          * the task might be in the middle of scheduling on another CPU.
3850          */
3851         rq = task_rq_lock(p, &flags);
3852         now = rq_clock(rq);
3853         /*
3854          * The RT priorities are set via sched_setscheduler(), but we still
3855          * allow the 'normal' nice value to be set - but as expected
3856          * it wont have any effect on scheduling until the task is
3857          * SCHED_FIFO/SCHED_RR:
3858          */
3859         if (task_has_rt_policy(p)) {
3860                 p->static_prio = NICE_TO_PRIO(nice);
3861                 goto out_unlock;
3862         }
3863         on_rq = p->se.on_rq;
3864         if (on_rq) {
3865                 dequeue_task(rq, p, 0, now);
3866                 dec_load(rq, p, now);
3867         }
3868
3869         p->static_prio = NICE_TO_PRIO(nice);
3870         set_load_weight(p);
3871         old_prio = p->prio;
3872         p->prio = effective_prio(p);
3873         delta = p->prio - old_prio;
3874
3875         if (on_rq) {
3876                 enqueue_task(rq, p, 0, now);
3877                 inc_load(rq, p, now);
3878                 /*
3879                  * If the task increased its priority or is running and
3880                  * lowered its priority, then reschedule its CPU:
3881                  */
3882                 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3883                         resched_task(rq->curr);
3884         }
3885 out_unlock:
3886         task_rq_unlock(rq, &flags);
3887 }
3888 EXPORT_SYMBOL(set_user_nice);
3889
3890 /*
3891  * can_nice - check if a task can reduce its nice value
3892  * @p: task
3893  * @nice: nice value
3894  */
3895 int can_nice(const struct task_struct *p, const int nice)
3896 {
3897         /* convert nice value [19,-20] to rlimit style value [1,40] */
3898         int nice_rlim = 20 - nice;
3899
3900         return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3901                 capable(CAP_SYS_NICE));
3902 }
3903
3904 #ifdef __ARCH_WANT_SYS_NICE
3905
3906 /*
3907  * sys_nice - change the priority of the current process.
3908  * @increment: priority increment
3909  *
3910  * sys_setpriority is a more generic, but much slower function that
3911  * does similar things.
3912  */
3913 asmlinkage long sys_nice(int increment)
3914 {
3915         long nice, retval;
3916
3917         /*
3918          * Setpriority might change our priority at the same moment.
3919          * We don't have to worry. Conceptually one call occurs first
3920          * and we have a single winner.
3921          */
3922         if (increment < -40)
3923                 increment = -40;
3924         if (increment > 40)
3925                 increment = 40;
3926
3927         nice = PRIO_TO_NICE(current->static_prio) + increment;
3928         if (nice < -20)
3929                 nice = -20;
3930         if (nice > 19)
3931                 nice = 19;
3932
3933         if (increment < 0 && !can_nice(current, nice))
3934                 return -EPERM;
3935
3936         retval = security_task_setnice(current, nice);
3937         if (retval)
3938                 return retval;
3939
3940         set_user_nice(current, nice);
3941         return 0;
3942 }
3943
3944 #endif
3945
3946 /**
3947  * task_prio - return the priority value of a given task.
3948  * @p: the task in question.
3949  *
3950  * This is the priority value as seen by users in /proc.
3951  * RT tasks are offset by -200. Normal tasks are centered
3952  * around 0, value goes from -16 to +15.
3953  */
3954 int task_prio(const struct task_struct *p)
3955 {
3956         return p->prio - MAX_RT_PRIO;
3957 }
3958
/**
 * task_nice - return the nice value of a given task.
 * @p: the task in question.
 *
 * Thin exported wrapper around the TASK_NICE() macro.
 */
int task_nice(const struct task_struct *p)
{
	int nice = TASK_NICE(p);

	return nice;
}
EXPORT_SYMBOL_GPL(task_nice);
3968
3969 /**
3970  * idle_cpu - is a given cpu idle currently?
3971  * @cpu: the processor in question.
3972  */
3973 int idle_cpu(int cpu)
3974 {
3975         return cpu_curr(cpu) == cpu_rq(cpu)->idle;
3976 }
3977
3978 /**
3979  * idle_task - return the idle task for a given cpu.
3980  * @cpu: the processor in question.
3981  */
3982 struct task_struct *idle_task(int cpu)
3983 {
3984         return cpu_rq(cpu)->idle;
3985 }
3986
3987 /**
3988  * find_process_by_pid - find a process with a matching PID value.
3989  * @pid: the pid in question.
3990  */
3991 static inline struct task_struct *find_process_by_pid(pid_t pid)
3992 {
3993         return pid ? find_task_by_pid(pid) : current;
3994 }
3995
3996 /* Actually do priority change: must hold rq lock. */
3997 static void
3998 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
3999 {
4000         BUG_ON(p->se.on_rq);
4001
4002         p->policy = policy;
4003         switch (p->policy) {
4004         case SCHED_NORMAL:
4005         case SCHED_BATCH:
4006         case SCHED_IDLE:
4007                 p->sched_class = &fair_sched_class;
4008                 break;
4009         case SCHED_FIFO:
4010         case SCHED_RR:
4011                 p->sched_class = &rt_sched_class;
4012                 break;
4013         }
4014
4015         p->rt_priority = prio;
4016         p->normal_prio = normal_prio(p);
4017         /* we are holding p->pi_lock already */
4018         p->prio = rt_mutex_getprio(p);
4019         set_load_weight(p);
4020 }
4021
4022 /**
4023  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4024  * @p: the task in question.
4025  * @policy: new policy.
4026  * @param: structure containing the new RT priority.
4027  *
4028  * NOTE that the task may be already dead.
4029  */
4030 int sched_setscheduler(struct task_struct *p, int policy,
4031                        struct sched_param *param)
4032 {
4033         int retval, oldprio, oldpolicy = -1, on_rq;
4034         unsigned long flags;
4035         struct rq *rq;
4036
4037         /* may grab non-irq protected spin_locks */
4038         BUG_ON(in_interrupt());
4039 recheck:
4040         /* double check policy once rq lock held */
4041         if (policy < 0)
4042                 policy = oldpolicy = p->policy;
4043         else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4044                         policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4045                         policy != SCHED_IDLE)
4046                 return -EINVAL;
4047         /*
4048          * Valid priorities for SCHED_FIFO and SCHED_RR are
4049          * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4050          * SCHED_BATCH and SCHED_IDLE is 0.
4051          */
4052         if (param->sched_priority < 0 ||
4053             (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4054             (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4055                 return -EINVAL;
4056         if (rt_policy(policy) != (param->sched_priority != 0))
4057                 return -EINVAL;
4058
4059         /*
4060          * Allow unprivileged RT tasks to decrease priority:
4061          */
4062         if (!capable(CAP_SYS_NICE)) {
4063                 if (rt_policy(policy)) {
4064                         unsigned long rlim_rtprio;
4065
4066                         if (!lock_task_sighand(p, &flags))
4067                                 return -ESRCH;
4068                         rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4069                         unlock_task_sighand(p, &flags);
4070
4071                         /* can't set/change the rt policy */
4072                         if (policy != p->policy && !rlim_rtprio)
4073                                 return -EPERM;
4074
4075                         /* can't increase priority */
4076                         if (param->sched_priority > p->rt_priority &&
4077                             param->sched_priority > rlim_rtprio)
4078                                 return -EPERM;
4079                 }
4080                 /*
4081                  * Like positive nice levels, dont allow tasks to
4082                  * move out of SCHED_IDLE either:
4083                  */
4084                 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4085                         return -EPERM;
4086
4087                 /* can't change other user's priorities */
4088                 if ((current->euid != p->euid) &&
4089                     (current->euid != p->uid))
4090                         return -EPERM;
4091         }
4092
4093         retval = security_task_setscheduler(p, policy, param);
4094         if (retval)
4095                 return retval;
4096         /*
4097          * make sure no PI-waiters arrive (or leave) while we are
4098          * changing the priority of the task:
4099          */
4100         spin_lock_irqsave(&p->pi_lock, flags);
4101         /*
4102          * To be able to change p->policy safely, the apropriate
4103          * runqueue lock must be held.
4104          */
4105         rq = __task_rq_lock(p);
4106         /* recheck policy now with rq lock held */
4107         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4108                 policy = oldpolicy = -1;
4109                 __task_rq_unlock(rq);
4110                 spin_unlock_irqrestore(&p->pi_lock, flags);
4111                 goto recheck;
4112         }
4113         on_rq = p->se.on_rq;
4114         if (on_rq)
4115                 deactivate_task(rq, p, 0);
4116         oldprio = p->prio;
4117         __setscheduler(rq, p, policy, param->sched_priority);
4118         if (on_rq) {
4119                 activate_task(rq, p, 0);
4120                 /*
4121                  * Reschedule if we are currently running on this runqueue and
4122                  * our priority decreased, or if we are not currently running on
4123                  * this runqueue and our priority is higher than the current's
4124                  */
4125                 if (task_running(rq, p)) {
4126                         if (p->prio > oldprio)
4127                                 resched_task(rq->curr);
4128                 } else {
4129                         check_preempt_curr(rq, p);
4130                 }
4131         }
4132         __task_rq_unlock(rq);
4133         spin_unlock_irqrestore(&p->pi_lock, flags);
4134
4135         rt_mutex_adjust_pi(p);
4136
4137         return 0;
4138 }
4139 EXPORT_SYMBOL_GPL(sched_setscheduler);
4140
4141 static int
4142 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4143 {
4144         struct sched_param lparam;
4145         struct task_struct *p;
4146         int retval;
4147
4148         if (!param || pid < 0)
4149                 return -EINVAL;
4150         if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4151                 return -EFAULT;
4152
4153         rcu_read_lock();
4154         retval = -ESRCH;
4155         p = find_process_by_pid(pid);
4156         if (p != NULL)
4157                 retval = sched_setscheduler(p, policy, &lparam);
4158         rcu_read_unlock();
4159
4160         return retval;
4161 }
4162
4163 /**
4164  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4165  * @pid: the pid in question.
4166  * @policy: new policy.
4167  * @param: structure containing the new RT priority.
4168  */
4169 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4170                                        struct sched_param __user *param)
4171 {
4172         /* negative values for policy are not valid */
4173         if (policy < 0)
4174                 return -EINVAL;
4175
4176         return do_sched_setscheduler(pid, policy, param);
4177 }
4178
4179 /**
4180  * sys_sched_setparam - set/change the RT priority of a thread
4181  * @pid: the pid in question.
4182  * @param: structure containing the new RT priority.
4183  */
4184 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4185 {
4186         return do_sched_setscheduler(pid, -1, param);
4187 }
4188
4189 /**
4190  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4191  * @pid: the pid in question.
4192  */
4193 asmlinkage long sys_sched_getscheduler(pid_t pid)
4194 {
4195         struct task_struct *p;
4196         int retval = -EINVAL;
4197
4198         if (pid < 0)
4199                 goto out_nounlock;
4200
4201         retval = -ESRCH;
4202         read_lock(&tasklist_lock);
4203         p = find_process_by_pid(pid);
4204         if (p) {
4205                 retval = security_task_getscheduler(p);
4206                 if (!retval)
4207                         retval = p->policy;
4208         }
4209         read_unlock(&tasklist_lock);
4210
4211 out_nounlock:
4212         return retval;
4213 }
4214
4215 /**
4216  * sys_sched_getscheduler - get the RT priority of a thread
4217  * @pid: the pid in question.
4218  * @param: structure containing the RT priority.
4219  */
4220 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4221 {
4222         struct sched_param lp;
4223         struct task_struct *p;
4224         int retval = -EINVAL;
4225
4226         if (!param || pid < 0)
4227                 goto out_nounlock;
4228
4229         read_lock(&tasklist_lock);
4230         p = find_process_by_pid(pid);
4231         retval = -ESRCH;
4232         if (!p)
4233                 goto out_unlock;
4234
4235         retval = security_task_getscheduler(p);
4236         if (retval)
4237                 goto out_unlock;
4238
4239         lp.sched_priority = p->rt_priority;
4240         read_unlock(&tasklist_lock);
4241
4242         /*
4243          * This one might sleep, we cannot do it with a spinlock held ...
4244          */
4245         retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4246
4247 out_nounlock:
4248         return retval;
4249
4250 out_unlock:
4251         read_unlock(&tasklist_lock);
4252         return retval;
4253 }
4254
4255 long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4256 {
4257         cpumask_t cpus_allowed;
4258         struct task_struct *p;
4259         int retval;
4260
4261         mutex_lock(&sched_hotcpu_mutex);
4262         read_lock(&tasklist_lock);
4263
4264         p = find_process_by_pid(pid);
4265         if (!p) {
4266                 read_unlock(&tasklist_lock);
4267                 mutex_unlock(&sched_hotcpu_mutex);
4268                 return -ESRCH;
4269         }
4270
4271         /*
4272          * It is not safe to call set_cpus_allowed with the
4273          * tasklist_lock held.  We will bump the task_struct's
4274          * usage count and then drop tasklist_lock.
4275          */
4276         get_task_struct(p);
4277         read_unlock(&tasklist_lock);
4278
4279         retval = -EPERM;
4280         if ((current->euid != p->euid) && (current->euid != p->uid) &&
4281                         !capable(CAP_SYS_NICE))
4282                 goto out_unlock;
4283
4284         retval = security_task_setscheduler(p, 0, NULL);
4285         if (retval)
4286                 goto out_unlock;
4287
4288         cpus_allowed = cpuset_cpus_allowed(p);
4289         cpus_and(new_mask, new_mask, cpus_allowed);
4290         retval = set_cpus_allowed(p, new_mask);
4291
4292 out_unlock:
4293         put_task_struct(p);
4294         mutex_unlock(&sched_hotcpu_mutex);
4295         return retval;
4296 }
4297
4298 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4299                              cpumask_t *new_mask)
4300 {
4301         if (len < sizeof(cpumask_t)) {
4302                 memset(new_mask, 0, sizeof(cpumask_t));
4303         } else if (len > sizeof(cpumask_t)) {
4304                 len = sizeof(cpumask_t);
4305         }
4306         return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4307 }
4308
4309 /**
4310  * sys_sched_setaffinity - set the cpu affinity of a process
4311  * @pid: pid of the process
4312  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4313  * @user_mask_ptr: user-space pointer to the new cpu mask
4314  */
4315 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4316                                       unsigned long __user *user_mask_ptr)
4317 {
4318         cpumask_t new_mask;
4319         int retval;
4320
4321         retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4322         if (retval)
4323                 return retval;
4324
4325         return sched_setaffinity(pid, new_mask);
4326 }
4327
/*
 * Represents all CPUs present in the system.
 * In systems capable of hotplug, this map could dynamically grow
 * as new CPUs are detected in the system via any platform-specific
 * method, such as ACPI.
 */

cpumask_t cpu_present_map __read_mostly;
EXPORT_SYMBOL(cpu_present_map);

#ifndef CONFIG_SMP
/*
 * Without CONFIG_SMP the online and possible maps are defined here,
 * both initialised to CPU_MASK_ALL.
 */
cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
EXPORT_SYMBOL(cpu_online_map);

cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
EXPORT_SYMBOL(cpu_possible_map);
#endif
4345
4346 long sched_getaffinity(pid_t pid, cpumask_t *mask)
4347 {
4348         struct task_struct *p;
4349         int retval;
4350
4351         mutex_lock(&sched_hotcpu_mutex);
4352         read_lock(&tasklist_lock);
4353
4354         retval = -ESRCH;
4355         p = find_process_by_pid(pid);
4356         if (!p)
4357                 goto out_unlock;
4358
4359         retval = security_task_getscheduler(p);
4360         if (retval)
4361                 goto out_unlock;
4362
4363         cpus_and(*mask, p->cpus_allowed, cpu_online_map);
4364
4365 out_unlock:
4366         read_unlock(&tasklist_lock);
4367         mutex_unlock(&sched_hotcpu_mutex);
4368         if (retval)
4369                 return retval;
4370
4371         return 0;
4372 }
4373
4374 /**
4375  * sys_sched_getaffinity - get the cpu affinity of a process
4376  * @pid: pid of the process
4377  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4378  * @user_mask_ptr: user-space pointer to hold the current cpu mask
4379  */
4380 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4381                                       unsigned long __user *user_mask_ptr)
4382 {
4383         int ret;
4384         cpumask_t mask;
4385
4386         if (len < sizeof(cpumask_t))
4387                 return -EINVAL;
4388
4389         ret = sched_getaffinity(pid, &mask);
4390         if (ret < 0)
4391                 return ret;
4392
4393         if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4394                 return -EFAULT;
4395
4396         return sizeof(cpumask_t);
4397 }
4398
4399 /**
4400  * sys_sched_yield - yield the current processor to other threads.
4401  *
4402  * This function yields the current CPU to other tasks. If there are no
4403  * other threads running on this CPU then this function will return.
4404  */
4405 asmlinkage long sys_sched_yield(void)
4406 {
4407         struct rq *rq = this_rq_lock();
4408
4409         schedstat_inc(rq, yld_cnt);
4410         if (unlikely(rq->nr_running == 1))
4411                 schedstat_inc(rq, yld_act_empty);
4412         else
4413                 current->sched_class->yield_task(rq, current);
4414
4415         /*
4416          * Since we are going to call schedule() anyway, there's
4417          * no need to preempt or enable interrupts:
4418          */
4419         __release(rq->lock);
4420         spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4421         _raw_spin_unlock(&rq->lock);
4422         preempt_enable_no_resched();
4423
4424         schedule();
4425
4426         return 0;
4427 }
4428
4429 static void __cond_resched(void)
4430 {
4431 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4432         __might_sleep(__FILE__, __LINE__);
4433 #endif
4434         /*
4435          * The BKS might be reacquired before we have dropped
4436          * PREEMPT_ACTIVE, which could trigger a second
4437          * cond_resched() call.
4438          */
4439         do {
4440                 add_preempt_count(PREEMPT_ACTIVE);
4441                 schedule();
4442                 sub_preempt_count(PREEMPT_ACTIVE);
4443         } while (need_resched());
4444 }
4445
4446 int __sched cond_resched(void)
4447 {
4448         if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4449                                         system_state == SYSTEM_RUNNING) {
4450                 __cond_resched();
4451                 return 1;
4452         }
4453         return 0;
4454 }
4455 EXPORT_SYMBOL(cond_resched);
4456
4457 /*
4458  * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4459  * call schedule, and on return reacquire the lock.
4460  *
4461  * This works OK both with and without CONFIG_PREEMPT.  We do strange low-level
4462  * operations here to prevent schedule() from being called twice (once via
4463  * spin_unlock(), once by hand).
4464  */
4465 int cond_resched_lock(spinlock_t *lock)
4466 {
4467         int ret = 0;
4468
4469         if (need_lockbreak(lock)) {
4470                 spin_unlock(lock);
4471                 cpu_relax();
4472                 ret = 1;
4473                 spin_lock(lock);
4474         }
4475         if (need_resched() && system_state == SYSTEM_RUNNING) {
4476                 spin_release(&lock->dep_map, 1, _THIS_IP_);
4477                 _raw_spin_unlock(lock);
4478                 preempt_enable_no_resched();
4479                 __cond_resched();
4480                 ret = 1;
4481                 spin_lock(lock);
4482         }
4483         return ret;
4484 }
4485 EXPORT_SYMBOL(cond_resched_lock);
4486
4487 int __sched cond_resched_softirq(void)
4488 {
4489         BUG_ON(!in_softirq());
4490
4491         if (need_resched() && system_state == SYSTEM_RUNNING) {
4492                 local_bh_enable();
4493                 __cond_resched();
4494                 local_bh_disable();
4495                 return 1;
4496         }
4497         return 0;
4498 }
4499 EXPORT_SYMBOL(cond_resched_softirq);
4500
/**
 * yield - yield the current processor to other threads.
 *
 * This is a shortcut for kernel-space yielding - it marks the
 * thread runnable and calls sys_sched_yield().  The return value of
 * sys_sched_yield() is discarded.
 */
void __sched yield(void)
{
        /* Force TASK_RUNNING before handing over to sys_sched_yield(). */
        set_current_state(TASK_RUNNING);
        sys_sched_yield();
}
EXPORT_SYMBOL(yield);
4513
4514 /*
4515  * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
4516  * that process accounting knows that this is a task in IO wait state.
4517  *
4518  * But don't do that if it is a deliberate, throttling IO wait (this task
4519  * has set its backing_dev_info: the queue against which it should throttle)
4520  */
4521 void __sched io_schedule(void)
4522 {
4523         struct rq *rq = &__raw_get_cpu_var(runqueues);
4524
4525         delayacct_blkio_start();
4526         atomic_inc(&rq->nr_iowait);
4527         schedule();
4528         atomic_dec(&rq->nr_iowait);
4529         delayacct_blkio_end();
4530 }
4531 EXPORT_SYMBOL(io_schedule);
4532
4533 long __sched io_schedule_timeout(long timeout)
4534 {
4535         struct rq *rq = &__raw_get_cpu_var(runqueues);
4536         long ret;
4537
4538         delayacct_blkio_start();
4539         atomic_inc(&rq->nr_iowait);
4540         ret = schedule_timeout(timeout);
4541         atomic_dec(&rq->nr_iowait);
4542         delayacct_blkio_end();
4543         return ret;
4544 }
4545
4546 /**
4547  * sys_sched_get_priority_max - return maximum RT priority.
4548  * @policy: scheduling class.
4549  *
4550  * this syscall returns the maximum rt_priority that can be used
4551  * by a given scheduling class.
4552  */
4553 asmlinkage long sys_sched_get_priority_max(int policy)
4554 {
4555         int ret = -EINVAL;
4556
4557         switch (policy) {
4558         case SCHED_FIFO:
4559         case SCHED_RR:
4560                 ret = MAX_USER_RT_PRIO-1;
4561                 break;
4562         case SCHED_NORMAL:
4563         case SCHED_BATCH:
4564         case SCHED_IDLE:
4565                 ret = 0;
4566                 break;
4567         }
4568         return ret;
4569 }
4570
4571 /**
4572  * sys_sched_get_priority_min - return minimum RT priority.
4573  * @policy: scheduling class.
4574  *
4575  * this syscall returns the minimum rt_priority that can be used
4576  * by a given scheduling class.
4577  */
4578 asmlinkage long sys_sched_get_priority_min(int policy)
4579 {
4580         int ret = -EINVAL;
4581
4582         switch (policy) {
4583         case SCHED_FIFO:
4584         case SCHED_RR:
4585                 ret = 1;
4586                 break;
4587         case SCHED_NORMAL:
4588         case SCHED_BATCH:
4589         case SCHED_IDLE:
4590                 ret = 0;
4591         }
4592         return ret;
4593 }
4594
4595 /**
4596  * sys_sched_rr_get_interval - return the default timeslice of a process.
4597  * @pid: pid of the process.
4598  * @interval: userspace pointer to the timeslice value.
4599  *
4600  * this syscall writes the default timeslice value of a given process
4601  * into the user-space timespec buffer. A value of '0' means infinity.
4602  */
4603 asmlinkage
4604 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4605 {
4606         struct task_struct *p;
4607         int retval = -EINVAL;
4608         struct timespec t;
4609
4610         if (pid < 0)
4611                 goto out_nounlock;
4612
4613         retval = -ESRCH;
4614         read_lock(&tasklist_lock);
4615         p = find_process_by_pid(pid);
4616         if (!p)
4617                 goto out_unlock;
4618
4619         retval = security_task_getscheduler(p);
4620         if (retval)
4621                 goto out_unlock;
4622
4623         jiffies_to_timespec(p->policy == SCHED_FIFO ?
4624                                 0 : static_prio_timeslice(p->static_prio), &t);
4625         read_unlock(&tasklist_lock);
4626         retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4627 out_nounlock:
4628         return retval;
4629 out_unlock:
4630         read_unlock(&tasklist_lock);
4631         return retval;
4632 }
4633
4634 static const char stat_nam[] = "RSDTtZX";
4635
4636 static void show_task(struct task_struct *p)
4637 {
4638         unsigned long free = 0;
4639         unsigned state;
4640
4641         state = p->state ? __ffs(p->state) + 1 : 0;
4642         printk("%-13.13s %c", p->comm,
4643                 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4644 #if (BITS_PER_LONG == 32)
4645         if (state == TASK_RUNNING)
4646                 printk(" running ");
4647         else
4648                 printk(" %08lX ", thread_saved_pc(p));
4649 #else
4650         if (state == TASK_RUNNING)
4651                 printk("  running task   ");
4652         else
4653                 printk(" %016lx ", thread_saved_pc(p));
4654 #endif
4655 #ifdef CONFIG_DEBUG_STACK_USAGE
4656         {
4657                 unsigned long *n = end_of_stack(p);
4658                 while (!*n)
4659                         n++;
4660                 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4661         }
4662 #endif
4663         printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
4664         if (!p->mm)
4665                 printk(" (L-TLB)\n");
4666         else
4667                 printk(" (NOTLB)\n");
4668
4669         if (state != TASK_RUNNING)
4670                 show_stack(p, NULL);
4671 }
4672
4673 void show_state_filter(unsigned long state_filter)
4674 {
4675         struct task_struct *g, *p;
4676
4677 #if (BITS_PER_LONG == 32)
4678         printk("\n"
4679                "                         free                        sibling\n");
4680         printk("  task             PC    stack   pid father child younger older\n");
4681 #else
4682         printk("\n"
4683                "                                 free                        sibling\n");
4684         printk("  task                 PC        stack   pid father child younger older\n");
4685 #endif
4686         read_lock(&tasklist_lock);
4687         do_each_thread(g, p) {
4688                 /*
4689                  * reset the NMI-timeout, listing all files on a slow
4690                  * console might take alot of time:
4691                  */
4692                 touch_nmi_watchdog();
4693                 if (!state_filter || (p->state & state_filter))
4694                         show_task(p);
4695         } while_each_thread(g, p);
4696
4697         touch_all_softlockup_watchdogs();
4698
4699 #ifdef CONFIG_SCHED_DEBUG
4700         sysrq_sched_debug_show();
4701 #endif
4702         read_unlock(&tasklist_lock);
4703         /*
4704          * Only show locks if all tasks are dumped:
4705          */
4706         if (state_filter == -1)
4707                 debug_show_all_locks();
4708 }
4709
/*
 * init_idle_bootup_task - switch @idle over to the idle scheduling class
 * (idle_sched_class).  Nothing else in the task is touched.
 */
void __cpuinit init_idle_bootup_task(struct task_struct *idle)
{
        idle->sched_class = &idle_sched_class;
}
4714
4715 /**
4716  * init_idle - set up an idle thread for a given CPU
4717  * @idle: task in question
4718  * @cpu: cpu the idle task belongs to
4719  *
4720  * NOTE: this function does not set the idle thread's NEED_RESCHED
4721  * flag, to make booting more robust.
4722  */
4723 void __cpuinit init_idle(struct task_struct *idle, int cpu)
4724 {
4725         struct rq *rq = cpu_rq(cpu);
4726         unsigned long flags;
4727
4728         __sched_fork(idle);
4729         idle->se.exec_start = sched_clock();
4730
4731         idle->prio = idle->normal_prio = MAX_PRIO;
4732         idle->cpus_allowed = cpumask_of_cpu(cpu);
4733         __set_task_cpu(idle, cpu);
4734
4735         spin_lock_irqsave(&rq->lock, flags);
4736         rq->curr = rq->idle = idle;
4737 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4738         idle->oncpu = 1;
4739 #endif
4740         spin_unlock_irqrestore(&rq->lock, flags);
4741
4742         /* Set the preempt count _outside_ the spinlocks! */
4743 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4744         task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4745 #else
4746         task_thread_info(idle)->preempt_count = 0;
4747 #endif
4748         /*
4749          * The idle tasks have their own, simple scheduling class:
4750          */
4751         idle->sched_class = &idle_sched_class;
4752 }
4753
/*
 * In a system that switches off the HZ timer, nohz_cpu_mask
 * indicates which CPUs have entered this state. This is used
 * in the RCU update to wait only for active CPUs. For systems
 * that do not switch off the HZ timer, nohz_cpu_mask should
 * always be CPU_MASK_NONE.
 */
cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4762
4763 /*
4764  * Increase the granularity value when there are more CPUs,
4765  * because with more CPUs the 'effective latency' as visible
4766  * to users decreases. But the relationship is not linear,
4767  * so pick a second-best guess by going with the log2 of the
4768  * number of CPUs.
4769  *
4770  * This idea comes from the SD scheduler of Con Kolivas:
4771  */
4772 static inline void sched_init_granularity(void)
4773 {
4774         unsigned int factor = 1 + ilog2(num_online_cpus());
4775         const unsigned long gran_limit = 10000000;
4776
4777         sysctl_sched_granularity *= factor;
4778         if (sysctl_sched_granularity > gran_limit)
4779                 sysctl_sched_granularity = gran_limit;
4780
4781         sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4782         sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4783 }
4784
4785 #ifdef CONFIG_SMP
4786 /*
4787  * This is how migration works:
4788  *
4789  * 1) we queue a struct migration_req structure in the source CPU's
4790  *    runqueue and wake up that CPU's migration thread.
4791  * 2) we down() the locked semaphore => thread blocks.
4792  * 3) migration thread wakes up (implicitly it forces the migrated
4793  *    thread off the CPU)
4794  * 4) it gets the migration request and checks whether the migrated
4795  *    task is still in the wrong runqueue.
4796  * 5) if it's in the wrong runqueue then the migration thread removes
4797  *    it and puts it into the right queue.
4798  * 6) migration thread up()s the semaphore.
4799  * 7) we wake up and the migration is done.
4800  */
4801
4802 /*
4803  * Change a given task's CPU affinity. Migrate the thread to a
4804  * proper CPU and schedule it away if the CPU it's executing on
4805  * is removed from the allowed bitmask.
4806  *
4807  * NOTE: the caller must have a valid reference to the task, the
4808  * task must not exit() & deallocate itself prematurely.  The
4809  * call is not atomic; no spinlocks may be held.
4810  */
4811 int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
4812 {
4813         struct migration_req req;
4814         unsigned long flags;
4815         struct rq *rq;
4816         int ret = 0;
4817
4818         rq = task_rq_lock(p, &flags);
4819         if (!cpus_intersects(new_mask, cpu_online_map)) {
4820                 ret = -EINVAL;
4821                 goto out;
4822         }
4823
4824         p->cpus_allowed = new_mask;
4825         /* Can the task run on the task's current CPU? If so, we're done */
4826         if (cpu_isset(task_cpu(p), new_mask))
4827                 goto out;
4828
4829         if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4830                 /* Need help from migration thread: drop lock and wait. */
4831                 task_rq_unlock(rq, &flags);
4832                 wake_up_process(rq->migration_thread);
4833                 wait_for_completion(&req.done);
4834                 tlb_migrate_finish(p->mm);
4835                 return 0;
4836         }
4837 out:
4838         task_rq_unlock(rq, &flags);
4839
4840         return ret;
4841 }
4842 EXPORT_SYMBOL_GPL(set_cpus_allowed);
4843
4844 /*
4845  * Move (not current) task off this cpu, onto dest cpu.  We're doing
4846  * this because either it can't run here any more (set_cpus_allowed()
4847  * away from this CPU, or CPU going down), or because we're
4848  * attempting to rebalance this task on exec (sched_exec).
4849  *
4850  * So we race with normal scheduler movements, but that's OK, as long
4851  * as the task is no longer on this CPU.
4852  *
4853  * Returns non-zero if task was successfully migrated.
4854  */
4855 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4856 {
4857         struct rq *rq_dest, *rq_src;
4858         int ret = 0, on_rq;
4859
4860         if (unlikely(cpu_is_offline(dest_cpu)))
4861                 return ret;
4862
4863         rq_src = cpu_rq(src_cpu);
4864         rq_dest = cpu_rq(dest_cpu);
4865
4866         double_rq_lock(rq_src, rq_dest);
4867         /* Already moved. */
4868         if (task_cpu(p) != src_cpu)
4869                 goto out;
4870         /* Affinity changed (again). */
4871         if (!cpu_isset(dest_cpu, p->cpus_allowed))
4872                 goto out;
4873
4874         on_rq = p->se.on_rq;
4875         if (on_rq)
4876                 deactivate_task(rq_src, p, 0);
4877         set_task_cpu(p, dest_cpu);
4878         if (on_rq) {
4879                 activate_task(rq_dest, p, 0);
4880                 check_preempt_curr(rq_dest, p);
4881         }
4882         ret = 1;
4883 out:
4884         double_rq_unlock(rq_src, rq_dest);
4885         return ret;
4886 }
4887
/*
 * migration_thread - this is a highprio system thread that performs
 * thread migration by bumping thread off CPU then 'pushing' onto
 * another runqueue.
 *
 * @data carries the CPU number this thread serves.  The thread takes
 * requests off that runqueue's migration_queue one at a time, runs
 * __migrate_task() for each and completes the request.  It returns 0
 * once kthread_should_stop() becomes true.
 */
static int migration_thread(void *data)
{
        int cpu = (long)data;
        struct rq *rq;

        rq = cpu_rq(cpu);
        BUG_ON(rq->migration_thread != current);

        set_current_state(TASK_INTERRUPTIBLE);
        while (!kthread_should_stop()) {
                struct migration_req *req;
                struct list_head *head;

                try_to_freeze();

                spin_lock_irq(&rq->lock);

                /* Our CPU went offline: stop working and wait to be stopped. */
                if (cpu_is_offline(cpu)) {
                        spin_unlock_irq(&rq->lock);
                        goto wait_to_die;
                }

                if (rq->active_balance) {
                        active_load_balance(rq, cpu);
                        rq->active_balance = 0;
                }

                head = &rq->migration_queue;

                /* Nothing queued: sleep, then start over from the top. */
                if (list_empty(head)) {
                        spin_unlock_irq(&rq->lock);
                        schedule();
                        set_current_state(TASK_INTERRUPTIBLE);
                        continue;
                }
                req = list_entry(head->next, struct migration_req, list);
                list_del_init(head->next);

                /*
                 * Only the lock is dropped here; interrupts stay disabled
                 * across __migrate_task() and are re-enabled right after.
                 */
                spin_unlock(&rq->lock);
                __migrate_task(req->task, cpu, req->dest_cpu);
                local_irq_enable();

                complete(&req->done);
        }
        __set_current_state(TASK_RUNNING);
        return 0;

wait_to_die:
        /* Wait for kthread_stop */
        set_current_state(TASK_INTERRUPTIBLE);
        while (!kthread_should_stop()) {
                schedule();
                set_current_state(TASK_INTERRUPTIBLE);
        }
        __set_current_state(TASK_RUNNING);
        return 0;
}
4950
4951 #ifdef CONFIG_HOTPLUG_CPU
4952 /*
4953  * Figure out where task on dead CPU should go, use force if neccessary.
4954  * NOTE: interrupts should be disabled by the caller
4955  */
4956 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
4957 {
4958         unsigned long flags;
4959         cpumask_t mask;
4960         struct rq *rq;
4961         int dest_cpu;
4962
4963 restart:
4964         /* On same node? */
4965         mask = node_to_cpumask(cpu_to_node(dead_cpu));
4966         cpus_and(mask, mask, p->cpus_allowed);
4967         dest_cpu = any_online_cpu(mask);
4968
4969         /* On any allowed CPU? */
4970         if (dest_cpu == NR_CPUS)
4971                 dest_cpu = any_online_cpu(p->cpus_allowed);
4972
4973         /* No more Mr. Nice Guy. */
4974         if (dest_cpu == NR_CPUS) {
4975                 rq = task_rq_lock(p, &flags);
4976                 cpus_setall(p->cpus_allowed);
4977                 dest_cpu = any_online_cpu(p->cpus_allowed);
4978                 task_rq_unlock(rq, &flags);
4979
4980                 /*
4981                  * Don't tell them about moving exiting tasks or
4982                  * kernel threads (both mm NULL), since they never
4983                  * leave kernel.
4984                  */
4985                 if (p->mm && printk_ratelimit())
4986                         printk(KERN_INFO "process %d (%s) no "
4987                                "longer affine to cpu%d\n",
4988                                p->pid, p->comm, dead_cpu);
4989         }
4990         if (!__migrate_task(p, dead_cpu, dest_cpu))
4991                 goto restart;
4992 }
4993
4994 /*
4995  * While a dead CPU has no uninterruptible tasks queued at this point,
4996  * it might still have a nonzero ->nr_uninterruptible counter, because
4997  * for performance reasons the counter is not stricly tracking tasks to
4998  * their home CPUs. So we just add the counter to another CPU's counter,
4999  * to keep the global sum constant after CPU-down:
5000  */
5001 static void migrate_nr_uninterruptible(struct rq *rq_src)
5002 {
5003         struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
5004         unsigned long flags;
5005
5006         local_irq_save(flags);
5007         double_rq_lock(rq_src, rq_dest);
5008         rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5009         rq_src->nr_uninterruptible = 0;
5010         double_rq_unlock(rq_src, rq_dest);
5011         local_irq_restore(flags);
5012 }
5013
5014 /* Run through task list and migrate tasks from the dead cpu. */
5015 static void migrate_live_tasks(int src_cpu)
5016 {
5017         struct task_struct *p, *t;
5018
5019         write_lock_irq(&tasklist_lock);
5020
5021         do_each_thread(t, p) {
5022                 if (p == current)
5023                         continue;
5024
5025                 if (task_cpu(p) == src_cpu)
5026                         move_task_off_dead_cpu(src_cpu, p);
5027         } while_each_thread(t, p);
5028
5029         write_unlock_irq(&tasklist_lock);
5030 }
5031
5032 /*
5033  * Schedules idle task to be the next runnable task on current CPU.
5034  * It does so by boosting its priority to highest possible and adding it to
5035  * the _front_ of the runqueue. Used by CPU offline code.
5036  */
5037 void sched_idle_next(void)
5038 {
5039         int this_cpu = smp_processor_id();
5040         struct rq *rq = cpu_rq(this_cpu);
5041         struct task_struct *p = rq->idle;
5042         unsigned long flags;
5043
5044         /* cpu has to be offline */
5045         BUG_ON(cpu_online(this_cpu));
5046
5047         /*
5048          * Strictly not necessary since rest of the CPUs are stopped by now
5049          * and interrupts disabled on the current cpu.
5050          */
5051         spin_lock_irqsave(&rq->lock, flags);
5052
5053         __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5054
5055         /* Add idle task to the _front_ of its priority queue: */
5056         activate_idle_task(p, rq);
5057
5058         spin_unlock_irqrestore(&rq->lock, flags);
5059 }
5060
5061 /*
5062  * Ensures that the idle task is using init_mm right before its cpu goes
5063  * offline.
5064  */
5065 void idle_task_exit(void)
5066 {
5067         struct mm_struct *mm = current->active_mm;
5068
5069         BUG_ON(cpu_online(smp_processor_id()));
5070
5071         if (mm != &init_mm)
5072                 switch_mm(mm, &init_mm, current);
5073         mmdrop(mm);
5074 }
5075
5076 /* called under rq->lock with disabled interrupts */
5077 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5078 {
5079         struct rq *rq = cpu_rq(dead_cpu);
5080
5081         /* Must be exiting, otherwise would be on tasklist. */
5082         BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
5083
5084         /* Cannot have done final schedule yet: would have vanished. */
5085         BUG_ON(p->state == TASK_DEAD);
5086
5087         get_task_struct(p);
5088
5089         /*
5090          * Drop lock around migration; if someone else moves it,
5091          * that's OK.  No task can be added to this CPU, so iteration is
5092          * fine.
5093          * NOTE: interrupts should be left disabled  --dev@
5094          */
5095         spin_unlock(&rq->lock);
5096         move_task_off_dead_cpu(dead_cpu, p);
5097         spin_lock(&rq->lock);
5098
5099         put_task_struct(p);
5100 }
5101
5102 /* release_task() removes task from tasklist, so we won't find dead tasks. */
5103 static void migrate_dead_tasks(unsigned int dead_cpu)
5104 {
5105         struct rq *rq = cpu_rq(dead_cpu);
5106         struct task_struct *next;
5107
5108         for ( ; ; ) {
5109                 if (!rq->nr_running)
5110                         break;
5111                 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5112                 if (!next)
5113                         break;
5114                 migrate_dead(dead_cpu, next);
5115         }
5116 }
5117 #endif /* CONFIG_HOTPLUG_CPU */
5118
/*
 * migration_call - CPU notifier callback for the migration threads.
 * @nfb: the notifier block (not used here)
 * @action: the CPU_* event being reported
 * @hcpu: the CPU number the event is about, cast to a pointer
 *
 * Creates the migration thread when a CPU is being brought up, wakes it
 * when the CPU is online, and tears everything down again when bring-up
 * is cancelled or the CPU is dead.  CPU_LOCK_ACQUIRE / CPU_LOCK_RELEASE
 * take and release sched_hotcpu_mutex.
 *
 * Returns NOTIFY_BAD if the migration thread cannot be created,
 * NOTIFY_OK otherwise.
 */
static int __cpuinit
migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
        struct task_struct *p;
        int cpu = (long)hcpu;
        unsigned long flags;
        struct rq *rq;

        switch (action) {
        case CPU_LOCK_ACQUIRE:
                mutex_lock(&sched_hotcpu_mutex);
                break;

        case CPU_UP_PREPARE:
        case CPU_UP_PREPARE_FROZEN:
                p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
                if (IS_ERR(p))
                        return NOTIFY_BAD;
                p->flags |= PF_NOFREEZE;
                kthread_bind(p, cpu);
                /* Must be high prio: stop_machine expects to yield to it. */
                rq = task_rq_lock(p, &flags);
                __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
                task_rq_unlock(rq, &flags);
                cpu_rq(cpu)->migration_thread = p;
                break;

        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
                /* Strictly unnecessary, as first user will wake it. */
                wake_up_process(cpu_rq(cpu)->migration_thread);
                break;

#ifdef CONFIG_HOTPLUG_CPU
        case CPU_UP_CANCELED:
        case CPU_UP_CANCELED_FROZEN:
                if (!cpu_rq(cpu)->migration_thread)
                        break;
                /*
                 * Unbind it from offline cpu so it can run, then stop it.
                 * (This case does not fall through; it ends with a break.)
                 */
                kthread_bind(cpu_rq(cpu)->migration_thread,
                             any_online_cpu(cpu_online_map));
                kthread_stop(cpu_rq(cpu)->migration_thread);
                cpu_rq(cpu)->migration_thread = NULL;
                break;

        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                /* Push the tasks that are still on the tasklist elsewhere. */
                migrate_live_tasks(cpu);
                rq = cpu_rq(cpu);
                kthread_stop(rq->migration_thread);
                rq->migration_thread = NULL;
                /* Idle task back to normal (off runqueue, low prio) */
                rq = task_rq_lock(rq->idle, &flags);
                deactivate_task(rq, rq->idle, 0);
                rq->idle->static_prio = MAX_PRIO;
                __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
                rq->idle->sched_class = &idle_sched_class;
                /* Then the exiting tasks still queued on this runqueue. */
                migrate_dead_tasks(cpu);
                task_rq_unlock(rq, &flags);
                migrate_nr_uninterruptible(rq);
                BUG_ON(rq->nr_running != 0);

                /* No need to migrate the tasks: it was best-effort if
                 * they didn't take sched_hotcpu_mutex.  Just wake up
                 * the requestors. */
                spin_lock_irq(&rq->lock);
                while (!list_empty(&rq->migration_queue)) {
                        struct migration_req *req;

                        req = list_entry(rq->migration_queue.next,
                                         struct migration_req, list);
                        list_del_init(&req->list);
                        complete(&req->done);
                }
                spin_unlock_irq(&rq->lock);
                break;
#endif
        case CPU_LOCK_RELEASE:
                mutex_unlock(&sched_hotcpu_mutex);
                break;
        }
        return NOTIFY_OK;
}
5206
/*
 * Register at highest priority so that task migration happens before
 * everything else.
 * NOTE(review): the original text named migrate_all_tasks(); the CPU_DEAD
 * handling in migration_call() calls migrate_live_tasks() and
 * migrate_dead_tasks() -- presumably the old name is stale, confirm.
 */
static struct notifier_block __cpuinitdata migration_notifier = {
        .notifier_call = migration_call,
        .priority = 10
};
5214
5215 int __init migration_init(void)
5216 {
5217         void *cpu = (void *)(long)smp_processor_id();
5218         int err;
5219
5220         /* Start one for the boot CPU: */
5221         err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5222         BUG_ON(err == NOTIFY_BAD);
5223         migration_call(&migration_notifier, CPU_ONLINE, cpu);
5224         register_cpu_notifier(&migration_notifier);
5225
5226         return 0;
5227 }
5228 #endif
5229
5230 #ifdef CONFIG_SMP
5231
/* Number of possible processor ids; starts out as NR_CPUS. */
int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);
5235
#undef SCHED_DOMAIN_DEBUG
#ifdef SCHED_DOMAIN_DEBUG
/*
 * Print the sched-domain hierarchy that is about to be attached to @cpu
 * and report structural inconsistencies on the way up.
 *
 * @sd:  base (lowest level) domain for @cpu, or NULL
 * @cpu: cpu the hierarchy is being attached to
 *
 * For every level the span and the list of groups are printed.  The
 * following problems are reported with KERN_ERR:
 *   - a domain without SD_LOAD_BALANCE that still has a parent
 *   - a span or first group that does not contain @cpu
 *   - a NULL group, a group without cpu_power, an empty group
 *   - the same cpu appearing in more than one group
 *   - groups that do not add up to the domain span
 *   - a parent span that is not a superset of the child's groups
 *
 * Only built when SCHED_DOMAIN_DEBUG is defined; otherwise the call
 * compiles to nothing.
 */
static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
	int level = 0;

	if (!sd) {
		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
		return;
	}

	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);

	do {
		int i;
		char str[NR_CPUS];
		struct sched_group *group = sd->groups;
		cpumask_t groupmask;

		cpumask_scnprintf(str, NR_CPUS, sd->span);
		cpus_clear(groupmask);

		/* Indent one extra column per domain level. */
		printk(KERN_DEBUG);
		for (i = 0; i < level + 1; i++)
			printk(" ");
		printk("domain %d: ", level);

		if (!(sd->flags & SD_LOAD_BALANCE)) {
			printk("does not load-balance\n");
			if (sd->parent)
				printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
						" has parent\n");
			break;
		}

		printk("span %s\n", str);

		if (!cpu_isset(cpu, sd->span))
			printk(KERN_ERR "ERROR: domain->span does not contain "
					"CPU%d\n", cpu);
		/*
		 * A NULL group list is diagnosed by the loop below; do not
		 * dereference it here first.
		 */
		if (group && !cpu_isset(cpu, group->cpumask))
			printk(KERN_ERR "ERROR: domain->groups does not contain"
					" CPU%d\n", cpu);

		printk(KERN_DEBUG);
		for (i = 0; i < level + 2; i++)
			printk(" ");
		printk("groups:");
		do {
			if (!group) {
				printk("\n");
				printk(KERN_ERR "ERROR: group is NULL\n");
				break;
			}

			if (!group->__cpu_power) {
				printk("\n");
				printk(KERN_ERR "ERROR: domain->cpu_power not "
						"set\n");
			}

			if (!cpus_weight(group->cpumask)) {
				printk("\n");
				printk(KERN_ERR "ERROR: empty group\n");
			}

			if (cpus_intersects(groupmask, group->cpumask)) {
				printk("\n");
				printk(KERN_ERR "ERROR: repeated CPUs\n");
			}

			/* Accumulate the cpus seen so far at this level. */
			cpus_or(groupmask, groupmask, group->cpumask);

			cpumask_scnprintf(str, NR_CPUS, group->cpumask);
			printk(" %s", str);

			group = group->next;
		} while (group != sd->groups);
		printk("\n");

		if (!cpus_equal(sd->span, groupmask))
			printk(KERN_ERR "ERROR: groups don't span "
					"domain->span\n");

		level++;
		sd = sd->parent;
		if (!sd)
			continue;	/* loop condition ends the walk */

		if (!cpus_subset(groupmask, sd->span))
			printk(KERN_ERR "ERROR: parent span is not a superset "
				"of domain->span\n");

	} while (sd);
}
#else
# define sched_domain_debug(sd, cpu) do { } while (0)
#endif
5334
5335 static int sd_degenerate(struct sched_domain *sd)
5336 {
5337         if (cpus_weight(sd->span) == 1)
5338                 return 1;
5339
5340         /* Following flags need at least 2 groups */
5341         if (sd->flags & (SD_LOAD_BALANCE |
5342                          SD_BALANCE_NEWIDLE |
5343                          SD_BALANCE_FORK |
5344                          SD_BALANCE_EXEC |
5345                          SD_SHARE_CPUPOWER |
5346                          SD_SHARE_PKG_RESOURCES)) {
5347                 if (sd->groups != sd->groups->next)
5348                         return 0;
5349         }
5350
5351         /* Following flags don't use groups */
5352         if (sd->flags & (SD_WAKE_IDLE |
5353                          SD_WAKE_AFFINE |
5354                          SD_WAKE_BALANCE))
5355                 return 0;
5356
5357         return 1;
5358 }
5359
5360 static int
5361 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5362 {
5363         unsigned long cflags = sd->flags, pflags = parent->flags;
5364
5365         if (sd_degenerate(parent))
5366                 return 1;
5367
5368         if (!cpus_equal(sd->span, parent->span))
5369                 return 0;
5370
5371         /* Does parent contain flags not in child? */
5372         /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5373         if (cflags & SD_WAKE_AFFINE)
5374                 pflags &= ~SD_WAKE_BALANCE;
5375         /* Flags needing groups don't count if only 1 group in parent */
5376         if (parent->groups == parent->groups->next) {
5377                 pflags &= ~(SD_LOAD_BALANCE |
5378                                 SD_BALANCE_NEWIDLE |
5379                                 SD_BALANCE_FORK |
5380                                 SD_BALANCE_EXEC |
5381                                 SD_SHARE_CPUPOWER |
5382                                 SD_SHARE_PKG_RESOURCES);
5383         }
5384         if (~cflags & pflags)
5385                 return 0;
5386
5387         return 1;
5388 }
5389
5390 /*
5391  * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
5392  * hold the hotplug lock.
5393  */
5394 static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5395 {
5396         struct rq *rq = cpu_rq(cpu);
5397         struct sched_domain *tmp;
5398
5399         /* Remove the sched domains which do not contribute to scheduling. */
5400         for (tmp = sd; tmp; tmp = tmp->parent) {
5401                 struct sched_domain *parent = tmp->parent;
5402                 if (!parent)
5403                         break;
5404                 if (sd_parent_degenerate(tmp, parent)) {
5405                         tmp->parent = parent->parent;
5406                         if (parent->parent)
5407                                 parent->parent->child = tmp;
5408                 }
5409         }
5410
5411         if (sd && sd_degenerate(sd)) {
5412                 sd = sd->parent;
5413                 if (sd)
5414                         sd->child = NULL;
5415         }
5416
5417         sched_domain_debug(sd, cpu);
5418
5419         rcu_assign_pointer(rq->sd, sd);
5420 }
5421
5422 /* cpus with isolated domains */
5423 static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
5424
5425 /* Setup the mask of cpus configured for isolated domains */
5426 static int __init isolated_cpu_setup(char *str)
5427 {
5428         int ints[NR_CPUS], i;
5429
5430         str = get_options(str, ARRAY_SIZE(ints), ints);
5431         cpus_clear(cpu_isolated_map);
5432         for (i = 1; i <= ints[0]; i++)
5433                 if (ints[i] < NR_CPUS)
5434                         cpu_set(ints[i], cpu_isolated_map);
5435         return 1;
5436 }
5437
5438 __setup ("isolcpus=", isolated_cpu_setup);
5439
5440 /*
5441  * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5442  * to a function which identifies what group(along with sched group) a CPU
5443  * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS
5444  * (due to the fact that we keep track of groups covered with a cpumask_t).
5445  *
5446  * init_sched_build_groups will build a circular linked list of the groups
5447  * covered by the given span, and will set each group's ->cpumask correctly,
5448  * and ->cpu_power to 0.
5449  */
5450 static void
5451 init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5452                         int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5453                                         struct sched_group **sg))
5454 {
5455         struct sched_group *first = NULL, *last = NULL;
5456         cpumask_t covered = CPU_MASK_NONE;
5457         int i;
5458
5459         for_each_cpu_mask(i, span) {
5460                 struct sched_group *sg;
5461                 int group = group_fn(i, cpu_map, &sg);
5462                 int j;
5463
5464                 if (cpu_isset(i, covered))
5465                         continue;
5466
5467                 sg->cpumask = CPU_MASK_NONE;
5468                 sg->__cpu_power = 0;
5469
5470                 for_each_cpu_mask(j, span) {
5471                         if (group_fn(j, cpu_map, NULL) != group)
5472                                 continue;
5473
5474                         cpu_set(j, covered);
5475                         cpu_set(j, sg->cpumask);
5476                 }
5477                 if (!first)
5478                         first = sg;
5479                 if (last)
5480                         last->next = sg;
5481                 last = sg;
5482         }
5483         last->next = first;
5484 }
5485
5486 #define SD_NODES_PER_DOMAIN 16
5487
5488 #ifdef CONFIG_NUMA
5489
5490 /**
5491  * find_next_best_node - find the next node to include in a sched_domain
5492  * @node: node whose sched_domain we're building
5493  * @used_nodes: nodes already in the sched_domain
5494  *
5495  * Find the next node to include in a given scheduling domain.  Simply
5496  * finds the closest node not already in the @used_nodes map.
5497  *
5498  * Should use nodemask_t.
5499  */
5500 static int find_next_best_node(int node, unsigned long *used_nodes)
5501 {
5502         int i, n, val, min_val, best_node = 0;
5503
5504         min_val = INT_MAX;
5505
5506         for (i = 0; i < MAX_NUMNODES; i++) {
5507                 /* Start at @node */
5508                 n = (node + i) % MAX_NUMNODES;
5509
5510                 if (!nr_cpus_node(n))
5511                         continue;
5512
5513                 /* Skip already used nodes */
5514                 if (test_bit(n, used_nodes))
5515                         continue;
5516
5517                 /* Simple min distance search */
5518                 val = node_distance(node, n);
5519
5520                 if (val < min_val) {
5521                         min_val = val;
5522                         best_node = n;
5523                 }
5524         }
5525
5526         set_bit(best_node, used_nodes);
5527         return best_node;
5528 }
5529
5530 /**
5531  * sched_domain_node_span - get a cpumask for a node's sched_domain
5532  * @node: node whose cpumask we're constructing
5533  * @size: number of nodes to include in this span
5534  *
5535  * Given a node, construct a good cpumask for its sched_domain to span.  It
5536  * should be one that prevents unnecessary balancing, but also spreads tasks
5537  * out optimally.
5538  */
5539 static cpumask_t sched_domain_node_span(int node)
5540 {
5541         DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
5542         cpumask_t span, nodemask;
5543         int i;
5544
5545         cpus_clear(span);
5546         bitmap_zero(used_nodes, MAX_NUMNODES);
5547
5548         nodemask = node_to_cpumask(node);
5549         cpus_or(span, span, nodemask);
5550         set_bit(node, used_nodes);
5551
5552         for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5553                 int next_node = find_next_best_node(node, used_nodes);
5554
5555                 nodemask = node_to_cpumask(next_node);
5556                 cpus_or(span, span, nodemask);
5557         }
5558
5559         return span;
5560 }
5561 #endif
5562
/*
 * Power-savings balancing switches for the SMT and multi-core levels.
 * Both start out as 0.
 */
int sched_smt_power_savings;
int sched_mc_power_savings;
5564
/*
 * SMT sched-domains:
 */
#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);

/*
 * Group lookup for the SMT level: every cpu is its own group.
 * Stores @cpu's per-cpu sched_group in *@sg when @sg is non-NULL and
 * returns @cpu as the group number.  @cpu_map is not used.
 */
static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
			    struct sched_group **sg)
{
	if (sg != NULL)
		*sg = &per_cpu(sched_group_cpus, cpu);

	return cpu;
}
#endif
5580
/*
 * multi-core sched-domains:
 */
#ifdef CONFIG_SCHED_MC
static DEFINE_PER_CPU(struct sched_domain, core_domains);
static DEFINE_PER_CPU(struct sched_group, sched_group_core);
#endif

#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
/*
 * Group lookup for the multi-core level when SMT is configured too:
 * the group number is the first cpu of @cpu's sibling map that is also
 * in @cpu_map.  That cpu's per-cpu core group is stored in *@sg when
 * @sg is non-NULL.
 */
static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
			     struct sched_group **sg)
{
	cpumask_t siblings = cpu_sibling_map[cpu];
	int first;

	cpus_and(siblings, siblings, *cpu_map);
	first = first_cpu(siblings);

	if (sg != NULL)
		*sg = &per_cpu(sched_group_core, first);

	return first;
}
#elif defined(CONFIG_SCHED_MC)
/*
 * Group lookup for the multi-core level without SMT: every cpu is its
 * own group.  @cpu_map is not used.
 */
static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
			     struct sched_group **sg)
{
	if (sg != NULL)
		*sg = &per_cpu(sched_group_core, cpu);

	return cpu;
}
#endif
5610
5611 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5612 static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
5613
5614 static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
5615                              struct sched_group **sg)
5616 {
5617         int group;
5618 #ifdef CONFIG_SCHED_MC
5619         cpumask_t mask = cpu_coregroup_map(cpu);
5620         cpus_and(mask, mask, *cpu_map);
5621         group = first_cpu(mask);
5622 #elif defined(CONFIG_SCHED_SMT)
5623         cpumask_t mask = cpu_sibling_map[cpu];
5624         cpus_and(mask, mask, *cpu_map);
5625         group = first_cpu(mask);
5626 #else
5627         group = cpu;
5628 #endif
5629         if (sg)
5630                 *sg = &per_cpu(sched_group_phys, group);
5631         return group;
5632 }
5633
5634 #ifdef CONFIG_NUMA
5635 /*
5636  * The init_sched_build_groups can't handle what we want to do with node
5637  * groups, so roll our own. Now each node has its own list of groups which
5638  * gets dynamically allocated.
5639  */
5640 static DEFINE_PER_CPU(struct sched_domain, node_domains);
5641 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
5642
5643 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
5644 static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
5645
5646 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
5647                                  struct sched_group **sg)
5648 {
5649         cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
5650         int group;
5651
5652         cpus_and(nodemask, nodemask, *cpu_map);
5653         group = first_cpu(nodemask);
5654
5655         if (sg)
5656                 *sg = &per_cpu(sched_group_allnodes, group);
5657         return group;
5658 }
5659
5660 static void init_numa_sched_groups_power(struct sched_group *group_head)
5661 {
5662         struct sched_group *sg = group_head;
5663         int j;
5664
5665         if (!sg)
5666                 return;
5667 next_sg:
5668         for_each_cpu_mask(j, sg->cpumask) {
5669                 struct sched_domain *sd;
5670
5671                 sd = &per_cpu(phys_domains, j);
5672                 if (j != first_cpu(sd->groups->cpumask)) {
5673                         /*
5674                          * Only add "power" once for each
5675                          * physical package.
5676                          */
5677                         continue;
5678                 }
5679
5680                 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
5681         }
5682         sg = sg->next;
5683         if (sg != group_head)
5684                 goto next_sg;
5685 }
5686 #endif
5687
5688 #ifdef CONFIG_NUMA
5689 /* Free memory allocated for various sched_group structures */
5690 static void free_sched_groups(const cpumask_t *cpu_map)
5691 {
5692         int cpu, i;
5693
5694         for_each_cpu_mask(cpu, *cpu_map) {
5695                 struct sched_group **sched_group_nodes
5696                         = sched_group_nodes_bycpu[cpu];
5697
5698                 if (!sched_group_nodes)
5699                         continue;
5700
5701                 for (i = 0; i < MAX_NUMNODES; i++) {
5702                         cpumask_t nodemask = node_to_cpumask(i);
5703                         struct sched_group *oldsg, *sg = sched_group_nodes[i];
5704
5705                         cpus_and(nodemask, nodemask, *cpu_map);
5706                         if (cpus_empty(nodemask))
5707                                 continue;
5708
5709                         if (sg == NULL)
5710                                 continue;
5711                         sg = sg->next;
5712 next_sg:
5713                         oldsg = sg;
5714                         sg = sg->next;
5715                         kfree(oldsg);
5716                         if (oldsg != sched_group_nodes[i])
5717                                 goto next_sg;
5718                 }
5719                 kfree(sched_group_nodes);
5720                 sched_group_nodes_bycpu[cpu] = NULL;
5721         }
5722 }
5723 #else
5724 static void free_sched_groups(const cpumask_t *cpu_map)
5725 {
5726 }
5727 #endif
5728
5729 /*
5730  * Initialize sched groups cpu_power.
5731  *
5732  * cpu_power indicates the capacity of sched group, which is used while
5733  * distributing the load between different sched groups in a sched domain.
5734  * Typically cpu_power for all the groups in a sched domain will be same unless
5735  * there are asymmetries in the topology. If there are asymmetries, group
5736  * having more cpu_power will pickup more load compared to the group having
5737  * less cpu_power.
5738  *
5739  * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
5740  * the maximum number of tasks a group can handle in the presence of other idle
5741  * or lightly loaded groups in the same sched domain.
5742  */
5743 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5744 {
5745         struct sched_domain *child;
5746         struct sched_group *group;
5747
5748         WARN_ON(!sd || !sd->groups);
5749
5750         if (cpu != first_cpu(sd->groups->cpumask))
5751                 return;
5752
5753         child = sd->child;
5754
5755         sd->groups->__cpu_power = 0;
5756
5757         /*
5758          * For perf policy, if the groups in child domain share resources
5759          * (for example cores sharing some portions of the cache hierarchy
5760          * or SMT), then set this domain groups cpu_power such that each group
5761          * can handle only one task, when there are other idle groups in the
5762          * same sched domain.
5763          */
5764         if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
5765                        (child->flags &
5766                         (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
5767                 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
5768                 return;
5769         }
5770
5771         /*
5772          * add cpu_power of each child group to this groups cpu_power
5773          */
5774         group = child->groups;
5775         do {
5776                 sg_inc_cpu_power(sd->groups, group->__cpu_power);
5777                 group = group->next;
5778         } while (group != child->groups);
5779 }
5780
5781 /*
5782  * Build sched domains for a given set of cpus and attach the sched domains
5783  * to the individual cpus
5784  */
5785 static int build_sched_domains(const cpumask_t *cpu_map)
5786 {
5787         int i;
5788 #ifdef CONFIG_NUMA
5789         struct sched_group **sched_group_nodes = NULL;
5790         int sd_allnodes = 0;
5791
5792         /*
5793          * Allocate the per-node list of sched groups
5794          */
5795         sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
5796                                            GFP_KERNEL);
5797         if (!sched_group_nodes) {
5798                 printk(KERN_WARNING "Can not alloc sched group node list\n");
5799                 return -ENOMEM;
5800         }
5801         sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5802 #endif
5803
5804         /*
5805          * Set up domains for cpus specified by the cpu_map.
5806          */
5807         for_each_cpu_mask(i, *cpu_map) {
5808                 struct sched_domain *sd = NULL, *p;
5809                 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
5810
5811                 cpus_and(nodemask, nodemask, *cpu_map);
5812
5813 #ifdef CONFIG_NUMA
5814                 if (cpus_weight(*cpu_map) >
5815                                 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
5816                         sd = &per_cpu(allnodes_domains, i);
5817                         *sd = SD_ALLNODES_INIT;
5818                         sd->span = *cpu_map;
5819                         cpu_to_allnodes_group(i, cpu_map, &sd->groups);
5820                         p = sd;
5821                         sd_allnodes = 1;
5822                 } else
5823                         p = NULL;
5824
5825                 sd = &per_cpu(node_domains, i);
5826                 *sd = SD_NODE_INIT;
5827                 sd->span = sched_domain_node_span(cpu_to_node(i));
5828                 sd->parent = p;
5829                 if (p)
5830                         p->child = sd;
5831                 cpus_and(sd->span, sd->span, *cpu_map);
5832 #endif
5833
5834                 p = sd;
5835                 sd = &per_cpu(phys_domains, i);
5836                 *sd = SD_CPU_INIT;
5837                 sd->span = nodemask;
5838                 sd->parent = p;
5839                 if (p)
5840                         p->child = sd;
5841                 cpu_to_phys_group(i, cpu_map, &sd->groups);
5842
5843 #ifdef CONFIG_SCHED_MC
5844                 p = sd;
5845                 sd = &per_cpu(core_domains, i);
5846                 *sd = SD_MC_INIT;
5847                 sd->span = cpu_coregroup_map(i);
5848                 cpus_and(sd->span, sd->span, *cpu_map);
5849                 sd->parent = p;
5850                 p->child = sd;
5851                 cpu_to_core_group(i, cpu_map, &sd->groups);
5852 #endif
5853
5854 #ifdef CONFIG_SCHED_SMT
5855                 p = sd;
5856                 sd = &per_cpu(cpu_domains, i);
5857                 *sd = SD_SIBLING_INIT;
5858                 sd->span = cpu_sibling_map[i];
5859                 cpus_and(sd->span, sd->span, *cpu_map);
5860                 sd->parent = p;
5861                 p->child = sd;
5862                 cpu_to_cpu_group(i, cpu_map, &sd->groups);
5863 #endif
5864         }
5865
5866 #ifdef CONFIG_SCHED_SMT
5867         /* Set up CPU (sibling) groups */
5868         for_each_cpu_mask(i, *cpu_map) {
5869                 cpumask_t this_sibling_map = cpu_sibling_map[i];
5870                 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
5871                 if (i != first_cpu(this_sibling_map))
5872                         continue;
5873
5874                 init_sched_build_groups(this_sibling_map, cpu_map,
5875                                         &cpu_to_cpu_group);
5876         }
5877 #endif
5878
5879 #ifdef CONFIG_SCHED_MC
5880         /* Set up multi-core groups */
5881         for_each_cpu_mask(i, *cpu_map) {
5882                 cpumask_t this_core_map = cpu_coregroup_map(i);
5883                 cpus_and(this_core_map, this_core_map, *cpu_map);
5884                 if (i != first_cpu(this_core_map))
5885                         continue;
5886                 init_sched_build_groups(this_core_map, cpu_map,
5887                                         &cpu_to_core_group);
5888         }
5889 #endif
5890
5891         /* Set up physical groups */
5892         for (i = 0; i < MAX_NUMNODES; i++) {
5893                 cpumask_t nodemask = node_to_cpumask(i);
5894
5895                 cpus_and(nodemask, nodemask, *cpu_map);
5896                 if (cpus_empty(nodemask))
5897                         continue;
5898
5899                 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
5900         }
5901
5902 #ifdef CONFIG_NUMA
5903         /* Set up node groups */
5904         if (sd_allnodes)
5905                 init_sched_build_groups(*cpu_map, cpu_map,
5906                                         &cpu_to_allnodes_group);
5907
5908         for (i = 0; i < MAX_NUMNODES; i++) {
5909                 /* Set up node groups */
5910                 struct sched_group *sg, *prev;
5911                 cpumask_t nodemask = node_to_cpumask(i);
5912                 cpumask_t domainspan;
5913                 cpumask_t covered = CPU_MASK_NONE;
5914                 int j;
5915
5916                 cpus_and(nodemask, nodemask, *cpu_map);
5917                 if (cpus_empty(nodemask)) {
5918                         sched_group_nodes[i] = NULL;
5919                         continue;
5920                 }
5921
5922                 domainspan = sched_domain_node_span(i);
5923                 cpus_and(domainspan, domainspan, *cpu_map);
5924
5925                 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
5926                 if (!sg) {
5927                         printk(KERN_WARNING "Can not alloc domain group for "
5928                                 "node %d\n", i);
5929                         goto error;
5930                 }
5931                 sched_group_nodes[i] = sg;
5932                 for_each_cpu_mask(j, nodemask) {
5933                         struct sched_domain *sd;
5934
5935                         sd = &per_cpu(node_domains, j);
5936                         sd->groups = sg;
5937                 }
5938                 sg->__cpu_power = 0;
5939                 sg->cpumask = nodemask;
5940                 sg->next = sg;
5941                 cpus_or(covered, covered, nodemask);
5942                 prev = sg;
5943
5944                 for (j = 0; j < MAX_NUMNODES; j++) {
5945                         cpumask_t tmp, notcovered;
5946                         int n = (i + j) % MAX_NUMNODES;
5947
5948                         cpus_complement(notcovered, covered);
5949                         cpus_and(tmp, notcovered, *cpu_map);
5950                         cpus_and(tmp, tmp, domainspan);
5951                         if (cpus_empty(tmp))
5952                                 break;
5953
5954                         nodemask = node_to_cpumask(n);
5955                         cpus_and(tmp, tmp, nodemask);
5956                         if (cpus_empty(tmp))
5957                                 continue;
5958
5959                         sg = kmalloc_node(sizeof(struct sched_group),
5960                                           GFP_KERNEL, i);
5961                         if (!sg) {
5962                                 printk(KERN_WARNING
5963                                 "Can not alloc domain group for node %d\n", j);
5964                                 goto error;
5965                         }
5966                         sg->__cpu_power = 0;
5967                         sg->cpumask = tmp;
5968                         sg->next = prev->next;
5969                         cpus_or(covered, covered, tmp);
5970                         prev->next = sg;
5971                         prev = sg;
5972                 }
5973         }
5974 #endif
5975
5976         /* Calculate CPU power for physical packages and nodes */
5977 #ifdef CONFIG_SCHED_SMT
5978         for_each_cpu_mask(i, *cpu_map) {
5979                 struct sched_domain *sd = &per_cpu(cpu_domains, i);
5980
5981                 init_sched_groups_power(i, sd);
5982         }
5983 #endif
5984 #ifdef CONFIG_SCHED_MC
5985         for_each_cpu_mask(i, *cpu_map) {
5986                 struct sched_domain *sd = &per_cpu(core_domains, i);
5987
5988                 init_sched_groups_power(i, sd);
5989         }
5990 #endif
5991
5992         for_each_cpu_mask(i, *cpu_map) {
5993                 struct sched_domain *sd = &per_cpu(phys_domains, i);
5994
5995                 init_sched_groups_power(i, sd);
5996         }
5997
5998 #ifdef CONFIG_NUMA
5999         for (i = 0; i < MAX_NUMNODES; i++)
6000                 init_numa_sched_groups_power(sched_group_nodes[i]);
6001
6002         if (sd_allnodes) {
6003                 struct sched_group *sg;
6004
6005                 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6006                 init_numa_sched_groups_power(sg);
6007         }
6008 #endif
6009
6010         /* Attach the domains */
6011         for_each_cpu_mask(i, *cpu_map) {
6012                 struct sched_domain *sd;
6013 #ifdef CONFIG_SCHED_SMT
6014                 sd = &per_cpu(cpu_domains, i);
6015 #elif defined(CONFIG_SCHED_MC)
6016                 sd = &per_cpu(core_domains, i);
6017 #else
6018                 sd = &per_cpu(phys_domains, i);
6019 #endif
6020                 cpu_attach_domain(sd, i);
6021         }
6022
6023         return 0;
6024
6025 #ifdef CONFIG_NUMA
6026 error:
6027         free_sched_groups(cpu_map);
6028         return -ENOMEM;
6029 #endif
6030 }
6031 /*
6032  * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
6033  */
6034 static int arch_init_sched_domains(const cpumask_t *cpu_map)
6035 {
6036         cpumask_t cpu_default_map;
6037         int err;
6038
6039         /*
6040          * Setup mask for cpus without special case scheduling requirements.
6041          * For now this just excludes isolated cpus, but could be used to
6042          * exclude other special cases in the future.
6043          */
6044         cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6045
6046         err = build_sched_domains(&cpu_default_map);
6047
6048         return err;
6049 }
6050
6051 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
6052 {
6053         free_sched_groups(cpu_map);
6054 }
6055
6056 /*
6057  * Detach sched domains from a group of cpus specified in cpu_map
6058  * These cpus will now be attached to the NULL domain
6059  */
6060 static void detach_destroy_domains(const cpumask_t *cpu_map)
6061 {
6062         int i;
6063
6064         for_each_cpu_mask(i, *cpu_map)
6065                 cpu_attach_domain(NULL, i);
6066         synchronize_sched();
6067         arch_destroy_sched_domains(cpu_map);
6068 }
6069
6070 /*
6071  * Partition sched domains as specified by the cpumasks below.
6072  * This attaches all cpus from the cpumasks to the NULL domain,
6073  * waits for a RCU quiescent period, recalculates sched
6074  * domain information and then attaches them back to the
6075  * correct sched domains
6076  * Call with hotplug lock held
6077  */
6078 int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6079 {
6080         cpumask_t change_map;
6081         int err = 0;
6082
6083         cpus_and(*partition1, *partition1, cpu_online_map);
6084         cpus_and(*partition2, *partition2, cpu_online_map);
6085         cpus_or(change_map, *partition1, *partition2);
6086
6087         /* Detach sched domains from all of the affected cpus */
6088         detach_destroy_domains(&change_map);
6089         if (!cpus_empty(*partition1))
6090                 err = build_sched_domains(partition1);
6091         if (!err && !cpus_empty(*partition2))
6092                 err = build_sched_domains(partition2);
6093
6094         return err;
6095 }
6096
6097 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6098 int arch_reinit_sched_domains(void)
6099 {
6100         int err;
6101
6102         mutex_lock(&sched_hotcpu_mutex);
6103         detach_destroy_domains(&cpu_online_map);
6104         err = arch_init_sched_domains(&cpu_online_map);
6105         mutex_unlock(&sched_hotcpu_mutex);
6106
6107         return err;
6108 }
6109
6110 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6111 {
6112         int ret;
6113
6114         if (buf[0] != '0' && buf[0] != '1')
6115                 return -EINVAL;
6116
6117         if (smt)
6118                 sched_smt_power_savings = (buf[0] == '1');
6119         else
6120                 sched_mc_power_savings = (buf[0] == '1');
6121
6122         ret = arch_reinit_sched_domains();
6123
6124         return ret ? ret : count;
6125 }
6126
/*
 * Create the sysfs files of the power-savings switches under @cls.
 *
 * The SMT file is created when smt_capable() says so, the multi-core
 * file when mc_capable() does; the latter is skipped if creating the
 * SMT file failed.  Returns 0 or the error from sysfs_create_file().
 */
int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
{
	int ret = 0;

#ifdef CONFIG_SCHED_SMT
	if (smt_capable())
		ret = sysfs_create_file(&cls->kset.kobj,
					&attr_sched_smt_power_savings.attr);
#endif
#ifdef CONFIG_SCHED_MC
	if (ret)
		return ret;

	if (mc_capable())
		ret = sysfs_create_file(&cls->kset.kobj,
					&attr_sched_mc_power_savings.attr);
#endif
	return ret;
}
6143 #endif
6144
#ifdef CONFIG_SCHED_MC
/* sysfs read: print the current multi-core power-savings setting. */
static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
{
	return sprintf(page, "%u\n", sched_mc_power_savings);
}

/* sysfs write: hand over to the common handler, multi-core flavour. */
static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
					    const char *buf, size_t count)
{
	const int smt = 0;

	return sched_power_savings_store(buf, count, smt);
}

SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
	    sched_mc_power_savings_store);
#endif
6158
6159 #ifdef CONFIG_SCHED_SMT
6160 static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6161 {
6162         return sprintf(page, "%u\n", sched_smt_power_savings);
6163 }
6164 static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6165                                              const char *buf, size_t count)
6166 {
6167         return sched_power_savings_store(buf, count, 1);
6168 }
6169 SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6170             sched_smt_power_savings_store);
6171 #endif
6172
6173 /*
6174  * Force a reinitialization of the sched domains hierarchy.  The domains
6175  * and groups cannot be updated in place without racing with the balancing
6176  * code, so we temporarily attach all running cpus to the NULL domain
6177  * which will prevent rebalancing while the sched domains are recalculated.
6178  */
6179 static int update_sched_domains(struct notifier_block *nfb,
6180                                 unsigned long action, void *hcpu)
6181 {
6182         switch (action) {
6183         case CPU_UP_PREPARE:
6184         case CPU_UP_PREPARE_FROZEN:
6185         case CPU_DOWN_PREPARE:
6186         case CPU_DOWN_PREPARE_FROZEN:
6187                 detach_destroy_domains(&cpu_online_map);
6188                 return NOTIFY_OK;
6189
6190         case CPU_UP_CANCELED:
6191         case CPU_UP_CANCELED_FROZEN:
6192         case CPU_DOWN_FAILED:
6193         case CPU_DOWN_FAILED_FROZEN:
6194         case CPU_ONLINE:
6195         case CPU_ONLINE_FROZEN:
6196         case CPU_DEAD:
6197         case CPU_DEAD_FROZEN:
6198                 /*
6199                  * Fall through and re-initialise the domains.
6200                  */
6201                 break;
6202         default:
6203                 return NOTIFY_DONE;
6204         }
6205
6206         /* The hotplug lock is already held by cpu_up/cpu_down */
6207         arch_init_sched_domains(&cpu_online_map);
6208
6209         return NOTIFY_OK;
6210 }
6211
6212 void __init sched_init_smp(void)
6213 {
6214         cpumask_t non_isolated_cpus;
6215
6216         mutex_lock(&sched_hotcpu_mutex);
6217         arch_init_sched_domains(&cpu_online_map);
6218         cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6219         if (cpus_empty(non_isolated_cpus))
6220                 cpu_set(smp_processor_id(), non_isolated_cpus);
6221         mutex_unlock(&sched_hotcpu_mutex);
6222         /* XXX: Theoretical race here - CPU may be hotplugged now */
6223         hotcpu_notifier(update_sched_domains, 0);
6224
6225         /* Move init over to a non-isolated CPU */
6226         if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6227                 BUG();
6228         sched_init_granularity();
6229 }
6230 #else
6231 void __init sched_init_smp(void)
6232 {
6233         sched_init_granularity();
6234 }
6235 #endif /* CONFIG_SMP */
6236
/*
 * Report whether @addr lies inside scheduler or locking code.
 *
 * Returns 1 if in_lock_functions() accepts @addr or if @addr falls in
 * the half-open range [__sched_text_start, __sched_text_end), 0 otherwise.
 */
int in_sched_functions(unsigned long addr)
{
	/* Linker adds these: start and end of __sched functions */
	extern char __sched_text_start[], __sched_text_end[];
	unsigned long start = (unsigned long)__sched_text_start;
	unsigned long end = (unsigned long)__sched_text_end;

	if (in_lock_functions(addr))
		return 1;

	return addr >= start && addr < end;
}
6246
6247 static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6248 {
6249         cfs_rq->tasks_timeline = RB_ROOT;
6250         cfs_rq->fair_clock = 1;
6251 #ifdef CONFIG_FAIR_GROUP_SCHED
6252         cfs_rq->rq = rq;
6253 #endif
6254 }
6255
6256 void __init sched_init(void)
6257 {
6258         u64 now = sched_clock();
6259         int highest_cpu = 0;
6260         int i, j;
6261
6262         /*
6263          * Link up the scheduling class hierarchy:
6264          */
6265         rt_sched_class.next = &fair_sched_class;
6266         fair_sched_class.next = &idle_sched_class;
6267         idle_sched_class.next = NULL;
6268
6269         for_each_possible_cpu(i) {
6270                 struct rt_prio_array *array;
6271                 struct rq *rq;
6272
6273                 rq = cpu_rq(i);
6274                 spin_lock_init(&rq->lock);
6275                 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
6276                 rq->nr_running = 0;
6277                 rq->clock = 1;
6278                 init_cfs_rq(&rq->cfs, rq);
6279 #ifdef CONFIG_FAIR_GROUP_SCHED
6280                 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6281                 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6282 #endif
6283                 rq->ls.load_update_last = now;
6284                 rq->ls.load_update_start = now;
6285
6286                 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6287                         rq->cpu_load[j] = 0;
6288 #ifdef CONFIG_SMP
6289                 rq->sd = NULL;
6290                 rq->active_balance = 0;
6291                 rq->next_balance = jiffies;
6292                 rq->push_cpu = 0;
6293                 rq->cpu = i;
6294                 rq->migration_thread = NULL;
6295                 INIT_LIST_HEAD(&rq->migration_queue);
6296 #endif
6297                 atomic_set(&rq->nr_iowait, 0);
6298
6299                 array = &rq->rt.active;
6300                 for (j = 0; j < MAX_RT_PRIO; j++) {
6301                         INIT_LIST_HEAD(array->queue + j);
6302                         __clear_bit(j, array->bitmap);
6303                 }
6304                 highest_cpu = i;
6305                 /* delimiter for bitsearch: */
6306                 __set_bit(MAX_RT_PRIO, array->bitmap);
6307         }
6308
6309         set_load_weight(&init_task);
6310
6311 #ifdef CONFIG_SMP
6312         nr_cpu_ids = highest_cpu + 1;
6313         open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6314 #endif
6315
6316 #ifdef CONFIG_RT_MUTEXES
6317         plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6318 #endif
6319
6320         /*
6321          * The boot idle thread does lazy MMU switching as well:
6322          */
6323         atomic_inc(&init_mm.mm_count);
6324         enter_lazy_tlb(&init_mm, current);
6325
6326         /*
6327          * Make us the idle thread. Technically, schedule() should not be
6328          * called from this thread, however somewhere below it might be,
6329          * but because we are the idle thread, we just pick up running again
6330          * when this runqueue becomes "idle".
6331          */
6332         init_idle(current, smp_processor_id());
6333         /*
6334          * During early bootup we pretend to be a normal task:
6335          */
6336         current->sched_class = &fair_sched_class;
6337 }
6338
6339 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
/*
 * Debug check behind might_sleep(): complain when a function that may
 * sleep is called from atomic context or with interrupts disabled.
 *
 * @file: source file name of the call site, used in the report
 * @line: source line number of the call site, used in the report
 *
 * The check only fires while system_state is SYSTEM_RUNNING and no oops
 * is in progress.  Reports are rate limited to one per HZ jiffies.  A
 * report consists of the call site, the in_atomic()/irqs_disabled()
 * state, the locks held by current, the irq trace events (when
 * interrupts are disabled) and a stack dump.
 *
 * The body is compiled out entirely when in_atomic is not defined.
 */
void __might_sleep(char *file, int line)
{
#ifdef in_atomic
	static unsigned long prev_jiffy;	/* ratelimiting */

	if ((in_atomic() || irqs_disabled()) &&
	    system_state == SYSTEM_RUNNING && !oops_in_progress) {
		/* prev_jiffy == 0 means nothing has been reported yet. */
		if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
			return;
		prev_jiffy = jiffies;
		printk(KERN_ERR "BUG: sleeping function called from invalid"
				" context at %s:%d\n", file, line);
		/*
		 * Use the same log level as the line above so that both
		 * lines of the report are filtered identically.
		 */
		printk(KERN_ERR "in_atomic():%d, irqs_disabled():%d\n",
			in_atomic(), irqs_disabled());
		debug_show_held_locks(current);
		if (irqs_disabled())
			print_irqtrace_events(current);
		dump_stack();
	}
#endif
}
EXPORT_SYMBOL(__might_sleep);
6362 #endif
6363
6364 #ifdef CONFIG_MAGIC_SYSRQ
6365 void normalize_rt_tasks(void)
6366 {
6367         struct task_struct *g, *p;
6368         unsigned long flags;
6369         struct rq *rq;
6370         int on_rq;
6371
6372         read_lock_irq(&tasklist_lock);
6373         do_each_thread(g, p) {
6374                 p->se.fair_key                  = 0;
6375                 p->se.wait_runtime              = 0;
6376                 p->se.wait_start_fair           = 0;
6377                 p->se.wait_start                = 0;
6378                 p->se.exec_start                = 0;
6379                 p->se.sleep_start               = 0;
6380                 p->se.sleep_start_fair          = 0;
6381                 p->se.block_start               = 0;
6382                 task_rq(p)->cfs.fair_clock      = 0;
6383                 task_rq(p)->clock               = 0;
6384
6385                 if (!rt_task(p)) {
6386                         /*
6387                          * Renice negative nice level userspace
6388                          * tasks back to 0:
6389                          */
6390                         if (TASK_NICE(p) < 0 && p->mm)
6391                                 set_user_nice(p, 0);
6392                         continue;
6393                 }
6394
6395                 spin_lock_irqsave(&p->pi_lock, flags);
6396                 rq = __task_rq_lock(p);
6397 #ifdef CONFIG_SMP
6398                 /*
6399                  * Do not touch the migration thread:
6400                  */
6401                 if (p == rq->migration_thread)
6402                         goto out_unlock;
6403 #endif
6404
6405                 on_rq = p->se.on_rq;
6406                 if (on_rq)
6407                         deactivate_task(task_rq(p), p, 0);
6408                 __setscheduler(rq, p, SCHED_NORMAL, 0);
6409                 if (on_rq) {
6410                         activate_task(task_rq(p), p, 0);
6411                         resched_task(rq->curr);
6412                 }
6413 #ifdef CONFIG_SMP
6414  out_unlock:
6415 #endif
6416                 __task_rq_unlock(rq);
6417                 spin_unlock_irqrestore(&p->pi_lock, flags);
6418         } while_each_thread(g, p);
6419
6420         read_unlock_irq(&tasklist_lock);
6421 }
6422
6423 #endif /* CONFIG_MAGIC_SYSRQ */
6424
6425 #ifdef CONFIG_IA64
6426 /*
6427  * These functions are only useful for the IA64 MCA handling.
6428  *
6429  * They can only be called when the whole system has been
6430  * stopped - every CPU needs to be quiescent, and no scheduling
6431  * activity can take place. Using them for anything else would
6432  * be a serious bug, and as a result, they aren't even visible
6433  * under any other configuration.
6434  */
6435
/**
 * curr_task - look up the task currently running on a cpu.
 * @cpu: the processor to query.
 *
 * Returns the value of cpu_curr() for @cpu.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
struct task_struct *curr_task(int cpu)
{
	struct task_struct *task = cpu_curr(cpu);

	return task;
}
6446
6447 /**
6448  * set_curr_task - set the current task for a given cpu.
6449  * @cpu: the processor in question.
6450  * @p: the task pointer to set.
6451  *
6452  * Description: This function must only be used when non-maskable interrupts
6453  * are serviced on a separate stack.  It allows the architecture to switch the
6454  * notion of the current task on a cpu in a non-blocking manner.  This function
6455  * must be called with all CPU's synchronized, and interrupts disabled, the
6456  * and caller must save the original value of the current task (see
6457  * curr_task() above) and restore that value before reenabling interrupts and
6458  * re-starting the system.
6459  *
6460  * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6461  */
6462 void set_curr_task(int cpu, struct task_struct *p)
6463 {
6464         cpu_curr(cpu) = p;
6465 }
6466
6467 #endif