kernel/sched/fair.c

   1 /*
   2  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
   3  *
   4  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
   5  *
   6  *  Interactivity improvements by Mike Galbraith
   7  *  (C) 2007 Mike Galbraith <efault@gmx.de>
   8  *
   9  *  Various enhancements by Dmitry Adamushko.
  10  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
  11  *
  12  *  Group scheduling enhancements by Srivatsa Vaddagiri
  13  *  Copyright IBM Corporation, 2007
  14  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  15  *
  16  *  Scaled math optimizations by Thomas Gleixner
  17  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
  18  *
  19  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
  20  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  21  */
  22
  23 #include <linux/latencytop.h>
  24 #include <linux/sched.h>
  25 #include <linux/cpumask.h>
  26 #include <linux/slab.h>
  27 #include <linux/profile.h>
  28 #include <linux/interrupt.h>
  29 #include <linux/mempolicy.h>
  30 #include <linux/migrate.h>
  31 #include <linux/task_work.h>
  32
  33 #include <trace/events/sched.h>
  34 #include <linux/sysfs.h>
  35 #include <linux/vmalloc.h>
  36 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
  37 /* Include cpufreq header to add a notifier so that cpu frequency
  38  * scaling can track the current CPU frequency
  39  */
  40 #include <linux/cpufreq.h>
  41 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
  42
  43 #include "sched.h"
  44
  45
  46 /*
  47  * Targeted preemption latency for CPU-bound tasks:
  48  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  49  *
  50  * NOTE: this latency value is not the same as the concept of
  51  * 'timeslice length' - timeslices in CFS are of variable length
  52  * and have no persistent notion like in traditional, time-slice
  53  * based scheduling concepts.
  54  *
  55  * (to see the precise effective timeslice length of your workload,
  56  *  run vmstat and monitor the context-switches (cs) field)
  57  */
  58 unsigned int sysctl_sched_latency = 6000000ULL;
  59 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
  60
  61 /*
  62  * The initial- and re-scaling of tunables is configurable
  63  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
  64  *
  65  * Options are:
  66  * SCHED_TUNABLESCALING_NONE - unscaled, always *1
  67  * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
  68  * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
  69  */
  70 enum sched_tunable_scaling sysctl_sched_tunable_scaling
  71         = SCHED_TUNABLESCALING_LOG;
  72
  73 /*
  74  * Minimal preemption granularity for CPU-bound tasks:
  75  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  76  */
  77 unsigned int sysctl_sched_min_granularity = 750000ULL;
  78 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
  79
  80 /*
  81  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
  82  */
  83 static unsigned int sched_nr_latency = 8;
  84
  85 /*
  86  * After fork, child runs first. If set to 0 (default) then
  87  * parent will (try to) run first.
  88  */
  89 unsigned int sysctl_sched_child_runs_first __read_mostly;
  90
  91 /*
  92  * SCHED_OTHER wake-up granularity.
  93  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  94  *
  95  * This option delays the preemption effects of decoupled workloads
  96  * and reduces their over-scheduling. Synchronous workloads will still
  97  * have immediate wakeup/sleep latencies.
  98  */
  99 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
 100 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 101
 102 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 103
 104 /*
 105  * The exponential sliding  window over which load is averaged for shares
 106  * distribution.
 107  * (default: 10msec)
 108  */
 109 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
 110
 111 #ifdef CONFIG_CFS_BANDWIDTH
 112 /*
 113  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 114  * each time a cfs_rq requests quota.
 115  *
 116  * Note: in the case that the slice exceeds the runtime remaining (either due
 117  * to consumption or the quota being specified to be smaller than the slice)
 118  * we will always only issue the remaining available time.
 119  *
 120  * default: 5 msec, units: microseconds
 121   */
 122 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
 123 #endif
 124
 125 /*
 126  * Increase the granularity value when there are more CPUs,
 127  * because with more CPUs the 'effective latency' as visible
 128  * to users decreases. But the relationship is not linear,
 129  * so pick a second-best guess by going with the log2 of the
 130  * number of CPUs.
 131  *
 132  * This idea comes from the SD scheduler of Con Kolivas:
 133  */
 134 static int get_update_sysctl_factor(void)
 135 {
 136         unsigned int cpus = min_t(int, num_online_cpus(), 8);
 137         unsigned int factor;
 138
 139         switch (sysctl_sched_tunable_scaling) {
 140         case SCHED_TUNABLESCALING_NONE:
 141                 factor = 1;
 142                 break;
 143         case SCHED_TUNABLESCALING_LINEAR:
 144                 factor = cpus;
 145                 break;
 146         case SCHED_TUNABLESCALING_LOG:
 147         default:
 148                 factor = 1 + ilog2(cpus);
 149                 break;
 150         }
 151
 152         return factor;
 153 }
 154
 155 static void update_sysctl(void)
 156 {
 157         unsigned int factor = get_update_sysctl_factor();
 158
 159 #define SET_SYSCTL(name) \
 160         (sysctl_##name = (factor) * normalized_sysctl_##name)
 161         SET_SYSCTL(sched_min_granularity);
 162         SET_SYSCTL(sched_latency);
 163         SET_SYSCTL(sched_wakeup_granularity);
 164 #undef SET_SYSCTL
 165 }
 166
 167 void sched_init_granularity(void)
 168 {
 169         update_sysctl();
 170 }
 171
 172 #if BITS_PER_LONG == 32
 173 # define WMULT_CONST    (~0UL)
 174 #else
 175 # define WMULT_CONST    (1UL << 32)
 176 #endif
 177
 178 #define WMULT_SHIFT     32
 179
 180 /*
 181  * Shift right and round:
 182  */
 183 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
 184
 185 /*
 186  * delta *= weight / lw
 187  */
 188 static unsigned long
 189 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 190                 struct load_weight *lw)
 191 {
 192         u64 tmp;
 193
 194         /*
 195          * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
 196          * entities since MIN_SHARES = 2. Treat weight as 1 if less than
 197          * 2^SCHED_LOAD_RESOLUTION.
 198          */
 199         if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
 200                 tmp = (u64)delta_exec * scale_load_down(weight);
 201         else
 202                 tmp = (u64)delta_exec;
 203
 204         if (!lw->inv_weight) {
 205                 unsigned long w = scale_load_down(lw->weight);
 206
 207                 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
 208                         lw->inv_weight = 1;
 209                 else if (unlikely(!w))
 210                         lw->inv_weight = WMULT_CONST;
 211                 else
 212                         lw->inv_weight = WMULT_CONST / w;
 213         }
 214
 215         /*
 216          * Check whether we'd overflow the 64-bit multiplication:
 217          */
 218         if (unlikely(tmp > WMULT_CONST))
 219                 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
 220                         WMULT_SHIFT/2);
 221         else
 222                 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
 223
 224         return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
 225 }
 226
 227
 228 const struct sched_class fair_sched_class;
 229
 230 /**************************************************************
 231  * CFS operations on generic schedulable entities:
 232  */
 233
 234 #ifdef CONFIG_FAIR_GROUP_SCHED
 235
 236 /* cpu runqueue to which this cfs_rq is attached */
 237 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 238 {
 239         return cfs_rq->rq;
 240 }
 241
 242 /* An entity is a task if it doesn't "own" a runqueue */
 243 #define entity_is_task(se)      (!se->my_q)
 244
 245 static inline struct task_struct *task_of(struct sched_entity *se)
 246 {
 247 #ifdef CONFIG_SCHED_DEBUG
 248         WARN_ON_ONCE(!entity_is_task(se));
 249 #endif
 250         return container_of(se, struct task_struct, se);
 251 }
 252
 253 /* Walk up scheduling entities hierarchy */
 254 #define for_each_sched_entity(se) \
 255                 for (; se; se = se->parent)
 256
 257 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 258 {
 259         return p->se.cfs_rq;
 260 }
 261
 262 /* runqueue on which this entity is (to be) queued */
 263 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 264 {
 265         return se->cfs_rq;
 266 }
 267
 268 /* runqueue "owned" by this group */
 269 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 270 {
 271         return grp->my_q;
 272 }
 273
 274 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
 275                                        int force_update);
 276
 277 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 278 {
 279         if (!cfs_rq->on_list) {
 280                 /*
 281                  * Ensure we either appear before our parent (if already
 282                  * enqueued) or force our parent to appear after us when it is
 283                  * enqueued.  The fact that we always enqueue bottom-up
 284                  * reduces this to two cases.
 285                  */
 286                 if (cfs_rq->tg->parent &&
 287                     cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
 288                         list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
 289                                 &rq_of(cfs_rq)->leaf_cfs_rq_list);
 290                 } else {
 291                         list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
 292                                 &rq_of(cfs_rq)->leaf_cfs_rq_list);
 293                 }
 294
 295                 cfs_rq->on_list = 1;
 296                 /* We should have no load, but we need to update last_decay. */
 297                 update_cfs_rq_blocked_load(cfs_rq, 0);
 298         }
 299 }
 300
 301 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 302 {
 303         if (cfs_rq->on_list) {
 304                 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
 305                 cfs_rq->on_list = 0;
 306         }
 307 }
 308
 309 /* Iterate thr' all leaf cfs_rq's on a runqueue */
 310 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 311         list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 312
 313 /* Do the two (enqueued) entities belong to the same group ? */
 314 static inline int
 315 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 316 {
 317         if (se->cfs_rq == pse->cfs_rq)
 318                 return 1;
 319
 320         return 0;
 321 }
 322
 323 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 324 {
 325         return se->parent;
 326 }
 327
 328 /* return depth at which a sched entity is present in the hierarchy */
 329 static inline int depth_se(struct sched_entity *se)
 330 {
 331         int depth = 0;
 332
 333         for_each_sched_entity(se)
 334                 depth++;
 335
 336         return depth;
 337 }
 338
 339 static void
 340 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 341 {
 342         int se_depth, pse_depth;
 343
 344         /*
 345          * preemption test can be made between sibling entities who are in the
 346          * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
 347          * both tasks until we find their ancestors who are siblings of common
 348          * parent.
 349          */
 350
 351         /* First walk up until both entities are at same depth */
 352         se_depth = depth_se(*se);
 353         pse_depth = depth_se(*pse);
 354
 355         while (se_depth > pse_depth) {
 356                 se_depth--;
 357                 *se = parent_entity(*se);
 358         }
 359
 360         while (pse_depth > se_depth) {
 361                 pse_depth--;
 362                 *pse = parent_entity(*pse);
 363         }
 364
 365         while (!is_same_group(*se, *pse)) {
 366                 *se = parent_entity(*se);
 367                 *pse = parent_entity(*pse);
 368         }
 369 }
 370
 371 #else   /* !CONFIG_FAIR_GROUP_SCHED */
 372
 373 static inline struct task_struct *task_of(struct sched_entity *se)
 374 {
 375         return container_of(se, struct task_struct, se);
 376 }
 377
 378 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 379 {
 380         return container_of(cfs_rq, struct rq, cfs);
 381 }
 382
 383 #define entity_is_task(se)      1
 384
 385 #define for_each_sched_entity(se) \
 386                 for (; se; se = NULL)
 387
 388 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 389 {
 390         return &task_rq(p)->cfs;
 391 }
 392
 393 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 394 {
 395         struct task_struct *p = task_of(se);
 396         struct rq *rq = task_rq(p);
 397
 398         return &rq->cfs;
 399 }
 400
 401 /* runqueue "owned" by this group */
 402 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 403 {
 404         return NULL;
 405 }
 406
 407 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 408 {
 409 }
 410
 411 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 412 {
 413 }
 414
 415 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 416                 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 417
 418 static inline int
 419 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 420 {
 421         return 1;
 422 }
 423
 424 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 425 {
 426         return NULL;
 427 }
 428
 429 static inline void
 430 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 431 {
 432 }
 433
 434 #endif  /* CONFIG_FAIR_GROUP_SCHED */
 435
 436 static __always_inline
 437 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
 438
 439 /**************************************************************
 440  * Scheduling class tree data structure manipulation methods:
 441  */
 442
 443 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
 444 {
 445         s64 delta = (s64)(vruntime - max_vruntime);
 446         if (delta > 0)
 447                 max_vruntime = vruntime;
 448
 449         return max_vruntime;
 450 }
 451
 452 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
 453 {
 454         s64 delta = (s64)(vruntime - min_vruntime);
 455         if (delta < 0)
 456                 min_vruntime = vruntime;
 457
 458         return min_vruntime;
 459 }
 460
 461 static inline int entity_before(struct sched_entity *a,
 462                                 struct sched_entity *b)
 463 {
 464         return (s64)(a->vruntime - b->vruntime) < 0;
 465 }
 466
 467 static void update_min_vruntime(struct cfs_rq *cfs_rq)
 468 {
 469         u64 vruntime = cfs_rq->min_vruntime;
 470
 471         if (cfs_rq->curr)
 472                 vruntime = cfs_rq->curr->vruntime;
 473
 474         if (cfs_rq->rb_leftmost) {
 475                 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
 476                                                    struct sched_entity,
 477                                                    run_node);
 478
 479                 if (!cfs_rq->curr)
 480                         vruntime = se->vruntime;
 481                 else
 482                         vruntime = min_vruntime(vruntime, se->vruntime);
 483         }
 484
 485         /* ensure we never gain time by being placed backwards. */
 486         cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
 487 #ifndef CONFIG_64BIT
 488         smp_wmb();
 489         cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 490 #endif
 491 }
 492
 493 /*
 494  * Enqueue an entity into the rb-tree:
 495  */
 496 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 497 {
 498         struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
 499         struct rb_node *parent = NULL;
 500         struct sched_entity *entry;
 501         int leftmost = 1;
 502
 503         /*
 504          * Find the right place in the rbtree:
 505          */
 506         while (*link) {
 507                 parent = *link;
 508                 entry = rb_entry(parent, struct sched_entity, run_node);
 509                 /*
 510                  * We dont care about collisions. Nodes with
 511                  * the same key stay together.
 512                  */
 513                 if (entity_before(se, entry)) {
 514                         link = &parent->rb_left;
 515                 } else {
 516                         link = &parent->rb_right;
 517                         leftmost = 0;
 518                 }
 519         }
 520
 521         /*
 522          * Maintain a cache of leftmost tree entries (it is frequently
 523          * used):
 524          */
 525         if (leftmost)
 526                 cfs_rq->rb_leftmost = &se->run_node;
 527
 528         rb_link_node(&se->run_node, parent, link);
 529         rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
 530 }
 531
 532 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 533 {
 534         if (cfs_rq->rb_leftmost == &se->run_node) {
 535                 struct rb_node *next_node;
 536
 537                 next_node = rb_next(&se->run_node);
 538                 cfs_rq->rb_leftmost = next_node;
 539         }
 540
 541         rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 542 }
 543
 544 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 545 {
 546         struct rb_node *left = cfs_rq->rb_leftmost;
 547
 548         if (!left)
 549                 return NULL;
 550
 551         return rb_entry(left, struct sched_entity, run_node);
 552 }
 553
 554 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
 555 {
 556         struct rb_node *next = rb_next(&se->run_node);
 557
 558         if (!next)
 559                 return NULL;
 560
 561         return rb_entry(next, struct sched_entity, run_node);
 562 }
 563
 564 #ifdef CONFIG_SCHED_DEBUG
 565 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 566 {
 567         struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
 568
 569         if (!last)
 570                 return NULL;
 571
 572         return rb_entry(last, struct sched_entity, run_node);
 573 }
 574
 575 /**************************************************************
 576  * Scheduling class statistics methods:
 577  */
 578
 579 int sched_proc_update_handler(struct ctl_table *table, int write,
 580                 void __user *buffer, size_t *lenp,
 581                 loff_t *ppos)
 582 {
 583         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 584         int factor = get_update_sysctl_factor();
 585
 586         if (ret || !write)
 587                 return ret;
 588
 589         sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
 590                                         sysctl_sched_min_granularity);
 591
 592 #define WRT_SYSCTL(name) \
 593         (normalized_sysctl_##name = sysctl_##name / (factor))
 594         WRT_SYSCTL(sched_min_granularity);
 595         WRT_SYSCTL(sched_latency);
 596         WRT_SYSCTL(sched_wakeup_granularity);
 597 #undef WRT_SYSCTL
 598
 599         return 0;
 600 }
 601 #endif
 602
 603 /*
 604  * delta /= w
 605  */
 606 static inline unsigned long
 607 calc_delta_fair(unsigned long delta, struct sched_entity *se)
 608 {
 609         if (unlikely(se->load.weight != NICE_0_LOAD))
 610                 delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
 611
 612         return delta;
 613 }
 614
 615 /*
 616  * The idea is to set a period in which each task runs once.
 617  *
 618  * When there are too many tasks (sched_nr_latency) we have to stretch
 619  * this period because otherwise the slices get too small.
 620  *
 621  * p = (nr <= nl) ? l : l*nr/nl
 622  */
 623 static u64 __sched_period(unsigned long nr_running)
 624 {
 625         u64 period = sysctl_sched_latency;
 626         unsigned long nr_latency = sched_nr_latency;
 627
 628         if (unlikely(nr_running > nr_latency)) {
 629                 period = sysctl_sched_min_granularity;
 630                 period *= nr_running;
 631         }
 632
 633         return period;
 634 }
 635
 636 /*
 637  * We calculate the wall-time slice from the period by taking a part
 638  * proportional to the weight.
 639  *
 640  * s = p*P[w/rw]
 641  */
 642 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 643 {
 644         u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
 645
 646         for_each_sched_entity(se) {
 647                 struct load_weight *load;
 648                 struct load_weight lw;
 649
 650                 cfs_rq = cfs_rq_of(se);
 651                 load = &cfs_rq->load;
 652
 653                 if (unlikely(!se->on_rq)) {
 654                         lw = cfs_rq->load;
 655
 656                         update_load_add(&lw, se->load.weight);
 657                         load = &lw;
 658                 }
 659                 slice = calc_delta_mine(slice, se->load.weight, load);
 660         }
 661         return slice;
 662 }
 663
 664 /*
 665  * We calculate the vruntime slice of a to-be-inserted task.
 666  *
 667  * vs = s/w
 668  */
 669 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 670 {
 671         return calc_delta_fair(sched_slice(cfs_rq, se), se);
 672 }
 673
 674 /*
 675  * Update the current task's runtime statistics. Skip current tasks that
 676  * are not in our scheduling class.
 677  */
 678 static inline void
 679 __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 680               unsigned long delta_exec)
 681 {
 682         unsigned long delta_exec_weighted;
 683
 684         schedstat_set(curr->statistics.exec_max,
 685                       max((u64)delta_exec, curr->statistics.exec_max));
 686
 687         curr->sum_exec_runtime += delta_exec;
 688         schedstat_add(cfs_rq, exec_clock, delta_exec);
 689         delta_exec_weighted = calc_delta_fair(delta_exec, curr);
 690
 691         curr->vruntime += delta_exec_weighted;
 692         update_min_vruntime(cfs_rq);
 693 }
 694
 695 static void update_curr(struct cfs_rq *cfs_rq)
 696 {
 697         struct sched_entity *curr = cfs_rq->curr;
 698         u64 now = rq_of(cfs_rq)->clock_task;
 699         unsigned long delta_exec;
 700
 701         if (unlikely(!curr))
 702                 return;
 703
 704         /*
 705          * Get the amount of time the current task was running
 706          * since the last time we changed load (this cannot
 707          * overflow on 32 bits):
 708          */
 709         delta_exec = (unsigned long)(now - curr->exec_start);
 710         if (!delta_exec)
 711                 return;
 712
 713         __update_curr(cfs_rq, curr, delta_exec);
 714         curr->exec_start = now;
 715
 716         if (entity_is_task(curr)) {
 717                 struct task_struct *curtask = task_of(curr);
 718
 719                 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
 720                 cpuacct_charge(curtask, delta_exec);
 721                 account_group_exec_runtime(curtask, delta_exec);
 722         }
 723
 724         account_cfs_rq_runtime(cfs_rq, delta_exec);
 725 }
 726
 727 static inline void
 728 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 729 {
 730         schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
 731 }
 732
 733 /*
 734  * Task is being enqueued - update stats:
 735  */
 736 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 737 {
 738         /*
 739          * Are we enqueueing a waiting task? (for current tasks
 740          * a dequeue/enqueue event is a NOP)
 741          */
 742         if (se != cfs_rq->curr)
 743                 update_stats_wait_start(cfs_rq, se);
 744 }
 745
 746 static void
 747 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 748 {
 749         schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
 750                         rq_of(cfs_rq)->clock - se->statistics.wait_start));
 751         schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
 752         schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
 753                         rq_of(cfs_rq)->clock - se->statistics.wait_start);
 754 #ifdef CONFIG_SCHEDSTATS
 755         if (entity_is_task(se)) {
 756                 trace_sched_stat_wait(task_of(se),
 757                         rq_of(cfs_rq)->clock - se->statistics.wait_start);
 758         }
 759 #endif
 760         schedstat_set(se->statistics.wait_start, 0);
 761 }
 762
 763 static inline void
 764 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 765 {
 766         /*
 767          * Mark the end of the wait period if dequeueing a
 768          * waiting task:
 769          */
 770         if (se != cfs_rq->curr)
 771                 update_stats_wait_end(cfs_rq, se);
 772 }
 773
 774 /*
 775  * We are picking a new current task - update its stats:
 776  */
 777 static inline void
 778 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 779 {
 780         /*
 781          * We are starting a new run period:
 782          */
 783         se->exec_start = rq_of(cfs_rq)->clock_task;
 784 }
 785
 786 /**************************************************
 787  * Scheduling class queueing methods:
 788  */
 789
 790 #ifdef CONFIG_NUMA_BALANCING
 791 /*
 792  * numa task sample period in ms
 793  */
 794 unsigned int sysctl_numa_balancing_scan_period_min = 100;
 795 unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
 796 unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
 797
 798 /* Portion of address space to scan in MB */
 799 unsigned int sysctl_numa_balancing_scan_size = 256;
 800
 801 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
 802 unsigned int sysctl_numa_balancing_scan_delay = 1000;
 803
 804 static void task_numa_placement(struct task_struct *p)
 805 {
 806         int seq;
 807
 808         if (!p->mm)     /* for example, ksmd faulting in a user's mm */
 809                 return;
 810         seq = ACCESS_ONCE(p->mm->numa_scan_seq);
 811         if (p->numa_scan_seq == seq)
 812                 return;
 813         p->numa_scan_seq = seq;
 814
 815         /* FIXME: Scheduling placement policy hints go here */
 816 }
 817
 818 /*
 819  * Got a PROT_NONE fault for a page on @node.
 820  */
 821 void task_numa_fault(int node, int pages, bool migrated)
 822 {
 823         struct task_struct *p = current;
 824
 825         if (!sched_feat_numa(NUMA))
 826                 return;
 827
 828         /* FIXME: Allocate task-specific structure for placement policy here */
 829
 830         /*
 831          * If pages are properly placed (did not migrate) then scan slower.
 832          * This is reset periodically in case of phase changes
 833          */
 834         if (!migrated)
 835                 p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
 836                         p->numa_scan_period + jiffies_to_msecs(10));
 837
 838         task_numa_placement(p);
 839 }
 840
 841 static void reset_ptenuma_scan(struct task_struct *p)
 842 {
 843         ACCESS_ONCE(p->mm->numa_scan_seq)++;
 844         p->mm->numa_scan_offset = 0;
 845 }
 846
 847 /*
 848  * The expensive part of numa migration is done from task_work context.
 849  * Triggered from task_tick_numa().
 850  */
 851 void task_numa_work(struct callback_head *work)
 852 {
 853         unsigned long migrate, next_scan, now = jiffies;
 854         struct task_struct *p = current;
 855         struct mm_struct *mm = p->mm;
 856         struct vm_area_struct *vma;
 857         unsigned long start, end;
 858         long pages;
 859
 860         WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
 861
 862         work->next = work; /* protect against double add */
 863         /*
 864          * Who cares about NUMA placement when they're dying.
 865          *
 866          * NOTE: make sure not to dereference p->mm before this check,
 867          * exit_task_work() happens _after_ exit_mm() so we could be called
 868          * without p->mm even though we still had it when we enqueued this
 869          * work.
 870          */
 871         if (p->flags & PF_EXITING)
 872                 return;
 873
 874         /*
 875          * We do not care about task placement until a task runs on a node
 876          * other than the first one used by the address space. This is
 877          * largely because migrations are driven by what CPU the task
 878          * is running on. If it's never scheduled on another node, it'll
 879          * not migrate so why bother trapping the fault.
 880          */
 881         if (mm->first_nid == NUMA_PTE_SCAN_INIT)
 882                 mm->first_nid = numa_node_id();
 883         if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
 884                 /* Are we running on a new node yet? */
 885                 if (numa_node_id() == mm->first_nid &&
 886                     !sched_feat_numa(NUMA_FORCE))
 887                         return;
 888
 889                 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
 890         }
 891
 892         /*
 893          * Reset the scan period if enough time has gone by. Objective is that
 894          * scanning will be reduced if pages are properly placed. As tasks
 895          * can enter different phases this needs to be re-examined. Lacking
 896          * proper tracking of reference behaviour, this blunt hammer is used.
 897          */
 898         migrate = mm->numa_next_reset;
 899         if (time_after(now, migrate)) {
 900                 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
 901                 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
 902                 xchg(&mm->numa_next_reset, next_scan);
 903         }
 904
 905         /*
 906          * Enforce maximal scan/migration frequency..
 907          */
 908         migrate = mm->numa_next_scan;
 909         if (time_before(now, migrate))
 910                 return;
 911
 912         if (p->numa_scan_period == 0)
 913                 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
 914
 915         next_scan = now + msecs_to_jiffies(p->numa_scan_period);
 916         if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
 917                 return;
 918
 919         /*
 920          * Do not set pte_numa if the current running node is rate-limited.
 921          * This loses statistics on the fault but if we are unwilling to
 922          * migrate to this node, it is less likely we can do useful work
 923          */
 924         if (migrate_ratelimited(numa_node_id()))
 925                 return;
 926
 927         start = mm->numa_scan_offset;
 928         pages = sysctl_numa_balancing_scan_size;
 929         pages <<= 20 - PAGE_SHIFT; /* MB in pages */
 930         if (!pages)
 931                 return;
 932
 933         down_read(&mm->mmap_sem);
 934         vma = find_vma(mm, start);
 935         if (!vma) {
 936                 reset_ptenuma_scan(p);
 937                 start = 0;
 938                 vma = mm->mmap;
 939         }
 940         for (; vma; vma = vma->vm_next) {
 941                 if (!vma_migratable(vma))
 942                         continue;
 943
 944                 /* Skip small VMAs. They are not likely to be of relevance */
 945                 if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
 946                         continue;
 947
 948                 do {
 949                         start = max(start, vma->vm_start);
 950                         end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
 951                         end = min(end, vma->vm_end);
 952                         pages -= change_prot_numa(vma, start, end);
 953
 954                         start = end;
 955                         if (pages <= 0)
 956                                 goto out;
 957                 } while (end != vma->vm_end);
 958         }
 959
 960 out:
 961         /*
 962          * It is possible to reach the end of the VMA list but the last few VMAs are
 963          * not guaranteed to the vma_migratable. If they are not, we would find the
 964          * !migratable VMA on the next scan but not reset the scanner to the start
 965          * so check it now.
 966          */
 967         if (vma)
 968                 mm->numa_scan_offset = start;
 969         else
 970                 reset_ptenuma_scan(p);
 971         up_read(&mm->mmap_sem);
 972 }
 973
 974 /*
 975  * Drive the periodic memory faults..
 976  */
 977 void task_tick_numa(struct rq *rq, struct task_struct *curr)
 978 {
 979         struct callback_head *work = &curr->numa_work;
 980         u64 period, now;
 981
 982         /*
 983          * We don't care about NUMA placement if we don't have memory.
 984          */
 985         if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
 986                 return;
 987
 988         /*
 989          * Using runtime rather than walltime has the dual advantage that
 990          * we (mostly) drive the selection from busy threads and that the
 991          * task needs to have done some actual work before we bother with
 992          * NUMA placement.
 993          */
 994         now = curr->se.sum_exec_runtime;
 995         period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
 996
 997         if (now - curr->node_stamp > period) {
 998                 if (!curr->node_stamp)
 999                         curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
1000                 curr->node_stamp = now;
1001
1002                 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
1003                         init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
1004                         task_work_add(curr, work, true);
1005                 }
1006         }
1007 }
1008 #else
1009 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1010 {
1011 }
1012 #endif /* CONFIG_NUMA_BALANCING */
1013
1014 static void
1015 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1016 {
1017         update_load_add(&cfs_rq->load, se->load.weight);
1018         if (!parent_entity(se))
1019                 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
1020 #ifdef CONFIG_SMP
1021         if (entity_is_task(se))
1022                 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
1023 #endif
1024         cfs_rq->nr_running++;
1025 }
1026
1027 static void
1028 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1029 {
1030         update_load_sub(&cfs_rq->load, se->load.weight);
1031         if (!parent_entity(se))
1032                 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
1033         if (entity_is_task(se))
1034                 list_del_init(&se->group_node);
1035         cfs_rq->nr_running--;
1036 }
1037
1038 #ifdef CONFIG_FAIR_GROUP_SCHED
1039 # ifdef CONFIG_SMP
1040 static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
1041 {
1042         long tg_weight;
1043
1044         /*
1045          * Use this CPU's actual weight instead of the last load_contribution
1046          * to gain a more accurate current total weight. See
1047          * update_cfs_rq_load_contribution().
1048          */
1049         tg_weight = atomic64_read(&tg->load_avg);
1050         tg_weight -= cfs_rq->tg_load_contrib;
1051         tg_weight += cfs_rq->load.weight;
1052
1053         return tg_weight;
1054 }
1055
1056 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1057 {
1058         long tg_weight, load, shares;
1059
1060         tg_weight = calc_tg_weight(tg, cfs_rq);
1061         load = cfs_rq->load.weight;
1062
1063         shares = (tg->shares * load);
1064         if (tg_weight)
1065                 shares /= tg_weight;
1066
1067         if (shares < MIN_SHARES)
1068                 shares = MIN_SHARES;
1069         if (shares > tg->shares)
1070                 shares = tg->shares;
1071
1072         return shares;
1073 }
1074 # else /* CONFIG_SMP */
1075 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1076 {
1077         return tg->shares;
1078 }
1079 # endif /* CONFIG_SMP */
1080 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
1081                             unsigned long weight)
1082 {
1083         if (se->on_rq) {
1084                 /* commit outstanding execution time */
1085                 if (cfs_rq->curr == se)
1086                         update_curr(cfs_rq);
1087                 account_entity_dequeue(cfs_rq, se);
1088         }
1089
1090         update_load_set(&se->load, weight);
1091
1092         if (se->on_rq)
1093                 account_entity_enqueue(cfs_rq, se);
1094 }
1095
1096 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
1097
1098 static void update_cfs_shares(struct cfs_rq *cfs_rq)
1099 {
1100         struct task_group *tg;
1101         struct sched_entity *se;
1102         long shares;
1103
1104         tg = cfs_rq->tg;
1105         se = tg->se[cpu_of(rq_of(cfs_rq))];
1106         if (!se || throttled_hierarchy(cfs_rq))
1107                 return;
1108 #ifndef CONFIG_SMP
1109         if (likely(se->load.weight == tg->shares))
1110                 return;
1111 #endif
1112         shares = calc_cfs_shares(cfs_rq, tg);
1113
1114         reweight_entity(cfs_rq_of(se), se, shares);
1115 }
1116 #else /* CONFIG_FAIR_GROUP_SCHED */
1117 static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
1118 {
1119 }
1120 #endif /* CONFIG_FAIR_GROUP_SCHED */
1121
1122 /* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
1123 #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1124 /*
1125  * We choose a half-life close to 1 scheduling period.
1126  * Note: The tables below are dependent on this value.
1127  */
1128 #define LOAD_AVG_PERIOD 32
1129 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
1130 #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
1131
1132 /* Precomputed fixed inverse multiplies for multiplication by y^n */
1133 static const u32 runnable_avg_yN_inv[] = {
1134         0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
1135         0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
1136         0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
1137         0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
1138         0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
1139         0x85aac367, 0x82cd8698,
1140 };
1141
1142 /*
1143  * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
1144  * over-estimates when re-combining.
1145  */
1146 static const u32 runnable_avg_yN_sum[] = {
1147             0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
1148          9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
1149         17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
1150 };
1151
1152 /*
1153  * Approximate:
1154  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
1155  */
1156 static __always_inline u64 decay_load(u64 val, u64 n)
1157 {
1158         unsigned int local_n;
1159
1160         if (!n)
1161                 return val;
1162         else if (unlikely(n > LOAD_AVG_PERIOD * 63))
1163                 return 0;
1164
1165         /* after bounds checking we can collapse to 32-bit */
1166         local_n = n;
1167
1168         /*
1169          * As y^PERIOD = 1/2, we can combine
1170          *    y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
1171          * With a look-up table which covers k^n (n<PERIOD)
1172          *
1173          * To achieve constant time decay_load.
1174          */
1175         if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
1176                 val >>= local_n / LOAD_AVG_PERIOD;
1177                 local_n %= LOAD_AVG_PERIOD;
1178         }
1179
1180         val *= runnable_avg_yN_inv[local_n];
1181         /* We don't use SRR here since we always want to round down. */
1182         return val >> 32;
1183 }
1184
1185 /*
1186  * For updates fully spanning n periods, the contribution to runnable
1187  * average will be: \Sum 1024*y^n
1188  *
1189  * We can compute this reasonably efficiently by combining:
1190  *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
1191  */
1192 static u32 __compute_runnable_contrib(u64 n)
1193 {
1194         u32 contrib = 0;
1195
1196         if (likely(n <= LOAD_AVG_PERIOD))
1197                 return runnable_avg_yN_sum[n];
1198         else if (unlikely(n >= LOAD_AVG_MAX_N))
1199                 return LOAD_AVG_MAX;
1200
1201         /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
1202         do {
1203                 contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
1204                 contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
1205
1206                 n -= LOAD_AVG_PERIOD;
1207         } while (n > LOAD_AVG_PERIOD);
1208
1209         contrib = decay_load(contrib, n);
1210         return contrib + runnable_avg_yN_sum[n];
1211 }
1212
1213 #ifdef CONFIG_SCHED_HMP
1214 #define HMP_VARIABLE_SCALE_SHIFT 16ULL
1215 struct hmp_global_attr {
1216         struct attribute attr;
1217         ssize_t (*show)(struct kobject *kobj,
1218                         struct attribute *attr, char *buf);
1219         ssize_t (*store)(struct kobject *a, struct attribute *b,
1220                         const char *c, size_t count);
1221         int *value;
1222         int (*to_sysfs)(int);
1223         int (*from_sysfs)(int);
1224         ssize_t (*to_sysfs_text)(char *buf, int buf_size);
1225 };
1226
1227 #define HMP_DATA_SYSFS_MAX 8
1228
1229 struct hmp_data_struct {
1230 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1231         int freqinvar_load_scale_enabled;
1232 #endif
1233         int multiplier; /* used to scale the time delta */
1234         struct attribute_group attr_group;
1235         struct attribute *attributes[HMP_DATA_SYSFS_MAX + 1];
1236         struct hmp_global_attr attr[HMP_DATA_SYSFS_MAX];
1237 } hmp_data;
1238
1239 static u64 hmp_variable_scale_convert(u64 delta);
1240 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1241 /* Frequency-Invariant Load Modification:
1242  * Loads are calculated as in PJT's patch however we also scale the current
1243  * contribution in line with the frequency of the CPU that the task was
1244  * executed on.
1245  * In this version, we use a simple linear scale derived from the maximum
1246  * frequency reported by CPUFreq. As an example:
1247  *
1248  * Consider that we ran a task for 100% of the previous interval.
1249  *
1250  * Our CPU was under asynchronous frequency control through one of the
1251  * CPUFreq governors.
1252  *
1253  * The CPUFreq governor reports that it is able to scale the CPU between
1254  * 500MHz and 1GHz.
1255  *
1256  * During the period, the CPU was running at 1GHz.
1257  *
1258  * In this case, our load contribution for that period is calculated as
1259  * 1 * (number_of_active_microseconds)
1260  *
1261  * This results in our task being able to accumulate maximum load as normal.
1262  *
1263  *
1264  * Consider now that our CPU was executing at 500MHz.
1265  *
1266  * We now scale the load contribution such that it is calculated as
1267  * 0.5 * (number_of_active_microseconds)
1268  *
1269  * Our task can only record 50% maximum load during this period.
1270  *
1271  * This represents the task consuming 50% of the CPU's *possible* compute
1272  * capacity. However the task did consume 100% of the CPU's *available*
1273  * compute capacity which is the value seen by the CPUFreq governor and
1274  * user-side CPU Utilization tools.
1275  *
1276  * Restricting tracked load to be scaled by the CPU's frequency accurately
1277  * represents the consumption of possible compute capacity and allows the
1278  * HMP migration's simple threshold migration strategy to interact more
1279  * predictably with CPUFreq's asynchronous compute capacity changes.
1280  */
1281 #define SCHED_FREQSCALE_SHIFT 10
1282 struct cpufreq_extents {
1283         u32 curr_scale;
1284         u32 min;
1285         u32 max;
1286         u32 flags;
1287 };
1288 /* Flag set when the governor in use only allows one frequency.
1289  * Disables scaling.
1290  */
1291 #define SCHED_LOAD_FREQINVAR_SINGLEFREQ 0x01
1292
1293 static struct cpufreq_extents freq_scale[CONFIG_NR_CPUS];
1294 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1295 #endif /* CONFIG_SCHED_HMP */
1296
1297 /* We can represent the historical contribution to runnable average as the
1298  * coefficients of a geometric series.  To do this we sub-divide our runnable
1299  * history into segments of approximately 1ms (1024us); label the segment that
1300  * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
1301  *
1302  * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
1303  *      p0            p1           p2
1304  *     (now)       (~1ms ago)  (~2ms ago)
1305  *
1306  * Let u_i denote the fraction of p_i that the entity was runnable.
1307  *
1308  * We then designate the fractions u_i as our co-efficients, yielding the
1309  * following representation of historical load:
1310  *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
1311  *
1312  * We choose y based on the with of a reasonably scheduling period, fixing:
1313  *   y^32 = 0.5
1314  *
1315  * This means that the contribution to load ~32ms ago (u_32) will be weighted
1316  * approximately half as much as the contribution to load within the last ms
1317  * (u_0).
1318  *
1319  * When a period "rolls over" and we have new u_0`, multiplying the previous
1320  * sum again by y is sufficient to update:
1321  *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
1322  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
1323  */
1324 static __always_inline int __update_entity_runnable_avg(u64 now,
1325                                                         struct sched_avg *sa,
1326                                                         int runnable,
1327                                                         int running,
1328                                                         int cpu)
1329 {
1330         u64 delta, periods;
1331         u32 runnable_contrib;
1332         int delta_w, decayed = 0;
1333 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1334         u64 scaled_delta;
1335         u32 scaled_runnable_contrib;
1336         int scaled_delta_w;
1337         u32 curr_scale = 1024;
1338 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1339
1340         delta = now - sa->last_runnable_update;
1341 #ifdef CONFIG_SCHED_HMP
1342         delta = hmp_variable_scale_convert(delta);
1343 #endif
1344         /*
1345          * This should only happen when time goes backwards, which it
1346          * unfortunately does during sched clock init when we swap over to TSC.
1347          */
1348         if ((s64)delta < 0) {
1349                 sa->last_runnable_update = now;
1350                 return 0;
1351         }
1352
1353         /*
1354          * Use 1024ns as the unit of measurement since it's a reasonable
1355          * approximation of 1us and fast to compute.
1356          */
1357         delta >>= 10;
1358         if (!delta)
1359                 return 0;
1360         sa->last_runnable_update = now;
1361
1362 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1363         /* retrieve scale factor for load */
1364         if (hmp_data.freqinvar_load_scale_enabled)
1365                 curr_scale = freq_scale[cpu].curr_scale;
1366 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1367
1368         /* delta_w is the amount already accumulated against our next period */
1369         delta_w = sa->runnable_avg_period % 1024;
1370         if (delta + delta_w >= 1024) {
1371                 /* period roll-over */
1372                 decayed = 1;
1373
1374                 /*
1375                  * Now that we know we're crossing a period boundary, figure
1376                  * out how much from delta we need to complete the current
1377                  * period and accrue it.
1378                  */
1379                 delta_w = 1024 - delta_w;
1380                 /* scale runnable time if necessary */
1381 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1382                 scaled_delta_w = (delta_w * curr_scale)
1383                                 >> SCHED_FREQSCALE_SHIFT;
1384                 if (runnable)
1385                         sa->runnable_avg_sum += scaled_delta_w;
1386                 if (running)
1387                         sa->usage_avg_sum += scaled_delta_w;
1388 #else
1389                 if (runnable)
1390                         sa->runnable_avg_sum += delta_w;
1391                 if (running)
1392                         sa->usage_avg_sum += delta_w;
1393 #endif /* #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1394                 sa->runnable_avg_period += delta_w;
1395
1396                 delta -= delta_w;
1397
1398                 /* Figure out how many additional periods this update spans */
1399                 periods = delta / 1024;
1400                 delta %= 1024;
1401                 /* decay the load we have accumulated so far */
1402                 sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
1403                                                   periods + 1);
1404                 sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
1405                                                      periods + 1);
1406                 sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1);
1407                 /* add the contribution from this period */
1408                 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
1409                 runnable_contrib = __compute_runnable_contrib(periods);
1410                 /* Apply load scaling if necessary.
1411                  * Note that multiplying the whole series is same as
1412                  * multiplying all terms
1413                  */
1414 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1415                 scaled_runnable_contrib = (runnable_contrib * curr_scale)
1416                                 >> SCHED_FREQSCALE_SHIFT;
1417                 if (runnable)
1418                         sa->runnable_avg_sum += scaled_runnable_contrib;
1419                 if (running)
1420                         sa->usage_avg_sum += scaled_runnable_contrib;
1421 #else
1422                 if (runnable)
1423                         sa->runnable_avg_sum += runnable_contrib;
1424                 if (running)
1425                         sa->usage_avg_sum += runnable_contrib;
1426 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1427                 sa->runnable_avg_period += runnable_contrib;
1428         }
1429
1430         /* Remainder of delta accrued against u_0` */
1431         /* scale if necessary */
1432 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1433         scaled_delta = ((delta * curr_scale) >> SCHED_FREQSCALE_SHIFT);
1434         if (runnable)
1435                 sa->runnable_avg_sum += scaled_delta;
1436         if (running)
1437                 sa->usage_avg_sum += scaled_delta;
1438 #else
1439         if (runnable)
1440                 sa->runnable_avg_sum += delta;
1441         if (running)
1442                 sa->usage_avg_sum += delta;
1443 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1444         sa->runnable_avg_period += delta;
1445
1446         return decayed;
1447 }
1448
1449 /* Synchronize an entity's decay with its parenting cfs_rq.*/
1450 static inline u64 __synchronize_entity_decay(struct sched_entity *se)
1451 {
1452         struct cfs_rq *cfs_rq = cfs_rq_of(se);
1453         u64 decays = atomic64_read(&cfs_rq->decay_counter);
1454
1455         decays -= se->avg.decay_count;
1456         if (!decays)
1457                 return 0;
1458
1459         se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
1460         se->avg.decay_count = 0;
1461
1462         return decays;
1463 }
1464
1465 #ifdef CONFIG_FAIR_GROUP_SCHED
1466 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1467                                                  int force_update)
1468 {
1469         struct task_group *tg = cfs_rq->tg;
1470         s64 tg_contrib;
1471
1472         tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
1473         tg_contrib -= cfs_rq->tg_load_contrib;
1474
1475         if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
1476                 atomic64_add(tg_contrib, &tg->load_avg);
1477                 cfs_rq->tg_load_contrib += tg_contrib;
1478         }
1479 }
1480
1481 /*
1482  * Aggregate cfs_rq runnable averages into an equivalent task_group
1483  * representation for computing load contributions.
1484  */
1485 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1486                                                   struct cfs_rq *cfs_rq)
1487 {
1488         struct task_group *tg = cfs_rq->tg;
1489         long contrib, usage_contrib;
1490
1491         /* The fraction of a cpu used by this cfs_rq */
1492         contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
1493                           sa->runnable_avg_period + 1);
1494         contrib -= cfs_rq->tg_runnable_contrib;
1495
1496         usage_contrib = div_u64(sa->usage_avg_sum << NICE_0_SHIFT,
1497                                 sa->runnable_avg_period + 1);
1498         usage_contrib -= cfs_rq->tg_usage_contrib;
1499
1500         /*
1501          * contrib/usage at this point represent deltas, only update if they
1502          * are substantive.
1503          */
1504         if ((abs(contrib) > cfs_rq->tg_runnable_contrib / 64) ||
1505             (abs(usage_contrib) > cfs_rq->tg_usage_contrib / 64)) {
1506                 atomic_add(contrib, &tg->runnable_avg);
1507                 cfs_rq->tg_runnable_contrib += contrib;
1508
1509                 atomic_add(usage_contrib, &tg->usage_avg);
1510                 cfs_rq->tg_usage_contrib += usage_contrib;
1511         }
1512 }
1513
1514 static inline void __update_group_entity_contrib(struct sched_entity *se)
1515 {
1516         struct cfs_rq *cfs_rq = group_cfs_rq(se);
1517         struct task_group *tg = cfs_rq->tg;
1518         int runnable_avg;
1519
1520         u64 contrib;
1521
1522         contrib = cfs_rq->tg_load_contrib * tg->shares;
1523         se->avg.load_avg_contrib = div64_u64(contrib,
1524                                              atomic64_read(&tg->load_avg) + 1);
1525
1526         /*
1527          * For group entities we need to compute a correction term in the case
1528          * that they are consuming <1 cpu so that we would contribute the same
1529          * load as a task of equal weight.
1530          *
1531          * Explicitly co-ordinating this measurement would be expensive, but
1532          * fortunately the sum of each cpus contribution forms a usable
1533          * lower-bound on the true value.
1534          *
1535          * Consider the aggregate of 2 contributions.  Either they are disjoint
1536          * (and the sum represents true value) or they are disjoint and we are
1537          * understating by the aggregate of their overlap.
1538          *
1539          * Extending this to N cpus, for a given overlap, the maximum amount we
1540          * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
1541          * cpus that overlap for this interval and w_i is the interval width.
1542          *
1543          * On a small machine; the first term is well-bounded which bounds the
1544          * total error since w_i is a subset of the period.  Whereas on a
1545          * larger machine, while this first term can be larger, if w_i is the
1546          * of consequential size guaranteed to see n_i*w_i quickly converge to
1547          * our upper bound of 1-cpu.
1548          */
1549         runnable_avg = atomic_read(&tg->runnable_avg);
1550         if (runnable_avg < NICE_0_LOAD) {
1551                 se->avg.load_avg_contrib *= runnable_avg;
1552                 se->avg.load_avg_contrib >>= NICE_0_SHIFT;
1553         }
1554 }
1555 #else
1556 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1557                                                  int force_update) {}
1558 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1559                                                   struct cfs_rq *cfs_rq) {}
1560 static inline void __update_group_entity_contrib(struct sched_entity *se) {}
1561 #endif
1562
1563 static inline void __update_task_entity_contrib(struct sched_entity *se)
1564 {
1565         u32 contrib;
1566
1567         /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
1568         contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
1569         contrib /= (se->avg.runnable_avg_period + 1);
1570         se->avg.load_avg_contrib = scale_load(contrib);
1571         trace_sched_task_load_contrib(task_of(se), se->avg.load_avg_contrib);
1572         contrib = se->avg.runnable_avg_sum * scale_load_down(NICE_0_LOAD);
1573         contrib /= (se->avg.runnable_avg_period + 1);
1574         se->avg.load_avg_ratio = scale_load(contrib);
1575         trace_sched_task_runnable_ratio(task_of(se), se->avg.load_avg_ratio);
1576 }
1577
1578 /* Compute the current contribution to load_avg by se, return any delta */
1579 static long __update_entity_load_avg_contrib(struct sched_entity *se, long *ratio)
1580 {
1581         long old_contrib = se->avg.load_avg_contrib;
1582         long old_ratio   = se->avg.load_avg_ratio;
1583
1584         if (entity_is_task(se)) {
1585                 __update_task_entity_contrib(se);
1586         } else {
1587                 __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
1588                 __update_group_entity_contrib(se);
1589         }
1590
1591         if (ratio)
1592                 *ratio = se->avg.load_avg_ratio - old_ratio;
1593         return se->avg.load_avg_contrib - old_contrib;
1594 }
1595
1596 static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
1597                                                  long load_contrib)
1598 {
1599         if (likely(load_contrib < cfs_rq->blocked_load_avg))
1600                 cfs_rq->blocked_load_avg -= load_contrib;
1601         else
1602                 cfs_rq->blocked_load_avg = 0;
1603 }
1604
1605 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
1606
1607 /* Update a sched_entity's runnable average */
1608 static inline void update_entity_load_avg(struct sched_entity *se,
1609                                           int update_cfs_rq)
1610 {
1611         struct cfs_rq *cfs_rq = cfs_rq_of(se);
1612         long contrib_delta, ratio_delta;
1613         u64 now;
1614         int cpu = -1;   /* not used in normal case */
1615
1616 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1617         cpu = cfs_rq->rq->cpu;
1618 #endif
1619         /*
1620          * For a group entity we need to use their owned cfs_rq_clock_task() in
1621          * case they are the parent of a throttled hierarchy.
1622          */
1623         if (entity_is_task(se))
1624                 now = cfs_rq_clock_task(cfs_rq);
1625         else
1626                 now = cfs_rq_clock_task(group_cfs_rq(se));
1627
1628         if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq,
1629                         cfs_rq->curr == se, cpu))
1630                 return;
1631
1632         contrib_delta = __update_entity_load_avg_contrib(se, &ratio_delta);
1633
1634         if (!update_cfs_rq)
1635                 return;
1636
1637         if (se->on_rq) {
1638                 cfs_rq->runnable_load_avg += contrib_delta;
1639                 rq_of(cfs_rq)->avg.load_avg_ratio += ratio_delta;
1640         } else {
1641                 subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
1642         }
1643 }
1644
1645 /*
1646  * Decay the load contributed by all blocked children and account this so that
1647  * their contribution may appropriately discounted when they wake up.
1648  */
1649 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
1650 {
1651         u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
1652         u64 decays;
1653
1654         decays = now - cfs_rq->last_decay;
1655         if (!decays && !force_update)
1656                 return;
1657
1658         if (atomic64_read(&cfs_rq->removed_load)) {
1659                 u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
1660                 subtract_blocked_load_contrib(cfs_rq, removed_load);
1661         }
1662
1663         if (decays) {
1664                 cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
1665                                                       decays);
1666                 atomic64_add(decays, &cfs_rq->decay_counter);
1667                 cfs_rq->last_decay = now;
1668         }
1669
1670         __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
1671 }
1672
1673 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
1674 {
1675         int cpu = -1;   /* not used in normal case */
1676
1677 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1678         cpu = rq->cpu;
1679 #endif
1680         __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable,
1681                                      runnable, cpu);
1682         __update_tg_runnable_avg(&rq->avg, &rq->cfs);
1683         trace_sched_rq_runnable_ratio(cpu_of(rq), rq->avg.load_avg_ratio);
1684         trace_sched_rq_runnable_load(cpu_of(rq), rq->cfs.runnable_load_avg);
1685         trace_sched_rq_nr_running(cpu_of(rq), rq->nr_running, rq->nr_iowait.counter);
1686 }
1687
1688 /* Add the load generated by se into cfs_rq's child load-average */
1689 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1690                                                   struct sched_entity *se,
1691                                                   int wakeup)
1692 {
1693         /*
1694          * We track migrations using entity decay_count <= 0, on a wake-up
1695          * migration we use a negative decay count to track the remote decays
1696          * accumulated while sleeping.
1697          */
1698         if (unlikely(se->avg.decay_count <= 0)) {
1699                 se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
1700                 if (se->avg.decay_count) {
1701                         /*
1702                          * In a wake-up migration we have to approximate the
1703                          * time sleeping.  This is because we can't synchronize
1704                          * clock_task between the two cpus, and it is not
1705                          * guaranteed to be read-safe.  Instead, we can
1706                          * approximate this using our carried decays, which are
1707                          * explicitly atomically readable.
1708                          */
1709                         se->avg.last_runnable_update -= (-se->avg.decay_count)
1710                                                         << 20;
1711                         update_entity_load_avg(se, 0);
1712                         /* Indicate that we're now synchronized and on-rq */
1713                         se->avg.decay_count = 0;
1714                 }
1715                 wakeup = 0;
1716         } else {
1717                 __synchronize_entity_decay(se);
1718         }
1719
1720         /* migrated tasks did not contribute to our blocked load */
1721         if (wakeup) {
1722                 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
1723                 update_entity_load_avg(se, 0);
1724         }
1725
1726         cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
1727         rq_of(cfs_rq)->avg.load_avg_ratio += se->avg.load_avg_ratio;
1728
1729         /* we force update consideration on load-balancer moves */
1730         update_cfs_rq_blocked_load(cfs_rq, !wakeup);
1731 }
1732
1733 /*
1734  * Remove se's load from this cfs_rq child load-average, if the entity is
1735  * transitioning to a blocked state we track its projected decay using
1736  * blocked_load_avg.
1737  */
1738 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1739                                                   struct sched_entity *se,
1740                                                   int sleep)
1741 {
1742         update_entity_load_avg(se, 1);
1743         /* we force update consideration on load-balancer moves */
1744         update_cfs_rq_blocked_load(cfs_rq, !sleep);
1745
1746         cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
1747         rq_of(cfs_rq)->avg.load_avg_ratio -= se->avg.load_avg_ratio;
1748
1749         if (sleep) {
1750                 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
1751                 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
1752         } /* migrations, e.g. sleep=0 leave decay_count == 0 */
1753 }
1754
1755 /*
1756  * Update the rq's load with the elapsed running time before entering
1757  * idle. if the last scheduled task is not a CFS task, idle_enter will
1758  * be the only way to update the runnable statistic.
1759  */
1760 void idle_enter_fair(struct rq *this_rq)
1761 {
1762         update_rq_runnable_avg(this_rq, 1);
1763 }
1764
1765 /*
1766  * Update the rq's load with the elapsed idle time before a task is
1767  * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
1768  * be the only way to update the runnable statistic.
1769  */
1770 void idle_exit_fair(struct rq *this_rq)
1771 {
1772         update_rq_runnable_avg(this_rq, 0);
1773 }
1774
1775 #else
1776 static inline void update_entity_load_avg(struct sched_entity *se,
1777                                           int update_cfs_rq) {}
1778 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
1779 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1780                                            struct sched_entity *se,
1781                                            int wakeup) {}
1782 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1783                                            struct sched_entity *se,
1784                                            int sleep) {}
1785 static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
1786                                               int force_update) {}
1787 #endif
1788
1789 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
1790 {
1791 #ifdef CONFIG_SCHEDSTATS
1792         struct task_struct *tsk = NULL;
1793
1794         if (entity_is_task(se))
1795                 tsk = task_of(se);
1796
1797         if (se->statistics.sleep_start) {
1798                 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
1799
1800                 if ((s64)delta < 0)
1801                         delta = 0;
1802
1803                 if (unlikely(delta > se->statistics.sleep_max))
1804                         se->statistics.sleep_max = delta;
1805
1806                 se->statistics.sleep_start = 0;
1807                 se->statistics.sum_sleep_runtime += delta;
1808
1809                 if (tsk) {
1810                         account_scheduler_latency(tsk, delta >> 10, 1);
1811                         trace_sched_stat_sleep(tsk, delta);
1812                 }
1813         }
1814         if (se->statistics.block_start) {
1815                 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
1816
1817                 if ((s64)delta < 0)
1818                         delta = 0;
1819
1820                 if (unlikely(delta > se->statistics.block_max))
1821                         se->statistics.block_max = delta;
1822
1823                 se->statistics.block_start = 0;
1824                 se->statistics.sum_sleep_runtime += delta;
1825
1826                 if (tsk) {
1827                         if (tsk->in_iowait) {
1828                                 se->statistics.iowait_sum += delta;
1829                                 se->statistics.iowait_count++;
1830                                 trace_sched_stat_iowait(tsk, delta);
1831                         }
1832
1833                         trace_sched_stat_blocked(tsk, delta);
1834
1835                         /*
1836                          * Blocking time is in units of nanosecs, so shift by
1837                          * 20 to get a milliseconds-range estimation of the
1838                          * amount of time that the task spent sleeping:
1839                          */
1840                         if (unlikely(prof_on == SLEEP_PROFILING)) {
1841                                 profile_hits(SLEEP_PROFILING,
1842                                                 (void *)get_wchan(tsk),
1843                                                 delta >> 20);
1844                         }
1845                         account_scheduler_latency(tsk, delta >> 10, 0);
1846                 }
1847         }
1848 #endif
1849 }
1850
1851 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
1852 {
1853 #ifdef CONFIG_SCHED_DEBUG
1854         s64 d = se->vruntime - cfs_rq->min_vruntime;
1855
1856         if (d < 0)
1857                 d = -d;
1858
1859         if (d > 3*sysctl_sched_latency)
1860                 schedstat_inc(cfs_rq, nr_spread_over);
1861 #endif
1862 }
1863
1864 static void
1865 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
1866 {
1867         u64 vruntime = cfs_rq->min_vruntime;
1868
1869         /*
1870          * The 'current' period is already promised to the current tasks,
1871          * however the extra weight of the new task will slow them down a
1872          * little, place the new task so that it fits in the slot that
1873          * stays open at the end.
1874          */
1875         if (initial && sched_feat(START_DEBIT))
1876                 vruntime += sched_vslice(cfs_rq, se);
1877
1878         /* sleeps up to a single latency don't count. */
1879         if (!initial) {
1880                 unsigned long thresh = sysctl_sched_latency;
1881
1882                 /*
1883                  * Halve their sleep time's effect, to allow
1884                  * for a gentler effect of sleepers:
1885                  */
1886                 if (sched_feat(GENTLE_FAIR_SLEEPERS))
1887                         thresh >>= 1;
1888
1889                 vruntime -= thresh;
1890         }
1891
1892         /* ensure we never gain time by being placed backwards. */
1893         se->vruntime = max_vruntime(se->vruntime, vruntime);
1894 }
1895
1896 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
1897
1898 static void
1899 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1900 {
1901         /*
1902          * Update the normalized vruntime before updating min_vruntime
1903          * through callig update_curr().
1904          */
1905         if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
1906                 se->vruntime += cfs_rq->min_vruntime;
1907
1908         /*
1909          * Update run-time statistics of the 'current'.
1910          */
1911         update_curr(cfs_rq);
1912         enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
1913         account_entity_enqueue(cfs_rq, se);
1914         update_cfs_shares(cfs_rq);
1915
1916         if (flags & ENQUEUE_WAKEUP) {
1917                 place_entity(cfs_rq, se, 0);
1918                 enqueue_sleeper(cfs_rq, se);
1919         }
1920
1921         update_stats_enqueue(cfs_rq, se);
1922         check_spread(cfs_rq, se);
1923         if (se != cfs_rq->curr)
1924                 __enqueue_entity(cfs_rq, se);
1925         se->on_rq = 1;
1926
1927         if (cfs_rq->nr_running == 1) {
1928                 list_add_leaf_cfs_rq(cfs_rq);
1929                 check_enqueue_throttle(cfs_rq);
1930         }
1931 }
1932
1933 static void __clear_buddies_last(struct sched_entity *se)
1934 {
1935         for_each_sched_entity(se) {
1936                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1937                 if (cfs_rq->last == se)
1938                         cfs_rq->last = NULL;
1939                 else
1940                         break;
1941         }
1942 }
1943
1944 static void __clear_buddies_next(struct sched_entity *se)
1945 {
1946         for_each_sched_entity(se) {
1947                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1948                 if (cfs_rq->next == se)
1949                         cfs_rq->next = NULL;
1950                 else
1951                         break;
1952         }
1953 }
1954
1955 static void __clear_buddies_skip(struct sched_entity *se)
1956 {
1957         for_each_sched_entity(se) {
1958                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1959                 if (cfs_rq->skip == se)
1960                         cfs_rq->skip = NULL;
1961                 else
1962                         break;
1963         }
1964 }
1965
1966 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1967 {
1968         if (cfs_rq->last == se)
1969                 __clear_buddies_last(se);
1970
1971         if (cfs_rq->next == se)
1972                 __clear_buddies_next(se);
1973
1974         if (cfs_rq->skip == se)
1975                 __clear_buddies_skip(se);
1976 }
1977
1978 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1979
1980 static void
1981 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1982 {
1983         /*
1984          * Update run-time statistics of the 'current'.
1985          */
1986         update_curr(cfs_rq);
1987         dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
1988
1989         update_stats_dequeue(cfs_rq, se);
1990         if (flags & DEQUEUE_SLEEP) {
1991 #ifdef CONFIG_SCHEDSTATS
1992                 if (entity_is_task(se)) {
1993                         struct task_struct *tsk = task_of(se);
1994
1995                         if (tsk->state & TASK_INTERRUPTIBLE)
1996                                 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
1997                         if (tsk->state & TASK_UNINTERRUPTIBLE)
1998                                 se->statistics.block_start = rq_of(cfs_rq)->clock;
1999                 }
2000 #endif
2001         }
2002
2003         clear_buddies(cfs_rq, se);
2004
2005         if (se != cfs_rq->curr)
2006                 __dequeue_entity(cfs_rq, se);
2007         se->on_rq = 0;
2008         account_entity_dequeue(cfs_rq, se);
2009
2010         /*
2011          * Normalize the entity after updating the min_vruntime because the
2012          * update can refer to the ->curr item and we need to reflect this
2013          * movement in our normalized position.
2014          */
2015         if (!(flags & DEQUEUE_SLEEP))
2016                 se->vruntime -= cfs_rq->min_vruntime;
2017
2018         /* return excess runtime on last dequeue */
2019         return_cfs_rq_runtime(cfs_rq);
2020
2021         update_min_vruntime(cfs_rq);
2022         update_cfs_shares(cfs_rq);
2023 }
2024
2025 /*
2026  * Preempt the current task with a newly woken task if needed:
2027  */
2028 static void
2029 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2030 {
2031         unsigned long ideal_runtime, delta_exec;
2032         struct sched_entity *se;
2033         s64 delta;
2034
2035         ideal_runtime = sched_slice(cfs_rq, curr);
2036         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
2037         if (delta_exec > ideal_runtime) {
2038                 resched_task(rq_of(cfs_rq)->curr);
2039                 /*
2040                  * The current task ran long enough, ensure it doesn't get
2041                  * re-elected due to buddy favours.
2042                  */
2043                 clear_buddies(cfs_rq, curr);
2044                 return;
2045         }
2046
2047         /*
2048          * Ensure that a task that missed wakeup preemption by a
2049          * narrow margin doesn't have to wait for a full slice.
2050          * This also mitigates buddy induced latencies under load.
2051          */
2052         if (delta_exec < sysctl_sched_min_granularity)
2053                 return;
2054
2055         se = __pick_first_entity(cfs_rq);
2056         delta = curr->vruntime - se->vruntime;
2057
2058         if (delta < 0)
2059                 return;
2060
2061         if (delta > ideal_runtime)
2062                 resched_task(rq_of(cfs_rq)->curr);
2063 }
2064
2065 static void
2066 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
2067 {
2068         /* 'current' is not kept within the tree. */
2069         if (se->on_rq) {
2070                 /*
2071                  * Any task has to be enqueued before it get to execute on
2072                  * a CPU. So account for the time it spent waiting on the
2073                  * runqueue.
2074                  */
2075                 update_stats_wait_end(cfs_rq, se);
2076                 __dequeue_entity(cfs_rq, se);
2077                 update_entity_load_avg(se, 1);
2078         }
2079
2080         update_stats_curr_start(cfs_rq, se);
2081         cfs_rq->curr = se;
2082 #ifdef CONFIG_SCHEDSTATS
2083         /*
2084          * Track our maximum slice length, if the CPU's load is at
2085          * least twice that of our own weight (i.e. dont track it
2086          * when there are only lesser-weight tasks around):
2087          */
2088         if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
2089                 se->statistics.slice_max = max(se->statistics.slice_max,
2090                         se->sum_exec_runtime - se->prev_sum_exec_runtime);
2091         }
2092 #endif
2093         se->prev_sum_exec_runtime = se->sum_exec_runtime;
2094 }
2095
2096 static int
2097 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
2098
2099 /*
2100  * Pick the next process, keeping these things in mind, in this order:
2101  * 1) keep things fair between processes/task groups
2102  * 2) pick the "next" process, since someone really wants that to run
2103  * 3) pick the "last" process, for cache locality
2104  * 4) do not run the "skip" process, if something else is available
2105  */
2106 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
2107 {
2108         struct sched_entity *se = __pick_first_entity(cfs_rq);
2109         struct sched_entity *left = se;
2110
2111         /*
2112          * Avoid running the skip buddy, if running something else can
2113          * be done without getting too unfair.
2114          */
2115         if (cfs_rq->skip == se) {
2116                 struct sched_entity *second = __pick_next_entity(se);
2117                 if (second && wakeup_preempt_entity(second, left) < 1)
2118                         se = second;
2119         }
2120
2121         /*
2122          * Prefer last buddy, try to return the CPU to a preempted task.
2123          */
2124         if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
2125                 se = cfs_rq->last;
2126
2127         /*
2128          * Someone really wants this to run. If it's not unfair, run it.
2129          */
2130         if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
2131                 se = cfs_rq->next;
2132
2133         clear_buddies(cfs_rq, se);
2134
2135         return se;
2136 }
2137
2138 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
2139
2140 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
2141 {
2142         /*
2143          * If still on the runqueue then deactivate_task()
2144          * was not called and update_curr() has to be done:
2145          */
2146         if (prev->on_rq)
2147                 update_curr(cfs_rq);
2148
2149         /* throttle cfs_rqs exceeding runtime */
2150         check_cfs_rq_runtime(cfs_rq);
2151
2152         check_spread(cfs_rq, prev);
2153         if (prev->on_rq) {
2154                 update_stats_wait_start(cfs_rq, prev);
2155                 /* Put 'current' back into the tree. */
2156                 __enqueue_entity(cfs_rq, prev);
2157                 /* in !on_rq case, update occurred at dequeue */
2158                 update_entity_load_avg(prev, 1);
2159         }
2160         cfs_rq->curr = NULL;
2161 }
2162
2163 static void
2164 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
2165 {
2166         /*
2167          * Update run-time statistics of the 'current'.
2168          */
2169         update_curr(cfs_rq);
2170
2171         /*
2172          * Ensure that runnable average is periodically updated.
2173          */
2174         update_entity_load_avg(curr, 1);
2175         update_cfs_rq_blocked_load(cfs_rq, 1);
2176         update_cfs_shares(cfs_rq);
2177
2178 #ifdef CONFIG_SCHED_HRTICK
2179         /*
2180          * queued ticks are scheduled to match the slice, so don't bother
2181          * validating it and just reschedule.
2182          */
2183         if (queued) {
2184                 resched_task(rq_of(cfs_rq)->curr);
2185                 return;
2186         }
2187         /*
2188          * don't let the period tick interfere with the hrtick preemption
2189          */
2190         if (!sched_feat(DOUBLE_TICK) &&
2191                         hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
2192                 return;
2193 #endif
2194
2195         if (cfs_rq->nr_running > 1)
2196                 check_preempt_tick(cfs_rq, curr);
2197 }
2198
2199
2200 /**************************************************
2201  * CFS bandwidth control machinery
2202  */
2203
2204 #ifdef CONFIG_CFS_BANDWIDTH
2205
2206 #ifdef HAVE_JUMP_LABEL
2207 static struct static_key __cfs_bandwidth_used;
2208
2209 static inline bool cfs_bandwidth_used(void)
2210 {
2211         return static_key_false(&__cfs_bandwidth_used);
2212 }
2213
2214 void account_cfs_bandwidth_used(int enabled, int was_enabled)
2215 {
2216         /* only need to count groups transitioning between enabled/!enabled */
2217         if (enabled && !was_enabled)
2218                 static_key_slow_inc(&__cfs_bandwidth_used);
2219         else if (!enabled && was_enabled)
2220                 static_key_slow_dec(&__cfs_bandwidth_used);
2221 }
2222 #else /* HAVE_JUMP_LABEL */
2223 static bool cfs_bandwidth_used(void)
2224 {
2225         return true;
2226 }
2227
2228 void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
2229 #endif /* HAVE_JUMP_LABEL */
2230
2231 /*
2232  * default period for cfs group bandwidth.
2233  * default: 0.1s, units: nanoseconds
2234  */
2235 static inline u64 default_cfs_period(void)
2236 {
2237         return 100000000ULL;
2238 }
2239
2240 static inline u64 sched_cfs_bandwidth_slice(void)
2241 {
2242         return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
2243 }
2244
2245 /*
2246  * Replenish runtime according to assigned quota and update expiration time.
2247  * We use sched_clock_cpu directly instead of rq->clock to avoid adding
2248  * additional synchronization around rq->lock.
2249  *
2250  * requires cfs_b->lock
2251  */
2252 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
2253 {
2254         u64 now;
2255
2256         if (cfs_b->quota == RUNTIME_INF)
2257                 return;
2258
2259         now = sched_clock_cpu(smp_processor_id());
2260         cfs_b->runtime = cfs_b->quota;
2261         cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
2262 }
2263
2264 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2265 {
2266         return &tg->cfs_bandwidth;
2267 }
2268
2269 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */
2270 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2271 {
2272         if (unlikely(cfs_rq->throttle_count))
2273                 return cfs_rq->throttled_clock_task;
2274
2275         return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
2276 }
2277
2278 /* returns 0 on failure to allocate runtime */
2279 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2280 {
2281         struct task_group *tg = cfs_rq->tg;
2282         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
2283         u64 amount = 0, min_amount, expires;
2284
2285         /* note: this is a positive sum as runtime_remaining <= 0 */
2286         min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
2287
2288         raw_spin_lock(&cfs_b->lock);
2289         if (cfs_b->quota == RUNTIME_INF)
2290                 amount = min_amount;
2291         else {
2292                 /*
2293                  * If the bandwidth pool has become inactive, then at least one
2294                  * period must have elapsed since the last consumption.
2295                  * Refresh the global state and ensure bandwidth timer becomes
2296                  * active.
2297                  */
2298                 if (!cfs_b->timer_active) {
2299                         __refill_cfs_bandwidth_runtime(cfs_b);
2300                         __start_cfs_bandwidth(cfs_b);
2301                 }
2302
2303                 if (cfs_b->runtime > 0) {
2304                         amount = min(cfs_b->runtime, min_amount);
2305                         cfs_b->runtime -= amount;
2306                         cfs_b->idle = 0;
2307                 }
2308         }
2309         expires = cfs_b->runtime_expires;
2310         raw_spin_unlock(&cfs_b->lock);
2311
2312         cfs_rq->runtime_remaining += amount;
2313         /*
2314          * we may have advanced our local expiration to account for allowed
2315          * spread between our sched_clock and the one on which runtime was
2316          * issued.
2317          */
2318         if ((s64)(expires - cfs_rq->runtime_expires) > 0)
2319                 cfs_rq->runtime_expires = expires;
2320
2321         return cfs_rq->runtime_remaining > 0;
2322 }
2323
2324 /*
2325  * Note: This depends on the synchronization provided by sched_clock and the
2326  * fact that rq->clock snapshots this value.
2327  */
2328 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2329 {
2330         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2331         struct rq *rq = rq_of(cfs_rq);
2332
2333         /* if the deadline is ahead of our clock, nothing to do */
2334         if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
2335                 return;
2336
2337         if (cfs_rq->runtime_remaining < 0)
2338                 return;
2339
2340         /*
2341          * If the local deadline has passed we have to consider the
2342          * possibility that our sched_clock is 'fast' and the global deadline
2343          * has not truly expired.
2344          *
2345          * Fortunately we can check determine whether this the case by checking
2346          * whether the global deadline has advanced.
2347          */
2348
2349         if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
2350                 /* extend local deadline, drift is bounded above by 2 ticks */
2351                 cfs_rq->runtime_expires += TICK_NSEC;
2352         } else {
2353                 /* global deadline is ahead, expiration has passed */
2354                 cfs_rq->runtime_remaining = 0;
2355         }
2356 }
2357
2358 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2359                                      unsigned long delta_exec)
2360 {
2361         /* dock delta_exec before expiring quota (as it could span periods) */
2362         cfs_rq->runtime_remaining -= delta_exec;
2363         expire_cfs_rq_runtime(cfs_rq);
2364
2365         if (likely(cfs_rq->runtime_remaining > 0))
2366                 return;
2367
2368         /*
2369          * if we're unable to extend our runtime we resched so that the active
2370          * hierarchy can be throttled
2371          */
2372         if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
2373                 resched_task(rq_of(cfs_rq)->curr);
2374 }
2375
2376 static __always_inline
2377 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
2378 {
2379         if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
2380                 return;
2381
2382         __account_cfs_rq_runtime(cfs_rq, delta_exec);
2383 }
2384
2385 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2386 {
2387         return cfs_bandwidth_used() && cfs_rq->throttled;
2388 }
2389
2390 /* check whether cfs_rq, or any parent, is throttled */
2391 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2392 {
2393         return cfs_bandwidth_used() && cfs_rq->throttle_count;
2394 }
2395
2396 /*
2397  * Ensure that neither of the group entities corresponding to src_cpu or
2398  * dest_cpu are members of a throttled hierarchy when performing group
2399  * load-balance operations.
2400  */
2401 static inline int throttled_lb_pair(struct task_group *tg,
2402                                     int src_cpu, int dest_cpu)
2403 {
2404         struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
2405
2406         src_cfs_rq = tg->cfs_rq[src_cpu];
2407         dest_cfs_rq = tg->cfs_rq[dest_cpu];
2408
2409         return throttled_hierarchy(src_cfs_rq) ||
2410                throttled_hierarchy(dest_cfs_rq);
2411 }
2412
2413 /* updated child weight may affect parent so we have to do this bottom up */
2414 static int tg_unthrottle_up(struct task_group *tg, void *data)
2415 {
2416         struct rq *rq = data;
2417         struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2418
2419         cfs_rq->throttle_count--;
2420 #ifdef CONFIG_SMP
2421         if (!cfs_rq->throttle_count) {
2422                 /* adjust cfs_rq_clock_task() */
2423                 cfs_rq->throttled_clock_task_time += rq->clock_task -
2424                                              cfs_rq->throttled_clock_task;
2425         }
2426 #endif
2427
2428         return 0;
2429 }
2430
2431 static int tg_throttle_down(struct task_group *tg, void *data)
2432 {
2433         struct rq *rq = data;
2434         struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2435
2436         /* group is entering throttled state, stop time */
2437         if (!cfs_rq->throttle_count)
2438                 cfs_rq->throttled_clock_task = rq->clock_task;
2439         cfs_rq->throttle_count++;
2440
2441         return 0;
2442 }
2443
2444 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
2445 {
2446         struct rq *rq = rq_of(cfs_rq);
2447         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2448         struct sched_entity *se;
2449         long task_delta, dequeue = 1;
2450
2451         se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2452
2453         /* freeze hierarchy runnable averages while throttled */
2454         rcu_read_lock();
2455         walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
2456         rcu_read_unlock();
2457
2458         task_delta = cfs_rq->h_nr_running;
2459         for_each_sched_entity(se) {
2460                 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
2461                 /* throttled entity or throttle-on-deactivate */
2462                 if (!se->on_rq)
2463                         break;
2464
2465                 if (dequeue)
2466                         dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
2467                 qcfs_rq->h_nr_running -= task_delta;
2468
2469                 if (qcfs_rq->load.weight)
2470                         dequeue = 0;
2471         }
2472
2473         if (!se)
2474                 rq->nr_running -= task_delta;
2475
2476         cfs_rq->throttled = 1;
2477         cfs_rq->throttled_clock = rq->clock;
2478         raw_spin_lock(&cfs_b->lock);
2479         list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
2480         raw_spin_unlock(&cfs_b->lock);
2481 }
2482
2483 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
2484 {
2485         struct rq *rq = rq_of(cfs_rq);
2486         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2487         struct sched_entity *se;
2488         int enqueue = 1;
2489         long task_delta;
2490
2491         se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2492
2493         cfs_rq->throttled = 0;
2494         raw_spin_lock(&cfs_b->lock);
2495         cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
2496         list_del_rcu(&cfs_rq->throttled_list);
2497         raw_spin_unlock(&cfs_b->lock);
2498
2499         update_rq_clock(rq);
2500         /* update hierarchical throttle state */
2501         walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
2502
2503         if (!cfs_rq->load.weight)
2504                 return;
2505
2506         task_delta = cfs_rq->h_nr_running;
2507         for_each_sched_entity(se) {
2508                 if (se->on_rq)
2509                         enqueue = 0;
2510
2511                 cfs_rq = cfs_rq_of(se);
2512                 if (enqueue)
2513                         enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
2514                 cfs_rq->h_nr_running += task_delta;
2515
2516                 if (cfs_rq_throttled(cfs_rq))
2517                         break;
2518         }
2519
2520         if (!se)
2521                 rq->nr_running += task_delta;
2522
2523         /* determine whether we need to wake up potentially idle cpu */
2524         if (rq->curr == rq->idle && rq->cfs.nr_running)
2525                 resched_task(rq->curr);
2526 }
2527
2528 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
2529                 u64 remaining, u64 expires)
2530 {
2531         struct cfs_rq *cfs_rq;
2532         u64 runtime = remaining;
2533
2534         rcu_read_lock();
2535         list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
2536                                 throttled_list) {
2537                 struct rq *rq = rq_of(cfs_rq);
2538
2539                 raw_spin_lock(&rq->lock);
2540                 if (!cfs_rq_throttled(cfs_rq))
2541                         goto next;
2542
2543                 runtime = -cfs_rq->runtime_remaining + 1;
2544                 if (runtime > remaining)
2545                         runtime = remaining;
2546                 remaining -= runtime;
2547
2548                 cfs_rq->runtime_remaining += runtime;
2549                 cfs_rq->runtime_expires = expires;
2550
2551                 /* we check whether we're throttled above */
2552                 if (cfs_rq->runtime_remaining > 0)
2553                         unthrottle_cfs_rq(cfs_rq);
2554
2555 next:
2556                 raw_spin_unlock(&rq->lock);
2557
2558                 if (!remaining)
2559                         break;
2560         }
2561         rcu_read_unlock();
2562
2563         return remaining;
2564 }
2565
2566 /*
2567  * Responsible for refilling a task_group's bandwidth and unthrottling its
2568  * cfs_rqs as appropriate. If there has been no activity within the last
2569  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
2570  * used to track this state.
2571  */
2572 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
2573 {
2574         u64 runtime, runtime_expires;
2575         int idle = 1, throttled;
2576
2577         raw_spin_lock(&cfs_b->lock);
2578         /* no need to continue the timer with no bandwidth constraint */
2579         if (cfs_b->quota == RUNTIME_INF)
2580                 goto out_unlock;
2581
2582         throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2583         /* idle depends on !throttled (for the case of a large deficit) */
2584         idle = cfs_b->idle && !throttled;
2585         cfs_b->nr_periods += overrun;
2586
2587         /* if we're going inactive then everything else can be deferred */
2588         if (idle)
2589                 goto out_unlock;
2590
2591         __refill_cfs_bandwidth_runtime(cfs_b);
2592
2593         if (!throttled) {
2594                 /* mark as potentially idle for the upcoming period */
2595                 cfs_b->idle = 1;
2596                 goto out_unlock;
2597         }
2598
2599         /* account preceding periods in which throttling occurred */
2600         cfs_b->nr_throttled += overrun;
2601
2602         /*
2603          * There are throttled entities so we must first use the new bandwidth
2604          * to unthrottle them before making it generally available.  This
2605          * ensures that all existing debts will be paid before a new cfs_rq is
2606          * allowed to run.
2607          */
2608         runtime = cfs_b->runtime;
2609         runtime_expires = cfs_b->runtime_expires;
2610         cfs_b->runtime = 0;
2611
2612         /*
2613          * This check is repeated as we are holding onto the new bandwidth
2614          * while we unthrottle.  This can potentially race with an unthrottled
2615          * group trying to acquire new bandwidth from the global pool.
2616          */
2617         while (throttled && runtime > 0) {
2618                 raw_spin_unlock(&cfs_b->lock);
2619                 /* we can't nest cfs_b->lock while distributing bandwidth */
2620                 runtime = distribute_cfs_runtime(cfs_b, runtime,
2621                                                  runtime_expires);
2622                 raw_spin_lock(&cfs_b->lock);
2623
2624                 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2625         }
2626
2627         /* return (any) remaining runtime */
2628         cfs_b->runtime = runtime;
2629         /*
2630          * While we are ensured activity in the period following an
2631          * unthrottle, this also covers the case in which the new bandwidth is
2632          * insufficient to cover the existing bandwidth deficit.  (Forcing the
2633          * timer to remain active while there are any throttled entities.)
2634          */
2635         cfs_b->idle = 0;
2636 out_unlock:
2637         if (idle)
2638                 cfs_b->timer_active = 0;
2639         raw_spin_unlock(&cfs_b->lock);
2640
2641         return idle;
2642 }
2643
2644 /* a cfs_rq won't donate quota below this amount */
2645 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
2646 /* minimum remaining period time to redistribute slack quota */
2647 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
2648 /* how long we wait to gather additional slack before distributing */
2649 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
2650
2651 /* are we near the end of the current quota period? */
2652 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
2653 {
2654         struct hrtimer *refresh_timer = &cfs_b->period_timer;
2655         u64 remaining;
2656
2657         /* if the call-back is running a quota refresh is already occurring */
2658         if (hrtimer_callback_running(refresh_timer))
2659                 return 1;
2660
2661         /* is a quota refresh about to occur? */
2662         remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
2663         if (remaining < min_expire)
2664                 return 1;
2665
2666         return 0;
2667 }
2668
2669 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
2670 {
2671         u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
2672
2673         /* if there's a quota refresh soon don't bother with slack */
2674         if (runtime_refresh_within(cfs_b, min_left))
2675                 return;
2676
2677         start_bandwidth_timer(&cfs_b->slack_timer,
2678                                 ns_to_ktime(cfs_bandwidth_slack_period));
2679 }
2680
2681 /* we know any runtime found here is valid as update_curr() precedes return */
2682 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2683 {
2684         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2685         s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
2686
2687         if (slack_runtime <= 0)
2688                 return;
2689
2690         raw_spin_lock(&cfs_b->lock);
2691         if (cfs_b->quota != RUNTIME_INF &&
2692             cfs_rq->runtime_expires == cfs_b->runtime_expires) {
2693                 cfs_b->runtime += slack_runtime;
2694
2695                 /* we are under rq->lock, defer unthrottling using a timer */
2696                 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
2697                     !list_empty(&cfs_b->throttled_cfs_rq))
2698                         start_cfs_slack_bandwidth(cfs_b);
2699         }
2700         raw_spin_unlock(&cfs_b->lock);
2701
2702         /* even if it's not valid for return we don't want to try again */
2703         cfs_rq->runtime_remaining -= slack_runtime;
2704 }
2705
2706 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2707 {
2708         if (!cfs_bandwidth_used())
2709                 return;
2710
2711         if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
2712                 return;
2713
2714         __return_cfs_rq_runtime(cfs_rq);
2715 }
2716
2717 /*
2718  * This is done with a timer (instead of inline with bandwidth return) since
2719  * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
2720  */
2721 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
2722 {
2723         u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
2724         u64 expires;
2725
2726         /* confirm we're still not at a refresh boundary */
2727         if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
2728                 return;
2729
2730         raw_spin_lock(&cfs_b->lock);
2731         if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
2732                 runtime = cfs_b->runtime;
2733                 cfs_b->runtime = 0;
2734         }
2735         expires = cfs_b->runtime_expires;
2736         raw_spin_unlock(&cfs_b->lock);
2737
2738         if (!runtime)
2739                 return;
2740
2741         runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
2742
2743         raw_spin_lock(&cfs_b->lock);
2744         if (expires == cfs_b->runtime_expires)
2745                 cfs_b->runtime = runtime;
2746         raw_spin_unlock(&cfs_b->lock);
2747 }
2748
2749 /*
2750  * When a group wakes up we want to make sure that its quota is not already
2751  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
2752  * runtime as update_curr() throttling can not not trigger until it's on-rq.
2753  */
2754 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
2755 {
2756         if (!cfs_bandwidth_used())
2757                 return;
2758
2759         /* an active group must be handled by the update_curr()->put() path */
2760         if (!cfs_rq->runtime_enabled || cfs_rq->curr)
2761                 return;
2762
2763         /* ensure the group is not already throttled */
2764         if (cfs_rq_throttled(cfs_rq))
2765                 return;
2766
2767         /* update runtime allocation */
2768         account_cfs_rq_runtime(cfs_rq, 0);
2769         if (cfs_rq->runtime_remaining <= 0)
2770                 throttle_cfs_rq(cfs_rq);
2771 }
2772
2773 /* conditionally throttle active cfs_rq's from put_prev_entity() */
2774 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2775 {
2776         if (!cfs_bandwidth_used())
2777                 return;
2778
2779         if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
2780                 return;
2781
2782         /*
2783          * it's possible for a throttled entity to be forced into a running
2784          * state (e.g. set_curr_task), in this case we're finished.
2785          */
2786         if (cfs_rq_throttled(cfs_rq))
2787                 return;
2788
2789         throttle_cfs_rq(cfs_rq);
2790 }
2791
2792 static inline u64 default_cfs_period(void);
2793 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
2794 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
2795
2796 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
2797 {
2798         struct cfs_bandwidth *cfs_b =
2799                 container_of(timer, struct cfs_bandwidth, slack_timer);
2800         do_sched_cfs_slack_timer(cfs_b);
2801
2802         return HRTIMER_NORESTART;
2803 }
2804
2805 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
2806 {
2807         struct cfs_bandwidth *cfs_b =
2808                 container_of(timer, struct cfs_bandwidth, period_timer);
2809         ktime_t now;
2810         int overrun;
2811         int idle = 0;
2812
2813         for (;;) {
2814                 now = hrtimer_cb_get_time(timer);
2815                 overrun = hrtimer_forward(timer, now, cfs_b->period);
2816
2817                 if (!overrun)
2818                         break;
2819
2820                 idle = do_sched_cfs_period_timer(cfs_b, overrun);
2821         }
2822
2823         return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2824 }
2825
2826 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2827 {
2828         raw_spin_lock_init(&cfs_b->lock);
2829         cfs_b->runtime = 0;
2830         cfs_b->quota = RUNTIME_INF;
2831         cfs_b->period = ns_to_ktime(default_cfs_period());
2832
2833         INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2834         hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2835         cfs_b->period_timer.function = sched_cfs_period_timer;
2836         hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2837         cfs_b->slack_timer.function = sched_cfs_slack_timer;
2838 }
2839
2840 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2841 {
2842         cfs_rq->runtime_enabled = 0;
2843         INIT_LIST_HEAD(&cfs_rq->throttled_list);
2844 }
2845
2846 /* requires cfs_b->lock, may release to reprogram timer */
2847 void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2848 {
2849         /*
2850          * The timer may be active because we're trying to set a new bandwidth
2851          * period or because we're racing with the tear-down path
2852          * (timer_active==0 becomes visible before the hrtimer call-back
2853          * terminates).  In either case we ensure that it's re-programmed
2854          */
2855         while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2856                 raw_spin_unlock(&cfs_b->lock);
2857                 /* ensure cfs_b->lock is available while we wait */
2858                 hrtimer_cancel(&cfs_b->period_timer);
2859
2860                 raw_spin_lock(&cfs_b->lock);
2861                 /* if someone else restarted the timer then we're done */
2862                 if (cfs_b->timer_active)
2863                         return;
2864         }
2865
2866         cfs_b->timer_active = 1;
2867         start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2868 }
2869
2870 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2871 {
2872         hrtimer_cancel(&cfs_b->period_timer);
2873         hrtimer_cancel(&cfs_b->slack_timer);
2874 }
2875
2876 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
2877 {
2878         struct cfs_rq *cfs_rq;
2879
2880         for_each_leaf_cfs_rq(rq, cfs_rq) {
2881                 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2882
2883                 if (!cfs_rq->runtime_enabled)
2884                         continue;
2885
2886                 /*
2887                  * clock_task is not advancing so we just need to make sure
2888                  * there's some valid quota amount
2889                  */
2890                 cfs_rq->runtime_remaining = cfs_b->quota;
2891                 if (cfs_rq_throttled(cfs_rq))
2892                         unthrottle_cfs_rq(cfs_rq);
2893         }
2894 }
2895
2896 #else /* CONFIG_CFS_BANDWIDTH */
2897 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2898 {
2899         return rq_of(cfs_rq)->clock_task;
2900 }
2901
2902 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2903                                      unsigned long delta_exec) {}
2904 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2905 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2906 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2907
2908 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2909 {
2910         return 0;
2911 }
2912
2913 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2914 {
2915         return 0;
2916 }
2917
2918 static inline int throttled_lb_pair(struct task_group *tg,
2919                                     int src_cpu, int dest_cpu)
2920 {
2921         return 0;
2922 }
2923
2924 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2925
2926 #ifdef CONFIG_FAIR_GROUP_SCHED
2927 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2928 #endif
2929
2930 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2931 {
2932         return NULL;
2933 }
2934 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2935 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
2936
2937 #endif /* CONFIG_CFS_BANDWIDTH */
2938
2939 /**************************************************
2940  * CFS operations on tasks:
2941  */
2942
2943 #ifdef CONFIG_SCHED_HRTICK
2944 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
2945 {
2946         struct sched_entity *se = &p->se;
2947         struct cfs_rq *cfs_rq = cfs_rq_of(se);
2948
2949         WARN_ON(task_rq(p) != rq);
2950
2951         if (cfs_rq->nr_running > 1) {
2952                 u64 slice = sched_slice(cfs_rq, se);
2953                 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
2954                 s64 delta = slice - ran;
2955
2956                 if (delta < 0) {
2957                         if (rq->curr == p)
2958                                 resched_task(p);
2959                         return;
2960                 }
2961
2962                 /*
2963                  * Don't schedule slices shorter than 10000ns, that just
2964                  * doesn't make sense. Rely on vruntime for fairness.
2965                  */
2966                 if (rq->curr != p)
2967                         delta = max_t(s64, 10000LL, delta);
2968
2969                 hrtick_start(rq, delta);
2970         }
2971 }
2972
2973 /*
2974  * called from enqueue/dequeue and updates the hrtick when the
2975  * current task is from our class and nr_running is low enough
2976  * to matter.
2977  */
2978 static void hrtick_update(struct rq *rq)
2979 {
2980         struct task_struct *curr = rq->curr;
2981
2982         if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
2983                 return;
2984
2985         if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
2986                 hrtick_start_fair(rq, curr);
2987 }
2988 #else /* !CONFIG_SCHED_HRTICK */
2989 static inline void
2990 hrtick_start_fair(struct rq *rq, struct task_struct *p)
2991 {
2992 }
2993
2994 static inline void hrtick_update(struct rq *rq)
2995 {
2996 }
2997 #endif
2998
2999 /*
3000  * The enqueue_task method is called before nr_running is
3001  * increased. Here we update the fair scheduling stats and
3002  * then put the task into the rbtree:
3003  */
3004 static void
3005 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3006 {
3007         struct cfs_rq *cfs_rq;
3008         struct sched_entity *se = &p->se;
3009
3010         for_each_sched_entity(se) {
3011                 if (se->on_rq)
3012                         break;
3013                 cfs_rq = cfs_rq_of(se);
3014                 enqueue_entity(cfs_rq, se, flags);
3015
3016                 /*
3017                  * end evaluation on encountering a throttled cfs_rq
3018                  *
3019                  * note: in the case of encountering a throttled cfs_rq we will
3020                  * post the final h_nr_running increment below.
3021                 */
3022                 if (cfs_rq_throttled(cfs_rq))
3023                         break;
3024                 cfs_rq->h_nr_running++;
3025
3026                 flags = ENQUEUE_WAKEUP;
3027         }
3028
3029         for_each_sched_entity(se) {
3030                 cfs_rq = cfs_rq_of(se);
3031                 cfs_rq->h_nr_running++;
3032
3033                 if (cfs_rq_throttled(cfs_rq))
3034                         break;
3035
3036                 update_cfs_shares(cfs_rq);
3037                 update_entity_load_avg(se, 1);
3038         }
3039
3040         if (!se) {
3041                 update_rq_runnable_avg(rq, rq->nr_running);
3042                 inc_nr_running(rq);
3043         }
3044         hrtick_update(rq);
3045 }
3046
3047 static void set_next_buddy(struct sched_entity *se);
3048
3049 /*
3050  * The dequeue_task method is called before nr_running is
3051  * decreased. We remove the task from the rbtree and
3052  * update the fair scheduling stats:
3053  */
3054 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3055 {
3056         struct cfs_rq *cfs_rq;
3057         struct sched_entity *se = &p->se;
3058         int task_sleep = flags & DEQUEUE_SLEEP;
3059
3060         for_each_sched_entity(se) {
3061                 cfs_rq = cfs_rq_of(se);
3062                 dequeue_entity(cfs_rq, se, flags);
3063
3064                 /*
3065                  * end evaluation on encountering a throttled cfs_rq
3066                  *
3067                  * note: in the case of encountering a throttled cfs_rq we will
3068                  * post the final h_nr_running decrement below.
3069                 */
3070                 if (cfs_rq_throttled(cfs_rq))
3071                         break;
3072                 cfs_rq->h_nr_running--;
3073
3074                 /* Don't dequeue parent if it has other entities besides us */
3075                 if (cfs_rq->load.weight) {
3076                         /*
3077                          * Bias pick_next to pick a task from this cfs_rq, as
3078                          * p is sleeping when it is within its sched_slice.
3079                          */
3080                         if (task_sleep && parent_entity(se))
3081                                 set_next_buddy(parent_entity(se));
3082
3083                         /* avoid re-evaluating load for this entity */
3084                         se = parent_entity(se);
3085                         break;
3086                 }
3087                 flags |= DEQUEUE_SLEEP;
3088         }
3089
3090         for_each_sched_entity(se) {
3091                 cfs_rq = cfs_rq_of(se);
3092                 cfs_rq->h_nr_running--;
3093
3094                 if (cfs_rq_throttled(cfs_rq))
3095                         break;
3096
3097                 update_cfs_shares(cfs_rq);
3098                 update_entity_load_avg(se, 1);
3099         }
3100
3101         if (!se) {
3102                 dec_nr_running(rq);
3103                 update_rq_runnable_avg(rq, 1);
3104         }
3105         hrtick_update(rq);
3106 }
3107
3108 #ifdef CONFIG_SMP
3109 /* Used instead of source_load when we know the type == 0 */
3110 static unsigned long weighted_cpuload(const int cpu)
3111 {
3112         return cpu_rq(cpu)->load.weight;
3113 }
3114
3115 /*
3116  * Return a low guess at the load of a migration-source cpu weighted
3117  * according to the scheduling class and "nice" value.
3118  *
3119  * We want to under-estimate the load of migration sources, to
3120  * balance conservatively.
3121  */
3122 static unsigned long source_load(int cpu, int type)
3123 {
3124         struct rq *rq = cpu_rq(cpu);
3125         unsigned long total = weighted_cpuload(cpu);
3126
3127         if (type == 0 || !sched_feat(LB_BIAS))
3128                 return total;
3129
3130         return min(rq->cpu_load[type-1], total);
3131 }
3132
3133 /*
3134  * Return a high guess at the load of a migration-target cpu weighted
3135  * according to the scheduling class and "nice" value.
3136  */
3137 static unsigned long target_load(int cpu, int type)
3138 {
3139         struct rq *rq = cpu_rq(cpu);
3140         unsigned long total = weighted_cpuload(cpu);
3141
3142         if (type == 0 || !sched_feat(LB_BIAS))
3143                 return total;
3144
3145         return max(rq->cpu_load[type-1], total);
3146 }
3147
3148 static unsigned long power_of(int cpu)
3149 {
3150         return cpu_rq(cpu)->cpu_power;
3151 }
3152
3153 static unsigned long cpu_avg_load_per_task(int cpu)
3154 {
3155         struct rq *rq = cpu_rq(cpu);
3156         unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
3157
3158         if (nr_running)
3159                 return rq->load.weight / nr_running;
3160
3161         return 0;
3162 }
3163
3164
3165 static void task_waking_fair(struct task_struct *p)
3166 {
3167         struct sched_entity *se = &p->se;
3168         struct cfs_rq *cfs_rq = cfs_rq_of(se);
3169         u64 min_vruntime;
3170
3171 #ifndef CONFIG_64BIT
3172         u64 min_vruntime_copy;
3173
3174         do {
3175                 min_vruntime_copy = cfs_rq->min_vruntime_copy;
3176                 smp_rmb();
3177                 min_vruntime = cfs_rq->min_vruntime;
3178         } while (min_vruntime != min_vruntime_copy);
3179 #else
3180         min_vruntime = cfs_rq->min_vruntime;
3181 #endif
3182
3183         se->vruntime -= min_vruntime;
3184 }
3185
3186 #ifdef CONFIG_FAIR_GROUP_SCHED
3187 /*
3188  * effective_load() calculates the load change as seen from the root_task_group
3189  *
3190  * Adding load to a group doesn't make a group heavier, but can cause movement
3191  * of group shares between cpus. Assuming the shares were perfectly aligned one
3192  * can calculate the shift in shares.
3193  *
3194  * Calculate the effective load difference if @wl is added (subtracted) to @tg
3195  * on this @cpu and results in a total addition (subtraction) of @wg to the
3196  * total group weight.
3197  *
3198  * Given a runqueue weight distribution (rw_i) we can compute a shares
3199  * distribution (s_i) using:
3200  *
3201  *   s_i = rw_i / \Sum rw_j                                             (1)
3202  *
3203  * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
3204  * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
3205  * shares distribution (s_i):
3206  *
3207  *   rw_i = {   2,   4,   1,   0 }
3208  *   s_i  = { 2/7, 4/7, 1/7,   0 }
3209  *
3210  * As per wake_affine() we're interested in the load of two CPUs (the CPU the
3211  * task used to run on and the CPU the waker is running on), we need to
3212  * compute the effect of waking a task on either CPU and, in case of a sync
3213  * wakeup, compute the effect of the current task going to sleep.
3214  *
3215  * So for a change of @wl to the local @cpu with an overall group weight change
3216  * of @wl we can compute the new shares distribution (s'_i) using:
3217  *
3218  *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)                            (2)
3219  *
3220  * Suppose we're interested in CPUs 0 and 1, and want to compute the load
3221  * differences in waking a task to CPU 0. The additional task changes the
3222  * weight and shares distributions like:
3223  *
3224  *   rw'_i = {   3,   4,   1,   0 }
3225  *   s'_i  = { 3/8, 4/8, 1/8,   0 }
3226  *
3227  * We can then compute the difference in effective weight by using:
3228  *
3229  *   dw_i = S * (s'_i - s_i)                                            (3)
3230  *
3231  * Where 'S' is the group weight as seen by its parent.
3232  *
3233  * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
3234  * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
3235  * 4/7) times the weight of the group.
3236  */
3237 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3238 {
3239         struct sched_entity *se = tg->se[cpu];
3240
3241         if (!tg->parent)        /* the trivial, non-cgroup case */
3242                 return wl;
3243
3244         for_each_sched_entity(se) {
3245                 long w, W;
3246
3247                 tg = se->my_q->tg;
3248
3249                 /*
3250                  * W = @wg + \Sum rw_j
3251                  */
3252                 W = wg + calc_tg_weight(tg, se->my_q);
3253
3254                 /*
3255                  * w = rw_i + @wl
3256                  */
3257                 w = se->my_q->load.weight + wl;
3258
3259                 /*
3260                  * wl = S * s'_i; see (2)
3261                  */
3262                 if (W > 0 && w < W)
3263                         wl = (w * tg->shares) / W;
3264                 else
3265                         wl = tg->shares;
3266
3267                 /*
3268                  * Per the above, wl is the new se->load.weight value; since
3269                  * those are clipped to [MIN_SHARES, ...) do so now. See
3270                  * calc_cfs_shares().
3271                  */
3272                 if (wl < MIN_SHARES)
3273                         wl = MIN_SHARES;
3274
3275                 /*
3276                  * wl = dw_i = S * (s'_i - s_i); see (3)
3277                  */
3278                 wl -= se->load.weight;
3279
3280                 /*
3281                  * Recursively apply this logic to all parent groups to compute
3282                  * the final effective load change on the root group. Since
3283                  * only the @tg group gets extra weight, all parent groups can
3284                  * only redistribute existing shares. @wl is the shift in shares
3285                  * resulting from this level per the above.
3286                  */
3287                 wg = 0;
3288         }
3289
3290         return wl;
3291 }
3292 #else
3293
3294 static inline unsigned long effective_load(struct task_group *tg, int cpu,
3295                 unsigned long wl, unsigned long wg)
3296 {
3297         return wl;
3298 }
3299
3300 #endif
3301
3302 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
3303 {
3304         s64 this_load, load;
3305         int idx, this_cpu, prev_cpu;
3306         unsigned long tl_per_task;
3307         struct task_group *tg;
3308         unsigned long weight;
3309         int balanced;
3310
3311         idx       = sd->wake_idx;
3312         this_cpu  = smp_processor_id();
3313         prev_cpu  = task_cpu(p);
3314         load      = source_load(prev_cpu, idx);
3315         this_load = target_load(this_cpu, idx);
3316
3317         /*
3318          * If sync wakeup then subtract the (maximum possible)
3319          * effect of the currently running task from the load
3320          * of the current CPU:
3321          */
3322         if (sync) {
3323                 tg = task_group(current);
3324                 weight = current->se.load.weight;
3325
3326                 this_load += effective_load(tg, this_cpu, -weight, -weight);
3327                 load += effective_load(tg, prev_cpu, 0, -weight);
3328         }
3329
3330         tg = task_group(p);
3331         weight = p->se.load.weight;
3332
3333         /*
3334          * In low-load situations, where prev_cpu is idle and this_cpu is idle
3335          * due to the sync cause above having dropped this_load to 0, we'll
3336          * always have an imbalance, but there's really nothing you can do
3337          * about that, so that's good too.
3338          *
3339          * Otherwise check if either cpus are near enough in load to allow this
3340          * task to be woken on this_cpu.
3341          */
3342         if (this_load > 0) {
3343                 s64 this_eff_load, prev_eff_load;
3344
3345                 this_eff_load = 100;
3346                 this_eff_load *= power_of(prev_cpu);
3347                 this_eff_load *= this_load +
3348                         effective_load(tg, this_cpu, weight, weight);
3349
3350                 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
3351                 prev_eff_load *= power_of(this_cpu);
3352                 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
3353
3354                 balanced = this_eff_load <= prev_eff_load;
3355         } else
3356                 balanced = true;
3357
3358         /*
3359          * If the currently running task will sleep within
3360          * a reasonable amount of time then attract this newly
3361          * woken task:
3362          */
3363         if (sync && balanced)
3364                 return 1;
3365
3366         schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
3367         tl_per_task = cpu_avg_load_per_task(this_cpu);
3368
3369         if (balanced ||
3370             (this_load <= load &&
3371              this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
3372                 /*
3373                  * This domain has SD_WAKE_AFFINE and
3374                  * p is cache cold in this domain, and
3375                  * there is no bad imbalance.
3376                  */
3377                 schedstat_inc(sd, ttwu_move_affine);
3378                 schedstat_inc(p, se.statistics.nr_wakeups_affine);
3379
3380                 return 1;
3381         }
3382         return 0;
3383 }
3384
3385 /*
3386  * find_idlest_group finds and returns the least busy CPU group within the
3387  * domain.
3388  */
3389 static struct sched_group *
3390 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
3391                   int this_cpu, int load_idx)
3392 {
3393         struct sched_group *idlest = NULL, *group = sd->groups;
3394         unsigned long min_load = ULONG_MAX, this_load = 0;
3395         int imbalance = 100 + (sd->imbalance_pct-100)/2;
3396
3397         do {
3398                 unsigned long load, avg_load;
3399                 int local_group;
3400                 int i;
3401
3402                 /* Skip over this group if it has no CPUs allowed */
3403                 if (!cpumask_intersects(sched_group_cpus(group),
3404                                         tsk_cpus_allowed(p)))
3405                         continue;
3406
3407                 local_group = cpumask_test_cpu(this_cpu,
3408                                                sched_group_cpus(group));
3409
3410                 /* Tally up the load of all CPUs in the group */
3411                 avg_load = 0;
3412
3413                 for_each_cpu(i, sched_group_cpus(group)) {
3414                         /* Bias balancing toward cpus of our domain */
3415                         if (local_group)
3416                                 load = source_load(i, load_idx);
3417                         else
3418                                 load = target_load(i, load_idx);
3419
3420                         avg_load += load;
3421                 }
3422
3423                 /* Adjust by relative CPU power of the group */
3424                 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
3425
3426                 if (local_group) {
3427                         this_load = avg_load;
3428                 } else if (avg_load < min_load) {
3429                         min_load = avg_load;
3430                         idlest = group;
3431                 }
3432         } while (group = group->next, group != sd->groups);
3433
3434         if (!idlest || 100*this_load < imbalance*min_load)
3435                 return NULL;
3436         return idlest;
3437 }
3438
3439 /*
3440  * find_idlest_cpu - find the idlest cpu among the cpus in group.
3441  */
3442 static int
3443 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
3444 {
3445         unsigned long load, min_load = ULONG_MAX;
3446         int idlest = -1;
3447         int i;
3448
3449         /* Traverse only the allowed CPUs */
3450         for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
3451                 load = weighted_cpuload(i);
3452
3453                 if (load < min_load || (load == min_load && i == this_cpu)) {
3454                         min_load = load;
3455                         idlest = i;
3456                 }
3457         }
3458
3459         return idlest;
3460 }
3461
3462 /*
3463  * Try and locate an idle CPU in the sched_domain.
3464  */
3465 static int select_idle_sibling(struct task_struct *p, int target)
3466 {
3467         struct sched_domain *sd;
3468         struct sched_group *sg;
3469         int i = task_cpu(p);
3470
3471         if (idle_cpu(target))
3472                 return target;
3473
3474         /*
3475          * If the prevous cpu is cache affine and idle, don't be stupid.
3476          */
3477         if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
3478                 return i;
3479
3480         /*
3481          * Otherwise, iterate the domains and find an elegible idle cpu.
3482          */
3483         sd = rcu_dereference(per_cpu(sd_llc, target));
3484         for_each_lower_domain(sd) {
3485                 sg = sd->groups;
3486                 do {
3487                         if (!cpumask_intersects(sched_group_cpus(sg),
3488                                                 tsk_cpus_allowed(p)))
3489                                 goto next;
3490
3491                         for_each_cpu(i, sched_group_cpus(sg)) {
3492                                 if (i == target || !idle_cpu(i))
3493                                         goto next;
3494                         }
3495
3496                         target = cpumask_first_and(sched_group_cpus(sg),
3497                                         tsk_cpus_allowed(p));
3498                         goto done;
3499 next:
3500                         sg = sg->next;
3501                 } while (sg != sd->groups);
3502         }
3503 done:
3504         return target;
3505 }
3506
3507 #ifdef CONFIG_SCHED_HMP
3508 /*
3509  * Heterogenous multiprocessor (HMP) optimizations
3510  *
3511  * The cpu types are distinguished using a list of hmp_domains
3512  * which each represent one cpu type using a cpumask.
3513  * The list is assumed ordered by compute capacity with the
3514  * fastest domain first.
3515  */
3516 DEFINE_PER_CPU(struct hmp_domain *, hmp_cpu_domain);
3517 static const int hmp_max_tasks = 5;
3518
3519 extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list);
3520
3521 /* Setup hmp_domains */
3522 static int __init hmp_cpu_mask_setup(void)
3523 {
3524         char buf[64];
3525         struct hmp_domain *domain;
3526         struct list_head *pos;
3527         int dc, cpu;
3528
3529         pr_debug("Initializing HMP scheduler:\n");
3530
3531         /* Initialize hmp_domains using platform code */
3532         arch_get_hmp_domains(&hmp_domains);
3533         if (list_empty(&hmp_domains)) {
3534                 pr_debug("HMP domain list is empty!\n");
3535                 return 0;
3536         }
3537
3538         /* Print hmp_domains */
3539         dc = 0;
3540         list_for_each(pos, &hmp_domains) {
3541                 domain = list_entry(pos, struct hmp_domain, hmp_domains);
3542                 cpulist_scnprintf(buf, 64, &domain->possible_cpus);
3543                 pr_debug("  HMP domain %d: %s\n", dc, buf);
3544
3545                 for_each_cpu_mask(cpu, domain->possible_cpus) {
3546                         per_cpu(hmp_cpu_domain, cpu) = domain;
3547                 }
3548                 dc++;
3549         }
3550
3551         return 1;
3552 }
3553
3554 static struct hmp_domain *hmp_get_hmp_domain_for_cpu(int cpu)
3555 {
3556         struct hmp_domain *domain;
3557         struct list_head *pos;
3558
3559         list_for_each(pos, &hmp_domains) {
3560                 domain = list_entry(pos, struct hmp_domain, hmp_domains);
3561                 if(cpumask_test_cpu(cpu, &domain->possible_cpus))
3562                         return domain;
3563         }
3564         return NULL;
3565 }
3566
3567 static void hmp_online_cpu(int cpu)
3568 {
3569         struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu);
3570
3571         if(domain)
3572                 cpumask_set_cpu(cpu, &domain->cpus);
3573 }
3574
3575 static void hmp_offline_cpu(int cpu)
3576 {
3577         struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu);
3578
3579         if(domain)
3580                 cpumask_clear_cpu(cpu, &domain->cpus);
3581 }
3582 /*
3583  * Needed to determine heaviest tasks etc.
3584  */
3585 static inline unsigned int hmp_cpu_is_fastest(int cpu);
3586 static inline unsigned int hmp_cpu_is_slowest(int cpu);
3587 static inline struct hmp_domain *hmp_slower_domain(int cpu);
3588 static inline struct hmp_domain *hmp_faster_domain(int cpu);
3589
3590 /* must hold runqueue lock for queue se is currently on */
3591 static struct sched_entity *hmp_get_heaviest_task(
3592                                 struct sched_entity *se, int migrate_up)
3593 {
3594         int num_tasks = hmp_max_tasks;
3595         struct sched_entity *max_se = se;
3596         unsigned long int max_ratio = se->avg.load_avg_ratio;
3597         const struct cpumask *hmp_target_mask = NULL;
3598
3599         if (migrate_up) {
3600                 struct hmp_domain *hmp;
3601                 if (hmp_cpu_is_fastest(cpu_of(se->cfs_rq->rq)))
3602                         return max_se;
3603
3604                 hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq));
3605                 hmp_target_mask = &hmp->cpus;
3606         }
3607         /* The currently running task is not on the runqueue */
3608         se = __pick_first_entity(cfs_rq_of(se));
3609
3610         while (num_tasks && se) {
3611                 if (entity_is_task(se) &&
3612                         (se->avg.load_avg_ratio > max_ratio &&
3613                          hmp_target_mask &&
3614                          cpumask_intersects(hmp_target_mask,
3615                                 tsk_cpus_allowed(task_of(se))))) {
3616                         max_se = se;
3617                         max_ratio = se->avg.load_avg_ratio;
3618                 }
3619                 se = __pick_next_entity(se);
3620                 num_tasks--;
3621         }
3622         return max_se;
3623 }
3624
3625 static struct sched_entity *hmp_get_lightest_task(
3626                                 struct sched_entity *se, int migrate_down)
3627 {
3628         int num_tasks = hmp_max_tasks;
3629         struct sched_entity *min_se = se;
3630         unsigned long int min_ratio = se->avg.load_avg_ratio;
3631         const struct cpumask *hmp_target_mask = NULL;
3632
3633         if (migrate_down) {
3634                 struct hmp_domain *hmp;
3635                 if (hmp_cpu_is_slowest(cpu_of(se->cfs_rq->rq)))
3636                         return min_se;
3637                 hmp = hmp_slower_domain(cpu_of(se->cfs_rq->rq));
3638                 hmp_target_mask = &hmp->cpus;
3639         }
3640         /* The currently running task is not on the runqueue */
3641         se = __pick_first_entity(cfs_rq_of(se));
3642
3643         while (num_tasks && se) {
3644                 if (entity_is_task(se) &&
3645                         (se->avg.load_avg_ratio < min_ratio &&
3646                         hmp_target_mask &&
3647                                 cpumask_intersects(hmp_target_mask,
3648                                 tsk_cpus_allowed(task_of(se))))) {
3649                         min_se = se;
3650                         min_ratio = se->avg.load_avg_ratio;
3651                 }
3652                 se = __pick_next_entity(se);
3653                 num_tasks--;
3654         }
3655         return min_se;
3656 }
3657
3658 /*
3659  * Migration thresholds should be in the range [0..1023]
3660  * hmp_up_threshold: min. load required for migrating tasks to a faster cpu
3661  * hmp_down_threshold: max. load allowed for tasks migrating to a slower cpu
3662  *
3663  * hmp_up_prio: Only up migrate task with high priority (<hmp_up_prio)
3664  * hmp_next_up_threshold: Delay before next up migration (1024 ~= 1 ms)
3665  * hmp_next_down_threshold: Delay before next down migration (1024 ~= 1 ms)
3666  *
3667  * Small Task Packing:
3668  * We can choose to fill the littlest CPUs in an HMP system rather than
3669  * the typical spreading mechanic. This behavior is controllable using
3670  * two variables.
3671  * hmp_packing_enabled: runtime control over pack/spread
3672  * hmp_full_threshold: Consider a CPU with this much unweighted load full
3673  */
3674 unsigned int hmp_up_threshold = 700;
3675 unsigned int hmp_down_threshold = 512;
3676 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
3677 unsigned int hmp_up_prio = NICE_TO_PRIO(CONFIG_SCHED_HMP_PRIO_FILTER_VAL);
3678 #endif
3679 unsigned int hmp_next_up_threshold = 4096;
3680 unsigned int hmp_next_down_threshold = 4096;
3681
3682 #ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
3683 unsigned int hmp_packing_enabled = 1;
3684 #ifndef CONFIG_ARCH_VEXPRESS_TC2
3685 unsigned int hmp_full_threshold = (NICE_0_LOAD * 9) / 8;
3686 #else
3687 /* TC2 has a sharp consumption curve @ around 800Mhz, so
3688    we aim to spread the load around that frequency. */
3689 unsigned int hmp_full_threshold = 650;  /*  80% of the 800Mhz freq * NICE_0_LOAD */
3690 #endif
3691 #endif
3692
3693 static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se);
3694 static unsigned int hmp_down_migration(int cpu, struct sched_entity *se);
3695 static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd,
3696                                                 int *min_cpu, struct cpumask *affinity);
3697
3698 static inline struct hmp_domain *hmp_smallest_domain(void)
3699 {
3700         return list_entry(hmp_domains.prev, struct hmp_domain, hmp_domains);
3701 }
3702
3703 /* Check if cpu is in fastest hmp_domain */
3704 static inline unsigned int hmp_cpu_is_fastest(int cpu)
3705 {
3706         struct list_head *pos;
3707
3708         pos = &hmp_cpu_domain(cpu)->hmp_domains;
3709         return pos == hmp_domains.next;
3710 }
3711
3712 /* Check if cpu is in slowest hmp_domain */
3713 static inline unsigned int hmp_cpu_is_slowest(int cpu)
3714 {
3715         struct list_head *pos;
3716
3717         pos = &hmp_cpu_domain(cpu)->hmp_domains;
3718         return list_is_last(pos, &hmp_domains);
3719 }
3720
3721 /* Next (slower) hmp_domain relative to cpu */
3722 static inline struct hmp_domain *hmp_slower_domain(int cpu)
3723 {
3724         struct list_head *pos;
3725
3726         pos = &hmp_cpu_domain(cpu)->hmp_domains;
3727         return list_entry(pos->next, struct hmp_domain, hmp_domains);
3728 }
3729
3730 /* Previous (faster) hmp_domain relative to cpu */
3731 static inline struct hmp_domain *hmp_faster_domain(int cpu)
3732 {
3733         struct list_head *pos;
3734
3735         pos = &hmp_cpu_domain(cpu)->hmp_domains;
3736         return list_entry(pos->prev, struct hmp_domain, hmp_domains);
3737 }
3738
3739 /*
3740  * Selects a cpu in previous (faster) hmp_domain
3741  */
3742 static inline unsigned int hmp_select_faster_cpu(struct task_struct *tsk,
3743                                                         int cpu)
3744 {
3745         int lowest_cpu=NR_CPUS;
3746         __always_unused int lowest_ratio;
3747         struct hmp_domain *hmp;
3748
3749         if (hmp_cpu_is_fastest(cpu))
3750                 hmp = hmp_cpu_domain(cpu);
3751         else
3752                 hmp = hmp_faster_domain(cpu);
3753
3754         lowest_ratio = hmp_domain_min_load(hmp, &lowest_cpu,
3755                         tsk_cpus_allowed(tsk));
3756
3757         return lowest_cpu;
3758 }
3759
3760 /*
3761  * Selects a cpu in next (slower) hmp_domain
3762  * Note that cpumask_any_and() returns the first cpu in the cpumask
3763  */
3764 static inline unsigned int hmp_select_slower_cpu(struct task_struct *tsk,
3765                                                         int cpu)
3766 {
3767         int lowest_cpu=NR_CPUS;
3768         struct hmp_domain *hmp;
3769         __always_unused int lowest_ratio;
3770
3771         if (hmp_cpu_is_slowest(cpu))
3772                 hmp = hmp_cpu_domain(cpu);
3773         else
3774                 hmp = hmp_slower_domain(cpu);
3775
3776         lowest_ratio = hmp_domain_min_load(hmp, &lowest_cpu,
3777                         tsk_cpus_allowed(tsk));
3778
3779         return lowest_cpu;
3780 }
3781 #ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
3782 /*
3783  * Select the 'best' candidate little CPU to wake up on.
3784  * Implements a packing strategy which examines CPU in
3785  * logical CPU order, and selects the first which will
3786  * have at least 10% capacity available, according to
3787  * both tracked load of the runqueue and the task.
3788  */
3789 static inline unsigned int hmp_best_little_cpu(struct task_struct *tsk,
3790                 int cpu) {
3791         int tmp_cpu;
3792         unsigned long estimated_load;
3793         struct hmp_domain *hmp;
3794         struct sched_avg *avg;
3795         struct cpumask allowed_hmp_cpus;
3796
3797         if(!hmp_packing_enabled ||
3798                         tsk->se.avg.load_avg_ratio > ((NICE_0_LOAD * 90)/100))
3799                 return hmp_select_slower_cpu(tsk, cpu);
3800
3801         if (hmp_cpu_is_slowest(cpu))
3802                 hmp = hmp_cpu_domain(cpu);
3803         else
3804                 hmp = hmp_slower_domain(cpu);
3805
3806         /* respect affinity */
3807         cpumask_and(&allowed_hmp_cpus, &hmp->cpus,
3808                         tsk_cpus_allowed(tsk));
3809
3810         for_each_cpu_mask(tmp_cpu, allowed_hmp_cpus) {
3811                 avg = &cpu_rq(tmp_cpu)->avg;
3812                 /* estimate new rq load if we add this task */
3813                 estimated_load = avg->load_avg_ratio +
3814                                 tsk->se.avg.load_avg_ratio;
3815                 if (estimated_load <= hmp_full_threshold) {
3816                         cpu = tmp_cpu;
3817                         break;
3818                 }
3819         }
3820         /* if no match was found, the task uses the initial value */
3821         return cpu;
3822 }
3823 #endif
3824 static inline void hmp_next_up_delay(struct sched_entity *se, int cpu)
3825 {
3826         /* hack - always use clock from first online CPU */
3827         u64 now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task;
3828         se->avg.hmp_last_up_migration = now;
3829         se->avg.hmp_last_down_migration = 0;
3830         cpu_rq(cpu)->avg.hmp_last_up_migration = now;
3831         cpu_rq(cpu)->avg.hmp_last_down_migration = 0;
3832 }
3833
3834 static inline void hmp_next_down_delay(struct sched_entity *se, int cpu)
3835 {
3836         /* hack - always use clock from first online CPU */
3837         u64 now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task;
3838         se->avg.hmp_last_down_migration = now;
3839         se->avg.hmp_last_up_migration = 0;
3840         cpu_rq(cpu)->avg.hmp_last_down_migration = now;
3841         cpu_rq(cpu)->avg.hmp_last_up_migration = 0;
3842 }
3843
3844 /*
3845  * Heterogenous multiprocessor (HMP) optimizations
3846  *
3847  * These functions allow to change the growing speed of the load_avg_ratio
3848  * by default it goes from 0 to 0.5 in LOAD_AVG_PERIOD = 32ms
3849  * This can now be changed with /sys/kernel/hmp/load_avg_period_ms.
3850  *
3851  * These functions also allow to change the up and down threshold of HMP
3852  * using /sys/kernel/hmp/{up,down}_threshold.
3853  * Both must be between 0 and 1023. The threshold that is compared
3854  * to the load_avg_ratio is up_threshold/1024 and down_threshold/1024.
3855  *
3856  * For instance, if load_avg_period = 64 and up_threshold = 512, an idle
3857  * task with a load of 0 will reach the threshold after 64ms of busy loop.
3858  *
3859  * Changing load_avg_periods_ms has the same effect than changing the
3860  * default scaling factor Y=1002/1024 in the load_avg_ratio computation to
3861  * (1002/1024.0)^(LOAD_AVG_PERIOD/load_avg_period_ms), but the last one
3862  * could trigger overflows.
3863  * For instance, with Y = 1023/1024 in __update_task_entity_contrib()
3864  * "contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);"
3865  * could be overflowed for a weight > 2^12 even is the load_avg_contrib
3866  * should still be a 32bits result. This would not happen by multiplicating
3867  * delta time by 1/22 and setting load_avg_period_ms = 706.
3868  */
3869
3870 /*
3871  * By scaling the delta time it end-up increasing or decrease the
3872  * growing speed of the per entity load_avg_ratio
3873  * The scale factor hmp_data.multiplier is a fixed point
3874  * number: (32-HMP_VARIABLE_SCALE_SHIFT).HMP_VARIABLE_SCALE_SHIFT
3875  */
3876 static inline u64 hmp_variable_scale_convert(u64 delta)
3877 {
3878 #ifdef CONFIG_HMP_VARIABLE_SCALE
3879         u64 high = delta >> 32ULL;
3880         u64 low = delta & 0xffffffffULL;
3881         low *= hmp_data.multiplier;
3882         high *= hmp_data.multiplier;
3883         return (low >> HMP_VARIABLE_SCALE_SHIFT)
3884                         + (high << (32ULL - HMP_VARIABLE_SCALE_SHIFT));
3885 #else
3886         return delta;
3887 #endif
3888 }
3889
3890 static ssize_t hmp_show(struct kobject *kobj,
3891                                 struct attribute *attr, char *buf)
3892 {
3893         struct hmp_global_attr *hmp_attr =
3894                 container_of(attr, struct hmp_global_attr, attr);
3895         int temp;
3896
3897         if (hmp_attr->to_sysfs_text != NULL)
3898                 return hmp_attr->to_sysfs_text(buf, PAGE_SIZE);
3899
3900         temp = *(hmp_attr->value);
3901         if (hmp_attr->to_sysfs != NULL)
3902                 temp = hmp_attr->to_sysfs(temp);
3903
3904         return (ssize_t)sprintf(buf, "%d\n", temp);
3905 }
3906
3907 static ssize_t hmp_store(struct kobject *a, struct attribute *attr,
3908                                 const char *buf, size_t count)
3909 {
3910         int temp;
3911         ssize_t ret = count;
3912         struct hmp_global_attr *hmp_attr =
3913                 container_of(attr, struct hmp_global_attr, attr);
3914         char *str = vmalloc(count + 1);
3915         if (str == NULL)
3916                 return -ENOMEM;
3917         memcpy(str, buf, count);
3918         str[count] = 0;
3919         if (sscanf(str, "%d", &temp) < 1)
3920                 ret = -EINVAL;
3921         else {
3922                 if (hmp_attr->from_sysfs != NULL)
3923                         temp = hmp_attr->from_sysfs(temp);
3924                 if (temp < 0)
3925                         ret = -EINVAL;
3926                 else
3927                         *(hmp_attr->value) = temp;
3928         }
3929         vfree(str);
3930         return ret;
3931 }
3932
3933 static ssize_t hmp_print_domains(char *outbuf, int outbufsize)
3934 {
3935         char buf[64];
3936         const char nospace[] = "%s", space[] = " %s";
3937         const char *fmt = nospace;
3938         struct hmp_domain *domain;
3939         struct list_head *pos;
3940         int outpos = 0;
3941         list_for_each(pos, &hmp_domains) {
3942                 domain = list_entry(pos, struct hmp_domain, hmp_domains);
3943                 if (cpumask_scnprintf(buf, 64, &domain->possible_cpus)) {
3944                         outpos += sprintf(outbuf+outpos, fmt, buf);
3945                         fmt = space;
3946                 }
3947         }
3948         strcat(outbuf, "\n");
3949         return outpos+1;
3950 }
3951
3952 #ifdef CONFIG_HMP_VARIABLE_SCALE
3953 static int hmp_period_tofrom_sysfs(int value)
3954 {
3955         return (LOAD_AVG_PERIOD << HMP_VARIABLE_SCALE_SHIFT) / value;
3956 }
3957 #endif
3958 /* max value for threshold is 1024 */
3959 static int hmp_theshold_from_sysfs(int value)
3960 {
3961         if (value > 1024)
3962                 return -1;
3963         return value;
3964 }
3965 #if defined(CONFIG_SCHED_HMP_LITTLE_PACKING) || \
3966                 defined(CONFIG_HMP_FREQUENCY_INVARIANT_SCALE)
3967 /* toggle control is only 0,1 off/on */
3968 static int hmp_toggle_from_sysfs(int value)
3969 {
3970         if (value < 0 || value > 1)
3971                 return -1;
3972         return value;
3973 }
3974 #endif
3975 #ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
3976 /* packing value must be non-negative */
3977 static int hmp_packing_from_sysfs(int value)
3978 {
3979         if (value < 0)
3980                 return -1;
3981         return value;
3982 }
3983 #endif
3984 static void hmp_attr_add(
3985         const char *name,
3986         int *value,
3987         int (*to_sysfs)(int),
3988         int (*from_sysfs)(int),
3989         ssize_t (*to_sysfs_text)(char *, int),
3990         umode_t mode)
3991 {
3992         int i = 0;
3993         while (hmp_data.attributes[i] != NULL) {
3994                 i++;
3995                 if (i >= HMP_DATA_SYSFS_MAX)
3996                         return;
3997         }
3998         if (mode)
3999                 hmp_data.attr[i].attr.mode = mode;
4000         else
4001                 hmp_data.attr[i].attr.mode = 0644;
4002         hmp_data.attr[i].show = hmp_show;
4003         hmp_data.attr[i].store = hmp_store;
4004         hmp_data.attr[i].attr.name = name;
4005         hmp_data.attr[i].value = value;
4006         hmp_data.attr[i].to_sysfs = to_sysfs;
4007         hmp_data.attr[i].from_sysfs = from_sysfs;
4008         hmp_data.attr[i].to_sysfs_text = to_sysfs_text;
4009         hmp_data.attributes[i] = &hmp_data.attr[i].attr;
4010         hmp_data.attributes[i + 1] = NULL;
4011 }
4012
4013 static int hmp_attr_init(void)
4014 {
4015         int ret;
4016         memset(&hmp_data, sizeof(hmp_data), 0);
4017         hmp_attr_add("hmp_domains",
4018                 NULL,
4019                 NULL,
4020                 NULL,
4021                 hmp_print_domains,
4022                 0444);
4023         hmp_attr_add("up_threshold",
4024                 &hmp_up_threshold,
4025                 NULL,
4026                 hmp_theshold_from_sysfs,
4027                 NULL,
4028                 0);
4029         hmp_attr_add("down_threshold",
4030                 &hmp_down_threshold,
4031                 NULL,
4032                 hmp_theshold_from_sysfs,
4033                 NULL,
4034                 0);
4035 #ifdef CONFIG_HMP_VARIABLE_SCALE
4036         /* by default load_avg_period_ms == LOAD_AVG_PERIOD
4037          * meaning no change
4038          */
4039         hmp_data.multiplier = hmp_period_tofrom_sysfs(LOAD_AVG_PERIOD);
4040         hmp_attr_add("load_avg_period_ms",
4041                 &hmp_data.multiplier,
4042                 hmp_period_tofrom_sysfs,
4043                 hmp_period_tofrom_sysfs,
4044                 NULL,
4045                 0);
4046 #endif
4047 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
4048         /* default frequency-invariant scaling ON */
4049         hmp_data.freqinvar_load_scale_enabled = 1;
4050         hmp_attr_add("frequency_invariant_load_scale",
4051                 &hmp_data.freqinvar_load_scale_enabled,
4052                 NULL,
4053                 hmp_toggle_from_sysfs,
4054                 NULL,
4055                 0);
4056 #endif
4057 #ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
4058         hmp_attr_add("packing_enable",
4059                 &hmp_packing_enabled,
4060                 NULL,
4061                 hmp_toggle_from_sysfs,
4062                 NULL,
4063                 0);
4064         hmp_attr_add("packing_limit",
4065                 &hmp_full_threshold,
4066                 NULL,
4067                 hmp_packing_from_sysfs,
4068                 NULL,
4069                 0);
4070 #endif
4071         hmp_data.attr_group.name = "hmp";
4072         hmp_data.attr_group.attrs = hmp_data.attributes;
4073         ret = sysfs_create_group(kernel_kobj,
4074                 &hmp_data.attr_group);
4075         return 0;
4076 }
4077 late_initcall(hmp_attr_init);
4078 /*
4079  * return the load of the lowest-loaded CPU in a given HMP domain
4080  * min_cpu optionally points to an int to receive the CPU.
4081  * affinity optionally points to a cpumask containing the
4082  * CPUs to be considered. note:
4083  *   + min_cpu = NR_CPUS only if no CPUs are in the set of
4084  *     affinity && hmp_domain cpus
4085  *   + min_cpu will always otherwise equal one of the CPUs in
4086  *     the hmp domain
4087  *   + when more than one CPU has the same load, the one which
4088  *     is least-recently-disturbed by an HMP migration will be
4089  *     selected
4090  *   + if all CPUs are equally loaded or idle and the times are
4091  *     all the same, the first in the set will be used
4092  *   + if affinity is not set, cpu_online_mask is used
4093  */
4094 static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd,
4095                                                 int *min_cpu, struct cpumask *affinity)
4096 {
4097         int cpu;
4098         int min_cpu_runnable_temp = NR_CPUS;
4099         u64 min_target_last_migration = ULLONG_MAX;
4100         u64 curr_last_migration;
4101         unsigned long min_runnable_load = INT_MAX;
4102         unsigned long contrib;
4103         struct sched_avg *avg;
4104         struct cpumask temp_cpumask;
4105         /*
4106          * only look at CPUs allowed if specified,
4107          * otherwise look at all online CPUs in the
4108          * right HMP domain
4109          */
4110         cpumask_and(&temp_cpumask, &hmpd->cpus, affinity ? affinity : cpu_online_mask);
4111
4112         for_each_cpu_mask(cpu, temp_cpumask) {
4113                 avg = &cpu_rq(cpu)->avg;
4114                 /* used for both up and down migration */
4115                 curr_last_migration = avg->hmp_last_up_migration ?
4116                         avg->hmp_last_up_migration : avg->hmp_last_down_migration;
4117
4118                 contrib = avg->load_avg_ratio;
4119                 /*
4120                  * Consider a runqueue completely busy if there is any load
4121                  * on it. Definitely not the best for overall fairness, but
4122                  * does well in typical Android use cases.
4123                  */
4124                 if (contrib)
4125                         contrib = 1023;
4126
4127                 if ((contrib < min_runnable_load) ||
4128                         (contrib == min_runnable_load &&
4129                          curr_last_migration < min_target_last_migration)) {
4130                         /*
4131                          * if the load is the same target the CPU with
4132                          * the longest time since a migration.
4133                          * This is to spread migration load between
4134                          * members of a domain more evenly when the
4135                          * domain is fully loaded
4136                          */
4137                         min_runnable_load = contrib;
4138                         min_cpu_runnable_temp = cpu;
4139                         min_target_last_migration = curr_last_migration;
4140                 }
4141         }
4142
4143         if (min_cpu)
4144                 *min_cpu = min_cpu_runnable_temp;
4145
4146         return min_runnable_load;
4147 }
4148
4149 /*
4150  * Calculate the task starvation
4151  * This is the ratio of actually running time vs. runnable time.
4152  * If the two are equal the task is getting the cpu time it needs or
4153  * it is alone on the cpu and the cpu is fully utilized.
4154  */
4155 static inline unsigned int hmp_task_starvation(struct sched_entity *se)
4156 {
4157         u32 starvation;
4158
4159         starvation = se->avg.usage_avg_sum * scale_load_down(NICE_0_LOAD);
4160         starvation /= (se->avg.runnable_avg_sum + 1);
4161
4162         return scale_load(starvation);
4163 }
4164
4165 static inline unsigned int hmp_offload_down(int cpu, struct sched_entity *se)
4166 {
4167         int min_usage;
4168         int dest_cpu = NR_CPUS;
4169
4170         if (hmp_cpu_is_slowest(cpu))
4171                 return NR_CPUS;
4172
4173         /* Is there an idle CPU in the current domain */
4174         min_usage = hmp_domain_min_load(hmp_cpu_domain(cpu), NULL, NULL);
4175         if (min_usage == 0) {
4176                 trace_sched_hmp_offload_abort(cpu, min_usage, "load");
4177                 return NR_CPUS;
4178         }
4179
4180         /* Is the task alone on the cpu? */
4181         if (cpu_rq(cpu)->cfs.h_nr_running < 2) {
4182                 trace_sched_hmp_offload_abort(cpu,
4183                         cpu_rq(cpu)->cfs.h_nr_running, "nr_running");
4184                 return NR_CPUS;
4185         }
4186
4187         /* Is the task actually starving? */
4188         /* >=25% ratio running/runnable = starving */
4189         if (hmp_task_starvation(se) > 768) {
4190                 trace_sched_hmp_offload_abort(cpu, hmp_task_starvation(se),
4191                         "starvation");
4192                 return NR_CPUS;
4193         }
4194
4195         /* Does the slower domain have any idle CPUs? */
4196         min_usage = hmp_domain_min_load(hmp_slower_domain(cpu), &dest_cpu,
4197                         tsk_cpus_allowed(task_of(se)));
4198
4199         if (min_usage == 0) {
4200                 trace_sched_hmp_offload_succeed(cpu, dest_cpu);
4201                 return dest_cpu;
4202         } else
4203                 trace_sched_hmp_offload_abort(cpu,min_usage,"slowdomain");
4204         return NR_CPUS;
4205 }
4206 #endif /* CONFIG_SCHED_HMP */
4207
4208 /*
4209  * sched_balance_self: balance the current task (running on cpu) in domains
4210  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
4211  * SD_BALANCE_EXEC.
4212  *
4213  * Balance, ie. select the least loaded group.
4214  *
4215  * Returns the target CPU number, or the same CPU if no balancing is needed.
4216  *
4217  * preempt must be disabled.
4218  */
4219 static int
4220 select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
4221 {
4222         struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
4223         int cpu = smp_processor_id();
4224         int prev_cpu = task_cpu(p);
4225         int new_cpu = cpu;
4226         int want_affine = 0;
4227         int sync = wake_flags & WF_SYNC;
4228
4229         if (p->nr_cpus_allowed == 1)
4230                 return prev_cpu;
4231
4232 #ifdef CONFIG_SCHED_HMP
4233         /* always put non-kernel forking tasks on a big domain */
4234         if (p->mm && (sd_flag & SD_BALANCE_FORK)) {
4235                 new_cpu = hmp_select_faster_cpu(p, prev_cpu);
4236                 if (new_cpu != NR_CPUS) {
4237                         hmp_next_up_delay(&p->se, new_cpu);
4238                         return new_cpu;
4239                 }
4240                 /* failed to perform HMP fork balance, use normal balance */
4241                 new_cpu = cpu;
4242         }
4243 #endif
4244
4245         if (sd_flag & SD_BALANCE_WAKE) {
4246                 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
4247                         want_affine = 1;
4248                 new_cpu = prev_cpu;
4249         }
4250
4251         rcu_read_lock();
4252         for_each_domain(cpu, tmp) {
4253                 if (!(tmp->flags & SD_LOAD_BALANCE))
4254                         continue;
4255
4256                 /*
4257                  * If both cpu and prev_cpu are part of this domain,
4258                  * cpu is a valid SD_WAKE_AFFINE target.
4259                  */
4260                 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
4261                     cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
4262                         affine_sd = tmp;
4263                         break;
4264                 }
4265
4266                 if (tmp->flags & sd_flag)
4267                         sd = tmp;
4268         }
4269
4270         if (affine_sd) {
4271                 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
4272                         prev_cpu = cpu;
4273
4274                 new_cpu = select_idle_sibling(p, prev_cpu);
4275                 goto unlock;
4276         }
4277
4278         while (sd) {
4279                 int load_idx = sd->forkexec_idx;
4280                 struct sched_group *group;
4281                 int weight;
4282
4283                 if (!(sd->flags & sd_flag)) {
4284                         sd = sd->child;
4285                         continue;
4286                 }
4287
4288                 if (sd_flag & SD_BALANCE_WAKE)
4289                         load_idx = sd->wake_idx;
4290
4291                 group = find_idlest_group(sd, p, cpu, load_idx);
4292                 if (!group) {
4293                         sd = sd->child;
4294                         continue;
4295                 }
4296
4297                 new_cpu = find_idlest_cpu(group, p, cpu);
4298                 if (new_cpu == -1 || new_cpu == cpu) {
4299                         /* Now try balancing at a lower domain level of cpu */
4300                         sd = sd->child;
4301                         continue;
4302                 }
4303
4304                 /* Now try balancing at a lower domain level of new_cpu */
4305                 cpu = new_cpu;
4306                 weight = sd->span_weight;
4307                 sd = NULL;
4308                 for_each_domain(cpu, tmp) {
4309                         if (weight <= tmp->span_weight)
4310                                 break;
4311                         if (tmp->flags & sd_flag)
4312                                 sd = tmp;
4313                 }
4314                 /* while loop will break here if sd == NULL */
4315         }
4316 unlock:
4317         rcu_read_unlock();
4318
4319 #ifdef CONFIG_SCHED_HMP
4320         prev_cpu = task_cpu(p);
4321
4322         if (hmp_up_migration(prev_cpu, &new_cpu, &p->se)) {
4323                 hmp_next_up_delay(&p->se, new_cpu);
4324                 trace_sched_hmp_migrate(p, new_cpu, HMP_MIGRATE_WAKEUP);
4325                 return new_cpu;
4326         }
4327         if (hmp_down_migration(prev_cpu, &p->se)) {
4328 #ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
4329                 new_cpu = hmp_best_little_cpu(p, prev_cpu);
4330 #else
4331                 new_cpu = hmp_select_slower_cpu(p, prev_cpu);
4332 #endif
4333                 if (new_cpu != prev_cpu) {
4334                         hmp_next_down_delay(&p->se, new_cpu);
4335                         trace_sched_hmp_migrate(p, new_cpu, HMP_MIGRATE_WAKEUP);
4336                         return new_cpu;
4337                 }
4338         }
4339         /* Make sure that the task stays in its previous hmp domain */
4340         if (!cpumask_test_cpu(new_cpu, &hmp_cpu_domain(prev_cpu)->cpus))
4341                 return prev_cpu;
4342 #endif
4343
4344         return new_cpu;
4345 }
4346
4347 /*
4348  * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
4349  * removed when useful for applications beyond shares distribution (e.g.
4350  * load-balance).
4351  */
4352 #ifdef CONFIG_FAIR_GROUP_SCHED
4353 /*
4354  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
4355  * cfs_rq_of(p) references at time of call are still valid and identify the
4356  * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
4357  * other assumptions, including the state of rq->lock, should be made.
4358  */
4359 static void
4360 migrate_task_rq_fair(struct task_struct *p, int next_cpu)
4361 {
4362         struct sched_entity *se = &p->se;
4363         struct cfs_rq *cfs_rq = cfs_rq_of(se);
4364
4365         /*
4366          * Load tracking: accumulate removed load so that it can be processed
4367          * when we next update owning cfs_rq under rq->lock.  Tasks contribute
4368          * to blocked load iff they have a positive decay-count.  It can never
4369          * be negative here since on-rq tasks have decay-count == 0.
4370          */
4371         if (se->avg.decay_count) {
4372                 se->avg.decay_count = -__synchronize_entity_decay(se);
4373                 atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
4374         }
4375 }
4376 #endif
4377 #endif /* CONFIG_SMP */
4378
4379 static unsigned long
4380 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
4381 {
4382         unsigned long gran = sysctl_sched_wakeup_granularity;
4383
4384         /*
4385          * Since its curr running now, convert the gran from real-time
4386          * to virtual-time in his units.
4387          *
4388          * By using 'se' instead of 'curr' we penalize light tasks, so
4389          * they get preempted easier. That is, if 'se' < 'curr' then
4390          * the resulting gran will be larger, therefore penalizing the
4391          * lighter, if otoh 'se' > 'curr' then the resulting gran will
4392          * be smaller, again penalizing the lighter task.
4393          *
4394          * This is especially important for buddies when the leftmost
4395          * task is higher priority than the buddy.
4396          */
4397         return calc_delta_fair(gran, se);
4398 }
4399
4400 /*
4401  * Should 'se' preempt 'curr'.
4402  *
4403  *             |s1
4404  *        |s2
4405  *   |s3
4406  *         g
4407  *      |<--->|c
4408  *
4409  *  w(c, s1) = -1
4410  *  w(c, s2) =  0
4411  *  w(c, s3) =  1
4412  *
4413  */
4414 static int
4415 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
4416 {
4417         s64 gran, vdiff = curr->vruntime - se->vruntime;
4418
4419         if (vdiff <= 0)
4420                 return -1;
4421
4422         gran = wakeup_gran(curr, se);
4423         if (vdiff > gran)
4424                 return 1;
4425
4426         return 0;
4427 }
4428
4429 static void set_last_buddy(struct sched_entity *se)
4430 {
4431         if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
4432                 return;
4433
4434         for_each_sched_entity(se)
4435                 cfs_rq_of(se)->last = se;
4436 }
4437
4438 static void set_next_buddy(struct sched_entity *se)
4439 {
4440         if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
4441                 return;
4442
4443         for_each_sched_entity(se)
4444                 cfs_rq_of(se)->next = se;
4445 }
4446
4447 static void set_skip_buddy(struct sched_entity *se)
4448 {
4449         for_each_sched_entity(se)
4450                 cfs_rq_of(se)->skip = se;
4451 }
4452
4453 /*
4454  * Preempt the current task with a newly woken task if needed:
4455  */
4456 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
4457 {
4458         struct task_struct *curr = rq->curr;
4459         struct sched_entity *se = &curr->se, *pse = &p->se;
4460         struct cfs_rq *cfs_rq = task_cfs_rq(curr);
4461         int scale = cfs_rq->nr_running >= sched_nr_latency;
4462         int next_buddy_marked = 0;
4463
4464         if (unlikely(se == pse))
4465                 return;
4466
4467         /*
4468          * This is possible from callers such as move_task(), in which we
4469          * unconditionally check_prempt_curr() after an enqueue (which may have
4470          * lead to a throttle).  This both saves work and prevents false
4471          * next-buddy nomination below.
4472          */
4473         if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
4474                 return;
4475
4476         if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
4477                 set_next_buddy(pse);
4478                 next_buddy_marked = 1;
4479         }
4480
4481         /*
4482          * We can come here with TIF_NEED_RESCHED already set from new task
4483          * wake up path.
4484          *
4485          * Note: this also catches the edge-case of curr being in a throttled
4486          * group (e.g. via set_curr_task), since update_curr() (in the
4487          * enqueue of curr) will have resulted in resched being set.  This
4488          * prevents us from potentially nominating it as a false LAST_BUDDY
4489          * below.
4490          */
4491         if (test_tsk_need_resched(curr))
4492                 return;
4493
4494         /* Idle tasks are by definition preempted by non-idle tasks. */
4495         if (unlikely(curr->policy == SCHED_IDLE) &&
4496             likely(p->policy != SCHED_IDLE))
4497                 goto preempt;
4498
4499         /*
4500          * Batch and idle tasks do not preempt non-idle tasks (their preemption
4501          * is driven by the tick):
4502          */
4503         if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
4504                 return;
4505
4506         find_matching_se(&se, &pse);
4507         update_curr(cfs_rq_of(se));
4508         BUG_ON(!pse);
4509         if (wakeup_preempt_entity(se, pse) == 1) {
4510                 /*
4511                  * Bias pick_next to pick the sched entity that is
4512                  * triggering this preemption.
4513                  */
4514                 if (!next_buddy_marked)
4515                         set_next_buddy(pse);
4516                 goto preempt;
4517         }
4518
4519         return;
4520
4521 preempt:
4522         resched_task(curr);
4523         /*
4524          * Only set the backward buddy when the current task is still
4525          * on the rq. This can happen when a wakeup gets interleaved
4526          * with schedule on the ->pre_schedule() or idle_balance()
4527          * point, either of which can * drop the rq lock.
4528          *
4529          * Also, during early boot the idle thread is in the fair class,
4530          * for obvious reasons its a bad idea to schedule back to it.
4531          */
4532         if (unlikely(!se->on_rq || curr == rq->idle))
4533                 return;
4534
4535         if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
4536                 set_last_buddy(se);
4537 }
4538
4539 static struct task_struct *pick_next_task_fair(struct rq *rq)
4540 {
4541         struct task_struct *p;
4542         struct cfs_rq *cfs_rq = &rq->cfs;
4543         struct sched_entity *se;
4544
4545         if (!cfs_rq->nr_running)
4546                 return NULL;
4547
4548         do {
4549                 se = pick_next_entity(cfs_rq);
4550                 set_next_entity(cfs_rq, se);
4551                 cfs_rq = group_cfs_rq(se);
4552         } while (cfs_rq);
4553
4554         p = task_of(se);
4555         if (hrtick_enabled(rq))
4556                 hrtick_start_fair(rq, p);
4557
4558         return p;
4559 }
4560
4561 /*
4562  * Account for a descheduled task:
4563  */
4564 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
4565 {
4566         struct sched_entity *se = &prev->se;
4567         struct cfs_rq *cfs_rq;
4568
4569         for_each_sched_entity(se) {
4570                 cfs_rq = cfs_rq_of(se);
4571                 put_prev_entity(cfs_rq, se);
4572         }
4573 }
4574
4575 /*
4576  * sched_yield() is very simple
4577  *
4578  * The magic of dealing with the ->skip buddy is in pick_next_entity.
4579  */
4580 static void yield_task_fair(struct rq *rq)
4581 {
4582         struct task_struct *curr = rq->curr;
4583         struct cfs_rq *cfs_rq = task_cfs_rq(curr);
4584         struct sched_entity *se = &curr->se;
4585
4586         /*
4587          * Are we the only task in the tree?
4588          */
4589         if (unlikely(rq->nr_running == 1))
4590                 return;
4591
4592         clear_buddies(cfs_rq, se);
4593
4594         if (curr->policy != SCHED_BATCH) {
4595                 update_rq_clock(rq);
4596                 /*
4597                  * Update run-time statistics of the 'current'.
4598                  */
4599                 update_curr(cfs_rq);
4600                 /*
4601                  * Tell update_rq_clock() that we've just updated,
4602                  * so we don't do microscopic update in schedule()
4603                  * and double the fastpath cost.
4604                  */
4605                  rq->skip_clock_update = 1;
4606         }
4607
4608         set_skip_buddy(se);
4609 }
4610
4611 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
4612 {
4613         struct sched_entity *se = &p->se;
4614
4615         /* throttled hierarchies are not runnable */
4616         if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
4617                 return false;
4618
4619         /* Tell the scheduler that we'd really like pse to run next. */
4620         set_next_buddy(se);
4621
4622         yield_task_fair(rq);
4623
4624         return true;
4625 }
4626
4627 #ifdef CONFIG_SMP
4628 /**************************************************
4629  * Fair scheduling class load-balancing methods.
4630  *
4631  * BASICS
4632  *
4633  * The purpose of load-balancing is to achieve the same basic fairness the
4634  * per-cpu scheduler provides, namely provide a proportional amount of compute
4635  * time to each task. This is expressed in the following equation:
4636  *
4637  *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
4638  *
4639  * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
4640  * W_i,0 is defined as:
4641  *
4642  *   W_i,0 = \Sum_j w_i,j                                             (2)
4643  *
4644  * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
4645  * is derived from the nice value as per prio_to_weight[].
4646  *
4647  * The weight average is an exponential decay average of the instantaneous
4648  * weight:
4649  *
4650  *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
4651  *
4652  * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
4653  * fraction of 'recent' time available for SCHED_OTHER task execution. But it
4654  * can also include other factors [XXX].
4655  *
4656  * To achieve this balance we define a measure of imbalance which follows
4657  * directly from (1):
4658  *
4659  *   imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j }    (4)
4660  *
4661  * We them move tasks around to minimize the imbalance. In the continuous
4662  * function space it is obvious this converges, in the discrete case we get
4663  * a few fun cases generally called infeasible weight scenarios.
4664  *
4665  * [XXX expand on:
4666  *     - infeasible weights;
4667  *     - local vs global optima in the discrete case. ]
4668  *
4669  *
4670  * SCHED DOMAINS
4671  *
4672  * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
4673  * for all i,j solution, we create a tree of cpus that follows the hardware
4674  * topology where each level pairs two lower groups (or better). This results
4675  * in O(log n) layers. Furthermore we reduce the number of cpus going up the
4676  * tree to only the first of the previous level and we decrease the frequency
4677  * of load-balance at each level inv. proportional to the number of cpus in
4678  * the groups.
4679  *
4680  * This yields:
4681  *
4682  *     log_2 n     1     n
4683  *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
4684  *     i = 0      2^i   2^i
4685  *                               `- size of each group
4686  *         |         |     `- number of cpus doing load-balance
4687  *         |         `- freq
4688  *         `- sum over all levels
4689  *
4690  * Coupled with a limit on how many tasks we can migrate every balance pass,
4691  * this makes (5) the runtime complexity of the balancer.
4692  *
4693  * An important property here is that each CPU is still (indirectly) connected
4694  * to every other cpu in at most O(log n) steps:
4695  *
4696  * The adjacency matrix of the resulting graph is given by:
4697  *
4698  *             log_2 n
4699  *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
4700  *             k = 0
4701  *
4702  * And you'll find that:
4703  *
4704  *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
4705  *
4706  * Showing there's indeed a path between every cpu in at most O(log n) steps.
4707  * The task movement gives a factor of O(m), giving a convergence complexity
4708  * of:
4709  *
4710  *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
4711  *
4712  *
4713  * WORK CONSERVING
4714  *
4715  * In order to avoid CPUs going idle while there's still work to do, new idle
4716  * balancing is more aggressive and has the newly idle cpu iterate up the domain
4717  * tree itself instead of relying on other CPUs to bring it work.
4718  *
4719  * This adds some complexity to both (5) and (8) but it reduces the total idle
4720  * time.
4721  *
4722  * [XXX more?]
4723  *
4724  *
4725  * CGROUPS
4726  *
4727  * Cgroups make a horror show out of (2), instead of a simple sum we get:
4728  *
4729  *                                s_k,i
4730  *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
4731  *                                 S_k
4732  *
4733  * Where
4734  *
4735  *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
4736  *
4737  * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
4738  *
4739  * The big problem is S_k, its a global sum needed to compute a local (W_i)
4740  * property.
4741  *
4742  * [XXX write more on how we solve this.. _after_ merging pjt's patches that
4743  *      rewrite all of this once again.]
4744  */
4745
4746 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
4747
4748 #define LBF_ALL_PINNED  0x01
4749 #define LBF_NEED_BREAK  0x02
4750 #define LBF_SOME_PINNED 0x04
4751
4752 struct lb_env {
4753         struct sched_domain     *sd;
4754
4755         struct rq               *src_rq;
4756         int                     src_cpu;
4757
4758         int                     dst_cpu;
4759         struct rq               *dst_rq;
4760
4761         struct cpumask          *dst_grpmask;
4762         int                     new_dst_cpu;
4763         enum cpu_idle_type      idle;
4764         long                    imbalance;
4765         /* The set of CPUs under consideration for load-balancing */
4766         struct cpumask          *cpus;
4767
4768         unsigned int            flags;
4769
4770         unsigned int            loop;
4771         unsigned int            loop_break;
4772         unsigned int            loop_max;
4773 };
4774
4775 /*
4776  * move_task - move a task from one runqueue to another runqueue.
4777  * Both runqueues must be locked.
4778  */
4779 static void move_task(struct task_struct *p, struct lb_env *env)
4780 {
4781         deactivate_task(env->src_rq, p, 0);
4782         set_task_cpu(p, env->dst_cpu);
4783         activate_task(env->dst_rq, p, 0);
4784         check_preempt_curr(env->dst_rq, p, 0);
4785 }
4786
4787 /*
4788  * Is this task likely cache-hot:
4789  */
4790 static int
4791 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
4792 {
4793         s64 delta;
4794
4795         if (p->sched_class != &fair_sched_class)
4796                 return 0;
4797
4798         if (unlikely(p->policy == SCHED_IDLE))
4799                 return 0;
4800
4801         /*
4802          * Buddy candidates are cache hot:
4803          */
4804         if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
4805                         (&p->se == cfs_rq_of(&p->se)->next ||
4806                          &p->se == cfs_rq_of(&p->se)->last))
4807                 return 1;
4808
4809         if (sysctl_sched_migration_cost == -1)
4810                 return 1;
4811         if (sysctl_sched_migration_cost == 0)
4812                 return 0;
4813
4814         delta = now - p->se.exec_start;
4815
4816         return delta < (s64)sysctl_sched_migration_cost;
4817 }
4818
4819 /*
4820  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
4821  */
4822 static
4823 int can_migrate_task(struct task_struct *p, struct lb_env *env)
4824 {
4825         int tsk_cache_hot = 0;
4826         /*
4827          * We do not migrate tasks that are:
4828          * 1) throttled_lb_pair, or
4829          * 2) cannot be migrated to this CPU due to cpus_allowed, or
4830          * 3) running (obviously), or
4831          * 4) are cache-hot on their current CPU.
4832          */
4833         if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
4834                 return 0;
4835
4836         if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
4837                 int cpu;
4838
4839                 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
4840
4841                 /*
4842                  * Remember if this task can be migrated to any other cpu in
4843                  * our sched_group. We may want to revisit it if we couldn't
4844                  * meet load balance goals by pulling other tasks on src_cpu.
4845                  *
4846                  * Also avoid computing new_dst_cpu if we have already computed
4847                  * one in current iteration.
4848                  */
4849                 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
4850                         return 0;
4851
4852                 /* Prevent to re-select dst_cpu via env's cpus */
4853                 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
4854                         if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
4855                                 env->flags |= LBF_SOME_PINNED;
4856                                 env->new_dst_cpu = cpu;
4857                                 break;
4858                         }
4859                 }
4860
4861                 return 0;
4862         }
4863
4864         /* Record that we found atleast one task that could run on dst_cpu */
4865         env->flags &= ~LBF_ALL_PINNED;
4866
4867         if (task_running(env->src_rq, p)) {
4868                 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
4869                 return 0;
4870         }
4871
4872         /*
4873          * Aggressive migration if:
4874          * 1) task is cache cold, or
4875          * 2) too many balance attempts have failed.
4876          */
4877         tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
4878         if (!tsk_cache_hot ||
4879                 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
4880
4881                 if (tsk_cache_hot) {
4882                         schedstat_inc(env->sd, lb_hot_gained[env->idle]);
4883                         schedstat_inc(p, se.statistics.nr_forced_migrations);
4884                 }
4885
4886                 return 1;
4887         }
4888
4889         schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
4890         return 0;
4891 }
4892
4893 /*
4894  * move_one_task tries to move exactly one task from busiest to this_rq, as
4895  * part of active balancing operations within "domain".
4896  * Returns 1 if successful and 0 otherwise.
4897  *
4898  * Called with both runqueues locked.
4899  */
4900 static int move_one_task(struct lb_env *env)
4901 {
4902         struct task_struct *p, *n;
4903
4904         list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
4905                 if (!can_migrate_task(p, env))
4906                         continue;
4907
4908                 move_task(p, env);
4909                 /*
4910                  * Right now, this is only the second place move_task()
4911                  * is called, so we can safely collect move_task()
4912                  * stats here rather than inside move_task().
4913                  */
4914                 schedstat_inc(env->sd, lb_gained[env->idle]);
4915                 return 1;
4916         }
4917         return 0;
4918 }
4919
4920 static unsigned long task_h_load(struct task_struct *p);
4921
4922 static const unsigned int sched_nr_migrate_break = 32;
4923
4924 /*
4925  * move_tasks tries to move up to imbalance weighted load from busiest to
4926  * this_rq, as part of a balancing operation within domain "sd".
4927  * Returns 1 if successful and 0 otherwise.
4928  *
4929  * Called with both runqueues locked.
4930  */
4931 static int move_tasks(struct lb_env *env)
4932 {
4933         struct list_head *tasks = &env->src_rq->cfs_tasks;
4934         struct task_struct *p;
4935         unsigned long load;
4936         int pulled = 0;
4937
4938         if (env->imbalance <= 0)
4939                 return 0;
4940
4941         while (!list_empty(tasks)) {
4942                 p = list_first_entry(tasks, struct task_struct, se.group_node);
4943
4944                 env->loop++;
4945                 /* We've more or less seen every task there is, call it quits */
4946                 if (env->loop > env->loop_max)
4947                         break;
4948
4949                 /* take a breather every nr_migrate tasks */
4950                 if (env->loop > env->loop_break) {
4951                         env->loop_break += sched_nr_migrate_break;
4952                         env->flags |= LBF_NEED_BREAK;
4953                         break;
4954                 }
4955
4956                 if (!can_migrate_task(p, env))
4957                         goto next;
4958
4959                 load = task_h_load(p);
4960
4961                 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
4962                         goto next;
4963
4964                 if ((load / 2) > env->imbalance)
4965                         goto next;
4966
4967                 move_task(p, env);
4968                 pulled++;
4969                 env->imbalance -= load;
4970
4971 #ifdef CONFIG_PREEMPT
4972                 /*
4973                  * NEWIDLE balancing is a source of latency, so preemptible
4974                  * kernels will stop after the first task is pulled to minimize
4975                  * the critical section.
4976                  */
4977                 if (env->idle == CPU_NEWLY_IDLE)
4978                         break;
4979 #endif
4980
4981                 /*
4982                  * We only want to steal up to the prescribed amount of
4983                  * weighted load.
4984                  */
4985                 if (env->imbalance <= 0)
4986                         break;
4987
4988                 continue;
4989 next:
4990                 list_move_tail(&p->se.group_node, tasks);
4991         }
4992
4993         /*
4994          * Right now, this is one of only two places move_task() is called,
4995          * so we can safely collect move_task() stats here rather than
4996          * inside move_task().
4997          */
4998         schedstat_add(env->sd, lb_gained[env->idle], pulled);
4999
5000         return pulled;
5001 }
5002
5003 #ifdef CONFIG_FAIR_GROUP_SCHED
5004 /*
5005  * update tg->load_weight by folding this cpu's load_avg
5006  */
5007 static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
5008 {
5009         struct sched_entity *se = tg->se[cpu];
5010         struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
5011
5012         /* throttled entities do not contribute to load */
5013         if (throttled_hierarchy(cfs_rq))
5014                 return;
5015
5016         update_cfs_rq_blocked_load(cfs_rq, 1);
5017
5018         if (se) {
5019                 update_entity_load_avg(se, 1);
5020                 /*
5021                  * We pivot on our runnable average having decayed to zero for
5022                  * list removal.  This generally implies that all our children
5023                  * have also been removed (modulo rounding error or bandwidth
5024                  * control); however, such cases are rare and we can fix these
5025                  * at enqueue.
5026                  *
5027                  * TODO: fix up out-of-order children on enqueue.
5028                  */
5029                 if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
5030                         list_del_leaf_cfs_rq(cfs_rq);
5031         } else {
5032                 struct rq *rq = rq_of(cfs_rq);
5033                 update_rq_runnable_avg(rq, rq->nr_running);
5034         }
5035 }
5036
5037 static void update_blocked_averages(int cpu)
5038 {
5039         struct rq *rq = cpu_rq(cpu);
5040         struct cfs_rq *cfs_rq;
5041         unsigned long flags;
5042
5043         raw_spin_lock_irqsave(&rq->lock, flags);
5044         update_rq_clock(rq);
5045         /*
5046          * Iterates the task_group tree in a bottom up fashion, see
5047          * list_add_leaf_cfs_rq() for details.
5048          */
5049         for_each_leaf_cfs_rq(rq, cfs_rq) {
5050                 /*
5051                  * Note: We may want to consider periodically releasing
5052                  * rq->lock about these updates so that creating many task
5053                  * groups does not result in continually extending hold time.
5054                  */
5055                 __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
5056         }
5057
5058         raw_spin_unlock_irqrestore(&rq->lock, flags);
5059 }
5060
5061 /*
5062  * Compute the cpu's hierarchical load factor for each task group.
5063  * This needs to be done in a top-down fashion because the load of a child
5064  * group is a fraction of its parents load.
5065  */
5066 static int tg_load_down(struct task_group *tg, void *data)
5067 {
5068         unsigned long load;
5069         long cpu = (long)data;
5070
5071         if (!tg->parent) {
5072                 load = cpu_rq(cpu)->load.weight;
5073         } else {
5074                 load = tg->parent->cfs_rq[cpu]->h_load;
5075                 load *= tg->se[cpu]->load.weight;
5076                 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
5077         }
5078
5079         tg->cfs_rq[cpu]->h_load = load;
5080
5081         return 0;
5082 }
5083
5084 static void update_h_load(long cpu)
5085 {
5086         struct rq *rq = cpu_rq(cpu);
5087         unsigned long now = jiffies;
5088
5089         if (rq->h_load_throttle == now)
5090                 return;
5091
5092         rq->h_load_throttle = now;
5093
5094         rcu_read_lock();
5095         walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
5096         rcu_read_unlock();
5097 }
5098
5099 static unsigned long task_h_load(struct task_struct *p)
5100 {
5101         struct cfs_rq *cfs_rq = task_cfs_rq(p);
5102         unsigned long load;
5103
5104         load = p->se.load.weight;
5105         load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
5106
5107         return load;
5108 }
5109 #else
5110 static inline void update_blocked_averages(int cpu)
5111 {
5112 }
5113
5114 static inline void update_h_load(long cpu)
5115 {
5116 }
5117
5118 static unsigned long task_h_load(struct task_struct *p)
5119 {
5120         return p->se.load.weight;
5121 }
5122 #endif
5123
5124 /********** Helpers for find_busiest_group ************************/
5125 /*
5126  * sd_lb_stats - Structure to store the statistics of a sched_domain
5127  *              during load balancing.
5128  */
5129 struct sd_lb_stats {
5130         struct sched_group *busiest; /* Busiest group in this sd */
5131         struct sched_group *this;  /* Local group in this sd */
5132         unsigned long total_load;  /* Total load of all groups in sd */
5133         unsigned long total_pwr;   /*   Total power of all groups in sd */
5134         unsigned long avg_load;    /* Average load across all groups in sd */
5135
5136         /** Statistics of this group */
5137         unsigned long this_load;
5138         unsigned long this_load_per_task;
5139         unsigned long this_nr_running;
5140         unsigned long this_has_capacity;
5141         unsigned int  this_idle_cpus;
5142
5143         /* Statistics of the busiest group */
5144         unsigned int  busiest_idle_cpus;
5145         unsigned long max_load;
5146         unsigned long busiest_load_per_task;
5147         unsigned long busiest_nr_running;
5148         unsigned long busiest_group_capacity;
5149         unsigned long busiest_has_capacity;
5150         unsigned int  busiest_group_weight;
5151
5152         int group_imb; /* Is there imbalance in this sd */
5153 };
5154
5155 /*
5156  * sg_lb_stats - stats of a sched_group required for load_balancing
5157  */
5158 struct sg_lb_stats {
5159         unsigned long avg_load; /*Avg load across the CPUs of the group */
5160         unsigned long group_load; /* Total load over the CPUs of the group */
5161         unsigned long sum_nr_running; /* Nr tasks running in the group */
5162         unsigned long sum_weighted_load; /* Weighted load of group's tasks */
5163         unsigned long group_capacity;
5164         unsigned long idle_cpus;
5165         unsigned long group_weight;
5166         int group_imb; /* Is there an imbalance in the group ? */
5167         int group_has_capacity; /* Is there extra capacity in the group? */
5168 };
5169
5170 /**
5171  * get_sd_load_idx - Obtain the load index for a given sched domain.
5172  * @sd: The sched_domain whose load_idx is to be obtained.
5173  * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
5174  */
5175 static inline int get_sd_load_idx(struct sched_domain *sd,
5176                                         enum cpu_idle_type idle)
5177 {
5178         int load_idx;
5179
5180         switch (idle) {
5181         case CPU_NOT_IDLE:
5182                 load_idx = sd->busy_idx;
5183                 break;
5184
5185         case CPU_NEWLY_IDLE:
5186                 load_idx = sd->newidle_idx;
5187                 break;
5188         default:
5189                 load_idx = sd->idle_idx;
5190                 break;
5191         }
5192
5193         return load_idx;
5194 }
5195
5196 static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
5197 {
5198         return SCHED_POWER_SCALE;
5199 }
5200
5201 unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
5202 {
5203         return default_scale_freq_power(sd, cpu);
5204 }
5205
5206 static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
5207 {
5208         unsigned long weight = sd->span_weight;
5209         unsigned long smt_gain = sd->smt_gain;
5210
5211         smt_gain /= weight;
5212
5213         return smt_gain;
5214 }
5215
5216 unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
5217 {
5218         return default_scale_smt_power(sd, cpu);
5219 }
5220
5221 static unsigned long scale_rt_power(int cpu)
5222 {
5223         struct rq *rq = cpu_rq(cpu);
5224         u64 total, available, age_stamp, avg;
5225
5226         /*
5227          * Since we're reading these variables without serialization make sure
5228          * we read them once before doing sanity checks on them.
5229          */
5230         age_stamp = ACCESS_ONCE(rq->age_stamp);
5231         avg = ACCESS_ONCE(rq->rt_avg);
5232
5233         total = sched_avg_period() + (rq->clock - age_stamp);
5234
5235         if (unlikely(total < avg)) {
5236                 /* Ensures that power won't end up being negative */
5237                 available = 0;
5238         } else {
5239                 available = total - avg;
5240         }
5241
5242         if (unlikely((s64)total < SCHED_POWER_SCALE))
5243                 total = SCHED_POWER_SCALE;
5244
5245         total >>= SCHED_POWER_SHIFT;
5246
5247         return div_u64(available, total);
5248 }
5249
5250 static void update_cpu_power(struct sched_domain *sd, int cpu)
5251 {
5252         unsigned long weight = sd->span_weight;
5253         unsigned long power = SCHED_POWER_SCALE;
5254         struct sched_group *sdg = sd->groups;
5255
5256         if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
5257                 if (sched_feat(ARCH_POWER))
5258                         power *= arch_scale_smt_power(sd, cpu);
5259                 else
5260                         power *= default_scale_smt_power(sd, cpu);
5261
5262                 power >>= SCHED_POWER_SHIFT;
5263         }
5264
5265         sdg->sgp->power_orig = power;
5266
5267         if (sched_feat(ARCH_POWER))
5268                 power *= arch_scale_freq_power(sd, cpu);
5269         else
5270                 power *= default_scale_freq_power(sd, cpu);
5271
5272         power >>= SCHED_POWER_SHIFT;
5273
5274         power *= scale_rt_power(cpu);
5275         power >>= SCHED_POWER_SHIFT;
5276
5277         if (!power)
5278                 power = 1;
5279
5280         cpu_rq(cpu)->cpu_power = power;
5281         sdg->sgp->power = power;
5282 }
5283
5284 void update_group_power(struct sched_domain *sd, int cpu)
5285 {
5286         struct sched_domain *child = sd->child;
5287         struct sched_group *group, *sdg = sd->groups;
5288         unsigned long power;
5289         unsigned long interval;
5290
5291         interval = msecs_to_jiffies(sd->balance_interval);
5292         interval = clamp(interval, 1UL, max_load_balance_interval);
5293         sdg->sgp->next_update = jiffies + interval;
5294
5295         if (!child) {
5296                 update_cpu_power(sd, cpu);
5297                 return;
5298         }
5299
5300         power = 0;
5301
5302         if (child->flags & SD_OVERLAP) {
5303                 /*
5304                  * SD_OVERLAP domains cannot assume that child groups
5305                  * span the current group.
5306                  */
5307
5308                 for_each_cpu(cpu, sched_group_cpus(sdg))
5309                         power += power_of(cpu);
5310         } else  {
5311                 /*
5312                  * !SD_OVERLAP domains can assume that child groups
5313                  * span the current group.
5314                  */
5315
5316                 group = child->groups;
5317                 do {
5318                         power += group->sgp->power;
5319                         group = group->next;
5320                 } while (group != child->groups);
5321         }
5322
5323         sdg->sgp->power_orig = sdg->sgp->power = power;
5324 }
5325
5326 /*
5327  * Try and fix up capacity for tiny siblings, this is needed when
5328  * things like SD_ASYM_PACKING need f_b_g to select another sibling
5329  * which on its own isn't powerful enough.
5330  *
5331  * See update_sd_pick_busiest() and check_asym_packing().
5332  */
5333 static inline int
5334 fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
5335 {
5336         /*
5337          * Only siblings can have significantly less than SCHED_POWER_SCALE
5338          */
5339         if (!(sd->flags & SD_SHARE_CPUPOWER))
5340                 return 0;
5341
5342         /*
5343          * If ~90% of the cpu_power is still there, we're good.
5344          */
5345         if (group->sgp->power * 32 > group->sgp->power_orig * 29)
5346                 return 1;
5347
5348         return 0;
5349 }
5350
5351 /**
5352  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
5353  * @env: The load balancing environment.
5354  * @group: sched_group whose statistics are to be updated.
5355  * @load_idx: Load index of sched_domain of this_cpu for load calc.
5356  * @local_group: Does group contain this_cpu.
5357  * @balance: Should we balance.
5358  * @sgs: variable to hold the statistics for this group.
5359  */
5360 static inline void update_sg_lb_stats(struct lb_env *env,
5361                         struct sched_group *group, int load_idx,
5362                         int local_group, int *balance, struct sg_lb_stats *sgs)
5363 {
5364         unsigned long nr_running, max_nr_running, min_nr_running;
5365         unsigned long load, max_cpu_load, min_cpu_load;
5366         unsigned int balance_cpu = -1, first_idle_cpu = 0;
5367         unsigned long avg_load_per_task = 0;
5368         int i;
5369
5370         if (local_group)
5371                 balance_cpu = group_balance_cpu(group);
5372
5373         /* Tally up the load of all CPUs in the group */
5374         max_cpu_load = 0;
5375         min_cpu_load = ~0UL;
5376         max_nr_running = 0;
5377         min_nr_running = ~0UL;
5378
5379         for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
5380                 struct rq *rq = cpu_rq(i);
5381
5382                 nr_running = rq->nr_running;
5383
5384                 /* Bias balancing toward cpus of our domain */
5385                 if (local_group) {
5386                         if (idle_cpu(i) && !first_idle_cpu &&
5387                                         cpumask_test_cpu(i, sched_group_mask(group))) {
5388                                 first_idle_cpu = 1;
5389                                 balance_cpu = i;
5390                         }
5391
5392                         load = target_load(i, load_idx);
5393                 } else {
5394                         load = source_load(i, load_idx);
5395                         if (load > max_cpu_load)
5396                                 max_cpu_load = load;
5397                         if (min_cpu_load > load)
5398                                 min_cpu_load = load;
5399
5400                         if (nr_running > max_nr_running)
5401                                 max_nr_running = nr_running;
5402                         if (min_nr_running > nr_running)
5403                                 min_nr_running = nr_running;
5404                 }
5405
5406                 sgs->group_load += load;
5407                 sgs->sum_nr_running += nr_running;
5408                 sgs->sum_weighted_load += weighted_cpuload(i);
5409                 if (idle_cpu(i))
5410                         sgs->idle_cpus++;
5411         }
5412
5413         /*
5414          * First idle cpu or the first cpu(busiest) in this sched group
5415          * is eligible for doing load balancing at this and above
5416          * domains. In the newly idle case, we will allow all the cpu's
5417          * to do the newly idle load balance.
5418          */
5419         if (local_group) {
5420                 if (env->idle != CPU_NEWLY_IDLE) {
5421                         if (balance_cpu != env->dst_cpu) {
5422                                 *balance = 0;
5423                                 return;
5424                         }
5425                         update_group_power(env->sd, env->dst_cpu);
5426                 } else if (time_after_eq(jiffies, group->sgp->next_update))
5427                         update_group_power(env->sd, env->dst_cpu);
5428         }
5429
5430         /* Adjust by relative CPU power of the group */
5431         sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
5432
5433         /*
5434          * Consider the group unbalanced when the imbalance is larger
5435          * than the average weight of a task.
5436          *
5437          * APZ: with cgroup the avg task weight can vary wildly and
5438          *      might not be a suitable number - should we keep a
5439          *      normalized nr_running number somewhere that negates
5440          *      the hierarchy?
5441          */
5442         if (sgs->sum_nr_running)
5443                 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
5444
5445         if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
5446             (max_nr_running - min_nr_running) > 1)
5447                 sgs->group_imb = 1;
5448
5449         sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
5450                                                 SCHED_POWER_SCALE);
5451         if (!sgs->group_capacity)
5452                 sgs->group_capacity = fix_small_capacity(env->sd, group);
5453         sgs->group_weight = group->group_weight;
5454
5455         if (sgs->group_capacity > sgs->sum_nr_running)
5456                 sgs->group_has_capacity = 1;
5457 }
5458
5459 /**
5460  * update_sd_pick_busiest - return 1 on busiest group
5461  * @env: The load balancing environment.
5462  * @sds: sched_domain statistics
5463  * @sg: sched_group candidate to be checked for being the busiest
5464  * @sgs: sched_group statistics
5465  *
5466  * Determine if @sg is a busier group than the previously selected
5467  * busiest group.
5468  */
5469 static bool update_sd_pick_busiest(struct lb_env *env,
5470                                    struct sd_lb_stats *sds,
5471                                    struct sched_group *sg,
5472                                    struct sg_lb_stats *sgs)
5473 {
5474         if (sgs->avg_load <= sds->max_load)
5475                 return false;
5476
5477         if (sgs->sum_nr_running > sgs->group_capacity)
5478                 return true;
5479
5480         if (sgs->group_imb)
5481                 return true;
5482
5483         /*
5484          * ASYM_PACKING needs to move all the work to the lowest
5485          * numbered CPUs in the group, therefore mark all groups
5486          * higher than ourself as busy.
5487          */
5488         if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
5489             env->dst_cpu < group_first_cpu(sg)) {
5490                 if (!sds->busiest)
5491                         return true;
5492
5493                 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
5494                         return true;
5495         }
5496
5497         return false;
5498 }
5499
5500 /**
5501  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
5502  * @env: The load balancing environment.
5503  * @balance: Should we balance.
5504  * @sds: variable to hold the statistics for this sched_domain.
5505  */
5506 static inline void update_sd_lb_stats(struct lb_env *env,
5507                                         int *balance, struct sd_lb_stats *sds)
5508 {
5509         struct sched_domain *child = env->sd->child;
5510         struct sched_group *sg = env->sd->groups;
5511         struct sg_lb_stats sgs;
5512         int load_idx, prefer_sibling = 0;
5513
5514         if (child && child->flags & SD_PREFER_SIBLING)
5515                 prefer_sibling = 1;
5516
5517         load_idx = get_sd_load_idx(env->sd, env->idle);
5518
5519         do {
5520                 int local_group;
5521
5522                 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
5523                 memset(&sgs, 0, sizeof(sgs));
5524                 update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
5525
5526                 if (local_group && !(*balance))
5527                         return;
5528
5529                 sds->total_load += sgs.group_load;
5530                 sds->total_pwr += sg->sgp->power;
5531
5532                 /*
5533                  * In case the child domain prefers tasks go to siblings
5534                  * first, lower the sg capacity to one so that we'll try
5535                  * and move all the excess tasks away. We lower the capacity
5536                  * of a group only if the local group has the capacity to fit
5537                  * these excess tasks, i.e. nr_running < group_capacity. The
5538                  * extra check prevents the case where you always pull from the
5539                  * heaviest group when it is already under-utilized (possible
5540                  * with a large weight task outweighs the tasks on the system).
5541                  */
5542                 if (prefer_sibling && !local_group && sds->this_has_capacity)
5543                         sgs.group_capacity = min(sgs.group_capacity, 1UL);
5544
5545                 if (local_group) {
5546                         sds->this_load = sgs.avg_load;
5547                         sds->this = sg;
5548                         sds->this_nr_running = sgs.sum_nr_running;
5549                         sds->this_load_per_task = sgs.sum_weighted_load;
5550                         sds->this_has_capacity = sgs.group_has_capacity;
5551                         sds->this_idle_cpus = sgs.idle_cpus;
5552                 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
5553                         sds->max_load = sgs.avg_load;
5554                         sds->busiest = sg;
5555                         sds->busiest_nr_running = sgs.sum_nr_running;
5556                         sds->busiest_idle_cpus = sgs.idle_cpus;
5557                         sds->busiest_group_capacity = sgs.group_capacity;
5558                         sds->busiest_load_per_task = sgs.sum_weighted_load;
5559                         sds->busiest_has_capacity = sgs.group_has_capacity;
5560                         sds->busiest_group_weight = sgs.group_weight;
5561                         sds->group_imb = sgs.group_imb;
5562                 }
5563
5564                 sg = sg->next;
5565         } while (sg != env->sd->groups);
5566 }
5567
5568 /**
5569  * check_asym_packing - Check to see if the group is packed into the
5570  *                      sched doman.
5571  *
5572  * This is primarily intended to used at the sibling level.  Some
5573  * cores like POWER7 prefer to use lower numbered SMT threads.  In the
5574  * case of POWER7, it can move to lower SMT modes only when higher
5575  * threads are idle.  When in lower SMT modes, the threads will
5576  * perform better since they share less core resources.  Hence when we
5577  * have idle threads, we want them to be the higher ones.
5578  *
5579  * This packing function is run on idle threads.  It checks to see if
5580  * the busiest CPU in this domain (core in the P7 case) has a higher
5581  * CPU number than the packing function is being run on.  Here we are
5582  * assuming lower CPU number will be equivalent to lower a SMT thread
5583  * number.
5584  *
5585  * Returns 1 when packing is required and a task should be moved to
5586  * this CPU.  The amount of the imbalance is returned in *imbalance.
5587  *
5588  * @env: The load balancing environment.
5589  * @sds: Statistics of the sched_domain which is to be packed
5590  */
5591 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
5592 {
5593         int busiest_cpu;
5594
5595         if (!(env->sd->flags & SD_ASYM_PACKING))
5596                 return 0;
5597
5598         if (!sds->busiest)
5599                 return 0;
5600
5601         busiest_cpu = group_first_cpu(sds->busiest);
5602         if (env->dst_cpu > busiest_cpu)
5603                 return 0;
5604
5605         env->imbalance = DIV_ROUND_CLOSEST(
5606                 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
5607
5608         return 1;
5609 }
5610
5611 /**
5612  * fix_small_imbalance - Calculate the minor imbalance that exists
5613  *                      amongst the groups of a sched_domain, during
5614  *                      load balancing.
5615  * @env: The load balancing environment.
5616  * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
5617  */
5618 static inline
5619 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
5620 {
5621         unsigned long tmp, pwr_now = 0, pwr_move = 0;
5622         unsigned int imbn = 2;
5623         unsigned long scaled_busy_load_per_task;
5624
5625         if (sds->this_nr_running) {
5626                 sds->this_load_per_task /= sds->this_nr_running;
5627                 if (sds->busiest_load_per_task >
5628                                 sds->this_load_per_task)
5629                         imbn = 1;
5630         } else {
5631                 sds->this_load_per_task =
5632                         cpu_avg_load_per_task(env->dst_cpu);
5633         }
5634
5635         scaled_busy_load_per_task = sds->busiest_load_per_task
5636                                          * SCHED_POWER_SCALE;
5637         scaled_busy_load_per_task /= sds->busiest->sgp->power;
5638
5639         if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
5640                         (scaled_busy_load_per_task * imbn)) {
5641                 env->imbalance = sds->busiest_load_per_task;
5642                 return;
5643         }
5644
5645         /*
5646          * OK, we don't have enough imbalance to justify moving tasks,
5647          * however we may be able to increase total CPU power used by
5648          * moving them.
5649          */
5650
5651         pwr_now += sds->busiest->sgp->power *
5652                         min(sds->busiest_load_per_task, sds->max_load);
5653         pwr_now += sds->this->sgp->power *
5654                         min(sds->this_load_per_task, sds->this_load);
5655         pwr_now /= SCHED_POWER_SCALE;
5656
5657         /* Amount of load we'd subtract */
5658         tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
5659                 sds->busiest->sgp->power;
5660         if (sds->max_load > tmp)
5661                 pwr_move += sds->busiest->sgp->power *
5662                         min(sds->busiest_load_per_task, sds->max_load - tmp);
5663
5664         /* Amount of load we'd add */
5665         if (sds->max_load * sds->busiest->sgp->power <
5666                 sds->busiest_load_per_task * SCHED_POWER_SCALE)
5667                 tmp = (sds->max_load * sds->busiest->sgp->power) /
5668                         sds->this->sgp->power;
5669         else
5670                 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
5671                         sds->this->sgp->power;
5672         pwr_move += sds->this->sgp->power *
5673                         min(sds->this_load_per_task, sds->this_load + tmp);
5674         pwr_move /= SCHED_POWER_SCALE;
5675
5676         /* Move if we gain throughput */
5677         if (pwr_move > pwr_now)
5678                 env->imbalance = sds->busiest_load_per_task;
5679 }
5680
5681 /**
5682  * calculate_imbalance - Calculate the amount of imbalance present within the
5683  *                       groups of a given sched_domain during load balance.
5684  * @env: load balance environment
5685  * @sds: statistics of the sched_domain whose imbalance is to be calculated.
5686  */
5687 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
5688 {
5689         unsigned long max_pull, load_above_capacity = ~0UL;
5690
5691         sds->busiest_load_per_task /= sds->busiest_nr_running;
5692         if (sds->group_imb) {
5693                 sds->busiest_load_per_task =
5694                         min(sds->busiest_load_per_task, sds->avg_load);
5695         }
5696
5697         /*
5698          * In the presence of smp nice balancing, certain scenarios can have
5699          * max load less than avg load(as we skip the groups at or below
5700          * its cpu_power, while calculating max_load..)
5701          */
5702         if (sds->max_load < sds->avg_load) {
5703                 env->imbalance = 0;
5704                 return fix_small_imbalance(env, sds);
5705         }
5706
5707         if (!sds->group_imb) {
5708                 /*
5709                  * Don't want to pull so many tasks that a group would go idle.
5710                  */
5711                 load_above_capacity = (sds->busiest_nr_running -
5712                                                 sds->busiest_group_capacity);
5713
5714                 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
5715
5716                 load_above_capacity /= sds->busiest->sgp->power;
5717         }
5718
5719         /*
5720          * We're trying to get all the cpus to the average_load, so we don't
5721          * want to push ourselves above the average load, nor do we wish to
5722          * reduce the max loaded cpu below the average load. At the same time,
5723          * we also don't want to reduce the group load below the group capacity
5724          * (so that we can implement power-savings policies etc). Thus we look
5725          * for the minimum possible imbalance.
5726          * Be careful of negative numbers as they'll appear as very large values
5727          * with unsigned longs.
5728          */
5729         max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
5730
5731         /* How much load to actually move to equalise the imbalance */
5732         env->imbalance = min(max_pull * sds->busiest->sgp->power,
5733                 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
5734                         / SCHED_POWER_SCALE;
5735
5736         /*
5737          * if *imbalance is less than the average load per runnable task
5738          * there is no guarantee that any tasks will be moved so we'll have
5739          * a think about bumping its value to force at least one task to be
5740          * moved
5741          */
5742         if (env->imbalance < sds->busiest_load_per_task)
5743                 return fix_small_imbalance(env, sds);
5744
5745 }
5746
5747 /******* find_busiest_group() helpers end here *********************/
5748
5749 /**
5750  * find_busiest_group - Returns the busiest group within the sched_domain
5751  * if there is an imbalance. If there isn't an imbalance, and
5752  * the user has opted for power-savings, it returns a group whose
5753  * CPUs can be put to idle by rebalancing those tasks elsewhere, if
5754  * such a group exists.
5755  *
5756  * Also calculates the amount of weighted load which should be moved
5757  * to restore balance.
5758  *
5759  * @env: The load balancing environment.
5760  * @balance: Pointer to a variable indicating if this_cpu
5761  *      is the appropriate cpu to perform load balancing at this_level.
5762  *
5763  * Returns:     - the busiest group if imbalance exists.
5764  *              - If no imbalance and user has opted for power-savings balance,
5765  *                 return the least loaded group whose CPUs can be
5766  *                 put to idle by rebalancing its tasks onto our group.
5767  */
5768 static struct sched_group *
5769 find_busiest_group(struct lb_env *env, int *balance)
5770 {
5771         struct sd_lb_stats sds;
5772
5773         memset(&sds, 0, sizeof(sds));
5774
5775         /*
5776          * Compute the various statistics relavent for load balancing at
5777          * this level.
5778          */
5779         update_sd_lb_stats(env, balance, &sds);
5780
5781         /*
5782          * this_cpu is not the appropriate cpu to perform load balancing at
5783          * this level.
5784          */
5785         if (!(*balance))
5786                 goto ret;
5787
5788         if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
5789             check_asym_packing(env, &sds))
5790                 return sds.busiest;
5791
5792         /* There is no busy sibling group to pull tasks from */
5793         if (!sds.busiest || sds.busiest_nr_running == 0)
5794                 goto out_balanced;
5795
5796         sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
5797
5798         /*
5799          * If the busiest group is imbalanced the below checks don't
5800          * work because they assumes all things are equal, which typically
5801          * isn't true due to cpus_allowed constraints and the like.
5802          */
5803         if (sds.group_imb)
5804                 goto force_balance;
5805
5806         /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
5807         if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
5808                         !sds.busiest_has_capacity)
5809                 goto force_balance;
5810
5811         /*
5812          * If the local group is more busy than the selected busiest group
5813          * don't try and pull any tasks.
5814          */
5815         if (sds.this_load >= sds.max_load)
5816                 goto out_balanced;
5817
5818         /*
5819          * Don't pull any tasks if this group is already above the domain
5820          * average load.
5821          */
5822         if (sds.this_load >= sds.avg_load)
5823                 goto out_balanced;
5824
5825         if (env->idle == CPU_IDLE) {
5826                 /*
5827                  * This cpu is idle. If the busiest group load doesn't
5828                  * have more tasks than the number of available cpu's and
5829                  * there is no imbalance between this and busiest group
5830                  * wrt to idle cpu's, it is balanced.
5831                  */
5832                 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
5833                     sds.busiest_nr_running <= sds.busiest_group_weight)
5834                         goto out_balanced;
5835         } else {
5836                 /*
5837                  * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
5838                  * imbalance_pct to be conservative.
5839                  */
5840                 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
5841                         goto out_balanced;
5842         }
5843
5844 force_balance:
5845         /* Looks like there is an imbalance. Compute it */
5846         calculate_imbalance(env, &sds);
5847         return sds.busiest;
5848
5849 out_balanced:
5850 ret:
5851         env->imbalance = 0;
5852         return NULL;
5853 }
5854
5855 /*
5856  * find_busiest_queue - find the busiest runqueue among the cpus in group.
5857  */
5858 static struct rq *find_busiest_queue(struct lb_env *env,
5859                                      struct sched_group *group)
5860 {
5861         struct rq *busiest = NULL, *rq;
5862         unsigned long max_load = 0;
5863         int i;
5864
5865         for_each_cpu(i, sched_group_cpus(group)) {
5866                 unsigned long power = power_of(i);
5867                 unsigned long capacity = DIV_ROUND_CLOSEST(power,
5868                                                            SCHED_POWER_SCALE);
5869                 unsigned long wl;
5870
5871                 if (!capacity)
5872                         capacity = fix_small_capacity(env->sd, group);
5873
5874                 if (!cpumask_test_cpu(i, env->cpus))
5875                         continue;
5876
5877                 rq = cpu_rq(i);
5878                 wl = weighted_cpuload(i);
5879
5880                 /*
5881                  * When comparing with imbalance, use weighted_cpuload()
5882                  * which is not scaled with the cpu power.
5883                  */
5884                 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
5885                         continue;
5886
5887                 /*
5888                  * For the load comparisons with the other cpu's, consider
5889                  * the weighted_cpuload() scaled with the cpu power, so that
5890                  * the load can be moved away from the cpu that is potentially
5891                  * running at a lower capacity.
5892                  */
5893                 wl = (wl * SCHED_POWER_SCALE) / power;
5894
5895                 if (wl > max_load) {
5896                         max_load = wl;
5897                         busiest = rq;
5898                 }
5899         }
5900
5901         return busiest;
5902 }
5903
5904 /*
5905  * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
5906  * so long as it is large enough.
5907  */
5908 #define MAX_PINNED_INTERVAL     512
5909
5910 /* Working cpumask for load_balance and load_balance_newidle. */
5911 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5912
5913 static int need_active_balance(struct lb_env *env)
5914 {
5915         struct sched_domain *sd = env->sd;
5916
5917         if (env->idle == CPU_NEWLY_IDLE) {
5918
5919                 /*
5920                  * ASYM_PACKING needs to force migrate tasks from busy but
5921                  * higher numbered CPUs in order to pack all tasks in the
5922                  * lowest numbered CPUs.
5923                  */
5924                 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
5925                         return 1;
5926         }
5927
5928         return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
5929 }
5930
5931 static int active_load_balance_cpu_stop(void *data);
5932
5933 /*
5934  * Check this_cpu to ensure it is balanced within domain. Attempt to move
5935  * tasks if there is an imbalance.
5936  */
5937 static int load_balance(int this_cpu, struct rq *this_rq,
5938                         struct sched_domain *sd, enum cpu_idle_type idle,
5939                         int *balance)
5940 {
5941         int ld_moved, cur_ld_moved, active_balance = 0;
5942         struct sched_group *group;
5943         struct rq *busiest;
5944         unsigned long flags;
5945         struct cpumask *cpus = __get_cpu_var(load_balance_mask);
5946
5947         struct lb_env env = {
5948                 .sd             = sd,
5949                 .dst_cpu        = this_cpu,
5950                 .dst_rq         = this_rq,
5951                 .dst_grpmask    = sched_group_cpus(sd->groups),
5952                 .idle           = idle,
5953                 .loop_break     = sched_nr_migrate_break,
5954                 .cpus           = cpus,
5955         };
5956
5957         /*
5958          * For NEWLY_IDLE load_balancing, we don't need to consider
5959          * other cpus in our group
5960          */
5961         if (idle == CPU_NEWLY_IDLE)
5962                 env.dst_grpmask = NULL;
5963
5964         cpumask_copy(cpus, cpu_active_mask);
5965
5966         schedstat_inc(sd, lb_count[idle]);
5967
5968 redo:
5969         group = find_busiest_group(&env, balance);
5970
5971         if (*balance == 0)
5972                 goto out_balanced;
5973
5974         if (!group) {
5975                 schedstat_inc(sd, lb_nobusyg[idle]);
5976                 goto out_balanced;
5977         }
5978
5979         busiest = find_busiest_queue(&env, group);
5980         if (!busiest) {
5981                 schedstat_inc(sd, lb_nobusyq[idle]);
5982                 goto out_balanced;
5983         }
5984
5985         BUG_ON(busiest == env.dst_rq);
5986
5987         schedstat_add(sd, lb_imbalance[idle], env.imbalance);
5988
5989         ld_moved = 0;
5990         if (busiest->nr_running > 1) {
5991                 /*
5992                  * Attempt to move tasks. If find_busiest_group has found
5993                  * an imbalance but busiest->nr_running <= 1, the group is
5994                  * still unbalanced. ld_moved simply stays zero, so it is
5995                  * correctly treated as an imbalance.
5996                  */
5997                 env.flags |= LBF_ALL_PINNED;
5998                 env.src_cpu   = busiest->cpu;
5999                 env.src_rq    = busiest;
6000                 env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
6001
6002                 update_h_load(env.src_cpu);
6003 more_balance:
6004                 local_irq_save(flags);
6005                 double_rq_lock(env.dst_rq, busiest);
6006
6007                 /*
6008                  * cur_ld_moved - load moved in current iteration
6009                  * ld_moved     - cumulative load moved across iterations
6010                  */
6011                 cur_ld_moved = move_tasks(&env);
6012                 ld_moved += cur_ld_moved;
6013                 double_rq_unlock(env.dst_rq, busiest);
6014                 local_irq_restore(flags);
6015
6016                 /*
6017                  * some other cpu did the load balance for us.
6018                  */
6019                 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
6020                         resched_cpu(env.dst_cpu);
6021
6022                 if (env.flags & LBF_NEED_BREAK) {
6023                         env.flags &= ~LBF_NEED_BREAK;
6024                         goto more_balance;
6025                 }
6026
6027                 /*
6028                  * Revisit (affine) tasks on src_cpu that couldn't be moved to
6029                  * us and move them to an alternate dst_cpu in our sched_group
6030                  * where they can run. The upper limit on how many times we
6031                  * iterate on same src_cpu is dependent on number of cpus in our
6032                  * sched_group.
6033                  *
6034                  * This changes load balance semantics a bit on who can move
6035                  * load to a given_cpu. In addition to the given_cpu itself
6036                  * (or a ilb_cpu acting on its behalf where given_cpu is
6037                  * nohz-idle), we now have balance_cpu in a position to move
6038                  * load to given_cpu. In rare situations, this may cause
6039                  * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
6040                  * _independently_ and at _same_ time to move some load to
6041                  * given_cpu) causing exceess load to be moved to given_cpu.
6042                  * This however should not happen so much in practice and
6043                  * moreover subsequent load balance cycles should correct the
6044                  * excess load moved.
6045                  */
6046                 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
6047
6048                         env.dst_rq       = cpu_rq(env.new_dst_cpu);
6049                         env.dst_cpu      = env.new_dst_cpu;
6050                         env.flags       &= ~LBF_SOME_PINNED;
6051                         env.loop         = 0;
6052                         env.loop_break   = sched_nr_migrate_break;
6053
6054                         /* Prevent to re-select dst_cpu via env's cpus */
6055                         cpumask_clear_cpu(env.dst_cpu, env.cpus);
6056
6057                         /*
6058                          * Go back to "more_balance" rather than "redo" since we
6059                          * need to continue with same src_cpu.
6060                          */
6061                         goto more_balance;
6062                 }
6063
6064                 /* All tasks on this runqueue were pinned by CPU affinity */
6065                 if (unlikely(env.flags & LBF_ALL_PINNED)) {
6066                         cpumask_clear_cpu(cpu_of(busiest), cpus);
6067                         if (!cpumask_empty(cpus)) {
6068                                 env.loop = 0;
6069                                 env.loop_break = sched_nr_migrate_break;
6070                                 goto redo;
6071                         }
6072                         goto out_balanced;
6073                 }
6074         }
6075
6076         if (!ld_moved) {
6077                 schedstat_inc(sd, lb_failed[idle]);
6078                 /*
6079                  * Increment the failure counter only on periodic balance.
6080                  * We do not want newidle balance, which can be very
6081                  * frequent, pollute the failure counter causing
6082                  * excessive cache_hot migrations and active balances.
6083                  */
6084                 if (idle != CPU_NEWLY_IDLE)
6085                         sd->nr_balance_failed++;
6086
6087                 if (need_active_balance(&env)) {
6088                         raw_spin_lock_irqsave(&busiest->lock, flags);
6089
6090                         /* don't kick the active_load_balance_cpu_stop,
6091                          * if the curr task on busiest cpu can't be
6092                          * moved to this_cpu
6093                          */
6094                         if (!cpumask_test_cpu(this_cpu,
6095                                         tsk_cpus_allowed(busiest->curr))) {
6096                                 raw_spin_unlock_irqrestore(&busiest->lock,
6097                                                             flags);
6098                                 env.flags |= LBF_ALL_PINNED;
6099                                 goto out_one_pinned;
6100                         }
6101
6102                         /*
6103                          * ->active_balance synchronizes accesses to
6104                          * ->active_balance_work.  Once set, it's cleared
6105                          * only after active load balance is finished.
6106                          */
6107                         if (!busiest->active_balance) {
6108                                 busiest->active_balance = 1;
6109                                 busiest->push_cpu = this_cpu;
6110                                 active_balance = 1;
6111                         }
6112                         raw_spin_unlock_irqrestore(&busiest->lock, flags);
6113
6114                         if (active_balance) {
6115                                 stop_one_cpu_nowait(cpu_of(busiest),
6116                                         active_load_balance_cpu_stop, busiest,
6117                                         &busiest->active_balance_work);
6118                         }
6119
6120                         /*
6121                          * We've kicked active balancing, reset the failure
6122                          * counter.
6123                          */
6124                         sd->nr_balance_failed = sd->cache_nice_tries+1;
6125                 }
6126         } else
6127                 sd->nr_balance_failed = 0;
6128
6129         if (likely(!active_balance)) {
6130                 /* We were unbalanced, so reset the balancing interval */
6131                 sd->balance_interval = sd->min_interval;
6132         } else {
6133                 /*
6134                  * If we've begun active balancing, start to back off. This
6135                  * case may not be covered by the all_pinned logic if there
6136                  * is only 1 task on the busy runqueue (because we don't call
6137                  * move_tasks).
6138                  */
6139                 if (sd->balance_interval < sd->max_interval)
6140                         sd->balance_interval *= 2;
6141         }
6142
6143         goto out;
6144
6145 out_balanced:
6146         schedstat_inc(sd, lb_balanced[idle]);
6147
6148         sd->nr_balance_failed = 0;
6149
6150 out_one_pinned:
6151         /* tune up the balancing interval */
6152         if (((env.flags & LBF_ALL_PINNED) &&
6153                         sd->balance_interval < MAX_PINNED_INTERVAL) ||
6154                         (sd->balance_interval < sd->max_interval))
6155                 sd->balance_interval *= 2;
6156
6157         ld_moved = 0;
6158 out:
6159         return ld_moved;
6160 }
6161 #ifdef CONFIG_SCHED_HMP
6162 static unsigned int hmp_idle_pull(int this_cpu);
6163 #endif
6164 /*
6165  * idle_balance is called by schedule() if this_cpu is about to become
6166  * idle. Attempts to pull tasks from other CPUs.
6167  */
6168 void idle_balance(int this_cpu, struct rq *this_rq)
6169 {
6170         struct sched_domain *sd;
6171         int pulled_task = 0;
6172         unsigned long next_balance = jiffies + HZ;
6173
6174         this_rq->idle_stamp = this_rq->clock;
6175
6176         if (this_rq->avg_idle < sysctl_sched_migration_cost)
6177                 return;
6178
6179         /*
6180          * Drop the rq->lock, but keep IRQ/preempt disabled.
6181          */
6182         raw_spin_unlock(&this_rq->lock);
6183
6184         update_blocked_averages(this_cpu);
6185         rcu_read_lock();
6186         for_each_domain(this_cpu, sd) {
6187                 unsigned long interval;
6188                 int balance = 1;
6189
6190                 if (!(sd->flags & SD_LOAD_BALANCE))
6191                         continue;
6192
6193                 if (sd->flags & SD_BALANCE_NEWIDLE) {
6194                         /* If we've pulled tasks over stop searching: */
6195                         pulled_task = load_balance(this_cpu, this_rq,
6196                                                    sd, CPU_NEWLY_IDLE, &balance);
6197                 }
6198
6199                 interval = msecs_to_jiffies(sd->balance_interval);
6200                 if (time_after(next_balance, sd->last_balance + interval))
6201                         next_balance = sd->last_balance + interval;
6202                 if (pulled_task) {
6203                         this_rq->idle_stamp = 0;
6204                         break;
6205                 }
6206         }
6207         rcu_read_unlock();
6208 #ifdef CONFIG_SCHED_HMP
6209         if (!pulled_task)
6210                 pulled_task = hmp_idle_pull(this_cpu);
6211 #endif
6212         raw_spin_lock(&this_rq->lock);
6213
6214         if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
6215                 /*
6216                  * We are going idle. next_balance may be set based on
6217                  * a busy processor. So reset next_balance.
6218                  */
6219                 this_rq->next_balance = next_balance;
6220         }
6221 }
6222
6223 /*
6224  * active_load_balance_cpu_stop is run by cpu stopper. It pushes
6225  * running tasks off the busiest CPU onto idle CPUs. It requires at
6226  * least 1 task to be running on each physical CPU where possible, and
6227  * avoids physical / logical imbalances.
6228  */
6229 static int active_load_balance_cpu_stop(void *data)
6230 {
6231         struct rq *busiest_rq = data;
6232         int busiest_cpu = cpu_of(busiest_rq);
6233         int target_cpu = busiest_rq->push_cpu;
6234         struct rq *target_rq = cpu_rq(target_cpu);
6235         struct sched_domain *sd;
6236
6237         raw_spin_lock_irq(&busiest_rq->lock);
6238
6239         /* make sure the requested cpu hasn't gone down in the meantime */
6240         if (unlikely(busiest_cpu != smp_processor_id() ||
6241                      !busiest_rq->active_balance))
6242                 goto out_unlock;
6243
6244         /* Is there any task to move? */
6245         if (busiest_rq->nr_running <= 1)
6246                 goto out_unlock;
6247
6248         /*
6249          * This condition is "impossible", if it occurs
6250          * we need to fix it. Originally reported by
6251          * Bjorn Helgaas on a 128-cpu setup.
6252          */
6253         BUG_ON(busiest_rq == target_rq);
6254
6255         /* move a task from busiest_rq to target_rq */
6256         double_lock_balance(busiest_rq, target_rq);
6257
6258         /* Search for an sd spanning us and the target CPU. */
6259         rcu_read_lock();
6260         for_each_domain(target_cpu, sd) {
6261                 if ((sd->flags & SD_LOAD_BALANCE) &&
6262                     cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
6263                                 break;
6264         }
6265
6266         if (likely(sd)) {
6267                 struct lb_env env = {
6268                         .sd             = sd,
6269                         .dst_cpu        = target_cpu,
6270                         .dst_rq         = target_rq,
6271                         .src_cpu        = busiest_rq->cpu,
6272                         .src_rq         = busiest_rq,
6273                         .idle           = CPU_IDLE,
6274                 };
6275
6276                 schedstat_inc(sd, alb_count);
6277
6278                 if (move_one_task(&env))
6279                         schedstat_inc(sd, alb_pushed);
6280                 else
6281                         schedstat_inc(sd, alb_failed);
6282         }
6283         rcu_read_unlock();
6284         double_unlock_balance(busiest_rq, target_rq);
6285 out_unlock:
6286         busiest_rq->active_balance = 0;
6287         raw_spin_unlock_irq(&busiest_rq->lock);
6288         return 0;
6289 }
6290
6291 #ifdef CONFIG_NO_HZ_COMMON
6292 /*
6293  * idle load balancing details
6294  * - When one of the busy CPUs notice that there may be an idle rebalancing
6295  *   needed, they will kick the idle load balancer, which then does idle
6296  *   load balancing for all the idle CPUs.
6297  */
6298 static struct {
6299         cpumask_var_t idle_cpus_mask;
6300         atomic_t nr_cpus;
6301         unsigned long next_balance;     /* in jiffy units */
6302 } nohz ____cacheline_aligned;
6303
6304 #ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
6305 /*
6306  * Decide if the tasks on the busy CPUs in the
6307  * littlest domain would benefit from an idle balance
6308  */
6309 static int hmp_packing_ilb_needed(int cpu)
6310 {
6311         struct hmp_domain *hmp;
6312         /* always allow ilb on non-slowest domain */
6313         if (!hmp_cpu_is_slowest(cpu))
6314                 return 1;
6315
6316         hmp = hmp_cpu_domain(cpu);
6317         for_each_cpu_and(cpu, &hmp->cpus, nohz.idle_cpus_mask) {
6318                 /* only idle balance if a CPU is loaded over threshold */
6319                 if (cpu_rq(cpu)->avg.load_avg_ratio > hmp_full_threshold)
6320                         return 1;
6321         }
6322         return 0;
6323 }
6324 #endif
6325
6326 static inline int find_new_ilb(int call_cpu)
6327 {
6328         int ilb = cpumask_first(nohz.idle_cpus_mask);
6329 #ifdef CONFIG_SCHED_HMP
6330         int ilb_needed = 1;
6331
6332         /* restrict nohz balancing to occur in the same hmp domain */
6333         ilb = cpumask_first_and(nohz.idle_cpus_mask,
6334                         &((struct hmp_domain *)hmp_cpu_domain(call_cpu))->cpus);
6335
6336 #ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
6337         if (ilb < nr_cpu_ids)
6338                 ilb_needed = hmp_packing_ilb_needed(ilb);
6339 #endif
6340
6341         if (ilb_needed && ilb < nr_cpu_ids && idle_cpu(ilb))
6342                 return ilb;
6343 #else
6344         if (ilb < nr_cpu_ids && idle_cpu(ilb))
6345                 return ilb;
6346 #endif
6347
6348         return nr_cpu_ids;
6349 }
6350
6351 /*
6352  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
6353  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
6354  * CPU (if there is one).
6355  */
6356 static void nohz_balancer_kick(int cpu)
6357 {
6358         int ilb_cpu;
6359
6360         nohz.next_balance++;
6361
6362         ilb_cpu = find_new_ilb(cpu);
6363
6364         if (ilb_cpu >= nr_cpu_ids)
6365                 return;
6366
6367         if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
6368                 return;
6369         /*
6370          * Use smp_send_reschedule() instead of resched_cpu().
6371          * This way we generate a sched IPI on the target cpu which
6372          * is idle. And the softirq performing nohz idle load balance
6373          * will be run before returning from the IPI.
6374          */
6375         smp_send_reschedule(ilb_cpu);
6376         return;
6377 }
6378
6379 static inline void nohz_balance_exit_idle(int cpu)
6380 {
6381         if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
6382                 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
6383                 atomic_dec(&nohz.nr_cpus);
6384                 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
6385         }
6386 }
6387
6388 static inline void set_cpu_sd_state_busy(void)
6389 {
6390         struct sched_domain *sd;
6391         int cpu = smp_processor_id();
6392
6393         rcu_read_lock();
6394         sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
6395
6396         if (!sd || !sd->nohz_idle)
6397                 goto unlock;
6398         sd->nohz_idle = 0;
6399
6400         for (; sd; sd = sd->parent)
6401                 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
6402 unlock:
6403         rcu_read_unlock();
6404 }
6405
6406 void set_cpu_sd_state_idle(void)
6407 {
6408         struct sched_domain *sd;
6409         int cpu = smp_processor_id();
6410
6411         rcu_read_lock();
6412         sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
6413
6414         if (!sd || sd->nohz_idle)
6415                 goto unlock;
6416         sd->nohz_idle = 1;
6417
6418         for (; sd; sd = sd->parent)
6419                 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
6420 unlock:
6421         rcu_read_unlock();
6422 }
6423
6424 /*
6425  * This routine will record that the cpu is going idle with tick stopped.
6426  * This info will be used in performing idle load balancing in the future.
6427  */
6428 void nohz_balance_enter_idle(int cpu)
6429 {
6430         /*
6431          * If this cpu is going down, then nothing needs to be done.
6432          */
6433         if (!cpu_active(cpu))
6434                 return;
6435
6436         if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
6437                 return;
6438
6439         cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
6440         atomic_inc(&nohz.nr_cpus);
6441         set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
6442 }
6443
6444 static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
6445                                         unsigned long action, void *hcpu)
6446 {
6447         switch (action & ~CPU_TASKS_FROZEN) {
6448         case CPU_DYING:
6449                 nohz_balance_exit_idle(smp_processor_id());
6450                 return NOTIFY_OK;
6451         default:
6452                 return NOTIFY_DONE;
6453         }
6454 }
6455 #endif
6456
6457 static DEFINE_SPINLOCK(balancing);
6458
6459 /*
6460  * Scale the max load_balance interval with the number of CPUs in the system.
6461  * This trades load-balance latency on larger machines for less cross talk.
6462  */
6463 void update_max_interval(void)
6464 {
6465         max_load_balance_interval = HZ*num_online_cpus()/10;
6466 }
6467
6468 /*
6469  * It checks each scheduling domain to see if it is due to be balanced,
6470  * and initiates a balancing operation if so.
6471  *
6472  * Balancing parameters are set up in init_sched_domains.
6473  */
6474 static void rebalance_domains(int cpu, enum cpu_idle_type idle)
6475 {
6476         int balance = 1;
6477         struct rq *rq = cpu_rq(cpu);
6478         unsigned long interval;
6479         struct sched_domain *sd;
6480         /* Earliest time when we have to do rebalance again */
6481         unsigned long next_balance = jiffies + 60*HZ;
6482         int update_next_balance = 0;
6483         int need_serialize;
6484
6485         update_blocked_averages(cpu);
6486
6487         rcu_read_lock();
6488         for_each_domain(cpu, sd) {
6489                 if (!(sd->flags & SD_LOAD_BALANCE))
6490                         continue;
6491
6492                 interval = sd->balance_interval;
6493                 if (idle != CPU_IDLE)
6494                         interval *= sd->busy_factor;
6495
6496                 /* scale ms to jiffies */
6497                 interval = msecs_to_jiffies(interval);
6498                 interval = clamp(interval, 1UL, max_load_balance_interval);
6499
6500                 need_serialize = sd->flags & SD_SERIALIZE;
6501
6502                 if (need_serialize) {
6503                         if (!spin_trylock(&balancing))
6504                                 goto out;
6505                 }
6506
6507                 if (time_after_eq(jiffies, sd->last_balance + interval)) {
6508                         if (load_balance(cpu, rq, sd, idle, &balance)) {
6509                                 /*
6510                                  * The LBF_SOME_PINNED logic could have changed
6511                                  * env->dst_cpu, so we can't know our idle
6512                                  * state even if we migrated tasks. Update it.
6513                                  */
6514                                 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
6515                         }
6516                         sd->last_balance = jiffies;
6517                 }
6518                 if (need_serialize)
6519                         spin_unlock(&balancing);
6520 out:
6521                 if (time_after(next_balance, sd->last_balance + interval)) {
6522                         next_balance = sd->last_balance + interval;
6523                         update_next_balance = 1;
6524                 }
6525
6526                 /*
6527                  * Stop the load balance at this level. There is another
6528                  * CPU in our sched group which is doing load balancing more
6529                  * actively.
6530                  */
6531                 if (!balance)
6532                         break;
6533         }
6534         rcu_read_unlock();
6535
6536         /*
6537          * next_balance will be updated only when there is a need.
6538          * When the cpu is attached to null domain for ex, it will not be
6539          * updated.
6540          */
6541         if (likely(update_next_balance))
6542                 rq->next_balance = next_balance;
6543 }
6544
6545 #ifdef CONFIG_NO_HZ_COMMON
6546 /*
6547  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
6548  * rebalancing for all the cpus for whom scheduler ticks are stopped.
6549  */
6550 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
6551 {
6552         struct rq *this_rq = cpu_rq(this_cpu);
6553         struct rq *rq;
6554         int balance_cpu;
6555
6556         if (idle != CPU_IDLE ||
6557             !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
6558                 goto end;
6559
6560         for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
6561                 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
6562                         continue;
6563
6564                 /*
6565                  * If this cpu gets work to do, stop the load balancing
6566                  * work being done for other cpus. Next load
6567                  * balancing owner will pick it up.
6568                  */
6569                 if (need_resched())
6570                         break;
6571
6572                 rq = cpu_rq(balance_cpu);
6573
6574                 raw_spin_lock_irq(&rq->lock);
6575                 update_rq_clock(rq);
6576                 update_idle_cpu_load(rq);
6577                 raw_spin_unlock_irq(&rq->lock);
6578
6579                 rebalance_domains(balance_cpu, CPU_IDLE);
6580
6581                 if (time_after(this_rq->next_balance, rq->next_balance))
6582                         this_rq->next_balance = rq->next_balance;
6583         }
6584         nohz.next_balance = this_rq->next_balance;
6585 end:
6586         clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
6587 }
6588
6589 /*
6590  * Current heuristic for kicking the idle load balancer in the presence
6591  * of an idle cpu is the system.
6592  *   - This rq has more than one task.
6593  *   - At any scheduler domain level, this cpu's scheduler group has multiple
6594  *     busy cpu's exceeding the group's power.
6595  *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
6596  *     domain span are idle.
6597  */
6598 static inline int nohz_kick_needed(struct rq *rq, int cpu)
6599 {
6600         unsigned long now = jiffies;
6601         struct sched_domain *sd;
6602
6603         if (unlikely(idle_cpu(cpu)))
6604                 return 0;
6605
6606        /*
6607         * We may be recently in ticked or tickless idle mode. At the first
6608         * busy tick after returning from idle, we will update the busy stats.
6609         */
6610         set_cpu_sd_state_busy();
6611         nohz_balance_exit_idle(cpu);
6612
6613         /*
6614          * None are in tickless mode and hence no need for NOHZ idle load
6615          * balancing.
6616          */
6617         if (likely(!atomic_read(&nohz.nr_cpus)))
6618                 return 0;
6619
6620         if (time_before(now, nohz.next_balance))
6621                 return 0;
6622
6623 #ifdef CONFIG_SCHED_HMP
6624         /*
6625          * Bail out if there are no nohz CPUs in our
6626          * HMP domain, since we will move tasks between
6627          * domains through wakeup and force balancing
6628          * as necessary based upon task load.
6629          */
6630         if (cpumask_first_and(nohz.idle_cpus_mask,
6631                         &((struct hmp_domain *)hmp_cpu_domain(cpu))->cpus) >= nr_cpu_ids)
6632                 return 0;
6633 #endif
6634
6635         if (rq->nr_running >= 2)
6636                 goto need_kick;
6637
6638         rcu_read_lock();
6639         for_each_domain(cpu, sd) {
6640                 struct sched_group *sg = sd->groups;
6641                 struct sched_group_power *sgp = sg->sgp;
6642                 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
6643
6644                 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
6645                         goto need_kick_unlock;
6646
6647                 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
6648                     && (cpumask_first_and(nohz.idle_cpus_mask,
6649                                           sched_domain_span(sd)) < cpu))
6650                         goto need_kick_unlock;
6651
6652                 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
6653                         break;
6654         }
6655         rcu_read_unlock();
6656         return 0;
6657
6658 need_kick_unlock:
6659         rcu_read_unlock();
6660 need_kick:
6661         return 1;
6662 }
6663 #else
6664 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
6665 #endif
6666
6667 #ifdef CONFIG_SCHED_HMP
6668 /* Check if task should migrate to a faster cpu */
6669 static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se)
6670 {
6671         struct task_struct *p = task_of(se);
6672         int temp_target_cpu;
6673         u64 now;
6674
6675         if (hmp_cpu_is_fastest(cpu))
6676                 return 0;
6677
6678 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
6679         /* Filter by task priority */
6680         if (p->prio >= hmp_up_prio)
6681                 return 0;
6682 #endif
6683         if (se->avg.load_avg_ratio < hmp_up_threshold)
6684                 return 0;
6685
6686         /* Let the task load settle before doing another up migration */
6687         /* hack - always use clock from first online CPU */
6688         now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task;
6689         if (((now - se->avg.hmp_last_up_migration) >> 10)
6690                                         < hmp_next_up_threshold)
6691                 return 0;
6692
6693         /* hmp_domain_min_load only returns 0 for an
6694          * idle CPU or 1023 for any partly-busy one.
6695          * Be explicit about requirement for an idle CPU.
6696          */
6697         if (hmp_domain_min_load(hmp_faster_domain(cpu), &temp_target_cpu,
6698                         tsk_cpus_allowed(p)) == 0 && temp_target_cpu != NR_CPUS) {
6699                 if(target_cpu)
6700                         *target_cpu = temp_target_cpu;
6701                 return 1;
6702         }
6703         return 0;
6704 }
6705
6706 /* Check if task should migrate to a slower cpu */
6707 static unsigned int hmp_down_migration(int cpu, struct sched_entity *se)
6708 {
6709         struct task_struct *p = task_of(se);
6710         u64 now;
6711
6712         if (hmp_cpu_is_slowest(cpu)) {
6713 #ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
6714                 if(hmp_packing_enabled)
6715                         return 1;
6716                 else
6717 #endif
6718                 return 0;
6719         }
6720
6721 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
6722         /* Filter by task priority */
6723         if ((p->prio >= hmp_up_prio) &&
6724                 cpumask_intersects(&hmp_slower_domain(cpu)->cpus,
6725                                         tsk_cpus_allowed(p))) {
6726                 return 1;
6727         }
6728 #endif
6729
6730         /* Let the task load settle before doing another down migration */
6731         /* hack - always use clock from first online CPU */
6732         now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task;
6733         if (((now - se->avg.hmp_last_down_migration) >> 10)
6734                                         < hmp_next_down_threshold)
6735                 return 0;
6736
6737         if (cpumask_intersects(&hmp_slower_domain(cpu)->cpus,
6738                                         tsk_cpus_allowed(p))
6739                 && se->avg.load_avg_ratio < hmp_down_threshold) {
6740                 return 1;
6741         }
6742         return 0;
6743 }
6744
6745 /*
6746  * hmp_can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
6747  * Ideally this function should be merged with can_migrate_task() to avoid
6748  * redundant code.
6749  */
6750 static int hmp_can_migrate_task(struct task_struct *p, struct lb_env *env)
6751 {
6752         int tsk_cache_hot = 0;
6753
6754         /*
6755          * We do not migrate tasks that are:
6756          * 1) running (obviously), or
6757          * 2) cannot be migrated to this CPU due to cpus_allowed
6758          */
6759         if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
6760                 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
6761                 return 0;
6762         }
6763         env->flags &= ~LBF_ALL_PINNED;
6764
6765         if (task_running(env->src_rq, p)) {
6766                 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
6767                 return 0;
6768         }
6769
6770         /*
6771          * Aggressive migration if:
6772          * 1) task is cache cold, or
6773          * 2) too many balance attempts have failed.
6774          */
6775
6776         tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
6777         if (!tsk_cache_hot ||
6778                 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
6779 #ifdef CONFIG_SCHEDSTATS
6780                 if (tsk_cache_hot) {
6781                         schedstat_inc(env->sd, lb_hot_gained[env->idle]);
6782                         schedstat_inc(p, se.statistics.nr_forced_migrations);
6783                 }
6784 #endif
6785                 return 1;
6786         }
6787
6788         return 1;
6789 }
6790
6791 /*
6792  * move_specific_task tries to move a specific task.
6793  * Returns 1 if successful and 0 otherwise.
6794  * Called with both runqueues locked.
6795  */
6796 static int move_specific_task(struct lb_env *env, struct task_struct *pm)
6797 {
6798         struct task_struct *p, *n;
6799
6800         list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
6801         if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
6802                                 env->dst_cpu))
6803                 continue;
6804
6805                 if (!hmp_can_migrate_task(p, env))
6806                         continue;
6807                 /* Check if we found the right task */
6808                 if (p != pm)
6809                         continue;
6810
6811                 move_task(p, env);
6812                 /*
6813                  * Right now, this is only the third place move_task()
6814                  * is called, so we can safely collect move_task()
6815                  * stats here rather than inside move_task().
6816                  */
6817                 schedstat_inc(env->sd, lb_gained[env->idle]);
6818                 return 1;
6819         }
6820         return 0;
6821 }
6822
6823 /*
6824  * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to
6825  * migrate a specific task from one runqueue to another.
6826  * hmp_force_up_migration uses this to push a currently running task
6827  * off a runqueue.
6828  * Based on active_load_balance_stop_cpu and can potentially be merged.
6829  */
6830 static int hmp_active_task_migration_cpu_stop(void *data)
6831 {
6832         struct rq *busiest_rq = data;
6833         struct task_struct *p = busiest_rq->migrate_task;
6834         int busiest_cpu = cpu_of(busiest_rq);
6835         int target_cpu = busiest_rq->push_cpu;
6836         struct rq *target_rq = cpu_rq(target_cpu);
6837         struct sched_domain *sd;
6838
6839         raw_spin_lock_irq(&busiest_rq->lock);
6840         /* make sure the requested cpu hasn't gone down in the meantime */
6841         if (unlikely(busiest_cpu != smp_processor_id() ||
6842                 !busiest_rq->active_balance)) {
6843                 goto out_unlock;
6844         }
6845         /* Is there any task to move? */
6846         if (busiest_rq->nr_running <= 1)
6847                 goto out_unlock;
6848         /* Task has migrated meanwhile, abort forced migration */
6849         if (task_rq(p) != busiest_rq)
6850                 goto out_unlock;
6851         /*
6852          * This condition is "impossible", if it occurs
6853          * we need to fix it. Originally reported by
6854          * Bjorn Helgaas on a 128-cpu setup.
6855          */
6856         BUG_ON(busiest_rq == target_rq);
6857
6858         /* move a task from busiest_rq to target_rq */
6859         double_lock_balance(busiest_rq, target_rq);
6860
6861         /* Search for an sd spanning us and the target CPU. */
6862         rcu_read_lock();
6863         for_each_domain(target_cpu, sd) {
6864                 if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
6865                         break;
6866         }
6867
6868         if (likely(sd)) {
6869                 struct lb_env env = {
6870                         .sd             = sd,
6871                         .dst_cpu        = target_cpu,
6872                         .dst_rq         = target_rq,
6873                         .src_cpu        = busiest_rq->cpu,
6874                         .src_rq         = busiest_rq,
6875                         .idle           = CPU_IDLE,
6876                 };
6877
6878                 schedstat_inc(sd, alb_count);
6879
6880                 if (move_specific_task(&env, p))
6881                         schedstat_inc(sd, alb_pushed);
6882                 else
6883                         schedstat_inc(sd, alb_failed);
6884         }
6885         rcu_read_unlock();
6886         double_unlock_balance(busiest_rq, target_rq);
6887 out_unlock:
6888         put_task_struct(p);
6889         busiest_rq->active_balance = 0;
6890         raw_spin_unlock_irq(&busiest_rq->lock);
6891         return 0;
6892 }
6893
6894 /*
6895  * hmp_idle_pull_cpu_stop is run by cpu stopper and used to
6896  * migrate a specific task from one runqueue to another.
6897  * hmp_idle_pull uses this to push a currently running task
6898  * off a runqueue to a faster CPU.
6899  * Locking is slightly different than usual.
6900  * Based on active_load_balance_stop_cpu and can potentially be merged.
6901  */
6902 static int hmp_idle_pull_cpu_stop(void *data)
6903 {
6904         struct rq *busiest_rq = data;
6905         struct task_struct *p = busiest_rq->migrate_task;
6906         int busiest_cpu = cpu_of(busiest_rq);
6907         int target_cpu = busiest_rq->push_cpu;
6908         struct rq *target_rq = cpu_rq(target_cpu);
6909         struct sched_domain *sd;
6910
6911         raw_spin_lock_irq(&busiest_rq->lock);
6912
6913         /* make sure the requested cpu hasn't gone down in the meantime */
6914         if (unlikely(busiest_cpu != smp_processor_id() ||
6915                 !busiest_rq->active_balance))
6916                 goto out_unlock;
6917
6918         /* Is there any task to move? */
6919         if (busiest_rq->nr_running <= 1)
6920                 goto out_unlock;
6921
6922         /* Task has migrated meanwhile, abort forced migration */
6923         if (task_rq(p) != busiest_rq)
6924                 goto out_unlock;
6925
6926         /*
6927          * This condition is "impossible", if it occurs
6928          * we need to fix it. Originally reported by
6929          * Bjorn Helgaas on a 128-cpu setup.
6930          */
6931         BUG_ON(busiest_rq == target_rq);
6932
6933         /* move a task from busiest_rq to target_rq */
6934         double_lock_balance(busiest_rq, target_rq);
6935
6936         /* Search for an sd spanning us and the target CPU. */
6937         rcu_read_lock();
6938         for_each_domain(target_cpu, sd) {
6939                 if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
6940                         break;
6941         }
6942         if (likely(sd)) {
6943                 struct lb_env env = {
6944                         .sd             = sd,
6945                         .dst_cpu        = target_cpu,
6946                         .dst_rq         = target_rq,
6947                         .src_cpu        = busiest_rq->cpu,
6948                         .src_rq         = busiest_rq,
6949                         .idle           = CPU_IDLE,
6950                 };
6951
6952                 schedstat_inc(sd, alb_count);
6953
6954                 if (move_specific_task(&env, p))
6955                         schedstat_inc(sd, alb_pushed);
6956                 else
6957                         schedstat_inc(sd, alb_failed);
6958         }
6959         rcu_read_unlock();
6960         double_unlock_balance(busiest_rq, target_rq);
6961 out_unlock:
6962         put_task_struct(p);
6963         busiest_rq->active_balance = 0;
6964         raw_spin_unlock_irq(&busiest_rq->lock);
6965         return 0;
6966 }
6967
6968 /*
6969  * Move task in a runnable state to another CPU.
6970  *
6971  * Tailored on 'active_load_balance_stop_cpu' with slight
6972  * modification to locking and pre-transfer checks.  Note
6973  * rq->lock must be held before calling.
6974  */
6975 static void hmp_migrate_runnable_task(struct rq *rq)
6976 {
6977         struct sched_domain *sd;
6978         int src_cpu = cpu_of(rq);
6979         struct rq *src_rq = rq;
6980         int dst_cpu = rq->push_cpu;
6981         struct rq *dst_rq = cpu_rq(dst_cpu);
6982         struct task_struct *p = rq->migrate_task;
6983         /*
6984          * One last check to make sure nobody else is playing
6985          * with the source rq.
6986          */
6987         if (src_rq->active_balance)
6988                 return;
6989
6990         if (src_rq->nr_running <= 1)
6991                 return;
6992
6993         if (task_rq(p) != src_rq)
6994                 return;
6995         /*
6996          * Not sure if this applies here but one can never
6997          * be too cautious
6998          */
6999         BUG_ON(src_rq == dst_rq);
7000
7001         double_lock_balance(src_rq, dst_rq);
7002
7003         rcu_read_lock();
7004         for_each_domain(dst_cpu, sd) {
7005                 if (cpumask_test_cpu(src_cpu, sched_domain_span(sd)))
7006                         break;
7007         }
7008
7009         if (likely(sd)) {
7010                 struct lb_env env = {
7011                         .sd             = sd,
7012                         .dst_cpu        = dst_cpu,
7013                         .dst_rq         = dst_rq,
7014                         .src_cpu        = src_cpu,
7015                         .src_rq         = src_rq,
7016                         .idle           = CPU_IDLE,
7017                 };
7018
7019                 schedstat_inc(sd, alb_count);
7020
7021                 if (move_specific_task(&env, p))
7022                         schedstat_inc(sd, alb_pushed);
7023                 else
7024                         schedstat_inc(sd, alb_failed);
7025         }
7026
7027         rcu_read_unlock();
7028         double_unlock_balance(src_rq, dst_rq);
7029 }
7030
7031 static DEFINE_SPINLOCK(hmp_force_migration);
7032
7033 /*
7034  * hmp_force_up_migration checks runqueues for tasks that need to
7035  * be actively migrated to a faster cpu.
7036  */
7037 static void hmp_force_up_migration(int this_cpu)
7038 {
7039         int cpu, target_cpu;
7040         struct sched_entity *curr, *orig;
7041         struct rq *target;
7042         unsigned long flags;
7043         unsigned int force, got_target;
7044         struct task_struct *p;
7045
7046         if (!spin_trylock(&hmp_force_migration))
7047                 return;
7048         for_each_online_cpu(cpu) {
7049                 force = 0;
7050                 got_target = 0;
7051                 target = cpu_rq(cpu);
7052                 raw_spin_lock_irqsave(&target->lock, flags);
7053                 curr = target->cfs.curr;
7054                 if (!curr) {
7055                         raw_spin_unlock_irqrestore(&target->lock, flags);
7056                         continue;
7057                 }
7058                 if (!entity_is_task(curr)) {
7059                         struct cfs_rq *cfs_rq;
7060
7061                         cfs_rq = group_cfs_rq(curr);
7062                         while (cfs_rq) {
7063                                 curr = cfs_rq->curr;
7064                                 cfs_rq = group_cfs_rq(curr);
7065                         }
7066                 }
7067                 orig = curr;
7068                 curr = hmp_get_heaviest_task(curr, 1);
7069                 p = task_of(curr);
7070                 if (hmp_up_migration(cpu, &target_cpu, curr)) {
7071                         if (!target->active_balance) {
7072                                 get_task_struct(p);
7073                                 target->push_cpu = target_cpu;
7074                                 target->migrate_task = p;
7075                                 got_target = 1;
7076                                 trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_FORCE);
7077                                 hmp_next_up_delay(&p->se, target->push_cpu);
7078                         }
7079                 }
7080                 if (!got_target && !target->active_balance) {
7081                         /*
7082                          * For now we just check the currently running task.
7083                          * Selecting the lightest task for offloading will
7084                          * require extensive book keeping.
7085                          */
7086                         curr = hmp_get_lightest_task(orig, 1);
7087                         p = task_of(curr);
7088                         target->push_cpu = hmp_offload_down(cpu, curr);
7089                         if (target->push_cpu < NR_CPUS) {
7090                                 get_task_struct(p);
7091                                 target->migrate_task = p;
7092                                 got_target = 1;
7093                                 trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_OFFLOAD);
7094                                 hmp_next_down_delay(&p->se, target->push_cpu);
7095                         }
7096                 }
7097                 /*
7098                  * We have a target with no active_balance.  If the task
7099                  * is not currently running move it, otherwise let the
7100                  * CPU stopper take care of it.
7101                  */
7102                 if (got_target && !target->active_balance) {
7103                         if (!task_running(target, p)) {
7104                                 trace_sched_hmp_migrate_force_running(p, 0);
7105                                 hmp_migrate_runnable_task(target);
7106                         } else {
7107                                 target->active_balance = 1;
7108                                 force = 1;
7109                         }
7110                 }
7111
7112                 raw_spin_unlock_irqrestore(&target->lock, flags);
7113
7114                 if (force)
7115                         stop_one_cpu_nowait(cpu_of(target),
7116                                 hmp_active_task_migration_cpu_stop,
7117                                 target, &target->active_balance_work);
7118         }
7119         spin_unlock(&hmp_force_migration);
7120 }
7121 /*
7122  * hmp_idle_pull looks at little domain runqueues to see
7123  * if a task should be pulled.
7124  *
7125  * Reuses hmp_force_migration spinlock.
7126  *
7127  */
7128 static unsigned int hmp_idle_pull(int this_cpu)
7129 {
7130         int cpu;
7131         struct sched_entity *curr, *orig;
7132         struct hmp_domain *hmp_domain = NULL;
7133         struct rq *target = NULL, *rq;
7134         unsigned long flags, ratio = 0;
7135         unsigned int force = 0;
7136         struct task_struct *p = NULL;
7137
7138         if (!hmp_cpu_is_slowest(this_cpu))
7139                 hmp_domain = hmp_slower_domain(this_cpu);
7140         if (!hmp_domain)
7141                 return 0;
7142
7143         if (!spin_trylock(&hmp_force_migration))
7144                 return 0;
7145
7146         /* first select a task */
7147         for_each_cpu(cpu, &hmp_domain->cpus) {
7148                 rq = cpu_rq(cpu);
7149                 raw_spin_lock_irqsave(&rq->lock, flags);
7150                 curr = rq->cfs.curr;
7151                 if (!curr) {
7152                         raw_spin_unlock_irqrestore(&rq->lock, flags);
7153                         continue;
7154                 }
7155                 if (!entity_is_task(curr)) {
7156                         struct cfs_rq *cfs_rq;
7157
7158                         cfs_rq = group_cfs_rq(curr);
7159                         while (cfs_rq) {
7160                                 curr = cfs_rq->curr;
7161                                 if (!entity_is_task(curr))
7162                                         cfs_rq = group_cfs_rq(curr);
7163                                 else
7164                                         cfs_rq = NULL;
7165                         }
7166                 }
7167                 orig = curr;
7168                 curr = hmp_get_heaviest_task(curr, 1);
7169                 if (curr->avg.load_avg_ratio > hmp_up_threshold &&
7170                         curr->avg.load_avg_ratio > ratio) {
7171                         p = task_of(curr);
7172                         target = rq;
7173                         ratio = curr->avg.load_avg_ratio;
7174                 }
7175                 raw_spin_unlock_irqrestore(&rq->lock, flags);
7176         }
7177
7178         if (!p)
7179                 goto done;
7180
7181         /* now we have a candidate */
7182         raw_spin_lock_irqsave(&target->lock, flags);
7183         if (!target->active_balance && task_rq(p) == target) {
7184                 get_task_struct(p);
7185                 target->push_cpu = this_cpu;
7186                 target->migrate_task = p;
7187                 trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_IDLE_PULL);
7188                 hmp_next_up_delay(&p->se, target->push_cpu);
7189                 /*
7190                  * if the task isn't running move it right away.
7191                  * Otherwise setup the active_balance mechanic and let
7192                  * the CPU stopper do its job.
7193                  */
7194                 if (!task_running(target, p)) {
7195                         trace_sched_hmp_migrate_idle_running(p, 0);
7196                         hmp_migrate_runnable_task(target);
7197                 } else {
7198                         target->active_balance = 1;
7199                         force = 1;
7200                 }
7201         }
7202         raw_spin_unlock_irqrestore(&target->lock, flags);
7203
7204         if (force) {
7205                 stop_one_cpu_nowait(cpu_of(target),
7206                         hmp_idle_pull_cpu_stop,
7207                         target, &target->active_balance_work);
7208         }
7209 done:
7210         spin_unlock(&hmp_force_migration);
7211         return force;
7212 }
7213 #else
7214 static void hmp_force_up_migration(int this_cpu) { }
7215 #endif /* CONFIG_SCHED_HMP */
7216
7217 /*
7218  * run_rebalance_domains is triggered when needed from the scheduler tick.
7219  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
7220  */
7221 static void run_rebalance_domains(struct softirq_action *h)
7222 {
7223         int this_cpu = smp_processor_id();
7224         struct rq *this_rq = cpu_rq(this_cpu);
7225         enum cpu_idle_type idle = this_rq->idle_balance ?
7226                                                 CPU_IDLE : CPU_NOT_IDLE;
7227
7228         hmp_force_up_migration(this_cpu);
7229
7230         rebalance_domains(this_cpu, idle);
7231
7232         /*
7233          * If this cpu has a pending nohz_balance_kick, then do the
7234          * balancing on behalf of the other idle cpus whose ticks are
7235          * stopped.
7236          */
7237         nohz_idle_balance(this_cpu, idle);
7238 }
7239
7240 static inline int on_null_domain(int cpu)
7241 {
7242         return !rcu_dereference_sched(cpu_rq(cpu)->sd);
7243 }
7244
7245 /*
7246  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
7247  */
7248 void trigger_load_balance(struct rq *rq, int cpu)
7249 {
7250         /* Don't need to rebalance while attached to NULL domain */
7251         if (time_after_eq(jiffies, rq->next_balance) &&
7252             likely(!on_null_domain(cpu)))
7253                 raise_softirq(SCHED_SOFTIRQ);
7254 #ifdef CONFIG_NO_HZ_COMMON
7255         if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
7256                 nohz_balancer_kick(cpu);
7257 #endif
7258 }
7259
7260 static void rq_online_fair(struct rq *rq)
7261 {
7262 #ifdef CONFIG_SCHED_HMP
7263         hmp_online_cpu(rq->cpu);
7264 #endif
7265         update_sysctl();
7266 }
7267
7268 static void rq_offline_fair(struct rq *rq)
7269 {
7270 #ifdef CONFIG_SCHED_HMP
7271         hmp_offline_cpu(rq->cpu);
7272 #endif
7273         update_sysctl();
7274
7275         /* Ensure any throttled groups are reachable by pick_next_task */
7276         unthrottle_offline_cfs_rqs(rq);
7277 }
7278
7279 #endif /* CONFIG_SMP */
7280
7281 /*
7282  * scheduler tick hitting a task of our scheduling class:
7283  */
7284 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
7285 {
7286         struct cfs_rq *cfs_rq;
7287         struct sched_entity *se = &curr->se;
7288
7289         for_each_sched_entity(se) {
7290                 cfs_rq = cfs_rq_of(se);
7291                 entity_tick(cfs_rq, se, queued);
7292         }
7293
7294         if (sched_feat_numa(NUMA))
7295                 task_tick_numa(rq, curr);
7296
7297         update_rq_runnable_avg(rq, 1);
7298 }
7299
7300 /*
7301  * called on fork with the child task as argument from the parent's context
7302  *  - child not yet on the tasklist
7303  *  - preemption disabled
7304  */
7305 static void task_fork_fair(struct task_struct *p)
7306 {
7307         struct cfs_rq *cfs_rq;
7308         struct sched_entity *se = &p->se, *curr;
7309         int this_cpu = smp_processor_id();
7310         struct rq *rq = this_rq();
7311         unsigned long flags;
7312
7313         raw_spin_lock_irqsave(&rq->lock, flags);
7314
7315         update_rq_clock(rq);
7316
7317         cfs_rq = task_cfs_rq(current);
7318         curr = cfs_rq->curr;
7319
7320         /*
7321          * Not only the cpu but also the task_group of the parent might have
7322          * been changed after parent->se.parent,cfs_rq were copied to
7323          * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
7324          * of child point to valid ones.
7325          */
7326         rcu_read_lock();
7327         __set_task_cpu(p, this_cpu);
7328         rcu_read_unlock();
7329
7330         update_curr(cfs_rq);
7331
7332         if (curr)
7333                 se->vruntime = curr->vruntime;
7334         place_entity(cfs_rq, se, 1);
7335
7336         if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
7337                 /*
7338                  * Upon rescheduling, sched_class::put_prev_task() will place
7339                  * 'current' within the tree based on its new key value.
7340                  */
7341                 swap(curr->vruntime, se->vruntime);
7342                 resched_task(rq->curr);
7343         }
7344
7345         se->vruntime -= cfs_rq->min_vruntime;
7346
7347         raw_spin_unlock_irqrestore(&rq->lock, flags);
7348 }
7349
7350 /*
7351  * Priority of the task has changed. Check to see if we preempt
7352  * the current task.
7353  */
7354 static void
7355 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
7356 {
7357         if (!p->se.on_rq)
7358                 return;
7359
7360         /*
7361          * Reschedule if we are currently running on this runqueue and
7362          * our priority decreased, or if we are not currently running on
7363          * this runqueue and our priority is higher than the current's
7364          */
7365         if (rq->curr == p) {
7366                 if (p->prio > oldprio)
7367                         resched_task(rq->curr);
7368         } else
7369                 check_preempt_curr(rq, p, 0);
7370 }
7371
7372 static void switched_from_fair(struct rq *rq, struct task_struct *p)
7373 {
7374         struct sched_entity *se = &p->se;
7375         struct cfs_rq *cfs_rq = cfs_rq_of(se);
7376
7377         /*
7378          * Ensure the task's vruntime is normalized, so that when its
7379          * switched back to the fair class the enqueue_entity(.flags=0) will
7380          * do the right thing.
7381          *
7382          * If it was on_rq, then the dequeue_entity(.flags=0) will already
7383          * have normalized the vruntime, if it was !on_rq, then only when
7384          * the task is sleeping will it still have non-normalized vruntime.
7385          */
7386         if (!se->on_rq && p->state != TASK_RUNNING) {
7387                 /*
7388                  * Fix up our vruntime so that the current sleep doesn't
7389                  * cause 'unlimited' sleep bonus.
7390                  */
7391                 place_entity(cfs_rq, se, 0);
7392                 se->vruntime -= cfs_rq->min_vruntime;
7393         }
7394
7395 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
7396         /*
7397         * Remove our load from contribution when we leave sched_fair
7398         * and ensure we don't carry in an old decay_count if we
7399         * switch back.
7400         */
7401         if (p->se.avg.decay_count) {
7402                 struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
7403                 __synchronize_entity_decay(&p->se);
7404                 subtract_blocked_load_contrib(cfs_rq,
7405                                 p->se.avg.load_avg_contrib);
7406         }
7407 #endif
7408 }
7409
7410 /*
7411  * We switched to the sched_fair class.
7412  */
7413 static void switched_to_fair(struct rq *rq, struct task_struct *p)
7414 {
7415         if (!p->se.on_rq)
7416                 return;
7417
7418         /*
7419          * We were most likely switched from sched_rt, so
7420          * kick off the schedule if running, otherwise just see
7421          * if we can still preempt the current task.
7422          */
7423         if (rq->curr == p)
7424                 resched_task(rq->curr);
7425         else
7426                 check_preempt_curr(rq, p, 0);
7427 }
7428
7429 /* Account for a task changing its policy or group.
7430  *
7431  * This routine is mostly called to set cfs_rq->curr field when a task
7432  * migrates between groups/classes.
7433  */
7434 static void set_curr_task_fair(struct rq *rq)
7435 {
7436         struct sched_entity *se = &rq->curr->se;
7437
7438         for_each_sched_entity(se) {
7439                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
7440
7441                 set_next_entity(cfs_rq, se);
7442                 /* ensure bandwidth has been allocated on our new cfs_rq */
7443                 account_cfs_rq_runtime(cfs_rq, 0);
7444         }
7445 }
7446
7447 void init_cfs_rq(struct cfs_rq *cfs_rq)
7448 {
7449         cfs_rq->tasks_timeline = RB_ROOT;
7450         cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7451 #ifndef CONFIG_64BIT
7452         cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
7453 #endif
7454 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
7455         atomic64_set(&cfs_rq->decay_counter, 1);
7456         atomic64_set(&cfs_rq->removed_load, 0);
7457 #endif
7458 }
7459
7460 #ifdef CONFIG_FAIR_GROUP_SCHED
7461 static void task_move_group_fair(struct task_struct *p, int on_rq)
7462 {
7463         struct cfs_rq *cfs_rq;
7464         /*
7465          * If the task was not on the rq at the time of this cgroup movement
7466          * it must have been asleep, sleeping tasks keep their ->vruntime
7467          * absolute on their old rq until wakeup (needed for the fair sleeper
7468          * bonus in place_entity()).
7469          *
7470          * If it was on the rq, we've just 'preempted' it, which does convert
7471          * ->vruntime to a relative base.
7472          *
7473          * Make sure both cases convert their relative position when migrating
7474          * to another cgroup's rq. This does somewhat interfere with the
7475          * fair sleeper stuff for the first placement, but who cares.
7476          */
7477         /*
7478          * When !on_rq, vruntime of the task has usually NOT been normalized.
7479          * But there are some cases where it has already been normalized:
7480          *
7481          * - Moving a forked child which is waiting for being woken up by
7482          *   wake_up_new_task().
7483          * - Moving a task which has been woken up by try_to_wake_up() and
7484          *   waiting for actually being woken up by sched_ttwu_pending().
7485          *
7486          * To prevent boost or penalty in the new cfs_rq caused by delta
7487          * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
7488          */
7489         if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
7490                 on_rq = 1;
7491
7492         if (!on_rq)
7493                 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
7494         set_task_rq(p, task_cpu(p));
7495         if (!on_rq) {
7496                 cfs_rq = cfs_rq_of(&p->se);
7497                 p->se.vruntime += cfs_rq->min_vruntime;
7498 #ifdef CONFIG_SMP
7499                 /*
7500                  * migrate_task_rq_fair() will have removed our previous
7501                  * contribution, but we must synchronize for ongoing future
7502                  * decay.
7503                  */
7504                 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
7505                 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
7506 #endif
7507         }
7508 }
7509
7510 void free_fair_sched_group(struct task_group *tg)
7511 {
7512         int i;
7513
7514         destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
7515
7516         for_each_possible_cpu(i) {
7517                 if (tg->cfs_rq)
7518                         kfree(tg->cfs_rq[i]);
7519                 if (tg->se)
7520                         kfree(tg->se[i]);
7521         }
7522
7523         kfree(tg->cfs_rq);
7524         kfree(tg->se);
7525 }
7526
7527 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7528 {
7529         struct cfs_rq *cfs_rq;
7530         struct sched_entity *se;
7531         int i;
7532
7533         tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
7534         if (!tg->cfs_rq)
7535                 goto err;
7536         tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
7537         if (!tg->se)
7538                 goto err;
7539
7540         tg->shares = NICE_0_LOAD;
7541
7542         init_cfs_bandwidth(tg_cfs_bandwidth(tg));
7543
7544         for_each_possible_cpu(i) {
7545                 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
7546                                       GFP_KERNEL, cpu_to_node(i));
7547                 if (!cfs_rq)
7548                         goto err;
7549
7550                 se = kzalloc_node(sizeof(struct sched_entity),
7551                                   GFP_KERNEL, cpu_to_node(i));
7552                 if (!se)
7553                         goto err_free_rq;
7554
7555                 init_cfs_rq(cfs_rq);
7556                 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
7557         }
7558
7559         return 1;
7560
7561 err_free_rq:
7562         kfree(cfs_rq);
7563 err:
7564         return 0;
7565 }
7566
7567 void unregister_fair_sched_group(struct task_group *tg, int cpu)
7568 {
7569         struct rq *rq = cpu_rq(cpu);
7570         unsigned long flags;
7571
7572         /*
7573         * Only empty task groups can be destroyed; so we can speculatively
7574         * check on_list without danger of it being re-added.
7575         */
7576         if (!tg->cfs_rq[cpu]->on_list)
7577                 return;
7578
7579         raw_spin_lock_irqsave(&rq->lock, flags);
7580         list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
7581         raw_spin_unlock_irqrestore(&rq->lock, flags);
7582 }
7583
7584 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7585                         struct sched_entity *se, int cpu,
7586                         struct sched_entity *parent)
7587 {
7588         struct rq *rq = cpu_rq(cpu);
7589
7590         cfs_rq->tg = tg;
7591         cfs_rq->rq = rq;
7592         init_cfs_rq_runtime(cfs_rq);
7593
7594         tg->cfs_rq[cpu] = cfs_rq;
7595         tg->se[cpu] = se;
7596
7597         /* se could be NULL for root_task_group */
7598         if (!se)
7599                 return;
7600
7601         if (!parent)
7602                 se->cfs_rq = &rq->cfs;
7603         else
7604                 se->cfs_rq = parent->my_q;
7605
7606         se->my_q = cfs_rq;
7607         update_load_set(&se->load, 0);
7608         se->parent = parent;
7609 }
7610
7611 static DEFINE_MUTEX(shares_mutex);
7612
7613 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7614 {
7615         int i;
7616         unsigned long flags;
7617
7618         /*
7619          * We can't change the weight of the root cgroup.
7620          */
7621         if (!tg->se[0])
7622                 return -EINVAL;
7623
7624         shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
7625
7626         mutex_lock(&shares_mutex);
7627         if (tg->shares == shares)
7628                 goto done;
7629
7630         tg->shares = shares;
7631         for_each_possible_cpu(i) {
7632                 struct rq *rq = cpu_rq(i);
7633                 struct sched_entity *se;
7634
7635                 se = tg->se[i];
7636                 /* Propagate contribution to hierarchy */
7637                 raw_spin_lock_irqsave(&rq->lock, flags);
7638                 for_each_sched_entity(se)
7639                         update_cfs_shares(group_cfs_rq(se));
7640                 raw_spin_unlock_irqrestore(&rq->lock, flags);
7641         }
7642
7643 done:
7644         mutex_unlock(&shares_mutex);
7645         return 0;
7646 }
7647 #else /* CONFIG_FAIR_GROUP_SCHED */
7648
7649 void free_fair_sched_group(struct task_group *tg) { }
7650
7651 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7652 {
7653         return 1;
7654 }
7655
7656 void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
7657
7658 #endif /* CONFIG_FAIR_GROUP_SCHED */
7659
7660
7661 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
7662 {
7663         struct sched_entity *se = &task->se;
7664         unsigned int rr_interval = 0;
7665
7666         /*
7667          * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
7668          * idle runqueue:
7669          */
7670         if (rq->cfs.load.weight)
7671                 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
7672
7673         return rr_interval;
7674 }
7675
7676 /*
7677  * All the scheduling class methods:
7678  */
7679 const struct sched_class fair_sched_class = {
7680         .next                   = &idle_sched_class,
7681         .enqueue_task           = enqueue_task_fair,
7682         .dequeue_task           = dequeue_task_fair,
7683         .yield_task             = yield_task_fair,
7684         .yield_to_task          = yield_to_task_fair,
7685
7686         .check_preempt_curr     = check_preempt_wakeup,
7687
7688         .pick_next_task         = pick_next_task_fair,
7689         .put_prev_task          = put_prev_task_fair,
7690
7691 #ifdef CONFIG_SMP
7692         .select_task_rq         = select_task_rq_fair,
7693 #ifdef CONFIG_FAIR_GROUP_SCHED
7694         .migrate_task_rq        = migrate_task_rq_fair,
7695 #endif
7696         .rq_online              = rq_online_fair,
7697         .rq_offline             = rq_offline_fair,
7698
7699         .task_waking            = task_waking_fair,
7700 #endif
7701
7702         .set_curr_task          = set_curr_task_fair,
7703         .task_tick              = task_tick_fair,
7704         .task_fork              = task_fork_fair,
7705
7706         .prio_changed           = prio_changed_fair,
7707         .switched_from          = switched_from_fair,
7708         .switched_to            = switched_to_fair,
7709
7710         .get_rr_interval        = get_rr_interval_fair,
7711
7712 #ifdef CONFIG_FAIR_GROUP_SCHED
7713         .task_move_group        = task_move_group_fair,
7714 #endif
7715 };
7716
7717 #ifdef CONFIG_SCHED_DEBUG
7718 void print_cfs_stats(struct seq_file *m, int cpu)
7719 {
7720         struct cfs_rq *cfs_rq;
7721
7722         rcu_read_lock();
7723         for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
7724                 print_cfs_rq(m, cpu, cfs_rq);
7725         rcu_read_unlock();
7726 }
7727 #endif
7728
7729 __init void init_sched_fair_class(void)
7730 {
7731 #ifdef CONFIG_SMP
7732         open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
7733
7734 #ifdef CONFIG_NO_HZ_COMMON
7735         nohz.next_balance = jiffies;
7736         zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
7737         cpu_notifier(sched_ilb_notifier, 0);
7738 #endif
7739
7740 #ifdef CONFIG_SCHED_HMP
7741         hmp_cpu_mask_setup();
7742 #endif
7743 #endif /* SMP */
7744
7745 }
7746
7747 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
7748 static u32 cpufreq_calc_scale(u32 min, u32 max, u32 curr)
7749 {
7750         u32 result = curr / max;
7751         return result;
7752 }
7753
7754 /* Called when the CPU Frequency is changed.
7755  * Once for each CPU.
7756  */
7757 static int cpufreq_callback(struct notifier_block *nb,
7758                                         unsigned long val, void *data)
7759 {
7760         struct cpufreq_freqs *freq = data;
7761         int cpu = freq->cpu;
7762         struct cpufreq_extents *extents;
7763
7764         if (freq->flags & CPUFREQ_CONST_LOOPS)
7765                 return NOTIFY_OK;
7766
7767         if (val != CPUFREQ_POSTCHANGE)
7768                 return NOTIFY_OK;
7769
7770         /* if dynamic load scale is disabled, set the load scale to 1.0 */
7771         if (!hmp_data.freqinvar_load_scale_enabled) {
7772                 freq_scale[cpu].curr_scale = 1024;
7773                 return NOTIFY_OK;
7774         }
7775
7776         extents = &freq_scale[cpu];
7777         if (extents->flags & SCHED_LOAD_FREQINVAR_SINGLEFREQ) {
7778                 /* If our governor was recognised as a single-freq governor,
7779                  * use 1.0
7780                  */
7781                 extents->curr_scale = 1024;
7782         } else {
7783                 extents->curr_scale = cpufreq_calc_scale(extents->min,
7784                                 extents->max, freq->new);
7785         }
7786
7787         return NOTIFY_OK;
7788 }
7789
7790 /* Called when the CPUFreq governor is changed.
7791  * Only called for the CPUs which are actually changed by the
7792  * userspace.
7793  */
7794 static int cpufreq_policy_callback(struct notifier_block *nb,
7795                                        unsigned long event, void *data)
7796 {
7797         struct cpufreq_policy *policy = data;
7798         struct cpufreq_extents *extents;
7799         int cpu, singleFreq = 0;
7800         static const char performance_governor[] = "performance";
7801         static const char powersave_governor[] = "powersave";
7802
7803         if (event == CPUFREQ_START)
7804                 return 0;
7805
7806         if (event != CPUFREQ_INCOMPATIBLE)
7807                 return 0;
7808
7809         /* CPUFreq governors do not accurately report the range of
7810          * CPU Frequencies they will choose from.
7811          * We recognise performance and powersave governors as
7812          * single-frequency only.
7813          */
7814         if (!strncmp(policy->governor->name, performance_governor,
7815                         strlen(performance_governor)) ||
7816                 !strncmp(policy->governor->name, powersave_governor,
7817                                 strlen(powersave_governor)))
7818                 singleFreq = 1;
7819
7820         /* Make sure that all CPUs impacted by this policy are
7821          * updated since we will only get a notification when the
7822          * user explicitly changes the policy on a CPU.
7823          */
7824         for_each_cpu(cpu, policy->cpus) {
7825                 extents = &freq_scale[cpu];
7826                 extents->max = policy->max >> SCHED_FREQSCALE_SHIFT;
7827                 extents->min = policy->min >> SCHED_FREQSCALE_SHIFT;
7828                 if (!hmp_data.freqinvar_load_scale_enabled) {
7829                         extents->curr_scale = 1024;
7830                 } else if (singleFreq) {
7831                         extents->flags |= SCHED_LOAD_FREQINVAR_SINGLEFREQ;
7832                         extents->curr_scale = 1024;
7833                 } else {
7834                         extents->flags &= ~SCHED_LOAD_FREQINVAR_SINGLEFREQ;
7835                         extents->curr_scale = cpufreq_calc_scale(extents->min,
7836                                         extents->max, policy->cur);
7837                 }
7838         }
7839
7840         return 0;
7841 }
7842
7843 static struct notifier_block cpufreq_notifier = {
7844         .notifier_call  = cpufreq_callback,
7845 };
7846 static struct notifier_block cpufreq_policy_notifier = {
7847         .notifier_call  = cpufreq_policy_callback,
7848 };
7849
7850 static int __init register_sched_cpufreq_notifier(void)
7851 {
7852         int ret = 0;
7853
7854         /* init safe defaults since there are no policies at registration */
7855         for (ret = 0; ret < CONFIG_NR_CPUS; ret++) {
7856                 /* safe defaults */
7857                 freq_scale[ret].max = 1024;
7858                 freq_scale[ret].min = 1024;
7859                 freq_scale[ret].curr_scale = 1024;
7860         }
7861
7862         pr_info("sched: registering cpufreq notifiers for scale-invariant loads\n");
7863         ret = cpufreq_register_notifier(&cpufreq_policy_notifier,
7864                         CPUFREQ_POLICY_NOTIFIER);
7865
7866         if (ret != -EINVAL)
7867                 ret = cpufreq_register_notifier(&cpufreq_notifier,
7868                         CPUFREQ_TRANSITION_NOTIFIER);
7869
7870         return ret;
7871 }
7872
7873 core_initcall(register_sched_cpufreq_notifier);
7874 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */