kernel/sched/tune.c

   1 #include <linux/cgroup.h>
   2 #include <linux/err.h>
   3 #include <linux/kernel.h>
   4 #include <linux/percpu.h>
   5 #include <linux/printk.h>
   6 #include <linux/rcupdate.h>
   7 #include <linux/slab.h>
   8
   9 #include <trace/events/sched.h>
  10
  11 #include "sched.h"
  12 #include "tune.h"
  13
  14 #ifdef CONFIG_CGROUP_SCHEDTUNE
  15 static bool schedtune_initialized = false;
  16 #endif
  17
  18 unsigned int sysctl_sched_cfs_boost __read_mostly;
  19
  20 extern struct target_nrg schedtune_target_nrg;
  21
  22 /* Performance Boost region (B) threshold params */
  23 static int perf_boost_idx;
  24
  25 /* Performance Constraint region (C) threshold params */
  26 static int perf_constrain_idx;
  27
  28 /**
  29  * Performance-Energy (P-E) Space thresholds constants
  30  */
  31 struct threshold_params {
  32         int nrg_gain;
  33         int cap_gain;
  34 };
  35
  36 /*
  37  * System specific P-E space thresholds constants
  38  */
  39 static struct threshold_params
  40 threshold_gains[] = {
  41         { 0, 5 }, /*   < 10% */
  42         { 1, 5 }, /*   < 20% */
  43         { 2, 5 }, /*   < 30% */
  44         { 3, 5 }, /*   < 40% */
  45         { 4, 5 }, /*   < 50% */
  46         { 5, 4 }, /*   < 60% */
  47         { 5, 3 }, /*   < 70% */
  48         { 5, 2 }, /*   < 80% */
  49         { 5, 1 }, /*   < 90% */
  50         { 5, 0 }  /* <= 100% */
  51 };
  52
  53 static int
  54 __schedtune_accept_deltas(int nrg_delta, int cap_delta,
  55                           int perf_boost_idx, int perf_constrain_idx)
  56 {
  57         int payoff = -INT_MAX;
  58         int gain_idx = -1;
  59
  60         /* Performance Boost (B) region */
  61         if (nrg_delta >= 0 && cap_delta > 0)
  62                 gain_idx = perf_boost_idx;
  63         /* Performance Constraint (C) region */
  64         else if (nrg_delta < 0 && cap_delta <= 0)
  65                 gain_idx = perf_constrain_idx;
  66
  67         /* Default: reject schedule candidate */
  68         if (gain_idx == -1)
  69                 return payoff;
  70
  71         /*
  72          * Evaluate "Performance Boost" vs "Energy Increase"
  73          *
  74          * - Performance Boost (B) region
  75          *
  76          *   Condition: nrg_delta > 0 && cap_delta > 0
  77          *   Payoff criteria:
  78          *     cap_gain / nrg_gain  < cap_delta / nrg_delta =
  79          *     cap_gain * nrg_delta < cap_delta * nrg_gain
  80          *   Note that since both nrg_gain and nrg_delta are positive, the
  81          *   inequality does not change. Thus:
  82          *
  83          *     payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta)
  84          *
  85          * - Performance Constraint (C) region
  86          *
  87          *   Condition: nrg_delta < 0 && cap_delta < 0
  88          *   payoff criteria:
  89          *     cap_gain / nrg_gain  > cap_delta / nrg_delta =
  90          *     cap_gain * nrg_delta < cap_delta * nrg_gain
  91          *   Note that since nrg_gain > 0 while nrg_delta < 0, the
  92          *   inequality change. Thus:
  93          *
  94          *     payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta)
  95          *
  96          * This means that, in case of same positive defined {cap,nrg}_gain
  97          * for both the B and C regions, we can use the same payoff formula
  98          * where a positive value represents the accept condition.
  99          */
 100         payoff  = cap_delta * threshold_gains[gain_idx].nrg_gain;
 101         payoff -= nrg_delta * threshold_gains[gain_idx].cap_gain;
 102
 103         return payoff;
 104 }
 105
 106 #ifdef CONFIG_CGROUP_SCHEDTUNE
 107
 108 /*
 109  * EAS scheduler tunables for task groups.
 110  */
 111
 112 /* SchdTune tunables for a group of tasks */
 113 struct schedtune {
 114         /* SchedTune CGroup subsystem */
 115         struct cgroup_subsys_state css;
 116
 117         /* Boost group allocated ID */
 118         int idx;
 119
 120         /* Boost value for tasks on that SchedTune CGroup */
 121         int boost;
 122
 123         /* Performance Boost (B) region threshold params */
 124         int perf_boost_idx;
 125
 126         /* Performance Constraint (C) region threshold params */
 127         int perf_constrain_idx;
 128 };
 129
 130 static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
 131 {
 132         return css ? container_of(css, struct schedtune, css) : NULL;
 133 }
 134
 135 static inline struct schedtune *task_schedtune(struct task_struct *tsk)
 136 {
 137         return css_st(task_css(tsk, schedtune_cgrp_id));
 138 }
 139
 140 static inline struct schedtune *parent_st(struct schedtune *st)
 141 {
 142         return css_st(st->css.parent);
 143 }
 144
 145 /*
 146  * SchedTune root control group
 147  * The root control group is used to defined a system-wide boosting tuning,
 148  * which is applied to all tasks in the system.
 149  * Task specific boost tuning could be specified by creating and
 150  * configuring a child control group under the root one.
 151  * By default, system-wide boosting is disabled, i.e. no boosting is applied
 152  * to tasks which are not into a child control group.
 153  */
 154 static struct schedtune
 155 root_schedtune = {
 156         .boost  = 0,
 157         .perf_boost_idx = 0,
 158         .perf_constrain_idx = 0,
 159 };
 160
 161 int
 162 schedtune_accept_deltas(int nrg_delta, int cap_delta,
 163                         struct task_struct *task)
 164 {
 165         struct schedtune *ct;
 166         int perf_boost_idx;
 167         int perf_constrain_idx;
 168
 169         /* Optimal (O) region */
 170         if (nrg_delta < 0 && cap_delta > 0) {
 171                 trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
 172                 return INT_MAX;
 173         }
 174
 175         /* Suboptimal (S) region */
 176         if (nrg_delta > 0 && cap_delta < 0) {
 177                 trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
 178                 return -INT_MAX;
 179         }
 180
 181         /* Get task specific perf Boost/Constraints indexes */
 182         rcu_read_lock();
 183         ct = task_schedtune(task);
 184         perf_boost_idx = ct->perf_boost_idx;
 185         perf_constrain_idx = ct->perf_constrain_idx;
 186         rcu_read_unlock();
 187
 188         return __schedtune_accept_deltas(nrg_delta, cap_delta,
 189                         perf_boost_idx, perf_constrain_idx);
 190 }
 191
 192 /*
 193  * Maximum number of boost groups to support
 194  * When per-task boosting is used we still allow only limited number of
 195  * boost groups for two main reasons:
 196  * 1. on a real system we usually have only few classes of workloads which
 197  *    make sense to boost with different values (e.g. background vs foreground
 198  *    tasks, interactive vs low-priority tasks)
 199  * 2. a limited number allows for a simpler and more memory/time efficient
 200  *    implementation especially for the computation of the per-CPU boost
 201  *    value
 202  */
 203 #define BOOSTGROUPS_COUNT 4
 204
 205 /* Array of configured boostgroups */
 206 static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
 207         &root_schedtune,
 208         NULL,
 209 };
 210
 211 /* SchedTune boost groups
 212  * Keep track of all the boost groups which impact on CPU, for example when a
 213  * CPU has two RUNNABLE tasks belonging to two different boost groups and thus
 214  * likely with different boost values.
 215  * Since on each system we expect only a limited number of boost groups, here
 216  * we use a simple array to keep track of the metrics required to compute the
 217  * maximum per-CPU boosting value.
 218  */
 219 struct boost_groups {
 220         /* Maximum boost value for all RUNNABLE tasks on a CPU */
 221         bool idle;
 222         int boost_max;
 223         struct {
 224                 /* The boost for tasks on that boost group */
 225                 int boost;
 226                 /* Count of RUNNABLE tasks on that boost group */
 227                 unsigned tasks;
 228         } group[BOOSTGROUPS_COUNT];
 229         /* CPU's boost group locking */
 230         raw_spinlock_t lock;
 231 };
 232
 233 /* Boost groups affecting each CPU in the system */
 234 DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
 235
 236 static void
 237 schedtune_cpu_update(int cpu)
 238 {
 239         struct boost_groups *bg;
 240         int boost_max;
 241         int idx;
 242
 243         bg = &per_cpu(cpu_boost_groups, cpu);
 244
 245         /* The root boost group is always active */
 246         boost_max = bg->group[0].boost;
 247         for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
 248                 /*
 249                  * A boost group affects a CPU only if it has
 250                  * RUNNABLE tasks on that CPU
 251                  */
 252                 if (bg->group[idx].tasks == 0)
 253                         continue;
 254
 255                 boost_max = max(boost_max, bg->group[idx].boost);
 256         }
 257         /* Ensures boost_max is non-negative when all cgroup boost values
 258          * are neagtive. Avoids under-accounting of cpu capacity which may cause
 259          * task stacking and frequency spikes.*/
 260         boost_max = max(boost_max, 0);
 261         bg->boost_max = boost_max;
 262 }
 263
 264 static int
 265 schedtune_boostgroup_update(int idx, int boost)
 266 {
 267         struct boost_groups *bg;
 268         int cur_boost_max;
 269         int old_boost;
 270         int cpu;
 271
 272         /* Update per CPU boost groups */
 273         for_each_possible_cpu(cpu) {
 274                 bg = &per_cpu(cpu_boost_groups, cpu);
 275
 276                 /*
 277                  * Keep track of current boost values to compute the per CPU
 278                  * maximum only when it has been affected by the new value of
 279                  * the updated boost group
 280                  */
 281                 cur_boost_max = bg->boost_max;
 282                 old_boost = bg->group[idx].boost;
 283
 284                 /* Update the boost value of this boost group */
 285                 bg->group[idx].boost = boost;
 286
 287                 /* Check if this update increase current max */
 288                 if (boost > cur_boost_max && bg->group[idx].tasks) {
 289                         bg->boost_max = boost;
 290                         trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max);
 291                         continue;
 292                 }
 293
 294                 /* Check if this update has decreased current max */
 295                 if (cur_boost_max == old_boost && old_boost > boost) {
 296                         schedtune_cpu_update(cpu);
 297                         trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max);
 298                         continue;
 299                 }
 300
 301                 trace_sched_tune_boostgroup_update(cpu, 0, bg->boost_max);
 302         }
 303
 304         return 0;
 305 }
 306
 307 #define ENQUEUE_TASK  1
 308 #define DEQUEUE_TASK -1
 309
 310 static inline void
 311 schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
 312 {
 313         struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
 314         int tasks = bg->group[idx].tasks + task_count;
 315
 316         /* Update boosted tasks count while avoiding to make it negative */
 317         bg->group[idx].tasks = max(0, tasks);
 318
 319         trace_sched_tune_tasks_update(p, cpu, tasks, idx,
 320                         bg->group[idx].boost, bg->boost_max);
 321
 322         /* Boost group activation or deactivation on that RQ */
 323         if (tasks == 1 || tasks == 0)
 324                 schedtune_cpu_update(cpu);
 325 }
 326
 327 /*
 328  * NOTE: This function must be called while holding the lock on the CPU RQ
 329  */
 330 void schedtune_enqueue_task(struct task_struct *p, int cpu)
 331 {
 332         struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
 333         unsigned long irq_flags;
 334         struct schedtune *st;
 335         int idx;
 336
 337         if (!unlikely(schedtune_initialized))
 338                 return;
 339
 340         /*
 341          * When a task is marked PF_EXITING by do_exit() it's going to be
 342          * dequeued and enqueued multiple times in the exit path.
 343          * Thus we avoid any further update, since we do not want to change
 344          * CPU boosting while the task is exiting.
 345          */
 346         if (p->flags & PF_EXITING)
 347                 return;
 348
 349         /*
 350          * Boost group accouting is protected by a per-cpu lock and requires
 351          * interrupt to be disabled to avoid race conditions for example on
 352          * do_exit()::cgroup_exit() and task migration.
 353          */
 354         raw_spin_lock_irqsave(&bg->lock, irq_flags);
 355         rcu_read_lock();
 356
 357         st = task_schedtune(p);
 358         idx = st->idx;
 359
 360         schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK);
 361
 362         rcu_read_unlock();
 363         raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
 364 }
 365
 366 int schedtune_allow_attach(struct cgroup_taskset *tset)
 367 {
 368         /* We always allows tasks to be moved between existing CGroups */
 369         return 0;
 370 }
 371
 372 int schedtune_can_attach(struct cgroup_taskset *tset)
 373 {
 374         struct task_struct *task;
 375         struct cgroup_subsys_state *css;
 376         struct boost_groups *bg;
 377         unsigned long irq_flags;
 378         unsigned int cpu;
 379         struct rq *rq;
 380         int src_bg; /* Source boost group index */
 381         int dst_bg; /* Destination boost group index */
 382         int tasks;
 383
 384         if (!unlikely(schedtune_initialized))
 385                 return 0;
 386
 387
 388         cgroup_taskset_for_each(task, css, tset) {
 389
 390                 /*
 391                  * Lock the CPU's RQ the task is enqueued to avoid race
 392                  * conditions with migration code while the task is being
 393                  * accounted
 394                  */
 395                 rq = lock_rq_of(task, &irq_flags);
 396
 397                 if (!task->on_rq) {
 398                         unlock_rq_of(rq, task, &irq_flags);
 399                         continue;
 400                 }
 401
 402                 /*
 403                  * Boost group accouting is protected by a per-cpu lock and requires
 404                  * interrupt to be disabled to avoid race conditions on...
 405                  */
 406                 cpu = cpu_of(rq);
 407                 bg = &per_cpu(cpu_boost_groups, cpu);
 408                 raw_spin_lock(&bg->lock);
 409
 410                 dst_bg = css_st(css)->idx;
 411                 src_bg = task_schedtune(task)->idx;
 412
 413                 /*
 414                  * Current task is not changing boostgroup, which can
 415                  * happen when the new hierarchy is in use.
 416                  */
 417                 if (unlikely(dst_bg == src_bg)) {
 418                         raw_spin_unlock(&bg->lock);
 419                         unlock_rq_of(rq, task, &irq_flags);
 420                         continue;
 421                 }
 422
 423                 /*
 424                  * This is the case of a RUNNABLE task which is switching its
 425                  * current boost group.
 426                  */
 427
 428                 /* Move task from src to dst boost group */
 429                 tasks = bg->group[src_bg].tasks - 1;
 430                 bg->group[src_bg].tasks = max(0, tasks);
 431                 bg->group[dst_bg].tasks += 1;
 432
 433                 raw_spin_unlock(&bg->lock);
 434                 unlock_rq_of(rq, task, &irq_flags);
 435
 436                 /* Update CPU boost group */
 437                 if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1)
 438                         schedtune_cpu_update(task_cpu(task));
 439
 440         }
 441
 442         return 0;
 443 }
 444
 445 void schedtune_cancel_attach(struct cgroup_taskset *tset)
 446 {
 447         /* This can happen only if SchedTune controller is mounted with
 448          * other hierarchies ane one of them fails. Since usually SchedTune is
 449          * mouted on its own hierarcy, for the time being we do not implement
 450          * a proper rollback mechanism */
 451         WARN(1, "SchedTune cancel attach not implemented");
 452 }
 453
 454 /*
 455  * NOTE: This function must be called while holding the lock on the CPU RQ
 456  */
 457 void schedtune_dequeue_task(struct task_struct *p, int cpu)
 458 {
 459         struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
 460         unsigned long irq_flags;
 461         struct schedtune *st;
 462         int idx;
 463
 464         if (!unlikely(schedtune_initialized))
 465                 return;
 466
 467         /*
 468          * When a task is marked PF_EXITING by do_exit() it's going to be
 469          * dequeued and enqueued multiple times in the exit path.
 470          * Thus we avoid any further update, since we do not want to change
 471          * CPU boosting while the task is exiting.
 472          * The last dequeue is already enforce by the do_exit() code path
 473          * via schedtune_exit_task().
 474          */
 475         if (p->flags & PF_EXITING)
 476                 return;
 477
 478         /*
 479          * Boost group accouting is protected by a per-cpu lock and requires
 480          * interrupt to be disabled to avoid race conditions on...
 481          */
 482         raw_spin_lock_irqsave(&bg->lock, irq_flags);
 483         rcu_read_lock();
 484
 485         st = task_schedtune(p);
 486         idx = st->idx;
 487
 488         schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK);
 489
 490         rcu_read_unlock();
 491         raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
 492 }
 493
 494 void schedtune_exit_task(struct task_struct *tsk)
 495 {
 496         struct schedtune *st;
 497         unsigned long irq_flags;
 498         unsigned int cpu;
 499         struct rq *rq;
 500         int idx;
 501
 502         if (!unlikely(schedtune_initialized))
 503                 return;
 504
 505         rq = lock_rq_of(tsk, &irq_flags);
 506         rcu_read_lock();
 507
 508         cpu = cpu_of(rq);
 509         st = task_schedtune(tsk);
 510         idx = st->idx;
 511         schedtune_tasks_update(tsk, cpu, idx, DEQUEUE_TASK);
 512
 513         rcu_read_unlock();
 514         unlock_rq_of(rq, tsk, &irq_flags);
 515 }
 516
 517 int schedtune_cpu_boost(int cpu)
 518 {
 519         struct boost_groups *bg;
 520
 521         bg = &per_cpu(cpu_boost_groups, cpu);
 522         return bg->boost_max;
 523 }
 524
 525 int schedtune_task_boost(struct task_struct *p)
 526 {
 527         struct schedtune *st;
 528         int task_boost;
 529
 530         /* Get task boost value */
 531         rcu_read_lock();
 532         st = task_schedtune(p);
 533         task_boost = st->boost;
 534         rcu_read_unlock();
 535
 536         return task_boost;
 537 }
 538
 539 static s64
 540 boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
 541 {
 542         struct schedtune *st = css_st(css);
 543
 544         return st->boost;
 545 }
 546
 547 static int
 548 boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
 549             s64 boost)
 550 {
 551         struct schedtune *st = css_st(css);
 552         unsigned threshold_idx;
 553         int boost_pct;
 554
 555         if (boost < -100 || boost > 100)
 556                 return -EINVAL;
 557         boost_pct = boost;
 558
 559         /*
 560          * Update threshold params for Performance Boost (B)
 561          * and Performance Constraint (C) regions.
 562          * The current implementatio uses the same cuts for both
 563          * B and C regions.
 564          */
 565         threshold_idx = clamp(boost_pct, 0, 99) / 10;
 566         st->perf_boost_idx = threshold_idx;
 567         st->perf_constrain_idx = threshold_idx;
 568
 569         st->boost = boost;
 570         if (css == &root_schedtune.css) {
 571                 sysctl_sched_cfs_boost = boost;
 572                 perf_boost_idx  = threshold_idx;
 573                 perf_constrain_idx  = threshold_idx;
 574         }
 575
 576         /* Update CPU boost */
 577         schedtune_boostgroup_update(st->idx, st->boost);
 578
 579         trace_sched_tune_config(st->boost);
 580
 581         return 0;
 582 }
 583
 584 static struct cftype files[] = {
 585         {
 586                 .name = "boost",
 587                 .read_s64 = boost_read,
 588                 .write_s64 = boost_write,
 589         },
 590         { }     /* terminate */
 591 };
 592
 593 static int
 594 schedtune_boostgroup_init(struct schedtune *st)
 595 {
 596         struct boost_groups *bg;
 597         int cpu;
 598
 599         /* Keep track of allocated boost groups */
 600         allocated_group[st->idx] = st;
 601
 602         /* Initialize the per CPU boost groups */
 603         for_each_possible_cpu(cpu) {
 604                 bg = &per_cpu(cpu_boost_groups, cpu);
 605                 bg->group[st->idx].boost = 0;
 606                 bg->group[st->idx].tasks = 0;
 607         }
 608
 609         return 0;
 610 }
 611
 612 static struct cgroup_subsys_state *
 613 schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
 614 {
 615         struct schedtune *st;
 616         int idx;
 617
 618         if (!parent_css)
 619                 return &root_schedtune.css;
 620
 621         /* Allow only single level hierachies */
 622         if (parent_css != &root_schedtune.css) {
 623                 pr_err("Nested SchedTune boosting groups not allowed\n");
 624                 return ERR_PTR(-ENOMEM);
 625         }
 626
 627         /* Allow only a limited number of boosting groups */
 628         for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx)
 629                 if (!allocated_group[idx])
 630                         break;
 631         if (idx == BOOSTGROUPS_COUNT) {
 632                 pr_err("Trying to create more than %d SchedTune boosting groups\n",
 633                        BOOSTGROUPS_COUNT);
 634                 return ERR_PTR(-ENOSPC);
 635         }
 636
 637         st = kzalloc(sizeof(*st), GFP_KERNEL);
 638         if (!st)
 639                 goto out;
 640
 641         /* Initialize per CPUs boost group support */
 642         st->idx = idx;
 643         if (schedtune_boostgroup_init(st))
 644                 goto release;
 645
 646         return &st->css;
 647
 648 release:
 649         kfree(st);
 650 out:
 651         return ERR_PTR(-ENOMEM);
 652 }
 653
 654 static void
 655 schedtune_boostgroup_release(struct schedtune *st)
 656 {
 657         /* Reset this boost group */
 658         schedtune_boostgroup_update(st->idx, 0);
 659
 660         /* Keep track of allocated boost groups */
 661         allocated_group[st->idx] = NULL;
 662 }
 663
 664 static void
 665 schedtune_css_free(struct cgroup_subsys_state *css)
 666 {
 667         struct schedtune *st = css_st(css);
 668
 669         schedtune_boostgroup_release(st);
 670         kfree(st);
 671 }
 672
 673 struct cgroup_subsys schedtune_cgrp_subsys = {
 674         .css_alloc      = schedtune_css_alloc,
 675         .css_free       = schedtune_css_free,
 676 //      .allow_attach   = schedtune_allow_attach,
 677         .can_attach     = schedtune_can_attach,
 678         .cancel_attach  = schedtune_cancel_attach,
 679         .legacy_cftypes = files,
 680         .early_init     = 1,
 681 };
 682
 683 static inline void
 684 schedtune_init_cgroups(void)
 685 {
 686         struct boost_groups *bg;
 687         int cpu;
 688
 689         /* Initialize the per CPU boost groups */
 690         for_each_possible_cpu(cpu) {
 691                 bg = &per_cpu(cpu_boost_groups, cpu);
 692                 memset(bg, 0, sizeof(struct boost_groups));
 693         }
 694
 695         pr_info("schedtune: configured to support %d boost groups\n",
 696                 BOOSTGROUPS_COUNT);
 697 }
 698
 699 #else /* CONFIG_CGROUP_SCHEDTUNE */
 700
 701 int
 702 schedtune_accept_deltas(int nrg_delta, int cap_delta,
 703                         struct task_struct *task)
 704 {
 705         /* Optimal (O) region */
 706         if (nrg_delta < 0 && cap_delta > 0) {
 707                 trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
 708                 return INT_MAX;
 709         }
 710
 711         /* Suboptimal (S) region */
 712         if (nrg_delta > 0 && cap_delta < 0) {
 713                 trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
 714                 return -INT_MAX;
 715         }
 716
 717         return __schedtune_accept_deltas(nrg_delta, cap_delta,
 718                         perf_boost_idx, perf_constrain_idx);
 719 }
 720
 721 #endif /* CONFIG_CGROUP_SCHEDTUNE */
 722
 723 int
 724 sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
 725                                void __user *buffer, size_t *lenp,
 726                                loff_t *ppos)
 727 {
 728         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 729         unsigned threshold_idx;
 730         int boost_pct;
 731
 732         if (ret || !write)
 733                 return ret;
 734
 735         if (sysctl_sched_cfs_boost < -100 || sysctl_sched_cfs_boost > 100)
 736                 return -EINVAL;
 737         boost_pct = sysctl_sched_cfs_boost;
 738
 739         /*
 740          * Update threshold params for Performance Boost (B)
 741          * and Performance Constraint (C) regions.
 742          * The current implementatio uses the same cuts for both
 743          * B and C regions.
 744          */
 745         threshold_idx = clamp(boost_pct, 0, 99) / 10;
 746         perf_boost_idx = threshold_idx;
 747         perf_constrain_idx = threshold_idx;
 748
 749         return 0;
 750 }
 751
 752 #ifdef CONFIG_SCHED_DEBUG
 753 static void
 754 schedtune_test_nrg(unsigned long delta_pwr)
 755 {
 756         unsigned long test_delta_pwr;
 757         unsigned long test_norm_pwr;
 758         int idx;
 759
 760         /*
 761          * Check normalization constants using some constant system
 762          * energy values
 763          */
 764         pr_info("schedtune: verify normalization constants...\n");
 765         for (idx = 0; idx < 6; ++idx) {
 766                 test_delta_pwr = delta_pwr >> idx;
 767
 768                 /* Normalize on max energy for target platform */
 769                 test_norm_pwr = reciprocal_divide(
 770                                         test_delta_pwr << SCHED_LOAD_SHIFT,
 771                                         schedtune_target_nrg.rdiv);
 772
 773                 pr_info("schedtune: max_pwr/2^%d: %4lu => norm_pwr: %5lu\n",
 774                         idx, test_delta_pwr, test_norm_pwr);
 775         }
 776 }
 777 #else
 778 #define schedtune_test_nrg(delta_pwr)
 779 #endif
 780
 781 /*
 782  * Compute the min/max power consumption of a cluster and all its CPUs
 783  */
 784 static void
 785 schedtune_add_cluster_nrg(
 786                 struct sched_domain *sd,
 787                 struct sched_group *sg,
 788                 struct target_nrg *ste)
 789 {
 790         struct sched_domain *sd2;
 791         struct sched_group *sg2;
 792
 793         struct cpumask *cluster_cpus;
 794         char str[32];
 795
 796         unsigned long min_pwr;
 797         unsigned long max_pwr;
 798         int cpu;
 799
 800         /* Get Cluster energy using EM data for the first CPU */
 801         cluster_cpus = sched_group_cpus(sg);
 802         snprintf(str, 32, "CLUSTER[%*pbl]",
 803                  cpumask_pr_args(cluster_cpus));
 804
 805         min_pwr = sg->sge->idle_states[sg->sge->nr_idle_states - 1].power;
 806         max_pwr = sg->sge->cap_states[sg->sge->nr_cap_states - 1].power;
 807         pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
 808                 str, min_pwr, max_pwr);
 809
 810         /*
 811          * Keep track of this cluster's energy in the computation of the
 812          * overall system energy
 813          */
 814         ste->min_power += min_pwr;
 815         ste->max_power += max_pwr;
 816
 817         /* Get CPU energy using EM data for each CPU in the group */
 818         for_each_cpu(cpu, cluster_cpus) {
 819                 /* Get a SD view for the specific CPU */
 820                 for_each_domain(cpu, sd2) {
 821                         /* Get the CPU group */
 822                         sg2 = sd2->groups;
 823                         min_pwr = sg2->sge->idle_states[sg2->sge->nr_idle_states - 1].power;
 824                         max_pwr = sg2->sge->cap_states[sg2->sge->nr_cap_states - 1].power;
 825
 826                         ste->min_power += min_pwr;
 827                         ste->max_power += max_pwr;
 828
 829                         snprintf(str, 32, "CPU[%d]", cpu);
 830                         pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
 831                                 str, min_pwr, max_pwr);
 832
 833                         /*
 834                          * Assume we have EM data only at the CPU and
 835                          * the upper CLUSTER level
 836                          */
 837                         BUG_ON(!cpumask_equal(
 838                                 sched_group_cpus(sg),
 839                                 sched_group_cpus(sd2->parent->groups)
 840                                 ));
 841                         break;
 842                 }
 843         }
 844 }
 845
 846 /*
 847  * Initialize the constants required to compute normalized energy.
 848  * The values of these constants depends on the EM data for the specific
 849  * target system and topology.
 850  * Thus, this function is expected to be called by the code
 851  * that bind the EM to the topology information.
 852  */
 853 static int
 854 schedtune_init(void)
 855 {
 856         struct target_nrg *ste = &schedtune_target_nrg;
 857         unsigned long delta_pwr = 0;
 858         struct sched_domain *sd;
 859         struct sched_group *sg;
 860
 861         pr_info("schedtune: init normalization constants...\n");
 862         ste->max_power = 0;
 863         ste->min_power = 0;
 864
 865         rcu_read_lock();
 866
 867         /*
 868          * When EAS is in use, we always have a pointer to the highest SD
 869          * which provides EM data.
 870          */
 871         sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask)));
 872         if (!sd) {
 873                 pr_info("schedtune: no energy model data\n");
 874                 goto nodata;
 875         }
 876
 877         sg = sd->groups;
 878         do {
 879                 schedtune_add_cluster_nrg(sd, sg, ste);
 880         } while (sg = sg->next, sg != sd->groups);
 881
 882         rcu_read_unlock();
 883
 884         pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
 885                 "SYSTEM", ste->min_power, ste->max_power);
 886
 887         /* Compute normalization constants */
 888         delta_pwr = ste->max_power - ste->min_power;
 889         ste->rdiv = reciprocal_value(delta_pwr);
 890         pr_info("schedtune: using normalization constants mul: %u sh1: %u sh2: %u\n",
 891                 ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2);
 892
 893         schedtune_test_nrg(delta_pwr);
 894
 895 #ifdef CONFIG_CGROUP_SCHEDTUNE
 896         schedtune_init_cgroups();
 897 #else
 898         pr_info("schedtune: configured to support global boosting only\n");
 899 #endif
 900
 901         return 0;
 902
 903 nodata:
 904         rcu_read_unlock();
 905         return -EINVAL;
 906 }
 907 late_initcall(schedtune_init);