sched: Introduce Window Assisted Load Tracking (WALT)
author     Srivatsa Vaddagiri <vatsa@codeaurora.org>
           Tue, 31 May 2016 16:08:38 +0000 (09:08 -0700)
committer  Amit Pundir <amit.pundir@linaro.org>
           Wed, 14 Sep 2016 09:32:22 +0000 (15:02 +0530)
Use a window-based view of time to track task demand and CPU utilization
in the scheduler.
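
As a rough sketch of how the new statistics are consumed (illustrative
expressions only, using names introduced by this patch):

	task_util = (p->ravg.demand << 10) / walt_ravg_window;
	cpu_util  = (rq->prev_runnable_sum << SCHED_LOAD_SHIFT) / walt_ravg_window;

These are the conversions applied by task_util() in kernel/sched/fair.c and
__cpu_util() in kernel/sched/sched.h when sysctl_sched_use_walt_task_util
and sysctl_sched_use_walt_cpu_util are enabled.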

Window Assisted Load Tracking (WALT) implementation credits:
 Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park,
 Pavan Kumar Kondeti, Olav Haugan

2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla
            and Todd Kjos

Change-Id: I21408236836625d4e7d7de1843d20ed5ff36c708

Includes fixes for issues:

eas/walt: Use walt_ktime_clock() instead of ktime_get_ns() to avoid a
race resulting in watchdog resets
BUG: 29353986
Change-Id: Ic1820e22a136f7c7ebd6f42e15f14d470f6bbbdb

Handle walt accounting anomaly during resume

During resume, there is a corner case where on wakeup, a task's
prev_runnable_sum can go negative. This is a workaround that
fixes the condition and warns (instead of crashing).

BUG: 29464099
Change-Id: I173e7874324b31a3584435530281708145773508

Signed-off-by: Todd Kjos <tkjos@google.com>
Signed-off-by: Srinath Sridharan <srinathsr@google.com>
Signed-off-by: Juri Lelli <juri.lelli@arm.com>
[jstultz: fwdported to 4.4]
Signed-off-by: John Stultz <john.stultz@linaro.org>
13 files changed:
include/linux/sched.h
include/linux/sched/sysctl.h
include/trace/events/sched.h
init/Kconfig
kernel/sched/Makefile
kernel/sched/core.c
kernel/sched/fair.c
kernel/sched/rt.c
kernel/sched/sched.h
kernel/sched/stop_task.c
kernel/sched/walt.c [new file with mode: 0644]
kernel/sched/walt.h [new file with mode: 0644]
kernel/sysctl.c

index f1a28bafe7ea04114b9e3908e5ec0acce4bd5637..ede29e8db82ddaa03698dd9a778124b935e204fd 100644 (file)
@@ -317,6 +317,15 @@ extern char ___assert_task_state[1 - 2*!!(
 /* Task command name length */
 #define TASK_COMM_LEN 16
 
+enum task_event {
+       PUT_PREV_TASK   = 0,
+       PICK_NEXT_TASK  = 1,
+       TASK_WAKE       = 2,
+       TASK_MIGRATE    = 3,
+       TASK_UPDATE     = 4,
+       IRQ_UPDATE      = 5,
+};
+
 #include <linux/spinlock.h>
 
 /*
@@ -1276,6 +1285,41 @@ struct sched_statistics {
 };
 #endif
 
+#ifdef CONFIG_SCHED_WALT
+#define RAVG_HIST_SIZE_MAX  5
+
+/* ravg represents frequency scaled cpu-demand of tasks */
+struct ravg {
+       /*
+        * 'mark_start' marks the beginning of an event (task waking up, task
+        * starting to execute, task being preempted) within a window
+        *
+        * 'sum' represents how runnable a task has been within current
+        * window. It incorporates both running time and wait time and is
+        * frequency scaled.
+        *
+        * 'sum_history' keeps track of history of 'sum' seen over previous
+        * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are
+        * ignored.
+        *
+        * 'demand' represents maximum sum seen over previous
+        * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency
+        * demand for tasks.
+        *
+        * 'curr_window' represents task's contribution to cpu busy time
+        * statistics (rq->curr_runnable_sum) in current window
+        *
+        * 'prev_window' represents task's contribution to cpu busy time
+        * statistics (rq->prev_runnable_sum) in previous window
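+        *
+        * For illustration (assuming the default 20ms walt_ravg_window): a
+        * task with 'demand' of 10ms corresponds to roughly half of
+        * SCHED_LOAD_SCALE once scaled by the window size, making it
+        * comparable to PELT's util_avg.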
+        */
+       u64 mark_start;
+       u32 sum, demand;
+       u32 sum_history[RAVG_HIST_SIZE_MAX];
+       u32 curr_window, prev_window;
+       u16 active_windows;
+};
+#endif
+
 struct sched_entity {
        struct load_weight      load;           /* for load-balancing */
        struct rb_node          run_node;
@@ -1433,6 +1477,15 @@ struct task_struct {
        const struct sched_class *sched_class;
        struct sched_entity se;
        struct sched_rt_entity rt;
+#ifdef CONFIG_SCHED_WALT
+       struct ravg ravg;
+       /*
+        * 'init_load_pct' represents the initial task load assigned to children
+        * of this task
+        */
+       u32 init_load_pct;
+#endif
+
 #ifdef CONFIG_CGROUP_SCHED
        struct task_group *sched_task_group;
 #endif
index 2834841c507efbfc10fe2d3d85bef2bff47248a0..710f58a28d638f54ac85bea50f96eaac2296775c 100644 (file)
@@ -43,6 +43,11 @@ extern unsigned int sysctl_sched_is_big_little;
 extern unsigned int sysctl_sched_sync_hint_enable;
 extern unsigned int sysctl_sched_initial_task_util;
 extern unsigned int sysctl_sched_cstate_aware;
+#ifdef CONFIG_SCHED_WALT
+extern unsigned int sysctl_sched_use_walt_cpu_util;
+extern unsigned int sysctl_sched_use_walt_task_util;
+extern unsigned int sysctl_sched_walt_init_task_load_pct;
+#endif
 
 enum sched_tunable_scaling {
        SCHED_TUNABLESCALING_NONE,
index debcf417c535a69f4e8954e7ad354e1b647977f2..fa1b3df836bc8c704253dd148b56fa93cabb2652 100644 (file)
@@ -937,6 +937,155 @@ TRACE_EVENT(sched_tune_filter,
                __entry->payoff, __entry->region)
 );
 
+#ifdef CONFIG_SCHED_WALT
+struct rq;
+
+TRACE_EVENT(walt_update_task_ravg,
+
+       TP_PROTO(struct task_struct *p, struct rq *rq, int evt,
+                                               u64 wallclock, u64 irqtime),
+
+       TP_ARGS(p, rq, evt, wallclock, irqtime),
+
+       TP_STRUCT__entry(
+               __array(        char,   comm,   TASK_COMM_LEN   )
+               __field(        pid_t,  pid                     )
+               __field(        pid_t,  cur_pid                 )
+               __field(unsigned int,   cur_freq                )
+               __field(        u64,    wallclock               )
+               __field(        u64,    mark_start              )
+               __field(        u64,    delta_m                 )
+               __field(        u64,    win_start               )
+               __field(        u64,    delta                   )
+               __field(        u64,    irqtime                 )
+               __field(        int,    evt                     )
+               __field(unsigned int,   demand                  )
+               __field(unsigned int,   sum                     )
+               __field(         int,   cpu                     )
+               __field(        u64,    cs                      )
+               __field(        u64,    ps                      )
+               __field(        u32,    curr_window             )
+               __field(        u32,    prev_window             )
+               __field(        u64,    nt_cs                   )
+               __field(        u64,    nt_ps                   )
+               __field(        u32,    active_windows          )
+       ),
+
+       TP_fast_assign(
+               __entry->wallclock      = wallclock;
+               __entry->win_start      = rq->window_start;
+               __entry->delta          = (wallclock - rq->window_start);
+               __entry->evt            = evt;
+               __entry->cpu            = rq->cpu;
+               __entry->cur_pid        = rq->curr->pid;
+               __entry->cur_freq       = rq->cur_freq;
+               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+               __entry->pid            = p->pid;
+               __entry->mark_start     = p->ravg.mark_start;
+               __entry->delta_m        = (wallclock - p->ravg.mark_start);
+               __entry->demand         = p->ravg.demand;
+               __entry->sum            = p->ravg.sum;
+               __entry->irqtime        = irqtime;
+               __entry->cs             = rq->curr_runnable_sum;
+               __entry->ps             = rq->prev_runnable_sum;
+               __entry->curr_window    = p->ravg.curr_window;
+               __entry->prev_window    = p->ravg.prev_window;
+               __entry->nt_cs          = rq->nt_curr_runnable_sum;
+               __entry->nt_ps          = rq->nt_prev_runnable_sum;
+               __entry->active_windows = p->ravg.active_windows;
+       ),
+
+       TP_printk("wc %llu ws %llu delta %llu event %d cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu"
+               " cs %llu ps %llu cur_window %u prev_window %u nt_cs %llu nt_ps %llu active_wins %u"
+               , __entry->wallclock, __entry->win_start, __entry->delta,
+               __entry->evt, __entry->cpu,
+               __entry->cur_freq, __entry->cur_pid,
+               __entry->pid, __entry->comm, __entry->mark_start,
+               __entry->delta_m, __entry->demand,
+               __entry->sum, __entry->irqtime,
+               __entry->cs, __entry->ps,
+               __entry->curr_window, __entry->prev_window,
+                 __entry->nt_cs, __entry->nt_ps,
+                 __entry->active_windows
+               )
+);
+
+TRACE_EVENT(walt_update_history,
+
+       TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int samples,
+                       int evt),
+
+       TP_ARGS(rq, p, runtime, samples, evt),
+
+       TP_STRUCT__entry(
+               __array(        char,   comm,   TASK_COMM_LEN   )
+               __field(        pid_t,  pid                     )
+               __field(unsigned int,   runtime                 )
+               __field(         int,   samples                 )
+               __field(         int,   evt                     )
+               __field(         u64,   demand                  )
+               __field(unsigned int,   walt_avg                )
+               __field(unsigned int,   pelt_avg                )
+               __array(         u32,   hist, RAVG_HIST_SIZE_MAX)
+               __field(         int,   cpu                     )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+               __entry->pid            = p->pid;
+               __entry->runtime        = runtime;
+               __entry->samples        = samples;
+               __entry->evt            = evt;
+               __entry->demand         = p->ravg.demand;
+               __entry->walt_avg = (__entry->demand << 10) / walt_ravg_window,
+               __entry->pelt_avg       = p->se.avg.util_avg;
+               memcpy(__entry->hist, p->ravg.sum_history,
+                                       RAVG_HIST_SIZE_MAX * sizeof(u32));
+               __entry->cpu            = rq->cpu;
+       ),
+
+       TP_printk("%d (%s): runtime %u samples %d event %d demand %llu"
+               " walt %u pelt %u (hist: %u %u %u %u %u) cpu %d",
+               __entry->pid, __entry->comm,
+               __entry->runtime, __entry->samples, __entry->evt,
+               __entry->demand,
+               __entry->walt_avg,
+               __entry->pelt_avg,
+               __entry->hist[0], __entry->hist[1],
+               __entry->hist[2], __entry->hist[3],
+               __entry->hist[4], __entry->cpu)
+);
+
+TRACE_EVENT(walt_migration_update_sum,
+
+       TP_PROTO(struct rq *rq, struct task_struct *p),
+
+       TP_ARGS(rq, p),
+
+       TP_STRUCT__entry(
+               __field(int,            cpu                     )
+               __field(int,            pid                     )
+               __field(        u64,    cs                      )
+               __field(        u64,    ps                      )
+               __field(        s64,    nt_cs                   )
+               __field(        s64,    nt_ps                   )
+       ),
+
+       TP_fast_assign(
+               __entry->cpu            = cpu_of(rq);
+               __entry->cs             = rq->curr_runnable_sum;
+               __entry->ps             = rq->prev_runnable_sum;
+               __entry->nt_cs          = (s64)rq->nt_curr_runnable_sum;
+               __entry->nt_ps          = (s64)rq->nt_prev_runnable_sum;
+               __entry->pid            = p->pid;
+       ),
+
+       TP_printk("cpu %d: cs %llu ps %llu nt_cs %lld nt_ps %lld pid %d",
+                 __entry->cpu, __entry->cs, __entry->ps,
+                 __entry->nt_cs, __entry->nt_ps, __entry->pid)
+);
+#endif /* CONFIG_SCHED_WALT */
+
 #endif /* CONFIG_SMP */
 
 #endif /* _TRACE_SCHED_H */
index 71f3ce8107347d325d8e37af8c9f3007ff383404..e71e35cf723c954166ff5664feee30fac635948d 100644 (file)
@@ -392,6 +392,15 @@ config IRQ_TIME_ACCOUNTING
 
 endchoice
 
+config SCHED_WALT
+        bool "Support window based load tracking"
+        depends on SMP
+        help
+        This feature will allow the scheduler to maintain a tunable window
+       based set of metrics for tasks and runqueues. These metrics can be
+       used to guide task placement as well as task frequency requirements
+       for cpufreq governors.
+
 config BSD_PROCESS_ACCT
        bool "BSD Process Accounting"
        depends on MULTIUSER
index 174762d8695b7c60cde937ed36b6971e563488f0..623ce4bde0d5123a3960087e7f6c9bb6e6b11c25 100644 (file)
@@ -15,6 +15,7 @@ obj-y += core.o loadavg.o clock.o cputime.o
 obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
 obj-y += wait.o completion.o idle.o
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o
+obj-$(CONFIG_SCHED_WALT) += walt.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
index b814c13f850ff4f14893ac8479e528bbeb38e6d0..4c981dfc34eefce2bd7e2232ee45cd7cb8177d9f 100644 (file)
@@ -89,6 +89,7 @@
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
+#include "walt.h"
 
 DEFINE_MUTEX(sched_domains_mutex);
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -1085,7 +1086,9 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
 
        dequeue_task(rq, p, 0);
        p->on_rq = TASK_ON_RQ_MIGRATING;
+       double_lock_balance(rq, cpu_rq(new_cpu));
        set_task_cpu(p, new_cpu);
+       double_unlock_balance(rq, cpu_rq(new_cpu));
        raw_spin_unlock(&rq->lock);
 
        rq = cpu_rq(new_cpu);
@@ -1309,6 +1312,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
                        p->sched_class->migrate_task_rq(p);
                p->se.nr_migrations++;
                perf_event_task_migrate(p);
+
+               walt_fixup_busy_time(p, new_cpu);
        }
 
        __set_task_cpu(p, new_cpu);
@@ -1937,6 +1942,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 {
        unsigned long flags;
        int cpu, success = 0;
+#ifdef CONFIG_SMP
+       struct rq *rq;
+       u64 wallclock;
+#endif
 
        /*
         * If we are going to wake up a thread waiting for CONDITION we
@@ -1994,6 +2003,14 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         */
        smp_rmb();
 
+       rq = cpu_rq(task_cpu(p));
+
+       raw_spin_lock(&rq->lock);
+       wallclock = walt_ktime_clock();
+       walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+       walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
+       raw_spin_unlock(&rq->lock);
+
        p->sched_contributes_to_load = !!task_contributes_to_load(p);
        p->state = TASK_WAKING;
 
@@ -2001,10 +2018,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
                p->sched_class->task_waking(p);
 
        cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
+
        if (task_cpu(p) != cpu) {
                wake_flags |= WF_MIGRATED;
                set_task_cpu(p, cpu);
        }
+
 #endif /* CONFIG_SMP */
 
        ttwu_queue(p, cpu);
@@ -2053,8 +2072,13 @@ static void try_to_wake_up_local(struct task_struct *p)
 
        trace_sched_waking(p);
 
-       if (!task_on_rq_queued(p))
+       if (!task_on_rq_queued(p)) {
+               u64 wallclock = walt_ktime_clock();
+
+               walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+               walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
                ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+       }
 
        ttwu_do_wakeup(rq, p, 0);
        ttwu_stat(p, smp_processor_id(), 0);
@@ -2120,6 +2144,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
        p->se.nr_migrations             = 0;
        p->se.vruntime                  = 0;
        INIT_LIST_HEAD(&p->se.group_node);
+       walt_init_new_task_load(p);
 
 #ifdef CONFIG_SCHEDSTATS
        memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -2387,6 +2412,9 @@ void wake_up_new_task(struct task_struct *p)
        struct rq *rq;
 
        raw_spin_lock_irqsave(&p->pi_lock, flags);
+
+       walt_init_new_task_load(p);
+
        /* Initialize new task's runnable average */
        init_entity_runnable_average(&p->se);
 #ifdef CONFIG_SMP
@@ -2399,6 +2427,7 @@ void wake_up_new_task(struct task_struct *p)
 #endif
 
        rq = __task_rq_lock(p);
+       walt_mark_task_starting(p);
        activate_task(rq, p, ENQUEUE_WAKEUP_NEW);
        p->on_rq = TASK_ON_RQ_QUEUED;
        trace_sched_wakeup_new(p);
@@ -2948,9 +2977,12 @@ void scheduler_tick(void)
        sched_clock_tick();
 
        raw_spin_lock(&rq->lock);
+       walt_set_window_start(rq);
        update_rq_clock(rq);
        curr->sched_class->task_tick(rq, curr, 0);
        update_cpu_load_active(rq);
+       walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
+                       walt_ktime_clock(), 0);
        calc_global_load_tick(rq);
        sched_freq_tick(cpu);
        raw_spin_unlock(&rq->lock);
@@ -3189,6 +3221,7 @@ static void __sched notrace __schedule(bool preempt)
        unsigned long *switch_count;
        struct rq *rq;
        int cpu;
+       u64 wallclock;
 
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
@@ -3250,6 +3283,9 @@ static void __sched notrace __schedule(bool preempt)
                update_rq_clock(rq);
 
        next = pick_next_task(rq, prev);
+       wallclock = walt_ktime_clock();
+       walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
+       walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
        clear_tsk_need_resched(prev);
        clear_preempt_need_resched();
        rq->clock_skip_update = 0;
@@ -5672,6 +5708,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
        switch (action & ~CPU_TASKS_FROZEN) {
 
        case CPU_UP_PREPARE:
+               raw_spin_lock_irqsave(&rq->lock, flags);
+               walt_set_window_start(rq);
+               raw_spin_unlock_irqrestore(&rq->lock, flags);
                rq->calc_load_update = calc_load_update;
                account_reset_rq(rq);
                break;
@@ -5692,6 +5731,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                sched_ttwu_pending();
                /* Update our root-domain */
                raw_spin_lock_irqsave(&rq->lock, flags);
+               walt_migrate_sync_cpu(cpu);
                if (rq->rd) {
                        BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                        set_rq_offline(rq);
@@ -7536,6 +7576,7 @@ void __init sched_init_smp(void)
 {
        cpumask_var_t non_isolated_cpus;
 
+       walt_init_cpu_efficiency();
        alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
        alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
index 84f5e12c8e123e1973f1bb23cd1242aa7a0e877e..15b8a8f34bd9a31aa892706fa46c7059b545a7d7 100644 (file)
 #include <linux/mempolicy.h>
 #include <linux/migrate.h>
 #include <linux/task_work.h>
+#include <linux/module.h>
 
 #include <trace/events/sched.h>
 
 #include "sched.h"
 #include "tune.h"
+#include "walt.h"
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
@@ -56,6 +58,10 @@ unsigned int sysctl_sched_sync_hint_enable = 1;
 unsigned int sysctl_sched_initial_task_util = 0;
 unsigned int sysctl_sched_cstate_aware = 1;
 
+#ifdef CONFIG_SCHED_WALT
+unsigned int sysctl_sched_use_walt_cpu_util = 1;
+unsigned int sysctl_sched_use_walt_task_util = 1;
+#endif
 /*
  * The initial- and re-scaling of tunables is configurable
  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
@@ -4225,6 +4231,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                if (cfs_rq_throttled(cfs_rq))
                        break;
                cfs_rq->h_nr_running++;
+               walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
 
                flags = ENQUEUE_WAKEUP;
        }
@@ -4232,6 +4239,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                cfs_rq->h_nr_running++;
+               walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
 
                if (cfs_rq_throttled(cfs_rq))
                        break;
@@ -4246,6 +4254,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 #ifdef CONFIG_SMP
 
        if (!se) {
+               walt_inc_cumulative_runnable_avg(rq, p);
                if (!task_new && !rq->rd->overutilized &&
                    cpu_overutilized(rq->cpu))
                        rq->rd->overutilized = true;
@@ -4295,6 +4304,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                if (cfs_rq_throttled(cfs_rq))
                        break;
                cfs_rq->h_nr_running--;
+               walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
 
                /* Don't dequeue parent if it has other entities besides us */
                if (cfs_rq->load.weight) {
@@ -4315,6 +4325,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                cfs_rq->h_nr_running--;
+               walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
 
                if (cfs_rq_throttled(cfs_rq))
                        break;
@@ -4329,6 +4340,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 #ifdef CONFIG_SMP
 
        if (!se) {
+               walt_dec_cumulative_runnable_avg(rq, p);
 
                /*
                 * We want to potentially trigger a freq switch
@@ -5228,6 +5240,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 
 static inline unsigned long task_util(struct task_struct *p)
 {
+#ifdef CONFIG_SCHED_WALT
+       if (!walt_disabled && sysctl_sched_use_walt_task_util) {
+               unsigned long demand = p->ravg.demand;
+               return (demand << 10) / walt_ravg_window;
+       }
+#endif
        return p->se.avg.util_avg;
 }
 
@@ -6620,7 +6638,9 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
 
        deactivate_task(env->src_rq, p, 0);
        p->on_rq = TASK_ON_RQ_MIGRATING;
+       double_lock_balance(env->src_rq, env->dst_rq);
        set_task_cpu(p, env->dst_cpu);
+       double_unlock_balance(env->src_rq, env->dst_rq);
 }
 
 /*
index 9694204660b7e7c4a29e070252cda0dd9e6007d6..be700bfa1ae493e8728513fd117ae1bff2da199a 100644 (file)
@@ -8,6 +8,8 @@
 #include <linux/slab.h>
 #include <linux/irq_work.h>
 
+#include "walt.h"
+
 int sched_rr_timeslice = RR_TIMESLICE;
 
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
@@ -1261,6 +1263,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
                rt_se->timeout = 0;
 
        enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
+       walt_inc_cumulative_runnable_avg(rq, p);
 
        if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
                enqueue_pushable_task(rq, p);
@@ -1272,6 +1275,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 
        update_curr_rt(rq);
        dequeue_rt_entity(rt_se);
+       walt_dec_cumulative_runnable_avg(rq, p);
 
        dequeue_pushable_task(rq, p);
 }
index 1b838cff2f20da5e407313bf9915869eb3449227..f48fb371913a9bad56c4e233cfb22c26b59b6e41 100644 (file)
@@ -410,6 +410,10 @@ struct cfs_rq {
        struct list_head leaf_cfs_rq_list;
        struct task_group *tg;  /* group that "owns" this runqueue */
 
+#ifdef CONFIG_SCHED_WALT
+       u64 cumulative_runnable_avg;
+#endif
+
 #ifdef CONFIG_CFS_BANDWIDTH
        int runtime_enabled;
        u64 runtime_expires;
@@ -663,6 +667,27 @@ struct rq {
        u64 max_idle_balance_cost;
 #endif
 
+#ifdef CONFIG_SCHED_WALT
+       /*
+        * max_freq = user or thermal defined maximum
+        * max_possible_freq = maximum supported by hardware
+        */
+       unsigned int cur_freq, max_freq, min_freq, max_possible_freq;
+       struct cpumask freq_domain_cpumask;
+
+       u64 cumulative_runnable_avg;
+       int efficiency; /* Differentiate cpus with different IPC capability */
+       int load_scale_factor;
+       int capacity;
+       int max_possible_capacity;
+       u64 window_start;
+       u64 curr_runnable_sum;
+       u64 prev_runnable_sum;
+       u64 nt_curr_runnable_sum;
+       u64 nt_prev_runnable_sum;
+#endif /* CONFIG_SCHED_WALT */
+
+
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
        u64 prev_irq_time;
 #endif
@@ -1513,6 +1538,10 @@ static inline unsigned long capacity_orig_of(int cpu)
        return cpu_rq(cpu)->cpu_capacity_orig;
 }
 
+extern unsigned int sysctl_sched_use_walt_cpu_util;
+extern unsigned int walt_ravg_window;
+extern unsigned int walt_disabled;
+
 /*
  * cpu_util returns the amount of capacity of a CPU that is used by CFS
  * tasks. The unit of the return value must be the one of capacity so we can
@@ -1544,6 +1573,11 @@ static inline unsigned long __cpu_util(int cpu, int delta)
        unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
        unsigned long capacity = capacity_orig_of(cpu);
 
+#ifdef CONFIG_SCHED_WALT
+       if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
+               util = (cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT) /
+                       walt_ravg_window;
+#endif
        delta += util;
        if (delta < 0)
                return 0;
index cbc67da109544c4f0841b609e44d7337650aa81c..61f852d46858c868fda8b00601311b7837f66552 100644 (file)
@@ -1,4 +1,5 @@
 #include "sched.h"
+#include "walt.h"
 
 /*
  * stop-task scheduling class.
@@ -42,12 +43,14 @@ static void
 enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
 {
        add_nr_running(rq, 1);
+       walt_inc_cumulative_runnable_avg(rq, p);
 }
 
 static void
 dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
 {
        sub_nr_running(rq, 1);
+       walt_dec_cumulative_runnable_avg(rq, p);
 }
 
 static void yield_task_stop(struct rq *rq)
diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c
new file mode 100644 (file)
index 0000000..1dff3d2
--- /dev/null
@@ -0,0 +1,1098 @@
+/*
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ *
+ * Window Assisted Load Tracking (WALT) implementation credits:
+ * Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park,
+ * Pavan Kumar Kondeti, Olav Haugan
+ *
+ * 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla
+ *             and Todd Kjos
+ */
+
+#include <linux/syscore_ops.h>
+#include <linux/cpufreq.h>
+#include <trace/events/sched.h>
+#include "sched.h"
+#include "walt.h"
+
+#define WINDOW_STATS_RECENT            0
+#define WINDOW_STATS_MAX               1
+#define WINDOW_STATS_MAX_RECENT_AVG    2
+#define WINDOW_STATS_AVG               3
+#define WINDOW_STATS_INVALID_POLICY    4
+
+#define EXITING_TASK_MARKER    0xdeaddead
+
+static __read_mostly unsigned int walt_ravg_hist_size = 5;
+static __read_mostly unsigned int walt_window_stats_policy =
+       WINDOW_STATS_MAX_RECENT_AVG;
+static __read_mostly unsigned int walt_account_wait_time = 1;
+static __read_mostly unsigned int walt_freq_account_wait_time = 0;
+static __read_mostly unsigned int walt_io_is_busy = 0;
+
+unsigned int sysctl_sched_walt_init_task_load_pct = 15;
+
+/* 1 -> use PELT based load stats, 0 -> use window-based load stats */
+unsigned int __read_mostly walt_disabled = 0;
+
+static unsigned int max_possible_efficiency = 1024;
+static unsigned int min_possible_efficiency = 1024;
+
+/*
+ * Maximum possible frequency across all cpus. Task demand and cpu
+ * capacity (cpu_power) metrics are scaled in reference to it.
+ */
+static unsigned int max_possible_freq = 1;
+
+/*
+ * Minimum possible max_freq across all cpus. This will be same as
+ * max_possible_freq on homogeneous systems and could be different from
+ * max_possible_freq on heterogeneous systems. min_max_freq is used to derive
+ * capacity (cpu_power) of cpus.
+ */
+static unsigned int min_max_freq = 1;
+
+static unsigned int max_capacity = 1024;
+static unsigned int min_capacity = 1024;
+static unsigned int max_load_scale_factor = 1024;
+static unsigned int max_possible_capacity = 1024;
+
+/* Mask of all CPUs that have  max_possible_capacity */
+static cpumask_t mpc_mask = CPU_MASK_ALL;
+
+/* Window size (in ns) */
+__read_mostly unsigned int walt_ravg_window = 20000000;
+
+/* Min window size (in ns) = 10ms */
+#define MIN_SCHED_RAVG_WINDOW 10000000
+
+/* Max window size (in ns) = 1s */
+#define MAX_SCHED_RAVG_WINDOW 1000000000
+
+static unsigned int sync_cpu;
+static ktime_t ktime_last;
+static bool walt_ktime_suspended;
+
+static unsigned int task_load(struct task_struct *p)
+{
+       return p->ravg.demand;
+}
+
+void
+walt_inc_cumulative_runnable_avg(struct rq *rq,
+                                struct task_struct *p)
+{
+       rq->cumulative_runnable_avg += p->ravg.demand;
+}
+
+void
+walt_dec_cumulative_runnable_avg(struct rq *rq,
+                                struct task_struct *p)
+{
+       rq->cumulative_runnable_avg -= p->ravg.demand;
+       BUG_ON((s64)rq->cumulative_runnable_avg < 0);
+}
+
+static void
+fixup_cumulative_runnable_avg(struct rq *rq,
+                             struct task_struct *p, s64 task_load_delta)
+{
+       rq->cumulative_runnable_avg += task_load_delta;
+       if ((s64)rq->cumulative_runnable_avg < 0)
+               panic("cra less than zero: tld: %lld, task_load(p) = %u\n",
+                       task_load_delta, task_load(p));
+}
+
+u64 walt_ktime_clock(void)
+{
+       if (unlikely(walt_ktime_suspended))
+               return ktime_to_ns(ktime_last);
+       return ktime_get_ns();
+}
+
+static void walt_resume(void)
+{
+       walt_ktime_suspended = false;
+}
+
+static int walt_suspend(void)
+{
+       ktime_last = ktime_get();
+       walt_ktime_suspended = true;
+       return 0;
+}
+
+static struct syscore_ops walt_syscore_ops = {
+       .resume = walt_resume,
+       .suspend = walt_suspend
+};
+
+static int __init walt_init_ops(void)
+{
+       register_syscore_ops(&walt_syscore_ops);
+       return 0;
+}
+late_initcall(walt_init_ops);
+
+void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
+               struct task_struct *p)
+{
+       cfs_rq->cumulative_runnable_avg += p->ravg.demand;
+}
+
+void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
+               struct task_struct *p)
+{
+       cfs_rq->cumulative_runnable_avg -= p->ravg.demand;
+}
+
+static int exiting_task(struct task_struct *p)
+{
+       if (p->flags & PF_EXITING) {
+               if (p->ravg.sum_history[0] != EXITING_TASK_MARKER) {
+                       p->ravg.sum_history[0] = EXITING_TASK_MARKER;
+               }
+               return 1;
+       }
+       return 0;
+}
+
+static int __init set_walt_ravg_window(char *str)
+{
+       get_option(&str, &walt_ravg_window);
+
+       walt_disabled = (walt_ravg_window < MIN_SCHED_RAVG_WINDOW ||
+                               walt_ravg_window > MAX_SCHED_RAVG_WINDOW);
+       return 0;
+}
+
+early_param("walt_ravg_window", set_walt_ravg_window);
+
+static void
+update_window_start(struct rq *rq, u64 wallclock)
+{
+       s64 delta;
+       int nr_windows;
+
+       delta = wallclock - rq->window_start;
+       BUG_ON(delta < 0);
+       if (delta < walt_ravg_window)
+               return;
+
+       nr_windows = div64_u64(delta, walt_ravg_window);
+       rq->window_start += (u64)nr_windows * (u64)walt_ravg_window;
+}
+
+static u64 scale_exec_time(u64 delta, struct rq *rq)
+{
+       unsigned int cur_freq = rq->cur_freq;
+       int sf;
+
+       if (unlikely(cur_freq > max_possible_freq))
+               cur_freq = rq->max_possible_freq;
+
+       /* round up div64 */
+       delta = div64_u64(delta * cur_freq + max_possible_freq - 1,
+                         max_possible_freq);
+
+       sf = DIV_ROUND_UP(rq->efficiency * 1024, max_possible_efficiency);
+
+       delta *= sf;
+       delta >>= 10;
+
+       return delta;
+}
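+
+/*
+ * Illustration (assumed numbers): with rq->cur_freq at half of
+ * max_possible_freq and rq->efficiency equal to max_possible_efficiency,
+ * scale_exec_time() maps a 10ms delta to roughly 5ms of busy time
+ * normalized to the fastest, most efficient cpu.
+ */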
+
+static int cpu_is_waiting_on_io(struct rq *rq)
+{
+       if (!walt_io_is_busy)
+               return 0;
+
+       return atomic_read(&rq->nr_iowait);
+}
+
+static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
+                                    u64 irqtime, int event)
+{
+       if (is_idle_task(p)) {
+               /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */
+               if (event == PICK_NEXT_TASK)
+                       return 0;
+
+               /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */
+               return irqtime || cpu_is_waiting_on_io(rq);
+       }
+
+       if (event == TASK_WAKE)
+               return 0;
+
+       if (event == PUT_PREV_TASK || event == IRQ_UPDATE ||
+                                        event == TASK_UPDATE)
+               return 1;
+
+       /* Only TASK_MIGRATE && PICK_NEXT_TASK left */
+       return walt_freq_account_wait_time;
+}
+
+/*
+ * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
+ */
+static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
+            int event, u64 wallclock, u64 irqtime)
+{
+       int new_window, nr_full_windows = 0;
+       int p_is_curr_task = (p == rq->curr);
+       u64 mark_start = p->ravg.mark_start;
+       u64 window_start = rq->window_start;
+       u32 window_size = walt_ravg_window;
+       u64 delta;
+
+       new_window = mark_start < window_start;
+       if (new_window) {
+               nr_full_windows = div64_u64((window_start - mark_start),
+                                               window_size);
+               if (p->ravg.active_windows < USHRT_MAX)
+                       p->ravg.active_windows++;
+       }
+
+       /* Handle per-task window rollover. We don't care about the idle
+        * task or exiting tasks. */
+       if (new_window && !is_idle_task(p) && !exiting_task(p)) {
+               u32 curr_window = 0;
+
+               if (!nr_full_windows)
+                       curr_window = p->ravg.curr_window;
+
+               p->ravg.prev_window = curr_window;
+               p->ravg.curr_window = 0;
+       }
+
+       if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
+               /* account_busy_for_cpu_time() = 0, so no update to the
+                * task's current window needs to be made. This could be
+                * for example
+                *
+                *   - a wakeup event on a task within the current
+                *     window (!new_window below, no action required),
+                *   - switching to a new task from idle (PICK_NEXT_TASK)
+                *     in a new window where irqtime is 0 and we aren't
+                *     waiting on IO */
+
+               if (!new_window)
+                       return;
+
+               /* A new window has started. The RQ demand must be rolled
+                * over if p is the current task. */
+               if (p_is_curr_task) {
+                       u64 prev_sum = 0;
+
+                       /* p is either idle task or an exiting task */
+                       if (!nr_full_windows) {
+                               prev_sum = rq->curr_runnable_sum;
+                       }
+
+                       rq->prev_runnable_sum = prev_sum;
+                       rq->curr_runnable_sum = 0;
+               }
+
+               return;
+       }
+
+       if (!new_window) {
+               /* account_busy_for_cpu_time() = 1 so busy time needs
+                * to be accounted to the current window. No rollover
+                * since we didn't start a new window. An example of this is
+                * when a task starts execution and then sleeps within the
+                * same window. */
+
+               if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
+                       delta = wallclock - mark_start;
+               else
+                       delta = irqtime;
+               delta = scale_exec_time(delta, rq);
+               rq->curr_runnable_sum += delta;
+               if (!is_idle_task(p) && !exiting_task(p))
+                       p->ravg.curr_window += delta;
+
+               return;
+       }
+
+       if (!p_is_curr_task) {
+               /* account_busy_for_cpu_time() = 1 so busy time needs
+                * to be accounted to the current window. A new window
+                * has also started, but p is not the current task, so the
+                * window is not rolled over - just split up and account
+                * as necessary into curr and prev. The window is only
+                * rolled over when a new window is processed for the current
+                * task.
+                *
+                * Irqtime can't be accounted by a task that isn't the
+                * currently running task. */
+
+               if (!nr_full_windows) {
+                       /* A full window hasn't elapsed, account partial
+                        * contribution to previous completed window. */
+                       delta = scale_exec_time(window_start - mark_start, rq);
+                       if (!exiting_task(p))
+                               p->ravg.prev_window += delta;
+               } else {
+                       /* Since at least one full window has elapsed,
+                        * the contribution to the previous window is the
+                        * full window (window_size). */
+                       delta = scale_exec_time(window_size, rq);
+                       if (!exiting_task(p))
+                               p->ravg.prev_window = delta;
+               }
+               rq->prev_runnable_sum += delta;
+
+               /* Account piece of busy time in the current window. */
+               delta = scale_exec_time(wallclock - window_start, rq);
+               rq->curr_runnable_sum += delta;
+               if (!exiting_task(p))
+                       p->ravg.curr_window = delta;
+
+               return;
+       }
+
+       if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
+               /* account_busy_for_cpu_time() = 1 so busy time needs
+                * to be accounted to the current window. A new window
+                * has started and p is the current task so rollover is
+                * needed. If any of these three above conditions are true
+                * then this busy time can't be accounted as irqtime.
+                *
+                * Busy time for the idle task or exiting tasks need not
+                * be accounted.
+                *
+                * An example of this would be a task that starts execution
+                * and then sleeps once a new window has begun. */
+
+               if (!nr_full_windows) {
+                       /* A full window hasn't elapsed, account partial
+                        * contribution to previous completed window. */
+                       delta = scale_exec_time(window_start - mark_start, rq);
+                       if (!is_idle_task(p) && !exiting_task(p))
+                               p->ravg.prev_window += delta;
+
+                       delta += rq->curr_runnable_sum;
+               } else {
+                       /* Since at least one full window has elapsed,
+                        * the contribution to the previous window is the
+                        * full window (window_size). */
+                       delta = scale_exec_time(window_size, rq);
+                       if (!is_idle_task(p) && !exiting_task(p))
+                               p->ravg.prev_window = delta;
+
+               }
+               /*
+                * Rollover for normal runnable sum is done here by overwriting
+                * the values in prev_runnable_sum and curr_runnable_sum.
+                * Rollover for new task runnable sum has completed by previous
+                * if-else statement.
+                */
+               rq->prev_runnable_sum = delta;
+
+               /* Account piece of busy time in the current window. */
+               delta = scale_exec_time(wallclock - window_start, rq);
+               rq->curr_runnable_sum = delta;
+               if (!is_idle_task(p) && !exiting_task(p))
+                       p->ravg.curr_window = delta;
+
+               return;
+       }
+
+       if (irqtime) {
+               /* account_busy_for_cpu_time() = 1 so busy time needs
+                * to be accounted to the current window. A new window
+                * has started and p is the current task so rollover is
+                * needed. The current task must be the idle task because
+                * irqtime is not accounted for any other task.
+                *
+                * Irqtime will be accounted each time we process IRQ activity
+                * after a period of idleness, so we know the IRQ busy time
+                * started at wallclock - irqtime. */
+
+               BUG_ON(!is_idle_task(p));
+               mark_start = wallclock - irqtime;
+
+               /* Roll window over. If IRQ busy time was just in the current
+                * window then that is all that need be accounted. */
+               rq->prev_runnable_sum = rq->curr_runnable_sum;
+               if (mark_start > window_start) {
+                       rq->curr_runnable_sum = scale_exec_time(irqtime, rq);
+                       return;
+               }
+
+               /* The IRQ busy time spanned multiple windows. Process the
+                * busy time preceding the current window start first. */
+               delta = window_start - mark_start;
+               if (delta > window_size)
+                       delta = window_size;
+               delta = scale_exec_time(delta, rq);
+               rq->prev_runnable_sum += delta;
+
+               /* Process the remaining IRQ busy time in the current window. */
+               delta = wallclock - window_start;
+               rq->curr_runnable_sum = scale_exec_time(delta, rq);
+
+               return;
+       }
+
+       BUG();
+}
+
+static int account_busy_for_task_demand(struct task_struct *p, int event)
+{
+       /* No need to bother updating task demand for exiting tasks
+        * or the idle task. */
+       if (exiting_task(p) || is_idle_task(p))
+               return 0;
+
+       /* When a task is waking up it is completing a segment of non-busy
+        * time. Likewise, if wait time is not treated as busy time, then
+        * when a task begins to run or is migrated, it is not running and
+        * is completing a segment of non-busy time. */
+       if (event == TASK_WAKE || (!walt_account_wait_time &&
+                        (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
+               return 0;
+
+       return 1;
+}
+
+/*
+ * Called when new window is starting for a task, to record cpu usage over
+ * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
+ * when, say, a real-time task runs without preemption for several windows at a
+ * stretch.
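+ *
+ * Illustration (assumed values): with walt_ravg_hist_size = 5, a history of
+ * {8ms, 6ms, 7ms, 5ms, 4ms} after the new sample is pushed, and the default
+ * WINDOW_STATS_MAX_RECENT_AVG policy, demand becomes max(avg = 6ms,
+ * runtime = 8ms) = 8ms.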
+ */
+static void update_history(struct rq *rq, struct task_struct *p,
+                        u32 runtime, int samples, int event)
+{
+       u32 *hist = &p->ravg.sum_history[0];
+       int ridx, widx;
+       u32 max = 0, avg, demand;
+       u64 sum = 0;
+
+       /* Ignore windows where task had no activity */
+       if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
+                       goto done;
+
+       /* Push new 'runtime' value onto stack */
+       widx = walt_ravg_hist_size - 1;
+       ridx = widx - samples;
+       for (; ridx >= 0; --widx, --ridx) {
+               hist[widx] = hist[ridx];
+               sum += hist[widx];
+               if (hist[widx] > max)
+                       max = hist[widx];
+       }
+
+       for (widx = 0; widx < samples && widx < walt_ravg_hist_size; widx++) {
+               hist[widx] = runtime;
+               sum += hist[widx];
+               if (hist[widx] > max)
+                       max = hist[widx];
+       }
+
+       p->ravg.sum = 0;
+
+       if (walt_window_stats_policy == WINDOW_STATS_RECENT) {
+               demand = runtime;
+       } else if (walt_window_stats_policy == WINDOW_STATS_MAX) {
+               demand = max;
+       } else {
+               avg = div64_u64(sum, walt_ravg_hist_size);
+               if (walt_window_stats_policy == WINDOW_STATS_AVG)
+                       demand = avg;
+               else
+                       demand = max(avg, runtime);
+       }
+
+       /*
+        * A throttled deadline sched class task gets dequeued without
+        * changing p->on_rq. Since the dequeue decrements hmp stats
+        * avoid decrementing it here again.
+        */
+       if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
+                                               !p->dl.dl_throttled))
+               fixup_cumulative_runnable_avg(rq, p, demand);
+
+       p->ravg.demand = demand;
+
+done:
+       trace_walt_update_history(rq, p, runtime, samples, event);
+       return;
+}
+
+static void add_to_task_demand(struct rq *rq, struct task_struct *p,
+                               u64 delta)
+{
+       delta = scale_exec_time(delta, rq);
+       p->ravg.sum += delta;
+       if (unlikely(p->ravg.sum > walt_ravg_window))
+               p->ravg.sum = walt_ravg_window;
+}
+
+/*
+ * Account cpu demand of task and/or update task's cpu demand history
+ *
+ * ms = p->ravg.mark_start;
+ * wc = wallclock
+ * ws = rq->window_start
+ *
+ * Three possibilities:
+ *
+ *     a) Task event is contained within one window.
+ *             window_start < mark_start < wallclock
+ *
+ *             ws   ms  wc
+ *             |    |   |
+ *             V    V   V
+ *             |---------------|
+ *
+ *     In this case, p->ravg.sum is updated *iff* event is appropriate
+ *     (ex: event == PUT_PREV_TASK)
+ *
+ *     b) Task event spans two windows.
+ *             mark_start < window_start < wallclock
+ *
+ *             ms   ws   wc
+ *             |    |    |
+ *             V    V    V
+ *             -----|-------------------
+ *
+ *     In this case, p->ravg.sum is updated with (ws - ms) *iff* event
+ *     is appropriate, then a new window sample is recorded followed
+ *     by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
+ *
+ *     c) Task event spans more than two windows.
+ *
+ *             ms ws_tmp                          ws  wc
+ *             |  |                               |   |
+ *             V  V                               V   V
+ *             ---|-------|-------|-------|-------|------
+ *                |                               |
+ *                |<------ nr_full_windows ------>|
+ *
+ *     In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
+ *     event is appropriate, window sample of p->ravg.sum is recorded,
+ *     'nr_full_window' samples of window_size is also recorded *iff*
+ *     event is appropriate and finally p->ravg.sum is set to (wc - ws)
+ *     *iff* event is appropriate.
+ *
+ * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
+ * depends on it!
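+ *
+ * Worked illustration (assumed values, frequency scaling ignored): with a
+ * 20ms window, ms = 5ms, ws = 40ms, wc = 47ms, case c) applies with
+ * ws_tmp = 20ms. 15ms (ws_tmp - ms) is added to p->ravg.sum, that sum is
+ * recorded as one history sample, one additional full-window sample of
+ * 20ms is recorded, and p->ravg.sum restarts at 7ms (wc - ws).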
+ */
+static void update_task_demand(struct task_struct *p, struct rq *rq,
+            int event, u64 wallclock)
+{
+       u64 mark_start = p->ravg.mark_start;
+       u64 delta, window_start = rq->window_start;
+       int new_window, nr_full_windows;
+       u32 window_size = walt_ravg_window;
+
+       new_window = mark_start < window_start;
+       if (!account_busy_for_task_demand(p, event)) {
+               if (new_window)
+                       /* If the time accounted isn't being accounted as
+                        * busy time, and a new window started, only the
+                        * previous window need be closed out with the
+                        * pre-existing demand. Multiple windows may have
+                        * elapsed, but since empty windows are dropped,
+                        * it is not necessary to account those. */
+                       update_history(rq, p, p->ravg.sum, 1, event);
+               return;
+       }
+
+       if (!new_window) {
+               /* The simple case - busy time contained within the existing
+                * window. */
+               add_to_task_demand(rq, p, wallclock - mark_start);
+               return;
+       }
+
+       /* Busy time spans at least two windows. Temporarily rewind
+        * window_start to first window boundary after mark_start. */
+       delta = window_start - mark_start;
+       nr_full_windows = div64_u64(delta, window_size);
+       window_start -= (u64)nr_full_windows * (u64)window_size;
+
+       /* Process (window_start - mark_start) first */
+       add_to_task_demand(rq, p, window_start - mark_start);
+
+       /* Push new sample(s) into task's demand history */
+       update_history(rq, p, p->ravg.sum, 1, event);
+       if (nr_full_windows)
+               update_history(rq, p, scale_exec_time(window_size, rq),
+                              nr_full_windows, event);
+
+       /* Roll window_start back to current to process any remainder
+        * in current window. */
+       window_start += (u64)nr_full_windows * (u64)window_size;
+
+       /* Process (wallclock - window_start) next */
+       mark_start = window_start;
+       add_to_task_demand(rq, p, wallclock - mark_start);
+}
+
+/* Reflect task activity on its demand and cpu's busy time statistics */
+void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
+            int event, u64 wallclock, u64 irqtime)
+{
+       if (walt_disabled || !rq->window_start)
+               return;
+
+       lockdep_assert_held(&rq->lock);
+
+       update_window_start(rq, wallclock);
+
+       if (!p->ravg.mark_start)
+               goto done;
+
+       update_task_demand(p, rq, event, wallclock);
+       update_cpu_busy_time(p, rq, event, wallclock, irqtime);
+
+done:
+       trace_walt_update_task_ravg(p, rq, event, wallclock, irqtime);
+
+       p->ravg.mark_start = wallclock;
+}
+
+unsigned long __weak arch_get_cpu_efficiency(int cpu)
+{
+       return SCHED_LOAD_SCALE;
+}
+
+void walt_init_cpu_efficiency(void)
+{
+       int i, efficiency;
+       unsigned int max = 0, min = UINT_MAX;
+
+       for_each_possible_cpu(i) {
+               efficiency = arch_get_cpu_efficiency(i);
+               cpu_rq(i)->efficiency = efficiency;
+
+               if (efficiency > max)
+                       max = efficiency;
+               if (efficiency < min)
+                       min = efficiency;
+       }
+
+       if (max)
+               max_possible_efficiency = max;
+
+       if (min)
+               min_possible_efficiency = min;
+}
+
+static void reset_task_stats(struct task_struct *p)
+{
+       u32 sum = 0;
+
+       if (exiting_task(p))
+               sum = EXITING_TASK_MARKER;
+
+       memset(&p->ravg, 0, sizeof(struct ravg));
+       /* Retain EXITING_TASK marker */
+       p->ravg.sum_history[0] = sum;
+}
+
+void walt_mark_task_starting(struct task_struct *p)
+{
+       u64 wallclock;
+       struct rq *rq = task_rq(p);
+
+       if (!rq->window_start) {
+               reset_task_stats(p);
+               return;
+       }
+
+       wallclock = walt_ktime_clock();
+       p->ravg.mark_start = wallclock;
+}
+
+void walt_set_window_start(struct rq *rq)
+{
+       int cpu = cpu_of(rq);
+       struct rq *sync_rq = cpu_rq(sync_cpu);
+
+       if (rq->window_start)
+               return;
+
+       if (cpu == sync_cpu) {
+               rq->window_start = walt_ktime_clock();
+       } else {
+               raw_spin_unlock(&rq->lock);
+               double_rq_lock(rq, sync_rq);
+               rq->window_start = cpu_rq(sync_cpu)->window_start;
+               rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+               raw_spin_unlock(&sync_rq->lock);
+       }
+
+       rq->curr->ravg.mark_start = rq->window_start;
+}
+
+void walt_migrate_sync_cpu(int cpu)
+{
+       if (cpu == sync_cpu)
+               sync_cpu = smp_processor_id();
+}
+
+void walt_fixup_busy_time(struct task_struct *p, int new_cpu)
+{
+       struct rq *src_rq = task_rq(p);
+       struct rq *dest_rq = cpu_rq(new_cpu);
+       u64 wallclock;
+
+       if (!p->on_rq && p->state != TASK_WAKING)
+               return;
+
+       if (exiting_task(p)) {
+               return;
+       }
+
+       if (p->state == TASK_WAKING)
+               double_rq_lock(src_rq, dest_rq);
+
+       wallclock = walt_ktime_clock();
+
+       walt_update_task_ravg(task_rq(p)->curr, task_rq(p),
+                       TASK_UPDATE, wallclock, 0);
+       walt_update_task_ravg(dest_rq->curr, dest_rq,
+                       TASK_UPDATE, wallclock, 0);
+
+       walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0);
+
+       if (p->ravg.curr_window) {
+               src_rq->curr_runnable_sum -= p->ravg.curr_window;
+               dest_rq->curr_runnable_sum += p->ravg.curr_window;
+       }
+
+       if (p->ravg.prev_window) {
+               src_rq->prev_runnable_sum -= p->ravg.prev_window;
+               dest_rq->prev_runnable_sum += p->ravg.prev_window;
+       }
+
+       if ((s64)src_rq->prev_runnable_sum < 0) {
+               src_rq->prev_runnable_sum = 0;
+               WARN_ON(1);
+       }
+       if ((s64)src_rq->curr_runnable_sum < 0) {
+               src_rq->curr_runnable_sum = 0;
+               WARN_ON(1);
+       }
+
+       trace_walt_migration_update_sum(src_rq, p);
+       trace_walt_migration_update_sum(dest_rq, p);
+
+       if (p->state == TASK_WAKING)
+               double_rq_unlock(src_rq, dest_rq);
+}
+
+/* Keep track of max/min capacity possible across CPUs "currently" */
+static void __update_min_max_capacity(void)
+{
+       int i;
+       int max = 0, min = INT_MAX;
+
+       for_each_online_cpu(i) {
+               if (cpu_rq(i)->capacity > max)
+                       max = cpu_rq(i)->capacity;
+               if (cpu_rq(i)->capacity < min)
+                       min = cpu_rq(i)->capacity;
+       }
+
+       max_capacity = max;
+       min_capacity = min;
+}
+
+static void update_min_max_capacity(void)
+{
+       unsigned long flags;
+       int i;
+
+       local_irq_save(flags);
+       for_each_possible_cpu(i)
+               raw_spin_lock(&cpu_rq(i)->lock);
+
+       __update_min_max_capacity();
+
+       for_each_possible_cpu(i)
+               raw_spin_unlock(&cpu_rq(i)->lock);
+       local_irq_restore(flags);
+}
+
+/*
+ * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that
+ * least efficient cpu gets capacity of 1024
+ */
+static unsigned long capacity_scale_cpu_efficiency(int cpu)
+{
+       return (1024 * cpu_rq(cpu)->efficiency) / min_possible_efficiency;
+}
+
+/*
+ * Return 'capacity' of a cpu in reference to cpu with lowest max_freq
+ * (min_max_freq), such that one with lowest max_freq gets capacity of 1024.
+ */
+static unsigned long capacity_scale_cpu_freq(int cpu)
+{
+       return (1024 * cpu_rq(cpu)->max_freq) / min_max_freq;
+}
+
+/*
+ * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so
+ * that "most" efficient cpu gets a load_scale_factor of 1
+ */
+static unsigned long load_scale_cpu_efficiency(int cpu)
+{
+       return DIV_ROUND_UP(1024 * max_possible_efficiency,
+                           cpu_rq(cpu)->efficiency);
+}
+
+/*
+ * Return load_scale_factor of a cpu in reference to cpu with best max_freq
+ * (max_possible_freq), so that one with best max_freq gets a load_scale_factor
+ * of 1.
+ */
+static unsigned long load_scale_cpu_freq(int cpu)
+{
+       return DIV_ROUND_UP(1024 * max_possible_freq, cpu_rq(cpu)->max_freq);
+}
+
+static int compute_capacity(int cpu)
+{
+       int capacity = 1024;
+
+       capacity *= capacity_scale_cpu_efficiency(cpu);
+       capacity >>= 10;
+
+       capacity *= capacity_scale_cpu_freq(cpu);
+       capacity >>= 10;
+
+       return capacity;
+}
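
As a worked example of the two shift-by-10 fixed-point steps above (all numbers hypothetical, not taken from any real SoC): a CPU that is 1.5x as efficient as the least efficient CPU and whose max_freq is twice the lowest max_freq ends up with a capacity of 1024 * 1.5 * 2 = 3072. A compilable sketch of the same arithmetic:

    #include <assert.h>

    /* Assumed reference values for the sketch (hypothetical). */
    #define MIN_POSSIBLE_EFFICIENCY 1024U
    #define MIN_MAX_FREQ            1000000U        /* kHz */

    static int toy_compute_capacity(unsigned int efficiency, unsigned int max_freq)
    {
            int capacity = 1024;

            /* capacity_scale_cpu_efficiency(): 1024 * efficiency / min_possible_efficiency */
            capacity *= (1024 * efficiency) / MIN_POSSIBLE_EFFICIENCY;
            capacity >>= 10;

            /* capacity_scale_cpu_freq(): 1024 * max_freq / min_max_freq */
            capacity *= (1024 * max_freq) / MIN_MAX_FREQ;
            capacity >>= 10;

            return capacity;
    }

    int main(void)
    {
            /* 1.5x efficiency, 2x max_freq => 1024 * 1.5 * 2 = 3072 */
            assert(toy_compute_capacity(1536, 2000000) == 3072);
            return 0;
    }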
+
+static int compute_load_scale_factor(int cpu)
+{
+       int load_scale = 1024;
+
+       /*
+        * load_scale_factor accounts for the fact that task load is measured
+        * in reference to the "best" performing cpu: a task's load must be
+        * scaled up by this factor to judge whether it fits on a slower
+        * (little) cpu.
+        */
+       load_scale *= load_scale_cpu_efficiency(cpu);
+       load_scale >>= 10;
+
+       load_scale *= load_scale_cpu_freq(cpu);
+       load_scale >>= 10;
+
+       return load_scale;
+}
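
For example (again with hypothetical values), a little CPU that is half as efficient as the most efficient CPU and whose max_freq is half of max_possible_freq picks up a factor of 2048 from each step: load_scale = (1024 * 2048) >> 10 = 2048 after the efficiency step, then (2048 * 2048) >> 10 = 4096 after the frequency step. A load_scale_factor of 4096 (4.0 in this fixed-point scheme) means a task's demand is inflated fourfold before deciding whether it fits on that little CPU.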
+
+static int cpufreq_notifier_policy(struct notifier_block *nb,
+               unsigned long val, void *data)
+{
+       struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
+       int i, update_max = 0;
+       u64 highest_mpc = 0, highest_mplsf = 0;
+       const struct cpumask *cpus = policy->related_cpus;
+       unsigned int orig_min_max_freq = min_max_freq;
+       unsigned int orig_max_possible_freq = max_possible_freq;
+       /* Initialized to policy->max in case policy->related_cpus is empty! */
+       unsigned int orig_max_freq = policy->max;
+
+       if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY &&
+                                               val != CPUFREQ_CREATE_POLICY)
+               return 0;
+
+       if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) {
+               update_min_max_capacity();
+               return 0;
+       }
+
+       for_each_cpu(i, policy->related_cpus) {
+               cpumask_copy(&cpu_rq(i)->freq_domain_cpumask,
+                            policy->related_cpus);
+               orig_max_freq = cpu_rq(i)->max_freq;
+               cpu_rq(i)->min_freq = policy->min;
+               cpu_rq(i)->max_freq = policy->max;
+               cpu_rq(i)->cur_freq = policy->cur;
+               cpu_rq(i)->max_possible_freq = policy->cpuinfo.max_freq;
+       }
+
+       max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq);
+       if (min_max_freq == 1)
+               min_max_freq = UINT_MAX;
+       min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq);
+       BUG_ON(!min_max_freq);
+       BUG_ON(!policy->max);
+
+       /* Changes to policy other than max_freq don't require any updates */
+       if (orig_max_freq == policy->max)
+               return 0;
+
+       /*
+        * A changed min_max_freq or max_possible_freq (possible during bootup)
+        * needs to trigger re-computation of load_scale_factor and capacity for
+        * all possible cpus (even those offline). It also needs to trigger
+        * re-computation of nr_big_task count on all online cpus.
+        *
+        * A changed rq->max_freq, on the other hand, only needs to trigger
+        * re-computation of load_scale_factor and capacity for the cluster of
+        * cpus involved. However, since the small-task definition depends on
+        * max_load_scale_factor, a changed load_scale_factor of one cluster
+        * could influence the classification of tasks in another cluster.
+        * Hence a changed rq->max_freq still needs to trigger re-computation
+        * of the nr_big_task count on all online cpus.
+        *
+        * While it should be sufficient to re-compute nr_big_tasks for only
+        * online cpus, we have no information here (in the policy notifier)
+        * about whether the notification was issued in a hotplug-safe
+        * context. As a result we can't use get_online_cpus(), as it can lead
+        * to deadlock. Until cpufreq is fixed up to always issue notifications
+        * in a hotplug-safe context, re-compute nr_big_task for all possible
+        * cpus.
+        */
+
+       if (orig_min_max_freq != min_max_freq ||
+           orig_max_possible_freq != max_possible_freq) {
+               cpus = cpu_possible_mask;
+               update_max = 1;
+       }
+
+       /*
+        * A changed load_scale_factor can trigger reclassification of tasks as
+        * big or small. Make this change "atomic" so that task accounting
+        * stays consistent with the new load_scale_factor.
+        */
+       for_each_cpu(i, cpus) {
+               struct rq *rq = cpu_rq(i);
+
+               rq->capacity = compute_capacity(i);
+               rq->load_scale_factor = compute_load_scale_factor(i);
+
+               if (update_max) {
+                       u64 mpc, mplsf;
+
+                       mpc = div_u64(((u64) rq->capacity) *
+                               rq->max_possible_freq, rq->max_freq);
+                       rq->max_possible_capacity = (int) mpc;
+
+                       mplsf = div_u64(((u64) rq->load_scale_factor) *
+                               rq->max_possible_freq, rq->max_freq);
+
+                       if (mpc > highest_mpc) {
+                               highest_mpc = mpc;
+                               cpumask_clear(&mpc_mask);
+                               cpumask_set_cpu(i, &mpc_mask);
+                       } else if (mpc == highest_mpc) {
+                               cpumask_set_cpu(i, &mpc_mask);
+                       }
+
+                       if (mplsf > highest_mplsf)
+                               highest_mplsf = mplsf;
+               }
+       }
+
+       if (update_max) {
+               max_possible_capacity = highest_mpc;
+               max_load_scale_factor = highest_mplsf;
+       }
+
+       __update_min_max_capacity();
+
+       return 0;
+}
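
To illustrate the max_possible_capacity update above with hypothetical numbers: suppose a policy change caps a big CPU at max_freq = 1.0 GHz while its cpuinfo.max_freq is 2.0 GHz, and compute_capacity() yields rq->capacity = 2048 at the capped frequency. Then mpc = 2048 * 2000000 / 1000000 = 4096, so the CPU is still recorded as capable of 4096 even though it cannot deliver that capacity until the cap is lifted; mplsf applies the same max_possible_freq/max_freq rescaling to rq->load_scale_factor. The cpumask mpc_mask ends up holding every CPU that ties for the highest max_possible_capacity.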
+
+static int cpufreq_notifier_trans(struct notifier_block *nb,
+               unsigned long val, void *data)
+{
+       struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data;
+       unsigned int cpu = freq->cpu, new_freq = freq->new;
+       unsigned long flags;
+       int i;
+
+       if (val != CPUFREQ_POSTCHANGE)
+               return 0;
+
+       BUG_ON(!new_freq);
+
+       if (cpu_rq(cpu)->cur_freq == new_freq)
+               return 0;
+
+       for_each_cpu(i, &cpu_rq(cpu)->freq_domain_cpumask) {
+               struct rq *rq = cpu_rq(i);
+
+               raw_spin_lock_irqsave(&rq->lock, flags);
+               walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
+                                     walt_ktime_clock(), 0);
+               rq->cur_freq = new_freq;
+               raw_spin_unlock_irqrestore(&rq->lock, flags);
+       }
+
+       return 0;
+}
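
For example, if CPUs 0-3 share a frequency domain and a CPUFREQ_POSTCHANGE notification reports that the domain moved to a new frequency, the loop above first folds the time spent at the old frequency into each of the four runqueues' window statistics (walt_update_task_ravg() under rq->lock still sees the old rq->cur_freq) and only then records the new cur_freq, so busy time is always scaled by the frequency that was actually in effect.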
+
+static struct notifier_block notifier_policy_block = {
+       .notifier_call = cpufreq_notifier_policy
+};
+
+static struct notifier_block notifier_trans_block = {
+       .notifier_call = cpufreq_notifier_trans
+};
+
+static int register_sched_callback(void)
+{
+       int ret;
+
+       ret = cpufreq_register_notifier(&notifier_policy_block,
+                                               CPUFREQ_POLICY_NOTIFIER);
+
+       if (!ret)
+               ret = cpufreq_register_notifier(&notifier_trans_block,
+                                               CPUFREQ_TRANSITION_NOTIFIER);
+
+       return 0;
+}
+
+/*
+ * cpufreq callbacks can be registered at core_initcall or later time.
+ * Any registration done prior to that is "forgotten" by cpufreq. See
+ * initialization of variable init_cpufreq_transition_notifier_list_called
+ * for further information.
+ */
+core_initcall(register_sched_callback);
+
+void walt_init_new_task_load(struct task_struct *p)
+{
+       int i;
+       u32 init_load_windows =
+                       div64_u64((u64)sysctl_sched_walt_init_task_load_pct *
+                          (u64)walt_ravg_window, 100);
+       u32 init_load_pct = current->init_load_pct;
+
+       p->init_load_pct = 0;
+       memset(&p->ravg, 0, sizeof(struct ravg));
+
+       if (init_load_pct) {
+               init_load_windows = div64_u64((u64)init_load_pct *
+                         (u64)walt_ravg_window, 100);
+       }
+
+       p->ravg.demand = init_load_windows;
+       for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
+               p->ravg.sum_history[i] = init_load_windows;
+}
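
As a worked example, and assuming the defaults used elsewhere in this patch (a 20 ms walt_ravg_window and sysctl_sched_walt_init_task_load_pct = 15; treat both numbers as assumptions to be checked against walt.c), a newly forked task starts with demand and every sum_history[] slot set to 15 * 20000000 / 100 = 3000000 ns, i.e. 3 ms of presumed busy time per window. If the forking task has a non-zero init_load_pct, that percentage is used instead, and the child's own init_load_pct is reset to zero.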
diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h
new file mode 100644 (file)
index 0000000..cabc193
--- /dev/null
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __WALT_H
+#define __WALT_H
+
+#ifdef CONFIG_SCHED_WALT
+
+void walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event,
+               u64 wallclock, u64 irqtime);
+void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p);
+void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p);
+void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+               struct task_struct *p);
+void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+               struct task_struct *p);
+void walt_fixup_busy_time(struct task_struct *p, int new_cpu);
+void walt_init_new_task_load(struct task_struct *p);
+void walt_mark_task_starting(struct task_struct *p);
+void walt_set_window_start(struct rq *rq);
+void walt_migrate_sync_cpu(int cpu);
+void walt_init_cpu_efficiency(void);
+u64 walt_ktime_clock(void);
+
+#else /* CONFIG_SCHED_WALT */
+
+static inline void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
+               int event, u64 wallclock, u64 irqtime) { }
+static inline void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { }
+static inline void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { }
+static inline void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+               struct task_struct *p) { }
+static inline void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+               struct task_struct *p) { }
+static inline void walt_fixup_busy_time(struct task_struct *p, int new_cpu) { }
+static inline void walt_init_new_task_load(struct task_struct *p) { }
+static inline void walt_mark_task_starting(struct task_struct *p) { }
+static inline void walt_set_window_start(struct rq *rq) { }
+static inline void walt_migrate_sync_cpu(int cpu) { }
+static inline void walt_init_cpu_efficiency(void) { }
+static inline u64 walt_ktime_clock(void) { return 0; }
+
+#endif /* CONFIG_SCHED_WALT */
+
+extern unsigned int walt_disabled;
+
+#endif
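
The header follows the usual config-stub pattern: with CONFIG_SCHED_WALT disabled, every hook collapses to an empty static inline, so call sites in the scheduler can invoke them unconditionally with no #ifdefs. A minimal userspace model of that pattern (purely illustrative, not part of the patch):

    #include <stdio.h>

    #define CONFIG_TOY_WALT 1               /* flip to 0 to build the stub variant */

    #if CONFIG_TOY_WALT
    static void toy_walt_hook(int cpu)
    {
            printf("accounting update on cpu %d\n", cpu);
    }
    #else
    /* Empty stub: the call below still compiles and optimizes away entirely. */
    static inline void toy_walt_hook(int cpu) { }
    #endif

    int main(void)
    {
            toy_walt_hook(0);               /* unconditional call, no #ifdef here */
            return 0;
    }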
index dd46f370b73a2a01b6309645918203b300d3c0b1..e2d9953822be27ebc89532189ee05a3ac1516f9c 100644 (file)
@@ -311,6 +311,29 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
+#ifdef CONFIG_SCHED_WALT
+       {
+               .procname       = "sched_use_walt_cpu_util",
+               .data           = &sysctl_sched_use_walt_cpu_util,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+       {
+               .procname       = "sched_use_walt_task_util",
+               .data           = &sysctl_sched_use_walt_task_util,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+       {
+               .procname       = "sched_walt_init_task_load_pct",
+               .data           = &sysctl_sched_walt_init_task_load_pct,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+#endif
        {
                .procname       = "sched_sync_hint_enable",
                .data           = &sysctl_sched_sync_hint_enable,