sched/walt: Accounting for number of irqs pending on each core
author    Srinath Sridharan <srinathsr@google.com>
          Fri, 22 Jul 2016 12:21:15 +0000 (13:21 +0100)
committer Amit Pundir <amit.pundir@linaro.org>
          Wed, 14 Sep 2016 09:32:22 +0000 (15:02 +0530)
Schedule on a core whose irq load is below a threshold. This improves
the I/O performance of EAS.

Change-Id: I08ff7dd0d22502a0106fc636b1af2e6fe9e758b5
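
Taken together, the changes below make WALT track how much hardirq/softirq
time each CPU has absorbed: irqtime_account_irq() feeds the measured delta
into walt_account_irqtime(), which accumulates it per jiffy window on the
runqueue (cur_irqload) and folds completed windows into a decayed average
(avg_irqload). find_best_target() then skips any CPU whose average irq load
is at or above the new sysctl_sched_walt_cpu_high_irqload threshold, which
defaults to 10 * NSEC_PER_MSEC (10,000,000 ns, i.e. 10 ms). A stand-alone
model of the window bookkeeping follows the kernel/sched/walt.c diff below.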

include/linux/sched/sysctl.h
kernel/sched/core.c
kernel/sched/cputime.c
kernel/sched/fair.c
kernel/sched/sched.h
kernel/sched/walt.c
kernel/sched/walt.h
kernel/sysctl.c

index 710f58a28d638f54ac85bea50f96eaac2296775c..d68e88c9d4d7032943e65c519aaf401f4bbe16b7 100644 (file)
@@ -47,6 +47,7 @@ extern unsigned int sysctl_sched_cstate_aware;
 extern unsigned int sysctl_sched_use_walt_cpu_util;
 extern unsigned int sysctl_sched_use_walt_task_util;
 extern unsigned int sysctl_sched_walt_init_task_load_pct;
+extern unsigned int sysctl_sched_walt_cpu_high_irqload;
 #endif
 
 enum sched_tunable_scaling {
index 4c981dfc34eefce2bd7e2232ee45cd7cb8177d9f..67abbbd3965ba7405ef9b72945f20ed29dd30b02 100644 (file)
@@ -7754,6 +7754,11 @@ void __init sched_init(void)
                rq->idle_stamp = 0;
                rq->avg_idle = 2*sysctl_sched_migration_cost;
                rq->max_idle_balance_cost = sysctl_sched_migration_cost;
+#ifdef CONFIG_SCHED_WALT
+               rq->cur_irqload = 0;
+               rq->avg_irqload = 0;
+               rq->irqload_ts = 0;
+#endif
 
                INIT_LIST_HEAD(&rq->cfs_tasks);
 
index f74ea89e77a8e5aa5faf8abf6c2f1132a92de397..3f232c8b2bddc6bb59100634fe74d0cb69e030c5 100644 (file)
@@ -5,6 +5,7 @@
 #include <linux/static_key.h>
 #include <linux/context_tracking.h>
 #include "sched.h"
+#include "walt.h"
 
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -49,6 +50,10 @@ void irqtime_account_irq(struct task_struct *curr)
        unsigned long flags;
        s64 delta;
        int cpu;
+#ifdef CONFIG_SCHED_WALT
+       u64 wallclock;
+       bool account = true;
+#endif
 
        if (!sched_clock_irqtime)
                return;
@@ -56,6 +61,9 @@ void irqtime_account_irq(struct task_struct *curr)
        local_irq_save(flags);
 
        cpu = smp_processor_id();
+#ifdef CONFIG_SCHED_WALT
+       wallclock = sched_clock_cpu(cpu);
+#endif
        delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
        __this_cpu_add(irq_start_time, delta);
 
@@ -70,8 +78,16 @@ void irqtime_account_irq(struct task_struct *curr)
                __this_cpu_add(cpu_hardirq_time, delta);
        else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
                __this_cpu_add(cpu_softirq_time, delta);
+#ifdef CONFIG_SCHED_WALT
+       else
+               account = false;
+#endif
 
        irq_time_write_end();
+#ifdef CONFIG_SCHED_WALT
+       if (account)
+               walt_account_irqtime(cpu, curr, delta, wallclock);
+#endif
        local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(irqtime_account_irq);
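
The kernel/sched/cputime.c hook above reuses the hardirq/softirq time that
irqtime_account_irq() already measures: the same delta that is added to
cpu_hardirq_time or cpu_softirq_time is also handed to walt_account_irqtime(),
while anything that falls through to the new else branch (softirq time spent
in ksoftirqd, or time that is neither hardirq nor softirq service) is not
passed on. The wallclock snapshot is taken with sched_clock_cpu() before the
delta is computed, so both values come from the same clock, as the comment in
walt_account_irqtime() notes.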
index 15b8a8f34bd9a31aa892706fa46c7059b545a7d7..8560a553003543bc614882df258e71768fccb0ad 100644 (file)
@@ -61,6 +61,8 @@ unsigned int sysctl_sched_cstate_aware = 1;
 #ifdef CONFIG_SCHED_WALT
 unsigned int sysctl_sched_use_walt_cpu_util = 1;
 unsigned int sysctl_sched_use_walt_task_util = 1;
+__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
+    (10 * NSEC_PER_MSEC);
 #endif
 /*
  * The initial- and re-scaling of tunables is configurable
@@ -4274,7 +4276,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        schedtune_enqueue_task(p, cpu_of(rq));
 
 #endif /* CONFIG_SMP */
-
        hrtick_update(rq);
 }
 
@@ -5648,6 +5649,10 @@ static inline int find_best_target(struct task_struct *p, bool boosted)
                if (new_util > capacity_orig_of(i))
                        continue;
 
+#ifdef CONFIG_SCHED_WALT
+               if (walt_cpu_high_irqload(i))
+                       continue;
+#endif
                /*
                 * For boosted tasks we favor idle cpus unconditionally to
                 * improve latency.
index f48fb371913a9bad56c4e233cfb22c26b59b6e41..51c632bc94b6fe6fd5103c481a7430daa22a95e3 100644 (file)
@@ -685,6 +685,9 @@ struct rq {
        u64 prev_runnable_sum;
        u64 nt_curr_runnable_sum;
        u64 nt_prev_runnable_sum;
+       u64 cur_irqload;
+       u64 avg_irqload;
+       u64 irqload_ts;
 #endif /* CONFIG_SCHED_WALT */
 
 
index 1dff3d2e2358dc1206ce11926a15589382b33e96..b9ae8d5c4393ada4a0355ea1fdbbbdec81a6c38d 100644 (file)
@@ -221,6 +221,71 @@ static int cpu_is_waiting_on_io(struct rq *rq)
        return atomic_read(&rq->nr_iowait);
 }
 
+void walt_account_irqtime(int cpu, struct task_struct *curr,
+                                u64 delta, u64 wallclock)
+{
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long flags, nr_windows;
+       u64 cur_jiffies_ts;
+
+       raw_spin_lock_irqsave(&rq->lock, flags);
+
+       /*
+        * cputime (wallclock) uses sched_clock so use the same here for
+        * consistency.
+        */
+       delta += sched_clock() - wallclock;
+       cur_jiffies_ts = get_jiffies_64();
+
+       if (is_idle_task(curr))
+               walt_update_task_ravg(curr, rq, IRQ_UPDATE, walt_ktime_clock(),
+                                delta);
+
+       nr_windows = cur_jiffies_ts - rq->irqload_ts;
+
+       if (nr_windows) {
+               if (nr_windows < 10) {
+                       /* Decay CPU's irqload by 3/4 for each window. */
+                       rq->avg_irqload *= (3 * nr_windows);
+                       rq->avg_irqload = div64_u64(rq->avg_irqload,
+                                                   4 * nr_windows);
+               } else {
+                       rq->avg_irqload = 0;
+               }
+               rq->avg_irqload += rq->cur_irqload;
+               rq->cur_irqload = 0;
+       }
+
+       rq->cur_irqload += delta;
+       rq->irqload_ts = cur_jiffies_ts;
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+
+#define WALT_HIGH_IRQ_TIMEOUT 3
+
+u64 walt_irqload(int cpu) {
+       struct rq *rq = cpu_rq(cpu);
+       s64 delta;
+       delta = get_jiffies_64() - rq->irqload_ts;
+
+       /*
+        * Current context can be preempted by an irq, and rq->irqload_ts can
+        * be updated from irq context, so delta can be negative. This is
+        * okay: a negative delta simply means there was a recent irq
+        * occurrence, so it is safe to return the averaged irqload.
+        */
+
+       if (delta < WALT_HIGH_IRQ_TIMEOUT)
+               return rq->avg_irqload;
+       else
+               return 0;
+}
+
+int walt_cpu_high_irqload(int cpu) {
+       return walt_irqload(cpu) >= sysctl_sched_walt_cpu_high_irqload;
+}
+
 static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
                                     u64 irqtime, int event)
 {
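
The two helpers above are the core of the mechanism, so here is a minimal
stand-alone model of the bookkeeping, assuming user space and hypothetical
names (irqload_model, model_account_irqtime, model_irqload). Each jiffy is a
window: irq time accumulates in cur for the current window; once at least one
window boundary has passed, the old average is multiplied by 3 * nr_windows
and divided by 4 * nr_windows (a single 3/4 scaling, done with div64_u64 in
the kernel), or cleared after ten or more quiet windows, and cur is folded in.
walt_irqload() reports the average only if the last update happened within
WALT_HIGH_IRQ_TIMEOUT (3) jiffies.

/*
 * Stand-alone, illustrative model of the per-window irq-load bookkeeping
 * implemented by walt_account_irqtime() and walt_irqload() above.
 * All names in this sketch are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

struct irqload_model {
	uint64_t cur;	/* irq ns accumulated in the current window  */
	uint64_t avg;	/* decayed sum of completed windows          */
	uint64_t ts;	/* jiffy count at the last accounting update */
};

static void model_account_irqtime(struct irqload_model *il,
				  uint64_t delta_ns, uint64_t now_jiffies)
{
	uint64_t nr_windows = now_jiffies - il->ts;

	if (nr_windows) {
		if (nr_windows < 10)
			/* same arithmetic as the patch: one 3/4 scaling */
			il->avg = il->avg * (3 * nr_windows) / (4 * nr_windows);
		else
			il->avg = 0;	/* ten or more quiet windows: drop it */
		il->avg += il->cur;
		il->cur = 0;
	}
	il->cur += delta_ns;
	il->ts = now_jiffies;
}

/* Mirrors walt_irqload(): report avg only if irqs were seen recently. */
static uint64_t model_irqload(const struct irqload_model *il,
			      uint64_t now_jiffies)
{
	return (now_jiffies - il->ts < 3) ? il->avg : 0;
}

int main(void)
{
	struct irqload_model il = { 0, 0, 100 };

	model_account_irqtime(&il, 4000000, 100);	/* 4 ms in window 100 */
	model_account_irqtime(&il, 8000000, 101);	/* window rolls over   */
	printf("avg=%llu ns cur=%llu ns irqload=%llu ns\n",
	       (unsigned long long)il.avg,
	       (unsigned long long)il.cur,
	       (unsigned long long)model_irqload(&il, 101));
	return 0;
}

Built with any C compiler, this prints avg=4000000 ns cur=8000000 ns
irqload=4000000 ns: the 4 ms accounted in window 100 becomes the average once
window 101 starts, while the 8 ms accounted in window 101 is still pending in
cur.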
index cabc193a683d5653106add97fec804a2bab64067..e181c87a928d89134ea9c6c47d635f36f715f29a 100644 (file)
@@ -31,6 +31,11 @@ void walt_set_window_start(struct rq *rq);
 void walt_migrate_sync_cpu(int cpu);
 void walt_init_cpu_efficiency(void);
 u64 walt_ktime_clock(void);
+void walt_account_irqtime(int cpu, struct task_struct *curr, u64 delta,
+                                  u64 wallclock);
+
+u64 walt_irqload(int cpu);
+int walt_cpu_high_irqload(int cpu);
 
 #else /* CONFIG_SCHED_WALT */
 
index e2d9953822be27ebc89532189ee05a3ac1516f9c..d964422eb6014f6abe6307bf15ed0f7de80340a0 100644 (file)
@@ -333,6 +333,13 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
+       {
+               .procname       = "sched_walt_cpu_high_irqload",
+               .data           = &sysctl_sched_walt_cpu_high_irqload,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
 #endif
        {
                .procname       = "sched_sync_hint_enable",
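
Because the new entry sits in kern_table, the tunable is exposed as
/proc/sys/kernel/sched_walt_cpu_high_irqload and holds a plain integer number
of nanoseconds (default 10 * NSEC_PER_MSEC). A minimal user-space sketch for
reading the current value, assuming that procfs path:

/* Illustrative only: read the high-irqload threshold (ns) from procfs. */
#include <stdio.h>

int main(void)
{
	unsigned int ns = 0;
	FILE *f = fopen("/proc/sys/kernel/sched_walt_cpu_high_irqload", "r");

	if (f && fscanf(f, "%u", &ns) == 1)
		printf("high-irqload threshold: %u ns\n", ns);
	if (f)
		fclose(f);
	return 0;
}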