sched: Prevent unnecessary active balance of single task in sched group
[firefly-linux-kernel-4.4.55.git] kernel/sched/fair.c
index 90e26b11deaa1ab4b78302605850523a7852720b..2dc28766cf9a3ab3c0b00ba4089b8aeda635f7f3 100644
@@ -687,8 +687,6 @@ void init_entity_runnable_average(struct sched_entity *se)
        /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
 }
 
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
-static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
 #else
 void init_entity_runnable_average(struct sched_entity *se)
 {
@@ -2682,6 +2680,23 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
 
 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
 
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do {                          \
+       typeof(_ptr) ptr = (_ptr);                              \
+       typeof(*ptr) val = (_val);                              \
+       typeof(*ptr) res, var = READ_ONCE(*ptr);                \
+       res = var - val;                                        \
+       if (res > var)                                          \
+               res = 0;                                        \
+       WRITE_ONCE(*ptr, res);                                  \
+} while (0)
+
 /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
 static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 {
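
sub_positive(), added above, replaces the max_t()-based clamps further down in the patch. The averages it updates are read locklessly from other CPUs, and without the explicit READ_ONCE()/WRITE_ONCE() load-store the compiler is free to let a negative (i.e. wrapped-around unsigned) intermediate hit memory where a concurrent reader could observe it. A minimal user-space sketch of the same clamp-on-underflow idea, with the kernel's READ_ONCE/WRITE_ONCE approximated by volatile accesses (illustration only, not the kernel macros):

#include <stdio.h>

/* Simplified stand-ins for the kernel's READ_ONCE/WRITE_ONCE. */
#define READ_ONCE(x)      (*(volatile typeof(x) *)&(x))
#define WRITE_ONCE(x, v)  (*(volatile typeof(x) *)&(x) = (v))

/*
 * Same shape as the kernel macro: subtract and clamp at zero.  An unsigned
 * subtraction that underflows always wraps to a value larger than the
 * original, which is what the "res > var" test detects.
 */
#define sub_positive(_ptr, _val) do {                           \
        typeof(_ptr) ptr = (_ptr);                              \
        typeof(*ptr) val = (_val);                              \
        typeof(*ptr) res, var = READ_ONCE(*ptr);                \
        res = var - val;                                        \
        if (res > var)                                          \
                res = 0;                                        \
        WRITE_ONCE(*ptr, res);                                  \
} while (0)

int main(void)
{
        unsigned long load = 5;

        sub_positive(&load, 7UL);       /* would wrap; clamps to 0 */
        printf("%lu\n", load);          /* prints 0 */

        load = 10;
        sub_positive(&load, 3UL);
        printf("%lu\n", load);          /* prints 7 */
        return 0;
}
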
@@ -2689,16 +2704,16 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
        int decayed, removed = 0;
 
        if (atomic_long_read(&cfs_rq->removed_load_avg)) {
-               long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
-               sa->load_avg = max_t(long, sa->load_avg - r, 0);
-               sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+               s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
+               sub_positive(&sa->load_avg, r);
+               sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
                removed = 1;
        }
 
        if (atomic_long_read(&cfs_rq->removed_util_avg)) {
                long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
-               sa->util_avg = max_t(long, sa->util_avg - r, 0);
-               sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+               sub_positive(&sa->util_avg, r);
+               sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
        }
 
        decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -2764,10 +2779,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
                          &se->avg, se->on_rq * scale_load_down(se->load.weight),
                          cfs_rq->curr == se, NULL);
 
-       cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
-       cfs_rq->avg.load_sum = max_t(s64,  cfs_rq->avg.load_sum - se->avg.load_sum, 0);
-       cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
-       cfs_rq->avg.util_sum = max_t(s32,  cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+       sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+       sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+       sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
+       sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
 }
 
 /* Add the load generated by se into cfs_rq's load average */
@@ -2809,27 +2824,45 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
                max_t(s64,  cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
 }
 
-/*
- * Task first catches up with cfs_rq, and then subtract
- * itself from the cfs_rq (task must be off the queue now).
- */
-void remove_entity_load_avg(struct sched_entity *se)
-{
-       struct cfs_rq *cfs_rq = cfs_rq_of(se);
-       u64 last_update_time;
-
 #ifndef CONFIG_64BIT
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
        u64 last_update_time_copy;
+       u64 last_update_time;
 
        do {
                last_update_time_copy = cfs_rq->load_last_update_time_copy;
                smp_rmb();
                last_update_time = cfs_rq->avg.last_update_time;
        } while (last_update_time != last_update_time_copy);
+
+       return last_update_time;
+}
 #else
-       last_update_time = cfs_rq->avg.last_update_time;
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
+       return cfs_rq->avg.last_update_time;
+}
 #endif
 
+/*
+ * Task first catches up with cfs_rq, and then subtracts
+ * itself from the cfs_rq (task must be off the queue now).
+ */
+void remove_entity_load_avg(struct sched_entity *se)
+{
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+       u64 last_update_time;
+
+       /*
+        * Newly created task or never used group entity should not be removed
+        * from its (source) cfs_rq
+        */
+       if (se->avg.last_update_time == 0)
+               return;
+
+       last_update_time = cfs_rq_last_update_time(cfs_rq);
+
        __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
        atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
        atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
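
The hunk above factors the old open-coded #ifndef CONFIG_64BIT sequence out into cfs_rq_last_update_time(), so remove_entity_load_avg() can first bail out for entities that were never attached (se->avg.last_update_time == 0). The 32-bit variant exists because a plain 64-bit load can tear there: the writer stores the value, then a copy, with a write barrier in between, and the reader keeps re-reading until both match. A rough user-space sketch of that writer/reader pairing, using C11 fences where the kernel uses smp_wmb()/smp_rmb() (names and the single-threaded main are illustrative only):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative shared state; the kernel keeps these in struct cfs_rq. */
static uint64_t last_update_time;
static uint64_t last_update_time_copy;

/* Writer: publish the value first, then the copy (kernel: smp_wmb()). */
static void publish_time(uint64_t now)
{
        last_update_time = now;
        atomic_thread_fence(memory_order_release);
        last_update_time_copy = now;
}

/* Reader: retry until value and copy are observed consistently (kernel: smp_rmb()). */
static uint64_t read_time(void)
{
        uint64_t copy, val;

        do {
                copy = last_update_time_copy;
                atomic_thread_fence(memory_order_acquire);
                val = last_update_time;
        } while (val != copy);

        return val;
}

int main(void)
{
        publish_time(123456789ULL);
        printf("%llu\n", (unsigned long long)read_time());
        return 0;
}
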
@@ -2931,6 +2964,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
                        }
 
                        trace_sched_stat_blocked(tsk, delta);
+                       trace_sched_blocked_reason(tsk);
 
                        /*
                         * Blocking time is in units of nanosecs, so shift by
@@ -4577,19 +4611,24 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
                return wl;
 
        for_each_sched_entity(se) {
-               long w, W;
+               struct cfs_rq *cfs_rq = se->my_q;
+               long W, w = cfs_rq_load_avg(cfs_rq);
 
-               tg = se->my_q->tg;
+               tg = cfs_rq->tg;
 
                /*
                 * W = @wg + \Sum rw_j
                 */
-               W = wg + calc_tg_weight(tg, se->my_q);
+               W = wg + atomic_long_read(&tg->load_avg);
+
+               /* Ensure \Sum rw_j >= rw_i */
+               W -= cfs_rq->tg_load_avg_contrib;
+               W += w;
 
                /*
                 * w = rw_i + @wl
                 */
-               w = cfs_rq_load_avg(se->my_q) + wl;
+               w += wl;
 
                /*
                 * wl = S * s'_i; see (2)
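
The rewritten loop above assembles the group weight without calc_tg_weight():

        W = wg + (tg->load_avg - cfs_rq->tg_load_avg_contrib) + cfs_rq_load_avg(cfs_rq)

tg->load_avg is the sum of each child cfs_rq's last published tg_load_avg_contrib, so by itself it can lag behind this cfs_rq's current load and leave \Sum rw_j smaller than rw_i. Subtracting this cfs_rq's stale contribution and adding its current cfs_rq_load_avg() restores the invariant noted in the comment above ("Ensure \Sum rw_j >= rw_i") before wl is added to form w.
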
@@ -4724,6 +4763,48 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
        return 1;
 }
 
+static inline unsigned long task_util(struct task_struct *p)
+{
+       return p->se.avg.util_avg;
+}
+
+static unsigned int capacity_margin = 1280; /* ~20% margin */
+
+static inline bool __task_fits(struct task_struct *p, int cpu, int util)
+{
+       unsigned long capacity = capacity_of(cpu);
+
+       util += task_util(p);
+
+       return (capacity * 1024) > (util * capacity_margin);
+}
+
+static inline bool task_fits_max(struct task_struct *p, int cpu)
+{
+       unsigned long capacity = capacity_of(cpu);
+       unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity;
+
+       if (capacity == max_capacity)
+               return true;
+
+       if (capacity * capacity_margin > max_capacity * 1024)
+               return true;
+
+       return __task_fits(p, cpu, 0);
+}
+
+static int cpu_util(int cpu);
+
+static inline bool task_fits_spare(struct task_struct *p, int cpu)
+{
+       return __task_fits(p, cpu, cpu_util(cpu));
+}
+
+static bool cpu_overutilized(int cpu)
+{
+       return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
+}
+
 /*
  * find_idlest_group finds and returns the least busy CPU group within the
  * domain.
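
The helpers added above (task_util() through cpu_overutilized()) all compare against capacity_margin = 1280, i.e. utilization may use at most 1024/1280 = 80% of a CPU's capacity before the task no longer "fits" or the CPU counts as overutilized. A small stand-alone sketch with made-up capacity/utilization values on the kernel's 0..1024 scale (illustration only):

#include <stdbool.h>
#include <stdio.h>

/* ~20% headroom: util must stay under capacity * 1024 / 1280 (= 80%). */
static const unsigned int capacity_margin = 1280;

static bool fits(unsigned long capacity, unsigned long util)
{
        return capacity * 1024 > util * capacity_margin;
}

int main(void)
{
        /* Hypothetical big CPU at full capacity 1024. */
        printf("%d\n", fits(1024, 800));  /* 1048576 > 1024000 -> fits     */
        printf("%d\n", fits(1024, 820));  /* 1048576 > 1049600 is false    */

        /* Hypothetical little CPU with capacity 430. */
        printf("%d\n", fits(430, 400));   /* 440320 > 512000 is false      */
        return 0;
}
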
@@ -4733,7 +4814,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                  int this_cpu, int sd_flag)
 {
        struct sched_group *idlest = NULL, *group = sd->groups;
+       struct sched_group *fit_group = NULL, *spare_group = NULL;
        unsigned long min_load = ULONG_MAX, this_load = 0;
+       unsigned long fit_capacity = ULONG_MAX;
+       unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE;
        int load_idx = sd->forkexec_idx;
        int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
@@ -4741,7 +4825,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                load_idx = sd->wake_idx;
 
        do {
-               unsigned long load, avg_load;
+               unsigned long load, avg_load, spare_capacity;
                int local_group;
                int i;
 
@@ -4764,6 +4848,25 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                                load = target_load(i, load_idx);
 
                        avg_load += load;
+
+                       /*
+                        * Look for the most energy-efficient group that can
+                        * fit the task.
+                        */
+                       if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) {
+                               fit_capacity = capacity_of(i);
+                               fit_group = group;
+                       }
+
+                       /*
+                        * Look for the group with the most spare capacity
+                        * on a single cpu.
+                        */
+                       spare_capacity = capacity_of(i) - cpu_util(i);
+                       if (spare_capacity > max_spare_capacity) {
+                               max_spare_capacity = spare_capacity;
+                               spare_group = group;
+                       }
                }
 
                /* Adjust by relative CPU capacity of the group */
@@ -4777,6 +4880,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                }
        } while (group = group->next, group != sd->groups);
 
+       if (fit_group)
+               return fit_group;
+
+       if (spare_group)
+               return spare_group;
+
        if (!idlest || 100*this_load < imbalance*min_load)
                return NULL;
        return idlest;
@@ -4797,7 +4906,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 
        /* Traverse only the allowed CPUs */
        for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
-               if (idle_cpu(i)) {
+               if (task_fits_spare(p, i)) {
                        struct rq *rq = cpu_rq(i);
                        struct cpuidle_state *idle = idle_get_state(rq);
                        if (idle && idle->exit_latency < min_exit_latency) {
@@ -4809,7 +4918,8 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
                                min_exit_latency = idle->exit_latency;
                                latest_idle_timestamp = rq->idle_stamp;
                                shallowest_idle_cpu = i;
-                       } else if ((!idle || idle->exit_latency == min_exit_latency) &&
+                       } else if (idle_cpu(i) &&
+                                  (!idle || idle->exit_latency == min_exit_latency) &&
                                   rq->idle_stamp > latest_idle_timestamp) {
                                /*
                                 * If equal or no active idle state, then
@@ -4818,6 +4928,13 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
                                 */
                                latest_idle_timestamp = rq->idle_stamp;
                                shallowest_idle_cpu = i;
+                       } else if (shallowest_idle_cpu == -1) {
+                               /*
+                                * If we haven't found an idle CPU yet
+                                * pick a non-idle one that can fit the task as
+                                * fallback.
+                                */
+                               shallowest_idle_cpu = i;
                        }
                } else if (shallowest_idle_cpu == -1) {
                        load = weighted_cpuload(i);
@@ -4932,7 +5049,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
        int sync = wake_flags & WF_SYNC;
 
        if (sd_flag & SD_BALANCE_WAKE)
-               want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+               want_affine = !wake_wide(p) && task_fits_max(p, cpu) &&
+                             cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
 
        rcu_read_lock();
        for_each_domain(cpu, tmp) {
@@ -5532,6 +5650,7 @@ struct lb_env {
        int                     new_dst_cpu;
        enum cpu_idle_type      idle;
        long                    imbalance;
+       unsigned int            src_grp_nr_running;
        /* The set of CPUs under consideration for load-balancing */
        struct cpumask          *cpus;
 
@@ -6494,6 +6613,8 @@ next_group:
        if (env->sd->flags & SD_NUMA)
                env->fbq_type = fbq_classify_group(&sds->busiest_stat);
 
+       env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
+
        if (!env->sd->parent) {
                /* update overload indicator if we are at root domain */
                if (env->dst_rq->rd->overload != overload)
@@ -6903,6 +7024,13 @@ static int need_active_balance(struct lb_env *env)
                        return 1;
        }
 
+       if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
+           env->src_rq->cfs.h_nr_running == 1 &&
+           cpu_overutilized(env->src_cpu) &&
+           !cpu_overutilized(env->dst_cpu)) {
+               return 1;
+       }
+
        return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
 }
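
Taken together with cpu_overutilized() from earlier in the patch, the new clause above lets the balancer actively up-migrate a lone task that has outgrown a lower-capacity CPU. With illustrative numbers: a source CPU of capacity 430 running a single task of util 400 is overutilized (430 * 1024 = 440320 < 400 * 1280 = 512000), while an idle destination CPU of capacity 1024 is not (1024 * 1024 = 1048576 is not below 0 * 1280), so need_active_balance() returns 1 without waiting for nr_balance_failed to grow. The hunk below is the flip side: nr_balance_failed is only bumped when the busiest group has more than one running task, the idea being that repeatedly failing to pull a group's sole (typically currently running) task should not escalate into an active balance that merely shuffles that one task around.
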
 
@@ -7115,7 +7243,8 @@ more_balance:
                 * excessive cache_hot migrations and active balances.
                 */
                if (idle != CPU_NEWLY_IDLE)
-                       sd->nr_balance_failed++;
+                       if (env.src_grp_nr_running > 1)
+                               sd->nr_balance_failed++;
 
                if (need_active_balance(&env)) {
                        raw_spin_lock_irqsave(&busiest->lock, flags);