sched, nohz: Change rq->nr_running to always use wrappers
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7570dd969c2838e9aab12ba9cb2c24cb87e21855..f7cac2ba62ea337ee70bf66a0171eb3b5cc8af3d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1301,7 +1301,16 @@ static int task_numa_migrate(struct task_struct *p)
        if (env.best_cpu == -1)
                return -EAGAIN;
 
-       sched_setnuma(p, env.dst_nid);
+       /*
+        * If the task is part of a workload that spans multiple NUMA nodes,
+        * and is migrating into one of the workload's active nodes, remember
+        * this node as the task's preferred numa node, so the workload can
+        * settle down.
+        * A task that migrated to a second choice node will be better off
+        * trying for a better one later. Do not set the preferred node here.
+        */
+       if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes))
+               sched_setnuma(p, env.dst_nid);
 
        /*
         * Reset the scan period if the task is being rescheduled on an
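
For context, active_nodes is a nodemask kept in the task's shared numa_group and maintained elsewhere in this file from the group's per-node fault counts; a node stays in the mask only while it attracts a significant share of those faults. An abbreviated sketch of the group state the new check relies on (field list illustrative, not the full definition):

struct numa_group {
	atomic_t	refcount;
	spinlock_t	lock;		/* protects the fault statistics */
	int		nr_tasks;
	nodemask_t	active_nodes;	/* nodes the workload actively uses */
	unsigned long	total_faults;
	unsigned long	faults[0];	/* per-node fault counters */
};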
@@ -1326,12 +1335,15 @@ static int task_numa_migrate(struct task_struct *p)
 /* Attempt to migrate a task to a CPU on the preferred node. */
 static void numa_migrate_preferred(struct task_struct *p)
 {
+       unsigned long interval = HZ;
+
        /* This task has no NUMA fault statistics yet */
        if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
                return;
 
        /* Periodically retry migrating the task to the preferred node */
-       p->numa_migrate_retry = jiffies + HZ;
+       interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
+       p->numa_migrate_retry = jiffies + interval;
 
        /* Success if task is already running on preferred CPU */
        if (task_node(p) == p->numa_preferred_nid)
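
A quick worked example of the new retry interval: with HZ=1000 and a 1000ms scan period, interval = min(1000, msecs_to_jiffies(1000) / 16) = min(1000, 62) = 62 jiffies, so a misplaced task retries migration after roughly 62ms rather than a fixed second; at the 60000ms end of the scan-period range the old one-second cap (HZ) still applies.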
@@ -1738,6 +1750,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
        struct task_struct *p = current;
        bool migrated = flags & TNF_MIGRATED;
        int cpu_node = task_node(current);
+       int local = !!(flags & TNF_FAULT_LOCAL);
        int priv;
 
        if (!numabalancing_enabled)
@@ -1786,6 +1799,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
                        task_numa_group(p, last_cpupid, flags, &priv);
        }
 
+       /*
+        * If a workload spans multiple NUMA nodes, a shared fault that
+        * occurs wholly within the set of nodes that the workload is
+        * actively using should be counted as local. This allows the
+        * scan rate to slow down when a workload has settled down.
+        */
+       if (!priv && !local && p->numa_group &&
+                       node_isset(cpu_node, p->numa_group->active_nodes) &&
+                       node_isset(mem_node, p->numa_group->active_nodes))
+               local = 1;
+
        task_numa_placement(p);
 
        /*
@@ -1800,7 +1824,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 
        p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
        p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
-       p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
+       p->numa_faults_locality[local] += pages;
 }
 
 static void reset_ptenuma_scan(struct task_struct *p)
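
The locality array filled in above drives the scan-rate heuristics; a simplified sketch of how update_task_scan_period() in this file consumes it:

/* simplified sketch of update_task_scan_period() */
unsigned long remote = p->numa_faults_locality[0];
unsigned long local  = p->numa_faults_locality[1];

/*
 * A high local/remote ratio lets p->numa_scan_period grow (cheaper, less
 * frequent scanning); a low ratio shrinks it again. Counting group-local
 * faults as local above is what lets the scan rate back off once a
 * multi-node workload has settled.
 */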
@@ -3301,7 +3325,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
        }
 
        if (!se)
-               rq->nr_running -= task_delta;
+               sub_nr_running(rq, task_delta);
 
        cfs_rq->throttled = 1;
        cfs_rq->throttled_clock = rq_clock(rq);
@@ -3352,7 +3376,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
        }
 
        if (!se)
-               rq->nr_running += task_delta;
+               add_nr_running(rq, task_delta);
 
        /* determine whether we need to wake up potentially idle cpu */
        if (rq->curr == rq->idle && rq->cfs.nr_running)
@@ -3884,7 +3908,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
        if (!se) {
                update_rq_runnable_avg(rq, rq->nr_running);
-               inc_nr_running(rq);
+               add_nr_running(rq, 1);
        }
        hrtick_update(rq);
 }
@@ -3944,7 +3968,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        }
 
        if (!se) {
-               dec_nr_running(rq);
+               sub_nr_running(rq, 1);
                update_rq_runnable_avg(rq, 1);
        }
        hrtick_update(rq);
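
For reference, the add_nr_running()/sub_nr_running() wrappers this commit switches to are defined in kernel/sched/sched.h; roughly (a sketch, the exact nohz-full kick differs between kernel versions):

static inline void add_nr_running(struct rq *rq, unsigned count)
{
	unsigned prev_nr = rq->nr_running;

	rq->nr_running = prev_nr + count;

#ifdef CONFIG_NO_HZ_FULL
	if (prev_nr < 2 && rq->nr_running >= 2) {
		/* A second runnable task arrived: a nohz-full CPU
		 * needs its tick back, so kick it. */
		if (tick_nohz_full_cpu(rq->cpu)) {
			smp_wmb();	/* order the write vs. the IPI */
			smp_send_reschedule(rq->cpu);
		}
	}
#endif
}

static inline void sub_nr_running(struct rq *rq, unsigned count)
{
	rq->nr_running -= count;
}

Routing every rq->nr_running update through these wrappers is the point of the commit: the nohz-full "second task arrived" check can no longer be bypassed by an open-coded increment.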
@@ -4449,10 +4473,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
                        sd = tmp;
        }
 
-       if (affine_sd) {
-               if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
-                       prev_cpu = cpu;
+       if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+               prev_cpu = cpu;
 
+       if (sd_flag & SD_BALANCE_WAKE) {
                new_cpu = select_idle_sibling(p, prev_cpu);
                goto unlock;
        }
@@ -4520,6 +4544,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
                atomic_long_add(se->avg.load_avg_contrib,
                                                &cfs_rq->removed_load);
        }
+
+       /* We have migrated, no longer consider this task hot */
+       se->exec_start = 0;
 }
 #endif /* CONFIG_SMP */
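
The "no longer consider this task hot" note refers to task_hot() in the load-balancing code, which calls a task cache-hot when it ran on its old CPU within sysctl_sched_migration_cost nanoseconds. Zeroing exec_start makes that delta enormous; a simplified sketch of the relevant check:

static int task_hot(struct task_struct *p, struct lb_env *env)
{
	s64 delta;

	if (sysctl_sched_migration_cost == 0)
		return 0;

	delta = rq_clock_task(env->src_rq) - p->se.exec_start;

	/* with exec_start == 0 the delta is huge, so a freshly
	 * migrated task is never reported as cache-hot */
	return delta < (s64)sysctl_sched_migration_cost;
}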
 
@@ -5564,6 +5591,7 @@ static unsigned long scale_rt_power(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
        u64 total, available, age_stamp, avg;
+       s64 delta;
 
        /*
         * Since we're reading these variables without serialization make sure
@@ -5572,7 +5600,11 @@ static unsigned long scale_rt_power(int cpu)
        age_stamp = ACCESS_ONCE(rq->age_stamp);
        avg = ACCESS_ONCE(rq->rt_avg);
 
-       total = sched_avg_period() + (rq_clock(rq) - age_stamp);
+       delta = rq_clock(rq) - age_stamp;
+       if (unlikely(delta < 0))
+               delta = 0;
+
+       total = sched_avg_period() + delta;
 
        if (unlikely(total < avg)) {
                /* Ensures that power won't end up being negative */
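
The clamp matters because age_stamp and rq_clock() are sampled without serialization (see the comment above): if the clock sample momentarily lags age_stamp, the old unsigned subtraction wrapped a small negative difference into an enormous u64, and a bogus total propagated into the CPU power calculation. Clamping the signed delta to zero simply falls back to total = sched_avg_period() in that race window.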
@@ -6640,27 +6672,62 @@ out:
        return ld_moved;
 }
 
+static inline unsigned long
+get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
+{
+       unsigned long interval = sd->balance_interval;
+
+       if (cpu_busy)
+               interval *= sd->busy_factor;
+
+       /* scale ms to jiffies */
+       interval = msecs_to_jiffies(interval);
+       interval = clamp(interval, 1UL, max_load_balance_interval);
+
+       return interval;
+}
+
+static inline void
+update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
+{
+       unsigned long interval, next;
+
+       interval = get_sd_balance_interval(sd, cpu_busy);
+       next = sd->last_balance + interval;
+
+       if (time_after(*next_balance, next))
+               *next_balance = next;
+}
+
 /*
  * idle_balance is called by schedule() if this_cpu is about to become
  * idle. Attempts to pull tasks from other CPUs.
  */
 static int idle_balance(struct rq *this_rq)
 {
+       unsigned long next_balance = jiffies + HZ;
+       int this_cpu = this_rq->cpu;
        struct sched_domain *sd;
        int pulled_task = 0;
-       unsigned long next_balance = jiffies + HZ;
        u64 curr_cost = 0;
-       int this_cpu = this_rq->cpu;
 
        idle_enter_fair(this_rq);
+
        /*
         * We must set idle_stamp _before_ calling idle_balance(), such that we
         * measure the duration of idle_balance() as idle time.
         */
        this_rq->idle_stamp = rq_clock(this_rq);
 
-       if (this_rq->avg_idle < sysctl_sched_migration_cost)
+       if (this_rq->avg_idle < sysctl_sched_migration_cost) {
+               rcu_read_lock();
+               sd = rcu_dereference_check_sched_domain(this_rq->sd);
+               if (sd)
+                       update_next_balance(sd, 0, &next_balance);
+               rcu_read_unlock();
+
                goto out;
+       }
 
        /*
         * Drop the rq->lock, but keep IRQ/preempt disabled.
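
A worked example for the get_sd_balance_interval() helper introduced above, using illustrative values: with sd->balance_interval = 64 (ms) and sd->busy_factor = 32, a busy CPU asks for 2048ms, which is converted to jiffies and then clamped to [1, max_load_balance_interval] (the global cap computed in update_max_interval() from HZ and the number of online CPUs), while an idle CPU skips the busy_factor and rebalances at the raw 64ms. update_next_balance() then only ever pulls *next_balance earlier, never later.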
@@ -6670,20 +6737,20 @@ static int idle_balance(struct rq *this_rq)
        update_blocked_averages(this_cpu);
        rcu_read_lock();
        for_each_domain(this_cpu, sd) {
-               unsigned long interval;
                int continue_balancing = 1;
                u64 t0, domain_cost;
 
                if (!(sd->flags & SD_LOAD_BALANCE))
                        continue;
 
-               if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
+               if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
+                       update_next_balance(sd, 0, &next_balance);
                        break;
+               }
 
                if (sd->flags & SD_BALANCE_NEWIDLE) {
                        t0 = sched_clock_cpu(this_cpu);
 
-                       /* If we've pulled tasks over stop searching: */
                        pulled_task = load_balance(this_cpu, this_rq,
                                                   sd, CPU_NEWLY_IDLE,
                                                   &continue_balancing);
@@ -6695,42 +6762,37 @@ static int idle_balance(struct rq *this_rq)
                        curr_cost += domain_cost;
                }
 
-               interval = msecs_to_jiffies(sd->balance_interval);
-               if (time_after(next_balance, sd->last_balance + interval))
-                       next_balance = sd->last_balance + interval;
-               if (pulled_task)
+               update_next_balance(sd, 0, &next_balance);
+
+               /*
+                * Stop searching for tasks to pull if there are
+                * now runnable tasks on this rq.
+                */
+               if (pulled_task || this_rq->nr_running > 0)
                        break;
        }
        rcu_read_unlock();
 
        raw_spin_lock(&this_rq->lock);
 
+       if (curr_cost > this_rq->max_idle_balance_cost)
+               this_rq->max_idle_balance_cost = curr_cost;
+
        /*
-        * While browsing the domains, we released the rq lock.
-        * A task could have be enqueued in the meantime
+        * While browsing the domains, we released the rq lock, a task could
+        * have been enqueued in the meantime. Since we're not going idle,
+        * pretend we pulled a task.
         */
-       if (this_rq->cfs.h_nr_running && !pulled_task) {
+       if (this_rq->cfs.h_nr_running && !pulled_task)
                pulled_task = 1;
-               goto out;
-       }
 
-       if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
-               /*
-                * We are going idle. next_balance may be set based on
-                * a busy processor. So reset next_balance.
-                */
+out:
+       /* Move the next balance forward */
+       if (time_after(this_rq->next_balance, next_balance))
                this_rq->next_balance = next_balance;
-       }
 
-       if (curr_cost > this_rq->max_idle_balance_cost)
-               this_rq->max_idle_balance_cost = curr_cost;
-
-out:
        /* Is there a task of a high priority class? */
-       if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
-           ((this_rq->stop && this_rq->stop->on_rq) ||
-            this_rq->dl.dl_nr_running ||
-            (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
+       if (this_rq->nr_running != this_rq->cfs.h_nr_running)
                pulled_task = -1;
 
        if (pulled_task) {
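
The -1 return matters to the caller: pick_next_task_fair() treats it as "a higher-priority class became runnable while the rq lock was dropped, redo the pick". Roughly, at the call site (sketch):

	new_tasks = idle_balance(rq);

	/*
	 * idle_balance() drops and re-takes rq->lock, so a task from a
	 * higher scheduling class may have appeared; a negative return
	 * asks the core to restart task selection from the top class.
	 */
	if (new_tasks < 0)
		return RETRY_TASK;

	if (new_tasks > 0)
		goto again;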
@@ -7011,16 +7073,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
                        break;
                }
 
-               interval = sd->balance_interval;
-               if (idle != CPU_IDLE)
-                       interval *= sd->busy_factor;
-
-               /* scale ms to jiffies */
-               interval = msecs_to_jiffies(interval);
-               interval = clamp(interval, 1UL, max_load_balance_interval);
+               interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
 
                need_serialize = sd->flags & SD_SERIALIZE;
-
                if (need_serialize) {
                        if (!spin_trylock(&balancing))
                                goto out;
@@ -7036,6 +7091,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
                                idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
                        }
                        sd->last_balance = jiffies;
+                       interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
                }
                if (need_serialize)
                        spin_unlock(&balancing);
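
Re-computing the interval after a successful load_balance() is deliberate: the idle_cpu() re-check just above may have changed the CPU's idle classification, and the second get_sd_balance_interval() call picks that up, so the busy_factor scaling is applied (or dropped) when setting this domain's next balance point.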