sched, nohz: Change rq->nr_running to always use wrappers
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7570dd969c2838e9aab12ba9cb2c24cb87e21855..f7cac2ba62ea337ee70bf66a0171eb3b5cc8af3d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1301,7 +1301,16 @@ static int task_numa_migrate(struct task_struct *p)
        if (env.best_cpu == -1)
                return -EAGAIN;
 
-       sched_setnuma(p, env.dst_nid);
+       /*
+        * If the task is part of a workload that spans multiple NUMA nodes,
+        * and is migrating into one of the workload's active nodes, remember
+        * this node as the task's preferred numa node, so the workload can
+        * settle down.
+        * A task that migrated to a second choice node will be better off
+        * trying for a better one later. Do not set the preferred node here.
+        */
+       if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes))
+               sched_setnuma(p, env.dst_nid);
 
        /*
         * Reset the scan period if the task is being rescheduled on an
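
For context, active_nodes is a nodemask kept in the task's shared numa_group and maintained elsewhere in this file from the group's per-node fault counts; a node stays in the mask only while it attracts a significant share of those faults. An abbreviated sketch of the group state the new check relies on (field list illustrative, not the full definition):

struct numa_group {
	atomic_t	refcount;
	spinlock_t	lock;		/* protects the fault statistics */
	int		nr_tasks;
	nodemask_t	active_nodes;	/* nodes the workload actively uses */
	unsigned long	total_faults;
	unsigned long	faults[0];	/* per-node fault counters */
};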
@@ -1326,12 +1335,15 @@ static int task_numa_migrate(struct task_struct *p)
 /* Attempt to migrate a task to a CPU on the preferred node. */
 static void numa_migrate_preferred(struct task_struct *p)
 {
+       unsigned long interval = HZ;
+
        /* This task has no NUMA fault statistics yet */
        if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
                return;
 
        /* Periodically retry migrating the task to the preferred node */
-       p->numa_migrate_retry = jiffies + HZ;
+       interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
+       p->numa_migrate_retry = jiffies + interval;
 
        /* Success if task is already running on preferred CPU */
        if (task_node(p) == p->numa_preferred_nid)
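
A quick worked example of the new retry interval: with HZ=1000 and a 1000ms scan period, interval = min(1000, msecs_to_jiffies(1000) / 16) = min(1000, 62) = 62 jiffies, so a misplaced task retries migration after roughly 62ms rather than a fixed second; at the 60000ms end of the scan-period range the old one-second cap (HZ) still applies.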
@@ -1738,6 +1750,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
        struct task_struct *p = current;
        bool migrated = flags & TNF_MIGRATED;
        int cpu_node = task_node(current);
+       int local = !!(flags & TNF_FAULT_LOCAL);
        int priv;
 
        if (!numabalancing_enabled)
@@ -1786,6 +1799,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
                        task_numa_group(p, last_cpupid, flags, &priv);
        }
 
+       /*
+        * If a workload spans multiple NUMA nodes, a shared fault that
+        * occurs wholly within the set of nodes that the workload is
+        * actively using should be counted as local. This allows the
+        * scan rate to slow down when a workload has settled down.
+        */
+       if (!priv && !local && p->numa_group &&
+                       node_isset(cpu_node, p->numa_group->active_nodes) &&
+                       node_isset(mem_node, p->numa_group->active_nodes))
+               local = 1;
+
        task_numa_placement(p);
 
        /*
@@ -1800,7 +1824,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 
        p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
        p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
-       p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
+       p->numa_faults_locality[local] += pages;
 }
 
 static void reset_ptenuma_scan(struct task_struct *p)
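
The locality array filled in above drives the scan-rate heuristics; a simplified sketch of how update_task_scan_period() in this file consumes it:

/* simplified sketch of update_task_scan_period() */
unsigned long remote = p->numa_faults_locality[0];
unsigned long local  = p->numa_faults_locality[1];

/*
 * A high local/remote ratio lets p->numa_scan_period grow (cheaper, less
 * frequent scanning); a low ratio shrinks it again. Counting group-local
 * faults as local above is what lets the scan rate back off once a
 * multi-node workload has settled.
 */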
@@ -3301,7 +3325,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
        }
 
        if (!se)
-               rq->nr_running -= task_delta;
+               sub_nr_running(rq, task_delta);
 
        cfs_rq->throttled = 1;
        cfs_rq->throttled_clock = rq_clock(rq);
@@ -3352,7 +3376,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
        }
 
        if (!se)
-               rq->nr_running += task_delta;
+               add_nr_running(rq, task_delta);
 
        /* determine whether we need to wake up potentially idle cpu */
        if (rq->curr == rq->idle && rq->cfs.nr_running)
@@ -3884,7 +3908,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
        if (!se) {
                update_rq_runnable_avg(rq, rq->nr_running);
-               inc_nr_running(rq);
+               add_nr_running(rq, 1);
        }
        hrtick_update(rq);
 }
@@ -3944,7 +3968,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        }
 
        if (!se) {
-               dec_nr_running(rq);
+               sub_nr_running(rq, 1);
                update_rq_runnable_avg(rq, 1);
        }
        hrtick_update(rq);
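
For reference, the add_nr_running()/sub_nr_running() wrappers this commit switches to are defined in kernel/sched/sched.h; roughly (a sketch, the exact nohz-full kick differs between kernel versions):

static inline void add_nr_running(struct rq *rq, unsigned count)
{
	unsigned prev_nr = rq->nr_running;

	rq->nr_running = prev_nr + count;

#ifdef CONFIG_NO_HZ_FULL
	if (prev_nr < 2 && rq->nr_running >= 2) {
		/* A second runnable task arrived: a nohz-full CPU
		 * needs its tick back, so kick it. */
		if (tick_nohz_full_cpu(rq->cpu)) {
			smp_wmb();	/* order the write vs. the IPI */
			smp_send_reschedule(rq->cpu);
		}
	}
#endif
}

static inline void sub_nr_running(struct rq *rq, unsigned count)
{
	rq->nr_running -= count;
}

Routing every rq->nr_running update through these wrappers is the point of the commit: the nohz-full "second task arrived" check can no longer be bypassed by an open-coded increment.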
@@ -4449,10 +4473,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
                        sd = tmp;
        }
 
-       if (affine_sd) {
-               if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
-                       prev_cpu = cpu;
+       if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+               prev_cpu = cpu;
 
+       if (sd_flag & SD_BALANCE_WAKE) {
                new_cpu = select_idle_sibling(p, prev_cpu);
                goto unlock;
        }
@@ -4520,6 +4544,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
                atomic_long_add(se->avg.load_avg_contrib,
                                                &cfs_rq->removed_load);
        }
+
+       /* We have migrated, no longer consider this task hot */
+       se->exec_start = 0;
 }
 #endif /* CONFIG_SMP */
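
The "no longer consider this task hot" note refers to task_hot() in the load-balancing code, which calls a task cache-hot when it ran on its old CPU within sysctl_sched_migration_cost nanoseconds. Zeroing exec_start makes that delta enormous; a simplified sketch of the relevant check:

static int task_hot(struct task_struct *p, struct lb_env *env)
{
	s64 delta;

	if (sysctl_sched_migration_cost == 0)
		return 0;

	delta = rq_clock_task(env->src_rq) - p->se.exec_start;

	/* with exec_start == 0 the delta is huge, so a freshly
	 * migrated task is never reported as cache-hot */
	return delta < (s64)sysctl_sched_migration_cost;
}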
 
@@ -5564,6 +5591,7 @@ static unsigned long scale_rt_power(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
        u64 total, available, age_stamp, avg;
+       s64 delta;
 
        /*
         * Since we're reading these variables without serialization make sure
@@ -5572,7 +5600,11 @@ static unsigned long scale_rt_power(int cpu)
        age_stamp = ACCESS_ONCE(rq->age_stamp);
        avg = ACCESS_ONCE(rq->rt_avg);
 
-       total = sched_avg_period() + (rq_clock(rq) - age_stamp);
+       delta = rq_clock(rq) - age_stamp;
+       if (unlikely(delta < 0))
+               delta = 0;
+
+       total = sched_avg_period() + delta;
 
        if (unlikely(total < avg)) {
                /* Ensures that power won't end up being negative */
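
The clamp matters because age_stamp and rq_clock() are sampled without serialization (see the comment above): if the clock sample momentarily lags age_stamp, the old unsigned subtraction wrapped a small negative difference into an enormous u64, and a bogus total propagated into the CPU power calculation. Clamping the signed delta to zero simply falls back to total = sched_avg_period() in that race window.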
@@ -6640,27 +6672,62 @@ out:
        return ld_moved;
 }
 
+static inline unsigned long
+get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
+{
+       unsigned long interval = sd->balance_interval;
+
+       if (cpu_busy)
+               interval *= sd->busy_factor;
+
+       /* scale ms to jiffies */
+       interval = msecs_to_jiffies(interval);
+       interval = clamp(interval, 1UL, max_load_balance_interval);
+
+       return interval;
+}
+
+static inline void
+update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
+{
+       unsigned long interval, next;
+
+       interval = get_sd_balance_interval(sd, cpu_busy);
+       next = sd->last_balance + interval;
+
+       if (time_after(*next_balance, next))
+               *next_balance = next;
+}
+
 /*
  * idle_balance is called by schedule() if this_cpu is about to become
  * idle. Attempts to pull tasks from other CPUs.
  */
 static int idle_balance(struct rq *this_rq)
 {
+       unsigned long next_balance = jiffies + HZ;
+       int this_cpu = this_rq->cpu;
        struct sched_domain *sd;
        int pulled_task = 0;
-       unsigned long next_balance = jiffies + HZ;
        u64 curr_cost = 0;
-       int this_cpu = this_rq->cpu;
 
        idle_enter_fair(this_rq);
+
        /*
         * We must set idle_stamp _before_ calling idle_balance(), such that we
         * measure the duration of idle_balance() as idle time.
         */
        this_rq->idle_stamp = rq_clock(this_rq);
 
-       if (this_rq->avg_idle < sysctl_sched_migration_cost)
+       if (this_rq->avg_idle < sysctl_sched_migration_cost) {
+               rcu_read_lock();
+               sd = rcu_dereference_check_sched_domain(this_rq->sd);
+               if (sd)
+                       update_next_balance(sd, 0, &next_balance);
+               rcu_read_unlock();
+
                goto out;
+       }
 
        /*
         * Drop the rq->lock, but keep IRQ/preempt disabled.
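
A worked example for the get_sd_balance_interval() helper introduced above, using illustrative values: with sd->balance_interval = 64 (ms) and sd->busy_factor = 32, a busy CPU asks for 2048ms, which is converted to jiffies and then clamped to [1, max_load_balance_interval] (the global cap computed in update_max_interval() from HZ and the number of online CPUs), while an idle CPU skips the busy_factor and rebalances at the raw 64ms. update_next_balance() then only ever pulls *next_balance earlier, never later.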
@@ -6670,20 +6737,20 @@ static int idle_balance(struct rq *this_rq)
        update_blocked_averages(this_cpu);
        rcu_read_lock();
        for_each_domain(this_cpu, sd) {
-               unsigned long interval;
                int continue_balancing = 1;
                u64 t0, domain_cost;
 
                if (!(sd->flags & SD_LOAD_BALANCE))
                        continue;
 
-               if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
+               if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
+                       update_next_balance(sd, 0, &next_balance);
                        break;
+               }
 
                if (sd->flags & SD_BALANCE_NEWIDLE) {
                        t0 = sched_clock_cpu(this_cpu);
 
-                       /* If we've pulled tasks over stop searching: */
                        pulled_task = load_balance(this_cpu, this_rq,
                                                   sd, CPU_NEWLY_IDLE,
                                                   &continue_balancing);
@@ -6695,42 +6762,37 @@ static int idle_balance(struct rq *this_rq)
                        curr_cost += domain_cost;
                }
 
-               interval = msecs_to_jiffies(sd->balance_interval);
-               if (time_after(next_balance, sd->last_balance + interval))
-                       next_balance = sd->last_balance + interval;
-               if (pulled_task)
+               update_next_balance(sd, 0, &next_balance);
+
+               /*
+                * Stop searching for tasks to pull if there are
+                * now runnable tasks on this rq.
+                */
+               if (pulled_task || this_rq->nr_running > 0)
                        break;
        }
        rcu_read_unlock();
 
        raw_spin_lock(&this_rq->lock);
 
+       if (curr_cost > this_rq->max_idle_balance_cost)
+               this_rq->max_idle_balance_cost = curr_cost;
+
        /*
-        * While browsing the domains, we released the rq lock.
-        * A task could have be enqueued in the meantime
+        * While browsing the domains, we released the rq lock, a task could
+        * have been enqueued in the meantime. Since we're not going idle,
+        * pretend we pulled a task.
         */
-       if (this_rq->cfs.h_nr_running && !pulled_task) {
+       if (this_rq->cfs.h_nr_running && !pulled_task)
                pulled_task = 1;
-               goto out;
-       }
 
-       if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
-               /*
-                * We are going idle. next_balance may be set based on
-                * a busy processor. So reset next_balance.
-                */
+out:
+       /* Move the next balance forward */
+       if (time_after(this_rq->next_balance, next_balance))
                this_rq->next_balance = next_balance;
-       }
 
-       if (curr_cost > this_rq->max_idle_balance_cost)
-               this_rq->max_idle_balance_cost = curr_cost;
-
-out:
        /* Is there a task of a high priority class? */
-       if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
-           ((this_rq->stop && this_rq->stop->on_rq) ||
-            this_rq->dl.dl_nr_running ||
-            (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
+       if (this_rq->nr_running != this_rq->cfs.h_nr_running)
                pulled_task = -1;
 
        if (pulled_task) {
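
The -1 return matters to the caller: pick_next_task_fair() treats it as "a higher-priority class became runnable while the rq lock was dropped, redo the pick". Roughly, at the call site (sketch):

	new_tasks = idle_balance(rq);

	/*
	 * idle_balance() drops and re-takes rq->lock, so a task from a
	 * higher scheduling class may have appeared; a negative return
	 * asks the core to restart task selection from the top class.
	 */
	if (new_tasks < 0)
		return RETRY_TASK;

	if (new_tasks > 0)
		goto again;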
@@ -7011,16 +7073,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
                        break;
                }
 
-               interval = sd->balance_interval;
-               if (idle != CPU_IDLE)
-                       interval *= sd->busy_factor;
-
-               /* scale ms to jiffies */
-               interval = msecs_to_jiffies(interval);
-               interval = clamp(interval, 1UL, max_load_balance_interval);
+               interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
 
                need_serialize = sd->flags & SD_SERIALIZE;
-
                if (need_serialize) {
                        if (!spin_trylock(&balancing))
                                goto out;
@@ -7036,6 +7091,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
                                idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
                        }
                        sd->last_balance = jiffies;
+                       interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
                }
                if (need_serialize)
                        spin_unlock(&balancing);
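
Re-computing the interval after a successful load_balance() is deliberate: the idle_cpu() re-check just above may have changed the CPU's idle classification, and the second get_sd_balance_interval() call picks that up, so the busy_factor scaling is applied (or dropped) when setting this domain's next balance point.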