Merge remote-tracking branch 'lsk/v3.10/topic/gator' into linux-linaro-lsk
[firefly-linux-kernel-4.4.55.git] / kernel / sched / fair.c
index ffa643ce9116df8fdf69e7447c0be4d3f3d93d9c..97ed132c809a9c203567a7e28833757beab65aef 100644 (file)
@@ -948,6 +948,13 @@ void task_numa_work(struct callback_head *work)
                if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
                        continue;
 
+               /*
+                * Skip inaccessible VMAs to avoid any confusion between
+                * PROT_NONE and NUMA hinting ptes
+                */
+               if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+                       continue;
+
                do {
                        start = max(start, vma->vm_start);
                        end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
@@ -2173,6 +2180,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
         */
        update_entity_load_avg(curr, 1);
        update_cfs_rq_blocked_load(cfs_rq, 1);
+       update_cfs_shares(cfs_rq);
 
 #ifdef CONFIG_SCHED_HRTICK
        /*
@@ -2210,13 +2218,14 @@ static inline bool cfs_bandwidth_used(void)
        return static_key_false(&__cfs_bandwidth_used);
 }
 
-void account_cfs_bandwidth_used(int enabled, int was_enabled)
+void cfs_bandwidth_usage_inc(void)
+{
+       static_key_slow_inc(&__cfs_bandwidth_used);
+}
+
+void cfs_bandwidth_usage_dec(void)
 {
-       /* only need to count groups transitioning between enabled/!enabled */
-       if (enabled && !was_enabled)
-               static_key_slow_inc(&__cfs_bandwidth_used);
-       else if (!enabled && was_enabled)
-               static_key_slow_dec(&__cfs_bandwidth_used);
+       static_key_slow_dec(&__cfs_bandwidth_used);
 }
 #else /* HAVE_JUMP_LABEL */
 static bool cfs_bandwidth_used(void)
@@ -2224,7 +2233,8 @@ static bool cfs_bandwidth_used(void)
        return true;
 }
 
-void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
+void cfs_bandwidth_usage_inc(void) {}
+void cfs_bandwidth_usage_dec(void) {}
 #endif /* HAVE_JUMP_LABEL */
 
 /*
@@ -2476,6 +2486,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
        cfs_rq->throttled_clock = rq->clock;
        raw_spin_lock(&cfs_b->lock);
        list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+       if (!cfs_b->timer_active)
+               __start_cfs_bandwidth(cfs_b);
        raw_spin_unlock(&cfs_b->lock);
 }
 
@@ -2587,6 +2599,13 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
        if (idle)
                goto out_unlock;
 
+       /*
+        * If we have relooped after returning idle once, we need to update our
+        * status as actually running, so that other CPUs doing
+        * __start_cfs_bandwidth will stop trying to cancel us.
+        */
+       cfs_b->timer_active = 1;
+
        __refill_cfs_bandwidth_runtime(cfs_b);
 
        if (!throttled) {
@@ -2647,7 +2666,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
 /* how long we wait to gather additional slack before distributing */
 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
 
-/* are we near the end of the current quota period? */
+/*
+ * Are we near the end of the current quota period?
+ *
+ * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
+ * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
+ * migrate_hrtimers, base is never cleared, so we are fine.
+ */
 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
 {
        struct hrtimer *refresh_timer = &cfs_b->period_timer;
@@ -2723,10 +2748,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
        u64 expires;
 
        /* confirm we're still not at a refresh boundary */
-       if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
+       raw_spin_lock(&cfs_b->lock);
+       if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
+               raw_spin_unlock(&cfs_b->lock);
                return;
+       }
 
-       raw_spin_lock(&cfs_b->lock);
        if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
                runtime = cfs_b->runtime;
                cfs_b->runtime = 0;
@@ -2851,11 +2878,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
         * (timer_active==0 becomes visible before the hrtimer call-back
         * terminates).  In either case we ensure that it's re-programmed
         */
-       while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
+       while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
+              hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
+               /* bounce the lock to allow do_sched_cfs_period_timer to run */
                raw_spin_unlock(&cfs_b->lock);
-               /* ensure cfs_b->lock is available while we wait */
-               hrtimer_cancel(&cfs_b->period_timer);
-
+               cpu_relax();
                raw_spin_lock(&cfs_b->lock);
                /* if someone else restarted the timer then we're done */
                if (cfs_b->timer_active)
@@ -3560,10 +3587,12 @@ static enum hrtimer_restart hmp_cpu_keepalive_notify(struct hrtimer *hrtimer)
  * If there are any, set ns_delay to
  * ('target_residency of state with shortest too-big latency' - 1) * 1000.
  */
-static void hmp_keepalive_delay(unsigned int *ns_delay)
+static void hmp_keepalive_delay(int cpu, unsigned int *ns_delay)
 {
+       struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);
        struct cpuidle_driver *drv;
-       drv = cpuidle_driver_ref();
+
+       drv = cpuidle_get_cpu_driver(dev);
        if (drv) {
                unsigned int us_delay = UINT_MAX;
                unsigned int us_max_delay = *ns_delay / 1000;
@@ -3582,7 +3611,6 @@ static void hmp_keepalive_delay(unsigned int *ns_delay)
                else
                        *ns_delay = 1000 * (us_delay - 1);
        }
-       cpuidle_driver_unref();
 }
 
 static void hmp_cpu_keepalive_trigger(void)
@@ -3596,7 +3624,7 @@ static void hmp_cpu_keepalive_trigger(void)
                                CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
                keepalive->timer.function = hmp_cpu_keepalive_notify;
 
-               hmp_keepalive_delay(&ns_delay);
+               hmp_keepalive_delay(cpu, &ns_delay);
                keepalive->delay = ns_to_ktime(ns_delay);
                keepalive->init = true;
        }
@@ -4456,7 +4484,11 @@ unlock:
 #else
                new_cpu = hmp_select_slower_cpu(p, prev_cpu);
 #endif
-               if (new_cpu != prev_cpu) {
+               /*
+                * We might have no suitable CPU,
+                * in which case new_cpu == NR_CPUS.
+                */
+               if (new_cpu < NR_CPUS && new_cpu != prev_cpu) {
                        hmp_next_down_delay(&p->se, new_cpu);
                        trace_sched_hmp_migrate(p, new_cpu, HMP_MIGRATE_WAKEUP);
                        return new_cpu;
@@ -7418,11 +7450,15 @@ static void task_fork_fair(struct task_struct *p)
        cfs_rq = task_cfs_rq(current);
        curr = cfs_rq->curr;
 
-       if (unlikely(task_cpu(p) != this_cpu)) {
-               rcu_read_lock();
-               __set_task_cpu(p, this_cpu);
-               rcu_read_unlock();
-       }
+       /*
+        * Not only the cpu but also the task_group of the parent might have
+        * been changed after parent->se.parent,cfs_rq were copied to
+        * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
+        * of child point to valid ones.
+        */
+       rcu_read_lock();
+       __set_task_cpu(p, this_cpu);
+       rcu_read_unlock();
 
        update_curr(cfs_rq);
 
@@ -7472,15 +7508,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
        /*
-        * Ensure the task's vruntime is normalized, so that when its
+        * Ensure the task's vruntime is normalized, so that when it's
         * switched back to the fair class the enqueue_entity(.flags=0) will
         * do the right thing.
         *
-        * If it was on_rq, then the dequeue_entity(.flags=0) will already
-        * have normalized the vruntime, if it was !on_rq, then only when
+        * If it's on_rq, then the dequeue_entity(.flags=0) will already
+        * have normalized the vruntime; if it's !on_rq, then only when
         * the task is sleeping will it still have non-normalized vruntime.
         */
-       if (!se->on_rq && p->state != TASK_RUNNING) {
+       if (!p->on_rq && p->state != TASK_RUNNING) {
                /*
                 * Fix up our vruntime so that the current sleep doesn't
                 * cause 'unlimited' sleep bonus.
@@ -7701,7 +7737,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
                se->cfs_rq = parent->my_q;
 
        se->my_q = cfs_rq;
-       update_load_set(&se->load, 0);
+       /* guarantee group entities always have weight */
+       update_load_set(&se->load, NICE_0_LOAD);
        se->parent = parent;
 }