Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...

author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 4 Nov 2015 02:03:50 +0000 (18:03 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 4 Nov 2015 02:03:50 +0000 (18:03 -0800)
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h

index b12f81022a6b2c574f9c037e616f90373502c24e..01bcde84d3e40fb36e16625f0c1024f1291bb9ae 100644 (file)
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -30,12 +30,9 @@ static __always_inline void preempt_count_set(int pc)
  /*
   * must be macros to avoid header recursion hell
   */
-#define init_task_preempt_count(p) do { \
-       task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \
-} while (0)
+#define init_task_preempt_count(p) do { } while (0)
  
  #define init_idle_preempt_count(p, cpu) do { \
-       task_thread_info(p)->saved_preempt_count = PREEMPT_ENABLED; \
         per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \
  } while (0)
  
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h

index 8afdc3e44247c3a53cc5513ddb0d8f2f84340a4c..809877e9030b4680097edbbb53009511eb758396 100644 (file)
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -57,7 +57,6 @@ struct thread_info {
         __u32                   flags;          /* low level flags */
         __u32                   status;         /* thread synchronous flags */
         __u32                   cpu;            /* current CPU */
-       int                     saved_preempt_count;
         mm_segment_t            addr_limit;
         void __user             *sysenter_return;
         unsigned int            sig_on_uaccess_error:1;
@@ -69,7 +68,6 @@ struct thread_info {
         .task           = &tsk,                 \
         .flags          = 0,                    \
         .cpu            = 0,                    \
-       .saved_preempt_count = INIT_PREEMPT_COUNT,      \
         .addr_limit     = KERNEL_DS,            \
  }
  
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c

index 737527b40e5bf40bb1e757b635cc30994db911bd..9f950917528b332b139afb6fa854c87c8d92c4d0 100644 (file)
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -279,14 +279,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
         if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
                 set_iopl_mask(next->iopl);
  
-       /*
-        * If it were not for PREEMPT_ACTIVE we could guarantee that the
-        * preempt_count of all tasks was equal here and this would not be
-        * needed.
-        */
-       task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
-       this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
-
         /*
          * Now maybe handle debug registers and/or IO bitmaps
          */
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c

index b35921a670b25b03878e3f9ac2e96abfae0e910c..e835d263a33b43ccf7601698cfeadc447ede491e 100644 (file)
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -332,7 +332,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
         /*
          * Switch FS and GS.
          *
-        * These are even more complicated than FS and GS: they have
+        * These are even more complicated than DS and ES: they have
          * 64-bit bases are that controlled by arch_prctl.  Those bases
          * only differ from the values in the GDT or LDT if the selector
          * is 0.
@@ -401,14 +401,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
          */
         this_cpu_write(current_task, next_p);
  
-       /*
-        * If it were not for PREEMPT_ACTIVE we could guarantee that the
-        * preempt_count of all tasks was equal here and this would not be
-        * needed.
-        */
-       task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
-       this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
-
         /* Reload esp0 and ss1.  This changes current_thread_info(). */
         load_sp0(tss, next);
  
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h

index 0bec580a48854f0a49b012b203663ebeddf98257..5d8ffa3e6f8c8a4e3715f3da4294abc860390386 100644 (file)
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -24,7 +24,7 @@ static __always_inline void preempt_count_set(int pc)
   * must be macros to avoid header recursion hell
   */
  #define init_task_preempt_count(p) do { \
-       task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \
+       task_thread_info(p)->preempt_count = FORK_PREEMPT_COUNT; \
  } while (0)
  
  #define init_idle_preempt_count(p, cpu) do { \
diff --git a/include/linux/preempt.h b/include/linux/preempt.h

index bea8dd8ff5e026f8fc3e3bc446af7aea7ec891e2..75e4e30677f1254fb2c9f076f71fc4e2df1fb9be 100644 (file)
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -26,7 +26,6 @@
   *         SOFTIRQ_MASK:       0x0000ff00
   *         HARDIRQ_MASK:       0x000f0000
   *             NMI_MASK:       0x00100000
- *       PREEMPT_ACTIVE:       0x00200000
   * PREEMPT_NEED_RESCHED:       0x80000000
   */
  #define PREEMPT_BITS   8
@@ -53,10 +52,6 @@
  
  #define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
  
-#define PREEMPT_ACTIVE_BITS    1
-#define PREEMPT_ACTIVE_SHIFT   (NMI_SHIFT + NMI_BITS)
-#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
-
  /* We use the MSB mostly because its available */
  #define PREEMPT_NEED_RESCHED   0x80000000
  
@@ -126,8 +121,7 @@
   * Check whether we were atomic before we did preempt_disable():
   * (used by the scheduler)
   */
-#define in_atomic_preempt_off() \
-               ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_DISABLE_OFFSET)
+#define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET)
  
  #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
  extern void preempt_count_add(int val);
@@ -146,18 +140,6 @@ extern void preempt_count_sub(int val);
  #define preempt_count_inc() preempt_count_add(1)
  #define preempt_count_dec() preempt_count_sub(1)
  
-#define preempt_active_enter() \
-do { \
-       preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
-       barrier(); \
-} while (0)
-
-#define preempt_active_exit() \
-do { \
-       barrier(); \
-       preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
-} while (0)
-
  #ifdef CONFIG_PREEMPT_COUNT
  
  #define preempt_disable() \
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 56667292d1e444df2e9f21fc8281caf32e44f3f5..9e1e06c3ce051e63862a22e7edb5897e7027591e 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -599,20 +599,26 @@ struct task_cputime_atomic {
                 .sum_exec_runtime = ATOMIC64_INIT(0),           \
         }
  
-#ifdef CONFIG_PREEMPT_COUNT
-#define PREEMPT_DISABLED       (1 + PREEMPT_ENABLED)
-#else
-#define PREEMPT_DISABLED       PREEMPT_ENABLED
-#endif
+#define PREEMPT_DISABLED       (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
+
+/*
+ * Disable preemption until the scheduler is running -- use an unconditional
+ * value so that it also works on !PREEMPT_COUNT kernels.
+ *
+ * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
+ */
+#define INIT_PREEMPT_COUNT     PREEMPT_OFFSET
  
  /*
- * Disable preemption until the scheduler is running.
- * Reset by start_kernel()->sched_init()->init_idle().
+ * Initial preempt_count value; reflects the preempt_count schedule invariant
+ * which states that during context switches:
   *
- * We include PREEMPT_ACTIVE to avoid cond_resched() from working
- * before the scheduler is active -- see should_resched().
+ *    preempt_count() == 2*PREEMPT_DISABLE_OFFSET
+ *
+ * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
+ * Note: See finish_task_switch().
   */
-#define INIT_PREEMPT_COUNT     (PREEMPT_DISABLED + PREEMPT_ACTIVE)
+#define FORK_PREEMPT_COUNT     (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
  
  /**
   * struct thread_group_cputimer - thread group interval timer counts
@@ -1142,8 +1148,6 @@ struct sched_domain_topology_level {
  #endif
  };
  
-extern struct sched_domain_topology_level *sched_domain_topology;
-
  extern void set_sched_topology(struct sched_domain_topology_level *tl);
  extern void wake_up_if_idle(int cpu);
  
@@ -1192,10 +1196,10 @@ struct load_weight {
  
  /*
   * The load_avg/util_avg accumulates an infinite geometric series.
- * 1) load_avg factors the amount of time that a sched_entity is
- * runnable on a rq into its weight. For cfs_rq, it is the aggregated
- * such weights of all runnable and blocked sched_entities.
- * 2) util_avg factors frequency scaling into the amount of time
+ * 1) load_avg factors the frequency-scaled amount of time that a
+ * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the
+ * aggregated such weights of all runnable and blocked sched_entities.
+ * 2) util_avg factors frequency and cpu scaling into the amount of time
   * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
   * For cfs_rq, it is the aggregated such times of all runnable and
   * blocked sched_entities.
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h

index 9d303b8847df28e4851746515fd9ee388e1cf8e1..9089a2ae913ddf4d12f10a7bcff209bbd1894400 100644 (file)
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -21,4 +21,9 @@ static inline int dl_task(struct task_struct *p)
         return dl_prio(p->prio);
  }
  
+static inline bool dl_time_before(u64 a, u64 b)
+{
+       return (s64)(a - b) < 0;
+}
+
  #endif /* _SCHED_DEADLINE_H */
diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h

index e6109a6cd8f65eb779163d1a084a6e0256a11db0..12910cf19869c7db32d1ca34ddda0931e80570d1 100644 (file)
--- a/include/linux/smpboot.h
+++ b/include/linux/smpboot.h
@@ -24,9 +24,6 @@ struct smpboot_thread_data;
   *                     parked (cpu offline)
   * @unpark:            Optional unpark function, called when the thread is
   *                     unparked (cpu online)
- * @pre_unpark:                Optional unpark function, called before the thread is
- *                     unparked (cpu online). This is not guaranteed to be
- *                     called on the target cpu of the thread. Careful!
   * @cpumask:           Internal state.  To update which threads are unparked,
   *                     call smpboot_update_cpumask_percpu_thread().
   * @selfparking:       Thread is not parked by the park function.
@@ -42,7 +39,6 @@ struct smp_hotplug_thread {
         void                            (*cleanup)(unsigned int cpu, bool online);
         void                            (*park)(unsigned int cpu);
         void                            (*unpark)(unsigned int cpu);
-       void                            (*pre_unpark)(unsigned int cpu);
         cpumask_var_t                   cpumask;
         bool                            selfparking;
         const char                      *thread_comm;
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h

index 414d924318ce1ba57a448a485d8594a99a9012d5..0adedca24c5bfbd4ca7a25b80641e76e0e638738 100644 (file)
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -33,6 +33,8 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
                          struct cpu_stop_work *work_buf);
  int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
  int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
+void stop_machine_park(int cpu);
+void stop_machine_unpark(int cpu);
  
  #else  /* CONFIG_SMP */
  
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h

index 539d6bc3216a3784f9ad5b4d1e3ef06e7a4cc223..9b90c57517a918687189933ae6920b80d251e98e 100644 (file)
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -104,22 +104,17 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
              TP_ARGS(p));
  
  #ifdef CREATE_TRACE_POINTS
-static inline long __trace_sched_switch_state(struct task_struct *p)
+static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p)
  {
-       long state = p->state;
-
-#ifdef CONFIG_PREEMPT
  #ifdef CONFIG_SCHED_DEBUG
         BUG_ON(p != current);
  #endif /* CONFIG_SCHED_DEBUG */
+
         /*
-        * For all intents and purposes a preempted task is a running task.
+        * Preemption ignores task state, therefore preempted tasks are always
+        * RUNNING (we will not have dequeued if state != RUNNING).
          */
-       if (preempt_count() & PREEMPT_ACTIVE)
-               state = TASK_RUNNING | TASK_STATE_MAX;
-#endif /* CONFIG_PREEMPT */
-
-       return state;
+       return preempt ? TASK_RUNNING | TASK_STATE_MAX : p->state;
  }
  #endif /* CREATE_TRACE_POINTS */
  
@@ -128,10 +123,11 @@ static inline long __trace_sched_switch_state(struct task_struct *p)
   */
  TRACE_EVENT(sched_switch,
  
-       TP_PROTO(struct task_struct *prev,
+       TP_PROTO(bool preempt,
+                struct task_struct *prev,
                  struct task_struct *next),
  
-       TP_ARGS(prev, next),
+       TP_ARGS(preempt, prev, next),
  
         TP_STRUCT__entry(
                 __array(        char,   prev_comm,      TASK_COMM_LEN   )
@@ -147,7 +143,7 @@ TRACE_EVENT(sched_switch,
                 memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
                 __entry->prev_pid       = prev->pid;
                 __entry->prev_prio      = prev->prio;
-               __entry->prev_state     = __trace_sched_switch_state(prev);
+               __entry->prev_state     = __trace_sched_switch_state(preempt, prev);
                 memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
                 __entry->next_pid       = next->pid;
                 __entry->next_prio      = next->prio;
diff --git a/kernel/cpu.c b/kernel/cpu.c

index 14a9cdf8abe9e806083ca3041c2861f6ef0713b3..85ff5e26e23b45b34201120c758082599f995b7e 100644 (file)
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -291,8 +291,8 @@ static inline void check_for_tasks(int dead_cpu)
  {
         struct task_struct *g, *p;
  
-       read_lock_irq(&tasklist_lock);
-       do_each_thread(g, p) {
+       read_lock(&tasklist_lock);
+       for_each_process_thread(g, p) {
                 if (!p->on_rq)
                         continue;
                 /*
@@ -307,8 +307,8 @@ static inline void check_for_tasks(int dead_cpu)
  
                 pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
                         p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
-       } while_each_thread(g, p);
-       read_unlock_irq(&tasklist_lock);
+       }
+       read_unlock(&tasklist_lock);
  }
  
  struct take_cpu_down_param {
@@ -331,7 +331,7 @@ static int take_cpu_down(void *_param)
         /* Give up timekeeping duties */
         tick_handover_do_timer();
         /* Park the stopper thread */
-       kthread_park(current);
+       stop_machine_park((long)param->hcpu);
         return 0;
  }
  
diff --git a/kernel/exit.c b/kernel/exit.c

index 0e93b63bbc59292815f49ddc95c0eceafa48b7e4..07110c6020a04ea37c04bc18bd0b9287cd0466dc 100644 (file)
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -706,10 +706,12 @@ void do_exit(long code)
         smp_mb();
         raw_spin_unlock_wait(&tsk->pi_lock);
  
-       if (unlikely(in_atomic()))
+       if (unlikely(in_atomic())) {
                 pr_info("note: %s[%d] exited with preempt_count %d\n",
                         current->comm, task_pid_nr(current),
                         preempt_count());
+               preempt_count_set(PREEMPT_ENABLED);
+       }
  
         /* sync mm's RSS info before statistics gathering */
         if (tsk->mm)
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c

index bbb72b4f64a148de2bcf186acf5b2e08d2d77207..8251e75dd9c0bd67337754baf6f03fb2cf1f956f 100644 (file)
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -170,7 +170,8 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left,
          * then right waiter has a dl_prio() too.
          */
         if (dl_prio(left->prio))
-               return (left->task->dl.deadline < right->task->dl.deadline);
+               return dl_time_before(left->task->dl.deadline,
+                                     right->task->dl.deadline);
  
         return 0;
  }
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index f7402f7eb44803a6659b5f2b6337d64624dd8d0a..aa5973220ad213a960092012bf4493f296dab90b 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -817,7 +817,7 @@ static void set_load_weight(struct task_struct *p)
         /*
          * SCHED_IDLE tasks get minimal weight:
          */
-       if (p->policy == SCHED_IDLE) {
+       if (idle_policy(p->policy)) {
                 load->weight = scale_load(WEIGHT_IDLEPRIO);
                 load->inv_weight = WMULT_IDLEPRIO;
                 return;
@@ -827,17 +827,19 @@ static void set_load_weight(struct task_struct *p)
         load->inv_weight = prio_to_wmult[prio];
  }
  
-static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
  {
         update_rq_clock(rq);
-       sched_info_queued(rq, p);
+       if (!(flags & ENQUEUE_RESTORE))
+               sched_info_queued(rq, p);
         p->sched_class->enqueue_task(rq, p, flags);
  }
  
-static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
  {
         update_rq_clock(rq);
-       sched_info_dequeued(rq, p);
+       if (!(flags & DEQUEUE_SAVE))
+               sched_info_dequeued(rq, p);
         p->sched_class->dequeue_task(rq, p, flags);
  }
  
@@ -1178,7 +1180,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
                  * holding rq->lock.
                  */
                 lockdep_assert_held(&rq->lock);
-               dequeue_task(rq, p, 0);
+               dequeue_task(rq, p, DEQUEUE_SAVE);
         }
         if (running)
                 put_prev_task(rq, p);
@@ -1188,7 +1190,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
         if (running)
                 p->sched_class->set_curr_task(rq);
         if (queued)
-               enqueue_task(rq, p, 0);
+               enqueue_task(rq, p, ENQUEUE_RESTORE);
  }
  
  /*
@@ -1292,7 +1294,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
  
         if (task_cpu(p) != new_cpu) {
                 if (p->sched_class->migrate_task_rq)
-                       p->sched_class->migrate_task_rq(p, new_cpu);
+                       p->sched_class->migrate_task_rq(p);
                 p->se.nr_migrations++;
                 perf_event_task_migrate(p);
         }
@@ -1333,12 +1335,16 @@ static int migrate_swap_stop(void *data)
         struct rq *src_rq, *dst_rq;
         int ret = -EAGAIN;
  
+       if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
+               return -EAGAIN;
+
         src_rq = cpu_rq(arg->src_cpu);
         dst_rq = cpu_rq(arg->dst_cpu);
  
         double_raw_lock(&arg->src_task->pi_lock,
                         &arg->dst_task->pi_lock);
         double_rq_lock(src_rq, dst_rq);
+
         if (task_cpu(arg->dst_task) != arg->dst_cpu)
                 goto unlock;
  
@@ -1574,13 +1580,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
                         goto out;
                 }
  
+               /* No more Mr. Nice Guy. */
                 switch (state) {
                 case cpuset:
-                       /* No more Mr. Nice Guy. */
-                       cpuset_cpus_allowed_fallback(p);
-                       state = possible;
-                       break;
-
+                       if (IS_ENABLED(CONFIG_CPUSETS)) {
+                               cpuset_cpus_allowed_fallback(p);
+                               state = possible;
+                               break;
+                       }
+                       /* fall-through */
                 case possible:
                         do_set_cpus_allowed(p, cpu_possible_mask);
                         state = fail;
@@ -1692,7 +1700,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
  #endif /* CONFIG_SCHEDSTATS */
  }
  
-static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
  {
         activate_task(rq, p, en_flags);
         p->on_rq = TASK_ON_RQ_QUEUED;
@@ -2114,23 +2122,17 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
  #endif /* CONFIG_NUMA_BALANCING */
  }
  
+DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
+
  #ifdef CONFIG_NUMA_BALANCING
-#ifdef CONFIG_SCHED_DEBUG
+
  void set_numabalancing_state(bool enabled)
  {
         if (enabled)
-               sched_feat_set("NUMA");
+               static_branch_enable(&sched_numa_balancing);
         else
-               sched_feat_set("NO_NUMA");
+               static_branch_disable(&sched_numa_balancing);
  }
-#else
-__read_mostly bool numabalancing_enabled;
-
-void set_numabalancing_state(bool enabled)
-{
-       numabalancing_enabled = enabled;
-}
-#endif /* CONFIG_SCHED_DEBUG */
  
  #ifdef CONFIG_PROC_SYSCTL
  int sysctl_numa_balancing(struct ctl_table *table, int write,
@@ -2138,7 +2140,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
  {
         struct ctl_table t;
         int err;
-       int state = numabalancing_enabled;
+       int state = static_branch_likely(&sched_numa_balancing);
  
         if (write && !capable(CAP_SYS_ADMIN))
                 return -EPERM;
@@ -2349,6 +2351,8 @@ void wake_up_new_task(struct task_struct *p)
         struct rq *rq;
  
         raw_spin_lock_irqsave(&p->pi_lock, flags);
+       /* Initialize new task's runnable average */
+       init_entity_runnable_average(&p->se);
  #ifdef CONFIG_SMP
         /*
          * Fork balancing, do it here and not earlier because:
@@ -2358,8 +2362,6 @@ void wake_up_new_task(struct task_struct *p)
         set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
  #endif
  
-       /* Initialize new task's runnable average */
-       init_entity_runnable_average(&p->se);
         rq = __task_rq_lock(p);
         activate_task(rq, p, 0);
         p->on_rq = TASK_ON_RQ_QUEUED;
@@ -2483,7 +2485,6 @@ static inline void
  prepare_task_switch(struct rq *rq, struct task_struct *prev,
                     struct task_struct *next)
  {
-       trace_sched_switch(prev, next);
         sched_info_switch(rq, prev, next);
         perf_event_task_sched_out(prev, next);
         fire_sched_out_preempt_notifiers(prev, next);
@@ -2517,6 +2518,22 @@ static struct rq *finish_task_switch(struct task_struct *prev)
         struct mm_struct *mm = rq->prev_mm;
         long prev_state;
  
+       /*
+        * The previous task will have left us with a preempt_count of 2
+        * because it left us after:
+        *
+        *      schedule()
+        *        preempt_disable();                    // 1
+        *        __schedule()
+        *          raw_spin_lock_irq(&rq->lock)        // 2
+        *
+        * Also, see FORK_PREEMPT_COUNT.
+        */
+       if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
+                     "corrupted preempt_count: %s/%d/0x%x\n",
+                     current->comm, current->pid, preempt_count()))
+               preempt_count_set(FORK_PREEMPT_COUNT);
+
         rq->prev_mm = NULL;
  
         /*
@@ -2601,8 +2618,15 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
  {
         struct rq *rq;
  
-       /* finish_task_switch() drops rq->lock and enables preemtion */
-       preempt_disable();
+       /*
+        * New tasks start with FORK_PREEMPT_COUNT, see there and
+        * finish_task_switch() for details.
+        *
+        * finish_task_switch() will drop rq->lock and lower preempt_count,
+        * and the preempt_enable() below will end up enabling preemption (on
+        * PREEMPT_COUNT kernels).
+        */
+
         rq = finish_task_switch(prev);
         balance_callback(rq);
         preempt_enable();
@@ -2960,15 +2984,13 @@ static noinline void __schedule_bug(struct task_struct *prev)
  static inline void schedule_debug(struct task_struct *prev)
  {
  #ifdef CONFIG_SCHED_STACK_END_CHECK
-       BUG_ON(unlikely(task_stack_end_corrupted(prev)));
+       BUG_ON(task_stack_end_corrupted(prev));
  #endif
-       /*
-        * Test if we are atomic. Since do_exit() needs to call into
-        * schedule() atomically, we ignore that path. Otherwise whine
-        * if we are scheduling when we should not.
-        */
-       if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
+
+       if (unlikely(in_atomic_preempt_off())) {
                 __schedule_bug(prev);
+               preempt_count_set(PREEMPT_DISABLED);
+       }
         rcu_sleep_check();
  
         profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -3054,7 +3076,7 @@ again:
   *
   * WARNING: must be called with preemption disabled!
   */
-static void __sched __schedule(void)
+static void __sched notrace __schedule(bool preempt)
  {
         struct task_struct *prev, *next;
         unsigned long *switch_count;
@@ -3066,6 +3088,17 @@ static void __sched __schedule(void)
         rcu_note_context_switch();
         prev = rq->curr;
  
+       /*
+        * do_exit() calls schedule() with preemption disabled as an exception;
+        * however we must fix that up, otherwise the next task will see an
+        * inconsistent (higher) preempt count.
+        *
+        * It also keeps the schedule_debug() test below from complaining
+        * about this.
+        */
+       if (unlikely(prev->state == TASK_DEAD))
+               preempt_enable_no_resched_notrace();
+
         schedule_debug(prev);
  
         if (sched_feat(HRTICK))
@@ -3083,7 +3116,7 @@ static void __sched __schedule(void)
         rq->clock_skip_update <<= 1; /* promote REQ to ACT */
  
         switch_count = &prev->nivcsw;
-       if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+       if (!preempt && prev->state) {
                 if (unlikely(signal_pending_state(prev->state, prev))) {
                         prev->state = TASK_RUNNING;
                 } else {
@@ -3119,6 +3152,7 @@ static void __sched __schedule(void)
                 rq->curr = next;
                 ++*switch_count;
  
+               trace_sched_switch(preempt, prev, next);
                 rq = context_switch(rq, prev, next); /* unlocks the rq */
                 cpu = cpu_of(rq);
         } else {
@@ -3148,7 +3182,7 @@ asmlinkage __visible void __sched schedule(void)
         sched_submit_work(tsk);
         do {
                 preempt_disable();
-               __schedule();
+               __schedule(false);
                 sched_preempt_enable_no_resched();
         } while (need_resched());
  }
@@ -3188,9 +3222,9 @@ void __sched schedule_preempt_disabled(void)
  static void __sched notrace preempt_schedule_common(void)
  {
         do {
-               preempt_active_enter();
-               __schedule();
-               preempt_active_exit();
+               preempt_disable_notrace();
+               __schedule(true);
+               preempt_enable_no_resched_notrace();
  
                 /*
                  * Check again in case we missed a preemption opportunity
@@ -3241,24 +3275,17 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
                 return;
  
         do {
-               /*
-                * Use raw __prempt_count() ops that don't call function.
-                * We can't call functions before disabling preemption which
-                * disarm preemption tracing recursions.
-                */
-               __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
-               barrier();
+               preempt_disable_notrace();
                 /*
                  * Needs preempt disabled in case user_exit() is traced
                  * and the tracer calls preempt_enable_notrace() causing
                  * an infinite recursion.
                  */
                 prev_ctx = exception_enter();
-               __schedule();
+               __schedule(true);
                 exception_exit(prev_ctx);
  
-               barrier();
-               __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
+               preempt_enable_no_resched_notrace();
         } while (need_resched());
  }
  EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
@@ -3281,11 +3308,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
         prev_state = exception_enter();
  
         do {
-               preempt_active_enter();
+               preempt_disable();
                 local_irq_enable();
-               __schedule();
+               __schedule(true);
                 local_irq_disable();
-               preempt_active_exit();
+               sched_preempt_enable_no_resched();
         } while (need_resched());
  
         exception_exit(prev_state);
@@ -3313,7 +3340,7 @@ EXPORT_SYMBOL(default_wake_function);
   */
  void rt_mutex_setprio(struct task_struct *p, int prio)
  {
-       int oldprio, queued, running, enqueue_flag = 0;
+       int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
         struct rq *rq;
         const struct sched_class *prev_class;
  
@@ -3345,7 +3372,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
         queued = task_on_rq_queued(p);
         running = task_current(rq, p);
         if (queued)
-               dequeue_task(rq, p, 0);
+               dequeue_task(rq, p, DEQUEUE_SAVE);
         if (running)
                 put_prev_task(rq, p);
  
@@ -3363,7 +3390,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                 if (!dl_prio(p->normal_prio) ||
                     (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
                         p->dl.dl_boosted = 1;
-                       enqueue_flag = ENQUEUE_REPLENISH;
+                       enqueue_flag |= ENQUEUE_REPLENISH;
                 } else
                         p->dl.dl_boosted = 0;
                 p->sched_class = &dl_sched_class;
@@ -3371,7 +3398,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                 if (dl_prio(oldprio))
                         p->dl.dl_boosted = 0;
                 if (oldprio < prio)
-                       enqueue_flag = ENQUEUE_HEAD;
+                       enqueue_flag |= ENQUEUE_HEAD;
                 p->sched_class = &rt_sched_class;
         } else {
                 if (dl_prio(oldprio))
@@ -3423,7 +3450,7 @@ void set_user_nice(struct task_struct *p, long nice)
         }
         queued = task_on_rq_queued(p);
         if (queued)
-               dequeue_task(rq, p, 0);
+               dequeue_task(rq, p, DEQUEUE_SAVE);
  
         p->static_prio = NICE_TO_PRIO(nice);
         set_load_weight(p);
@@ -3432,7 +3459,7 @@ void set_user_nice(struct task_struct *p, long nice)
         delta = p->prio - old_prio;
  
         if (queued) {
-               enqueue_task(rq, p, 0);
+               enqueue_task(rq, p, ENQUEUE_RESTORE);
                 /*
                  * If the task increased its priority or is running and
                  * lowered its priority, then reschedule its CPU:
@@ -3753,10 +3780,7 @@ recheck:
         } else {
                 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
  
-               if (policy != SCHED_DEADLINE &&
-                               policy != SCHED_FIFO && policy != SCHED_RR &&
-                               policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-                               policy != SCHED_IDLE)
+               if (!valid_policy(policy))
                         return -EINVAL;
         }
  
@@ -3812,7 +3836,7 @@ recheck:
                  * Treat SCHED_IDLE as nice 20. Only allow a switch to
                  * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
                  */
-               if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+               if (idle_policy(p->policy) && !idle_policy(policy)) {
                         if (!can_nice(p, task_nice(p)))
                                 return -EPERM;
                 }
@@ -3937,7 +3961,7 @@ change:
         queued = task_on_rq_queued(p);
         running = task_current(rq, p);
         if (queued)
-               dequeue_task(rq, p, 0);
+               dequeue_task(rq, p, DEQUEUE_SAVE);
         if (running)
                 put_prev_task(rq, p);
  
@@ -3947,11 +3971,15 @@ change:
         if (running)
                 p->sched_class->set_curr_task(rq);
         if (queued) {
+               int enqueue_flags = ENQUEUE_RESTORE;
                 /*
                  * We enqueue to tail when the priority of a task is
                  * increased (user space view).
                  */
-               enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+               if (oldprio <= p->prio)
+                       enqueue_flags |= ENQUEUE_HEAD;
+
+               enqueue_task(rq, p, enqueue_flags);
         }
  
         check_class_changed(rq, p, prev_class, oldprio);
@@ -5101,7 +5129,7 @@ void sched_setnuma(struct task_struct *p, int nid)
         running = task_current(rq, p);
  
         if (queued)
-               dequeue_task(rq, p, 0);
+               dequeue_task(rq, p, DEQUEUE_SAVE);
         if (running)
                 put_prev_task(rq, p);
  
@@ -5110,7 +5138,7 @@ void sched_setnuma(struct task_struct *p, int nid)
         if (running)
                 p->sched_class->set_curr_task(rq);
         if (queued)
-               enqueue_task(rq, p, 0);
+               enqueue_task(rq, p, ENQUEUE_RESTORE);
         task_rq_unlock(rq, p, &flags);
  }
  #endif /* CONFIG_NUMA_BALANCING */
@@ -5531,21 +5559,27 @@ static void set_cpu_rq_start_time(void)
  static int sched_cpu_active(struct notifier_block *nfb,
                                       unsigned long action, void *hcpu)
  {
+       int cpu = (long)hcpu;
+
         switch (action & ~CPU_TASKS_FROZEN) {
         case CPU_STARTING:
                 set_cpu_rq_start_time();
                 return NOTIFY_OK;
+
         case CPU_ONLINE:
                 /*
                  * At this point a starting CPU has marked itself as online via
                  * set_cpu_online(). But it might not yet have marked itself
                  * as active, which is essential from here on.
-                *
-                * Thus, fall-through and help the starting CPU along.
                  */
+               set_cpu_active(cpu, true);
+               stop_machine_unpark(cpu);
+               return NOTIFY_OK;
+
         case CPU_DOWN_FAILED:
-               set_cpu_active((long)hcpu, true);
+               set_cpu_active(cpu, true);
                 return NOTIFY_OK;
+
         default:
                 return NOTIFY_DONE;
         }
@@ -6477,7 +6511,8 @@ static struct sched_domain_topology_level default_topology[] = {
         { NULL, },
  };
  
-struct sched_domain_topology_level *sched_domain_topology = default_topology;
+static struct sched_domain_topology_level *sched_domain_topology =
+       default_topology;
  
  #define for_each_sd_topology(tl)                       \
         for (tl = sched_domain_topology; tl->mask; tl++)
@@ -7478,7 +7513,7 @@ void __init sched_init(void)
  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
  static inline int preempt_count_equals(int preempt_offset)
  {
-       int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
+       int nested = preempt_count() + rcu_preempt_depth();
  
         return (nested == preempt_offset);
  }
@@ -7725,7 +7760,7 @@ void sched_move_task(struct task_struct *tsk)
         queued = task_on_rq_queued(tsk);
  
         if (queued)
-               dequeue_task(rq, tsk, 0);
+               dequeue_task(rq, tsk, DEQUEUE_SAVE);
         if (unlikely(running))
                 put_prev_task(rq, tsk);
  
@@ -7741,7 +7776,7 @@ void sched_move_task(struct task_struct *tsk)
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
         if (tsk->sched_class->task_move_group)
-               tsk->sched_class->task_move_group(tsk, queued);
+               tsk->sched_class->task_move_group(tsk);
         else
  #endif
                 set_task_rq(tsk, task_cpu(tsk));
@@ -7749,7 +7784,7 @@ void sched_move_task(struct task_struct *tsk)
         if (unlikely(running))
                 tsk->sched_class->set_curr_task(rq);
         if (queued)
-               enqueue_task(rq, tsk, 0);
+               enqueue_task(rq, tsk, ENQUEUE_RESTORE);
  
         task_rq_unlock(rq, tsk, &flags);
  }
@@ -8213,14 +8248,6 @@ static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
                             struct cgroup_subsys_state *old_css,
                             struct task_struct *task)
  {
-       /*
-        * cgroup_exit() is called in the copy_process() failure path.
-        * Ignore this case since the task hasn't ran yet, this avoids
-        * trying to poke a half freed task state from generic code.
-        */
-       if (!(task->flags & PF_EXITING))
-               return;
-
         sched_move_task(task);
  }
  
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c

index c6acb07466bb82b1143af4aba1da5e483f628e4f..5a75b08cfd8576d830adf9fc9df52d807c052be9 100644 (file)
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -31,11 +31,6 @@ static inline int right_child(int i)
         return (i << 1) + 2;
  }
  
-static inline int dl_time_before(u64 a, u64 b)
-{
-       return (s64)(a - b) < 0;
-}
-
  static void cpudl_exchange(struct cpudl *cp, int a, int b)
  {
         int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h

index 1a0a6ef2fbe1be030e32895571a1d267e26579e4..fcbdf83fed7e31afc4ee916f94d272d03117a2c2 100644 (file)
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -2,6 +2,7 @@
  #define _LINUX_CPUDL_H
  
  #include <linux/sched.h>
+#include <linux/sched/deadline.h>
  
  #define IDX_INVALID     -1
  
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 9a5e60fe721a5e7e8036508f61950eeb0c8dea31..824aa9f501a3b2a183a5a700dee57c984c51f89e 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -661,11 +661,12 @@ static unsigned long task_h_load(struct task_struct *p);
  
  /*
   * We choose a half-life close to 1 scheduling period.
- * Note: The tables below are dependent on this value.
+ * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
+ * dependent on this value.
   */
  #define LOAD_AVG_PERIOD 32
  #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
-#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
+#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
  
  /* Give a new sched_entity initial runnable values that weight its load heavily in its infancy */
  void init_entity_runnable_average(struct sched_entity *se)
@@ -682,7 +683,7 @@ void init_entity_runnable_average(struct sched_entity *se)
         sa->load_avg = scale_load_down(se->load.weight);
         sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
         sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
-       sa->util_sum = LOAD_AVG_MAX;
+       sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
         /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
  }
  
@@ -2069,7 +2070,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
         int local = !!(flags & TNF_FAULT_LOCAL);
         int priv;
  
-       if (!numabalancing_enabled)
+       if (!static_branch_likely(&sched_numa_balancing))
                 return;
  
         /* for example, ksmd faulting in a user's mm */
@@ -2157,7 +2158,7 @@ void task_numa_work(struct callback_head *work)
         struct vm_area_struct *vma;
         unsigned long start, end;
         unsigned long nr_pte_updates = 0;
-       long pages;
+       long pages, virtpages;
  
         WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
  
@@ -2203,9 +2204,11 @@ void task_numa_work(struct callback_head *work)
         start = mm->numa_scan_offset;
         pages = sysctl_numa_balancing_scan_size;
         pages <<= 20 - PAGE_SHIFT; /* MB in pages */
+       virtpages = pages * 8;     /* Scan up to this much virtual space */
         if (!pages)
                 return;
  
+
         down_read(&mm->mmap_sem);
         vma = find_vma(mm, start);
         if (!vma) {
@@ -2240,18 +2243,22 @@ void task_numa_work(struct callback_head *work)
                         start = max(start, vma->vm_start);
                         end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
                         end = min(end, vma->vm_end);
-                       nr_pte_updates += change_prot_numa(vma, start, end);
+                       nr_pte_updates = change_prot_numa(vma, start, end);
  
                         /*
-                        * Scan sysctl_numa_balancing_scan_size but ensure that
-                        * at least one PTE is updated so that unused virtual
-                        * address space is quickly skipped.
+                        * Try to scan sysctl_numa_balancing_scan_size worth of
+                        * hpages that have at least one present PTE that
+                        * is not already pte-numa. If the VMA contains
+                        * areas that are unused or already full of prot_numa
+                        * PTEs, scan up to virtpages, to skip through those
+                        * areas faster.
                          */
                         if (nr_pte_updates)
                                 pages -= (end - start) >> PAGE_SHIFT;
+                       virtpages -= (end - start) >> PAGE_SHIFT;
  
                         start = end;
-                       if (pages <= 0)
+                       if (pages <= 0 || virtpages <= 0)
                                 goto out;
  
                         cond_resched();
@@ -2515,6 +2522,12 @@ static u32 __compute_runnable_contrib(u64 n)
         return contrib + runnable_avg_yN_sum[n];
  }
  
+#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
+#error "load tracking assumes 2^10 as unit"
+#endif
+
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
  /*
   * We can represent the historical contribution to runnable average as the
   * coefficients of a geometric series.  To do this we sub-divide our runnable
@@ -2547,10 +2560,10 @@ static __always_inline int
  __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
                   unsigned long weight, int running, struct cfs_rq *cfs_rq)
  {
-       u64 delta, periods;
+       u64 delta, scaled_delta, periods;
         u32 contrib;
-       int delta_w, decayed = 0;
-       unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
+       unsigned int delta_w, scaled_delta_w, decayed = 0;
+       unsigned long scale_freq, scale_cpu;
  
         delta = now - sa->last_update_time;
         /*
@@ -2571,6 +2584,9 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
                 return 0;
         sa->last_update_time = now;
  
+       scale_freq = arch_scale_freq_capacity(NULL, cpu);
+       scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
         /* delta_w is the amount already accumulated against our next period */
         delta_w = sa->period_contrib;
         if (delta + delta_w >= 1024) {
@@ -2585,13 +2601,16 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
                  * period and accrue it.
                  */
                 delta_w = 1024 - delta_w;
+               scaled_delta_w = cap_scale(delta_w, scale_freq);
                 if (weight) {
-                       sa->load_sum += weight * delta_w;
-                       if (cfs_rq)
-                               cfs_rq->runnable_load_sum += weight * delta_w;
+                       sa->load_sum += weight * scaled_delta_w;
+                       if (cfs_rq) {
+                               cfs_rq->runnable_load_sum +=
+                                               weight * scaled_delta_w;
+                       }
                 }
                 if (running)
-                       sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT;
+                       sa->util_sum += scaled_delta_w * scale_cpu;
  
                 delta -= delta_w;
  
@@ -2608,23 +2627,25 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
  
                 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
                 contrib = __compute_runnable_contrib(periods);
+               contrib = cap_scale(contrib, scale_freq);
                 if (weight) {
                         sa->load_sum += weight * contrib;
                         if (cfs_rq)
                                 cfs_rq->runnable_load_sum += weight * contrib;
                 }
                 if (running)
-                       sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT;
+                       sa->util_sum += contrib * scale_cpu;
         }
  
         /* Remainder of delta accrued against u_0` */
+       scaled_delta = cap_scale(delta, scale_freq);
         if (weight) {
-               sa->load_sum += weight * delta;
+               sa->load_sum += weight * scaled_delta;
                 if (cfs_rq)
-                       cfs_rq->runnable_load_sum += weight * delta;
+                       cfs_rq->runnable_load_sum += weight * scaled_delta;
         }
         if (running)
-               sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT;
+               sa->util_sum += scaled_delta * scale_cpu;
  
         sa->period_contrib += delta;
  
@@ -2634,7 +2655,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
                         cfs_rq->runnable_load_avg =
                                 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
                 }
-               sa->util_avg = (sa->util_sum << SCHED_LOAD_SHIFT) / LOAD_AVG_MAX;
+               sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
         }
  
         return decayed;
@@ -2677,8 +2698,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
         if (atomic_long_read(&cfs_rq->removed_util_avg)) {
                 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
                 sa->util_avg = max_t(long, sa->util_avg - r, 0);
-               sa->util_sum = max_t(s32, sa->util_sum -
-                       ((r * LOAD_AVG_MAX) >> SCHED_LOAD_SHIFT), 0);
+               sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
         }
  
         decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -2696,33 +2716,70 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
  static inline void update_load_avg(struct sched_entity *se, int update_tg)
  {
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
-       int cpu = cpu_of(rq_of(cfs_rq));
         u64 now = cfs_rq_clock_task(cfs_rq);
+       int cpu = cpu_of(rq_of(cfs_rq));
  
         /*
          * Track task load average for carrying it to new CPU after migrated, and
          * track group sched_entity load average for task_h_load calc in migration
          */
         __update_load_avg(now, cpu, &se->avg,
-               se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL);
+                         se->on_rq * scale_load_down(se->load.weight),
+                         cfs_rq->curr == se, NULL);
  
         if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
                 update_tg_load_avg(cfs_rq, 0);
  }
  
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       if (!sched_feat(ATTACH_AGE_LOAD))
+               goto skip_aging;
+
+       /*
+        * If we got migrated (either between CPUs or between cgroups) we'll
+        * have aged the average right before clearing @last_update_time.
+        */
+       if (se->avg.last_update_time) {
+               __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
+                                 &se->avg, 0, 0, NULL);
+
+               /*
+                * XXX: we could have just aged the entire load away if we've been
+                * absent from the fair class for too long.
+                */
+       }
+
+skip_aging:
+       se->avg.last_update_time = cfs_rq->avg.last_update_time;
+       cfs_rq->avg.load_avg += se->avg.load_avg;
+       cfs_rq->avg.load_sum += se->avg.load_sum;
+       cfs_rq->avg.util_avg += se->avg.util_avg;
+       cfs_rq->avg.util_sum += se->avg.util_sum;
+}
+
+static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
+                         &se->avg, se->on_rq * scale_load_down(se->load.weight),
+                         cfs_rq->curr == se, NULL);
+
+       cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
+       cfs_rq->avg.load_sum = max_t(s64,  cfs_rq->avg.load_sum - se->avg.load_sum, 0);
+       cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
+       cfs_rq->avg.util_sum = max_t(s32,  cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+}
+
  /* Add the load generated by se into cfs_rq's load average */
  static inline void
  enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
         struct sched_avg *sa = &se->avg;
         u64 now = cfs_rq_clock_task(cfs_rq);
-       int migrated = 0, decayed;
+       int migrated, decayed;
  
-       if (sa->last_update_time == 0) {
-               sa->last_update_time = now;
-               migrated = 1;
-       }
-       else {
+       migrated = !sa->last_update_time;
+       if (!migrated) {
                 __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
                         se->on_rq * scale_load_down(se->load.weight),
                         cfs_rq->curr == se, NULL);
@@ -2733,12 +2790,8 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
         cfs_rq->runnable_load_avg += sa->load_avg;
         cfs_rq->runnable_load_sum += sa->load_sum;
  
-       if (migrated) {
-               cfs_rq->avg.load_avg += sa->load_avg;
-               cfs_rq->avg.load_sum += sa->load_sum;
-               cfs_rq->avg.util_avg += sa->util_avg;
-               cfs_rq->avg.util_sum += sa->util_sum;
-       }
+       if (migrated)
+               attach_entity_load_avg(cfs_rq, se);
  
         if (decayed || migrated)
                 update_tg_load_avg(cfs_rq, 0);
@@ -2753,7 +2806,7 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
         cfs_rq->runnable_load_avg =
                 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
         cfs_rq->runnable_load_sum =
-               max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
+               max_t(s64,  cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
  }
  
  /*
@@ -2821,6 +2874,11 @@ static inline void
  dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
  static inline void remove_entity_load_avg(struct sched_entity *se) {}
  
+static inline void
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+static inline void
+detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+
  static inline int idle_balance(struct rq *rq)
  {
         return 0;
@@ -4817,32 +4875,39 @@ next:
  done:
         return target;
  }
+
  /*
- * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS
+ * cpu_util returns the amount of capacity of a CPU that is used by CFS
   * tasks. The unit of the return value must be the one of capacity so we can
- * compare the usage with the capacity of the CPU that is available for CFS
- * task (ie cpu_capacity).
- * cfs.avg.util_avg is the sum of running time of runnable tasks on a
- * CPU. It represents the amount of utilization of a CPU in the range
- * [0..SCHED_LOAD_SCALE].  The usage of a CPU can't be higher than the full
- * capacity of the CPU because it's about the running time on this CPU.
- * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE
- * because of unfortunate rounding in util_avg or just
- * after migrating tasks until the average stabilizes with the new running
- * time. So we need to check that the usage stays into the range
- * [0..cpu_capacity_orig] and cap if necessary.
- * Without capping the usage, a group could be seen as overloaded (CPU0 usage
- * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity
+ * compare the utilization with the capacity of the CPU that is available for
+ * CFS task (ie cpu_capacity).
+ *
+ * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on a CPU. It represents
+ * the amount of utilization of a CPU in the range [0..capacity_orig] where
+ * capacity_orig is the cpu_capacity available at the highest frequency
+ * (arch_scale_freq_capacity()).
+ * The utilization of a CPU converges towards a sum equal to or less than the
+ * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
+ * the running time on this CPU scaled by capacity_curr.
+ *
+ * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
+ * higher than capacity_orig because of unfortunate rounding in
+ * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
+ * the average stabilizes with the new running time. We need to check that the
+ * utilization stays within the range of [0..capacity_orig] and cap it if
+ * necessary. Without utilization capping, a group could be seen as overloaded
+ * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
+ * available capacity. We allow utilization to overshoot capacity_curr (but not
+ * capacity_orig) as it is useful for predicting the capacity required after task
+ * migrations (scheduler-driven DVFS).
   */
-static int get_cpu_usage(int cpu)
+static int cpu_util(int cpu)
  {
-       unsigned long usage = cpu_rq(cpu)->cfs.avg.util_avg;
+       unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
         unsigned long capacity = capacity_orig_of(cpu);
  
-       if (usage >= SCHED_LOAD_SCALE)
-               return capacity;
-
-       return (usage * capacity) >> SCHED_LOAD_SHIFT;
+       return (util >= capacity) ? capacity : util;
  }
  
  /*
@@ -4945,7 +5010,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
   * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
   * other assumptions, including the state of rq->lock, should be made.
   */
-static void migrate_task_rq_fair(struct task_struct *p, int next_cpu)
+static void migrate_task_rq_fair(struct task_struct *p)
  {
         /*
          * We are supposed to update the task to "current" time, then its up to date
@@ -5525,10 +5590,10 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
         unsigned long src_faults, dst_faults;
         int src_nid, dst_nid;
  
-       if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
+       if (!static_branch_likely(&sched_numa_balancing))
                 return -1;
  
-       if (!sched_feat(NUMA))
+       if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
                 return -1;
  
         src_nid = cpu_to_node(env->src_cpu);
@@ -5934,7 +5999,7 @@ struct sg_lb_stats {
         unsigned long sum_weighted_load; /* Weighted load of group's tasks */
         unsigned long load_per_task;
         unsigned long group_capacity;
-       unsigned long group_usage; /* Total usage of the group */
+       unsigned long group_util; /* Total utilization of the group */
         unsigned int sum_nr_running; /* Nr tasks running in the group */
         unsigned int idle_cpus;
         unsigned int group_weight;
@@ -6010,19 +6075,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
         return load_idx;
  }
  
-static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
-{
-       if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
-               return sd->smt_gain / sd->span_weight;
-
-       return SCHED_CAPACITY_SCALE;
-}
-
-unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
-{
-       return default_scale_cpu_capacity(sd, cpu);
-}
-
  static unsigned long scale_rt_capacity(int cpu)
  {
         struct rq *rq = cpu_rq(cpu);
@@ -6052,16 +6104,9 @@ static unsigned long scale_rt_capacity(int cpu)
  
  static void update_cpu_capacity(struct sched_domain *sd, int cpu)
  {
-       unsigned long capacity = SCHED_CAPACITY_SCALE;
+       unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
         struct sched_group *sdg = sd->groups;
  
-       if (sched_feat(ARCH_CAPACITY))
-               capacity *= arch_scale_cpu_capacity(sd, cpu);
-       else
-               capacity *= default_scale_cpu_capacity(sd, cpu);
-
-       capacity >>= SCHED_CAPACITY_SHIFT;
-
         cpu_rq(cpu)->cpu_capacity_orig = capacity;
  
         capacity *= scale_rt_capacity(cpu);
@@ -6187,8 +6232,8 @@ static inline int sg_imbalanced(struct sched_group *group)
   * group_has_capacity returns true if the group has spare capacity that could
   * be used by some tasks.
   * We consider that a group has spare capacity if the number of tasks is
- * smaller than the number of CPUs or if the usage is lower than the available
- * capacity for CFS tasks.
+ * smaller than the number of CPUs or if the utilization is lower than the
+ * available capacity for CFS tasks.
   * For the latter, we use a threshold to stabilize the state, to take into
   * account the variance of the tasks' load and to return true if the available
   * capacity is meaningful for the load balancer.
@@ -6202,7 +6247,7 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
                 return true;
  
         if ((sgs->group_capacity * 100) >
-                       (sgs->group_usage * env->sd->imbalance_pct))
+                       (sgs->group_util * env->sd->imbalance_pct))
                 return true;
  
         return false;
@@ -6223,15 +6268,15 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
                 return false;
  
         if ((sgs->group_capacity * 100) <
-                       (sgs->group_usage * env->sd->imbalance_pct))
+                       (sgs->group_util * env->sd->imbalance_pct))
                 return true;
  
         return false;
  }
  
-static enum group_type group_classify(struct lb_env *env,
-               struct sched_group *group,
-               struct sg_lb_stats *sgs)
+static inline enum
+group_type group_classify(struct sched_group *group,
+                         struct sg_lb_stats *sgs)
  {
         if (sgs->group_no_capacity)
                 return group_overloaded;
@@ -6271,7 +6316,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                         load = source_load(i, load_idx);
  
                 sgs->group_load += load;
-               sgs->group_usage += get_cpu_usage(i);
+               sgs->group_util += cpu_util(i);
                 sgs->sum_nr_running += rq->cfs.h_nr_running;
  
                 if (rq->nr_running > 1)
@@ -6296,7 +6341,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
         sgs->group_weight = group->group_weight;
  
         sgs->group_no_capacity = group_is_overloaded(env, sgs);
-       sgs->group_type = group_classify(env, group, sgs);
+       sgs->group_type = group_classify(group, sgs);
  }
  
  /**
@@ -6430,7 +6475,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
                     group_has_capacity(env, &sds->local_stat) &&
                     (sgs->sum_nr_running > 1)) {
                         sgs->group_no_capacity = 1;
-                       sgs->group_type = group_overloaded;
+                       sgs->group_type = group_classify(sg, sgs);
                 }
  
                 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
@@ -7610,8 +7655,22 @@ out:
          * When the cpu is attached to null domain for ex, it will not be
          * updated.
          */
-       if (likely(update_next_balance))
+       if (likely(update_next_balance)) {
                 rq->next_balance = next_balance;
+
+#ifdef CONFIG_NO_HZ_COMMON
+               /*
+                * If this CPU has been elected to perform the nohz idle
+                * balance, other idle CPUs have already rebalanced with
+                * nohz_idle_balance() and nohz.next_balance has been
+                * updated accordingly. This CPU is now running the idle load
+                * balance for itself and we need to update the
+                * nohz.next_balance accordingly.
+                */
+               if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
+                       nohz.next_balance = rq->next_balance;
+#endif
+       }
  }
  
  #ifdef CONFIG_NO_HZ_COMMON
@@ -7624,6 +7683,9 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
         int this_cpu = this_rq->cpu;
         struct rq *rq;
         int balance_cpu;
+       /* Earliest time when we have to do rebalance again */
+       unsigned long next_balance = jiffies + 60*HZ;
+       int update_next_balance = 0;
  
         if (idle != CPU_IDLE ||
             !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
@@ -7655,10 +7717,19 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
                         rebalance_domains(rq, CPU_IDLE);
                 }
  
-               if (time_after(this_rq->next_balance, rq->next_balance))
-                       this_rq->next_balance = rq->next_balance;
+               if (time_after(next_balance, rq->next_balance)) {
+                       next_balance = rq->next_balance;
+                       update_next_balance = 1;
+               }
         }
-       nohz.next_balance = this_rq->next_balance;
+
+       /*
+        * next_balance will be updated only when there is a need.
+        * When the CPU is attached to null domain for ex, it will not be
+        * updated.
+        */
+       if (likely(update_next_balance))
+               nohz.next_balance = next_balance;
  end:
         clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
  }
@@ -7811,7 +7882,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
                 entity_tick(cfs_rq, se, queued);
         }
  
-       if (numabalancing_enabled)
+       if (static_branch_unlikely(&sched_numa_balancing))
                 task_tick_numa(rq, curr);
  }
  
@@ -7887,21 +7958,39 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
                 check_preempt_curr(rq, p, 0);
  }
  
-static void switched_from_fair(struct rq *rq, struct task_struct *p)
+static inline bool vruntime_normalized(struct task_struct *p)
  {
         struct sched_entity *se = &p->se;
-       struct cfs_rq *cfs_rq = cfs_rq_of(se);
  
         /*
-        * Ensure the task's vruntime is normalized, so that when it's
-        * switched back to the fair class the enqueue_entity(.flags=0) will
-        * do the right thing.
+        * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
+        * the dequeue_entity(.flags=0) will already have normalized the
+        * vruntime.
+        */
+       if (p->on_rq)
+               return true;
+
+       /*
+        * When !on_rq, vruntime of the task has usually NOT been normalized.
+        * But there are some cases where it has already been normalized:
          *
-        * If it's queued, then the dequeue_entity(.flags=0) will already
-        * have normalized the vruntime, if it's !queued, then only when
-        * the task is sleeping will it still have non-normalized vruntime.
+        * - A forked child which is waiting for being woken up by
+        *   wake_up_new_task().
+        * - A task which has been woken up by try_to_wake_up() and
+        *   waiting for actually being woken up by sched_ttwu_pending().
          */
-       if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) {
+       if (!se->sum_exec_runtime || p->state == TASK_WAKING)
+               return true;
+
+       return false;
+}
+
+static void detach_task_cfs_rq(struct task_struct *p)
+{
+       struct sched_entity *se = &p->se;
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+       if (!vruntime_normalized(p)) {
                 /*
                  * Fix up our vruntime so that the current sleep doesn't
                  * cause 'unlimited' sleep bonus.
@@ -7910,28 +7999,14 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
                 se->vruntime -= cfs_rq->min_vruntime;
         }
  
-#ifdef CONFIG_SMP
         /* Catch up with the cfs_rq and remove our load when we leave */
-       __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg,
-               se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL);
-
-       cfs_rq->avg.load_avg =
-               max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
-       cfs_rq->avg.load_sum =
-               max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
-       cfs_rq->avg.util_avg =
-               max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
-       cfs_rq->avg.util_sum =
-               max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
-#endif
+       detach_entity_load_avg(cfs_rq, se);
  }
  
-/*
- * We switched to the sched_fair class.
- */
-static void switched_to_fair(struct rq *rq, struct task_struct *p)
+static void attach_task_cfs_rq(struct task_struct *p)
  {
         struct sched_entity *se = &p->se;
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
         /*
@@ -7941,31 +8016,33 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
         se->depth = se->parent ? se->parent->depth + 1 : 0;
  #endif
  
-       if (!task_on_rq_queued(p)) {
+       /* Synchronize task with its cfs_rq */
+       attach_entity_load_avg(cfs_rq, se);
  
+       if (!vruntime_normalized(p))
+               se->vruntime += cfs_rq->min_vruntime;
+}
+
+static void switched_from_fair(struct rq *rq, struct task_struct *p)
+{
+       detach_task_cfs_rq(p);
+}
+
+static void switched_to_fair(struct rq *rq, struct task_struct *p)
+{
+       attach_task_cfs_rq(p);
+
+       if (task_on_rq_queued(p)) {
                 /*
-                * Ensure the task has a non-normalized vruntime when it is switched
-                * back to the fair class with !queued, so that enqueue_entity() at
-                * wake-up time will do the right thing.
-                *
-                * If it's queued, then the enqueue_entity(.flags=0) makes the task
-                * has non-normalized vruntime, if it's !queued, then it still has
-                * normalized vruntime.
+                * We were most likely switched from sched_rt, so
+                * kick off the schedule if running, otherwise just see
+                * if we can still preempt the current task.
                  */
-               if (p->state != TASK_RUNNING)
-                       se->vruntime += cfs_rq_of(se)->min_vruntime;
-               return;
+               if (rq->curr == p)
+                       resched_curr(rq);
+               else
+                       check_preempt_curr(rq, p, 0);
         }
-
-       /*
-        * We were most likely switched from sched_rt, so
-        * kick off the schedule if running, otherwise just see
-        * if we can still preempt the current task.
-        */
-       if (rq->curr == p)
-               resched_curr(rq);
-       else
-               check_preempt_curr(rq, p, 0);
  }
  
  /* Account for a task changing its policy or group.
@@ -8000,56 +8077,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
  }
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-static void task_move_group_fair(struct task_struct *p, int queued)
+static void task_move_group_fair(struct task_struct *p)
  {
-       struct sched_entity *se = &p->se;
-       struct cfs_rq *cfs_rq;
-
-       /*
-        * If the task was not on the rq at the time of this cgroup movement
-        * it must have been asleep, sleeping tasks keep their ->vruntime
-        * absolute on their old rq until wakeup (needed for the fair sleeper
-        * bonus in place_entity()).
-        *
-        * If it was on the rq, we've just 'preempted' it, which does convert
-        * ->vruntime to a relative base.
-        *
-        * Make sure both cases convert their relative position when migrating
-        * to another cgroup's rq. This does somewhat interfere with the
-        * fair sleeper stuff for the first placement, but who cares.
-        */
-       /*
-        * When !queued, vruntime of the task has usually NOT been normalized.
-        * But there are some cases where it has already been normalized:
-        *
-        * - Moving a forked child which is waiting for being woken up by
-        *   wake_up_new_task().
-        * - Moving a task which has been woken up by try_to_wake_up() and
-        *   waiting for actually being woken up by sched_ttwu_pending().
-        *
-        * To prevent boost or penalty in the new cfs_rq caused by delta
-        * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
-        */
-       if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
-               queued = 1;
-
-       if (!queued)
-               se->vruntime -= cfs_rq_of(se)->min_vruntime;
+       detach_task_cfs_rq(p);
         set_task_rq(p, task_cpu(p));
-       se->depth = se->parent ? se->parent->depth + 1 : 0;
-       if (!queued) {
-               cfs_rq = cfs_rq_of(se);
-               se->vruntime += cfs_rq->min_vruntime;
  
  #ifdef CONFIG_SMP
-               /* Virtually synchronize task with its new cfs_rq */
-               p->se.avg.last_update_time = cfs_rq->avg.last_update_time;
-               cfs_rq->avg.load_avg += p->se.avg.load_avg;
-               cfs_rq->avg.load_sum += p->se.avg.load_sum;
-               cfs_rq->avg.util_avg += p->se.avg.util_avg;
-               cfs_rq->avg.util_sum += p->se.avg.util_sum;
+       /* Tell se's cfs_rq has been changed -- migrated */
+       p->se.avg.last_update_time = 0;
  #endif
-       }
+       attach_task_cfs_rq(p);
  }
  
  void free_fair_sched_group(struct task_group *tg)
diff --git a/kernel/sched/features.h b/kernel/sched/features.h

index 83a50e7ca53315695f779e2c5ac7177c423b646d..69631fa46c2f84fecd3e15599cba0e5935c1148e 100644 (file)
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -36,11 +36,6 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
   */
  SCHED_FEAT(WAKEUP_PREEMPTION, true)
  
-/*
- * Use arch dependent cpu capacity functions
- */
-SCHED_FEAT(ARCH_CAPACITY, true)
-
  SCHED_FEAT(HRTICK, false)
  SCHED_FEAT(DOUBLE_TICK, false)
  SCHED_FEAT(LB_BIAS, true)
@@ -72,19 +67,5 @@ SCHED_FEAT(RT_PUSH_IPI, true)
  SCHED_FEAT(FORCE_SD_OVERLAP, false)
  SCHED_FEAT(RT_RUNTIME_SHARE, true)
  SCHED_FEAT(LB_MIN, false)
+SCHED_FEAT(ATTACH_AGE_LOAD, true)
  
-/*
- * Apply the automatic NUMA scheduling policy. Enabled automatically
- * at runtime if running on a NUMA machine. Can be controlled via
- * numa_balancing=
- */
-#ifdef CONFIG_NUMA_BALANCING
-
-/*
- * NUMA will favor moving tasks towards nodes where a higher number of
- * hinting faults are recorded during active load balancing. It will
- * resist moving tasks towards nodes where a lower number of hinting
- * faults have been recorded.
- */
-SCHED_FEAT(NUMA,       true)
-#endif
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c

index d2ea59364a1c8f7e66d72e96c7d170088424eec0..e3cc16312046689fd04db8748304210c0d9fc8df 100644 (file)
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -635,11 +635,11 @@ bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
  /*
   * We ran out of runtime, see if we can borrow some from our neighbours.
   */
-static int do_balance_runtime(struct rt_rq *rt_rq)
+static void do_balance_runtime(struct rt_rq *rt_rq)
  {
         struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
         struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
-       int i, weight, more = 0;
+       int i, weight;
         u64 rt_period;
  
         weight = cpumask_weight(rd->span);
@@ -673,7 +673,6 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
                                 diff = rt_period - rt_rq->rt_runtime;
                         iter->rt_runtime -= diff;
                         rt_rq->rt_runtime += diff;
-                       more = 1;
                         if (rt_rq->rt_runtime == rt_period) {
                                 raw_spin_unlock(&iter->rt_runtime_lock);
                                 break;
@@ -683,8 +682,6 @@ next:
                 raw_spin_unlock(&iter->rt_runtime_lock);
         }
         raw_spin_unlock(&rt_b->rt_runtime_lock);
-
-       return more;
  }
  
  /*
@@ -796,26 +793,19 @@ static void __enable_runtime(struct rq *rq)
         }
  }
  
-static int balance_runtime(struct rt_rq *rt_rq)
+static void balance_runtime(struct rt_rq *rt_rq)
  {
-       int more = 0;
-
         if (!sched_feat(RT_RUNTIME_SHARE))
-               return more;
+               return;
  
         if (rt_rq->rt_time > rt_rq->rt_runtime) {
                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
-               more = do_balance_runtime(rt_rq);
+               do_balance_runtime(rt_rq);
                 raw_spin_lock(&rt_rq->rt_runtime_lock);
         }
-
-       return more;
  }
  #else /* !CONFIG_SMP */
-static inline int balance_runtime(struct rt_rq *rt_rq)
-{
-       return 0;
-}
+static inline void balance_runtime(struct rt_rq *rt_rq) {}
  #endif /* CONFIG_SMP */
  
  static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index 6d2a119c7ad9f63338ffb0e9e92efcbc269c2141..efd3bfc7e34722883e2f08ca82f91cffde812963 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -84,6 +84,10 @@ static inline void update_cpu_load_active(struct rq *this_rq) { }
   */
  #define RUNTIME_INF    ((u64)~0ULL)
  
+static inline int idle_policy(int policy)
+{
+       return policy == SCHED_IDLE;
+}
  static inline int fair_policy(int policy)
  {
         return policy == SCHED_NORMAL || policy == SCHED_BATCH;
@@ -98,6 +102,11 @@ static inline int dl_policy(int policy)
  {
         return policy == SCHED_DEADLINE;
  }
+static inline bool valid_policy(int policy)
+{
+       return idle_policy(policy) || fair_policy(policy) ||
+               rt_policy(policy) || dl_policy(policy);
+}
  
  static inline int task_has_rt_policy(struct task_struct *p)
  {
@@ -109,11 +118,6 @@ static inline int task_has_dl_policy(struct task_struct *p)
         return dl_policy(p->policy);
  }
  
-static inline bool dl_time_before(u64 a, u64 b)
-{
-       return (s64)(a - b) < 0;
-}
-
  /*
   * Tells if entity @a should preempt entity @b.
   */
@@ -1003,17 +1007,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
  #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
  #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
  
-#ifdef CONFIG_NUMA_BALANCING
-#define sched_feat_numa(x) sched_feat(x)
-#ifdef CONFIG_SCHED_DEBUG
-#define numabalancing_enabled sched_feat_numa(NUMA)
-#else
-extern bool numabalancing_enabled;
-#endif /* CONFIG_SCHED_DEBUG */
-#else
-#define sched_feat_numa(x) (0)
-#define numabalancing_enabled (0)
-#endif /* CONFIG_NUMA_BALANCING */
+extern struct static_key_false sched_numa_balancing;
  
  static inline u64 global_rt_period(void)
  {
@@ -1157,16 +1151,18 @@ static const u32 prio_to_wmult[40] = {
   /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
  };
  
-#define ENQUEUE_WAKEUP         1
-#define ENQUEUE_HEAD           2
+#define ENQUEUE_WAKEUP         0x01
+#define ENQUEUE_HEAD           0x02
  #ifdef CONFIG_SMP
-#define ENQUEUE_WAKING         4       /* sched_class::task_waking was called */
+#define ENQUEUE_WAKING         0x04    /* sched_class::task_waking was called */
  #else
-#define ENQUEUE_WAKING         0
+#define ENQUEUE_WAKING         0x00
  #endif
-#define ENQUEUE_REPLENISH      8
+#define ENQUEUE_REPLENISH      0x08
+#define ENQUEUE_RESTORE        0x10
  
-#define DEQUEUE_SLEEP          1
+#define DEQUEUE_SLEEP          0x01
+#define DEQUEUE_SAVE           0x02
  
  #define RETRY_TASK             ((void *)-1UL)
  
@@ -1194,7 +1190,7 @@ struct sched_class {
  
  #ifdef CONFIG_SMP
         int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
-       void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
+       void (*migrate_task_rq)(struct task_struct *p);
  
         void (*task_waking) (struct task_struct *task);
         void (*task_woken) (struct rq *this_rq, struct task_struct *task);
@@ -1227,7 +1223,7 @@ struct sched_class {
         void (*update_curr) (struct rq *rq);
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-       void (*task_move_group) (struct task_struct *p, int on_rq);
+       void (*task_move_group) (struct task_struct *p);
  #endif
  };
  
@@ -1405,6 +1401,17 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
  }
  #endif
  
+#ifndef arch_scale_cpu_capacity
+static __always_inline
+unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
+{
+       if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
+               return sd->smt_gain / sd->span_weight;
+
+       return SCHED_CAPACITY_SCALE;
+}
+#endif
+
  static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
  {
         rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
diff --git a/kernel/smpboot.c b/kernel/smpboot.c

index a818cbc73e147382488cb0ea5bb5c490c6a46e15..d264f59bff56cb128edc68c5746280a2499752d5 100644 (file)
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -222,9 +222,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp
  {
         struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
  
-       if (ht->pre_unpark)
-               ht->pre_unpark(cpu);
-       kthread_unpark(tsk);
+       if (!ht->selfparking)
+               kthread_unpark(tsk);
  }
  
  void smpboot_unpark_threads(unsigned int cpu)
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c

index 12484e5d5c88769058610aca529924ea9e882aff..867bc20e1ef142a63349c345932af24b26a1adfc 100644 (file)
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -73,21 +73,24 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
         }
  }
  
+static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
+                                       struct cpu_stop_work *work)
+{
+       list_add_tail(&work->list, &stopper->works);
+       wake_up_process(stopper->thread);
+}
+
  /* queue @work to @stopper.  if offline, @work is completed immediately */
  static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
  {
         struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
-
         unsigned long flags;
  
         spin_lock_irqsave(&stopper->lock, flags);
-
-       if (stopper->enabled) {
-               list_add_tail(&work->list, &stopper->works);
-               wake_up_process(stopper->thread);
-       } else
+       if (stopper->enabled)
+               __cpu_stop_queue_work(stopper, work);
+       else
                 cpu_stop_signal_done(work->done, false);
-
         spin_unlock_irqrestore(&stopper->lock, flags);
  }
  
@@ -213,6 +216,31 @@ static int multi_cpu_stop(void *data)
         return err;
  }
  
+static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
+                                   int cpu2, struct cpu_stop_work *work2)
+{
+       struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
+       struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
+       int err;
+
+       lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
+       spin_lock_irq(&stopper1->lock);
+       spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
+
+       err = -ENOENT;
+       if (!stopper1->enabled || !stopper2->enabled)
+               goto unlock;
+
+       err = 0;
+       __cpu_stop_queue_work(stopper1, work1);
+       __cpu_stop_queue_work(stopper2, work2);
+unlock:
+       spin_unlock(&stopper2->lock);
+       spin_unlock_irq(&stopper1->lock);
+       lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
+
+       return err;
+}
  /**
   * stop_two_cpus - stops two cpus
   * @cpu1: the cpu to stop
@@ -247,24 +275,13 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
         cpu_stop_init_done(&done, 2);
         set_state(&msdata, MULTI_STOP_PREPARE);
  
-       /*
-        * If we observe both CPUs active we know _cpu_down() cannot yet have
-        * queued its stop_machine works and therefore ours will get executed
-        * first. Or its not either one of our CPUs that's getting unplugged,
-        * in which case we don't care.
-        *
-        * This relies on the stopper workqueues to be FIFO.
-        */
-       if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
+       if (cpu1 > cpu2)
+               swap(cpu1, cpu2);
+       if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) {
                 preempt_enable();
                 return -ENOENT;
         }
  
-       lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
-       cpu_stop_queue_work(cpu1, &work1);
-       cpu_stop_queue_work(cpu2, &work2);
-       lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
-
         preempt_enable();
  
         wait_for_completion(&done.completion);
@@ -452,6 +469,18 @@ repeat:
         }
  }
  
+void stop_machine_park(int cpu)
+{
+       struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+       /*
+        * Lockless. cpu_stopper_thread() will take stopper->lock and flush
+        * the pending works before it parks, until then it is fine to queue
+        * the new works.
+        */
+       stopper->enabled = false;
+       kthread_park(stopper->thread);
+}
+
  extern void sched_set_stop_task(int cpu, struct task_struct *stop);
  
  static void cpu_stop_create(unsigned int cpu)
@@ -462,26 +491,16 @@ static void cpu_stop_create(unsigned int cpu)
  static void cpu_stop_park(unsigned int cpu)
  {
         struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
-       struct cpu_stop_work *work, *tmp;
-       unsigned long flags;
  
-       /* drain remaining works */
-       spin_lock_irqsave(&stopper->lock, flags);
-       list_for_each_entry_safe(work, tmp, &stopper->works, list) {
-               list_del_init(&work->list);
-               cpu_stop_signal_done(work->done, false);
-       }
-       stopper->enabled = false;
-       spin_unlock_irqrestore(&stopper->lock, flags);
+       WARN_ON(!list_empty(&stopper->works));
  }
  
-static void cpu_stop_unpark(unsigned int cpu)
+void stop_machine_unpark(int cpu)
  {
         struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
  
-       spin_lock_irq(&stopper->lock);
         stopper->enabled = true;
-       spin_unlock_irq(&stopper->lock);
+       kthread_unpark(stopper->thread);
  }
  
  static struct smp_hotplug_thread cpu_stop_threads = {
@@ -490,9 +509,7 @@ static struct smp_hotplug_thread cpu_stop_threads = {
         .thread_fn              = cpu_stopper_thread,
         .thread_comm            = "migration/%u",
         .create                 = cpu_stop_create,
-       .setup                  = cpu_stop_unpark,
         .park                   = cpu_stop_park,
-       .pre_unpark             = cpu_stop_unpark,
         .selfparking            = true,
  };
  
@@ -508,6 +525,7 @@ static int __init cpu_stop_init(void)
         }
  
         BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
+       stop_machine_unpark(raw_smp_processor_id());
         stop_machine_initialized = true;
         return 0;
  }
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c

index b0623ac785a22287526327a021d81fe3eaf5fafb..00611e95a8ee00bb91e7ccbd41c3ebcbc580e4f8 100644 (file)
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5697,7 +5697,7 @@ free:
  }
  
  static void
-ftrace_graph_probe_sched_switch(void *ignore,
+ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
                         struct task_struct *prev, struct task_struct *next)
  {
         unsigned long long timestamp;
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c

index f270088e9929aa2e085a15960828929140ae1037..4c896a0101bdcb231581a3cced5f2ef4a3556d08 100644 (file)
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -16,7 +16,8 @@ static int                    sched_ref;
  static DEFINE_MUTEX(sched_register_mutex);
  
  static void
-probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
+probe_sched_switch(void *ignore, bool preempt,
+                  struct task_struct *prev, struct task_struct *next)
  {
         if (unlikely(!sched_ref))
                 return;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c

index 12cbe77b413620cb80436ab55d3758237e31a5b9..4bcfbac289ff9e9e6ab4d39772ad2dffd89509a8 100644 (file)
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -420,7 +420,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
  }
  
  static void notrace
-probe_wakeup_sched_switch(void *ignore,
+probe_wakeup_sched_switch(void *ignore, bool preempt,
                           struct task_struct *prev, struct task_struct *next)
  {
         struct trace_array_cpu *data;
author	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 4 Nov 2015 02:03:50 +0000 (18:03 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 4 Nov 2015 02:03:50 +0000 (18:03 -0800)
arch/x86/include/asm/preempt.h		patch \| blob \| history
arch/x86/include/asm/thread_info.h		patch \| blob \| history
arch/x86/kernel/process_32.c		patch \| blob \| history
arch/x86/kernel/process_64.c		patch \| blob \| history
include/asm-generic/preempt.h		patch \| blob \| history
include/linux/preempt.h		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
include/linux/sched/deadline.h		patch \| blob \| history
include/linux/smpboot.h		patch \| blob \| history
include/linux/stop_machine.h		patch \| blob \| history
include/trace/events/sched.h		patch \| blob \| history
kernel/cpu.c		patch \| blob \| history
kernel/exit.c		patch \| blob \| history
kernel/locking/rtmutex.c		patch \| blob \| history
kernel/sched/core.c		patch \| blob \| history
kernel/sched/cpudeadline.c		patch \| blob \| history
kernel/sched/cpudeadline.h		patch \| blob \| history
kernel/sched/fair.c		patch \| blob \| history
kernel/sched/features.h		patch \| blob \| history
kernel/sched/rt.c		patch \| blob \| history
kernel/sched/sched.h		patch \| blob \| history
kernel/smpboot.c		patch \| blob \| history
kernel/stop_machine.c		patch \| blob \| history
kernel/trace/ftrace.c		patch \| blob \| history
kernel/trace/trace_sched_switch.c		patch \| blob \| history
kernel/trace/trace_sched_wakeup.c		patch \| blob \| history