Merge remote-tracking branch 'linux-2.6.32.y/master' into develop

[firefly-linux-kernel-4.4.55.git] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index e28e6c520bca8c0560a909bd544d843306456157..09dd64bba4042426cdb2dab2a37171556874f73d 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -233,7 +233,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
   */
  static DEFINE_MUTEX(sched_domains_mutex);
  
-#ifdef CONFIG_GROUP_SCHED
+#ifdef CONFIG_CGROUP_SCHED
  
  #include <linux/cgroup.h>
  
@@ -243,13 +243,7 @@ static LIST_HEAD(task_groups);
  
  /* task group related information */
  struct task_group {
-#ifdef CONFIG_CGROUP_SCHED
         struct cgroup_subsys_state css;
-#endif
-
-#ifdef CONFIG_USER_SCHED
-       uid_t uid;
-#endif
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
         /* schedulable entities of this group on each cpu */
@@ -274,35 +268,7 @@ struct task_group {
         struct list_head children;
  };
  
-#ifdef CONFIG_USER_SCHED
-
-/* Helper function to pass uid information to create_sched_user() */
-void set_tg_uid(struct user_struct *user)
-{
-       user->tg->uid = user->uid;
-}
-
-/*
- * Root task group.
- *     Every UID task group (including init_task_group aka UID-0) will
- *     be a child to this group.
- */
-struct task_group root_task_group;
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/* Default task group's sched entity on each cpu */
-static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
-/* Default task group's cfs_rq on each cpu */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-
-#ifdef CONFIG_RT_GROUP_SCHED
-static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
-#endif /* CONFIG_RT_GROUP_SCHED */
-#else /* !CONFIG_USER_SCHED */
  #define root_task_group init_task_group
-#endif /* CONFIG_USER_SCHED */
  
  /* task_group_lock serializes add/remove of task groups and also changes to
   * a task group's cpu shares.
@@ -318,11 +284,7 @@ static int root_task_group_empty(void)
  }
  #endif
  
-#ifdef CONFIG_USER_SCHED
-# define INIT_TASK_GROUP_LOAD  (2*NICE_0_LOAD)
-#else /* !CONFIG_USER_SCHED */
  # define INIT_TASK_GROUP_LOAD  NICE_0_LOAD
-#endif /* CONFIG_USER_SCHED */
  
  /*
   * A weight of 0 or 1 can cause arithmetics problems.
@@ -348,11 +310,7 @@ static inline struct task_group *task_group(struct task_struct *p)
  {
         struct task_group *tg;
  
-#ifdef CONFIG_USER_SCHED
-       rcu_read_lock();
-       tg = __task_cred(p)->user->tg;
-       rcu_read_unlock();
-#elif defined(CONFIG_CGROUP_SCHED)
+#ifdef CONFIG_CGROUP_SCHED
         tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
                                 struct task_group, css);
  #else
@@ -383,7 +341,7 @@ static inline struct task_group *task_group(struct task_struct *p)
         return NULL;
  }
  
-#endif /* CONFIG_GROUP_SCHED */
+#endif /* CONFIG_CGROUP_SCHED */
  
  /* CFS-related fields in a runqueue */
  struct cfs_rq {
@@ -567,6 +525,7 @@ struct rq {
         struct mm_struct *prev_mm;
  
         u64 clock;
+       u64 clock_task;
  
         atomic_t nr_iowait;
  
@@ -574,6 +533,8 @@ struct rq {
         struct root_domain *rd;
         struct sched_domain *sd;
  
+       unsigned long cpu_power;
+
         unsigned char idle_at_tick;
         /* For active balancing */
         int post_schedule;
@@ -594,6 +555,10 @@ struct rq {
         u64 avg_idle;
  #endif
  
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+       u64 prev_irq_time;
+#endif
+
         /* calc_load related fields */
         unsigned long calc_load_update;
         long calc_load_active;
@@ -631,11 +596,7 @@ struct rq {
  
  static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
  
-static inline
-void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
-{
-       rq->curr->sched_class->check_preempt_curr(rq, p, flags);
-}
+static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
  
  static inline int cpu_of(struct rq *rq)
  {
@@ -662,9 +623,20 @@ static inline int cpu_of(struct rq *rq)
  #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
  #define raw_rq()               (&__raw_get_cpu_var(runqueues))
  
+static u64 irq_time_cpu(int cpu);
+static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+
  inline void update_rq_clock(struct rq *rq)
  {
+       int cpu = cpu_of(rq);
+       u64 irq_time;
+
         rq->clock = sched_clock_cpu(cpu_of(rq));
+       irq_time = irq_time_cpu(cpu);
+       if (rq->clock - irq_time > rq->clock_task)
+               rq->clock_task = rq->clock - irq_time;
+
+       sched_irq_time_avg_update(rq, irq_time);
  }
  
  /*
@@ -1297,6 +1269,10 @@ static void resched_task(struct task_struct *p)
  static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
  {
  }
+
+static void sched_avg_update(struct rq *rq)
+{
+}
  #endif /* CONFIG_SMP */
  
  #if BITS_PER_LONG == 32
@@ -1546,24 +1522,9 @@ static unsigned long target_load(int cpu, int type)
         return max(rq->cpu_load[type-1], total);
  }
  
-static struct sched_group *group_of(int cpu)
-{
-       struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
-
-       if (!sd)
-               return NULL;
-
-       return sd->groups;
-}
-
  static unsigned long power_of(int cpu)
  {
-       struct sched_group *group = group_of(cpu);
-
-       if (!group)
-               return SCHED_LOAD_SCALE;
-
-       return group->cpu_power;
+       return cpu_rq(cpu)->cpu_power;
  }
  
  static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
@@ -1845,6 +1806,94 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
  #endif
  }
  
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in account_system_vtime(), on the corresponding
+ * CPU with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in another CPU reading this CPU's irq time, which can
+ * race with irq/account_system_vtime() on this CPU. We would either get the
+ * old or the new value (or a semi-updated value on 32 bit), with the side
+ * effect of accounting a slice of irq time to the wrong task when an irq is
+ * in progress while we read rq->clock. That is a worthy compromise compared
+ * to taking locks on each irq in account_system_vtime().
+ */
+static DEFINE_PER_CPU(u64, cpu_hardirq_time);
+static DEFINE_PER_CPU(u64, cpu_softirq_time);
+
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+       sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+       sched_clock_irqtime = 0;
+}
+
+static u64 irq_time_cpu(int cpu)
+{
+       if (!sched_clock_irqtime)
+               return 0;
+
+       return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+
+void account_system_vtime(struct task_struct *curr)
+{
+       unsigned long flags;
+       int cpu;
+       u64 now, delta;
+
+       if (!sched_clock_irqtime)
+               return;
+
+       local_irq_save(flags);
+
+       cpu = smp_processor_id();
+       now = sched_clock_cpu(cpu);
+       delta = now - per_cpu(irq_start_time, cpu);
+       per_cpu(irq_start_time, cpu) = now;
+       /*
+        * We do not account for softirq time from ksoftirqd here.
+        * We want to continue accounting softirq time to the ksoftirqd thread
+        * in that case, so as not to confuse the scheduler with a special task
+        * that does not consume any time, but still wants to run.
+        */
+       if (hardirq_count())
+               per_cpu(cpu_hardirq_time, cpu) += delta;
+       else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+               per_cpu(cpu_softirq_time, cpu) += delta;
+
+       local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(account_system_vtime);
+
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+{
+       if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
+               u64 delta_irq = curr_irq_time - rq->prev_irq_time;
+               rq->prev_irq_time = curr_irq_time;
+               sched_rt_avg_update(rq, delta_irq);
+       }
+}
+
+#else
+
+static u64 irq_time_cpu(int cpu)
+{
+       return 0;
+}
+
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
+
+#endif
+
  #include "sched_stats.h"
  #include "sched_idletask.c"
  #include "sched_fair.c"
@@ -1870,8 +1919,8 @@ static void dec_nr_running(struct rq *rq)
  static void set_load_weight(struct task_struct *p)
  {
         if (task_has_rt_policy(p)) {
-               p->se.load.weight = prio_to_weight[0] * 2;
-               p->se.load.inv_weight = prio_to_wmult[0] >> 1;
+               p->se.load.weight = 0;
+               p->se.load.inv_weight = WMULT_CONST;
                 return;
         }
  
@@ -2052,6 +2101,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
         if (p->sched_class != &fair_sched_class)
                 return 0;
  
+       if (unlikely(p->policy == SCHED_IDLE))
+               return 0;
+
         /*
          * Buddy candidates are cache hot:
          */
@@ -2323,6 +2375,24 @@ void task_oncpu_function_call(struct task_struct *p,
         preempt_enable();
  }
  
+static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+{
+       const struct sched_class *class;
+
+       if (p->sched_class == rq->curr->sched_class) {
+               rq->curr->sched_class->check_preempt_curr(rq, p, flags);
+       } else {
+               for_each_class(class) {
+                       if (class == rq->curr->sched_class)
+                               break;
+                       if (class == p->sched_class) {
+                               resched_task(rq->curr);
+                               break;
+                       }
+               }
+       }
+}
+
  #ifdef CONFIG_SMP
  /*
   * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
@@ -3144,6 +3214,8 @@ static void update_cpu_load(struct rq *this_rq)
                 this_rq->calc_load_update += LOAD_FREQ;
                 calc_load_account_active(this_rq);
         }
+
+       sched_avg_update(this_rq);
  }
  
  #ifdef CONFIG_SMP
@@ -3275,7 +3347,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
          * 2) too many balance attempts have failed.
          */
  
-       tsk_cache_hot = task_hot(p, rq->clock, sd);
+       tsk_cache_hot = task_hot(p, rq->clock_task, sd);
         if (!tsk_cache_hot ||
                 sd->nr_balance_failed > sd->cache_nice_tries) {
  #ifdef CONFIG_SCHEDSTATS
@@ -3458,12 +3530,17 @@ struct sd_lb_stats {
         unsigned long this_load;
         unsigned long this_load_per_task;
         unsigned long this_nr_running;
+       unsigned long this_has_capacity;
+       unsigned int  this_idle_cpus;
  
         /* Statistics of the busiest group */
+       unsigned int  busiest_idle_cpus;
         unsigned long max_load;
         unsigned long busiest_load_per_task;
         unsigned long busiest_nr_running;
         unsigned long busiest_group_capacity;
+       unsigned long busiest_has_capacity;
+       unsigned int  busiest_group_weight;
  
         int group_imb; /* Is there imbalance in this sd */
  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -3485,7 +3562,10 @@ struct sg_lb_stats {
         unsigned long sum_nr_running; /* Nr tasks running in the group */
         unsigned long sum_weighted_load; /* Weighted load of group's tasks */
         unsigned long group_capacity;
+       unsigned long idle_cpus;
+       unsigned long group_weight;
         int group_imb; /* Is there an imbalance in the group ? */
+       int group_has_capacity; /* Is there extra capacity in the group? */
  };
  
  /**
@@ -3695,10 +3775,14 @@ unsigned long scale_rt_power(int cpu)
         struct rq *rq = cpu_rq(cpu);
         u64 total, available;
  
-       sched_avg_update(rq);
-
         total = sched_avg_period() + (rq->clock - rq->age_stamp);
-       available = total - rq->rt_avg;
+
+       if (unlikely(total < rq->rt_avg)) {
+               /* Ensures that power won't end up being negative */
+               available = 0;
+       } else {
+               available = total - rq->rt_avg;
+       }
  
         if (unlikely((s64)total < SCHED_LOAD_SCALE))
                 total = SCHED_LOAD_SCALE;
@@ -3736,6 +3820,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
         if (!power)
                 power = 1;
  
+       cpu_rq(cpu)->cpu_power = power;
         sdg->cpu_power = power;
  }
  
@@ -3780,7 +3865,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
                         int local_group, const struct cpumask *cpus,
                         int *balance, struct sg_lb_stats *sgs)
  {
-       unsigned long load, max_cpu_load, min_cpu_load;
+       unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
         int i;
         unsigned int balance_cpu = -1, first_idle_cpu = 0;
         unsigned long avg_load_per_task = 0;
@@ -3794,6 +3879,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
         /* Tally up the load of all CPUs in the group */
         max_cpu_load = 0;
         min_cpu_load = ~0UL;
+       max_nr_running = 0;
  
         for_each_cpu_and(i, sched_group_cpus(group), cpus) {
                 struct rq *rq = cpu_rq(i);
@@ -3811,8 +3897,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
                         load = target_load(i, load_idx);
                 } else {
                         load = source_load(i, load_idx);
-                       if (load > max_cpu_load)
+                       if (load > max_cpu_load) {
                                 max_cpu_load = load;
+                               max_nr_running = rq->nr_running;
+                       }
                         if (min_cpu_load > load)
                                 min_cpu_load = load;
                 }
@@ -3820,7 +3908,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
                 sgs->group_load += load;
                 sgs->sum_nr_running += rq->nr_running;
                 sgs->sum_weighted_load += weighted_cpuload(i);
-
+               if (idle_cpu(i))
+                       sgs->idle_cpus++;
         }
  
         /*
@@ -3850,11 +3939,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
         if (sgs->sum_nr_running)
                 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
  
-       if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+       if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
                 sgs->group_imb = 1;
  
-       sgs->group_capacity =
-               DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+       sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+       sgs->group_weight = group->group_weight;
+
+       if (sgs->group_capacity > sgs->sum_nr_running)
+               sgs->group_has_capacity = 1;
  }
  
  /**
@@ -3901,9 +3993,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                 /*
                  * In case the child domain prefers tasks go to siblings
                  * first, lower the group capacity to one so that we'll try
-                * and move all the excess tasks away.
+                * and move all the excess tasks away. We lower the capacity
+                * of a group only if the local group has the capacity to fit
+                * these excess tasks, i.e. nr_running < group_capacity. The
+                * extra check prevents the case where you always pull from the
+                * heaviest group when it is already under-utilized (possible
+                * when a large weight task outweighs the tasks on the system).
                  */
-               if (prefer_sibling)
+               if (prefer_sibling && !local_group && sds->this_has_capacity)
                         sgs.group_capacity = min(sgs.group_capacity, 1UL);
  
                 if (local_group) {
@@ -3911,14 +4008,19 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                         sds->this = group;
                         sds->this_nr_running = sgs.sum_nr_running;
                         sds->this_load_per_task = sgs.sum_weighted_load;
+                       sds->this_has_capacity = sgs.group_has_capacity;
+                       sds->this_idle_cpus = sgs.idle_cpus;
                 } else if (sgs.avg_load > sds->max_load &&
                            (sgs.sum_nr_running > sgs.group_capacity ||
                                 sgs.group_imb)) {
                         sds->max_load = sgs.avg_load;
                         sds->busiest = group;
                         sds->busiest_nr_running = sgs.sum_nr_running;
+                       sds->busiest_idle_cpus = sgs.idle_cpus;
                         sds->busiest_group_capacity = sgs.group_capacity;
+                       sds->busiest_group_weight = sgs.group_weight;
                         sds->busiest_load_per_task = sgs.sum_weighted_load;
+                       sds->busiest_has_capacity = sgs.group_has_capacity;
                         sds->group_imb = sgs.group_imb;
                 }
  
@@ -4064,6 +4166,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
                 return fix_small_imbalance(sds, this_cpu, imbalance);
  
  }
+
  /******* find_busiest_group() helpers end here *********************/
  
  /**
@@ -4115,6 +4218,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
          * 4) This group is more busy than the avg busieness at this
          *    sched_domain.
          * 5) The imbalance is within the specified limit.
+        *
+        * Note: when doing newidle balance, if the local group has excess
+        * capacity (i.e. nr_running < group_capacity) and the busiest group
+        * does not have any capacity, we force a load balance to pull tasks
+        * to the local group. In this case, we skip past checks 3, 4 and 5.
          */
         if (balance && !(*balance))
                 goto ret;
@@ -4122,6 +4230,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
         if (!sds.busiest || sds.busiest_nr_running == 0)
                 goto out_balanced;
  
+       /*  SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+       if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
+                       !sds.busiest_has_capacity)
+               goto force_balance;
+
         if (sds.this_load >= sds.max_load)
                 goto out_balanced;
  
@@ -4130,9 +4243,28 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
         if (sds.this_load >= sds.avg_load)
                 goto out_balanced;
  
-       if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
-               goto out_balanced;
+       /*
+        * In the CPU_NEWLY_IDLE case, use imbalance_pct to be conservative.
+        * To detect the busy-balance case, use !idle_cpu() instead of
+        * CPU_NOT_IDLE, because HT siblings will use CPU_NOT_IDLE
+        * even when they are idle.
+        */
+       if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
+               if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+                       goto out_balanced;
+       } else {
+               /*
+                * This cpu is idle. If the busiest group doesn't have
+                * more tasks than the number of cpus in that group and
+                * there is no imbalance between this and the busiest group
+                * wrt idle cpus, it is balanced.
+                */
+               if ((sds.this_idle_cpus  <= sds.busiest_idle_cpus + 1) &&
+                   sds.busiest_nr_running <= sds.busiest_group_weight)
+                       goto out_balanced;
+       }
  
+force_balance:
         /* Looks like there is an imbalance. Compute it */
         calculate_imbalance(&sds, this_cpu, imbalance);
         return sds.busiest;
@@ -4288,7 +4420,14 @@ redo:
  
         if (!ld_moved) {
                 schedstat_inc(sd, lb_failed[idle]);
-               sd->nr_balance_failed++;
+               /*
+                * Increment the failure counter only on periodic balance.
+                * We do not want newidle balance, which can be very
+                * frequent, to pollute the failure counter and cause
+                * excessive cache_hot migrations and active balances.
+                */
+               if (idle != CPU_NEWLY_IDLE)
+                       sd->nr_balance_failed++;
  
                 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
  
@@ -5033,7 +5172,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
  
         if (task_current(rq, p)) {
                 update_rq_clock(rq);
-               ns = rq->clock - p->se.exec_start;
+               ns = rq->clock_task - p->se.exec_start;
                 if ((s64)ns < 0)
                         ns = 0;
         }
@@ -5177,7 +5316,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
         tmp = cputime_to_cputime64(cputime);
         if (hardirq_count() - hardirq_offset)
                 cpustat->irq = cputime64_add(cpustat->irq, tmp);
-       else if (softirq_count())
+       else if (in_serving_softirq())
                 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
         else
                 cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -7121,7 +7260,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
         idle->se.exec_start = sched_clock();
  
         cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
+       /*
+        * We have a chicken-and-egg problem: even though we are
+        * holding rq->lock, the cpu isn't yet set to this cpu, so the
+        * lockdep check in task_group() will fail.
+        *
+        * This is similar to the case in sched_fork(). Alternatively we
+        * could use task_rq_lock() here and obtain the other rq->lock.
+        *
+        * Silence PROVE_RCU
+        */
+       rcu_read_lock();
         __set_task_cpu(idle, cpu);
+       rcu_read_unlock();
  
         rq->curr = rq->idle = idle;
  #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -7139,7 +7290,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
          * The idle tasks have their own, simple scheduling class:
          */
         idle->sched_class = &idle_sched_class;
-       ftrace_graph_init_task(idle);
+       ftrace_graph_init_idle_task(idle, cpu);
  }
  
  /*
@@ -8628,6 +8779,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
         if (cpu != group_first_cpu(sd->groups))
                 return;
  
+       sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
+
         child = sd->child;
  
         sd->groups->cpu_power = 0;
@@ -9511,9 +9664,6 @@ void __init sched_init(void)
  #ifdef CONFIG_RT_GROUP_SCHED
         alloc_size += 2 * nr_cpu_ids * sizeof(void **);
  #endif
-#ifdef CONFIG_USER_SCHED
-       alloc_size *= 2;
-#endif
  #ifdef CONFIG_CPUMASK_OFFSTACK
         alloc_size += num_possible_cpus() * cpumask_size();
  #endif
@@ -9531,13 +9681,6 @@ void __init sched_init(void)
                 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
                 ptr += nr_cpu_ids * sizeof(void **);
  
-#ifdef CONFIG_USER_SCHED
-               root_task_group.se = (struct sched_entity **)ptr;
-               ptr += nr_cpu_ids * sizeof(void **);
-
-               root_task_group.cfs_rq = (struct cfs_rq **)ptr;
-               ptr += nr_cpu_ids * sizeof(void **);
-#endif /* CONFIG_USER_SCHED */
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  #ifdef CONFIG_RT_GROUP_SCHED
                 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -9546,13 +9689,6 @@ void __init sched_init(void)
                 init_task_group.rt_rq = (struct rt_rq **)ptr;
                 ptr += nr_cpu_ids * sizeof(void **);
  
-#ifdef CONFIG_USER_SCHED
-               root_task_group.rt_se = (struct sched_rt_entity **)ptr;
-               ptr += nr_cpu_ids * sizeof(void **);
-
-               root_task_group.rt_rq = (struct rt_rq **)ptr;
-               ptr += nr_cpu_ids * sizeof(void **);
-#endif /* CONFIG_USER_SCHED */
  #endif /* CONFIG_RT_GROUP_SCHED */
  #ifdef CONFIG_CPUMASK_OFFSTACK
                 for_each_possible_cpu(i) {
@@ -9572,22 +9708,13 @@ void __init sched_init(void)
  #ifdef CONFIG_RT_GROUP_SCHED
         init_rt_bandwidth(&init_task_group.rt_bandwidth,
                         global_rt_period(), global_rt_runtime());
-#ifdef CONFIG_USER_SCHED
-       init_rt_bandwidth(&root_task_group.rt_bandwidth,
-                       global_rt_period(), RUNTIME_INF);
-#endif /* CONFIG_USER_SCHED */
  #endif /* CONFIG_RT_GROUP_SCHED */
  
-#ifdef CONFIG_GROUP_SCHED
+#ifdef CONFIG_CGROUP_SCHED
         list_add(&init_task_group.list, &task_groups);
         INIT_LIST_HEAD(&init_task_group.children);
  
-#ifdef CONFIG_USER_SCHED
-       INIT_LIST_HEAD(&root_task_group.children);
-       init_task_group.parent = &root_task_group;
-       list_add(&init_task_group.siblings, &root_task_group.children);
-#endif /* CONFIG_USER_SCHED */
-#endif /* CONFIG_GROUP_SCHED */
+#endif /* CONFIG_CGROUP_SCHED */
  
  #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
         update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
@@ -9627,25 +9754,6 @@ void __init sched_init(void)
                  * directly in rq->cfs (i.e init_task_group->se[] = NULL).
                  */
                 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
-#elif defined CONFIG_USER_SCHED
-               root_task_group.shares = NICE_0_LOAD;
-               init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
-               /*
-                * In case of task-groups formed thr' the user id of tasks,
-                * init_task_group represents tasks belonging to root user.
-                * Hence it forms a sibling of all subsequent groups formed.
-                * In this case, init_task_group gets only a fraction of overall
-                * system cpu resource, based on the weight assigned to root
-                * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
-                * by letting tasks of init_task_group sit in a separate cfs_rq
-                * (init_tg_cfs_rq) and having one entity represent this group of
-                * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
-                */
-               init_tg_cfs_entry(&init_task_group,
-                               &per_cpu(init_tg_cfs_rq, i),
-                               &per_cpu(init_sched_entity, i), i, 1,
-                               root_task_group.se[i]);
-
  #endif
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
@@ -9668,6 +9776,7 @@ void __init sched_init(void)
  #ifdef CONFIG_SMP
                 rq->sd = NULL;
                 rq->rd = NULL;
+               rq->cpu_power = SCHED_LOAD_SCALE;
                 rq->post_schedule = 0;
                 rq->active_balance = 0;
                 rq->next_balance = jiffies;
@@ -10062,7 +10171,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
  }
  #endif /* CONFIG_RT_GROUP_SCHED */
  
-#ifdef CONFIG_GROUP_SCHED
+#ifdef CONFIG_CGROUP_SCHED
  static void free_sched_group(struct task_group *tg)
  {
         free_fair_sched_group(tg);
@@ -10157,12 +10266,12 @@ void sched_move_task(struct task_struct *tsk)
         if (unlikely(running))
                 tsk->sched_class->put_prev_task(rq, tsk);
  
-       set_task_rq(tsk, task_cpu(tsk));
-
  #ifdef CONFIG_FAIR_GROUP_SCHED
-       if (tsk->sched_class->moved_group)
-               tsk->sched_class->moved_group(tsk, on_rq);
+       if (tsk->sched_class->task_move_group)
+               tsk->sched_class->task_move_group(tsk, on_rq);
+       else
  #endif
+               set_task_rq(tsk, task_cpu(tsk));
  
         if (unlikely(running))
                 tsk->sched_class->set_curr_task(rq);
@@ -10171,7 +10280,7 @@ void sched_move_task(struct task_struct *tsk)
  
         task_rq_unlock(rq, &flags);
  }
-#endif /* CONFIG_GROUP_SCHED */
+#endif /* CONFIG_CGROUP_SCHED */
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
  static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -10313,13 +10422,6 @@ static int tg_schedulable(struct task_group *tg, void *data)
                 runtime = d->rt_runtime;
         }
  
-#ifdef CONFIG_USER_SCHED
-       if (tg == &root_task_group) {
-               period = global_rt_period();
-               runtime = global_rt_runtime();
-       }
-#endif
-
         /*
          * Cannot have more runtime than the period.
          */