Merge branch 'rcu/next' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck...

author Ingo Molnar <mingo@elte.hu>

Thu, 23 Dec 2010 11:57:04 +0000 (12:57 +0100)

committer Ingo Molnar <mingo@elte.hu>

Thu, 23 Dec 2010 11:57:04 +0000 (12:57 +0100)
author Ingo Molnar <mingo@elte.hu>
Thu, 23 Dec 2010 11:57:04 +0000 (12:57 +0100)
committer Ingo Molnar <mingo@elte.hu>
Thu, 23 Dec 2010 11:57:04 +0000 (12:57 +0100)
diff --combined include/linux/init_task.h

index 1f8c06ce0fa66b83760863735eaf1209908205d7,69f91aacdeee3c3caabbd10a1e6303c482b7d886..6b281fae114a8d28db406e6886e74dafc5ddfaf0
--- 1/include/linux/init_task.h
--- 2/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@@ -29,8 -29,6 +29,8 @@@ extern struct fs_struct init_fs
                 .running = 0,                                           \
                 .lock = __SPIN_LOCK_UNLOCKED(sig.cputimer.lock),        \
         },                                                              \
+ +      .cred_guard_mutex =                                             \
+ +               __MUTEX_INITIALIZER(sig.cred_guard_mutex),             \
   }
   
   extern struct nsproxy init_nsproxy;
@@@ -83,6 -81,12 +83,12 @@@ extern struct group_info init_groups
    */
   # define CAP_INIT_BSET  CAP_FULL_SET
   
+ #ifdef CONFIG_RCU_BOOST
+ #define INIT_TASK_RCU_BOOST()                                         \
+       .rcu_boost_mutex = NULL,
+ #else
+ #define INIT_TASK_RCU_BOOST()
+ #endif
   #ifdef CONFIG_TREE_PREEMPT_RCU
   #define INIT_TASK_RCU_TREE_PREEMPT()                                  \
         .rcu_blocked_node = NULL,
@@@ -94,7 -98,8 +100,8 @@@
         .rcu_read_lock_nesting = 0,                                     \
         .rcu_read_unlock_special = 0,                                   \
         .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry),           \
-       INIT_TASK_RCU_TREE_PREEMPT()
+       INIT_TASK_RCU_TREE_PREEMPT()                                    \
+       INIT_TASK_RCU_BOOST()
   #else
   #define INIT_TASK_RCU_PREEMPT(tsk)
   #endif
@@@ -147,6 -152,8 +154,6 @@@ extern struct cred init_cred
         .group_leader   = &tsk,                                         \
         RCU_INIT_POINTER(.real_cred, &init_cred),                       \
         RCU_INIT_POINTER(.cred, &init_cred),                            \
- -      .cred_guard_mutex =                                             \
- -               __MUTEX_INITIALIZER(tsk.cred_guard_mutex),             \
         .comm           = "swapper",                                    \
         .thread         = INIT_THREAD,                                  \
         .fs             = &init_fs,                                     \
diff --combined include/linux/sched.h

index 223874538b33208e3c5ff11710f3161d58b4aef2,ed1a9bc52b2f5a9521b6ad45f4aa05e7d5d6a3f9..d8005503cc627ef4c281e46286a93dd1864c4cf4
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -143,7 -143,7 +143,7 @@@ extern unsigned long nr_iowait_cpu(int 
   extern unsigned long this_cpu_load(void);
   
   
- -extern void calc_global_load(void);
+ +extern void calc_global_load(unsigned long ticks);
   
   extern unsigned long get_parent_ip(unsigned long addr);
   
@@@ -336,9 -336,6 +336,9 @@@ extern unsigned long sysctl_hung_task_w
   extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
                                          void __user *buffer,
                                          size_t *lenp, loff_t *ppos);
+ +#else
+ +/* Avoid need for ifdefs elsewhere in the code */
+ +enum { sysctl_hung_task_timeout_secs = 0 };
   #endif
   
   /* Attach to any functions which should be ignored in wchan output. */
@@@ -626,10 -623,6 +626,10 @@@ struct signal_struct 
   
         int oom_adj;            /* OOM kill score adjustment (bit shift) */
         int oom_score_adj;      /* OOM kill score adjustment */
+ +
+ +      struct mutex cred_guard_mutex;  /* guard against foreign influences on
+ +                                       * credential calculations
+ +                                       * (notably. ptrace) */
   };
   
   /* Context switch must be unlocked if interrupts are to be enabled */
@@@ -672,9 -665,6 +672,9 @@@ struct user_struct 
         atomic_t inotify_watches; /* How many inotify watches does this user have? */
         atomic_t inotify_devs;  /* How many inotify devs does this user have opened? */
   #endif
+ +#ifdef CONFIG_FANOTIFY
+ +      atomic_t fanotify_listeners;
+ +#endif
   #ifdef CONFIG_EPOLL
         atomic_t epoll_watches; /* The number of file descriptors currently watched */
   #endif
@@@ -862,7 -852,6 +862,7 @@@ struct sched_group 
          * single CPU.
          */
         unsigned int cpu_power, cpu_power_orig;
+ +      unsigned int group_weight;
   
         /*
          * The CPUs this group covers.
@@@ -886,7 -875,6 +886,7 @@@ enum sched_domain_level 
         SD_LV_NONE = 0,
         SD_LV_SIBLING,
         SD_LV_MC,
+ +      SD_LV_BOOK,
         SD_LV_CPU,
         SD_LV_NODE,
         SD_LV_ALLNODES,
@@@ -1084,7 -1072,7 +1084,7 @@@ struct sched_class 
                                          struct task_struct *task);
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
- -      void (*moved_group) (struct task_struct *p, int on_rq);
+ +      void (*task_move_group) (struct task_struct *p, int on_rq);
   #endif
   };
   
@@@ -1172,13 -1160,6 +1172,13 @@@ struct sched_rt_entity 
   
   struct rcu_node;
   
+ +enum perf_event_task_context {
+ +      perf_invalid_context = -1,
+ +      perf_hw_context = 0,
+ +      perf_sw_context,
+ +      perf_nr_task_contexts,
+ +};
+ +
   struct task_struct {
         volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
         void *stack;
@@@ -1229,6 -1210,9 +1229,9 @@@
   #ifdef CONFIG_TREE_PREEMPT_RCU
         struct rcu_node *rcu_blocked_node;
   #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+ #ifdef CONFIG_RCU_BOOST
+       struct rt_mutex *rcu_boost_mutex;
+ #endif /* #ifdef CONFIG_RCU_BOOST */
   
   #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
         struct sched_info sched_info;
@@@ -1313,6 -1297,9 +1316,6 @@@
                                          * credentials (COW) */
         const struct cred __rcu *cred;  /* effective (overridable) subjective task
                                          * credentials (COW) */
- -      struct mutex cred_guard_mutex;  /* guard against foreign influences on
- -                                       * credential calculations
- -                                       * (notably. ptrace) */
         struct cred *replacement_session_keyring; /* for KEYCTL_SESSION_TO_PARENT */
   
         char comm[TASK_COMM_LEN]; /* executable name excluding path
@@@ -1449,7 -1436,7 +1452,7 @@@
         struct futex_pi_state *pi_state_cache;
   #endif
   #ifdef CONFIG_PERF_EVENTS
- -      struct perf_event_context *perf_event_ctxp;
+ +      struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
         struct mutex perf_event_mutex;
         struct list_head perf_event_list;
   #endif
@@@ -1699,7 -1686,8 +1702,7 @@@ extern void thread_group_times(struct t
   /*
    * Per process flags
    */
- -#define PF_ALIGNWARN  0x00000001      /* Print alignment warning msgs */
- -                                      /* Not implemented yet, only for 486*/
+ +#define PF_KSOFTIRQD  0x00000001      /* I am ksoftirqd */
   #define PF_STARTING   0x00000002      /* being created */
   #define PF_EXITING    0x00000004      /* getting shut down */
   #define PF_EXITPIDONE 0x00000008      /* pi exit done on shut down */
@@@ -1711,6 -1699,7 +1714,6 @@@
   #define PF_DUMPCORE   0x00000200      /* dumped core */
   #define PF_SIGNALED   0x00000400      /* killed by a signal */
   #define PF_MEMALLOC   0x00000800      /* Allocating memory */
- -#define PF_FLUSHER    0x00001000      /* responsible for disk writeback */
   #define PF_USED_MATH  0x00002000      /* if unset the fpu must be initialized before use */
   #define PF_FREEZING   0x00004000      /* freeze in progress. do not account to load */
   #define PF_NOFREEZE   0x00008000      /* this thread should not be frozen */
@@@ -1759,7 -1748,8 +1762,8 @@@
   #ifdef CONFIG_PREEMPT_RCU
   
   #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
- #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
+ #define RCU_READ_UNLOCK_BOOSTED (1 << 1) /* boosted while in RCU read-side. */
+ #define RCU_READ_UNLOCK_NEED_QS (1 << 2) /* RCU core needs CPU response. */
   
   static inline void rcu_copy_process(struct task_struct *p)
   {
@@@ -1767,7 -1757,10 +1771,10 @@@
         p->rcu_read_unlock_special = 0;
   #ifdef CONFIG_TREE_PREEMPT_RCU
         p->rcu_blocked_node = NULL;
- #endif
+ #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+ #ifdef CONFIG_RCU_BOOST
+       p->rcu_boost_mutex = NULL;
+ #endif /* #ifdef CONFIG_RCU_BOOST */
         INIT_LIST_HEAD(&p->rcu_node_entry);
   }
   
@@@ -1844,19 -1837,6 +1851,19 @@@ extern void sched_clock_idle_sleep_even
   extern void sched_clock_idle_wakeup_event(u64 delta_ns);
   #endif
   
+ +#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ +/*
+ + * An i/f to runtime opt-in for irq time accounting based off of sched_clock.
+ + * The reason for this explicit opt-in is not to have perf penalty with
+ + * slow sched_clocks.
+ + */
+ +extern void enable_sched_clock_irqtime(void);
+ +extern void disable_sched_clock_irqtime(void);
+ +#else
+ +static inline void enable_sched_clock_irqtime(void) {}
+ +static inline void disable_sched_clock_irqtime(void) {}
+ +#endif
+ +
   extern unsigned long long
   task_sched_runtime(struct task_struct *task);
   extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
@@@ -2241,16 -2221,9 +2248,16 @@@ static inline void task_unlock(struct t
         spin_unlock(&p->alloc_lock);
   }
   
- -extern struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
+ +extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
                                                         unsigned long *flags);
   
+ +#define lock_task_sighand(tsk, flags)                                 \
+ +({    struct sighand_struct *__ss;                                    \
+ +      __cond_lock(&(tsk)->sighand->siglock,                           \
+ +                  (__ss = __lock_task_sighand(tsk, flags)));          \
+ +      __ss;                                                           \
+ +})                                                                    \
+ +
   static inline void unlock_task_sighand(struct task_struct *tsk,
                                                 unsigned long *flags)
   {
@@@ -2405,9 -2378,9 +2412,9 @@@ extern int __cond_resched_lock(spinlock
   
   extern int __cond_resched_softirq(void);
   
- -#define cond_resched_softirq() ({                             \
- -      __might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET);      \
- -      __cond_resched_softirq();                               \
+ +#define cond_resched_softirq() ({                                     \
+ +      __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET);      \
+ +      __cond_resched_softirq();                                       \
   })
   
   /*
diff --combined init/Kconfig

index c9728992a776356e043d21df7b33aa045c2d7904,35518243c4bdddf0f93ba5ab2aee4505db1f7e0b..526ec1c7456a901d896cc3dca7edec19ff242d70
--- 1/init/Kconfig
--- 2/init/Kconfig
+++ b/init/Kconfig
@@@ -21,13 -21,6 +21,13 @@@ config CONSTRUCTOR
         depends on !UML
         default y
   
+ +config HAVE_IRQ_WORK
+ +      bool
+ +
+ +config IRQ_WORK
+ +      bool
+ +      depends on HAVE_IRQ_WORK
+ +
   menu "General setup"
   
   config EXPERIMENTAL
@@@ -71,7 -64,7 +71,7 @@@ config BROKEN_ON_SM
   
   config LOCK_KERNEL
         bool
- -      depends on SMP || PREEMPT
+ +      depends on (SMP || PREEMPT) && BKL
         default y
   
   config INIT_ENV_ARG_LIMIT
@@@ -186,7 -179,7 +186,7 @@@ config KERNEL_LZ
         depends on HAVE_KERNEL_LZO
         help
           Its compression ratio is the poorest among the 4. The kernel
- -        size is about about 10% bigger than gzip; however its speed
+ +        size is about 10% bigger than gzip; however its speed
           (both compression and decompression) is the fastest.
   
   endchoice
@@@ -339,8 -332,6 +339,8 @@@ config AUDIT_TRE
         depends on AUDITSYSCALL
         select FSNOTIFY
   
+ +source "kernel/irq/Kconfig"
+ +
   menu "RCU Subsystem"
   
   choice
@@@ -393,7 -384,6 +393,6 @@@ config PREEMPT_RC
   
   config RCU_TRACE
         bool "Enable tracing for RCU"
-       depends on TREE_RCU || TREE_PREEMPT_RCU
         help
           This option provides tracing in RCU which presents stats
           in debugfs for debugging RCU implementation.
@@@ -459,6 -449,60 +458,60 @@@ config TREE_RCU_TRAC
           TREE_PREEMPT_RCU implementations, permitting Makefile to
           trivially select kernel/rcutree_trace.c.
   
+ config RCU_BOOST
+       bool "Enable RCU priority boosting"
+       depends on RT_MUTEXES && TINY_PREEMPT_RCU
+       default n
+       help
+         This option boosts the priority of preempted RCU readers that
+         block the current preemptible RCU grace period for too long.
+         This option also prevents heavy loads from blocking RCU
+         callback invocation for all flavors of RCU.
+ 
+         Say Y here if you are working with real-time apps or heavy loads.
+         Say N here if you are unsure.
+ 
+ config RCU_BOOST_PRIO
+       int "Real-time priority to boost RCU readers to"
+       range 1 99
+       depends on RCU_BOOST
+       default 1
+       help
+         This option specifies the real-time priority to which preempted
+         RCU readers are to be boosted.  If you are working with CPU-bound
+         real-time applications, you should specify a priority higher than
+         the highest-priority CPU-bound application.
+ 
+         Specify the real-time priority, or take the default if unsure.
+ 
+ config RCU_BOOST_DELAY
+       int "Milliseconds to delay boosting after RCU grace-period start"
+       range 0 3000
+       depends on RCU_BOOST
+       default 500
+       help
+         This option specifies the time to wait after the beginning of
+         a given grace period before priority-boosting preempted RCU
+         readers blocking that grace period.  Note that any RCU reader
+         blocking an expedited RCU grace period is boosted immediately.
+ 
+         Accept the default if unsure.
+ 
+ config SRCU_SYNCHRONIZE_DELAY
+       int "Microseconds to delay before waiting for readers"
+       range 0 20
+       default 10
+       help
+         This option controls how long SRCU delays before entering its
+         loop waiting on SRCU readers.  The purpose of this loop is
+         to avoid the unconditional context-switch penalty that would
+         otherwise be incurred if there was an active SRCU reader,
+         in a manner similar to adaptive locking schemes.  This should
+         be set to be a bit longer than the common-case SRCU read-side
+         critical-section overhead.
+ 
+         Accept the default if unsure.
+ 
   endmenu # "RCU Subsystem"
   
   config IKCONFIG
@@@ -518,6 -562,7 +571,6 @@@ if CGROUP
   
   config CGROUP_DEBUG
         bool "Example debug cgroup subsystem"
- -      depends on CGROUPS
         default n
         help
           This option enables a simple cgroup subsystem that
@@@ -528,6 -573,7 +581,6 @@@
   
   config CGROUP_NS
         bool "Namespace cgroup subsystem"
- -      depends on CGROUPS
         help
           Provides a simple namespace cgroup subsystem to
           provide hierarchical naming of sets of namespaces,
@@@ -536,18 -582,21 +589,18 @@@
   
   config CGROUP_FREEZER
         bool "Freezer cgroup subsystem"
- -      depends on CGROUPS
         help
           Provides a way to freeze and unfreeze all tasks in a
           cgroup.
   
   config CGROUP_DEVICE
         bool "Device controller for cgroups"
- -      depends on CGROUPS && EXPERIMENTAL
         help
           Provides a cgroup implementing whitelists for devices which
           a process in the cgroup can mknod or open.
   
   config CPUSETS
         bool "Cpuset support"
- -      depends on CGROUPS
         help
           This option will let you create and manage CPUSETs which
           allow dynamically partitioning a system into sets of CPUs and
@@@ -563,6 -612,7 +616,6 @@@ config PROC_PID_CPUSE
   
   config CGROUP_CPUACCT
         bool "Simple CPU accounting cgroup subsystem"
- -      depends on CGROUPS
         help
           Provides a simple Resource Controller for monitoring the
           total CPU consumed by the tasks in a cgroup.
@@@ -572,10 -622,11 +625,10 @@@ config RESOURCE_COUNTER
         help
           This option enables controller independent resource accounting
           infrastructure that works with cgroups.
- -      depends on CGROUPS
   
   config CGROUP_MEM_RES_CTLR
         bool "Memory Resource Controller for Control Groups"
- -      depends on CGROUPS && RESOURCE_COUNTERS
+ +      depends on RESOURCE_COUNTERS
         select MM_OWNER
         help
           Provides a memory resource controller that manages both anonymous
@@@ -613,23 -664,10 +666,23 @@@ config CGROUP_MEM_RES_CTLR_SWA
           if boot option "noswapaccount" is set, swap will not be accounted.
           Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
           size is 4096bytes, 512k per 1Gbytes of swap.
+ +config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
+ +      bool "Memory Resource Controller Swap Extension enabled by default"
+ +      depends on CGROUP_MEM_RES_CTLR_SWAP
+ +      default y
+ +      help
+ +        Memory Resource Controller Swap Extension comes with its price in
+ +        a bigger memory consumption. General purpose distribution kernels
+ +        which want to enable the feature but keep it disabled by default
+ +        and let the user enable it by swapaccount boot command line
+ +        parameter should have this option unselected.
+ +        For those who want to have the feature enabled by default should
+ +        select this option (if, for some reason, they need to disable it
+ +        then noswapaccount does the trick).
   
   menuconfig CGROUP_SCHED
         bool "Group CPU scheduler"
- -      depends on EXPERIMENTAL && CGROUPS
+ +      depends on EXPERIMENTAL
         default n
         help
           This feature lets CPU scheduler recognize task groups and control CPU
@@@ -658,7 -696,7 +711,7 @@@ endif #CGROUP_SCHE
   
   config BLK_CGROUP
         tristate "Block IO controller"
- -      depends on CGROUPS && BLOCK
+ +      depends on BLOCK
         default n
         ---help---
         Generic block IO controller cgroup interface. This is the common
@@@ -667,14 -705,11 +720,14 @@@
   
         Currently, CFQ IO scheduler uses it to recognize task groups and
         control disk bandwidth allocation (proportional time slice allocation)
- -      to such task groups.
+ +      to such task groups. It is also used by bio throttling logic in
+ +      block layer to implement upper limit in IO rates on a device.
   
         This option only enables generic Block IO controller infrastructure.
- -      One needs to also enable actual IO controlling logic in CFQ for it
- -      to take effect. (CONFIG_CFQ_GROUP_IOSCHED=y).
+ +      One needs to also enable actual IO controlling logic/policy. For
+ +      enabling proportional weight division of disk bandwidth in CFQ set
+ +      CONFIG_CFQ_GROUP_IOSCHED=y and for enabling throttling policy set
+ +      CONFIG_BLK_THROTTLE=y.
   
         See Documentation/cgroups/blkio-controller.txt for more information.
   
@@@ -688,7 -723,57 +741,7 @@@ config DEBUG_BLK_CGROU
   
   endif # CGROUPS
   
- -config MM_OWNER
- -      bool
- -
- -config SYSFS_DEPRECATED
- -      bool
- -
- -config SYSFS_DEPRECATED_V2
- -      bool "enable deprecated sysfs features to support old userspace tools"
- -      depends on SYSFS
- -      default n
- -      select SYSFS_DEPRECATED
- -      help
- -        This option switches the layout of sysfs to the deprecated
- -        version. Do not use it on recent distributions.
- -
- -        The current sysfs layout features a unified device tree at
- -        /sys/devices/, which is able to express a hierarchy between
- -        class devices. If the deprecated option is set to Y, the
- -        unified device tree is split into a bus device tree at
- -        /sys/devices/ and several individual class device trees at
- -        /sys/class/. The class and bus devices will be connected by
- -        "<subsystem>:<name>" and the "device" links. The "block"
- -        class devices, will not show up in /sys/class/block/. Some
- -        subsystems will suppress the creation of some devices which
- -        depend on the unified device tree.
- -
- -        This option is not a pure compatibility option that can
- -        be safely enabled on newer distributions. It will change the
- -        layout of sysfs to the non-extensible deprecated version,
- -        and disable some features, which can not be exported without
- -        confusing older userspace tools. Since 2007/2008 all major
- -        distributions do not enable this option, and ship no tools which
- -        depend on the deprecated layout or this option.
- -
- -        If you are using a new kernel on an older distribution, or use
- -        older userspace tools, you might need to say Y here. Do not say Y,
- -        if the original kernel, that came with your distribution, has
- -        this option set to N.
- -
- -config RELAY
- -      bool "Kernel->user space relay support (formerly relayfs)"
- -      help
- -        This option enables support for relay interface support in
- -        certain file systems (such as debugfs).
- -        It is designed to provide an efficient mechanism for tools and
- -        facilities to relay large amounts of data from kernel space to
- -        user space.
- -
- -        If unsure, say N.
- -
- -config NAMESPACES
+ +menuconfig NAMESPACES
         bool "Namespaces support" if EMBEDDED
         default !EMBEDDED
         help
@@@ -697,102 -782,48 +750,102 @@@
           or same user id or pid may refer to different tasks when used in
           different namespaces.
   
+ +if NAMESPACES
+ +
   config UTS_NS
         bool "UTS namespace"
- -      depends on NAMESPACES
+ +      default y
         help
           In this namespace tasks see different info provided with the
           uname() system call
   
   config IPC_NS
         bool "IPC namespace"
- -      depends on NAMESPACES && (SYSVIPC || POSIX_MQUEUE)
+ +      depends on (SYSVIPC || POSIX_MQUEUE)
+ +      default y
         help
           In this namespace tasks work with IPC ids which correspond to
           different IPC objects in different namespaces.
   
   config USER_NS
         bool "User namespace (EXPERIMENTAL)"
- -      depends on NAMESPACES && EXPERIMENTAL
+ +      depends on EXPERIMENTAL
+ +      default y
         help
           This allows containers, i.e. vservers, to use user namespaces
           to provide different user info for different servers.
           If unsure, say N.
   
   config PID_NS
- -      bool "PID Namespaces (EXPERIMENTAL)"
- -      default n
- -      depends on NAMESPACES && EXPERIMENTAL
+ +      bool "PID Namespaces"
+ +      default y
         help
           Support process id namespaces.  This allows having multiple
           processes with the same pid as long as they are in different
           pid namespaces.  This is a building block of containers.
   
- -        Unless you want to work with an experimental feature
- -        say N here.
- -
   config NET_NS
         bool "Network namespace"
- -      default n
- -      depends on NAMESPACES && EXPERIMENTAL && NET
+ +      depends on NET
+ +      default y
         help
           Allow user space to create what appear to be multiple instances
           of the network stack.
   
+ +endif # NAMESPACES
+ +
+ +config MM_OWNER
+ +      bool
+ +
+ +config SYSFS_DEPRECATED
+ +      bool "enable deprecated sysfs features to support old userspace tools"
+ +      depends on SYSFS
+ +      default n
+ +      help
+ +        This option adds code that switches the layout of the "block" class
+ +        devices, to not show up in /sys/class/block/, but only in
+ +        /sys/block/.
+ +
+ +        This switch is only active when the sysfs.deprecated=1 boot option is
+ +        passed or the SYSFS_DEPRECATED_V2 option is set.
+ +
+ +        This option allows new kernels to run on old distributions and tools,
+ +        which might get confused by /sys/class/block/. Since 2007/2008 all
+ +        major distributions and tools handle this just fine.
+ +
+ +        Recent distributions and userspace tools after 2009/2010 depend on
+ +        the existence of /sys/class/block/, and will not work with this
+ +        option enabled.
+ +
+ +        Only if you are using a new kernel on an old distribution, you might
+ +        need to say Y here.
+ +
+ +config SYSFS_DEPRECATED_V2
+ +      bool "enabled deprecated sysfs features by default"
+ +      default n
+ +      depends on SYSFS
+ +      depends on SYSFS_DEPRECATED
+ +      help
+ +        Enable deprecated sysfs by default.
+ +
+ +        See the CONFIG_SYSFS_DEPRECATED option for more details about this
+ +        option.
+ +
+ +        Only if you are using a new kernel on an old distribution, you might
+ +        need to say Y here. Even then, odds are you would not need it
+ +        enabled, you can always pass the boot option if absolutely necessary.
+ +
+ +config RELAY
+ +      bool "Kernel->user space relay support (formerly relayfs)"
+ +      help
+ +        This option enables support for relay interface support in
+ +        certain file systems (such as debugfs).
+ +        It is designed to provide an efficient mechanism for tools and
+ +        facilities to relay large amounts of data from kernel space to
+ +        user space.
+ +
+ +        If unsure, say N.
+ +
   config BLK_DEV_INITRD
         bool "Initial RAM filesystem and RAM disk (initramfs/initrd) support"
         depends on BROKEN || !FRV
@@@ -1027,7 -1058,6 +1080,7 @@@ config PERF_EVENT
         default y if (PROFILING || PERF_COUNTERS)
         depends on HAVE_PERF_EVENTS
         select ANON_INODES
+ +      select IRQ_WORK
         help
           Enable kernel support for various performance events provided
           by software and hardware.
diff --combined kernel/sched.c

index 297d1a0eedb0e68d8b9327f530ba477c93b1222e,d1e8889872a1222845c0b1c48daeece3cb9e7506..e6f8f1254319b730933764fddab98e7804fd59ce
--- 1/kernel/sched.c
--- 2/kernel/sched.c
+++ b/kernel/sched.c
@@@ -426,7 -426,9 +426,7 @@@ struct root_domain 
          */
         cpumask_var_t rto_mask;
         atomic_t rto_count;
- -#ifdef CONFIG_SMP
         struct cpupri cpupri;
- -#endif
   };
   
   /*
@@@ -435,7 -437,7 +435,7 @@@
    */
   static struct root_domain def_root_domain;
   
- -#endif
+ +#endif /* CONFIG_SMP */
   
   /*
    * This is the main, per-CPU runqueue data structure.
@@@ -486,12 -488,11 +486,12 @@@ struct rq 
          */
         unsigned long nr_uninterruptible;
   
- -      struct task_struct *curr, *idle;
+ +      struct task_struct *curr, *idle, *stop;
         unsigned long next_balance;
         struct mm_struct *prev_mm;
   
         u64 clock;
+ +      u64 clock_task;
   
         atomic_t nr_iowait;
   
@@@ -519,10 -520,6 +519,10 @@@
         u64 avg_idle;
   #endif
   
+ +#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ +      u64 prev_irq_time;
+ +#endif
+ +
         /* calc_load related fields */
         unsigned long calc_load_update;
         long calc_load_active;
@@@ -560,8 -557,18 +560,8 @@@
   
   static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
   
- -static inline
- -void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
- -{
- -      rq->curr->sched_class->check_preempt_curr(rq, p, flags);
   
- -      /*
- -       * A queue event has occurred, and we're going to schedule.  In
- -       * this case, we can save a useless back to back clock update.
- -       */
- -      if (test_tsk_need_resched(p))
- -              rq->skip_clock_update = 1;
- -}
+ +static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
   
   static inline int cpu_of(struct rq *rq)
   {
@@@ -636,18 -643,10 +636,18 @@@ static inline struct task_group *task_g
   
   #endif /* CONFIG_CGROUP_SCHED */
   
- -inline void update_rq_clock(struct rq *rq)
+ +static void update_rq_clock_task(struct rq *rq, s64 delta);
+ +
+ +static void update_rq_clock(struct rq *rq)
   {
- -      if (!rq->skip_clock_update)
- -              rq->clock = sched_clock_cpu(cpu_of(rq));
+ +      s64 delta;
+ +
+ +      if (rq->skip_clock_update)
+ +              return;
+ +
+ +      delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+ +      rq->clock += delta;
+ +      update_rq_clock_task(rq, delta);
   }
   
   /*
@@@ -724,7 -723,7 +724,7 @@@ sched_feat_write(struct file *filp, con
                 size_t cnt, loff_t *ppos)
   {
         char buf[64];
- -      char *cmp = buf;
+ +      char *cmp;
         int neg = 0;
         int i;
   
@@@ -735,7 -734,6 +735,7 @@@
                 return -EFAULT;
   
         buf[cnt] = 0;
+ +      cmp = strstrip(buf);
   
         if (strncmp(buf, "NO_", 3) == 0) {
                 neg = 1;
@@@ -743,7 -741,9 +743,7 @@@
         }
   
         for (i = 0; sched_feat_names[i]; i++) {
- -              int len = strlen(sched_feat_names[i]);
- -
- -              if (strncmp(cmp, sched_feat_names[i], len) == 0) {
+ +              if (strcmp(cmp, sched_feat_names[i]) == 0) {
                         if (neg)
                                 sysctl_sched_features &= ~(1UL << i);
                         else
@@@ -1840,7 -1840,7 +1840,7 @@@ static inline void __set_task_cpu(struc
   
   static const struct sched_class rt_sched_class;
   
- -#define sched_class_highest (&rt_sched_class)
+ +#define sched_class_highest (&stop_sched_class)
   #define for_each_class(class) \
      for (class = sched_class_highest; class; class = class->next)
   
@@@ -1858,6 -1858,12 +1858,6 @@@ static void dec_nr_running(struct rq *r
   
   static void set_load_weight(struct task_struct *p)
   {
- -      if (task_has_rt_policy(p)) {
- -              p->se.load.weight = 0;
- -              p->se.load.inv_weight = WMULT_CONST;
- -              return;
- -      }
- -
         /*
          * SCHED_IDLE tasks get minimal weight:
          */
@@@ -1911,193 -1917,13 +1911,193 @@@ static void deactivate_task(struct rq *
         dec_nr_running(rq);
   }
   
+ +#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ +
+ +/*
+ + * There are no locks covering percpu hardirq/softirq time.
+ + * They are only modified in account_system_vtime, on corresponding CPU
+ + * with interrupts disabled. So, writes are safe.
+ + * They are read and saved off onto struct rq in update_rq_clock().
+ + * This may result in other CPU reading this CPU's irq time and can
+ + * race with irq/account_system_vtime on this CPU. We would either get old
+ + * or new value with a side effect of accounting a slice of irq time to wrong
+ + * task when irq is in progress while we read rq->clock. That is a worthy
+ + * compromise in place of having locks on each irq in account_system_vtime.
+ + */
+ +static DEFINE_PER_CPU(u64, cpu_hardirq_time);
+ +static DEFINE_PER_CPU(u64, cpu_softirq_time);
+ +
+ +static DEFINE_PER_CPU(u64, irq_start_time);
+ +static int sched_clock_irqtime;
+ +
+ +void enable_sched_clock_irqtime(void)
+ +{
+ +      sched_clock_irqtime = 1;
+ +}
+ +
+ +void disable_sched_clock_irqtime(void)
+ +{
+ +      sched_clock_irqtime = 0;
+ +}
+ +
+ +#ifndef CONFIG_64BIT
+ +static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+ +
+ +static inline void irq_time_write_begin(void)
+ +{
+ +      __this_cpu_inc(irq_time_seq.sequence);
+ +      smp_wmb();
+ +}
+ +
+ +static inline void irq_time_write_end(void)
+ +{
+ +      smp_wmb();
+ +      __this_cpu_inc(irq_time_seq.sequence);
+ +}
+ +
+ +static inline u64 irq_time_read(int cpu)
+ +{
+ +      u64 irq_time;
+ +      unsigned seq;
+ +
+ +      do {
+ +              seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+ +              irq_time = per_cpu(cpu_softirq_time, cpu) +
+ +                         per_cpu(cpu_hardirq_time, cpu);
+ +      } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+ +
+ +      return irq_time;
+ +}
+ +#else /* CONFIG_64BIT */
+ +static inline void irq_time_write_begin(void)
+ +{
+ +}
+ +
+ +static inline void irq_time_write_end(void)
+ +{
+ +}
+ +
+ +static inline u64 irq_time_read(int cpu)
+ +{
+ +      return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+ +}
+ +#endif /* CONFIG_64BIT */
+ +
+ +/*
+ + * Called before incrementing preempt_count on {soft,}irq_enter
+ + * and before decrementing preempt_count on {soft,}irq_exit.
+ + */
+ +void account_system_vtime(struct task_struct *curr)
+ +{
+ +      unsigned long flags;
+ +      s64 delta;
+ +      int cpu;
+ +
+ +      if (!sched_clock_irqtime)
+ +              return;
+ +
+ +      local_irq_save(flags);
+ +
+ +      cpu = smp_processor_id();
+ +      delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+ +      __this_cpu_add(irq_start_time, delta);
+ +
+ +      irq_time_write_begin();
+ +      /*
+ +       * We do not account for softirq time from ksoftirqd here.
+ +       * We want to continue accounting softirq time to ksoftirqd thread
+ +       * in that case, so as not to confuse scheduler with a special task
+ +       * that does not consume any time, but still wants to run.
+ +       */
+ +      if (hardirq_count())
+ +              __this_cpu_add(cpu_hardirq_time, delta);
+ +      else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+ +              __this_cpu_add(cpu_softirq_time, delta);
+ +
+ +      irq_time_write_end();
+ +      local_irq_restore(flags);
+ +}
+ +EXPORT_SYMBOL_GPL(account_system_vtime);
+ +
+ +static void update_rq_clock_task(struct rq *rq, s64 delta)
+ +{
+ +      s64 irq_delta;
+ +
+ +      irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+ +
+ +      /*
+ +       * Since irq_time is only updated on {soft,}irq_exit, we might run into
+ +       * this case when a previous update_rq_clock() happened inside a
+ +       * {soft,}irq region.
+ +       *
+ +       * When this happens, we stop ->clock_task and only update the
+ +       * prev_irq_time stamp to account for the part that fit, so that a next
+ +       * update will consume the rest. This ensures ->clock_task is
+ +       * monotonic.
+ +       *
+ +       * It does however cause some slight misattribution of {soft,}irq
+ +       * time, a more accurate solution would be to update the irq_time using
+ +       * the current rq->clock timestamp, except that would require using
+ +       * atomic ops.
+ +       */
+ +      if (irq_delta > delta)
+ +              irq_delta = delta;
+ +
+ +      rq->prev_irq_time += irq_delta;
+ +      delta -= irq_delta;
+ +      rq->clock_task += delta;
+ +
+ +      if (irq_delta && sched_feat(NONIRQ_POWER))
+ +              sched_rt_avg_update(rq, irq_delta);
+ +}
+ +
+ +#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+ +
+ +static void update_rq_clock_task(struct rq *rq, s64 delta)
+ +{
+ +      rq->clock_task += delta;
+ +}
+ +
+ +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+ +
   #include "sched_idletask.c"
   #include "sched_fair.c"
   #include "sched_rt.c"
+ +#include "sched_stoptask.c"
   #ifdef CONFIG_SCHED_DEBUG
   # include "sched_debug.c"
   #endif
   
+ +void sched_set_stop_task(int cpu, struct task_struct *stop)
+ +{
+ +      struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+ +      struct task_struct *old_stop = cpu_rq(cpu)->stop;
+ +
+ +      if (stop) {
+ +              /*
+ +               * Make it appear like a SCHED_FIFO task, it's something
+ +               * userspace knows about and won't get confused about.
+ +               *
+ +               * Also, it will make PI more or less work without too
+ +               * much confusion -- but then, stop work should not
+ +               * rely on PI working anyway.
+ +               */
+ +              sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
+ +
+ +              stop->sched_class = &stop_sched_class;
+ +      }
+ +
+ +      cpu_rq(cpu)->stop = stop;
+ +
+ +      if (old_stop) {
+ +              /*
+ +               * Reset it back to a normal scheduling class so that
+ +               * it can die in pieces.
+ +               */
+ +              old_stop->sched_class = &rt_sched_class;
+ +      }
+ +}
+ +
   /*
    * __normal_prio - return the priority that is based on the static prio
    */
@@@ -2165,31 -1991,6 +2165,31 @@@ static inline void check_class_changed(
                 p->sched_class->prio_changed(rq, p, oldprio, running);
   }
   
+ +static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+ +{
+ +      const struct sched_class *class;
+ +
+ +      if (p->sched_class == rq->curr->sched_class) {
+ +              rq->curr->sched_class->check_preempt_curr(rq, p, flags);
+ +      } else {
+ +              for_each_class(class) {
+ +                      if (class == rq->curr->sched_class)
+ +                              break;
+ +                      if (class == p->sched_class) {
+ +                              resched_task(rq->curr);
+ +                              break;
+ +                      }
+ +              }
+ +      }
+ +
+ +      /*
+ +       * A queue event has occurred, and we're going to schedule.  In
+ +       * this case, we can save a useless back to back clock update.
+ +       */
+ +      if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
+ +              rq->skip_clock_update = 1;
+ +}
+ +
   #ifdef CONFIG_SMP
   /*
    * Is this task likely cache-hot:
@@@ -2202,9 -2003,6 +2202,9 @@@ task_hot(struct task_struct *p, u64 now
         if (p->sched_class != &fair_sched_class)
                 return 0;
   
+ +      if (unlikely(p->policy == SCHED_IDLE))
+ +              return 0;
+ +
         /*
          * Buddy candidates are cache hot:
          */
@@@ -3054,14 -2852,14 +3054,14 @@@ context_switch(struct rq *rq, struct ta
          */
         arch_start_context_switch(prev);
   
- -      if (likely(!mm)) {
+ +      if (!mm) {
                 next->active_mm = oldmm;
                 atomic_inc(&oldmm->mm_count);
                 enter_lazy_tlb(oldmm, next);
         } else
                 switch_mm(oldmm, mm, next);
   
- -      if (likely(!prev->mm)) {
+ +      if (!prev->mm) {
                 prev->active_mm = NULL;
                 rq->prev_mm = oldmm;
         }
@@@ -3176,15 -2974,6 +3176,15 @@@ static long calc_load_fold_active(struc
         return delta;
   }
   
+ +static unsigned long
+ +calc_load(unsigned long load, unsigned long exp, unsigned long active)
+ +{
+ +      load *= exp;
+ +      load += active * (FIXED_1 - exp);
+ +      load += 1UL << (FSHIFT - 1);
+ +      return load >> FSHIFT;
+ +}
+ +
   #ifdef CONFIG_NO_HZ
   /*
    * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@@ -3214,128 -3003,6 +3214,128 @@@ static long calc_load_fold_idle(void
   
         return delta;
   }
+ +
+ +/**
+ + * fixed_power_int - compute: x^n, in O(log n) time
+ + *
+ + * @x:         base of the power
+ + * @frac_bits: fractional bits of @x
+ + * @n:         power to raise @x to.
+ + *
+ + * By exploiting the relation between the definition of the natural power
+ + * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ + * (where: n_i \elem {0, 1}, the binary vector representing n),
+ + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ + * of course trivially computable in O(log_2 n), the length of our binary
+ + * vector.
+ + */
+ +static unsigned long
+ +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+ +{
+ +      unsigned long result = 1UL << frac_bits;
+ +
+ +      if (n) for (;;) {
+ +              if (n & 1) {
+ +                      result *= x;
+ +                      result += 1UL << (frac_bits - 1);
+ +                      result >>= frac_bits;
+ +              }
+ +              n >>= 1;
+ +              if (!n)
+ +                      break;
+ +              x *= x;
+ +              x += 1UL << (frac_bits - 1);
+ +              x >>= frac_bits;
+ +      }
+ +
+ +      return result;
+ +}
+ +
+ +/*
+ + * a1 = a0 * e + a * (1 - e)
+ + *
+ + * a2 = a1 * e + a * (1 - e)
+ + *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ + *    = a0 * e^2 + a * (1 - e) * (1 + e)
+ + *
+ + * a3 = a2 * e + a * (1 - e)
+ + *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ + *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ + *
+ + *  ...
+ + *
+ + * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
+ + *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ + *    = a0 * e^n + a * (1 - e^n)
+ + *
+ + * [1] application of the geometric series:
+ + *
+ + *              n         1 - x^(n+1)
+ + *     S_n := \Sum x^i = -------------
+ + *             i=0          1 - x
+ + */
+ +static unsigned long
+ +calc_load_n(unsigned long load, unsigned long exp,
+ +          unsigned long active, unsigned int n)
+ +{
+ +
+ +      return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+ +}
+ +
+ +/*
+ + * NO_HZ can leave us missing all per-cpu ticks calling
+ + * calc_load_account_active(), but since an idle CPU folds its delta into
+ + * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
+ + * in the pending idle delta if our idle period crossed a load cycle boundary.
+ + *
+ + * Once we've updated the global active value, we need to apply the exponential
+ + * weights adjusted to the number of cycles missed.
+ + */
+ +static void calc_global_nohz(unsigned long ticks)
+ +{
+ +      long delta, active, n;
+ +
+ +      if (time_before(jiffies, calc_load_update))
+ +              return;
+ +
+ +      /*
+ +       * If we crossed a calc_load_update boundary, make sure to fold
+ +       * any pending idle changes, the respective CPUs might have
+ +       * missed the tick driven calc_load_account_active() update
+ +       * due to NO_HZ.
+ +       */
+ +      delta = calc_load_fold_idle();
+ +      if (delta)
+ +              atomic_long_add(delta, &calc_load_tasks);
+ +
+ +      /*
+ +       * If we were idle for multiple load cycles, apply them.
+ +       */
+ +      if (ticks >= LOAD_FREQ) {
+ +              n = ticks / LOAD_FREQ;
+ +
+ +              active = atomic_long_read(&calc_load_tasks);
+ +              active = active > 0 ? active * FIXED_1 : 0;
+ +
+ +              avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+ +              avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+ +              avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+ +
+ +              calc_load_update += n * LOAD_FREQ;
+ +      }
+ +
+ +      /*
+ +       * It's possible the remainder of the above division also crosses
+ +       * a LOAD_FREQ period, the regular check in calc_global_load()
+ +       * which comes after this will take care of that.
+ +       *
+ +       * Consider us being 11 ticks before a cycle completion, and us
+ +       * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
+ +       * age us 4 cycles, and the test in calc_global_load() will
+ +       * pick up the final one.
+ +       */
+ +}
   #else
   static void calc_load_account_idle(struct rq *this_rq)
   {
@@@ -3345,10 -3012,6 +3345,10 @@@ static inline long calc_load_fold_idle(
   {
         return 0;
   }
+ +
+ +static void calc_global_nohz(unsigned long ticks)
+ +{
+ +}
   #endif
   
   /**
@@@ -3366,17 -3029,24 +3366,17 @@@ void get_avenrun(unsigned long *loads, 
         loads[2] = (avenrun[2] + offset) << shift;
   }
   
- -static unsigned long
- -calc_load(unsigned long load, unsigned long exp, unsigned long active)
- -{
- -      load *= exp;
- -      load += active * (FIXED_1 - exp);
- -      return load >> FSHIFT;
- -}
- -
   /*
    * calc_load - update the avenrun load estimates 10 ticks after the
    * CPUs have updated calc_load_tasks.
    */
- -void calc_global_load(void)
+ +void calc_global_load(unsigned long ticks)
   {
- -      unsigned long upd = calc_load_update + 10;
         long active;
   
- -      if (time_before(jiffies, upd))
+ +      calc_global_nohz(ticks);
+ +
+ +      if (time_before(jiffies, calc_load_update + 10))
                 return;
   
         active = atomic_long_read(&calc_load_tasks);
@@@ -3578,7 -3248,7 +3578,7 @@@ static u64 do_task_delta_exec(struct ta
   
         if (task_current(rq, p)) {
                 update_rq_clock(rq);
- -              ns = rq->clock - p->se.exec_start;
+ +              ns = rq->clock_task - p->se.exec_start;
                 if ((s64)ns < 0)
                         ns = 0;
         }
@@@ -3727,7 -3397,7 +3727,7 @@@ void account_system_time(struct task_st
         tmp = cputime_to_cputime64(cputime);
         if (hardirq_count() - hardirq_offset)
                 cpustat->irq = cputime64_add(cpustat->irq, tmp);
- -      else if (softirq_count())
+ +      else if (in_serving_softirq())
                 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
         else
                 cpustat->system = cputime64_add(cpustat->system, tmp);
@@@ -3914,7 -3584,7 +3914,7 @@@ void scheduler_tick(void
         curr->sched_class->task_tick(rq, curr, 0);
         raw_spin_unlock(&rq->lock);
   
- -      perf_event_task_tick(curr);
+ +      perf_event_task_tick();
   
   #ifdef CONFIG_SMP
         rq->idle_at_tick = idle_cpu(cpu);
@@@ -4030,6 -3700,7 +4030,6 @@@ static void put_prev_task(struct rq *rq
   {
         if (prev->se.on_rq)
                 update_rq_clock(rq);
- -      rq->skip_clock_update = 0;
         prev->sched_class->put_prev_task(rq, prev);
   }
   
@@@ -4052,13 -3723,17 +4052,13 @@@ pick_next_task(struct rq *rq
                         return p;
         }
   
- -      class = sched_class_highest;
- -      for ( ; ; ) {
+ +      for_each_class(class) {
                 p = class->pick_next_task(rq);
                 if (p)
                         return p;
- -              /*
- -               * Will never be NULL as the idle class always
- -               * returns a non-NULL p:
- -               */
- -              class = class->next;
         }
+ +
+ +      BUG(); /* the idle class will always have a runnable task */
   }
   
   /*
@@@ -4087,6 -3762,7 +4087,6 @@@ need_resched_nonpreemptible
                 hrtick_clear(rq);
   
         raw_spin_lock_irq(&rq->lock);
- -      clear_tsk_need_resched(prev);
   
         switch_count = &prev->nivcsw;
         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@@ -4118,8 -3794,6 +4118,8 @@@
   
         put_prev_task(rq, prev);
         next = pick_next_task(rq);
+ +      clear_tsk_need_resched(prev);
+ +      rq->skip_clock_update = 0;
   
         if (likely(prev != next)) {
                 sched_info_switch(prev, next);
@@@ -4684,7 -4358,6 +4684,7 @@@ void rt_mutex_setprio(struct task_struc
   
         rq = task_rq_lock(p, &flags);
   
+ +      trace_sched_pi_setprio(p, prio);
         oldprio = p->prio;
         prev_class = p->sched_class;
         on_rq = p->se.on_rq;
@@@ -4972,7 -4645,7 +4972,7 @@@ recheck
         }
   
         if (user) {
- -              retval = security_task_setscheduler(p, policy, param);
+ +              retval = security_task_setscheduler(p);
                 if (retval)
                         return retval;
         }
@@@ -4988,15 -4661,6 +4988,15 @@@
          */
         rq = __task_rq_lock(p);
   
+ +      /*
+ +       * Changing the policy of the stop threads is a very bad idea
+ +       */
+ +      if (p == rq->stop) {
+ +              __task_rq_unlock(rq);
+ +              raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ +              return -EINVAL;
+ +      }
+ +
   #ifdef CONFIG_RT_GROUP_SCHED
         if (user) {
                 /*
@@@ -5223,13 -4887,13 +5223,13 @@@ long sched_setaffinity(pid_t pid, cons
         if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
                 goto out_unlock;
   
- -      retval = security_task_setscheduler(p, 0, NULL);
+ +      retval = security_task_setscheduler(p);
         if (retval)
                 goto out_unlock;
   
         cpuset_cpus_allowed(p, cpus_allowed);
         cpumask_and(new_mask, in_mask, cpus_allowed);
- - again:
+ +again:
         retval = set_cpus_allowed_ptr(p, new_mask);
   
         if (!retval) {
@@@ -6862,7 -6526,6 +6862,7 @@@ struct s_data 
         cpumask_var_t           nodemask;
         cpumask_var_t           this_sibling_map;
         cpumask_var_t           this_core_map;
+ +      cpumask_var_t           this_book_map;
         cpumask_var_t           send_covered;
         cpumask_var_t           tmpmask;
         struct sched_group      **sched_group_nodes;
@@@ -6874,7 -6537,6 +6874,7 @@@ enum s_alloc 
         sa_rootdomain,
         sa_tmpmask,
         sa_send_covered,
+ +      sa_this_book_map,
         sa_this_core_map,
         sa_this_sibling_map,
         sa_nodemask,
@@@ -6910,48 -6572,31 +6910,48 @@@ cpu_to_cpu_group(int cpu, const struct 
   #ifdef CONFIG_SCHED_MC
   static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
   static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
- -#endif /* CONFIG_SCHED_MC */
   
- -#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
   static int
   cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
                   struct sched_group **sg, struct cpumask *mask)
   {
         int group;
- -
+ +#ifdef CONFIG_SCHED_SMT
         cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
         group = cpumask_first(mask);
+ +#else
+ +      group = cpu;
+ +#endif
         if (sg)
                 *sg = &per_cpu(sched_group_core, group).sg;
         return group;
   }
- -#elif defined(CONFIG_SCHED_MC)
+ +#endif /* CONFIG_SCHED_MC */
+ +
+ +/*
+ + * book sched-domains:
+ + */
+ +#ifdef CONFIG_SCHED_BOOK
+ +static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
+ +static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
+ +
   static int
- -cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
- -                struct sched_group **sg, struct cpumask *unused)
+ +cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
+ +                struct sched_group **sg, struct cpumask *mask)
   {
+ +      int group = cpu;
+ +#ifdef CONFIG_SCHED_MC
+ +      cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
+ +      group = cpumask_first(mask);
+ +#elif defined(CONFIG_SCHED_SMT)
+ +      cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
+ +      group = cpumask_first(mask);
+ +#endif
         if (sg)
- -              *sg = &per_cpu(sched_group_core, cpu).sg;
- -      return cpu;
+ +              *sg = &per_cpu(sched_group_book, group).sg;
+ +      return group;
   }
- -#endif
+ +#endif /* CONFIG_SCHED_BOOK */
   
   static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
   static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
@@@ -6961,10 -6606,7 +6961,10 @@@ cpu_to_phys_group(int cpu, const struc
                   struct sched_group **sg, struct cpumask *mask)
   {
         int group;
- -#ifdef CONFIG_SCHED_MC
+ +#ifdef CONFIG_SCHED_BOOK
+ +      cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
+ +      group = cpumask_first(mask);
+ +#elif defined(CONFIG_SCHED_MC)
         cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
         group = cpumask_first(mask);
   #elif defined(CONFIG_SCHED_SMT)
@@@ -7160,8 -6802,6 +7160,8 @@@ static void init_sched_groups_power(in
         if (cpu != group_first_cpu(sd->groups))
                 return;
   
+ +      sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
+ +
         child = sd->child;
   
         sd->groups->cpu_power = 0;
@@@ -7227,9 -6867,6 +7227,9 @@@ SD_INIT_FUNC(CPU
   #ifdef CONFIG_SCHED_MC
    SD_INIT_FUNC(MC)
   #endif
+ +#ifdef CONFIG_SCHED_BOOK
+ + SD_INIT_FUNC(BOOK)
+ +#endif
   
   static int default_relax_domain_level = -1;
   
@@@ -7279,8 -6916,6 +7279,8 @@@ static void __free_domain_allocs(struc
                 free_cpumask_var(d->tmpmask); /* fall through */
         case sa_send_covered:
                 free_cpumask_var(d->send_covered); /* fall through */
+ +      case sa_this_book_map:
+ +              free_cpumask_var(d->this_book_map); /* fall through */
         case sa_this_core_map:
                 free_cpumask_var(d->this_core_map); /* fall through */
         case sa_this_sibling_map:
@@@ -7327,10 -6962,8 +7327,10 @@@ static enum s_alloc __visit_domain_allo
                 return sa_nodemask;
         if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
                 return sa_this_sibling_map;
- -      if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+ +      if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
                 return sa_this_core_map;
+ +      if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+ +              return sa_this_book_map;
         if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
                 return sa_send_covered;
         d->rd = alloc_rootdomain();
@@@ -7388,23 -7021,6 +7388,23 @@@ static struct sched_domain *__build_cpu
         return sd;
   }
   
+ +static struct sched_domain *__build_book_sched_domain(struct s_data *d,
+ +      const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+ +      struct sched_domain *parent, int i)
+ +{
+ +      struct sched_domain *sd = parent;
+ +#ifdef CONFIG_SCHED_BOOK
+ +      sd = &per_cpu(book_domains, i).sd;
+ +      SD_INIT(sd, BOOK);
+ +      set_domain_attribute(sd, attr);
+ +      cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
+ +      sd->parent = parent;
+ +      parent->child = sd;
+ +      cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
+ +#endif
+ +      return sd;
+ +}
+ +
   static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
         const struct cpumask *cpu_map, struct sched_domain_attr *attr,
         struct sched_domain *parent, int i)
@@@ -7461,15 -7077,6 +7461,15 @@@ static void build_sched_groups(struct s
                                                 &cpu_to_core_group,
                                                 d->send_covered, d->tmpmask);
                 break;
+ +#endif
+ +#ifdef CONFIG_SCHED_BOOK
+ +      case SD_LV_BOOK: /* set up book groups */
+ +              cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
+ +              if (cpu == cpumask_first(d->this_book_map))
+ +                      init_sched_build_groups(d->this_book_map, cpu_map,
+ +                                              &cpu_to_book_group,
+ +                                              d->send_covered, d->tmpmask);
+ +              break;
   #endif
         case SD_LV_CPU: /* set up physical groups */
                 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
@@@ -7518,14 -7125,12 +7518,14 @@@ static int __build_sched_domains(const 
   
                 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
                 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
+ +              sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
                 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
                 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
         }
   
         for_each_cpu(i, cpu_map) {
                 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
+ +              build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
                 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
         }
   
@@@ -7556,12 -7161,6 +7556,12 @@@
                 init_sched_groups_power(i, sd);
         }
   #endif
+ +#ifdef CONFIG_SCHED_BOOK
+ +      for_each_cpu(i, cpu_map) {
+ +              sd = &per_cpu(book_domains, i).sd;
+ +              init_sched_groups_power(i, sd);
+ +      }
+ +#endif
   
         for_each_cpu(i, cpu_map) {
                 sd = &per_cpu(phys_domains, i).sd;
@@@ -7587,8 -7186,6 +7587,8 @@@
                 sd = &per_cpu(cpu_domains, i).sd;
   #elif defined(CONFIG_SCHED_MC)
                 sd = &per_cpu(core_domains, i).sd;
+ +#elif defined(CONFIG_SCHED_BOOK)
+ +              sd = &per_cpu(book_domains, i).sd;
   #else
                 sd = &per_cpu(phys_domains, i).sd;
   #endif
@@@ -8493,9 -8090,9 +8493,9 @@@ int alloc_fair_sched_group(struct task_
   
         return 1;
   
- - err_free_rq:
+ +err_free_rq:
         kfree(cfs_rq);
- - err:
+ +err:
         return 0;
   }
   
@@@ -8583,9 -8180,9 +8583,9 @@@ int alloc_rt_sched_group(struct task_gr
   
         return 1;
   
- - err_free_rq:
+ +err_free_rq:
         kfree(rt_rq);
- - err:
+ +err:
         return 0;
   }
   
@@@ -8712,12 -8309,12 +8712,12 @@@ void sched_move_task(struct task_struc
         if (unlikely(running))
                 tsk->sched_class->put_prev_task(rq, tsk);
   
- -      set_task_rq(tsk, task_cpu(tsk));
- -
   #ifdef CONFIG_FAIR_GROUP_SCHED
- -      if (tsk->sched_class->moved_group)
- -              tsk->sched_class->moved_group(tsk, on_rq);
+ +      if (tsk->sched_class->task_move_group)
+ +              tsk->sched_class->task_move_group(tsk, on_rq);
+ +      else
   #endif
+ +              set_task_rq(tsk, task_cpu(tsk));
   
         if (unlikely(running))
                 tsk->sched_class->set_curr_task(rq);
@@@ -8943,7 -8540,7 +8943,7 @@@ static int tg_set_bandwidth(struct task
                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
         }
         raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
- - unlock:
+ +unlock:
         read_unlock(&tasklist_lock);
         mutex_unlock(&rt_constraints_mutex);
   
@@@ -9534,72 -9131,3 +9534,3 @@@ struct cgroup_subsys cpuacct_subsys = 
   };
   #endif        /* CONFIG_CGROUP_CPUACCT */
   
- #ifndef CONFIG_SMP
- 
- void synchronize_sched_expedited(void)
- {
-       barrier();
- }
- EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
- 
- #else /* #ifndef CONFIG_SMP */
- 
- static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
- 
- static int synchronize_sched_expedited_cpu_stop(void *data)
- {
-       /*
-        * There must be a full memory barrier on each affected CPU
-        * between the time that try_stop_cpus() is called and the
-        * time that it returns.
-        *
-        * In the current initial implementation of cpu_stop, the
-        * above condition is already met when the control reaches
-        * this point and the following smp_mb() is not strictly
-        * necessary.  Do smp_mb() anyway for documentation and
-        * robustness against future implementation changes.
-        */
-       smp_mb(); /* See above comment block. */
-       return 0;
- }
- 
- /*
-  * Wait for an rcu-sched grace period to elapse, but use "big hammer"
-  * approach to force grace period to end quickly.  This consumes
-  * significant time on all CPUs, and is thus not recommended for
-  * any sort of common-case code.
-  *
-  * Note that it is illegal to call this function while holding any
-  * lock that is acquired by a CPU-hotplug notifier.  Failing to
-  * observe this restriction will result in deadlock.
-  */
- void synchronize_sched_expedited(void)
- {
-       int snap, trycount = 0;
- 
-       smp_mb();  /* ensure prior mod happens before capturing snap. */
-       snap = atomic_read(&synchronize_sched_expedited_count) + 1;
-       get_online_cpus();
-       while (try_stop_cpus(cpu_online_mask,
-                            synchronize_sched_expedited_cpu_stop,
-                            NULL) == -EAGAIN) {
-               put_online_cpus();
-               if (trycount++ < 10)
-                       udelay(trycount * num_online_cpus());
-               else {
-                       synchronize_sched();
-                       return;
-               }
-               if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
-                       smp_mb(); /* ensure test happens before caller kfree */
-                       return;
-               }
-               get_online_cpus();
-       }
-       atomic_inc(&synchronize_sched_expedited_count);
-       smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
-       put_online_cpus();
- }
- EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
- 
- #endif /* #else #ifndef CONFIG_SMP */
author	Ingo Molnar <mingo@elte.hu>
	Thu, 23 Dec 2010 11:57:04 +0000 (12:57 +0100)
committer	Ingo Molnar <mingo@elte.hu>
	Thu, 23 Dec 2010 11:57:04 +0000 (12:57 +0100)
		1	2
include/linux/init_task.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
init/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched.c	patch \|	diff1 \|	diff2 \|	blob \| history