.posix_timers = LIST_HEAD_INIT(sig.posix_timers), \
.cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \
.rlim = INIT_RLIMITS, \
++ .cputime = { .totals = { \
++ .utime = cputime_zero, \
++ .stime = cputime_zero, \
++ .sum_exec_runtime = 0, \
++ .lock = __SPIN_LOCK_UNLOCKED(sig.cputime.totals.lock), \
++ }, }, \
}
extern struct nsproxy init_nsproxy;
.nr_cpus_allowed = NR_CPUS, \
}, \
.tasks = LIST_HEAD_INIT(tsk.tasks), \
+ + .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \
.ptraced = LIST_HEAD_INIT(tsk.ptraced), \
.ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \
.real_parent = &tsk, \
extern void softlockup_tick(void);
extern void touch_softlockup_watchdog(void);
extern void touch_all_softlockup_watchdogs(void);
++ extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
++ struct file *filp, void __user *buffer,
++ size_t *lenp, loff_t *ppos);
extern unsigned int softlockup_panic;
extern unsigned long sysctl_hung_task_check_count;
extern unsigned long sysctl_hung_task_timeout_secs;
* @utime: time spent in user mode, in &cputime_t units
* @stime: time spent in kernel mode, in &cputime_t units
* @sum_exec_runtime: total time spent on the CPU, in nanoseconds
++ * @lock: lock for fields in this struct
*
* This structure groups together three kinds of CPU time that are
* tracked for threads and thread groups. Most things considering
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
++ spinlock_t lock;
};
/* Alternate field names when used to cache expirations. */
#define prof_exp stime
* used for thread group CPU clock calculations.
*/
struct thread_group_cputime {
-- struct task_cputime *totals;
++ struct task_cputime totals;
};
/*
atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
#endif
#ifdef CONFIG_EPOLL
-- atomic_t epoll_devs; /* The number of epoll descriptors currently open */
atomic_t epoll_watches; /* The number of file descriptors currently watched */
#endif
#ifdef CONFIG_POSIX_MQUEUE
struct rq *busiest, struct sched_domain *sd,
enum cpu_idle_type idle);
void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
+ + int (*needs_post_schedule) (struct rq *this_rq);
void (*post_schedule) (struct rq *this_rq);
void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
u64 last_wakeup;
u64 avg_overlap;
++ u64 start_runtime;
++ u64 avg_wakeup;
++ u64 nr_migrations;
++
#ifdef CONFIG_SCHEDSTATS
u64 wait_start;
u64 wait_max;
u64 exec_max;
u64 slice_max;
-- u64 nr_migrations;
u64 nr_migrations_cold;
u64 nr_failed_migrations_affine;
u64 nr_failed_migrations_running;
#endif
struct list_head tasks;
+ + struct plist_node pushable_tasks;
struct mm_struct *mm, *active_mm;
* Thread group CPU time accounting.
*/
-- extern int thread_group_cputime_alloc(struct task_struct *);
-- extern void thread_group_cputime(struct task_struct *, struct task_cputime *);
--
-- static inline void thread_group_cputime_init(struct signal_struct *sig)
++ static inline
++ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
-- sig->cputime.totals = NULL;
++ struct task_cputime *totals = &tsk->signal->cputime.totals;
++ unsigned long flags;
++
++ spin_lock_irqsave(&totals->lock, flags);
++ *times = *totals;
++ spin_unlock_irqrestore(&totals->lock, flags);
}
-- static inline int thread_group_cputime_clone_thread(struct task_struct *curr)
++ static inline void thread_group_cputime_init(struct signal_struct *sig)
{
-- if (curr->signal->cputime.totals)
-- return 0;
-- return thread_group_cputime_alloc(curr);
++ sig->cputime.totals = (struct task_cputime){
++ .utime = cputime_zero,
++ .stime = cputime_zero,
++ .sum_exec_runtime = 0,
++ };
++
++ spin_lock_init(&sig->cputime.totals.lock);
}
static inline void thread_group_cputime_free(struct signal_struct *sig)
{
-- free_percpu(sig->cputime.totals);
}
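With this change the thread-group totals are no longer a per-CPU allocation reached through a pointer; they live directly in signal_struct and are protected by the new spinlock, so readers snapshot them under the lock (as thread_group_cputime() above does) and writers update the fields under the same lock. Below is a minimal sketch of what an update-side helper looks like under that scheme; the helper name is illustrative, while the field layout and cputime_add() follow the surrounding code:

	static inline void group_account_utime_sketch(struct task_struct *tsk,
						      cputime_t cputime)
	{
		struct task_cputime *times = &tsk->signal->cputime.totals;
		unsigned long flags;

		spin_lock_irqsave(&times->lock, flags);
		times->utime = cputime_add(times->utime, cputime);
		spin_unlock_irqrestore(&times->lock, flags);
	}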
/*
def_bool y
depends on AUDITSYSCALL && INOTIFY
++ menu "RCU Subsystem"
++
++ choice
++ prompt "RCU Implementation"
++ default CLASSIC_RCU
++
++ config CLASSIC_RCU
++ bool "Classic RCU"
++ help
++ This option selects the classic RCU implementation that is
++ designed for best read-side performance on non-realtime
++ systems.
++
++ Select this option if you are unsure.
++
++ config TREE_RCU
++ bool "Tree-based hierarchical RCU"
++ help
++ This option selects the RCU implementation that is
++ designed for very large SMP systems with hundreds or
++ thousands of CPUs.
++
++ config PREEMPT_RCU
++ bool "Preemptible RCU"
++ depends on PREEMPT
++ help
++ This option reduces the latency of the kernel by making certain
++ RCU sections preemptible. Normally RCU code is non-preemptible; if
++ this option is selected then read-only RCU sections become
++ preemptible. This helps latency, but may expose bugs due to
++ now-naive assumptions about each RCU read-side critical section
++ remaining on a given CPU through its execution.
++
++ endchoice
++
++ config RCU_TRACE
++ bool "Enable tracing for RCU"
++ depends on TREE_RCU || PREEMPT_RCU
++ help
++ This option provides tracing in RCU which presents stats
++ in debugfs for debugging the RCU implementation.
++
++ Say Y here if you want to enable RCU tracing.
++ Say N if you are unsure.
++
++ config RCU_FANOUT
++ int "Tree-based hierarchical RCU fanout value"
++ range 2 64 if 64BIT
++ range 2 32 if !64BIT
++ depends on TREE_RCU
++ default 64 if 64BIT
++ default 32 if !64BIT
++ help
++ This option controls the fanout of hierarchical implementations
++ of RCU, allowing RCU to work efficiently on machines with
++ large numbers of CPUs. This value must be at least the cube
++ root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
++ systems and up to 262,144 for 64-bit systems.
++
++ Select a specific number if testing RCU itself.
++ Take the default if unsure.
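The cube-root requirement above follows from the hierarchy being limited to three levels in this implementation, each level multiplying capacity by the fanout; checking the limits quoted in the help text: 32 * 32 * 32 = 32,768 CPUs for 32-bit and 64 * 64 * 64 = 262,144 CPUs for 64-bit.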
++
++ config RCU_FANOUT_EXACT
++ bool "Disable tree-based hierarchical RCU auto-balancing"
++ depends on TREE_RCU
++ default n
++ help
++ This option forces use of the exact RCU_FANOUT value specified,
++ regardless of imbalances in the hierarchy. This is useful for
++ testing RCU itself, and might one day be useful on systems with
++ strong NUMA behavior.
++
++ Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
++
++ Say N if unsure.
++
++ config TREE_RCU_TRACE
++ def_bool RCU_TRACE && TREE_RCU
++ select DEBUG_FS
++ help
++ This option provides tracing for the TREE_RCU implementation,
++ permitting Makefile to trivially select kernel/rcutree_trace.c.
++
++ config PREEMPT_RCU_TRACE
++ def_bool RCU_TRACE && PREEMPT_RCU
++ select DEBUG_FS
++ help
++ This option provides tracing for the PREEMPT_RCU implementation,
++ permitting Makefile to trivially select kernel/rcupreempt_trace.c.
++
++ endmenu # "RCU Subsystem"
++
config IKCONFIG
tristate "Kernel .config support"
---help---
This option allows you to create arbitrary task groups
using the "cgroup" pseudo filesystem and control
the cpu bandwidth allocated to each such task group.
-- Refer to Documentation/cgroups.txt for more information
-- on "cgroup" pseudo filesystem.
++ Refer to Documentation/cgroups/cgroups.txt for more
++ information on "cgroup" pseudo filesystem.
endchoice
-- menu "Control Group support"
-- config CGROUPS
-- bool "Control Group support"
++ menuconfig CGROUPS
++ boolean "Control Group support"
help
-- This option add support for grouping sets of processes together, for
++ This option adds support for grouping sets of processes together, for
use with process control subsystems such as Cpusets, CFS, memory
controls or device isolation.
See
-- - Documentation/cpusets.txt (Cpusets)
- Documentation/scheduler/sched-design-CFS.txt (CFS)
-- - Documentation/cgroups/ (features for grouping, isolation)
-- - Documentation/controllers/ (features for resource control)
++ - Documentation/cgroups/ (features for grouping, isolation
++ and resource control)
Say N if unsure.
++ if CGROUPS
++
config CGROUP_DEBUG
bool "Example debug cgroup subsystem"
depends on CGROUPS
help
This option enables a simple cgroup subsystem that
exports useful debugging information about the cgroups
-- framework
++ framework.
-- Say N if unsure
++ Say N if unsure.
config CGROUP_NS
-- bool "Namespace cgroup subsystem"
-- depends on CGROUPS
-- help
-- Provides a simple namespace cgroup subsystem to
-- provide hierarchical naming of sets of namespaces,
-- for instance virtual servers and checkpoint/restart
-- jobs.
++ bool "Namespace cgroup subsystem"
++ depends on CGROUPS
++ help
++ Provides a simple namespace cgroup subsystem to
++ provide hierarchical naming of sets of namespaces,
++ for instance virtual servers and checkpoint/restart
++ jobs.
config CGROUP_FREEZER
-- bool "control group freezer subsystem"
-- depends on CGROUPS
-- help
-- Provides a way to freeze and unfreeze all tasks in a
++ bool "Freezer cgroup subsystem"
++ depends on CGROUPS
++ help
++ Provides a way to freeze and unfreeze all tasks in a
cgroup.
config CGROUP_DEVICE
Say N if unsure.
++ config PROC_PID_CPUSET
++ bool "Include legacy /proc/<pid>/cpuset file"
++ depends on CPUSETS
++ default y
++
config CGROUP_CPUACCT
bool "Simple CPU accounting cgroup subsystem"
depends on CGROUPS
help
Provides a simple Resource Controller for monitoring the
-- total CPU consumed by the tasks in a cgroup
++ total CPU consumed by the tasks in a cgroup.
config RESOURCE_COUNTERS
bool "Resource counters"
help
This option enables controller independent resource accounting
-- infrastructure that works with cgroups
++ infrastructure that works with cgroups.
depends on CGROUPS
config CGROUP_MEM_RES_CTLR
This config option also selects MM_OWNER config option, which
could in turn add some fork/exit overhead.
-- config MM_OWNER
-- bool
--
config CGROUP_MEM_RES_CTLR_SWAP
bool "Memory Resource Controller Swap Extension(EXPERIMENTAL)"
depends on CGROUP_MEM_RES_CTLR && SWAP && EXPERIMENTAL
there will be no overhead from this. Even when you set this config=y,
if boot option "noswapaccount" is set, swap will not be accounted.
++ endif # CGROUPS
-- endmenu
++ config MM_OWNER
++ bool
config SYSFS_DEPRECATED
bool
if the original kernel, that came with your distribution, has
this option set to N.
-- config PROC_PID_CPUSET
-- bool "Include legacy /proc/<pid>/cpuset file"
-- depends on CPUSETS
-- default y
--
config RELAY
bool "Kernel->user space relay support (formerly relayfs)"
help
Unless you want to work with an experimental feature
say N here.
++ config NET_NS
++ bool "Network namespace"
++ default n
++ depends on NAMESPACES && EXPERIMENTAL && NET
++ help
++ Allow user space to create what appear to be multiple instances
++ of the network stack.
++
config BLK_DEV_INITRD
bool "Initial RAM filesystem and RAM disk (initramfs/initrd) support"
depends on BROKEN || !FRV
Say N.
-- config KALLSYMS_STRIP_GENERATED
-- bool "Strip machine generated symbols from kallsyms"
-- depends on KALLSYMS_ALL
-- default y
-- help
-- Say N if you want kallsyms to retain even machine generated symbols.
--
config KALLSYMS_EXTRA_PASS
bool "Do an extra kallsyms pass"
depends on KALLSYMS
config RT_MUTEXES
boolean
- - select PLIST
config BASE_SMALL
int
config PREEMPT_NOTIFIERS
bool
-- choice
-- prompt "RCU Implementation"
-- default CLASSIC_RCU
--
-- config CLASSIC_RCU
-- bool "Classic RCU"
-- help
-- This option selects the classic RCU implementation that is
-- designed for best read-side performance on non-realtime
-- systems.
--
-- Select this option if you are unsure.
--
-- config TREE_RCU
-- bool "Tree-based hierarchical RCU"
-- help
-- This option selects the RCU implementation that is
-- designed for very large SMP system with hundreds or
-- thousands of CPUs.
--
-- config PREEMPT_RCU
-- bool "Preemptible RCU"
-- depends on PREEMPT
-- help
-- This option reduces the latency of the kernel by making certain
-- RCU sections preemptible. Normally RCU code is non-preemptible, if
-- this option is selected then read-only RCU sections become
-- preemptible. This helps latency, but may expose bugs due to
-- now-naive assumptions about each RCU read-side critical section
-- remaining on a given CPU through its execution.
--
-- endchoice
--
-- config RCU_TRACE
-- bool "Enable tracing for RCU"
-- depends on TREE_RCU || PREEMPT_RCU
-- help
-- This option provides tracing in RCU which presents stats
-- in debugfs for debugging RCU implementation.
--
-- Say Y here if you want to enable RCU tracing
-- Say N if you are unsure.
--
-- config RCU_FANOUT
-- int "Tree-based hierarchical RCU fanout value"
-- range 2 64 if 64BIT
-- range 2 32 if !64BIT
-- depends on TREE_RCU
-- default 64 if 64BIT
-- default 32 if !64BIT
-- help
-- This option controls the fanout of hierarchical implementations
-- of RCU, allowing RCU to work efficiently on machines with
-- large numbers of CPUs. This value must be at least the cube
-- root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
-- systems and up to 262,144 for 64-bit systems.
--
-- Select a specific number if testing RCU itself.
-- Take the default if unsure.
--
-- config RCU_FANOUT_EXACT
-- bool "Disable tree-based hierarchical RCU auto-balancing"
-- depends on TREE_RCU
-- default n
-- help
-- This option forces use of the exact RCU_FANOUT value specified,
-- regardless of imbalances in the hierarchy. This is useful for
-- testing RCU itself, and might one day be useful on systems with
-- strong NUMA behavior.
--
-- Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
--
-- Say N if unsure.
--
-- config TREE_RCU_TRACE
-- def_bool RCU_TRACE && TREE_RCU
-- select DEBUG_FS
-- help
-- This option provides tracing for the TREE_RCU implementation,
-- permitting Makefile to trivially select kernel/rcutree_trace.c.
--
-- config PREEMPT_RCU_TRACE
-- def_bool RCU_TRACE && PREEMPT_RCU
-- select DEBUG_FS
-- help
-- This option provides tracing for the PREEMPT_RCU implementation,
-- permitting Makefile to trivially select kernel/rcupreempt_trace.c.
DEFINE_TRACE(sched_migrate_task);
#ifdef CONFIG_SMP
+
+ static void double_rq_lock(struct rq *rq1, struct rq *rq2);
+
/*
* Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
* Since cpu_power is a 'constant', we can use a reciprocal divide.
struct rt_prio_array active;
unsigned long rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- - int highest_prio; /* highest queued rt task prio */
+ + struct {
+ + int curr; /* highest queued rt task prio */
+ +#ifdef CONFIG_SMP
+ + int next; /* next highest */
+ +#endif
+ + } highest_prio;
#endif
#ifdef CONFIG_SMP
unsigned long rt_nr_migratory;
int overloaded;
+ + struct plist_head pushable_tasks;
#endif
int rt_throttled;
u64 rt_time;
* slice expiry etc.
*/
-- #define WEIGHT_IDLEPRIO 2
-- #define WMULT_IDLEPRIO (1 << 31)
++ #define WEIGHT_IDLEPRIO 3
++ #define WMULT_IDLEPRIO 1431655765
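These constants are the idle-class entries of the scheduler's weight tables: the WMULT value is the precomputed fixed-point inverse 2^32 / weight used for the reciprocal divide. The old pair was weight 2 with 1 << 31 = 2^32 / 2; the new pair is weight 3 with 2^32 / 3 = 1,431,655,765 (rounded down), matching the literal above.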
/*
* Nice levels are multiplicative, with a gentle 10% change for every
#endif
+ +#ifdef CONFIG_PREEMPT
+ +
/*
- - * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ + * fair double_lock_balance: Safely acquires both rq->locks in a fair
+ + * way at the expense of forcing extra atomic operations in all
+ + * invocations. This assures that the double_lock is acquired using the
+ + * same underlying policy as the spinlock_t on this architecture, which
+ + * reduces latency compared to the unfair variant below. However, it
+ + * also adds more overhead and therefore may reduce throughput.
*/
- -static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ + __releases(this_rq->lock)
+ + __acquires(busiest->lock)
+ + __acquires(this_rq->lock)
+ +{
+ + spin_unlock(&this_rq->lock);
+ + double_rq_lock(this_rq, busiest);
+ +
+ + return 1;
+ +}
+ +
+ +#else
+ +/*
+ + * Unfair double_lock_balance: Optimizes throughput at the expense of
+ + * latency by eliminating extra atomic operations when the locks are
+ + * already in proper order on entry. This favors lower cpu-ids and will
+ + * grant the double lock to lower cpus over higher ids under contention,
+ + * regardless of entry order into the function.
+ + */
+ +static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
__releases(this_rq->lock)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
int ret = 0;
- - if (unlikely(!irqs_disabled())) {
- - /* printk() doesn't work good under rq->lock */
- - spin_unlock(&this_rq->lock);
- - BUG_ON(1);
- - }
if (unlikely(!spin_trylock(&busiest->lock))) {
if (busiest < this_rq) {
spin_unlock(&this_rq->lock);
return ret;
}
+ +#endif /* CONFIG_PREEMPT */
+ +
+ +/*
+ + * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ + */
+ +static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ +{
+ + if (unlikely(!irqs_disabled())) {
+ + /* printk() doesn't work well under rq->lock */
+ + spin_unlock(&this_rq->lock);
+ + BUG_ON(1);
+ + }
+ +
+ + return _double_lock_balance(this_rq, busiest);
+ +}
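Both variants can drop this_rq->lock before returning with both locks held (the preemptible one always does, the trylock one whenever lock ordering forces it), so a caller must treat double_lock_balance() as a point where this_rq state may have changed. A hypothetical caller sketch, only to illustrate that revalidation pattern (the helper and its test are made up):

	static int try_pull_one_sketch(struct rq *this_rq, struct rq *busiest)
	{
		int pulled = 0;

		/* this_rq->lock is already held by the caller. */
		double_lock_balance(this_rq, busiest);

		/*
		 * this_rq->lock may have been released and re-acquired above,
		 * so re-check anything computed from this_rq beforehand.
		 */
		if (busiest->nr_running > this_rq->nr_running + 1)
			pulled = 1;	/* placeholder for an actual migration */

		double_unlock_balance(this_rq, busiest);
		return pulled;
	}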
+ +
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
__releases(busiest->lock)
{
static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
{
++ if (wakeup)
++ p->se.start_runtime = p->se.sum_exec_runtime;
++
sched_info_queued(p);
p->sched_class->enqueue_task(rq, p, wakeup);
p->se.on_rq = 1;
static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
{
-- if (sleep && p->se.last_wakeup) {
-- update_avg(&p->se.avg_overlap,
-- p->se.sum_exec_runtime - p->se.last_wakeup);
-- p->se.last_wakeup = 0;
++ if (sleep) {
++ if (p->se.last_wakeup) {
++ update_avg(&p->se.avg_overlap,
++ p->se.sum_exec_runtime - p->se.last_wakeup);
++ p->se.last_wakeup = 0;
++ } else {
++ update_avg(&p->se.avg_wakeup,
++ sysctl_sched_wakeup_granularity);
++ }
}
sched_info_dequeued(p);
if (!sched_feat(SYNC_WAKEUPS))
sync = 0;
++ if (!sync) {
++ if (current->se.avg_overlap < sysctl_sched_migration_cost &&
++ p->se.avg_overlap < sysctl_sched_migration_cost)
++ sync = 1;
++ } else {
++ if (current->se.avg_overlap >= sysctl_sched_migration_cost ||
++ p->se.avg_overlap >= sysctl_sched_migration_cost)
++ sync = 0;
++ }
++
#ifdef CONFIG_SMP
if (sched_feat(LB_WAKEUP_UPDATE)) {
struct sched_domain *sd;
activate_task(rq, p, 1);
success = 1;
++ /*
++ * Only attribute actual wakeups done by this task.
++ */
++ if (!in_interrupt()) {
++ struct sched_entity *se = &current->se;
++ u64 sample = se->sum_exec_runtime;
++
++ if (se->last_wakeup)
++ sample -= se->last_wakeup;
++ else
++ sample -= se->start_runtime;
++ update_avg(&se->avg_wakeup, sample);
++
++ se->last_wakeup = se->sum_exec_runtime;
++ }
++
out_running:
trace_sched_wakeup(rq, p, success);
check_preempt_curr(rq, p, sync);
p->sched_class->task_wake_up(rq, p);
#endif
out:
-- current->se.last_wakeup = current->se.sum_exec_runtime;
--
task_rq_unlock(rq, &flags);
return success;
p->se.prev_sum_exec_runtime = 0;
p->se.last_wakeup = 0;
p->se.avg_overlap = 0;
++ p->se.start_runtime = 0;
++ p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
#ifdef CONFIG_SCHEDSTATS
p->se.wait_start = 0;
/* Want to start with kernel preemption disabled. */
task_thread_info(p)->preempt_count = 1;
#endif
+ + plist_node_init(&p->pushable_tasks, MAX_PRIO);
+ +
put_cpu();
}
{
struct mm_struct *mm = rq->prev_mm;
long prev_state;
+ +#ifdef CONFIG_SMP
+ + int post_schedule = 0;
+ +
+ + if (current->sched_class->needs_post_schedule)
+ + post_schedule = current->sched_class->needs_post_schedule(rq);
+ +#endif
rq->prev_mm = NULL;
finish_arch_switch(prev);
finish_lock_switch(rq, prev);
#ifdef CONFIG_SMP
- - if (current->sched_class->post_schedule)
+ + if (post_schedule)
current->sched_class->post_schedule(rq);
#endif
pulled++;
rem_load_move -= p->se.load.weight;
+ +#ifdef CONFIG_PREEMPT
+ + /*
+ + * NEWIDLE balancing is a source of latency, so preemptible kernels
+ + * will stop after the first task is pulled to minimize the critical
+ + * section.
+ + */
+ + if (idle == CPU_NEWLY_IDLE)
+ + goto out;
+ +#endif
+ +
/*
* We only want to steal up to the prescribed amount of weighted load.
*/
sd, idle, all_pinned, &this_best_prio);
class = class->next;
+ +#ifdef CONFIG_PREEMPT
+ + /*
+ + * NEWIDLE balancing is a source of latency, so preemptible
+ + * kernels will stop after the first task is pulled to minimize
+ + * the critical section.
+ + */
if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
break;
- -
+ +#endif
} while (class && max_load_move > total_load_moved);
return total_load_moved > 0;
int cpu = smp_processor_id();
if (stop_tick) {
-- cpumask_set_cpu(cpu, nohz.cpu_mask);
cpu_rq(cpu)->in_nohz_recently = 1;
-- /*
-- * If we are going offline and still the leader, give up!
-- */
-- if (!cpu_active(cpu) &&
-- atomic_read(&nohz.load_balancer) == cpu) {
++ if (!cpu_active(cpu)) {
++ if (atomic_read(&nohz.load_balancer) != cpu)
++ return 0;
++
++ /*
++ * If we are going offline and still the leader,
++ * give up!
++ */
if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
BUG();
++
return 0;
}
++ cpumask_set_cpu(cpu, nohz.cpu_mask);
++
/* time for ilb owner also to sleep */
if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
if (atomic_read(&nohz.load_balancer) == cpu)
/*
* Underflow?
*/
-- if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
return;
/*
* Is the spinlock portion underflowing?
* sys_setpriority is a more generic, but much slower function that
* does similar things.
*/
-- asmlinkage long sys_nice(int increment)
++ SYSCALL_DEFINE1(nice, int, increment)
{
long nice, retval;
* @policy: new policy.
* @param: structure containing the new RT priority.
*/
-- asmlinkage long
-- sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
++ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
++ struct sched_param __user *, param)
{
/* negative values for policy are not valid */
if (policy < 0)
* @pid: the pid in question.
* @param: structure containing the new RT priority.
*/
-- asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
++ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
{
return do_sched_setscheduler(pid, -1, param);
}
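These hunks convert open-coded "asmlinkage long sys_*()" definitions to the SYSCALL_DEFINEn() family, which builds the same entry point from (type, name) argument pairs and lets architectures that need it wrap the call so arguments arriving in 64-bit registers are properly sign-extended. A simplified illustration of the idea, not the literal kernel macro:

	/* Toy version: expands to the traditional definition only. */
	#define TOY_SYSCALL_DEFINE2(name, t1, a1, t2, a2)	\
		asmlinkage long sys_##name(t1 a1, t2 a2)

	TOY_SYSCALL_DEFINE2(sched_setparam, pid_t, pid,
			    struct sched_param __user *, param)
	{
		return do_sched_setscheduler(pid, -1, param);
	}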
* sys_sched_getscheduler - get the policy (scheduling class) of a thread
* @pid: the pid in question.
*/
-- asmlinkage long sys_sched_getscheduler(pid_t pid)
++ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
{
struct task_struct *p;
int retval;
* @pid: the pid in question.
* @param: structure containing the RT priority.
*/
-- asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
++ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
{
struct sched_param lp;
struct task_struct *p;
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to the new cpu mask
*/
-- asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
-- unsigned long __user *user_mask_ptr)
++ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
++ unsigned long __user *, user_mask_ptr)
{
cpumask_var_t new_mask;
int retval;
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to hold the current cpu mask
*/
-- asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
-- unsigned long __user *user_mask_ptr)
++ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
++ unsigned long __user *, user_mask_ptr)
{
int ret;
cpumask_var_t mask;
* This function yields the current CPU to other tasks. If there are no
* other threads running on this CPU then this function will return.
*/
-- asmlinkage long sys_sched_yield(void)
++ SYSCALL_DEFINE0(sched_yield)
{
struct rq *rq = this_rq_lock();
* this syscall returns the maximum rt_priority that can be used
* by a given scheduling class.
*/
-- asmlinkage long sys_sched_get_priority_max(int policy)
++ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
{
int ret = -EINVAL;
* this syscall returns the minimum rt_priority that can be used
* by a given scheduling class.
*/
-- asmlinkage long sys_sched_get_priority_min(int policy)
++ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
{
int ret = -EINVAL;
* this syscall writes the default timeslice value of a given process
* into the user-space timespec buffer. A value of '0' means infinity.
*/
-- asmlinkage
-- long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
++ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
++ struct timespec __user *, interval)
{
struct task_struct *p;
unsigned int time_slice;
* groups, so roll our own. Now each node has its own list of groups which
* gets dynamically allocated.
*/
- static DEFINE_PER_CPU(struct sched_domain, node_domains);
+ static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
static struct sched_group ***sched_group_nodes_bycpu;
- static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
+ static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
#ifdef CONFIG_NUMA
if (cpumask_weight(cpu_map) >
SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
- sd = &per_cpu(allnodes_domains, i);
+ sd = &per_cpu(allnodes_domains, i).sd;
SD_INIT(sd, ALLNODES);
set_domain_attribute(sd, attr);
cpumask_copy(sched_domain_span(sd), cpu_map);
} else
p = NULL;
- sd = &per_cpu(node_domains, i);
+ sd = &per_cpu(node_domains, i).sd;
SD_INIT(sd, NODE);
set_domain_attribute(sd, attr);
sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
for_each_cpu(j, nodemask) {
struct sched_domain *sd;
- sd = &per_cpu(node_domains, j);
+ sd = &per_cpu(node_domains, j).sd;
sd->groups = sg;
}
sg->__cpu_power = 0;
__set_bit(MAX_RT_PRIO, array->bitmap);
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- - rt_rq->highest_prio = MAX_RT_PRIO;
+ + rt_rq->highest_prio.curr = MAX_RT_PRIO;
+ +#ifdef CONFIG_SMP
+ + rt_rq->highest_prio.next = MAX_RT_PRIO;
+ +#endif
#endif
#ifdef CONFIG_SMP
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
+ + plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
#endif
rt_rq->rt_time = 0;
runtime = d->rt_runtime;
}
++ #ifdef CONFIG_USER_SCHED
++ if (tg == &root_task_group) {
++ period = global_rt_period();
++ runtime = global_rt_runtime();
++ }
++ #endif
++
/*
* Cannot have more runtime than the period.
*/
struct sched_entity,
run_node);
-- if (vruntime == cfs_rq->min_vruntime)
++ if (!cfs_rq->curr)
vruntime = se->vruntime;
else
vruntime = min_vruntime(vruntime, se->vruntime);
u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
for_each_sched_entity(se) {
-- struct load_weight *load = &cfs_rq->load;
++ struct load_weight *load;
++
++ cfs_rq = cfs_rq_of(se);
++ load = &cfs_rq->load;
if (unlikely(!se->on_rq)) {
struct load_weight lw = cfs_rq->load;
unsigned long thresh = sysctl_sched_latency;
/*
-- * convert the sleeper threshold into virtual time
++ * Convert the sleeper threshold into virtual time.
++ * SCHED_IDLE is a special sub-class. We care about
++ * fairness only relative to other SCHED_IDLE tasks,
++ * all of which have the same weight.
*/
-- if (sched_feat(NORMALIZED_SLEEPER))
++ if (sched_feat(NORMALIZED_SLEEPER) &&
++ task_of(se)->policy != SCHED_IDLE)
thresh = calc_delta_fair(thresh, se);
vruntime -= thresh;
__enqueue_entity(cfs_rq, se);
}
-- static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
++ static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (cfs_rq->last == se)
cfs_rq->last = NULL;
cfs_rq->next = NULL;
}
++ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
++ {
++ for_each_sched_entity(se)
++ __clear_buddies(cfs_rq_of(se), se);
++ }
++
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
ideal_runtime = sched_slice(cfs_rq, curr);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
-- if (delta_exec > ideal_runtime)
++ if (delta_exec > ideal_runtime) {
resched_task(rq_of(cfs_rq)->curr);
++ /*
++ * The current task ran long enough, ensure it doesn't get
++ * re-elected due to buddy favours.
++ */
++ clear_buddies(cfs_rq, curr);
++ }
}
static void
int idx, unsigned long load, unsigned long this_load,
unsigned int imbalance)
{
-- struct task_struct *curr = this_rq->curr;
-- struct task_group *tg;
unsigned long tl = this_load;
unsigned long tl_per_task;
++ struct task_group *tg;
unsigned long weight;
int balanced;
if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
return 0;
-- if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
-- p->se.avg_overlap > sysctl_sched_migration_cost))
-- sync = 0;
--
/*
* If sync wakeup then subtract the (maximum possible)
* effect of the currently running task from the load
}
#endif /* CONFIG_SMP */
--static unsigned long wakeup_gran(struct sched_entity *se)
++/*
++ * Adaptive granularity
++ *
++ * se->avg_wakeup gives the average time a task runs until it does a wakeup,
++ * with the limit of wakeup_gran -- when it never does a wakeup.
++ *
++ * So the smaller avg_wakeup is the faster we want this task to preempt,
++ * but we don't want to treat the preemptee unfairly and therefore allow it
++ * to run for at least the amount of time we'd like to run.
++ *
++ * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
++ *
++ * NOTE: we use *nr_running to scale with load; this nicely matches the
++ * degrading latency on load.
++ */
++static unsigned long
++adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
++{
++ u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
++ u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
++ u64 gran = 0;
++
++ if (this_run < expected_wakeup)
++ gran = expected_wakeup - this_run;
++
++ return min_t(s64, gran, sysctl_sched_wakeup_granularity);
++}
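Plugging made-up numbers into the formula above: if the woken task's avg_wakeup is 1 ms and the cfs_rq has 2 runnable tasks, expected_wakeup = 2 * 1 ms * 2 = 4 ms; if curr has run 0.5 ms since it was last picked, gran = 4 ms - 0.5 ms = 3.5 ms before the clamp to sysctl_sched_wakeup_granularity. A woken task that usually runs only briefly before waking someone else therefore yields a small gran and preempts sooner, while the longer curr has already run, the less extra protection it gets.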
++
++static unsigned long
++wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
{
unsigned long gran = sysctl_sched_wakeup_granularity;
++ if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
++ gran = adaptive_gran(curr, se);
++
/*
-- * More easily preempt - nice tasks, while not making it harder for
-- * + nice tasks.
++ * Since curr is running now, convert the gran from real-time
++ * to virtual-time in its units.
*/
-- if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD)
-- gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
++ if (sched_feat(ASYM_GRAN)) {
++ /*
++ * By using 'se' instead of 'curr' we penalize light tasks, so
++ * they get preempted easier. That is, if 'se' < 'curr' then
++ * the resulting gran will be larger, therefore penalizing the
++ * lighter, if otoh 'se' > 'curr' then the resulting gran will
++ * be smaller, again penalizing the lighter task.
++ *
++ * This is especially important for buddies when the leftmost
++ * task is higher priority than the buddy.
++ */
++ if (unlikely(se->load.weight != NICE_0_LOAD))
++ gran = calc_delta_fair(gran, se);
++ } else {
++ if (unlikely(curr->load.weight != NICE_0_LOAD))
++ gran = calc_delta_fair(gran, curr);
++ }
return gran;
}
if (vdiff <= 0)
return -1;
-- gran = wakeup_gran(curr);
++ gran = wakeup_gran(curr, se);
if (vdiff > gran)
return 1;
static void set_last_buddy(struct sched_entity *se)
{
-- for_each_sched_entity(se)
-- cfs_rq_of(se)->last = se;
++ if (likely(task_of(se)->policy != SCHED_IDLE)) {
++ for_each_sched_entity(se)
++ cfs_rq_of(se)->last = se;
++ }
}
static void set_next_buddy(struct sched_entity *se)
{
-- for_each_sched_entity(se)
-- cfs_rq_of(se)->next = se;
++ if (likely(task_of(se)->policy != SCHED_IDLE)) {
++ for_each_sched_entity(se)
++ cfs_rq_of(se)->next = se;
++ }
}
/*
return;
/*
-- * Batch tasks do not preempt (their preemption is driven by
++ * Batch and idle tasks do not preempt (their preemption is driven by
* the tick):
*/
-- if (unlikely(p->policy == SCHED_BATCH))
++ if (unlikely(p->policy != SCHED_NORMAL))
+ return;
+
++ /* Idle tasks are by definition preempted by everybody. */
++ if (unlikely(curr->policy == SCHED_IDLE)) {
++ resched_task(curr);
+ return;
++ }
+
if (!sched_feat(WAKEUP_PREEMPT))
return;
-- if (sched_feat(WAKEUP_OVERLAP) && (sync ||
-- (se->avg_overlap < sysctl_sched_migration_cost &&
-- pse->avg_overlap < sysctl_sched_migration_cost))) {
++ if (sched_feat(WAKEUP_OVERLAP) && sync) {
resched_task(curr);
return;
}
do {
se = pick_next_entity(cfs_rq);
++ /*
++ * If se was a buddy, clear it so that it will have to earn
++ * the favour again.
++ */
++ __clear_buddies(cfs_rq, se);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
* policies)
*/
+ +static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+ +{
+ + return container_of(rt_se, struct task_struct, rt);
+ +}
+ +
+ +#ifdef CONFIG_RT_GROUP_SCHED
+ +
+ +static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+ +{
+ + return rt_rq->rq;
+ +}
+ +
+ +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+ +{
+ + return rt_se->rt_rq;
+ +}
+ +
+ +#else /* CONFIG_RT_GROUP_SCHED */
+ +
+ +static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+ +{
+ + return container_of(rt_rq, struct rq, rt);
+ +}
+ +
+ +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+ +{
+ + struct task_struct *p = rt_task_of(rt_se);
+ + struct rq *rq = task_rq(p);
+ +
+ + return &rq->rt;
+ +}
+ +
+ +#endif /* CONFIG_RT_GROUP_SCHED */
+ +
#ifdef CONFIG_SMP
static inline int rt_overloaded(struct rq *rq)
cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}
- -static void update_rt_migration(struct rq *rq)
+ +static void update_rt_migration(struct rt_rq *rt_rq)
{
- - if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
- - if (!rq->rt.overloaded) {
- - rt_set_overload(rq);
- - rq->rt.overloaded = 1;
+ + if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) {
+ + if (!rt_rq->overloaded) {
+ + rt_set_overload(rq_of_rt_rq(rt_rq));
+ + rt_rq->overloaded = 1;
}
- - } else if (rq->rt.overloaded) {
- - rt_clear_overload(rq);
- - rq->rt.overloaded = 0;
+ + } else if (rt_rq->overloaded) {
+ + rt_clear_overload(rq_of_rt_rq(rt_rq));
+ + rt_rq->overloaded = 0;
}
}
- -#endif /* CONFIG_SMP */
- -static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+ +static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
+ + if (rt_se->nr_cpus_allowed > 1)
+ + rt_rq->rt_nr_migratory++;
+ +
+ + update_rt_migration(rt_rq);
+ +}
+ +
+ +static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
+ + if (rt_se->nr_cpus_allowed > 1)
+ + rt_rq->rt_nr_migratory--;
+ +
+ + update_rt_migration(rt_rq);
+ +}
+ +
+ +static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
+ +{
+ + plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+ + plist_node_init(&p->pushable_tasks, p->prio);
+ + plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
+ +}
+ +
+ +static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+ +{
+ + plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+ +}
+ +
+ +#else
+ +
+ +static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
- - return container_of(rt_se, struct task_struct, rt);
}
+ +static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+ +{
+ +}
+ +
+ +static inline
+ +void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
+ +}
+ +
+ +static inline
+ +void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
+ +}
+ +
+ +#endif /* CONFIG_SMP */
+ +
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
return !list_empty(&rt_se->run_list);
#define for_each_leaf_rt_rq(rt_rq, rq) \
list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
- -static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
- -{
- - return rt_rq->rq;
- -}
- -
- -static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
- -{
- - return rt_se->rt_rq;
- -}
- -
#define for_each_sched_rt_entity(rt_se) \
for (; rt_se; rt_se = rt_se->parent)
if (rt_rq->rt_nr_running) {
if (rt_se && !on_rt_rq(rt_se))
enqueue_rt_entity(rt_se);
- - if (rt_rq->highest_prio < curr->prio)
+ + if (rt_rq->highest_prio.curr < curr->prio)
resched_task(curr);
}
}
#define for_each_leaf_rt_rq(rt_rq, rq) \
for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
- -static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
- -{
- - return container_of(rt_rq, struct rq, rt);
- -}
- -
- -static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
- -{
- - struct task_struct *p = rt_task_of(rt_se);
- - struct rq *rq = task_rq(p);
- -
- - return &rq->rt;
- -}
- -
#define for_each_sched_rt_entity(rt_se) \
for (; rt_se; rt_se = NULL)
struct rt_rq *rt_rq = group_rt_rq(rt_se);
if (rt_rq)
- - return rt_rq->highest_prio;
+ + return rt_rq->highest_prio.curr;
#endif
return rt_task_of(rt_se)->prio;
}
}
- -static inline
- -void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +#if defined CONFIG_SMP
+ +
+ +static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
+ +
+ +static inline int next_prio(struct rq *rq)
{
- - WARN_ON(!rt_prio(rt_se_prio(rt_se)));
- - rt_rq->rt_nr_running++;
- -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- - if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
- -#ifdef CONFIG_SMP
- - struct rq *rq = rq_of_rt_rq(rt_rq);
- -#endif
+ + struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
+ +
+ + if (next && rt_prio(next->prio))
+ + return next->prio;
+ + else
+ + return MAX_RT_PRIO;
+ +}
+ +
+ +static void
+ +inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
+ +{
+ + struct rq *rq = rq_of_rt_rq(rt_rq);
+ +
+ + if (prio < prev_prio) {
+ +
+ + /*
+ + * If the new task is higher in priority than anything on the
+ + * run-queue, we know that the previous high becomes our
+ + * next-highest.
+ + */
+ + rt_rq->highest_prio.next = prev_prio;
- - rt_rq->highest_prio = rt_se_prio(rt_se);
- -#ifdef CONFIG_SMP
if (rq->online)
- - cpupri_set(&rq->rd->cpupri, rq->cpu,
- - rt_se_prio(rt_se));
- -#endif
- - }
- -#endif
- -#ifdef CONFIG_SMP
- - if (rt_se->nr_cpus_allowed > 1) {
- - struct rq *rq = rq_of_rt_rq(rt_rq);
+ + cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
- - rq->rt.rt_nr_migratory++;
- - }
+ + } else if (prio == rt_rq->highest_prio.curr)
+ + /*
+ + * If the next task is equal in priority to the highest on
+ + * the run-queue, then we implicitly know that the next highest
+ + * task cannot be any lower than current
+ + */
+ + rt_rq->highest_prio.next = prio;
+ + else if (prio < rt_rq->highest_prio.next)
+ + /*
+ + * Otherwise, we need to recompute next-highest
+ + */
+ + rt_rq->highest_prio.next = next_prio(rq);
+ +}
- - update_rt_migration(rq_of_rt_rq(rt_rq));
- -#endif
- -#ifdef CONFIG_RT_GROUP_SCHED
- - if (rt_se_boosted(rt_se))
- - rt_rq->rt_nr_boosted++;
+ +static void
+ +dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
+ +{
+ + struct rq *rq = rq_of_rt_rq(rt_rq);
- - if (rt_rq->tg)
- - start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
- -#else
- - start_rt_bandwidth(&def_rt_bandwidth);
- -#endif
+ + if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
+ + rt_rq->highest_prio.next = next_prio(rq);
+ +
+ + if (rq->online && rt_rq->highest_prio.curr != prev_prio)
+ + cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}
+ +#else /* CONFIG_SMP */
+ +
static inline
- -void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
- -{
- -#ifdef CONFIG_SMP
- - int highest_prio = rt_rq->highest_prio;
- -#endif
+ +void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
+ +static inline
+ +void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
+ +
+ +#endif /* CONFIG_SMP */
- - WARN_ON(!rt_prio(rt_se_prio(rt_se)));
- - WARN_ON(!rt_rq->rt_nr_running);
- - rt_rq->rt_nr_running--;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+ +static void
+ +inc_rt_prio(struct rt_rq *rt_rq, int prio)
+ +{
+ + int prev_prio = rt_rq->highest_prio.curr;
+ +
+ + if (prio < prev_prio)
+ + rt_rq->highest_prio.curr = prio;
+ +
+ + inc_rt_prio_smp(rt_rq, prio, prev_prio);
+ +}
+ +
+ +static void
+ +dec_rt_prio(struct rt_rq *rt_rq, int prio)
+ +{
+ + int prev_prio = rt_rq->highest_prio.curr;
+ +
if (rt_rq->rt_nr_running) {
- - struct rt_prio_array *array;
- - WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
- - if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
- - /* recalculate */
- - array = &rt_rq->active;
- - rt_rq->highest_prio =
+ + WARN_ON(prio < prev_prio);
+ +
+ + /*
+ + * This may have been our highest task, and therefore
+ + * we may have some recomputation to do
+ + */
+ + if (prio == prev_prio) {
+ + struct rt_prio_array *array = &rt_rq->active;
+ +
+ + rt_rq->highest_prio.curr =
sched_find_first_bit(array->bitmap);
- - } /* otherwise leave rq->highest prio alone */
+ + }
+ +
} else
- - rt_rq->highest_prio = MAX_RT_PRIO;
- -#endif
- -#ifdef CONFIG_SMP
- - if (rt_se->nr_cpus_allowed > 1) {
- - struct rq *rq = rq_of_rt_rq(rt_rq);
- - rq->rt.rt_nr_migratory--;
- - }
+ + rt_rq->highest_prio.curr = MAX_RT_PRIO;
- - if (rt_rq->highest_prio != highest_prio) {
- - struct rq *rq = rq_of_rt_rq(rt_rq);
+ + dec_rt_prio_smp(rt_rq, prio, prev_prio);
+ +}
- - if (rq->online)
- - cpupri_set(&rq->rd->cpupri, rq->cpu,
- - rt_rq->highest_prio);
- - }
+ +#else
+ +
+ +static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
+ +static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
+ +
+ +#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
- - update_rt_migration(rq_of_rt_rq(rt_rq));
- -#endif /* CONFIG_SMP */
#ifdef CONFIG_RT_GROUP_SCHED
+ +
+ +static void
+ +inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
+ + if (rt_se_boosted(rt_se))
+ + rt_rq->rt_nr_boosted++;
+ +
+ + if (rt_rq->tg)
+ + start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
+ +}
+ +
+ +static void
+ +dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
if (rt_se_boosted(rt_se))
rt_rq->rt_nr_boosted--;
WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
- -#endif
+ +}
+ +
+ +#else /* CONFIG_RT_GROUP_SCHED */
+ +
+ +static void
+ +inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
+ + start_rt_bandwidth(&def_rt_bandwidth);
+ +}
+ +
+ +static inline
+ +void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
+ +
+ +#endif /* CONFIG_RT_GROUP_SCHED */
+ +
+ +static inline
+ +void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
+ + int prio = rt_se_prio(rt_se);
+ +
+ + WARN_ON(!rt_prio(prio));
+ + rt_rq->rt_nr_running++;
+ +
+ + inc_rt_prio(rt_rq, prio);
+ + inc_rt_migration(rt_se, rt_rq);
+ + inc_rt_group(rt_se, rt_rq);
+ +}
+ +
+ +static inline
+ +void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
+ + WARN_ON(!rt_prio(rt_se_prio(rt_se)));
+ + WARN_ON(!rt_rq->rt_nr_running);
+ + rt_rq->rt_nr_running--;
+ +
+ + dec_rt_prio(rt_rq, rt_se_prio(rt_se));
+ + dec_rt_migration(rt_se, rt_rq);
+ + dec_rt_group(rt_se, rt_rq);
}
static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
enqueue_rt_entity(rt_se);
+ + if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
+ + enqueue_pushable_task(rq, p);
+ +
inc_cpu_load(rq, p->se.load.weight);
}
update_curr_rt(rq);
dequeue_rt_entity(rt_se);
+ + dequeue_pushable_task(rq, p);
+ +
dec_cpu_load(rq, p->se.load.weight);
}
return next;
}
- -static struct task_struct *pick_next_task_rt(struct rq *rq)
+ +static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
struct sched_rt_entity *rt_se;
struct task_struct *p;
p = rt_task_of(rt_se);
p->se.exec_start = rq->clock;
+ +
+ + return p;
+ +}
+ +
+ +static struct task_struct *pick_next_task_rt(struct rq *rq)
+ +{
+ + struct task_struct *p = _pick_next_task_rt(rq);
+ +
+ + /* The running task is never eligible for pushing */
+ + if (p)
+ + dequeue_pushable_task(rq, p);
+ +
return p;
}
{
update_curr_rt(rq);
p->se.exec_start = 0;
+ +
+ + /*
+ + * The previous task needs to be made eligible for pushing
+ + * if it is still active
+ + */
+ + if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
+ + enqueue_pushable_task(rq, p);
}
#ifdef CONFIG_SMP
if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
return this_cpu;
-- first = first_cpu(*mask);
-- if (first != NR_CPUS)
++ first = cpumask_first(mask);
++ if (first < nr_cpu_ids)
return first;
return -1;
}
/* If this rq is still suitable use it. */
- - if (lowest_rq->rt.highest_prio > task->prio)
+ + if (lowest_rq->rt.highest_prio.curr > task->prio)
break;
/* try again */
return lowest_rq;
}
+ +static inline int has_pushable_tasks(struct rq *rq)
+ +{
+ + return !plist_head_empty(&rq->rt.pushable_tasks);
+ +}
+ +
+ +static struct task_struct *pick_next_pushable_task(struct rq *rq)
+ +{
+ + struct task_struct *p;
+ +
+ + if (!has_pushable_tasks(rq))
+ + return NULL;
+ +
+ + p = plist_first_entry(&rq->rt.pushable_tasks,
+ + struct task_struct, pushable_tasks);
+ +
+ + BUG_ON(rq->cpu != task_cpu(p));
+ + BUG_ON(task_current(rq, p));
+ + BUG_ON(p->rt.nr_cpus_allowed <= 1);
+ +
+ + BUG_ON(!p->se.on_rq);
+ + BUG_ON(!rt_task(p));
+ +
+ + return p;
+ +}
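pushable_tasks is a plist kept sorted by priority value, so pick_next_pushable_task() above retrieves the highest-priority pushable task (lowest numeric prio) in constant time, and enqueue_pushable_task() re-initialises the node with the task's current prio so priority changes re-sort it. A standalone sketch of that plist pattern with toy types, using the same kernel plist calls as the code above (the head/lock pairing matches this kernel version's plist_head_init()):

	struct toy_item {
		int prio;			/* lower value == higher priority */
		struct plist_node node;
	};

	static void toy_plist_demo(void)
	{
		static DEFINE_SPINLOCK(toy_lock);
		struct plist_head head;
		struct toy_item a = { .prio = 10 }, b = { .prio = 5 };
		struct toy_item *top;

		plist_head_init(&head, &toy_lock);

		plist_node_init(&a.node, a.prio);
		plist_add(&a.node, &head);
		plist_node_init(&b.node, b.prio);
		plist_add(&b.node, &head);

		/* b sorts first: plist orders by ascending prio value. */
		top = plist_first_entry(&head, struct toy_item, node);
		plist_del(&top->node, &head);
	}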
+ +
/*
* If the current CPU has more than one RT task, see if the non
* running task can migrate over to a CPU that is running a task
{
struct task_struct *next_task;
struct rq *lowest_rq;
- - int ret = 0;
- - int paranoid = RT_MAX_TRIES;
if (!rq->rt.overloaded)
return 0;
- - next_task = pick_next_highest_task_rt(rq, -1);
+ + next_task = pick_next_pushable_task(rq);
if (!next_task)
return 0;
struct task_struct *task;
/*
* find lock_lowest_rq releases rq->lock
- - * so it is possible that next_task has changed.
- - * If it has, then try again.
+ + * so it is possible that next_task has migrated.
+ + *
+ + * We need to make sure that the task is still on the same
+ + * run-queue and is also still the next task eligible for
+ + * pushing.
*/
- - task = pick_next_highest_task_rt(rq, -1);
- - if (unlikely(task != next_task) && task && paranoid--) {
- - put_task_struct(next_task);
- - next_task = task;
- - goto retry;
+ + task = pick_next_pushable_task(rq);
+ + if (task_cpu(next_task) == rq->cpu && task == next_task) {
+ + /*
+ + * If we get here, the task hasn't moved at all, but
+ + * it has failed to push. We will not try again,
+ + * since the other cpus will pull from us when they
+ + * are ready.
+ + */
+ + dequeue_pushable_task(rq, next_task);
+ + goto out;
}
- - goto out;
+ +
+ + if (!task)
+ + /* No more tasks, just exit */
+ + goto out;
+ +
+ + /*
+ + * Something has shifted, try again.
+ + */
+ + put_task_struct(next_task);
+ + next_task = task;
+ + goto retry;
}
deactivate_task(rq, next_task, 0);
double_unlock_balance(rq, lowest_rq);
- - ret = 1;
out:
put_task_struct(next_task);
- - return ret;
+ + return 1;
}
- -/*
- - * TODO: Currently we just use the second highest prio task on
- - * the queue, and stop when it can't migrate (or there's
- - * no more RT tasks). There may be a case where a lower
- - * priority RT task has a different affinity than the
- - * higher RT task. In this case the lower RT task could
- - * possibly be able to migrate where as the higher priority
- - * RT task could not. We currently ignore this issue.
- - * Enhancements are welcome!
- - */
static void push_rt_tasks(struct rq *rq)
{
/* push_rt_task will return true if it moved an RT */
static int pull_rt_task(struct rq *this_rq)
{
int this_cpu = this_rq->cpu, ret = 0, cpu;
- - struct task_struct *p, *next;
+ + struct task_struct *p;
struct rq *src_rq;
if (likely(!rt_overloaded(this_rq)))
return 0;
- - next = pick_next_task_rt(this_rq);
- -
for_each_cpu(cpu, this_rq->rd->rto_mask) {
if (this_cpu == cpu)
continue;
src_rq = cpu_rq(cpu);
+ +
+ + /*
+ + * Don't bother taking the src_rq->lock if the next highest
+ + * task is known to be lower-priority than our current task.
+ + * This may look racy, but if this value is about to go
+ + * logically higher, the src_rq will push this task away.
+ + * And if it's going logically lower, we do not care.
+ + */
+ + if (src_rq->rt.highest_prio.next >=
+ + this_rq->rt.highest_prio.curr)
+ + continue;
+ +
/*
* We can potentially drop this_rq's lock in
* double_lock_balance, and another CPU could
- - * steal our next task - hence we must cause
- - * the caller to recalculate the next task
- - * in that case:
+ + * alter this_rq
*/
- - if (double_lock_balance(this_rq, src_rq)) {
- - struct task_struct *old_next = next;
- -
- - next = pick_next_task_rt(this_rq);
- - if (next != old_next)
- - ret = 1;
- - }
+ + double_lock_balance(this_rq, src_rq);
/*
* Are there still pullable RT tasks?
* Do we have an RT task that preempts
* the to-be-scheduled task?
*/
- - if (p && (!next || (p->prio < next->prio))) {
+ + if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
WARN_ON(p == src_rq->curr);
WARN_ON(!p->se.on_rq);
* This is just that p is waking up and hasn't
* had a chance to schedule. We only pull
* p if it is lower in priority than the
- - * current task on the run queue or
- - * this_rq next task is lower in prio than
- - * the current task on that rq.
+ + * current task on the run queue
*/
- - if (p->prio < src_rq->curr->prio ||
- - (next && next->prio < src_rq->curr->prio))
+ + if (p->prio < src_rq->curr->prio)
goto skip;
ret = 1;
* case there's an even higher prio task
* in another runqueue. (low likelihood
* but possible)
- - *
- - * Update next so that we won't pick a task
- - * on another cpu with a priority lower (or equal)
- - * than the one we just picked.
*/
- - next = p;
- -
}
skip:
double_unlock_balance(this_rq, src_rq);
static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
{
/* Try to pull RT tasks here if we lower this rq's prio */
- - if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
+ + if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
pull_rt_task(rq);
}
+ +/*
+ + * assumes rq->lock is held
+ + */
+ +static int needs_post_schedule_rt(struct rq *rq)
+ +{
+ + return has_pushable_tasks(rq);
+ +}
+ +
static void post_schedule_rt(struct rq *rq)
{
/*
- - * If we have more than one rt_task queued, then
- - * see if we can push the other rt_tasks off to other CPUS.
- - * Note we may release the rq lock, and since
- - * the lock was owned by prev, we need to release it
- - * first via finish_lock_switch and then reaquire it here.
+ + * This is only called if needs_post_schedule_rt() indicates that
+ + * we need to push tasks away
*/
- - if (unlikely(rq->rt.overloaded)) {
- - spin_lock_irq(&rq->lock);
- - push_rt_tasks(rq);
- - spin_unlock_irq(&rq->lock);
- - }
+ + spin_lock_irq(&rq->lock);
+ + push_rt_tasks(rq);
+ + spin_unlock_irq(&rq->lock);
}
/*
{
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
- - rq->rt.overloaded)
+ + has_pushable_tasks(rq) &&
+ + p->rt.nr_cpus_allowed > 1)
push_rt_tasks(rq);
}
if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
struct rq *rq = task_rq(p);
+ + if (!task_current(rq, p)) {
+ + /*
+ + * Make sure we dequeue this task from the pushable list
+ + * before going further. It will either remain off of
+ + * the list because we are no longer pushable, or it
+ + * will be requeued.
+ + */
+ + if (p->rt.nr_cpus_allowed > 1)
+ + dequeue_pushable_task(rq, p);
+ +
+ + /*
+ + * Requeue if our weight is changing and still > 1
+ + */
+ + if (weight > 1)
+ + enqueue_pushable_task(rq, p);
+ +
+ + }
+ +
if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
rq->rt.rt_nr_migratory++;
} else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
rq->rt.rt_nr_migratory--;
}
- - update_rt_migration(rq);
+ + update_rt_migration(&rq->rt);
}
cpumask_copy(&p->cpus_allowed, new_mask);
__enable_runtime(rq);
- - cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
+ + cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
}
/* Assumes rq->lock is held */
* can release the rq lock and p could migrate.
* Only reschedule if p is still on the same runqueue.
*/
- - if (p->prio > rq->rt.highest_prio && rq->curr == p)
+ + if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
resched_task(p);
#else
/* For UP simply resched on drop of prio */
struct task_struct *p = rq->curr;
p->se.exec_start = rq->clock;
+ +
+ + /* The running task is never eligible for pushing */
+ + dequeue_pushable_task(rq, p);
}
static const struct sched_class rt_sched_class = {
.rq_online = rq_online_rt,
.rq_offline = rq_offline_rt,
.pre_schedule = pre_schedule_rt,
+ + .needs_post_schedule = needs_post_schedule_rt,
.post_schedule = post_schedule_rt,
.task_wake_up = task_wake_up_rt,
.switched_from = switched_from_rt,