X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=kernel%2Fcpuset.c;h=3f9db31c5d043287b7e566fdaace6446ee46e6a1;hb=b5321a15afce10a7ec0862da761a5627971531d7;hp=02a8ea5c99632907b2ae887015c59bd4ad35da9d;hpb=65d70e79cdb96b208c72a30b34525a850ee640cb;p=firefly-linux-kernel-4.4.55.git diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 02a8ea5c9963..3f9db31c5d04 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -57,7 +57,6 @@ #include #include #include -#include #include #include @@ -99,6 +98,7 @@ struct cpuset { /* user-configured CPUs and Memory Nodes allow to tasks */ cpumask_var_t cpus_allowed; + cpumask_var_t cpus_requested; nodemask_t mems_allowed; /* effective CPUs and Memory Nodes allow to tasks */ @@ -286,6 +286,8 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(cpuset_mutex); static DEFINE_SPINLOCK(callback_lock); +static struct workqueue_struct *cpuset_migrate_mm_wq; + /* * CPU / memory hotplug is handled asynchronously. */ @@ -323,8 +325,7 @@ static struct file_system_type cpuset_fs_type = { /* * Return in pmask the portion of a cpusets's cpus_allowed that * are online. If none are online, walk up the cpuset hierarchy - * until we find one that does have some online cpus. The top - * cpuset always has some cpus online. + * until we find one that does have some online cpus. * * One way or another, we guarantee to return some non-empty subset * of cpu_online_mask. @@ -333,8 +334,20 @@ static struct file_system_type cpuset_fs_type = { */ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) { - while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) + while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) { cs = parent_cs(cs); + if (unlikely(!cs)) { + /* + * The top cpuset doesn't have any online cpu as a + * consequence of a race between cpuset_hotplug_work + * and cpu hotplug notifier. But we know the top + * cpuset's effective_cpus is on its way to to be + * identical to cpu_online_mask. + */ + cpumask_copy(pmask, cpu_online_mask); + return; + } + } cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); } @@ -385,7 +398,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) { - return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && + return cpumask_subset(p->cpus_requested, q->cpus_requested) && nodes_subset(p->mems_allowed, q->mems_allowed) && is_cpu_exclusive(p) <= is_cpu_exclusive(q) && is_mem_exclusive(p) <= is_mem_exclusive(q); @@ -485,7 +498,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) cpuset_for_each_child(c, css, par) { if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && c != cur && - cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) + cpumask_intersects(trial->cpus_requested, c->cpus_requested)) goto out; if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && c != cur && @@ -944,17 +957,18 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (!*buf) { cpumask_clear(trialcs->cpus_allowed); } else { - retval = cpulist_parse(buf, trialcs->cpus_allowed); + retval = cpulist_parse(buf, trialcs->cpus_requested); if (retval < 0) return retval; - if (!cpumask_subset(trialcs->cpus_allowed, - top_cpuset.cpus_allowed)) + if (!cpumask_subset(trialcs->cpus_requested, cpu_present_mask)) return -EINVAL; + + cpumask_and(trialcs->cpus_allowed, trialcs->cpus_requested, cpu_active_mask); } /* Nothing to do if the cpus didn't change */ - if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) + if (cpumask_equal(cs->cpus_requested, trialcs->cpus_requested)) return 0; retval = validate_change(cs, trialcs); @@ -963,6 +977,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); + cpumask_copy(cs->cpus_requested, trialcs->cpus_requested); spin_unlock_irq(&callback_lock); /* use trialcs->cpus_allowed as a temp variable */ @@ -971,31 +986,51 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, } /* - * cpuset_migrate_mm - * - * Migrate memory region from one set of nodes to another. - * - * Temporarilly set tasks mems_allowed to target nodes of migration, - * so that the migration code can allocate pages on these nodes. - * - * While the mm_struct we are migrating is typically from some - * other task, the task_struct mems_allowed that we are hacking - * is for our current task, which must allocate new pages for that - * migrating memory region. + * Migrate memory region from one set of nodes to another. This is + * performed asynchronously as it can be called from process migration path + * holding locks involved in process management. All mm migrations are + * performed in the queued order and can be waited for by flushing + * cpuset_migrate_mm_wq. */ +struct cpuset_migrate_mm_work { + struct work_struct work; + struct mm_struct *mm; + nodemask_t from; + nodemask_t to; +}; + +static void cpuset_migrate_mm_workfn(struct work_struct *work) +{ + struct cpuset_migrate_mm_work *mwork = + container_of(work, struct cpuset_migrate_mm_work, work); + + /* on a wq worker, no need to worry about %current's mems_allowed */ + do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL); + mmput(mwork->mm); + kfree(mwork); +} + static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to) { - struct task_struct *tsk = current; + struct cpuset_migrate_mm_work *mwork; - tsk->mems_allowed = *to; - - do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); + mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); + if (mwork) { + mwork->mm = mm; + mwork->from = *from; + mwork->to = *to; + INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); + queue_work(cpuset_migrate_mm_wq, &mwork->work); + } else { + mmput(mm); + } +} - rcu_read_lock(); - guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed); - rcu_read_unlock(); +static void cpuset_post_attach(void) +{ + flush_workqueue(cpuset_migrate_mm_wq); } /* @@ -1096,7 +1131,8 @@ static void update_tasks_nodemask(struct cpuset *cs) mpol_rebind_mm(mm, &cs->mems_allowed); if (migrate) cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); - mmput(mm); + else + mmput(mm); } css_task_iter_end(&it); @@ -1541,11 +1577,11 @@ static void cpuset_attach(struct cgroup_taskset *tset) * @old_mems_allowed is the right nodesets that we * migrate mm from. */ - if (is_memory_migrate(cs)) { + if (is_memory_migrate(cs)) cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, &cpuset_attach_nodemask_to); - } - mmput(mm); + else + mmput(mm); } } @@ -1710,6 +1746,7 @@ out_unlock: mutex_unlock(&cpuset_mutex); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); + flush_workqueue(cpuset_migrate_mm_wq); return retval ?: nbytes; } @@ -1731,7 +1768,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) switch (type) { case FILE_CPULIST: - seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed)); + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_requested)); break; case FILE_MEMLIST: seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed)); @@ -1920,11 +1957,14 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) return ERR_PTR(-ENOMEM); if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) goto free_cs; + if (!alloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL)) + goto free_allowed; if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL)) - goto free_cpus; + goto free_requested; set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpumask_clear(cs->cpus_allowed); + cpumask_clear(cs->cpus_requested); nodes_clear(cs->mems_allowed); cpumask_clear(cs->effective_cpus); nodes_clear(cs->effective_mems); @@ -1933,7 +1973,9 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) return &cs->css; -free_cpus: +free_requested: + free_cpumask_var(cs->cpus_requested); +free_allowed: free_cpumask_var(cs->cpus_allowed); free_cs: kfree(cs); @@ -1996,6 +2038,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->mems_allowed = parent->mems_allowed; cs->effective_mems = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); + cpumask_copy(cs->cpus_requested, parent->cpus_requested); cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: @@ -2030,6 +2073,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) free_cpumask_var(cs->effective_cpus); free_cpumask_var(cs->cpus_allowed); + free_cpumask_var(cs->cpus_requested); kfree(cs); } @@ -2051,6 +2095,20 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_unlock(&cpuset_mutex); } +/* + * Make sure the new task conform to the current state of its parent, + * which could have been changed by cpuset just after it inherits the + * state from the parent and before it sits on the cgroup's task list. + */ +void cpuset_fork(struct task_struct *task, void *priv) +{ + if (task_css_is_root(task, cpuset_cgrp_id)) + return; + + set_cpus_allowed_ptr(task, ¤t->cpus_allowed); + task->mems_allowed = current->mems_allowed; +} + struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, @@ -2059,7 +2117,9 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, + .post_attach = cpuset_post_attach, .bind = cpuset_bind, + .fork = cpuset_fork, .legacy_cftypes = files, .early_init = 1, }; @@ -2078,8 +2138,11 @@ int __init cpuset_init(void) BUG(); if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)) BUG(); + if (!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL)) + BUG(); cpumask_setall(top_cpuset.cpus_allowed); + cpumask_setall(top_cpuset.cpus_requested); nodes_setall(top_cpuset.mems_allowed); cpumask_setall(top_cpuset.effective_cpus); nodes_setall(top_cpuset.effective_mems); @@ -2213,7 +2276,7 @@ retry: goto retry; } - cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus); + cpumask_and(&new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus); nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems); cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); @@ -2355,6 +2418,9 @@ void __init cpuset_init_smp(void) top_cpuset.effective_mems = node_states[N_MEMORY]; register_hotmemory_notifier(&cpuset_track_online_nodes_nb); + + cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); + BUG_ON(!cpuset_migrate_mm_wq); } /**