cgroup, memcg, cpuset: implement cgroup_taskset_for_each_leader()
author    Tejun Heo <tj@kernel.org>    Fri, 11 Sep 2015 19:00:19 +0000 (15:00 -0400)
committer Tejun Heo <tj@kernel.org>    Tue, 22 Sep 2015 16:46:53 +0000 (12:46 -0400)
It wasn't explicitly documented but, when a process is being migrated,
cpuset and memcg depend on cgroup_taskset_first() returning the
threadgroup leader; however, this implicit dependency is fragile and
would no longer work for the planned multi-process migration.

This patch introduces explicit cgroup_taskset_for_each_leader() which
iterates over only the threadgroup leaders and replaces
cgroup_taskset_first() usages for accessing the leader with it.
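
The conversion in the controllers boils down to the following pattern
(a sketch, not code taken from the patch; do_per_process_work() is a
made-up helper used only for illustration):

    /* old pattern: assume the first task in @tset is the leader */
    struct task_struct *leader = cgroup_taskset_first(tset);

    if (thread_group_leader(leader))
            do_per_process_work(leader);

    /* new pattern: visit each threadgroup leader explicitly */
    cgroup_taskset_for_each_leader(leader, tset)
            do_per_process_work(leader);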

This prepares both memcg and cpuset for multi-process migration.  This
patch also updates the documentation for cgroup_taskset_for_each() to
clarify the iteration rules and removes comments mentioning task
ordering in tasksets.

v2: A previous patch which added a threadgroup leader test was dropped.
    Patch updated accordingly.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Zefan Li <lizefan@huawei.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
include/linux/cgroup.h
kernel/cgroup.c
kernel/cpuset.c
mm/memcontrol.c

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index fb717f2cba5ba25b9b3e2de8bf3191322635592d..e9c3eac074e224b9c1484e7f5983c37927577060 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -232,11 +232,33 @@ void css_task_iter_end(struct css_task_iter *it);
  * cgroup_taskset_for_each - iterate cgroup_taskset
  * @task: the loop cursor
  * @tset: taskset to iterate
+ *
+ * @tset may contain multiple tasks and they may belong to multiple
+ * processes.  When there are multiple tasks in @tset, if a task of a
+ * process is in @tset, all tasks of the process are in @tset.  Also, all
+ * are guaranteed to share the same source and destination csses.
+ *
+ * Iteration is not in any specific order.
  */
 #define cgroup_taskset_for_each(task, tset)                            \
        for ((task) = cgroup_taskset_first((tset)); (task);             \
             (task) = cgroup_taskset_next((tset)))
 
+/**
+ * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
+ * @leader: the loop cursor
+ * @tset: taskset to iterate
+ *
+ * Iterate threadgroup leaders of @tset.  For single-task migrations, @tset
+ * may not contain any.
+ */
+#define cgroup_taskset_for_each_leader(leader, tset)                   \
+       for ((leader) = cgroup_taskset_first((tset)); (leader);         \
+            (leader) = cgroup_taskset_next((tset)))                    \
+               if ((leader) != (leader)->group_leader)                 \
+                       ;                                               \
+               else
+
 /*
  * Inline functions.
  */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0be276ffe08a0cbf0ed31e680d963e7851cb29bd..7f4b85af03dcb1bde68fcec276f8bc869121264a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2217,13 +2217,6 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
 
        get_css_set(new_cset);
        rcu_assign_pointer(tsk->cgroups, new_cset);
-
-       /*
-        * Use move_tail so that cgroup_taskset_first() still returns the
-        * leader after migration.  This works because cgroup_migrate()
-        * ensures that the dst_cset of the leader is the first on the
-        * tset's dst_csets list.
-        */
        list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
 
        /*
@@ -2419,10 +2412,6 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
                if (!cset->mg_src_cgrp)
                        goto next;
 
-               /*
-                * cgroup_taskset_first() must always return the leader.
-                * Take care to avoid disturbing the ordering.
-                */
                list_move_tail(&task->cg_list, &cset->mg_tasks);
                if (list_empty(&cset->mg_node))
                        list_add_tail(&cset->mg_node, &tset.src_csets);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 0b361a0b58f69fe8d6aaa2ede0d2b402cf8809a5..e4d999929903b72ff75fba0e20297fb1a6d617db 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1488,7 +1488,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
        /* static buf protected by cpuset_mutex */
        static nodemask_t cpuset_attach_nodemask_to;
        struct task_struct *task;
-       struct task_struct *leader = cgroup_taskset_first(tset);
+       struct task_struct *leader;
        struct cpuset *cs = css_cs(css);
        struct cpuset *oldcs = cpuset_attach_old_cs;
 
@@ -1514,12 +1514,11 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
        }
 
        /*
-        * Change mm, possibly for multiple threads in a threadgroup. This
-        * is expensive and may sleep and should be moved outside migration
-        * path proper.
+        * Change mm for all threadgroup leaders. This is expensive and may
+        * sleep and should be moved outside migration path proper.
         */
        cpuset_attach_nodemask_to = cs->effective_mems;
-       if (thread_group_leader(leader)) {
+       cgroup_taskset_for_each_leader(leader, tset) {
                struct mm_struct *mm = get_task_mm(leader);
 
                if (mm) {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9f331402e5025fd790b3ca340bdcb946c199d1a5..33c8dad6830f86b36eb8fca89f734e1b16f139a9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4828,7 +4828,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct mem_cgroup *from;
-       struct task_struct *p;
+       struct task_struct *leader, *p;
        struct mm_struct *mm;
        unsigned long move_flags;
        int ret = 0;
@@ -4842,7 +4842,20 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
        if (!move_flags)
                return 0;
 
-       p = cgroup_taskset_first(tset);
+       /*
+        * Multi-process migrations only happen on the default hierarchy
+        * where charge immigration is not used.  Perform charge
+        * immigration if @tset contains a leader and whine if there are
+        * multiple.
+        */
+       p = NULL;
+       cgroup_taskset_for_each_leader(leader, tset) {
+               WARN_ON_ONCE(p);
+               p = leader;
+       }
+       if (!p)
+               return 0;
+
        from = mem_cgroup_from_task(p);
 
        VM_BUG_ON(from == memcg);
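
Taken together, an attach callback can now pair the two iterators:
cgroup_taskset_for_each() for cheap per-thread updates and
cgroup_taskset_for_each_leader() for expensive once-per-process work,
mirroring cpuset_attach() above.  A hypothetical sketch; foo_attach(),
foo_update_task() and foo_update_process() are made-up names:

    static void foo_attach(struct cgroup_subsys_state *css,
                           struct cgroup_taskset *tset)
    {
            struct task_struct *task, *leader;

            /* cheap per-thread bookkeeping runs for every migrating task */
            cgroup_taskset_for_each(task, tset)
                    foo_update_task(css, task);

            /* heavyweight work (e.g. touching the mm) runs once per process */
            cgroup_taskset_for_each_leader(leader, tset)
                    foo_update_process(css, leader);
    }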