summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-03-31 13:59:51 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2026-03-31 13:59:51 -0700
commit53d85a205644036d914a1408a3a301f7068f320b (patch)
tree1b83146a8b2191ab7c0bdee1e134a832185fd414
parentdbf00d8d23b43a7a1d717e704c50651731d01d71 (diff)
parent089f3fcd690c71cb3d8ca09f34027764e28920a0 (diff)
Merge tag 'cgroup-for-7.0-rc6-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup fixes from Tejun Heo:

 - Fix cgroup rmdir racing with dying tasks. Deferred task cgroup unlink
   introduced a window where cgroup.procs is empty but the cgroup is
   still populated, causing rmdir to fail with -EBUSY and selftest
   failures. Make rmdir wait for dying tasks to fully leave and fix
   selftests to not depend on synchronous populated updates.

 - Fix cpuset v1 task migration failure from empty cpusets under strict
   security policies. When CPU hotplug removes the last CPU from a v1
   cpuset, tasks must be migrated to an ancestor without a
   security_task_setscheduler() check that would block the migration.

* tag 'cgroup-for-7.0-rc6-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup/cpuset: Skip security check for hotplug induced v1 task migration
  cgroup/cpuset: Simplify setsched decision check in task iteration loop of cpuset_can_attach()
  cgroup: Fix cgroup_drain_dying() testing the wrong condition
  selftests/cgroup: Don't require synchronous populated update on task exit
  cgroup: Wait for dying tasks to leave on rmdir
-rw-r--r--include/linux/cgroup-defs.h3
-rw-r--r--kernel/cgroup/cgroup.c88
-rw-r--r--kernel/cgroup/cpuset.c29
-rw-r--r--tools/testing/selftests/cgroup/lib/cgroup_util.c15
-rw-r--r--tools/testing/selftests/cgroup/lib/include/cgroup_util.h2
-rw-r--r--tools/testing/selftests/cgroup/test_core.c3
-rw-r--r--tools/testing/selftests/cgroup/test_kill.c7
7 files changed, 131 insertions, 16 deletions
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index bb92f5c169ca..7f87399938fa 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -609,6 +609,9 @@ struct cgroup {
/* used to wait for offlining of csses */
wait_queue_head_t offline_waitq;
+ /* used by cgroup_rmdir() to wait for dying tasks to leave */
+ wait_queue_head_t dying_populated_waitq;
+
/* used to schedule release agent */
struct work_struct release_agent_work;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 01fc2a93f3ef..4ca3cb993da2 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2126,6 +2126,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
#endif
init_waitqueue_head(&cgrp->offline_waitq);
+ init_waitqueue_head(&cgrp->dying_populated_waitq);
INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}
@@ -6224,6 +6225,78 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
return 0;
};
+/**
+ * cgroup_drain_dying - wait for dying tasks to leave before rmdir
+ * @cgrp: the cgroup being removed
+ *
+ * cgroup.procs and cgroup.threads use css_task_iter which filters out
+ * PF_EXITING tasks so that userspace doesn't see tasks that have already been
+ * reaped via waitpid(). However, cgroup_has_tasks() - which tests whether the
+ * cgroup has non-empty css_sets - is only updated when dying tasks pass through
+ * cgroup_task_dead() in finish_task_switch(). This creates a window where
+ * cgroup.procs reads empty but cgroup_has_tasks() is still true, making rmdir
+ * fail with -EBUSY from cgroup_destroy_locked() even though userspace sees no
+ * tasks.
+ *
+ * This function aligns cgroup_has_tasks() with what userspace can observe. If
+ * cgroup_has_tasks() but the task iterator sees nothing (all remaining tasks are
+ * PF_EXITING), we wait for cgroup_task_dead() to finish processing them. As the
+ * window between PF_EXITING and cgroup_task_dead() is short, the wait is brief.
+ *
+ * This function only concerns itself with this cgroup's own dying tasks.
+ * Whether the cgroup has children is cgroup_destroy_locked()'s problem.
+ *
+ * Each cgroup_task_dead() kicks the waitqueue via cset->cgrp_links, and we
+ * retry the full check from scratch.
+ *
+ * Must be called with cgroup_mutex held.
+ */
+static int cgroup_drain_dying(struct cgroup *cgrp)
+ __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+ DEFINE_WAIT(wait);
+
+ lockdep_assert_held(&cgroup_mutex);
+retry:
+ if (!cgroup_has_tasks(cgrp))
+ return 0;
+
+ /* Same iterator as cgroup.threads - if any task is visible, it's busy */
+ css_task_iter_start(&cgrp->self, 0, &it);
+ task = css_task_iter_next(&it);
+ css_task_iter_end(&it);
+
+ if (task)
+ return -EBUSY;
+
+ /*
+ * All remaining tasks are PF_EXITING and will pass through
+ * cgroup_task_dead() shortly. Wait for a kick and retry.
+ *
+ * cgroup_has_tasks() can't transition from false to true while we're
+ * holding cgroup_mutex, but the true to false transition happens
+ * under css_set_lock (via cgroup_task_dead()). We must retest and
+ * prepare_to_wait() under css_set_lock. Otherwise, the transition
+ * can happen between our first test and prepare_to_wait(), and we
+ * sleep with no one to wake us.
+ */
+ spin_lock_irq(&css_set_lock);
+ if (!cgroup_has_tasks(cgrp)) {
+ spin_unlock_irq(&css_set_lock);
+ return 0;
+ }
+ prepare_to_wait(&cgrp->dying_populated_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_unlock_irq(&css_set_lock);
+ mutex_unlock(&cgroup_mutex);
+ schedule();
+ finish_wait(&cgrp->dying_populated_waitq, &wait);
+ mutex_lock(&cgroup_mutex);
+ goto retry;
+}
+
int cgroup_rmdir(struct kernfs_node *kn)
{
struct cgroup *cgrp;
@@ -6233,9 +6306,12 @@ int cgroup_rmdir(struct kernfs_node *kn)
if (!cgrp)
return 0;
- ret = cgroup_destroy_locked(cgrp);
- if (!ret)
- TRACE_CGROUP_PATH(rmdir, cgrp);
+ ret = cgroup_drain_dying(cgrp);
+ if (!ret) {
+ ret = cgroup_destroy_locked(cgrp);
+ if (!ret)
+ TRACE_CGROUP_PATH(rmdir, cgrp);
+ }
cgroup_kn_unlock(kn);
return ret;
@@ -6995,6 +7071,7 @@ void cgroup_task_exit(struct task_struct *tsk)
static void do_cgroup_task_dead(struct task_struct *tsk)
{
+ struct cgrp_cset_link *link;
struct css_set *cset;
unsigned long flags;
@@ -7008,6 +7085,11 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
list_add_tail(&tsk->cg_list, &cset->dying_tasks);
+ /* kick cgroup_drain_dying() waiters, see cgroup_rmdir() */
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
+ if (waitqueue_active(&link->cgrp->dying_populated_waitq))
+ wake_up(&link->cgrp->dying_populated_waitq);
+
if (dl_task(tsk))
dec_dl_tasks_cs(tsk);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index d21868455341..1335e437098e 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2988,7 +2988,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
struct cgroup_subsys_state *css;
struct cpuset *cs, *oldcs;
struct task_struct *task;
- bool cpus_updated, mems_updated;
+ bool setsched_check;
int ret;
/* used later by cpuset_attach() */
@@ -3003,20 +3003,31 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
if (ret)
goto out_unlock;
- cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
- mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+ /*
+ * Skip the setsched security check over tasks in v2 when nothing
+ * changes; migration permission derives from hierarchy ownership in
+ * cgroup_procs_write_permission().
+ */
+ setsched_check = !cpuset_v2() ||
+ !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus) ||
+ !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+
+ /*
+ * A v1 cpuset with tasks will have no CPU left only when CPU hotplug
+ * brings the last online CPU offline as users are not allowed to empty
+ * cpuset.cpus when there are active tasks inside. When that happens,
+ * we should allow tasks to migrate out without security check to make
+ * sure they will be able to run after migration.
+ */
+ if (!is_in_v2_mode() && cpumask_empty(oldcs->effective_cpus))
+ setsched_check = false;
cgroup_taskset_for_each(task, css, tset) {
ret = task_can_attach(task);
if (ret)
goto out_unlock;
- /*
- * Skip rights over task check in v2 when nothing changes,
- * migration permission derives from hierarchy ownership in
- * cgroup_procs_write_permission()).
- */
- if (!cpuset_v2() || (cpus_updated || mems_updated)) {
+ if (setsched_check) {
ret = security_task_setscheduler(task);
if (ret)
goto out_unlock;
diff --git a/tools/testing/selftests/cgroup/lib/cgroup_util.c b/tools/testing/selftests/cgroup/lib/cgroup_util.c
index ce6c2642fd9b..6a7295347e90 100644
--- a/tools/testing/selftests/cgroup/lib/cgroup_util.c
+++ b/tools/testing/selftests/cgroup/lib/cgroup_util.c
@@ -123,6 +123,21 @@ int cg_read_strcmp(const char *cgroup, const char *control,
return ret;
}
+int cg_read_strcmp_wait(const char *cgroup, const char *control,
+ const char *expected)
+{
+ int i, ret;
+
+ for (i = 0; i < 100; i++) {
+ ret = cg_read_strcmp(cgroup, control, expected);
+ if (!ret)
+ return ret;
+ usleep(10000);
+ }
+
+ return ret;
+}
+
int cg_read_strstr(const char *cgroup, const char *control, const char *needle)
{
char buf[PAGE_SIZE];
diff --git a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h
index 77f386dab5e8..567b1082974c 100644
--- a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h
+++ b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h
@@ -61,6 +61,8 @@ extern int cg_read(const char *cgroup, const char *control,
char *buf, size_t len);
extern int cg_read_strcmp(const char *cgroup, const char *control,
const char *expected);
+extern int cg_read_strcmp_wait(const char *cgroup, const char *control,
+ const char *expected);
extern int cg_read_strstr(const char *cgroup, const char *control,
const char *needle);
extern long cg_read_long(const char *cgroup, const char *control);
diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c
index 102262555a59..7b83c7e7c9d4 100644
--- a/tools/testing/selftests/cgroup/test_core.c
+++ b/tools/testing/selftests/cgroup/test_core.c
@@ -233,7 +233,8 @@ static int test_cgcore_populated(const char *root)
if (err)
goto cleanup;
- if (cg_read_strcmp(cg_test_d, "cgroup.events", "populated 0\n"))
+ if (cg_read_strcmp_wait(cg_test_d, "cgroup.events",
+ "populated 0\n"))
goto cleanup;
/* Remove cgroup. */
diff --git a/tools/testing/selftests/cgroup/test_kill.c b/tools/testing/selftests/cgroup/test_kill.c
index c8c9d306925b..f6cd23a8ecc7 100644
--- a/tools/testing/selftests/cgroup/test_kill.c
+++ b/tools/testing/selftests/cgroup/test_kill.c
@@ -86,7 +86,7 @@ cleanup:
wait_for_pid(pids[i]);
if (ret == KSFT_PASS &&
- cg_read_strcmp(cgroup, "cgroup.events", "populated 0\n"))
+ cg_read_strcmp_wait(cgroup, "cgroup.events", "populated 0\n"))
ret = KSFT_FAIL;
if (cgroup)
@@ -190,7 +190,8 @@ cleanup:
wait_for_pid(pids[i]);
if (ret == KSFT_PASS &&
- cg_read_strcmp(cgroup[0], "cgroup.events", "populated 0\n"))
+ cg_read_strcmp_wait(cgroup[0], "cgroup.events",
+ "populated 0\n"))
ret = KSFT_FAIL;
for (i = 9; i >= 0 && cgroup[i]; i--) {
@@ -251,7 +252,7 @@ cleanup:
wait_for_pid(pid);
if (ret == KSFT_PASS &&
- cg_read_strcmp(cgroup, "cgroup.events", "populated 0\n"))
+ cg_read_strcmp_wait(cgroup, "cgroup.events", "populated 0\n"))
ret = KSFT_FAIL;
if (cgroup)