summaryrefslogtreecommitdiff
path: root/kernel/cgroup/cpuset.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-02-11 13:20:50 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2026-02-11 13:20:50 -0800
commitff661eeee26038f15ed9dd33c91809632e11d9eb (patch)
tree6ad1a8e8a47b929ac3e0c563a8b85e7a53363403 /kernel/cgroup/cpuset.c
parent9bdc64892dcce732d55b2c07d80b36a6c3e1b5f4 (diff)
parent8b1f3c54f930c3aeda0b5bad97bc317fc80267fd (diff)
Merge tag 'cgroup-for-6.20' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo: - cpuset changes: - Continue separating v1 and v2 implementations by moving more v1-specific logic into cpuset-v1.c - Improve partition handling. Sibling partitions are no longer invalidated on cpuset.cpus conflict, cpuset.cpus changes no longer fail in v2, and effective_xcpus computation is made consistent - Fix partition effective CPUs overlap that caused a warning on cpuset removal when sibling partitions shared CPUs - Increase the maximum cgroup subsystem count from 16 to 32 to accommodate future subsystem additions - Misc cleanups and selftest improvements including switching to css_is_online() helper, removing dead code and stale documentation references, using lockdep_assert_cpuset_lock_held() consistently, and adding polling helpers for asynchronously updated cgroup statistics * tag 'cgroup-for-6.20' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (21 commits) cpuset: fix overlap of partition effective CPUs cgroup: increase maximum subsystem count from 16 to 32 cgroup: Remove stale cpu.rt.max reference from documentation cpuset: replace direct lockdep_assert_held() with lockdep_assert_cpuset_lock_held() cgroup/cpuset: Move the v1 empty cpus/mems check to cpuset1_validate_change() cgroup/cpuset: Don't invalidate sibling partitions on cpuset.cpus conflict cgroup/cpuset: Don't fail cpuset.cpus change in v2 cgroup/cpuset: Consistently compute effective_xcpus in update_cpumasks_hier() cgroup/cpuset: Streamline rm_siblings_excl_cpus() cpuset: remove dead code in cpuset-v1.c cpuset: remove v1-specific code from generate_sched_domains cpuset: separate generate_sched_domains for v1 and v2 cpuset: move update_domain_attr_tree to cpuset_v1.c cpuset: add cpuset1_init helper for v1 initialization cpuset: add cpuset1_online_css helper for v1-specific operations cpuset: add lockdep_assert_cpuset_lock_held helper cpuset: Remove unnecessary checks in rebuild_sched_domains_locked cgroup: switch to css_is_online() helper selftests: cgroup: Replace sleep with cg_read_key_long_poll() for waiting on nr_dying_descendants selftests: cgroup: make test_memcg_sock robust against delayed sock stats ...
Diffstat (limited to 'kernel/cgroup/cpuset.c')
-rw-r--r--kernel/cgroup/cpuset.c499
1 files changed, 120 insertions, 379 deletions
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 01a553caee56..c43efef7df71 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -119,6 +119,17 @@ static bool force_sd_rebuild;
* For simplicity, a local partition can be created under a local or remote
* partition but a remote partition cannot have any partition root in its
* ancestor chain except the cgroup root.
+ *
+ * A valid partition can be formed by setting exclusive_cpus or cpus_allowed
+ * if exclusive_cpus is not set. In the case of partition with empty
+ * exclusive_cpus, all the conflicting exclusive CPUs specified in the
+ * following cpumasks of sibling cpusets will be removed from its
+ * cpus_allowed in determining its effective_xcpus.
+ * - effective_xcpus
+ * - exclusive_cpus
+ *
+ * The "cpuset.cpus.exclusive" control file should be used for setting up
+ * partition if the users want to get as many CPUs as possible.
*/
#define PRS_MEMBER 0
#define PRS_ROOT 1
@@ -201,12 +212,10 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs)
* If cpu_online_mask is used while a hotunplug operation is happening in
* parallel, we may leave an offline CPU in cpu_allowed or some other masks.
*/
-static struct cpuset top_cpuset = {
+struct cpuset top_cpuset = {
.flags = BIT(CS_CPU_EXCLUSIVE) |
BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
.partition_root_state = PRS_ROOT,
- .relax_domain_level = -1,
- .remote_partition = false,
};
/*
@@ -261,6 +270,11 @@ void cpuset_unlock(void)
mutex_unlock(&cpuset_mutex);
}
+void lockdep_assert_cpuset_lock_held(void)
+{
+ lockdep_assert_held(&cpuset_mutex);
+}
+
/**
* cpuset_full_lock - Acquire full protection for cpuset modification
*
@@ -319,7 +333,7 @@ static inline void check_insane_mems_config(nodemask_t *nodes)
*/
static inline void dec_attach_in_progress_locked(struct cpuset *cs)
{
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
cs->attach_in_progress--;
if (!cs->attach_in_progress)
@@ -353,15 +367,6 @@ static inline bool is_in_v2_mode(void)
(cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}
-static inline bool cpuset_is_populated(struct cpuset *cs)
-{
- lockdep_assert_held(&cpuset_mutex);
-
- /* Cpusets in the process of attaching should be considered as populated */
- return cgroup_is_populated(cs->css.cgroup) ||
- cs->attach_in_progress;
-}
-
/**
* partition_is_populated - check if partition has tasks
* @cs: partition root to be checked
@@ -603,36 +608,32 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
/**
* cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts
- * @cs1: first cpuset to check
- * @cs2: second cpuset to check
+ * @trial: the trial cpuset to be checked
+ * @sibling: a sibling cpuset to be checked against
+ * @xcpus_changed: set if exclusive_cpus has been set
*
* Returns: true if CPU exclusivity conflict exists, false otherwise
*
* Conflict detection rules:
- * 1. If either cpuset is CPU exclusive, they must be mutually exclusive
- * 2. exclusive_cpus masks cannot intersect between cpusets
- * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs
+ * o cgroup v1
+ * See cpuset1_cpus_excl_conflict()
+ * o cgroup v2
+ * - The exclusive_cpus values cannot overlap.
+ * - New exclusive_cpus cannot be a superset of a sibling's cpus_allowed.
*/
-static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
+static inline bool cpus_excl_conflict(struct cpuset *trial, struct cpuset *sibling,
+ bool xcpus_changed)
{
- /* If either cpuset is exclusive, check if they are mutually exclusive */
- if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2))
- return !cpusets_are_exclusive(cs1, cs2);
-
- /* Exclusive_cpus cannot intersect */
- if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus))
- return true;
-
- /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */
- if (!cpumask_empty(cs1->cpus_allowed) &&
- cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus))
- return true;
+ if (!cpuset_v2())
+ return cpuset1_cpus_excl_conflict(trial, sibling);
- if (!cpumask_empty(cs2->cpus_allowed) &&
- cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus))
+ /* The cpus_allowed of a sibling cpuset cannot be a subset of the new exclusive_cpus */
+ if (xcpus_changed && !cpumask_empty(sibling->cpus_allowed) &&
+ cpumask_subset(sibling->cpus_allowed, trial->exclusive_cpus))
return true;
- return false;
+ /* Exclusive_cpus cannot intersect */
+ return cpumask_intersects(trial->exclusive_cpus, sibling->exclusive_cpus);
}
static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
@@ -666,6 +667,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
struct cgroup_subsys_state *css;
struct cpuset *c, *par;
+ bool xcpus_changed;
int ret = 0;
rcu_read_lock();
@@ -682,20 +684,6 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
par = parent_cs(cur);
/*
- * Cpusets with tasks - existing or newly being attached - can't
- * be changed to have empty cpus_allowed or mems_allowed.
- */
- ret = -ENOSPC;
- if (cpuset_is_populated(cur)) {
- if (!cpumask_empty(cur->cpus_allowed) &&
- cpumask_empty(trial->cpus_allowed))
- goto out;
- if (!nodes_empty(cur->mems_allowed) &&
- nodes_empty(trial->mems_allowed))
- goto out;
- }
-
- /*
* We can't shrink if we won't have enough room for SCHED_DEADLINE
* tasks. This check is not done when scheduling is disabled as the
* users should know what they are doing.
@@ -722,10 +710,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
* overlap. exclusive_cpus cannot overlap with each other if set.
*/
ret = -EINVAL;
+ xcpus_changed = !cpumask_equal(cur->exclusive_cpus, trial->exclusive_cpus);
cpuset_for_each_child(c, css, par) {
if (c == cur)
continue;
- if (cpus_excl_conflict(trial, c))
+ if (cpus_excl_conflict(trial, c, xcpus_changed))
goto out;
if (mems_excl_conflict(trial, c))
goto out;
@@ -738,49 +727,6 @@ out:
}
#ifdef CONFIG_SMP
-/*
- * Helper routine for generate_sched_domains().
- * Do cpusets a, b have overlapping effective cpus_allowed masks?
- */
-static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
-{
- return cpumask_intersects(a->effective_cpus, b->effective_cpus);
-}
-
-static void
-update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
-{
- if (dattr->relax_domain_level < c->relax_domain_level)
- dattr->relax_domain_level = c->relax_domain_level;
- return;
-}
-
-static void update_domain_attr_tree(struct sched_domain_attr *dattr,
- struct cpuset *root_cs)
-{
- struct cpuset *cp;
- struct cgroup_subsys_state *pos_css;
-
- rcu_read_lock();
- cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
- /* skip the whole subtree if @cp doesn't have any CPU */
- if (cpumask_empty(cp->cpus_allowed)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
-
- if (is_sched_load_balance(cp))
- update_domain_attr(dattr, cp);
- }
- rcu_read_unlock();
-}
-
-/* Must be called with cpuset_mutex held. */
-static inline int nr_cpusets(void)
-{
- /* jump label reference count + the top-level cpuset */
- return static_key_count(&cpusets_enabled_key.key) + 1;
-}
/*
* generate_sched_domains()
@@ -820,103 +766,46 @@ static inline int nr_cpusets(void)
* convenient format, that can be easily compared to the prior
* value to determine what partition elements (sched domains)
* were changed (added or removed.)
- *
- * Finding the best partition (set of domains):
- * The double nested loops below over i, j scan over the load
- * balanced cpusets (using the array of cpuset pointers in csa[])
- * looking for pairs of cpusets that have overlapping cpus_allowed
- * and merging them using a union-find algorithm.
- *
- * The union of the cpus_allowed masks from the set of all cpusets
- * having the same root then form the one element of the partition
- * (one sched domain) to be passed to partition_sched_domains().
- *
*/
static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes)
{
struct cpuset *cp; /* top-down scan of cpusets */
struct cpuset **csa; /* array of all cpuset ptrs */
- int csn; /* how many cpuset ptrs in csa so far */
int i, j; /* indices for partition finding loops */
cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
- int nslot; /* next empty doms[] struct cpumask slot */
struct cgroup_subsys_state *pos_css;
- bool root_load_balance = is_sched_load_balance(&top_cpuset);
- bool cgrpv2 = cpuset_v2();
- int nslot_update;
+
+ if (!cpuset_v2())
+ return cpuset1_generate_sched_domains(domains, attributes);
doms = NULL;
dattr = NULL;
csa = NULL;
/* Special case for the 99% of systems with one, full, sched domain */
- if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
-single_root_domain:
+ if (cpumask_empty(subpartitions_cpus)) {
ndoms = 1;
- doms = alloc_sched_domains(ndoms);
- if (!doms)
- goto done;
-
- dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
- if (dattr) {
- *dattr = SD_ATTR_INIT;
- update_domain_attr_tree(dattr, &top_cpuset);
- }
- cpumask_and(doms[0], top_cpuset.effective_cpus,
- housekeeping_cpumask(HK_TYPE_DOMAIN));
-
- goto done;
+ /* !csa will be checked and can be correctly handled */
+ goto generate_doms;
}
csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
if (!csa)
goto done;
- csn = 0;
+ /* Find how many partitions and cache them to csa[] */
rcu_read_lock();
- if (root_load_balance)
- csa[csn++] = &top_cpuset;
cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
- if (cp == &top_cpuset)
- continue;
-
- if (cgrpv2)
- goto v2;
-
- /*
- * v1:
- * Continue traversing beyond @cp iff @cp has some CPUs and
- * isn't load balancing. The former is obvious. The
- * latter: All child cpusets contain a subset of the
- * parent's cpus, so just skip them, and then we call
- * update_domain_attr_tree() to calc relax_domain_level of
- * the corresponding sched domain.
- */
- if (!cpumask_empty(cp->cpus_allowed) &&
- !(is_sched_load_balance(cp) &&
- cpumask_intersects(cp->cpus_allowed,
- housekeeping_cpumask(HK_TYPE_DOMAIN))))
- continue;
-
- if (is_sched_load_balance(cp) &&
- !cpumask_empty(cp->effective_cpus))
- csa[csn++] = cp;
-
- /* skip @cp's subtree */
- pos_css = css_rightmost_descendant(pos_css);
- continue;
-
-v2:
/*
* Only valid partition roots that are not isolated and with
- * non-empty effective_cpus will be saved into csn[].
+ * non-empty effective_cpus will be saved into csa[].
*/
if ((cp->partition_root_state == PRS_ROOT) &&
!cpumask_empty(cp->effective_cpus))
- csa[csn++] = cp;
+ csa[ndoms++] = cp;
/*
* Skip @cp's subtree if not a partition root and has no
@@ -927,40 +816,18 @@ v2:
}
rcu_read_unlock();
- /*
- * If there are only isolated partitions underneath the cgroup root,
- * we can optimize out unneeded sched domains scanning.
- */
- if (root_load_balance && (csn == 1))
- goto single_root_domain;
-
- for (i = 0; i < csn; i++)
- uf_node_init(&csa[i]->node);
-
- /* Merge overlapping cpusets */
- for (i = 0; i < csn; i++) {
- for (j = i + 1; j < csn; j++) {
- if (cpusets_overlap(csa[i], csa[j])) {
+ for (i = 0; i < ndoms; i++) {
+ for (j = i + 1; j < ndoms; j++) {
+ if (cpusets_overlap(csa[i], csa[j]))
/*
* Cgroup v2 shouldn't pass down overlapping
* partition root cpusets.
*/
- WARN_ON_ONCE(cgrpv2);
- uf_union(&csa[i]->node, &csa[j]->node);
- }
+ WARN_ON_ONCE(1);
}
}
- /* Count the total number of domains */
- for (i = 0; i < csn; i++) {
- if (uf_find(&csa[i]->node) == &csa[i]->node)
- ndoms++;
- }
-
- /*
- * Now we know how many domains to create.
- * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
- */
+generate_doms:
doms = alloc_sched_domains(ndoms);
if (!doms)
goto done;
@@ -977,45 +844,19 @@ v2:
* to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
* subset of HK_TYPE_DOMAIN housekeeping CPUs.
*/
- if (cgrpv2) {
- for (i = 0; i < ndoms; i++) {
- /*
- * The top cpuset may contain some boot time isolated
- * CPUs that need to be excluded from the sched domain.
- */
- if (csa[i] == &top_cpuset)
- cpumask_and(doms[i], csa[i]->effective_cpus,
- housekeeping_cpumask(HK_TYPE_DOMAIN));
- else
- cpumask_copy(doms[i], csa[i]->effective_cpus);
- if (dattr)
- dattr[i] = SD_ATTR_INIT;
- }
- goto done;
- }
-
- for (nslot = 0, i = 0; i < csn; i++) {
- nslot_update = 0;
- for (j = i; j < csn; j++) {
- if (uf_find(&csa[j]->node) == &csa[i]->node) {
- struct cpumask *dp = doms[nslot];
-
- if (i == j) {
- nslot_update = 1;
- cpumask_clear(dp);
- if (dattr)
- *(dattr + nslot) = SD_ATTR_INIT;
- }
- cpumask_or(dp, dp, csa[j]->effective_cpus);
- cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
- if (dattr)
- update_domain_attr_tree(dattr + nslot, csa[j]);
- }
- }
- if (nslot_update)
- nslot++;
+ for (i = 0; i < ndoms; i++) {
+ /*
+ * The top cpuset may contain some boot time isolated
+ * CPUs that need to be excluded from the sched domain.
+ */
+ if (!csa || csa[i] == &top_cpuset)
+ cpumask_and(doms[i], top_cpuset.effective_cpus,
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
+ else
+ cpumask_copy(doms[i], csa[i]->effective_cpus);
+ if (dattr)
+ dattr[i] = SD_ATTR_INIT;
}
- BUG_ON(nslot != ndoms);
done:
kfree(csa);
@@ -1055,7 +896,7 @@ void dl_rebuild_rd_accounting(void)
int cpu;
u64 cookie = ++dl_cookie;
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
lockdep_assert_cpus_held();
lockdep_assert_held(&sched_domains_mutex);
@@ -1100,53 +941,33 @@ void dl_rebuild_rd_accounting(void)
*/
void rebuild_sched_domains_locked(void)
{
- struct cgroup_subsys_state *pos_css;
struct sched_domain_attr *attr;
cpumask_var_t *doms;
- struct cpuset *cs;
int ndoms;
+ int i;
lockdep_assert_cpus_held();
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
force_sd_rebuild = false;
- /*
- * If we have raced with CPU hotplug, return early to avoid
- * passing doms with offlined cpu to partition_sched_domains().
- * Anyways, cpuset_handle_hotplug() will rebuild sched domains.
- *
- * With no CPUs in any subpartitions, top_cpuset's effective CPUs
- * should be the same as the active CPUs, so checking only top_cpuset
- * is enough to detect racing CPU offlines.
- */
- if (cpumask_empty(subpartitions_cpus) &&
- !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
- return;
+ /* Generate domain masks and attrs */
+ ndoms = generate_sched_domains(&doms, &attr);
/*
- * With subpartition CPUs, however, the effective CPUs of a partition
- * root should be only a subset of the active CPUs. Since a CPU in any
- * partition root could be offlined, all must be checked.
- */
- if (!cpumask_empty(subpartitions_cpus)) {
- rcu_read_lock();
- cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
- if (!is_partition_valid(cs)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
- if (!cpumask_subset(cs->effective_cpus,
- cpu_active_mask)) {
- rcu_read_unlock();
- return;
- }
- }
- rcu_read_unlock();
+ * cpuset_hotplug_workfn is invoked synchronously now, thus this
+ * function should not race with CPU hotplug. And the effective CPUs
+ * must not include any offline CPUs. Passing an offline CPU in the
+ * doms to partition_sched_domains() will trigger a kernel panic.
+ *
+ * We perform a final check here: if the doms contains any
+ * offline CPUs, a warning is emitted and we return directly to
+ * prevent the panic.
+ */
+ for (i = 0; i < ndoms; ++i) {
+ if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask)))
+ return;
}
- /* Generate domain masks and attrs */
- ndoms = generate_sched_domains(&doms, &attr);
-
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
}
@@ -1501,23 +1322,29 @@ static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs,
int retval = 0;
if (cpumask_empty(excpus))
- return retval;
+ return 0;
/*
- * Exclude exclusive CPUs from siblings
+ * Remove exclusive CPUs from siblings
*/
rcu_read_lock();
cpuset_for_each_child(sibling, css, parent) {
+ struct cpumask *sibling_xcpus;
+
if (sibling == cs)
continue;
- if (cpumask_intersects(excpus, sibling->exclusive_cpus)) {
- cpumask_andnot(excpus, excpus, sibling->exclusive_cpus);
- retval++;
- continue;
- }
- if (cpumask_intersects(excpus, sibling->effective_xcpus)) {
- cpumask_andnot(excpus, excpus, sibling->effective_xcpus);
+ /*
+ * If exclusive_cpus is defined, effective_xcpus will always
+ * be a subset. Otherwise, effective_xcpus will only be set
+ * in a valid partition root.
+ */
+ sibling_xcpus = cpumask_empty(sibling->exclusive_cpus)
+ ? sibling->effective_xcpus
+ : sibling->exclusive_cpus;
+
+ if (cpumask_intersects(excpus, sibling_xcpus)) {
+ cpumask_andnot(excpus, excpus, sibling_xcpus);
retval++;
}
}
@@ -1806,7 +1633,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
int parent_prs = parent->partition_root_state;
bool nocpu;
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */
/*
@@ -2315,17 +2142,13 @@ get_css:
spin_lock_irq(&callback_lock);
cpumask_copy(cp->effective_cpus, tmp->new_cpus);
cp->partition_root_state = new_prs;
- if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs))
- compute_excpus(cp, cp->effective_xcpus);
-
/*
- * Make sure effective_xcpus is properly set for a valid
- * partition root.
+ * Need to compute effective_xcpus if either exclusive_cpus
+ * is non-empty or it is a valid partition root.
*/
- if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus))
- cpumask_and(cp->effective_xcpus,
- cp->cpus_allowed, parent->effective_xcpus);
- else if (new_prs < 0)
+ if ((new_prs > 0) || !cpumask_empty(cp->exclusive_cpus))
+ compute_excpus(cp, cp->effective_xcpus);
+ if (new_prs <= 0)
reset_partition_data(cp);
spin_unlock_irq(&callback_lock);
@@ -2378,7 +2201,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
struct cpuset *sibling;
struct cgroup_subsys_state *pos_css;
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
/*
* Check all its siblings and call update_cpumasks_hier()
@@ -2387,27 +2210,20 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
* It is possible a change in parent's effective_cpus
* due to a change in a child partition's effective_xcpus will impact
* its siblings even if they do not inherit parent's effective_cpus
- * directly.
+ * directly. It should not impact valid partition.
*
* The update_cpumasks_hier() function may sleep. So we have to
* release the RCU read lock before calling it.
*/
rcu_read_lock();
cpuset_for_each_child(sibling, pos_css, parent) {
- if (sibling == cs)
+ if (sibling == cs || is_partition_valid(sibling))
continue;
- if (!is_partition_valid(sibling)) {
- compute_effective_cpumask(tmp->new_cpus, sibling,
- parent);
- if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
- continue;
- } else if (is_remote_partition(sibling)) {
- /*
- * Change in a sibling cpuset won't affect a remote
- * partition root.
- */
+
+ compute_effective_cpumask(tmp->new_cpus, sibling,
+ parent);
+ if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
continue;
- }
if (!css_tryget_online(&sibling->css))
continue;
@@ -2463,43 +2279,6 @@ static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *tri
return PERR_NONE;
}
-static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs,
- struct tmpmasks *tmp)
-{
- int retval;
- struct cpuset *parent = parent_cs(cs);
-
- retval = validate_change(cs, trialcs);
-
- if ((retval == -EINVAL) && cpuset_v2()) {
- struct cgroup_subsys_state *css;
- struct cpuset *cp;
-
- /*
- * The -EINVAL error code indicates that partition sibling
- * CPU exclusivity rule has been violated. We still allow
- * the cpumask change to proceed while invalidating the
- * partition. However, any conflicting sibling partitions
- * have to be marked as invalid too.
- */
- trialcs->prs_err = PERR_NOTEXCL;
- rcu_read_lock();
- cpuset_for_each_child(cp, css, parent) {
- struct cpumask *xcpus = user_xcpus(trialcs);
-
- if (is_partition_valid(cp) &&
- cpumask_intersects(xcpus, cp->effective_xcpus)) {
- rcu_read_unlock();
- update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp);
- rcu_read_lock();
- }
- }
- rcu_read_unlock();
- retval = 0;
- }
- return retval;
-}
-
/**
* partition_cpus_change - Handle partition state changes due to CPU mask updates
* @cs: The target cpuset being modified
@@ -2559,15 +2338,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
return 0;
- if (alloc_tmpmasks(&tmp))
- return -ENOMEM;
-
compute_trialcs_excpus(trialcs, cs);
trialcs->prs_err = PERR_NONE;
- retval = cpus_allowed_validate_change(cs, trialcs, &tmp);
+ retval = validate_change(cs, trialcs);
if (retval < 0)
- goto out_free;
+ return retval;
+
+ if (alloc_tmpmasks(&tmp))
+ return -ENOMEM;
/*
* Check all the descendants in update_cpumasks_hier() if
@@ -2590,7 +2369,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
if (cs->partition_root_state)
update_partition_sd_lb(cs, old_prs);
-out_free:
+
free_tmpmasks(&tmp);
return retval;
}
@@ -3249,7 +3028,7 @@ static nodemask_t cpuset_attach_nodemask_to;
static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
{
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
if (cs != &top_cpuset)
guarantee_active_cpus(task, cpus_attach);
@@ -3605,8 +3384,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
return ERR_PTR(-ENOMEM);
__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
- fmeter_init(&cs->fmeter);
- cs->relax_domain_level = -1;
+ cpuset1_init(cs);
/* Set CS_MEMORY_MIGRATE for default hierarchy */
if (cpuset_v2())
@@ -3619,17 +3397,11 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
struct cpuset *parent = parent_cs(cs);
- struct cpuset *tmp_cs;
- struct cgroup_subsys_state *pos_css;
if (!parent)
return 0;
cpuset_full_lock();
- if (is_spread_page(parent))
- set_bit(CS_SPREAD_PAGE, &cs->flags);
- if (is_spread_slab(parent))
- set_bit(CS_SPREAD_SLAB, &cs->flags);
/*
* For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
*/
@@ -3644,39 +3416,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cs->effective_mems = parent->effective_mems;
}
spin_unlock_irq(&callback_lock);
+ cpuset1_online_css(css);
- if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
- goto out_unlock;
-
- /*
- * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
- * set. This flag handling is implemented in cgroup core for
- * historical reasons - the flag may be specified during mount.
- *
- * Currently, if any sibling cpusets have exclusive cpus or mem, we
- * refuse to clone the configuration - thereby refusing the task to
- * be entered, and as a result refusing the sys_unshare() or
- * clone() which initiated it. If this becomes a problem for some
- * users who wish to allow that scenario, then this could be
- * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
- * (and likewise for mems) to the new cgroup.
- */
- rcu_read_lock();
- cpuset_for_each_child(tmp_cs, pos_css, parent) {
- if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
- rcu_read_unlock();
- goto out_unlock;
- }
- }
- rcu_read_unlock();
-
- spin_lock_irq(&callback_lock);
- cs->mems_allowed = parent->mems_allowed;
- cs->effective_mems = parent->mems_allowed;
- cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
- cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
- spin_unlock_irq(&callback_lock);
-out_unlock:
cpuset_full_unlock();
return 0;
}
@@ -3876,7 +3617,7 @@ int __init cpuset_init(void)
cpumask_setall(top_cpuset.exclusive_cpus);
nodes_setall(top_cpuset.effective_mems);
- fmeter_init(&top_cpuset.fmeter);
+ cpuset1_init(&top_cpuset);
BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
@@ -4210,7 +3951,7 @@ static void __cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask
*/
void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
{
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
__cpuset_cpus_allowed_locked(tsk, pmask);
}