| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-02-11 13:20:50 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-02-11 13:20:50 -0800 |
| commit | ff661eeee26038f15ed9dd33c91809632e11d9eb (patch) | |
| tree | 6ad1a8e8a47b929ac3e0c563a8b85e7a53363403 /kernel/cgroup/cpuset.c | |
| parent | 9bdc64892dcce732d55b2c07d80b36a6c3e1b5f4 (diff) | |
| parent | 8b1f3c54f930c3aeda0b5bad97bc317fc80267fd (diff) | |
Merge tag 'cgroup-for-6.20' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:
- cpuset changes:
- Continue separating v1 and v2 implementations by moving more
v1-specific logic into cpuset-v1.c
- Improve partition handling. Sibling partitions are no longer
invalidated on cpuset.cpus conflict, cpuset.cpus changes no longer
fail in v2, and effective_xcpus computation is made consistent (see
the usage sketch after this list)
- Fix partition effective CPUs overlap that caused a warning on
cpuset removal when sibling partitions shared CPUs
- Increase the maximum cgroup subsystem count from 16 to 32 to
accommodate future subsystem additions
- Misc cleanups and selftest improvements, including switching to the
css_is_online() helper, removing dead code and stale documentation
references, using lockdep_assert_cpuset_lock_held() consistently, and
adding polling helpers for asynchronously updated cgroup statistics
* tag 'cgroup-for-6.20' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (21 commits)
cpuset: fix overlap of partition effective CPUs
cgroup: increase maximum subsystem count from 16 to 32
cgroup: Remove stale cpu.rt.max reference from documentation
cpuset: replace direct lockdep_assert_held() with lockdep_assert_cpuset_lock_held()
cgroup/cpuset: Move the v1 empty cpus/mems check to cpuset1_validate_change()
cgroup/cpuset: Don't invalidate sibling partitions on cpuset.cpus conflict
cgroup/cpuset: Don't fail cpuset.cpus change in v2
cgroup/cpuset: Consistently compute effective_xcpus in update_cpumasks_hier()
cgroup/cpuset: Streamline rm_siblings_excl_cpus()
cpuset: remove dead code in cpuset-v1.c
cpuset: remove v1-specific code from generate_sched_domains
cpuset: separate generate_sched_domains for v1 and v2
cpuset: move update_domain_attr_tree to cpuset_v1.c
cpuset: add cpuset1_init helper for v1 initialization
cpuset: add cpuset1_online_css helper for v1-specific operations
cpuset: add lockdep_assert_cpuset_lock_held helper
cpuset: Remove unnecessary checks in rebuild_sched_domains_locked
cgroup: switch to css_is_online() helper
selftests: cgroup: Replace sleep with cg_read_key_long_poll() for waiting on nr_dying_descendants
selftests: cgroup: make test_memcg_sock robust against delayed sock stats
...
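The two selftest entries above replace fixed sleeps with polling, since cgroup statistics such as nr_dying_descendants and memcg socket stats are updated asynchronously. The real cg_read_key_long_poll() helper lives in the kernel's selftest harness; the sketch below is only an assumed approximation of the idea, with hypothetical names, that retries a keyed read until the expected value appears or the retry budget runs out.

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* Hypothetical stand-in for cg_read_key_long_poll(): re-read "key"
     * from a flat keyed file (e.g. cgroup.stat) until its value equals
     * "expected", retrying every 10ms up to "retries" times. */
    static int poll_key_long(const char *path, const char *key,
                             long expected, int retries)
    {
            char line[256];
            long val;

            while (retries-- > 0) {
                    FILE *f = fopen(path, "r");

                    if (!f)
                            return -1;
                    while (fgets(line, sizeof(line), f)) {
                            if (!strncmp(line, key, strlen(key)) &&
                                sscanf(line + strlen(key), "%ld", &val) == 1 &&
                                val == expected) {
                                    fclose(f);
                                    return 0;
                            }
                    }
                    fclose(f);
                    usleep(10000); /* stats are updated asynchronously; retry */
            }
            return -1; /* timed out waiting for the expected value */
    }

A call such as poll_key_long("/sys/fs/cgroup/test/cgroup.stat", "nr_dying_descendants ", 0, 100) mirrors how the selftest now waits for dying descendants to drain instead of sleeping for a fixed interval.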
Diffstat (limited to 'kernel/cgroup/cpuset.c')
| -rw-r--r-- | kernel/cgroup/cpuset.c | 499 |
1 file changed, 120 insertions, 379 deletions
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 01a553caee56..c43efef7df71 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -119,6 +119,17 @@ static bool force_sd_rebuild;
  * For simplicity, a local partition can be created under a local or remote
  * partition but a remote partition cannot have any partition root in its
  * ancestor chain except the cgroup root.
+ *
+ * A valid partition can be formed by setting exclusive_cpus or cpus_allowed
+ * if exclusive_cpus is not set. In the case of partition with empty
+ * exclusive_cpus, all the conflicting exclusive CPUs specified in the
+ * following cpumasks of sibling cpusets will be removed from its
+ * cpus_allowed in determining its effective_xcpus.
+ * - effective_xcpus
+ * - exclusive_cpus
+ *
+ * The "cpuset.cpus.exclusive" control file should be used for setting up
+ * partition if the users want to get as many CPUs as possible.
  */
 #define PRS_MEMBER		0
 #define PRS_ROOT		1
@@ -201,12 +212,10 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs)
  * If cpu_online_mask is used while a hotunplug operation is happening in
  * parallel, we may leave an offline CPU in cpu_allowed or some other masks.
  */
-static struct cpuset top_cpuset = {
+struct cpuset top_cpuset = {
	.flags = BIT(CS_CPU_EXCLUSIVE) |
		 BIT(CS_MEM_EXCLUSIVE) |
		 BIT(CS_SCHED_LOAD_BALANCE),
	.partition_root_state = PRS_ROOT,
-	.relax_domain_level = -1,
-	.remote_partition = false,
 };
 
 /*
@@ -261,6 +270,11 @@ void cpuset_unlock(void)
	mutex_unlock(&cpuset_mutex);
 }
 
+void lockdep_assert_cpuset_lock_held(void)
+{
+	lockdep_assert_held(&cpuset_mutex);
+}
+
 /**
  * cpuset_full_lock - Acquire full protection for cpuset modification
  *
@@ -319,7 +333,7 @@ static inline void check_insane_mems_config(nodemask_t *nodes)
  */
 static inline void dec_attach_in_progress_locked(struct cpuset *cs)
 {
-	lockdep_assert_held(&cpuset_mutex);
+	lockdep_assert_cpuset_lock_held();
 
	cs->attach_in_progress--;
	if (!cs->attach_in_progress)
@@ -353,15 +367,6 @@ static inline bool is_in_v2_mode(void)
	       (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
 }
 
-static inline bool cpuset_is_populated(struct cpuset *cs)
-{
-	lockdep_assert_held(&cpuset_mutex);
-
-	/* Cpusets in the process of attaching should be considered as populated */
-	return cgroup_is_populated(cs->css.cgroup) ||
-		cs->attach_in_progress;
-}
-
 /**
  * partition_is_populated - check if partition has tasks
  * @cs: partition root to be checked
@@ -603,36 +608,32 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
 
 /**
  * cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts
- * @cs1: first cpuset to check
- * @cs2: second cpuset to check
+ * @trial: the trial cpuset to be checked
+ * @sibling: a sibling cpuset to be checked against
+ * @xcpus_changed: set if exclusive_cpus has been set
  *
  * Returns: true if CPU exclusivity conflict exists, false otherwise
  *
  * Conflict detection rules:
- * 1. If either cpuset is CPU exclusive, they must be mutually exclusive
- * 2. exclusive_cpus masks cannot intersect between cpusets
- * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs
+ * o cgroup v1
+ *   See cpuset1_cpus_excl_conflict()
+ * o cgroup v2
+ *   - The exclusive_cpus values cannot overlap.
+ *   - New exclusive_cpus cannot be a superset of a sibling's cpus_allowed.
  */
-static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
+static inline bool cpus_excl_conflict(struct cpuset *trial, struct cpuset *sibling,
+				      bool xcpus_changed)
 {
-	/* If either cpuset is exclusive, check if they are mutually exclusive */
-	if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2))
-		return !cpusets_are_exclusive(cs1, cs2);
-
-	/* Exclusive_cpus cannot intersect */
-	if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus))
-		return true;
-
-	/* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */
-	if (!cpumask_empty(cs1->cpus_allowed) &&
-	    cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus))
-		return true;
+	if (!cpuset_v2())
+		return cpuset1_cpus_excl_conflict(trial, sibling);
 
-	if (!cpumask_empty(cs2->cpus_allowed) &&
-	    cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus))
+	/* The cpus_allowed of a sibling cpuset cannot be a subset of the new exclusive_cpus */
+	if (xcpus_changed && !cpumask_empty(sibling->cpus_allowed) &&
+	    cpumask_subset(sibling->cpus_allowed, trial->exclusive_cpus))
		return true;
 
-	return false;
+	/* Exclusive_cpus cannot intersect */
+	return cpumask_intersects(trial->exclusive_cpus, sibling->exclusive_cpus);
 }
 
 static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
@@ -666,6 +667,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
 {
	struct cgroup_subsys_state *css;
	struct cpuset *c, *par;
+	bool xcpus_changed;
	int ret = 0;
 
	rcu_read_lock();
@@ -682,20 +684,6 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
	par = parent_cs(cur);
 
	/*
-	 * Cpusets with tasks - existing or newly being attached - can't
-	 * be changed to have empty cpus_allowed or mems_allowed.
-	 */
-	ret = -ENOSPC;
-	if (cpuset_is_populated(cur)) {
-		if (!cpumask_empty(cur->cpus_allowed) &&
-		    cpumask_empty(trial->cpus_allowed))
-			goto out;
-		if (!nodes_empty(cur->mems_allowed) &&
-		    nodes_empty(trial->mems_allowed))
-			goto out;
-	}
-
-	/*
	 * We can't shrink if we won't have enough room for SCHED_DEADLINE
	 * tasks. This check is not done when scheduling is disabled as the
	 * users should know what they are doing.
@@ -722,10 +710,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
	 * overlap. exclusive_cpus cannot overlap with each other if set.
	 */
	ret = -EINVAL;
+	xcpus_changed = !cpumask_equal(cur->exclusive_cpus, trial->exclusive_cpus);
	cpuset_for_each_child(c, css, par) {
		if (c == cur)
			continue;
-		if (cpus_excl_conflict(trial, c))
+		if (cpus_excl_conflict(trial, c, xcpus_changed))
			goto out;
		if (mems_excl_conflict(trial, c))
			goto out;
@@ -738,49 +727,6 @@ out:
 }
 
 #ifdef CONFIG_SMP
-/*
- * Helper routine for generate_sched_domains().
- * Do cpusets a, b have overlapping effective cpus_allowed masks?
- */
-static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
-{
-	return cpumask_intersects(a->effective_cpus, b->effective_cpus);
-}
-
-static void
-update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
-{
-	if (dattr->relax_domain_level < c->relax_domain_level)
-		dattr->relax_domain_level = c->relax_domain_level;
-	return;
-}
-
-static void update_domain_attr_tree(struct sched_domain_attr *dattr,
-				    struct cpuset *root_cs)
-{
-	struct cpuset *cp;
-	struct cgroup_subsys_state *pos_css;
-
-	rcu_read_lock();
-	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
-		/* skip the whole subtree if @cp doesn't have any CPU */
-		if (cpumask_empty(cp->cpus_allowed)) {
-			pos_css = css_rightmost_descendant(pos_css);
-			continue;
-		}
-
-		if (is_sched_load_balance(cp))
-			update_domain_attr(dattr, cp);
-	}
-	rcu_read_unlock();
-}
-
-/* Must be called with cpuset_mutex held. */
-static inline int nr_cpusets(void)
-{
-	/* jump label reference count + the top-level cpuset */
-	return static_key_count(&cpusets_enabled_key.key) + 1;
-}
 
 /*
  * generate_sched_domains()
@@ -820,103 +766,46 @@ static inline int nr_cpusets(void)
  * convenient format, that can be easily compared to the prior
  * value to determine what partition elements (sched domains)
  * were changed (added or removed.)
- *
- * Finding the best partition (set of domains):
- *	The double nested loops below over i, j scan over the load
- *	balanced cpusets (using the array of cpuset pointers in csa[])
- *	looking for pairs of cpusets that have overlapping cpus_allowed
- *	and merging them using a union-find algorithm.
- *
- *	The union of the cpus_allowed masks from the set of all cpusets
- *	having the same root then form the one element of the partition
- *	(one sched domain) to be passed to partition_sched_domains().
- *
  */
 static int generate_sched_domains(cpumask_var_t **domains,
			struct sched_domain_attr **attributes)
 {
	struct cpuset *cp;	/* top-down scan of cpusets */
	struct cpuset **csa;	/* array of all cpuset ptrs */
-	int csn;		/* how many cpuset ptrs in csa so far */
	int i, j;		/* indices for partition finding loops */
	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
	struct sched_domain_attr *dattr;  /* attributes for custom domains */
	int ndoms = 0;		/* number of sched domains in result */
-	int nslot;		/* next empty doms[] struct cpumask slot */
	struct cgroup_subsys_state *pos_css;
-	bool root_load_balance = is_sched_load_balance(&top_cpuset);
-	bool cgrpv2 = cpuset_v2();
-	int nslot_update;
+
+	if (!cpuset_v2())
+		return cpuset1_generate_sched_domains(domains, attributes);
 
	doms = NULL;
	dattr = NULL;
	csa = NULL;
 
	/* Special case for the 99% of systems with one, full, sched domain */
-	if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
-single_root_domain:
+	if (cpumask_empty(subpartitions_cpus)) {
		ndoms = 1;
-		doms = alloc_sched_domains(ndoms);
-		if (!doms)
-			goto done;
-
-		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
-		if (dattr) {
-			*dattr = SD_ATTR_INIT;
-			update_domain_attr_tree(dattr, &top_cpuset);
-		}
-		cpumask_and(doms[0], top_cpuset.effective_cpus,
-			    housekeeping_cpumask(HK_TYPE_DOMAIN));
-
-		goto done;
+		/* !csa will be checked and can be correctly handled */
+		goto generate_doms;
	}
 
	csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
	if (!csa)
		goto done;
-	csn = 0;
 
+	/* Find how many partitions and cache them to csa[] */
	rcu_read_lock();
-	if (root_load_balance)
-		csa[csn++] = &top_cpuset;
	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
-		if (cp == &top_cpuset)
-			continue;
-
-		if (cgrpv2)
-			goto v2;
-
-		/*
-		 * v1:
-		 * Continue traversing beyond @cp iff @cp has some CPUs and
-		 * isn't load balancing.  The former is obvious.  The
-		 * latter:  All child cpusets contain a subset of the
-		 * parent's cpus, so just skip them, and then we call
-		 * update_domain_attr_tree() to calc relax_domain_level of
-		 * the corresponding sched domain.
-		 */
-		if (!cpumask_empty(cp->cpus_allowed) &&
-		    !(is_sched_load_balance(cp) &&
-		      cpumask_intersects(cp->cpus_allowed,
					 housekeeping_cpumask(HK_TYPE_DOMAIN))))
-			continue;
-
-		if (is_sched_load_balance(cp) &&
-		    !cpumask_empty(cp->effective_cpus))
-			csa[csn++] = cp;
-
-		/* skip @cp's subtree */
-		pos_css = css_rightmost_descendant(pos_css);
-		continue;
-
-v2:
		/*
		 * Only valid partition roots that are not isolated and with
-		 * non-empty effective_cpus will be saved into csn[].
+		 * non-empty effective_cpus will be saved into csa[].
		 */
		if ((cp->partition_root_state == PRS_ROOT) &&
		    !cpumask_empty(cp->effective_cpus))
-			csa[csn++] = cp;
+			csa[ndoms++] = cp;
 
		/*
		 * Skip @cp's subtree if not a partition root and has no
@@ -927,40 +816,18 @@ v2:
	}
	rcu_read_unlock();
 
-	/*
-	 * If there are only isolated partitions underneath the cgroup root,
-	 * we can optimize out unneeded sched domains scanning.
-	 */
-	if (root_load_balance && (csn == 1))
-		goto single_root_domain;
-
-	for (i = 0; i < csn; i++)
-		uf_node_init(&csa[i]->node);
-
-	/* Merge overlapping cpusets */
-	for (i = 0; i < csn; i++) {
-		for (j = i + 1; j < csn; j++) {
-			if (cpusets_overlap(csa[i], csa[j])) {
+	for (i = 0; i < ndoms; i++) {
+		for (j = i + 1; j < ndoms; j++) {
+			if (cpusets_overlap(csa[i], csa[j]))
				/*
				 * Cgroup v2 shouldn't pass down overlapping
				 * partition root cpusets.
				 */
-				WARN_ON_ONCE(cgrpv2);
-				uf_union(&csa[i]->node, &csa[j]->node);
-			}
+				WARN_ON_ONCE(1);
		}
	}
 
-	/* Count the total number of domains */
-	for (i = 0; i < csn; i++) {
-		if (uf_find(&csa[i]->node) == &csa[i]->node)
-			ndoms++;
-	}
-
-	/*
-	 * Now we know how many domains to create.
-	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
-	 */
+generate_doms:
	doms = alloc_sched_domains(ndoms);
	if (!doms)
		goto done;
@@ -977,45 +844,19 @@ v2:
	 * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
	 * subset of HK_TYPE_DOMAIN housekeeping CPUs.
	 */
-	if (cgrpv2) {
-		for (i = 0; i < ndoms; i++) {
-			/*
-			 * The top cpuset may contain some boot time isolated
-			 * CPUs that need to be excluded from the sched domain.
-			 */
-			if (csa[i] == &top_cpuset)
-				cpumask_and(doms[i], csa[i]->effective_cpus,
					    housekeeping_cpumask(HK_TYPE_DOMAIN));
-			else
-				cpumask_copy(doms[i], csa[i]->effective_cpus);
-			if (dattr)
-				dattr[i] = SD_ATTR_INIT;
-		}
-		goto done;
-	}
-
-	for (nslot = 0, i = 0; i < csn; i++) {
-		nslot_update = 0;
-		for (j = i; j < csn; j++) {
-			if (uf_find(&csa[j]->node) == &csa[i]->node) {
-				struct cpumask *dp = doms[nslot];
-
-				if (i == j) {
-					nslot_update = 1;
-					cpumask_clear(dp);
-					if (dattr)
-						*(dattr + nslot) = SD_ATTR_INIT;
-				}
-				cpumask_or(dp, dp, csa[j]->effective_cpus);
-				cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
-				if (dattr)
-					update_domain_attr_tree(dattr + nslot, csa[j]);
-			}
-		}
-		if (nslot_update)
-			nslot++;
+	for (i = 0; i < ndoms; i++) {
+		/*
+		 * The top cpuset may contain some boot time isolated
+		 * CPUs that need to be excluded from the sched domain.
+		 */
+		if (!csa || csa[i] == &top_cpuset)
+			cpumask_and(doms[i], top_cpuset.effective_cpus,
+				    housekeeping_cpumask(HK_TYPE_DOMAIN));
+		else
+			cpumask_copy(doms[i], csa[i]->effective_cpus);
+		if (dattr)
+			dattr[i] = SD_ATTR_INIT;
	}
-	BUG_ON(nslot != ndoms);
 
 done:
	kfree(csa);
@@ -1055,7 +896,7 @@ void dl_rebuild_rd_accounting(void)
	int cpu;
	u64 cookie = ++dl_cookie;
 
-	lockdep_assert_held(&cpuset_mutex);
+	lockdep_assert_cpuset_lock_held();
	lockdep_assert_cpus_held();
	lockdep_assert_held(&sched_domains_mutex);
 
@@ -1100,53 +941,33 @@
  */
 void rebuild_sched_domains_locked(void)
 {
-	struct cgroup_subsys_state *pos_css;
	struct sched_domain_attr *attr;
	cpumask_var_t *doms;
-	struct cpuset *cs;
	int ndoms;
+	int i;
 
	lockdep_assert_cpus_held();
-	lockdep_assert_held(&cpuset_mutex);
+	lockdep_assert_cpuset_lock_held();
	force_sd_rebuild = false;
 
-	/*
-	 * If we have raced with CPU hotplug, return early to avoid
-	 * passing doms with offlined cpu to partition_sched_domains().
-	 * Anyways, cpuset_handle_hotplug() will rebuild sched domains.
-	 *
-	 * With no CPUs in any subpartitions, top_cpuset's effective CPUs
-	 * should be the same as the active CPUs, so checking only top_cpuset
-	 * is enough to detect racing CPU offlines.
-	 */
-	if (cpumask_empty(subpartitions_cpus) &&
-	    !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
-		return;
+	/* Generate domain masks and attrs */
+	ndoms = generate_sched_domains(&doms, &attr);
 
	/*
-	 * With subpartition CPUs, however, the effective CPUs of a partition
-	 * root should be only a subset of the active CPUs.  Since a CPU in any
-	 * partition root could be offlined, all must be checked.
-	 */
-	if (!cpumask_empty(subpartitions_cpus)) {
-		rcu_read_lock();
-		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
-			if (!is_partition_valid(cs)) {
-				pos_css = css_rightmost_descendant(pos_css);
-				continue;
-			}
-			if (!cpumask_subset(cs->effective_cpus,
-					    cpu_active_mask)) {
-				rcu_read_unlock();
-				return;
-			}
-		}
-		rcu_read_unlock();
+	 * cpuset_hotplug_workfn is invoked synchronously now, thus this
+	 * function should not race with CPU hotplug. And the effective CPUs
+	 * must not include any offline CPUs. Passing an offline CPU in the
+	 * doms to partition_sched_domains() will trigger a kernel panic.
+	 *
+	 * We perform a final check here: if the doms contains any
+	 * offline CPUs, a warning is emitted and we return directly to
+	 * prevent the panic.
+	 */
+	for (i = 0; i < ndoms; ++i) {
+		if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask)))
+			return;
	}
 
-	/* Generate domain masks and attrs */
-	ndoms = generate_sched_domains(&doms, &attr);
-
	/* Have scheduler rebuild the domains */
	partition_sched_domains(ndoms, doms, attr);
 }
@@ -1501,23 +1322,29 @@ static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs,
	int retval = 0;
 
	if (cpumask_empty(excpus))
-		return retval;
+		return 0;
 
	/*
-	 * Exclude exclusive CPUs from siblings
+	 * Remove exclusive CPUs from siblings
	 */
	rcu_read_lock();
	cpuset_for_each_child(sibling, css, parent) {
+		struct cpumask *sibling_xcpus;
+
		if (sibling == cs)
			continue;
 
-		if (cpumask_intersects(excpus, sibling->exclusive_cpus)) {
-			cpumask_andnot(excpus, excpus, sibling->exclusive_cpus);
-			retval++;
-			continue;
-		}
-		if (cpumask_intersects(excpus, sibling->effective_xcpus)) {
-			cpumask_andnot(excpus, excpus, sibling->effective_xcpus);
+		/*
+		 * If exclusive_cpus is defined, effective_xcpus will always
+		 * be a subset. Otherwise, effective_xcpus will only be set
+		 * in a valid partition root.
+		 */
+		sibling_xcpus = cpumask_empty(sibling->exclusive_cpus)
+			      ? sibling->effective_xcpus
+			      : sibling->exclusive_cpus;
+
+		if (cpumask_intersects(excpus, sibling_xcpus)) {
+			cpumask_andnot(excpus, excpus, sibling_xcpus);
			retval++;
		}
	}
@@ -1806,7 +1633,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
	int parent_prs = parent->partition_root_state;
	bool nocpu;
 
-	lockdep_assert_held(&cpuset_mutex);
+	lockdep_assert_cpuset_lock_held();
	WARN_ON_ONCE(is_remote_partition(cs));	/* For local partition only */
 
	/*
@@ -2315,17 +2142,13 @@ get_css:
		spin_lock_irq(&callback_lock);
		cpumask_copy(cp->effective_cpus, tmp->new_cpus);
		cp->partition_root_state = new_prs;
-		if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs))
-			compute_excpus(cp, cp->effective_xcpus);
-
		/*
-		 * Make sure effective_xcpus is properly set for a valid
-		 * partition root.
+		 * Need to compute effective_xcpus if either exclusive_cpus
+		 * is non-empty or it is a valid partition root.
		 */
-		if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus))
-			cpumask_and(cp->effective_xcpus,
-				    cp->cpus_allowed, parent->effective_xcpus);
-		else if (new_prs < 0)
+		if ((new_prs > 0) || !cpumask_empty(cp->exclusive_cpus))
+			compute_excpus(cp, cp->effective_xcpus);
+		if (new_prs <= 0)
			reset_partition_data(cp);
		spin_unlock_irq(&callback_lock);
 
@@ -2378,7 +2201,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
	struct cpuset *sibling;
	struct cgroup_subsys_state *pos_css;
 
-	lockdep_assert_held(&cpuset_mutex);
+	lockdep_assert_cpuset_lock_held();
 
	/*
	 * Check all its siblings and call update_cpumasks_hier()
@@ -2387,27 +2210,20 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
	 * It is possible a change in parent's effective_cpus
	 * due to a change in a child partition's effective_xcpus will impact
	 * its siblings even if they do not inherit parent's effective_cpus
-	 * directly.
+	 * directly. It should not impact valid partition.
	 *
	 * The update_cpumasks_hier() function may sleep. So we have to
	 * release the RCU read lock before calling it.
	 */
	rcu_read_lock();
	cpuset_for_each_child(sibling, pos_css, parent) {
-		if (sibling == cs)
+		if (sibling == cs || is_partition_valid(sibling))
			continue;
-		if (!is_partition_valid(sibling)) {
-			compute_effective_cpumask(tmp->new_cpus, sibling,
-						  parent);
-			if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
-				continue;
-		} else if (is_remote_partition(sibling)) {
-			/*
-			 * Change in a sibling cpuset won't affect a remote
-			 * partition root.
-			 */
+
+		compute_effective_cpumask(tmp->new_cpus, sibling,
+					  parent);
+		if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
			continue;
-		}
 
		if (!css_tryget_online(&sibling->css))
			continue;
@@ -2463,43 +2279,6 @@ static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *tri
	return PERR_NONE;
 }
 
-static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs,
-					struct tmpmasks *tmp)
-{
-	int retval;
-	struct cpuset *parent = parent_cs(cs);
-
-	retval = validate_change(cs, trialcs);
-
-	if ((retval == -EINVAL) && cpuset_v2()) {
-		struct cgroup_subsys_state *css;
-		struct cpuset *cp;
-
-		/*
-		 * The -EINVAL error code indicates that partition sibling
-		 * CPU exclusivity rule has been violated. We still allow
-		 * the cpumask change to proceed while invalidating the
-		 * partition. However, any conflicting sibling partitions
-		 * have to be marked as invalid too.
-		 */
-		trialcs->prs_err = PERR_NOTEXCL;
-		rcu_read_lock();
-		cpuset_for_each_child(cp, css, parent) {
-			struct cpumask *xcpus = user_xcpus(trialcs);
-
-			if (is_partition_valid(cp) &&
-			    cpumask_intersects(xcpus, cp->effective_xcpus)) {
-				rcu_read_unlock();
-				update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp);
-				rcu_read_lock();
-			}
-		}
-		rcu_read_unlock();
-		retval = 0;
-	}
-	return retval;
-}
-
 /**
  * partition_cpus_change - Handle partition state changes due to CPU mask updates
  * @cs: The target cpuset being modified
@@ -2559,15 +2338,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
		return 0;
 
-	if (alloc_tmpmasks(&tmp))
-		return -ENOMEM;
-
	compute_trialcs_excpus(trialcs, cs);
	trialcs->prs_err = PERR_NONE;
 
-	retval = cpus_allowed_validate_change(cs, trialcs, &tmp);
+	retval = validate_change(cs, trialcs);
	if (retval < 0)
-		goto out_free;
+		return retval;
+
+	if (alloc_tmpmasks(&tmp))
+		return -ENOMEM;
 
	/*
	 * Check all the descendants in update_cpumasks_hier() if
@@ -2590,7 +2369,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
	/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
	if (cs->partition_root_state)
		update_partition_sd_lb(cs, old_prs);
-out_free:
+
	free_tmpmasks(&tmp);
	return retval;
 }
@@ -3249,7 +3028,7 @@ static nodemask_t cpuset_attach_nodemask_to;
 
 static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
 {
-	lockdep_assert_held(&cpuset_mutex);
+	lockdep_assert_cpuset_lock_held();
 
	if (cs != &top_cpuset)
		guarantee_active_cpus(task, cpus_attach);
@@ -3605,8 +3384,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
		return ERR_PTR(-ENOMEM);
 
	__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
-	fmeter_init(&cs->fmeter);
-	cs->relax_domain_level = -1;
+	cpuset1_init(cs);
 
	/* Set CS_MEMORY_MIGRATE for default hierarchy */
	if (cpuset_v2())
@@ -3619,17 +3397,11 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 {
	struct cpuset *cs = css_cs(css);
	struct cpuset *parent = parent_cs(cs);
-	struct cpuset *tmp_cs;
-	struct cgroup_subsys_state *pos_css;
 
	if (!parent)
		return 0;
 
	cpuset_full_lock();
-	if (is_spread_page(parent))
-		set_bit(CS_SPREAD_PAGE, &cs->flags);
-	if (is_spread_slab(parent))
-		set_bit(CS_SPREAD_SLAB, &cs->flags);
	/*
	 * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
	 */
@@ -3644,39 +3416,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
		cs->effective_mems = parent->effective_mems;
	}
	spin_unlock_irq(&callback_lock);
+	cpuset1_online_css(css);
 
-	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
-		goto out_unlock;
-
-	/*
-	 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
-	 * set.  This flag handling is implemented in cgroup core for
-	 * historical reasons - the flag may be specified during mount.
-	 *
-	 * Currently, if any sibling cpusets have exclusive cpus or mem, we
-	 * refuse to clone the configuration - thereby refusing the task to
-	 * be entered, and as a result refusing the sys_unshare() or
-	 * clone() which initiated it.  If this becomes a problem for some
-	 * users who wish to allow that scenario, then this could be
-	 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
-	 * (and likewise for mems) to the new cgroup.
-	 */
-	rcu_read_lock();
-	cpuset_for_each_child(tmp_cs, pos_css, parent) {
-		if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
-			rcu_read_unlock();
-			goto out_unlock;
-		}
-	}
-	rcu_read_unlock();
-
-	spin_lock_irq(&callback_lock);
-	cs->mems_allowed = parent->mems_allowed;
-	cs->effective_mems = parent->mems_allowed;
-	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
-	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
-	spin_unlock_irq(&callback_lock);
-out_unlock:
	cpuset_full_unlock();
	return 0;
 }
@@ -3876,7 +3617,7 @@ int __init cpuset_init(void)
	cpumask_setall(top_cpuset.exclusive_cpus);
	nodes_setall(top_cpuset.effective_mems);
 
-	fmeter_init(&top_cpuset.fmeter);
+	cpuset1_init(&top_cpuset);
 
	BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
 
@@ -4210,7 +3951,7 @@ static void __cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask
  */
 void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
 {
-	lockdep_assert_held(&cpuset_mutex);
+	lockdep_assert_cpuset_lock_held();
	__cpuset_cpus_allowed_locked(tsk, pmask);
 }
 
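To make the reworked rm_siblings_excl_cpus() above concrete: each sibling now contributes exactly one mask (exclusive_cpus when non-empty, effective_xcpus otherwise), and those CPUs are stripped from the requesting cpuset's candidate exclusive set. The toy program below models that with 64-bit masks standing in for kernel cpumasks; the mask values are made up for illustration.

    #include <stdint.h>
    #include <stdio.h>

    /* Toy model of the sibling-exclusion step: clear every sibling's
     * exclusive CPUs out of a candidate mask, as cpumask_andnot() does
     * in rm_siblings_excl_cpus(). */
    static uint64_t rm_siblings_excl(uint64_t candidate,
                                     const uint64_t *sibling_xcpus, int n)
    {
            for (int i = 0; i < n; i++)
                    candidate &= ~sibling_xcpus[i];
            return candidate;
    }

    int main(void)
    {
            uint64_t candidate = 0x0f;             /* wants CPUs 0-3 */
            uint64_t siblings[] = { 0x03, 0x10 };  /* siblings hold 0-1 and 4 */

            /* Prints 0xc: only CPUs 2-3 survive as the effective
             * exclusive set, without invalidating any sibling. */
            printf("%#llx\n",
                   (unsigned long long)rm_siblings_excl(candidate, siblings, 2));
            return 0;
    }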
