 Documentation/arch/arm64/asymmetric-32bit.rst |  12
 Documentation/core-api/housekeeping.rst       | 111
 Documentation/core-api/index.rst              |   1
 arch/arm64/kernel/cpufeature.c                |   6
 block/blk-mq.c                                |   6
 drivers/base/cpu.c                            |   2
 drivers/pci/pci-driver.c                      |  71
 include/linux/cpuhplock.h                     |   1
 include/linux/cpuset.h                        |   8
 include/linux/kthread.h                       |   1
 include/linux/memcontrol.h                    |   4
 include/linux/mmu_context.h                   |   2
 include/linux/pci.h                           |   3
 include/linux/percpu-rwsem.h                  |   1
 include/linux/sched/isolation.h               |  16
 include/linux/vmstat.h                        |   2
 include/linux/workqueue.h                     |   2
 init/Kconfig                                  |   1
 kernel/cgroup/cpuset.c                        |  53
 kernel/cpu.c                                  |  42
 kernel/kthread.c                              | 160
 kernel/sched/isolation.c                      | 145
 kernel/sched/sched.h                          |   4
 kernel/time/timer_migration.c                 |  25
 kernel/workqueue.c                            |  17
 mm/memcontrol.c                               |  31
 mm/vmstat.c                                   |  15
 net/core/net-sysfs.c                          |   2
 28 files changed, 538 insertions(+), 206 deletions(-)
diff --git a/Documentation/arch/arm64/asymmetric-32bit.rst b/Documentation/arch/arm64/asymmetric-32bit.rst
index 57b8d7476f71..fc0c350c5e00 100644
--- a/Documentation/arch/arm64/asymmetric-32bit.rst
+++ b/Documentation/arch/arm64/asymmetric-32bit.rst
@@ -154,10 +154,14 @@ mode will return to host userspace with an ``exit_reason`` of
``KVM_EXIT_FAIL_ENTRY`` and will remain non-runnable until successfully
re-initialised by a subsequent ``KVM_ARM_VCPU_INIT`` operation.
-NOHZ FULL
----------
+SCHEDULER DOMAIN ISOLATION
+--------------------------
-To avoid perturbing an adaptive-ticks CPU (specified using
-``nohz_full=``) when a 32-bit task is forcefully migrated, these CPUs
+To avoid perturbing a boot-defined domain isolated CPU (specified using
+``isolcpus=[domain]``) when a 32-bit task is forcefully migrated, these CPUs
are treated as 64-bit-only when support for asymmetric 32-bit systems
is enabled.
+
+However, as opposed to boot-defined domain isolation, runtime-defined domain
+isolation using cpuset isolated partitions is not advised on asymmetric
+32-bit systems and will result in undefined behaviour.
diff --git a/Documentation/core-api/housekeeping.rst b/Documentation/core-api/housekeeping.rst
new file mode 100644
index 000000000000..e5417302774c
--- /dev/null
+++ b/Documentation/core-api/housekeeping.rst
@@ -0,0 +1,111 @@
+======================================
+Housekeeping
+======================================
+
+
+CPU isolation moves away kernel work that may otherwise run on any CPU.
+The purpose of its related features is to reduce the OS jitter that some
+extreme workloads can't tolerate, such as in some DPDK use cases.
+
+The kernel work moved away by CPU isolation is commonly described as
+"housekeeping" because it includes groundwork that performs cleanups,
+statistics maintenance and actions relying on them, memory release,
+various deferrals, etc.
+
+Sometimes housekeeping is just some unbound work (unbound workqueues,
+unbound timers, ...) that gets easily assigned to non-isolated CPUs.
+But sometimes housekeeping is tied to a specific CPU and requires
+elaborate tricks to be offloaded to non-isolated CPUs (RCU_NOCB, remote
+scheduler tick, etc.).
+
+Thus, a housekeeping CPU can be considered the reverse of an isolated
+CPU. It is simply a CPU that can execute housekeeping work. There must
+be at least one online housekeeping CPU at all times. The CPUs that
+are not isolated are automatically assigned as housekeeping.
+
+Housekeeping is currently divided into four features described
+by ``enum hk_type``:
+
+1. HK_TYPE_DOMAIN matches the work moved away by scheduler domain
+ isolation performed through ``isolcpus=domain`` boot parameter or
+ isolated cpuset partitions in cgroup v2. This includes scheduler
+ load balancing, unbound workqueues and timers.
+
+2. HK_TYPE_KERNEL_NOISE matches the work moved away by tick isolation
+ performed through ``nohz_full=`` or ``isolcpus=nohz`` boot
+ parameters. This includes remote scheduler tick, vmstat and lockup
+ watchdog.
+
+3. HK_TYPE_MANAGED_IRQ matches the IRQ handlers moved away by managed
+ IRQ isolation performed through ``isolcpus=managed_irq``.
+
+4. HK_TYPE_DOMAIN_BOOT matches the work moved away by scheduler domain
+ isolation performed through ``isolcpus=domain`` only. It is similar
+ to HK_TYPE_DOMAIN except it ignores the isolation performed by
+ cpusets.
+
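+For instance, a caller that only needs to know whether a given isolation
+type is active at all can use ``housekeeping_enabled()``. A minimal,
+illustrative check (not taken from an in-tree user)::
+
+	if (housekeeping_enabled(HK_TYPE_DOMAIN))
+		pr_debug("Domain isolation is in effect\n");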
+
+Housekeeping cpumasks
+=================================
+
+Housekeeping cpumasks include the CPUs that can execute the work moved
+away by the matching isolation feature. These cpumasks are returned by
+the following function::
+
+ const struct cpumask *housekeeping_cpumask(enum hk_type type)
+
+By default, if neither ``nohz_full=``, nor ``isolcpus``, nor cpuset's
+isolated partitions are used, which covers most use cases, this function
+returns ``cpu_possible_mask``.
+
+Otherwise the function returns the cpumask complement of the isolation
+feature. For example:
+
+With ``isolcpus=domain,7`` the following will return a mask with all
+possible CPUs except 7::
+
+ housekeeping_cpumask(HK_TYPE_DOMAIN)
+
+Similarly with ``nohz_full=5,6`` the following will return a mask with all
+possible CPUs except 5 and 6::
+
+ housekeeping_cpumask(HK_TYPE_KERNEL_NOISE)
+
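+The ``cpu_is_isolated()`` helper tests the opposite condition for a single
+CPU, i.e. whether it is excluded from HK_TYPE_DOMAIN housekeeping. A
+minimal, illustrative caller sketch::
+
+	if (!cpu_is_isolated(cpu))
+		queue_work_on(cpu, example_workqueue, work);
+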
+
+Synchronization against cpusets
+=================================
+
+Cpuset can modify the HK_TYPE_DOMAIN housekeeping cpumask while creating,
+modifying or deleting an isolated partition.
+
+The users of the HK_TYPE_DOMAIN cpumask must then synchronize properly
+against cpuset in order to ensure that:
+
+1. The cpumask snapshot stays coherent.
+
+2. No housekeeping work is queued on a newly isolated CPU.
+
+3. Pending housekeeping work that was queued to a non-isolated
+   CPU which has just been made isolated through cpuset must be flushed
+   before the related created/modified isolated partition is made
+   available to userspace.
+
+This synchronization is maintained by an RCU-based scheme. The cpuset update
+side waits for an RCU grace period after updating the HK_TYPE_DOMAIN
+cpumask and before flushing pending work. On the read side, care must be
+taken to gather the housekeeping target election and the work enqueue within
+the same RCU read side critical section.
+
+A typical layout example would look like this on the update side
+(``housekeeping_update()``)::
+
+ rcu_assign_pointer(housekeeping_cpumasks[type], trial);
+ synchronize_rcu();
+ flush_workqueue(example_workqueue);
+
+And then on the read side::
+
+ rcu_read_lock();
+ cpu = housekeeping_any_cpu(HK_TYPE_DOMAIN);
+ queue_work_on(cpu, example_workqueue, work);
+ rcu_read_unlock();
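+
+The read side can equivalently use the ``guard(rcu)()`` cleanup helper, as
+in this sketch mirroring the memcg stock draining code updated by this
+series::
+
+	static void schedule_drain_work(int cpu, struct work_struct *work)
+	{
+		guard(rcu)();
+		if (!cpu_is_isolated(cpu))
+			queue_work_on(cpu, memcg_wq, work);
+	}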
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index 5eb0fbbbc323..79fe7735692e 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -25,6 +25,7 @@ it.
symbol-namespaces
asm-annotations
real-time/index
+   housekeeping
Data structures and low-level utilities
=======================================
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index c840a93b9ef9..f0e66cb27d17 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1669,7 +1669,7 @@ const struct cpumask *system_32bit_el0_cpumask(void)
const struct cpumask *task_cpu_fallback_mask(struct task_struct *p)
{
- return __task_cpu_possible_mask(p, housekeeping_cpumask(HK_TYPE_TICK));
+ return __task_cpu_possible_mask(p, housekeeping_cpumask(HK_TYPE_DOMAIN));
}
static int __init parse_32bit_el0_param(char *str)
@@ -3987,8 +3987,8 @@ static int enable_mismatched_32bit_el0(unsigned int cpu)
bool cpu_32bit = false;
if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) {
- if (!housekeeping_cpu(cpu, HK_TYPE_TICK))
- pr_info("Treating adaptive-ticks CPU %u as 64-bit only\n", cpu);
+ if (!housekeeping_cpu(cpu, HK_TYPE_DOMAIN))
+ pr_info("Treating domain isolated CPU %u as 64-bit only\n", cpu);
else
cpu_32bit = true;
}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5ae83d3e210f..8d052924b0c8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4270,12 +4270,16 @@ static void blk_mq_map_swqueue(struct request_queue *q)
/*
* Rule out isolated CPUs from hctx->cpumask to avoid
- * running block kworker on isolated CPUs
+ * running block kworker on isolated CPUs.
+ * FIXME: cpuset should propagate further changes to isolated CPUs
+ * here.
*/
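+	/* cpu_is_isolated() reads the RCU protected housekeeping cpumask */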
+ rcu_read_lock();
for_each_cpu(cpu, hctx->cpumask) {
if (cpu_is_isolated(cpu))
cpumask_clear_cpu(cpu, hctx->cpumask);
}
+ rcu_read_unlock();
/*
* Initialize batch roundrobin counts
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index c6c57b6f61c6..3e3fa031e605 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -291,7 +291,7 @@ static ssize_t print_cpus_isolated(struct device *dev,
return -ENOMEM;
cpumask_andnot(isolated, cpu_possible_mask,
- housekeeping_cpumask(HK_TYPE_DOMAIN));
+ housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT));
len = sysfs_emit(buf, "%*pbl\n", cpumask_pr_args(isolated));
free_cpumask_var(isolated);
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 7c2d9d596258..a9590601835a 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -302,9 +302,8 @@ struct drv_dev_and_id {
const struct pci_device_id *id;
};
-static long local_pci_probe(void *_ddi)
+static int local_pci_probe(struct drv_dev_and_id *ddi)
{
- struct drv_dev_and_id *ddi = _ddi;
struct pci_dev *pci_dev = ddi->dev;
struct pci_driver *pci_drv = ddi->drv;
struct device *dev = &pci_dev->dev;
@@ -338,6 +337,21 @@ static long local_pci_probe(void *_ddi)
return 0;
}
+static struct workqueue_struct *pci_probe_wq;
+
+struct pci_probe_arg {
+ struct drv_dev_and_id *ddi;
+ struct work_struct work;
+ int ret;
+};
+
+static void local_pci_probe_callback(struct work_struct *work)
+{
+ struct pci_probe_arg *arg = container_of(work, struct pci_probe_arg, work);
+
+ arg->ret = local_pci_probe(arg->ddi);
+}
+
static bool pci_physfn_is_probed(struct pci_dev *dev)
{
#ifdef CONFIG_PCI_IOV
@@ -362,40 +376,55 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev,
dev->is_probed = 1;
cpu_hotplug_disable();
-
/*
* Prevent nesting work_on_cpu() for the case where a Virtual Function
* device is probed from work_on_cpu() of the Physical device.
*/
if (node < 0 || node >= MAX_NUMNODES || !node_online(node) ||
pci_physfn_is_probed(dev)) {
- cpu = nr_cpu_ids;
+ error = local_pci_probe(&ddi);
} else {
- cpumask_var_t wq_domain_mask;
+ struct pci_probe_arg arg = { .ddi = &ddi };
- if (!zalloc_cpumask_var(&wq_domain_mask, GFP_KERNEL)) {
- error = -ENOMEM;
- goto out;
+ INIT_WORK_ONSTACK(&arg.work, local_pci_probe_callback);
+ /*
+ * The target election and the enqueue of the work must be within
+ * the same RCU read side section so that when the workqueue pool
+ * is flushed after a housekeeping cpumask update, further readers
+ * are guaranteed to queue the probing work to the appropriate
+ * targets.
+ */
+ rcu_read_lock();
+ cpu = cpumask_any_and(cpumask_of_node(node),
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
+
+ if (cpu < nr_cpu_ids) {
+ struct workqueue_struct *wq = pci_probe_wq;
+
+ if (WARN_ON_ONCE(!wq))
+ wq = system_percpu_wq;
+ queue_work_on(cpu, wq, &arg.work);
+ rcu_read_unlock();
+ flush_work(&arg.work);
+ error = arg.ret;
+ } else {
+ rcu_read_unlock();
+ error = local_pci_probe(&ddi);
}
- cpumask_and(wq_domain_mask,
- housekeeping_cpumask(HK_TYPE_WQ),
- housekeeping_cpumask(HK_TYPE_DOMAIN));
- cpu = cpumask_any_and(cpumask_of_node(node),
- wq_domain_mask);
- free_cpumask_var(wq_domain_mask);
+ destroy_work_on_stack(&arg.work);
}
- if (cpu < nr_cpu_ids)
- error = work_on_cpu(cpu, local_pci_probe, &ddi);
- else
- error = local_pci_probe(&ddi);
-out:
dev->is_probed = 0;
cpu_hotplug_enable();
return error;
}
+void pci_probe_flush_workqueue(void)
+{
+ flush_workqueue(pci_probe_wq);
+}
+
/**
* __pci_device_probe - check if a driver wants to claim a specific PCI device
* @drv: driver to call to check if it wants the PCI device
@@ -1733,6 +1762,10 @@ static int __init pci_driver_init(void)
{
int ret;
+	pci_probe_wq = alloc_workqueue("pci_probe_wq", WQ_PERCPU, 0);
+ if (!pci_probe_wq)
+ return -ENOMEM;
+
ret = bus_register(&pci_bus_type);
if (ret)
return ret;
diff --git a/include/linux/cpuhplock.h b/include/linux/cpuhplock.h
index f7aa20f62b87..286b3ab92e15 100644
--- a/include/linux/cpuhplock.h
+++ b/include/linux/cpuhplock.h
@@ -13,6 +13,7 @@
struct device;
extern int lockdep_is_cpus_held(void);
+extern int lockdep_is_cpus_write_held(void);
#ifdef CONFIG_HOTPLUG_CPU
void cpus_write_lock(void);
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index a98d3330385c..a4aa2f1767d0 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -18,6 +18,8 @@
#include <linux/mmu_context.h>
#include <linux/jump_label.h>
+extern bool lockdep_is_cpuset_held(void);
+
#ifdef CONFIG_CPUSETS
/*
@@ -77,7 +79,6 @@ extern void cpuset_unlock(void);
extern void cpuset_cpus_allowed_locked(struct task_struct *p, struct cpumask *mask);
extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
extern bool cpuset_cpus_allowed_fallback(struct task_struct *p);
-extern bool cpuset_cpu_is_isolated(int cpu);
extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
#define cpuset_current_mems_allowed (current->mems_allowed)
void cpuset_init_current_mems_allowed(void);
@@ -213,11 +214,6 @@ static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p)
return false;
}
-static inline bool cpuset_cpu_is_isolated(int cpu)
-{
- return false;
-}
-
static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
{
return node_possible_map;
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 8d27403888ce..c92c1149ee6e 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -100,6 +100,7 @@ void kthread_unpark(struct task_struct *k);
void kthread_parkme(void);
void kthread_exit(long result) __noreturn;
void kthread_complete_and_exit(struct completion *, long) __noreturn;
+int kthreads_update_housekeeping(void);
int kthreadd(void *unused);
extern struct task_struct *kthreadd_task;
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0651865a4564..5b004b95648b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1037,6 +1037,8 @@ static inline u64 cgroup_id_from_mm(struct mm_struct *mm)
return id;
}
+void mem_cgroup_flush_workqueue(void);
+
extern int mem_cgroup_init(void);
#else /* CONFIG_MEMCG */
@@ -1436,6 +1438,8 @@ static inline u64 cgroup_id_from_mm(struct mm_struct *mm)
return 0;
}
+static inline void mem_cgroup_flush_workqueue(void) { }
+
static inline int mem_cgroup_init(void) { return 0; }
#endif /* CONFIG_MEMCG */
diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h
index ac01dc4eb2ce..ed3dd0f3fe19 100644
--- a/include/linux/mmu_context.h
+++ b/include/linux/mmu_context.h
@@ -24,7 +24,7 @@ static inline void leave_mm(void) { }
#ifndef task_cpu_possible_mask
# define task_cpu_possible_mask(p) cpu_possible_mask
# define task_cpu_possible(cpu, p) true
-# define task_cpu_fallback_mask(p) housekeeping_cpumask(HK_TYPE_TICK)
+# define task_cpu_fallback_mask(p) housekeeping_cpumask(HK_TYPE_DOMAIN)
#else
# define task_cpu_possible(cpu, p) cpumask_test_cpu((cpu), task_cpu_possible_mask(p))
#endif
diff --git a/include/linux/pci.h b/include/linux/pci.h
index b5cc0c2b9906..9357e9b00e1c 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1206,6 +1206,7 @@ struct pci_bus *pci_create_root_bus(struct device *parent, int bus,
struct pci_ops *ops, void *sysdata,
struct list_head *resources);
int pci_host_probe(struct pci_host_bridge *bridge);
+void pci_probe_flush_workqueue(void);
int pci_bus_insert_busn_res(struct pci_bus *b, int bus, int busmax);
int pci_bus_update_busn_res_end(struct pci_bus *b, int busmax);
void pci_bus_release_busn_res(struct pci_bus *b);
@@ -2079,6 +2080,8 @@ static inline int pci_has_flag(int flag) { return 0; }
_PCI_NOP_ALL(read, *)
_PCI_NOP_ALL(write,)
+static inline void pci_probe_flush_workqueue(void) { }
+
static inline struct pci_dev *pci_get_device(unsigned int vendor,
unsigned int device,
struct pci_dev *from)
diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 288f5235649a..c8cb010d655e 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -161,6 +161,7 @@ extern void percpu_free_rwsem(struct percpu_rw_semaphore *);
__percpu_init_rwsem(sem, #sem, &rwsem_key); \
})
+#define percpu_rwsem_is_write_held(sem) lockdep_is_held_type(sem, 0)
#define percpu_rwsem_is_held(sem) lockdep_is_held(sem)
#define percpu_rwsem_assert_held(sem) lockdep_assert_held(sem)
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index d8501f4709b5..dc3975ff1b2e 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -2,13 +2,21 @@
#define _LINUX_SCHED_ISOLATION_H
#include <linux/cpumask.h>
-#include <linux/cpuset.h>
#include <linux/init.h>
#include <linux/tick.h>
enum hk_type {
+ /* Inverse of boot-time isolcpus= argument */
+ HK_TYPE_DOMAIN_BOOT,
+	/*
+	 * Same as HK_TYPE_DOMAIN_BOOT but further restricted by cpuset
+	 * isolated partitions. As such it is always a subset of
+	 * HK_TYPE_DOMAIN_BOOT.
+	 */
HK_TYPE_DOMAIN,
+ /* Inverse of boot-time isolcpus=managed_irq argument */
HK_TYPE_MANAGED_IRQ,
+ /* Inverse of boot-time nohz_full= or isolcpus=nohz arguments */
HK_TYPE_KERNEL_NOISE,
HK_TYPE_MAX,
@@ -31,6 +39,7 @@ extern const struct cpumask *housekeeping_cpumask(enum hk_type type);
extern bool housekeeping_enabled(enum hk_type type);
extern void housekeeping_affine(struct task_struct *t, enum hk_type type);
extern bool housekeeping_test_cpu(int cpu, enum hk_type type);
+extern int housekeeping_update(struct cpumask *isol_mask);
extern void __init housekeeping_init(void);
#else
@@ -58,6 +67,7 @@ static inline bool housekeeping_test_cpu(int cpu, enum hk_type type)
return true;
}
+static inline int housekeeping_update(struct cpumask *isol_mask) { return 0; }
static inline void housekeeping_init(void) { }
#endif /* CONFIG_CPU_ISOLATION */
@@ -72,9 +82,7 @@ static inline bool housekeeping_cpu(int cpu, enum hk_type type)
static inline bool cpu_is_isolated(int cpu)
{
- return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) ||
- !housekeeping_test_cpu(cpu, HK_TYPE_TICK) ||
- cpuset_cpu_is_isolated(cpu);
+ return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN);
}
#endif /* _LINUX_SCHED_ISOLATION_H */
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 3398a345bda8..1909b945b3ea 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -303,6 +303,7 @@ int calculate_pressure_threshold(struct zone *zone);
int calculate_normal_threshold(struct zone *zone);
void set_pgdat_percpu_threshold(pg_data_t *pgdat,
int (*calculate_pressure)(struct zone *));
+void vmstat_flush_workqueue(void);
#else /* CONFIG_SMP */
/*
@@ -403,6 +404,7 @@ static inline void __dec_node_page_state(struct page *page,
static inline void refresh_zone_stat_thresholds(void) { }
static inline void cpu_vm_stats_fold(int cpu) { }
static inline void quiet_vmstat(void) { }
+static inline void vmstat_flush_workqueue(void) { }
static inline void drain_zonestat(struct zone *zone,
struct per_cpu_zonestat *pzstats) { }
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index dabc351cc127..a4749f56398f 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -588,7 +588,7 @@ struct workqueue_attrs *alloc_workqueue_attrs_noprof(void);
void free_workqueue_attrs(struct workqueue_attrs *attrs);
int apply_workqueue_attrs(struct workqueue_struct *wq,
const struct workqueue_attrs *attrs);
-extern int workqueue_unbound_exclude_cpumask(cpumask_var_t cpumask);
+extern int workqueue_unbound_housekeeping_update(const struct cpumask *hk);
extern bool queue_work_on(int cpu, struct workqueue_struct *wq,
struct work_struct *work);
diff --git a/init/Kconfig b/init/Kconfig
index 160c1c4ef253..fcc0785dc28c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1257,6 +1257,7 @@ config CPUSETS
bool "Cpuset controller"
depends on SMP
select UNION_FIND
+ select CPU_ISOLATION
help
This option will let you create and manage CPUSETs which
allow dynamically partitioning a system into sets of CPUs and
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index c06e2e96f79d..01a553caee56 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -26,7 +26,6 @@
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
-#include <linux/export.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/deadline.h>
@@ -86,12 +85,6 @@ static cpumask_var_t isolated_cpus;
static bool isolated_cpus_updating;
/*
- * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot
- */
-static cpumask_var_t boot_hk_cpus;
-static bool have_boot_isolcpus;
-
-/*
* A flag to force sched domain rebuild at the end of an operation.
* It can be set in
* - update_partition_sd_lb()
@@ -286,6 +279,13 @@ void cpuset_full_unlock(void)
cpus_read_unlock();
}
+#ifdef CONFIG_LOCKDEP
+bool lockdep_is_cpuset_held(void)
+{
+ return lockdep_is_held(&cpuset_mutex);
+}
+#endif
+
static DEFINE_SPINLOCK(callback_lock);
void cpuset_callback_lock_irq(void)
@@ -1205,11 +1205,10 @@ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
if (top_cs) {
/*
+ * PF_KTHREAD tasks are handled by housekeeping.
* PF_NO_SETAFFINITY tasks are ignored.
- * All per cpu kthreads should have PF_NO_SETAFFINITY
- * flag set, see kthread_set_per_cpu().
*/
- if (task->flags & PF_NO_SETAFFINITY)
+ if (task->flags & (PF_KTHREAD | PF_NO_SETAFFINITY))
continue;
cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);
} else {
@@ -1450,15 +1449,16 @@ static bool isolated_cpus_can_update(struct cpumask *add_cpus,
* @new_cpus: cpu mask
* Return: true if there is conflict, false otherwise
*
- * CPUs outside of boot_hk_cpus, if defined, can only be used in an
+ * CPUs outside of HK_TYPE_DOMAIN_BOOT, if defined, can only be used in an
* isolated partition.
*/
static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
{
- if (!have_boot_isolcpus)
+ if (!housekeeping_enabled(HK_TYPE_DOMAIN_BOOT))
return false;
- if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus))
+ if ((prstate != PRS_ISOLATED) &&
+ !cpumask_subset(new_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT)))
return true;
return false;
@@ -1477,29 +1477,13 @@ static void update_isolation_cpumasks(void)
if (!isolated_cpus_updating)
return;
- lockdep_assert_cpus_held();
-
- ret = workqueue_unbound_exclude_cpumask(isolated_cpus);
- WARN_ON_ONCE(ret < 0);
-
- ret = tmigr_isolated_exclude_cpumask(isolated_cpus);
+ ret = housekeeping_update(isolated_cpus);
WARN_ON_ONCE(ret < 0);
isolated_cpus_updating = false;
}
/**
- * cpuset_cpu_is_isolated - Check if the given CPU is isolated
- * @cpu: the CPU number to be checked
- * Return: true if CPU is used in an isolated partition, false otherwise
- */
-bool cpuset_cpu_is_isolated(int cpu)
-{
- return cpumask_test_cpu(cpu, isolated_cpus);
-}
-EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
-
-/**
* rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets
* @parent: Parent cpuset containing all siblings
* @cs: Current cpuset (will be skipped)
@@ -3896,12 +3880,9 @@ int __init cpuset_init(void)
BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
- have_boot_isolcpus = housekeeping_enabled(HK_TYPE_DOMAIN);
- if (have_boot_isolcpus) {
- BUG_ON(!alloc_cpumask_var(&boot_hk_cpus, GFP_KERNEL));
- cpumask_copy(boot_hk_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN));
- cpumask_andnot(isolated_cpus, cpu_possible_mask, boot_hk_cpus);
- }
+ if (housekeeping_enabled(HK_TYPE_DOMAIN_BOOT))
+ cpumask_andnot(isolated_cpus, cpu_possible_mask,
+ housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT));
return 0;
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8df2d773fe3b..01968a5c4a16 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -534,6 +534,11 @@ int lockdep_is_cpus_held(void)
{
return percpu_rwsem_is_held(&cpu_hotplug_lock);
}
+
+int lockdep_is_cpus_write_held(void)
+{
+ return percpu_rwsem_is_write_held(&cpu_hotplug_lock);
+}
#endif
static void lockdep_acquire_cpus_lock(void)
@@ -1410,6 +1415,16 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
cpus_write_lock();
+	/*
+	 * Keep at least one housekeeping CPU online to avoid generating
+	 * an empty sched_domain span.
+	 */
+ if (cpumask_any_and(cpu_online_mask,
+ housekeeping_cpumask(HK_TYPE_DOMAIN)) >= nr_cpu_ids) {
+ ret = -EBUSY;
+ goto out;
+ }
+
cpuhp_tasks_frozen = tasks_frozen;
prev_state = cpuhp_set_state(cpu, st, target);
@@ -1456,22 +1471,8 @@ out:
return ret;
}
-struct cpu_down_work {
- unsigned int cpu;
- enum cpuhp_state target;
-};
-
-static long __cpu_down_maps_locked(void *arg)
-{
- struct cpu_down_work *work = arg;
-
- return _cpu_down(work->cpu, 0, work->target);
-}
-
static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
{
- struct cpu_down_work work = { .cpu = cpu, .target = target, };
-
/*
* If the platform does not support hotplug, report it explicitly to
* differentiate it from a transient offlining failure.
@@ -1480,18 +1481,7 @@ static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
return -EOPNOTSUPP;
if (cpu_hotplug_disabled)
return -EBUSY;
-
- /*
- * Ensure that the control task does not run on the to be offlined
- * CPU to prevent a deadlock against cfs_b->period_timer.
- * Also keep at least one housekeeping cpu onlined to avoid generating
- * an empty sched_domain span.
- */
- for_each_cpu_and(cpu, cpu_online_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) {
- if (cpu != work.cpu)
- return work_on_cpu(cpu, __cpu_down_maps_locked, &work);
- }
- return -EBUSY;
+ return _cpu_down(cpu, 0, target);
}
static int cpu_down(unsigned int cpu, enum cpuhp_state target)
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 39511dd2abc9..c9507689e181 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -35,8 +35,8 @@ static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task;
-static LIST_HEAD(kthreads_hotplug);
-static DEFINE_MUTEX(kthreads_hotplug_lock);
+static LIST_HEAD(kthread_affinity_list);
+static DEFINE_MUTEX(kthread_affinity_lock);
struct kthread_create_info
{
@@ -69,7 +69,7 @@ struct kthread {
/* To store the full name if task comm is truncated. */
char *full_name;
struct task_struct *task;
- struct list_head hotplug_node;
+ struct list_head affinity_node;
struct cpumask *preferred_affinity;
};
@@ -128,7 +128,7 @@ bool set_kthread_struct(struct task_struct *p)
init_completion(&kthread->exited);
init_completion(&kthread->parked);
- INIT_LIST_HEAD(&kthread->hotplug_node);
+ INIT_LIST_HEAD(&kthread->affinity_node);
p->vfork_done = &kthread->exited;
kthread->task = p;
@@ -323,10 +323,10 @@ void __noreturn kthread_exit(long result)
{
struct kthread *kthread = to_kthread(current);
kthread->result = result;
- if (!list_empty(&kthread->hotplug_node)) {
- mutex_lock(&kthreads_hotplug_lock);
- list_del(&kthread->hotplug_node);
- mutex_unlock(&kthreads_hotplug_lock);
+ if (!list_empty(&kthread->affinity_node)) {
+ mutex_lock(&kthread_affinity_lock);
+ list_del(&kthread->affinity_node);
+ mutex_unlock(&kthread_affinity_lock);
if (kthread->preferred_affinity) {
kfree(kthread->preferred_affinity);
@@ -362,17 +362,20 @@ static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpum
{
const struct cpumask *pref;
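+	/*
+	 * The housekeeping cpumask dereferenced below is RCU protected
+	 * against concurrent cpuset isolated partition updates.
+	 */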
+ guard(rcu)();
+
if (kthread->preferred_affinity) {
pref = kthread->preferred_affinity;
} else {
- if (WARN_ON_ONCE(kthread->node == NUMA_NO_NODE))
- return;
- pref = cpumask_of_node(kthread->node);
+ if (kthread->node == NUMA_NO_NODE)
+ pref = housekeeping_cpumask(HK_TYPE_DOMAIN);
+ else
+ pref = cpumask_of_node(kthread->node);
}
- cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_KTHREAD));
+ cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_DOMAIN));
if (cpumask_empty(cpumask))
- cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD));
+ cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN));
}
static void kthread_affine_node(void)
@@ -380,32 +383,29 @@ static void kthread_affine_node(void)
struct kthread *kthread = to_kthread(current);
cpumask_var_t affinity;
- WARN_ON_ONCE(kthread_is_per_cpu(current));
+ if (WARN_ON_ONCE(kthread_is_per_cpu(current)))
+ return;
- if (kthread->node == NUMA_NO_NODE) {
- housekeeping_affine(current, HK_TYPE_KTHREAD);
- } else {
- if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) {
- WARN_ON_ONCE(1);
- return;
- }
+ if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
- mutex_lock(&kthreads_hotplug_lock);
- WARN_ON_ONCE(!list_empty(&kthread->hotplug_node));
- list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
- /*
- * The node cpumask is racy when read from kthread() but:
- * - a racing CPU going down will either fail on the subsequent
- * call to set_cpus_allowed_ptr() or be migrated to housekeepers
- * afterwards by the scheduler.
- * - a racing CPU going up will be handled by kthreads_online_cpu()
- */
- kthread_fetch_affinity(kthread, affinity);
- set_cpus_allowed_ptr(current, affinity);
- mutex_unlock(&kthreads_hotplug_lock);
+ mutex_lock(&kthread_affinity_lock);
+ WARN_ON_ONCE(!list_empty(&kthread->affinity_node));
+ list_add_tail(&kthread->affinity_node, &kthread_affinity_list);
+ /*
+ * The node cpumask is racy when read from kthread() but:
+ * - a racing CPU going down will either fail on the subsequent
+ * call to set_cpus_allowed_ptr() or be migrated to housekeepers
+ * afterwards by the scheduler.
+ * - a racing CPU going up will be handled by kthreads_online_cpu()
+ */
+ kthread_fetch_affinity(kthread, affinity);
+ set_cpus_allowed_ptr(current, affinity);
+ mutex_unlock(&kthread_affinity_lock);
- free_cpumask_var(affinity);
- }
+ free_cpumask_var(affinity);
}
static int kthread(void *_create)
@@ -453,6 +453,10 @@ static int kthread(void *_create)
self->started = 1;
+	/*
+	 * Apply the default node affinity if neither kthread_bind[_mask]()
+	 * nor kthread_affine_preferred() was called before the first wake-up.
+	 */
if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity)
kthread_affine_node();
@@ -820,12 +824,13 @@ int kthreadd(void *unused)
/* Setup a clean context for our children to inherit. */
set_task_comm(tsk, comm);
ignore_signals(tsk);
- set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD));
set_mems_allowed(node_states[N_MEMORY]);
current->flags |= PF_NOFREEZE;
cgroup_init_kthreadd();
+ kthread_affine_node();
+
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
if (list_empty(&kthread_create_list))
@@ -851,6 +856,18 @@ int kthreadd(void *unused)
return 0;
}
+/**
+ * kthread_affine_preferred - Define a kthread's preferred affinity
+ * @p: thread created by kthread_create().
+ * @mask: preferred mask of CPUs (might not be online, must be possible) for @p
+ * to run on.
+ *
+ * Similar to kthread_bind_mask() except that the affinity is not a requirement
+ * but rather a preference that can be constrained by CPU isolation or CPU hotplug.
+ * Must be called before the first wakeup of the kthread.
+ *
+ * Returns 0 if the affinity has been applied.
+ */
int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
{
struct kthread *kthread = to_kthread(p);
@@ -873,16 +890,16 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
goto out;
}
- mutex_lock(&kthreads_hotplug_lock);
+ mutex_lock(&kthread_affinity_lock);
cpumask_copy(kthread->preferred_affinity, mask);
- WARN_ON_ONCE(!list_empty(&kthread->hotplug_node));
- list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
+ WARN_ON_ONCE(!list_empty(&kthread->affinity_node));
+ list_add_tail(&kthread->affinity_node, &kthread_affinity_list);
kthread_fetch_affinity(kthread, affinity);
scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
set_cpus_allowed_force(p, affinity);
- mutex_unlock(&kthreads_hotplug_lock);
+ mutex_unlock(&kthread_affinity_lock);
out:
free_cpumask_var(affinity);
@@ -890,22 +907,15 @@ out:
}
EXPORT_SYMBOL_GPL(kthread_affine_preferred);
-/*
- * Re-affine kthreads according to their preferences
- * and the newly online CPU. The CPU down part is handled
- * by select_fallback_rq() which default re-affines to
- * housekeepers from other nodes in case the preferred
- * affinity doesn't apply anymore.
- */
-static int kthreads_online_cpu(unsigned int cpu)
+static int kthreads_update_affinity(bool force)
{
cpumask_var_t affinity;
struct kthread *k;
int ret;
- guard(mutex)(&kthreads_hotplug_lock);
+ guard(mutex)(&kthread_affinity_lock);
- if (list_empty(&kthreads_hotplug))
+ if (list_empty(&kthread_affinity_list))
return 0;
if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
@@ -913,14 +923,29 @@ static int kthreads_online_cpu(unsigned int cpu)
ret = 0;
- list_for_each_entry(k, &kthreads_hotplug, hotplug_node) {
+ list_for_each_entry(k, &kthread_affinity_list, affinity_node) {
if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) ||
kthread_is_per_cpu(k->task))) {
ret = -EINVAL;
continue;
}
- kthread_fetch_affinity(k, affinity);
- set_cpus_allowed_ptr(k->task, affinity);
+
+		/*
+		 * Unbound kthreads without a preferred affinity are already affine
+		 * to housekeeping, whether those CPUs are online or not. So there is
+		 * no need to handle newly online CPUs for them. However, housekeeping
+		 * changes have to be applied.
+		 *
+		 * But kthreads with a preferred affinity or node are different:
+		 * if none of their preferred CPUs are both online and part of
+		 * housekeeping, they must be affine to housekeeping. But as soon
+		 * as one of their preferred CPUs becomes online, they must be
+		 * affine to it.
+		 */
+ if (force || k->preferred_affinity || k->node != NUMA_NO_NODE) {
+ kthread_fetch_affinity(k, affinity);
+ set_cpus_allowed_ptr(k->task, affinity);
+ }
}
free_cpumask_var(affinity);
@@ -928,6 +953,33 @@ static int kthreads_online_cpu(unsigned int cpu)
return ret;
}
+/**
+ * kthreads_update_housekeeping - Update kthreads affinity on cpuset change
+ *
+ * When cpuset changes a partition type to/from "isolated" or updates related
+ * cpumasks, propagate the housekeeping cpumask change to preferred kthreads
+ * affinity.
+ *
+ * Returns 0 if successful, -ENOMEM if a temporary mask couldn't
+ * be allocated, or -EINVAL in case of an internal error.
+ */
+int kthreads_update_housekeeping(void)
+{
+ return kthreads_update_affinity(true);
+}
+
+/*
+ * Re-affine kthreads according to their preferences
+ * and the newly online CPU. The CPU down part is handled
+ * by select_fallback_rq() which default re-affines to
+ * housekeepers from other nodes in case the preferred
+ * affinity doesn't apply anymore.
+ */
+static int kthreads_online_cpu(unsigned int cpu)
+{
+ return kthreads_update_affinity(false);
+}
+
static int kthreads_init(void)
{
return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online",
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 3ad0d6df6a0a..3b725d39c06e 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -8,9 +8,11 @@
*
*/
#include <linux/sched/isolation.h>
+#include <linux/pci.h>
#include "sched.h"
enum hk_flags {
+ HK_FLAG_DOMAIN_BOOT = BIT(HK_TYPE_DOMAIN_BOOT),
HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN),
HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ),
HK_FLAG_KERNEL_NOISE = BIT(HK_TYPE_KERNEL_NOISE),
@@ -20,7 +22,7 @@ DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
EXPORT_SYMBOL_GPL(housekeeping_overridden);
struct housekeeping {
- cpumask_var_t cpumasks[HK_TYPE_MAX];
+ struct cpumask __rcu *cpumasks[HK_TYPE_MAX];
unsigned long flags;
};
@@ -28,21 +30,62 @@ static struct housekeeping housekeeping;
bool housekeeping_enabled(enum hk_type type)
{
- return !!(housekeeping.flags & BIT(type));
+ return !!(READ_ONCE(housekeeping.flags) & BIT(type));
}
EXPORT_SYMBOL_GPL(housekeeping_enabled);
+static bool housekeeping_dereference_check(enum hk_type type)
+{
+ if (IS_ENABLED(CONFIG_LOCKDEP) && type == HK_TYPE_DOMAIN) {
+ /* Cpuset isn't even writable yet? */
+ if (system_state <= SYSTEM_SCHEDULING)
+ return true;
+
+ /* CPU hotplug write locked, so cpuset partition can't be overwritten */
+ if (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_write_held())
+ return true;
+
+ /* Cpuset lock held, partitions not writable */
+ if (IS_ENABLED(CONFIG_CPUSETS) && lockdep_is_cpuset_held())
+ return true;
+
+ return false;
+ }
+
+ return true;
+}
+
+static inline struct cpumask *housekeeping_cpumask_dereference(enum hk_type type)
+{
+ return rcu_dereference_all_check(housekeeping.cpumasks[type],
+ housekeeping_dereference_check(type));
+}
+
+const struct cpumask *housekeeping_cpumask(enum hk_type type)
+{
+ const struct cpumask *mask = NULL;
+
+ if (static_branch_unlikely(&housekeeping_overridden)) {
+ if (READ_ONCE(housekeeping.flags) & BIT(type))
+ mask = housekeeping_cpumask_dereference(type);
+ }
+ if (!mask)
+ mask = cpu_possible_mask;
+ return mask;
+}
+EXPORT_SYMBOL_GPL(housekeeping_cpumask);
+
int housekeeping_any_cpu(enum hk_type type)
{
int cpu;
if (static_branch_unlikely(&housekeeping_overridden)) {
if (housekeeping.flags & BIT(type)) {
- cpu = sched_numa_find_closest(housekeeping.cpumasks[type], smp_processor_id());
+ cpu = sched_numa_find_closest(housekeeping_cpumask(type), smp_processor_id());
if (cpu < nr_cpu_ids)
return cpu;
- cpu = cpumask_any_and_distribute(housekeeping.cpumasks[type], cpu_online_mask);
+ cpu = cpumask_any_and_distribute(housekeeping_cpumask(type), cpu_online_mask);
if (likely(cpu < nr_cpu_ids))
return cpu;
/*
@@ -58,32 +101,69 @@ int housekeeping_any_cpu(enum hk_type type)
}
EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
-const struct cpumask *housekeeping_cpumask(enum hk_type type)
-{
- if (static_branch_unlikely(&housekeeping_overridden))
- if (housekeeping.flags & BIT(type))
- return housekeeping.cpumasks[type];
- return cpu_possible_mask;
-}
-EXPORT_SYMBOL_GPL(housekeeping_cpumask);
-
void housekeeping_affine(struct task_struct *t, enum hk_type type)
{
if (static_branch_unlikely(&housekeeping_overridden))
if (housekeeping.flags & BIT(type))
- set_cpus_allowed_ptr(t, housekeeping.cpumasks[type]);
+ set_cpus_allowed_ptr(t, housekeeping_cpumask(type));
}
EXPORT_SYMBOL_GPL(housekeeping_affine);
bool housekeeping_test_cpu(int cpu, enum hk_type type)
{
- if (static_branch_unlikely(&housekeeping_overridden))
- if (housekeeping.flags & BIT(type))
- return cpumask_test_cpu(cpu, housekeeping.cpumasks[type]);
+ if (static_branch_unlikely(&housekeeping_overridden) &&
+ READ_ONCE(housekeeping.flags) & BIT(type))
+ return cpumask_test_cpu(cpu, housekeeping_cpumask(type));
return true;
}
EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
+int housekeeping_update(struct cpumask *isol_mask)
+{
+ struct cpumask *trial, *old = NULL;
+ int err;
+
+ lockdep_assert_cpus_held();
+
+ trial = kmalloc(cpumask_size(), GFP_KERNEL);
+ if (!trial)
+ return -ENOMEM;
+
+ cpumask_andnot(trial, housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT), isol_mask);
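+	/* Refuse an update that would leave no online housekeeping CPU */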
+ if (!cpumask_intersects(trial, cpu_online_mask)) {
+ kfree(trial);
+ return -EINVAL;
+ }
+
+ if (!housekeeping.flags)
+ static_branch_enable_cpuslocked(&housekeeping_overridden);
+
+ if (housekeeping.flags & HK_FLAG_DOMAIN)
+ old = housekeeping_cpumask_dereference(HK_TYPE_DOMAIN);
+ else
+ WRITE_ONCE(housekeeping.flags, housekeeping.flags | HK_FLAG_DOMAIN);
+ rcu_assign_pointer(housekeeping.cpumasks[HK_TYPE_DOMAIN], trial);
+
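+	/*
+	 * Wait for ongoing readers so that any further housekeeping target
+	 * election sees the new cpumask, then flush the work that may have
+	 * been queued to newly isolated CPUs in the meantime.
+	 */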
+ synchronize_rcu();
+
+ pci_probe_flush_workqueue();
+ mem_cgroup_flush_workqueue();
+ vmstat_flush_workqueue();
+
+ err = workqueue_unbound_housekeeping_update(housekeeping_cpumask(HK_TYPE_DOMAIN));
+ WARN_ON_ONCE(err < 0);
+
+ err = tmigr_isolated_exclude_cpumask(isol_mask);
+ WARN_ON_ONCE(err < 0);
+
+ err = kthreads_update_housekeeping();
+ WARN_ON_ONCE(err < 0);
+
+ kfree(old);
+
+ return 0;
+}
+
void __init housekeeping_init(void)
{
enum hk_type type;
@@ -95,20 +175,33 @@ void __init housekeeping_init(void)
if (housekeeping.flags & HK_FLAG_KERNEL_NOISE)
sched_tick_offload_init();
-
+	/*
+	 * Reallocate with a proper allocator so that any later cpumask
+	 * update can unconditionally free the old version with kfree().
+	 */
for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
+ struct cpumask *omask, *nmask = kmalloc(cpumask_size(), GFP_KERNEL);
+
+ if (WARN_ON_ONCE(!nmask))
+ return;
+
+ omask = rcu_dereference(housekeeping.cpumasks[type]);
+
/* We need at least one CPU to handle housekeeping work */
- WARN_ON_ONCE(cpumask_empty(housekeeping.cpumasks[type]));
+ WARN_ON_ONCE(cpumask_empty(omask));
+ cpumask_copy(nmask, omask);
+ RCU_INIT_POINTER(housekeeping.cpumasks[type], nmask);
+ memblock_free(omask, cpumask_size());
}
}
static void __init housekeeping_setup_type(enum hk_type type,
cpumask_var_t housekeeping_staging)
{
+	struct cpumask *mask = memblock_alloc_or_panic(cpumask_size(), SMP_CACHE_BYTES);

-	alloc_bootmem_cpumask_var(&housekeeping.cpumasks[type]);
-	cpumask_copy(housekeeping.cpumasks[type],
-		     housekeeping_staging);
+	cpumask_copy(mask, housekeeping_staging);
+	RCU_INIT_POINTER(housekeeping.cpumasks[type], mask);
}
static int __init housekeeping_setup(char *str, unsigned long flags)
@@ -161,7 +254,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) {
if (!cpumask_equal(housekeeping_staging,
- housekeeping.cpumasks[type])) {
+ housekeeping_cpumask(type))) {
pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
goto free_housekeeping_staging;
}
@@ -182,7 +275,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
iter_flags = flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN);
first_cpu = (type == HK_TYPE_MAX || !iter_flags) ? 0 :
cpumask_first_and_and(cpu_present_mask,
- housekeeping_staging, housekeeping.cpumasks[type]);
+ housekeeping_staging, housekeeping_cpumask(type));
if (first_cpu >= min(nr_cpu_ids, setup_max_cpus)) {
pr_warn("Housekeeping: must include one present CPU "
"neither in nohz_full= nor in isolcpus=domain, "
@@ -239,7 +332,7 @@ static int __init housekeeping_isolcpus_setup(char *str)
if (!strncmp(str, "domain,", 7)) {
str += 7;
- flags |= HK_FLAG_DOMAIN;
+ flags |= HK_FLAG_DOMAIN | HK_FLAG_DOMAIN_BOOT;
continue;
}
@@ -269,7 +362,7 @@ static int __init housekeeping_isolcpus_setup(char *str)
/* Default behaviour for isolcpus without flags */
if (!flags)
- flags |= HK_FLAG_DOMAIN;
+ flags |= HK_FLAG_DOMAIN | HK_FLAG_DOMAIN_BOOT;
return housekeeping_setup(str, flags);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bd350e40859d..bcd78b19149b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -30,6 +30,7 @@
#include <linux/context_tracking.h>
#include <linux/cpufreq.h>
#include <linux/cpumask_api.h>
+#include <linux/cpuset.h>
#include <linux/ctype.h>
#include <linux/file.h>
#include <linux/fs_api.h>
@@ -42,6 +43,8 @@
#include <linux/ktime_api.h>
#include <linux/lockdep_api.h>
#include <linux/lockdep.h>
+#include <linux/memblock.h>
+#include <linux/memcontrol.h>
#include <linux/minmax.h>
#include <linux/mm.h>
#include <linux/module.h>
@@ -65,6 +68,7 @@
#include <linux/types.h>
#include <linux/u64_stats_sync_api.h>
#include <linux/uaccess.h>
+#include <linux/vmstat.h>
#include <linux/wait_api.h>
#include <linux/wait_bit.h>
#include <linux/workqueue_api.h>
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 18dda1aa782d..6da9cd562b20 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -466,9 +466,8 @@ static inline bool tmigr_is_isolated(int cpu)
{
if (!static_branch_unlikely(&tmigr_exclude_isolated))
return false;
- return (!housekeeping_cpu(cpu, HK_TYPE_DOMAIN) ||
- cpuset_cpu_is_isolated(cpu)) &&
- housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE);
+ return (!housekeeping_cpu(cpu, HK_TYPE_DOMAIN) &&
+ housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE));
}
/*
@@ -1497,7 +1496,7 @@ static int tmigr_clear_cpu_available(unsigned int cpu)
return 0;
}
-static int tmigr_set_cpu_available(unsigned int cpu)
+static int __tmigr_set_cpu_available(unsigned int cpu)
{
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
@@ -1505,9 +1504,6 @@ static int tmigr_set_cpu_available(unsigned int cpu)
if (WARN_ON_ONCE(!tmc->tmgroup))
return -EINVAL;
- if (tmigr_is_isolated(cpu))
- return 0;
-
guard(mutex)(&tmigr_available_mutex);
cpumask_set_cpu(cpu, tmigr_available_cpumask);
@@ -1523,6 +1519,14 @@ static int tmigr_set_cpu_available(unsigned int cpu)
return 0;
}
+static int tmigr_set_cpu_available(unsigned int cpu)
+{
+ if (tmigr_is_isolated(cpu))
+ return 0;
+
+ return __tmigr_set_cpu_available(cpu);
+}
+
static void tmigr_cpu_isolate(struct work_struct *ignored)
{
tmigr_clear_cpu_available(smp_processor_id());
@@ -1530,7 +1534,12 @@ static void tmigr_cpu_isolate(struct work_struct *ignored)
static void tmigr_cpu_unisolate(struct work_struct *ignored)
{
- tmigr_set_cpu_available(smp_processor_id());
+	/*
+	 * Don't call tmigr_is_isolated() -> housekeeping_cpu() directly
+	 * because the cpuset mutex is correctly held by the workqueue
+	 * caller but lockdep doesn't know that.
+	 */
+ __tmigr_set_cpu_available(smp_processor_id());
}
/**
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 253311af47c6..eb5660013222 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -6959,13 +6959,16 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
}
/**
- * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask
- * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask
+ * workqueue_unbound_housekeeping_update - Propagate housekeeping cpumask update
+ * @hk: the new housekeeping cpumask
*
- * This function can be called from cpuset code to provide a set of isolated
- * CPUs that should be excluded from wq_unbound_cpumask.
+ * Update the unbound workqueue cpumask on top of the new housekeeping cpumask such
+ * that the effective unbound affinity is the intersection of the new housekeeping
+ * with the requested affinity set via nohz_full=/isolcpus= or sysfs.
+ *
+ * Return: 0 on success and -errno on failure.
*/
-int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
+int workqueue_unbound_housekeeping_update(const struct cpumask *hk)
{
cpumask_var_t cpumask;
int ret = 0;
@@ -6981,14 +6984,14 @@ int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
* (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) house keeping mask and rewritten
* by any subsequent write to workqueue/cpumask sysfs file.
*/
- if (!cpumask_andnot(cpumask, wq_requested_unbound_cpumask, exclude_cpumask))
+ if (!cpumask_and(cpumask, wq_requested_unbound_cpumask, hk))
cpumask_copy(cpumask, wq_requested_unbound_cpumask);
if (!cpumask_equal(cpumask, wq_unbound_cpumask))
ret = workqueue_apply_unbound_cpumask(cpumask);
/* Save the current isolated cpumask & export it via sysfs */
if (!ret)
- cpumask_copy(wq_isolated_cpumask, exclude_cpumask);
+ cpumask_andnot(wq_isolated_cpumask, cpu_possible_mask, hk);
mutex_unlock(&wq_pool_mutex);
free_cpumask_var(cpumask);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 86f43b7e5f71..783b3b008fef 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -96,6 +96,8 @@ static bool cgroup_memory_nokmem __ro_after_init;
/* BPF memory accounting disabled? */
static bool cgroup_memory_nobpf __ro_after_init;
+static struct workqueue_struct *memcg_wq __ro_after_init;
+
static struct kmem_cache *memcg_cachep;
static struct kmem_cache *memcg_pn_cachep;
@@ -2003,6 +2005,19 @@ static bool is_memcg_drain_needed(struct memcg_stock_pcp *stock,
return flush;
}
+static void schedule_drain_work(int cpu, struct work_struct *work)
+{
+	/*
+	 * Protect the housekeeping cpumask read and the work enqueue together
+	 * in the same RCU critical section so that a later cpuset isolated
+	 * partition update only needs to wait for an RCU GP and flush the
+	 * pending work on newly isolated CPUs.
+	 */
+ guard(rcu)();
+ if (!cpu_is_isolated(cpu))
+ queue_work_on(cpu, memcg_wq, work);
+}
+
/*
* Drains all per-CPU charge caches for given root_memcg resp. subtree
* of the hierarchy under it.
@@ -2032,8 +2047,8 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
&memcg_st->flags)) {
if (cpu == curcpu)
drain_local_memcg_stock(&memcg_st->work);
- else if (!cpu_is_isolated(cpu))
- schedule_work_on(cpu, &memcg_st->work);
+ else
+ schedule_drain_work(cpu, &memcg_st->work);
}
if (!test_bit(FLUSHING_CACHED_CHARGE, &obj_st->flags) &&
@@ -2042,8 +2057,8 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
&obj_st->flags)) {
if (cpu == curcpu)
drain_local_obj_stock(&obj_st->work);
- else if (!cpu_is_isolated(cpu))
- schedule_work_on(cpu, &obj_st->work);
+ else
+ schedule_drain_work(cpu, &obj_st->work);
}
}
migrate_enable();
@@ -5112,6 +5127,11 @@ void mem_cgroup_sk_uncharge(const struct sock *sk, unsigned int nr_pages)
refill_stock(memcg, nr_pages);
}
+void mem_cgroup_flush_workqueue(void)
+{
+ flush_workqueue(memcg_wq);
+}
+
static int __init cgroup_memory(char *s)
{
char *token;
@@ -5154,6 +5174,9 @@ int __init mem_cgroup_init(void)
cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
memcg_hotplug_cpu_dead);
+ memcg_wq = alloc_workqueue("memcg", WQ_PERCPU, 0);
+ WARN_ON(!memcg_wq);
+
for_each_possible_cpu(cpu) {
INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
drain_local_memcg_stock);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 65de88cdf40e..d6e814c82952 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -2124,6 +2124,11 @@ static void vmstat_shepherd(struct work_struct *w);
static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
+void vmstat_flush_workqueue(void)
+{
+ flush_workqueue(mm_percpu_wq);
+}
+
static void vmstat_shepherd(struct work_struct *w)
{
int cpu;
@@ -2144,11 +2149,13 @@ static void vmstat_shepherd(struct work_struct *w)
* infrastructure ever noticing. Skip regular flushing from vmstat_shepherd
* for all isolated CPUs to avoid interference with the isolated workload.
*/
- if (cpu_is_isolated(cpu))
- continue;
+ scoped_guard(rcu) {
+ if (cpu_is_isolated(cpu))
+ continue;
- if (!delayed_work_pending(dw) && need_update(cpu))
- queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
+ if (!delayed_work_pending(dw) && need_update(cpu))
+ queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
+ }
cond_resched();
}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index ca878525ad7c..07624b682b08 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1022,7 +1022,7 @@ static int netdev_rx_queue_set_rps_mask(struct netdev_rx_queue *queue,
int rps_cpumask_housekeeping(struct cpumask *mask)
{
if (!cpumask_empty(mask)) {
- cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN));
+ cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT));
cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ));
if (cpumask_empty(mask))
return -EINVAL;