summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorBing Jiao <bingjiao@google.com>2026-01-14 20:53:02 +0000
committerAndrew Morton <akpm@linux-foundation.org>2026-02-12 15:42:52 -0800
commit1aceed565ff172fc0331dd1d5e7e65139b711139 (patch)
treed1243c39d0e85aa9b016b8187a6d3aa5a00117ff /mm
parentfb4ddf2085115ed28dedc427d9491707b476bbfe (diff)
mm/vmscan: fix demotion targets checks in reclaim/demotion
Patch series "mm/vmscan: fix demotion targets checks in reclaim/demotion", v9. This patch series addresses two issues in demote_folio_list(), can_demote(), and next_demotion_node() in reclaim/demotion. 1. demote_folio_list() and can_demote() do not correctly check demotion target against cpuset.mems_effective, which will cause (a) pages to be demoted to not-allowed nodes and (b) pages fail demotion even if the system still has allowed demotion nodes. Patch 1 fixes this bug by updating cpuset_node_allowed() and mem_cgroup_node_allowed() to return effective_mems, allowing directly logic-and operation against demotion targets. 2. next_demotion_node() returns a preferred demotion target, but it does not check the node against allowed nodes. Patch 2 ensures that next_demotion_node() filters against the allowed node mask and selects the closest demotion target to the source node. This patch (of 2): Fix two bugs in demote_folio_list() and can_demote() due to incorrect demotion target checks against cpuset.mems_effective in reclaim/demotion. Commit 7d709f49babc ("vmscan,cgroup: apply mems_effective to reclaim") introduces the cpuset.mems_effective check and applies it to can_demote(). However: 1. It does not apply this check in demote_folio_list(), which leads to situations where pages are demoted to nodes that are explicitly excluded from the task's cpuset.mems. 2. It checks only the nodes in the immediate next demotion hierarchy and does not check all allowed demotion targets in can_demote(). This can cause pages to never be demoted if the nodes in the next demotion hierarchy are not set in mems_effective. These bugs break resource isolation provided by cpuset.mems. This is visible from userspace because pages can either fail to be demoted entirely or are demoted to nodes that are not allowed in multi-tier memory systems. To address these bugs, update cpuset_node_allowed() and mem_cgroup_node_allowed() to return effective_mems, allowing directly logic-and operation against demotion targets. Also update can_demote() and demote_folio_list() accordingly. Bug 1 reproduction: Assume a system with 4 nodes, where nodes 0-1 are top-tier and nodes 2-3 are far-tier memory. All nodes have equal capacity. Test script: echo 1 > /sys/kernel/mm/numa/demotion_enabled mkdir /sys/fs/cgroup/test echo +cpuset > /sys/fs/cgroup/cgroup.subtree_control echo "0-2" > /sys/fs/cgroup/test/cpuset.mems echo $$ > /sys/fs/cgroup/test/cgroup.procs swapoff -a # Expectation: Should respect node 0-2 limit. # Observation: Node 3 shows significant allocation (MemFree drops) stress-ng --oomable --vm 1 --vm-bytes 150% --mbind 0,1 Bug 2 reproduction: Assume a system with 6 nodes, where nodes 0-2 are top-tier, node 3 is a far-tier node, and nodes 4-5 are the farthest-tier nodes. All nodes have equal capacity. Test script: echo 1 > /sys/kernel/mm/numa/demotion_enabled mkdir /sys/fs/cgroup/test echo +cpuset > /sys/fs/cgroup/cgroup.subtree_control echo "0-2,4-5" > /sys/fs/cgroup/test/cpuset.mems echo $$ > /sys/fs/cgroup/test/cgroup.procs swapoff -a # Expectation: Pages are demoted to Nodes 4-5 # Observation: No pages are demoted before oom. stress-ng --oomable --vm 1 --vm-bytes 150% --mbind 0,1,2 Link: https://lkml.kernel.org/r/20260114205305.2869796-1-bingjiao@google.com Link: https://lkml.kernel.org/r/20260114205305.2869796-2-bingjiao@google.com Fixes: 7d709f49babc ("vmscan,cgroup: apply mems_effective to reclaim") Signed-off-by: Bing Jiao <bingjiao@google.com> Acked-by: Shakeel Butt <shakeel.butt@linux.dev> Cc: Axel Rasmussen <axelrasmussen@google.com> Cc: David Hildenbrand <david@kernel.org> Cc: Gregory Price <gourry@gourry.net> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Joshua Hahn <joshua.hahnjy@gmail.com> Cc: Liam Howlett <liam.howlett@oracle.com> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Rapoport <rppt@kernel.org> Cc: Muchun Song <muchun.song@linux.dev> Cc: Qi Zheng <zhengqi.arch@bytedance.com> Cc: Roman Gushchin <roman.gushchin@linux.dev> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun Heo <tj@kernel.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Waiman Long <longman@redhat.com> Cc: Wei Xu <weixugc@google.com> Cc: Yuanchu Xie <yuanchu@google.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/memcontrol.c16
-rw-r--r--mm/vmscan.c34
2 files changed, 36 insertions, 14 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 21d17975c4ac..007413a53b45 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5593,9 +5593,21 @@ subsys_initcall(mem_cgroup_swap_init);
#endif /* CONFIG_SWAP */
-bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
+void mem_cgroup_node_filter_allowed(struct mem_cgroup *memcg, nodemask_t *mask)
{
- return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true;
+ nodemask_t allowed;
+
+ if (!memcg)
+ return;
+
+ /*
+ * Since this interface is intended for use by migration paths, and
+ * reclaim and migration are subject to race conditions such as changes
+ * in effective_mems and hot-unpluging of nodes, inaccurate allowed
+ * mask is acceptable.
+ */
+ cpuset_nodes_allowed(memcg->css.cgroup, &allowed);
+ nodes_and(*mask, *mask, allowed);
}
void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3fc4a4461927..911614723689 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -343,19 +343,21 @@ static void flush_reclaim_state(struct scan_control *sc)
static bool can_demote(int nid, struct scan_control *sc,
struct mem_cgroup *memcg)
{
- int demotion_nid;
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ nodemask_t allowed_mask;
- if (!numa_demotion_enabled)
+ if (!pgdat || !numa_demotion_enabled)
return false;
if (sc && sc->no_demotion)
return false;
- demotion_nid = next_demotion_node(nid);
- if (demotion_nid == NUMA_NO_NODE)
+ node_get_allowed_targets(pgdat, &allowed_mask);
+ if (nodes_empty(allowed_mask))
return false;
- /* If demotion node isn't in the cgroup's mems_allowed, fall back */
- return mem_cgroup_node_allowed(memcg, demotion_nid);
+ /* Filter out nodes that are not in cgroup's mems_allowed. */
+ mem_cgroup_node_filter_allowed(memcg, &allowed_mask);
+ return !nodes_empty(allowed_mask);
}
static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
@@ -1017,9 +1019,10 @@ static struct folio *alloc_demote_folio(struct folio *src,
* Folios which are not demoted are left on @demote_folios.
*/
static unsigned int demote_folio_list(struct list_head *demote_folios,
- struct pglist_data *pgdat)
+ struct pglist_data *pgdat,
+ struct mem_cgroup *memcg)
{
- int target_nid = next_demotion_node(pgdat->node_id);
+ int target_nid;
unsigned int nr_succeeded;
nodemask_t allowed_mask;
@@ -1031,7 +1034,6 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
*/
.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
__GFP_NOMEMALLOC | GFP_NOWAIT,
- .nid = target_nid,
.nmask = &allowed_mask,
.reason = MR_DEMOTION,
};
@@ -1039,10 +1041,18 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
if (list_empty(demote_folios))
return 0;
- if (target_nid == NUMA_NO_NODE)
+ node_get_allowed_targets(pgdat, &allowed_mask);
+ mem_cgroup_node_filter_allowed(memcg, &allowed_mask);
+ if (nodes_empty(allowed_mask))
return 0;
- node_get_allowed_targets(pgdat, &allowed_mask);
+ target_nid = next_demotion_node(pgdat->node_id);
+ if (target_nid == NUMA_NO_NODE)
+ /* No lower-tier nodes or nodes were hot-unplugged. */
+ return 0;
+ if (!node_isset(target_nid, allowed_mask))
+ target_nid = node_random(&allowed_mask);
+ mtc.nid = target_nid;
/* Demotion ignores all cpuset and mempolicy settings */
migrate_pages(demote_folios, alloc_demote_folio, NULL,
@@ -1564,7 +1574,7 @@ keep:
/* 'folio_list' is always empty here */
/* Migrate folios selected for demotion */
- nr_demoted = demote_folio_list(&demote_folios, pgdat);
+ nr_demoted = demote_folio_list(&demote_folios, pgdat, memcg);
nr_reclaimed += nr_demoted;
stat->nr_demoted += nr_demoted;
/* Folios that could not be demoted are still in @demote_folios */