mm: memcontrol: prepare for reparenting non-hierarchical stats

To resolve the dying memcg issue, we need to reparent the LRU folios of a
child memcg to its parent memcg.  This could cause problems for
non-hierarchical stats.

As Yosry Ahmed pointed out:

In short, if memory is charged to a dying cgroup at the time of
reparenting, when the memory gets uncharged the stats updates will occur
at the parent. This will update both hierarchical and non-hierarchical
stats of the parent, which would corrupt the parent's non-hierarchical
stats (because those counters were never incremented when the memory was
charged).

Now we have the following two types of non-hierarchical stats, and they
are only used in CONFIG_MEMCG_V1:

a. memcg->vmstats->state_local[i]
b. pn->lruvec_stats->state_local[i]

To ensure that these non-hierarchical stats work properly, we need to
reparent these non-hierarchical stats after reparenting LRU folios. To
this end, this commit makes the following preparations:

1. implement reparent_state_local() to reparent non-hierarchical stats
2. make css_killed_work_fn() run from RCU work, and implement
   get_non_dying_memcg_start() and get_non_dying_memcg_end() to avoid a
   race between mod_memcg_state()/mod_memcg_lruvec_state()
   and reparent_state_local()

Link: https://lore.kernel.org/e862995c45a7101a541284b6ebee5e5c32c89066.1772711148.git.zhengqi.arch@bytedance.com
Co-developed-by: Yosry Ahmed <yosry@kernel.org>
Signed-off-by: Yosry Ahmed <yosry@kernel.org>
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Allen Pais <apais@linux.microsoft.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com>
Cc: Harry Yoo <harry.yoo@oracle.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Imran Khan <imran.f.khan@oracle.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kamalesh Babulal <kamalesh.babulal@oracle.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Vlastimil Babka <vbabka@kernel.org>
Cc: Wei Xu <weixugc@google.com>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
Qi Zheng
2026-03-05 19:52:48 +08:00
committed by Andrew Morton
parent 5371e350fd
commit 8285917d6f
4 changed files with 125 additions and 4 deletions
+5 -4
View File
@@ -6050,8 +6050,9 @@ out_unlock:
*/
static void css_killed_work_fn(struct work_struct *work)
{
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, destroy_work);
struct cgroup_subsys_state *css;
css = container_of(to_rcu_work(work), struct cgroup_subsys_state, destroy_rwork);
cgroup_lock();
@@ -6072,8 +6073,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
container_of(ref, struct cgroup_subsys_state, refcnt);
if (atomic_dec_and_test(&css->online_cnt)) {
INIT_WORK(&css->destroy_work, css_killed_work_fn);
queue_work(cgroup_offline_wq, &css->destroy_work);
INIT_RCU_WORK(&css->destroy_rwork, css_killed_work_fn);
queue_rcu_work(cgroup_offline_wq, &css->destroy_rwork);
}
}
+16
View File
@@ -1884,6 +1884,22 @@ static const unsigned int memcg1_events[] = {
PGMAJFAULT,
};
/*
 * Reparent the memcg-level local counters of @memcg.
 *
 * Each item listed in memcg1_stats[] is passed, in array order, to
 * reparent_memcg_state_local() together with @memcg and @parent.
 */
void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
{
	unsigned int pos = 0;

	while (pos < ARRAY_SIZE(memcg1_stats)) {
		reparent_memcg_state_local(memcg, parent, memcg1_stats[pos]);
		pos++;
	}
}
/*
 * Reparent the lruvec-level local counters of @memcg.
 *
 * Every index in [0, NR_LRU_LISTS) is passed, in ascending order, to
 * reparent_memcg_lruvec_state_local() together with @memcg and @parent.
 */
void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
{
	int lru = 0;

	while (lru < NR_LRU_LISTS) {
		reparent_memcg_lruvec_state_local(memcg, parent, lru);
		lru++;
	}
}
void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
unsigned long memory, memsw;
+7
View File
@@ -73,6 +73,13 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
unsigned long nr_memory, int nid);
void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
/*
 * Helpers for reparenting non-hierarchical ("local") stats from @memcg to
 * @parent.
 *
 * The two reparent_memcg1_*() functions each walk a set of stat indices and
 * call the matching per-index helper below for every one of them.
 */
void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
/* Move the memcg-level local value of stat @idx from @memcg to @parent. */
void reparent_memcg_state_local(struct mem_cgroup *memcg,
struct mem_cgroup *parent, int idx);
/* Move the per-node lruvec local value of stat @idx from @memcg to @parent. */
void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
struct mem_cgroup *parent, int idx);
void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages);
static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg)
+97
View File
@@ -225,6 +225,34 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
return objcg;
}
#ifdef CONFIG_MEMCG_V1
static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);

/*
 * Move the stats that are exposed non-hierarchically from @memcg to @parent.
 *
 * Does nothing when the memory controller is on the default hierarchy.
 */
static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
{
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
		/*
		 * Flush @memcg first so that the values read while
		 * reparenting are accurate.
		 */
		__mem_cgroup_flush_stats(memcg, true);

		/* Both sets of counters are non-hierarchical; move them over. */
		reparent_memcg1_state_local(memcg, parent);
		reparent_memcg1_lruvec_state_local(memcg, parent);

		/*
		 * Conservatively flush @parent afterwards so that a
		 * potentially large stat update is not left hidden (e.g. from
		 * callers of mem_cgroup_flush_stats_ratelimited()).
		 */
		__mem_cgroup_flush_stats(parent, true);
	}
}
#else
/* No-op when CONFIG_MEMCG_V1 is not set. */
static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
{
}
#endif
static inline void reparent_locks(struct mem_cgroup *memcg, struct mem_cgroup *parent)
{
spin_lock_irq(&objcg_lock);
@@ -472,6 +500,30 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return x;
}
#ifdef CONFIG_MEMCG_V1
static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
				     enum node_stat_item idx, int val);

/*
 * Move the local value of lruvec stat @idx from @memcg to @parent.
 *
 * For every node, the child's local value is read, subtracted from the
 * child's per-node state and added to the parent's per-node state.
 *
 * NOTE(review): the value is read as unsigned long but
 * __mod_memcg_lruvec_state() takes an int, so it is narrowed on the way in -
 * confirm that counts cannot exceed INT_MAX here.
 */
void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
				       struct mem_cgroup *parent, int idx)
{
	int node;

	for_each_node(node) {
		struct lruvec *from = mem_cgroup_lruvec(memcg, NODE_DATA(node));
		struct lruvec *to = mem_cgroup_lruvec(parent, NODE_DATA(node));
		unsigned long nr = lruvec_page_state_local(from, idx);

		__mod_memcg_lruvec_state(container_of(from, struct mem_cgroup_per_node, lruvec),
					 idx, -nr);
		__mod_memcg_lruvec_state(container_of(to, struct mem_cgroup_per_node, lruvec),
					 idx, nr);
	}
}
#endif
/* Subset of vm_event_item to report for memcg event stats */
static const unsigned int memcg_vm_event_stat[] = {
#ifdef CONFIG_MEMCG_V1
@@ -717,6 +769,42 @@ static int memcg_state_val_in_pages(int idx, int val)
return max(val * unit / PAGE_SIZE, 1UL);
}
#ifdef CONFIG_MEMCG_V1
/*
 * get_non_dying_memcg_start() / get_non_dying_memcg_end() bracket a stat
 * update in mod_memcg_state() and mod_memcg_lruvec_state() so that it does
 * not race with the reparenting of non-hierarchical state_locals.
 *
 * On the default hierarchy both are no-ops and @memcg is returned unchanged.
 * Otherwise _start() takes the RCU read lock and returns the first memcg,
 * starting at @memcg and walking up through parents, that is not dying;
 * _end() drops the RCU read lock again. Every _start() must be paired with
 * an _end().
 */
static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg)
{
	struct mem_cgroup *live = memcg;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
		rcu_read_lock();

		while (memcg_is_dying(live))
			live = parent_mem_cgroup(live);
	}

	return live;
}

static inline void get_non_dying_memcg_end(void)
{
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		rcu_read_unlock();
}
#else
/* Without CONFIG_MEMCG_V1 the memcg is used as-is and no lock is taken. */
static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg)
{
	return memcg;
}

static inline void get_non_dying_memcg_end(void)
{
}
#endif
static void __mod_memcg_state(struct mem_cgroup *memcg,
enum memcg_stat_item idx, int val)
{
@@ -768,6 +856,15 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
#endif
return x;
}
/*
 * Move the local value of memcg stat @idx from @memcg to @parent.
 *
 * The child's local value is read once, subtracted from @memcg and then
 * added to @parent.
 *
 * NOTE(review): the value is read as unsigned long but __mod_memcg_state()
 * takes an int, so it is narrowed on the way in - confirm that counts cannot
 * exceed INT_MAX here.
 */
void reparent_memcg_state_local(struct mem_cgroup *memcg,
				struct mem_cgroup *parent, int idx)
{
	const unsigned long nr = memcg_page_state_local(memcg, idx);

	__mod_memcg_state(memcg, idx, -nr);
	__mod_memcg_state(parent, idx, nr);
}
#endif
static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,