mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2026-05-05 09:57:21 +02:00
mm: memcontrol: prepare for reparenting non-hierarchical stats
To resolve the dying memcg issue, we need to reparent LRU folios of child memcg to its parent memcg. This could cause problems for non-hierarchical stats. As Yosry Ahmed pointed out: In short, if memory is charged to a dying cgroup at the time of reparenting, when the memory gets uncharged the stats updates will occur at the parent. This will update both hierarchical and non-hierarchical stats of the parent, which would corrupt the parent's non-hierarchical stats (because those counters were never incremented when the memory was charged). Now we have the following two types of non-hierarchical stats, and they are only used in CONFIG_MEMCG_V1: a. memcg->vmstats->state_local[i] b. pn->lruvec_stats->state_local[i] To ensure that these non-hierarchical stats work properly, we need to reparent these non-hierarchical stats after reparenting LRU folios. To this end, this commit makes the following preparations: 1. implement reparent_state_local() to reparent non-hierarchical stats 2. make css_killed_work_fn() be called in rcu work, and implement get_non_dying_memcg_start() and get_non_dying_memcg_end() to avoid a race between mod_memcg_state()/mod_memcg_lruvec_state() and reparent_state_local() Link: https://lore.kernel.org/e862995c45a7101a541284b6ebee5e5c32c89066.1772711148.git.zhengqi.arch@bytedance.com Co-developed-by: Yosry Ahmed <yosry@kernel.org> Signed-off-by: Yosry Ahmed <yosry@kernel.org> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com> Acked-by: Shakeel Butt <shakeel.butt@linux.dev> Cc: Allen Pais <apais@linux.microsoft.com> Cc: Axel Rasmussen <axelrasmussen@google.com> Cc: Baoquan He <bhe@redhat.com> Cc: Chengming Zhou <chengming.zhou@linux.dev> Cc: Chen Ridong <chenridong@huawei.com> Cc: David Hildenbrand <david@kernel.org> Cc: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com> Cc: Harry Yoo <harry.yoo@oracle.com> Cc: Hugh Dickins <hughd@google.com> Cc: Imran Khan <imran.f.khan@oracle.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Kamalesh Babulal
<kamalesh.babulal@oracle.com> Cc: Lance Yang <lance.yang@linux.dev> Cc: Liam Howlett <Liam.Howlett@oracle.com> Cc: Lorenzo Stoakes (Oracle) <ljs@kernel.org> Cc: Michal Hocko <mhocko@suse.com> Cc: Michal Koutný <mkoutny@suse.com> Cc: Mike Rapoport <rppt@kernel.org> Cc: Muchun Song <muchun.song@linux.dev> Cc: Muchun Song <songmuchun@bytedance.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Roman Gushchin <roman.gushchin@linux.dev> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Usama Arif <usamaarif642@gmail.com> Cc: Vlastimil Babka <vbabka@kernel.org> Cc: Wei Xu <weixugc@google.com> Cc: Yuanchu Xie <yuanchu@google.com> Cc: Zi Yan <ziy@nvidia.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
@@ -6050,8 +6050,9 @@ out_unlock:
|
||||
*/
|
||||
static void css_killed_work_fn(struct work_struct *work)
|
||||
{
|
||||
struct cgroup_subsys_state *css =
|
||||
container_of(work, struct cgroup_subsys_state, destroy_work);
|
||||
struct cgroup_subsys_state *css;
|
||||
|
||||
css = container_of(to_rcu_work(work), struct cgroup_subsys_state, destroy_rwork);
|
||||
|
||||
cgroup_lock();
|
||||
|
||||
@@ -6072,8 +6073,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
|
||||
container_of(ref, struct cgroup_subsys_state, refcnt);
|
||||
|
||||
if (atomic_dec_and_test(&css->online_cnt)) {
|
||||
INIT_WORK(&css->destroy_work, css_killed_work_fn);
|
||||
queue_work(cgroup_offline_wq, &css->destroy_work);
|
||||
INIT_RCU_WORK(&css->destroy_rwork, css_killed_work_fn);
|
||||
queue_rcu_work(cgroup_offline_wq, &css->destroy_rwork);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1884,6 +1884,22 @@ static const unsigned int memcg1_events[] = {
|
||||
PGMAJFAULT,
|
||||
};
|
||||
|
||||
void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
|
||||
reparent_memcg_state_local(memcg, parent, memcg1_stats[i]);
|
||||
}
|
||||
|
||||
void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < NR_LRU_LISTS; i++)
|
||||
reparent_memcg_lruvec_state_local(memcg, parent, i);
|
||||
}
|
||||
|
||||
void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
|
||||
{
|
||||
unsigned long memory, memsw;
|
||||
|
||||
@@ -73,6 +73,13 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
|
||||
unsigned long nr_memory, int nid);
|
||||
|
||||
void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
|
||||
void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
|
||||
void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
|
||||
|
||||
void reparent_memcg_state_local(struct mem_cgroup *memcg,
|
||||
struct mem_cgroup *parent, int idx);
|
||||
void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
|
||||
struct mem_cgroup *parent, int idx);
|
||||
|
||||
void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages);
|
||||
static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg)
|
||||
|
||||
@@ -225,6 +225,34 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
|
||||
return objcg;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMCG_V1
static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);

/*
 * Transfer @memcg's non-hierarchical ("local") counters to @parent.
 * Only cgroup1 exposes non-hierarchical stats, so this is a no-op on
 * the default (v2) hierarchy.
 */
static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
{
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return;

	/*
	 * Flush @memcg first so its local stats are read accurately, and
	 * conservatively flush @parent after reparenting so the potentially
	 * large update made here is not hidden from callers of
	 * mem_cgroup_flush_stats_ratelimited().
	 */
	__mem_cgroup_flush_stats(memcg, true);

	/* Every counter moved below is exposed non-hierarchically. */
	reparent_memcg1_state_local(memcg, parent);
	reparent_memcg1_lruvec_state_local(memcg, parent);

	__mem_cgroup_flush_stats(parent, true);
}
#else
static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
{
}
#endif
|
||||
|
||||
static inline void reparent_locks(struct mem_cgroup *memcg, struct mem_cgroup *parent)
|
||||
{
|
||||
spin_lock_irq(&objcg_lock);
|
||||
@@ -472,6 +500,30 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
|
||||
return x;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMCG_V1
static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
				     enum node_stat_item idx, int val);

/*
 * For every node, move the non-hierarchical ("local") value of lruvec
 * stat @idx from @memcg over to @parent: subtract it from the child's
 * per-node state and add the same amount to the parent's.
 */
void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
				       struct mem_cgroup *parent, int idx)
{
	int nid;

	for_each_node(nid) {
		struct lruvec *src_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
		struct lruvec *dst_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
		unsigned long nr = lruvec_page_state_local(src_lruvec, idx);
		struct mem_cgroup_per_node *src_pn, *dst_pn;

		src_pn = container_of(src_lruvec, struct mem_cgroup_per_node, lruvec);
		dst_pn = container_of(dst_lruvec, struct mem_cgroup_per_node, lruvec);

		__mod_memcg_lruvec_state(src_pn, idx, -nr);
		__mod_memcg_lruvec_state(dst_pn, idx, nr);
	}
}
#endif
|
||||
|
||||
/* Subset of vm_event_item to report for memcg event stats */
|
||||
static const unsigned int memcg_vm_event_stat[] = {
|
||||
#ifdef CONFIG_MEMCG_V1
|
||||
@@ -717,6 +769,42 @@ static int memcg_state_val_in_pages(int idx, int val)
|
||||
return max(val * unit / PAGE_SIZE, 1UL);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMCG_V1
/*
 * Used in mod_memcg_state() and mod_memcg_lruvec_state() to avoid racing
 * with reparenting of non-hierarchical state_locals: stat updates are
 * redirected to the nearest non-dying ancestor.
 *
 * On a cgroup1 hierarchy, _start() enters an RCU read-side critical
 * section that the caller must close with get_non_dying_memcg_end().
 */
static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg)
{
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
		rcu_read_lock();

		/* Walk up until we find an ancestor that is not dying. */
		while (memcg_is_dying(memcg))
			memcg = parent_mem_cgroup(memcg);
	}

	return memcg;
}

static inline void get_non_dying_memcg_end(void)
{
	/* Pairs with the rcu_read_lock() taken in _start() on cgroup1. */
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		rcu_read_unlock();
}
#else
static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg)
{
	return memcg;
}

static inline void get_non_dying_memcg_end(void)
{
}
#endif
|
||||
|
||||
static void __mod_memcg_state(struct mem_cgroup *memcg,
|
||||
enum memcg_stat_item idx, int val)
|
||||
{
|
||||
@@ -768,6 +856,15 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
|
||||
#endif
|
||||
return x;
|
||||
}
|
||||
|
||||
/*
 * Move the non-hierarchical ("local") value of stat @idx from @memcg
 * over to @parent by subtracting it from the child and crediting the
 * same amount to the parent.
 */
void reparent_memcg_state_local(struct mem_cgroup *memcg,
				struct mem_cgroup *parent, int idx)
{
	unsigned long nr = memcg_page_state_local(memcg, idx);

	__mod_memcg_state(memcg, idx, -nr);
	__mod_memcg_state(parent, idx, nr);
}
|
||||
#endif
|
||||
|
||||
static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
|
||||
|
||||
Reference in New Issue
Block a user