From 1d224e7cbeac00292f15564310cdfa3976785b0e Mon Sep 17 00:00:00 2001 From: Li Wang Date: Wed, 22 Apr 2026 16:04:45 +0800 Subject: [PATCH 001/321] selftests/mm: respect build verbosity settings for 32/64-bit targets Patch series "selftests/mm: clean up build output and verbosity", v3. Currently, the build process for the mm selftests is unnecessarily noisy. First, it leaks raw compiler errors during the liburing feature probe if the headers are missing, which is confusing since the build system already handles this gracefully with a clear warning. Second, the specific 32-bit and 64-bit compilation targets ignore the standard kbuild verbosity settings, always printing their full compiler commands even during a default quiet build. This patch (of 2): The 32-bit and 64-bit compilation rules invoke $(CC) directly, bypassing the $(Q) quiet prefix and $(call msg,...) helper used by the rest of the selftests build system. This causes these rules to always print the full compiler command line, even when V=0 (the default). Wrap the commands with $(Q) and $(call msg,CC,,$@) to match the convention used by lib.mk, so that quiet and verbose builds behave consistently across all targets. ==== Build logs ==== ... CC merge CC rmap CC soft-dirty gcc -Wall -O2 -I /usr/src/25/tools/testing/selftests/../../.. -isystem /usr/src/25/tools/testing/selftests/../../../usr/include -isystem /usr/src/25/tools/testing/selftests/../../../tools/include/uapi -Wunreachable-code -U_FORTIFY_SOURCE -no-pie -D_GNU_SOURCE= -I/usr/src/25/tools/testing/selftests/../../../tools/testing/selftests -m32 -mxsave protection_keys.c vm_util.c thp_settings.c pkey_util.c -lrt -lpthread -lm -lrt -ldl -lm -o /usr/src/25/tools/testing/selftests/mm/protection_keys_32 gcc -Wall -O2 -I /usr/src/25/tools/testing/selftests/../../.. 
-isystem /usr/src/25/tools/testing/selftests/../../../usr/include -isystem /usr/src/25/tools/testing/selftests/../../../tools/include/uapi -Wunreachable-code -U_FORTIFY_SOURCE -no-pie -D_GNU_SOURCE= -I/usr/src/25/tools/testing/selftests/../../../tools/testing/selftests -m32 -mxsave pkey_sighandler_tests.c vm_util.c thp_settings.c pkey_util.c -lrt -lpthread -lm -lrt -ldl -lm -o /usr/src/25/tools/testing/selftests/mm/pkey_sighandler_tests_32 ... Link: https://lore.kernel.org/20260422080446.26020-1-wangli.ahau@gmail.com Link: https://lore.kernel.org/20260422080446.26020-2-wangli.ahau@gmail.com Signed-off-by: Li Wang Reported-by: Andrew Morton Tested-by: Andrew Morton Tested-by: David Hildenbrand (Arm) Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index cd24596cdd27..6195770eba6e 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -216,7 +216,8 @@ ifeq ($(CAN_BUILD_I386),1) $(BINARIES_32): CFLAGS += -m32 -mxsave $(BINARIES_32): LDLIBS += -lrt -ldl -lm $(BINARIES_32): $(OUTPUT)/%_32: %.c - $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ + $(call msg,CC,,$@) + $(Q)$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ $(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-32,$(t)))) endif @@ -224,7 +225,8 @@ ifeq ($(CAN_BUILD_X86_64),1) $(BINARIES_64): CFLAGS += -m64 -mxsave $(BINARIES_64): LDLIBS += -lrt -ldl $(BINARIES_64): $(OUTPUT)/%_64: %.c - $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ + $(call msg,CC,,$@) + $(Q)$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ $(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-64,$(t)))) endif From 
04cf82a741e096bf74e77bb8cf11e66481b4dcdf Mon Sep 17 00:00:00 2001 From: Li Wang Date: Wed, 22 Apr 2026 16:04:46 +0800 Subject: [PATCH 002/321] selftests/mm: suppress compiler error in liburing check When building the mm selftests on a system without liburing development headers, check_config.sh leaks a raw compiler error: /tmp/tmp.kIIOIqwe3n.c:2:10: fatal error: liburing.h: No such file or directory 2 | #include <liburing.h> | ^~~~~~~~~~~~ Since this is an expected failure during the configuration probe, redirect the compiler output to /dev/null to hide it. The build system already prints a clear warning when this occurs: Warning: missing liburing support. Some tests will be skipped. Because the user is properly notified about the missing dependency, the raw compiler error is redundant and only confuses users. Additionally, update the Makefile to use $(Q) and $(call msg,...) for the check_config.sh execution. This aligns the probe with standard kbuild output formatting, providing a clean "CHK" message instead of printing the raw command during the build. 
Link: https://lore.kernel.org/20260422080446.26020-3-wangli.ahau@gmail.com Signed-off-by: Li Wang Tested-by: David Hildenbrand (Arm) Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 3 ++- tools/testing/selftests/mm/check_config.sh | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 6195770eba6e..18779045b7f6 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -263,7 +263,8 @@ $(OUTPUT)/migration: LDLIBS += -lnuma $(OUTPUT)/rmap: LDLIBS += -lnuma local_config.mk local_config.h: check_config.sh - CC="$(CC)" CFLAGS="$(CFLAGS)" ./check_config.sh + $(call msg,CHK,config,$@) + $(Q)CC="$(CC)" CFLAGS="$(CFLAGS)" ./check_config.sh EXTRA_CLEAN += local_config.mk local_config.h diff --git a/tools/testing/selftests/mm/check_config.sh b/tools/testing/selftests/mm/check_config.sh index b84c82bbf875..32beaefe279e 100755 --- a/tools/testing/selftests/mm/check_config.sh +++ b/tools/testing/selftests/mm/check_config.sh @@ -16,7 +16,7 @@ echo "#include " > $tmpfile_c echo "#include " >> $tmpfile_c echo "int func(void) { return 0; }" >> $tmpfile_c -$CC $CFLAGS -c $tmpfile_c -o $tmpfile_o +$CC $CFLAGS -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1 if [ -f $tmpfile_o ]; then echo "#define LOCAL_CONFIG_HAVE_LIBURING 1" > $OUTPUT_H_FILE From b001cf7d16dd18f14bd372a8018ecbf48197289d Mon Sep 17 00:00:00 2001 From: Hrushikesh Salunke Date: Wed, 22 Apr 2026 10:26:58 +0000 Subject: [PATCH 003/321] mm/page_alloc: replace kernel_init_pages() with batch page clearing When init_on_alloc is enabled, kernel_init_pages() clears every page one at a time via clear_highpage_kasan_tagged(), which incurs per-page kmap_local_page()/kunmap_local() overhead and prevents the architecture 
clearing primitive from operating on contiguous ranges. Introduce clear_highpages_kasan_tagged() as a static batch clearing helper in page_alloc.c that calls clear_pages() for the full contiguous range on !HIGHMEM systems, bypassing the per-page kmap overhead and allowing a single invocation of the arch clearing primitive across the entire allocation. The HIGHMEM path falls back to per-page clearing since those pages require kmap. Replace kernel_init_pages() with direct calls to the new helper, as it becomes a trivial wrapper. Allocating 8192 x 2MB HugeTLB pages (16GB) with init_on_alloc=1: Before: 0.445s After: 0.166s (-62.7%, 2.68x faster) Kernel time (sys) reduction per workload with init_on_alloc=1: Workload Before After Change Graph500 64C128T 30m 41.8s 15m 14.8s -50.3% Graph500 16C32T 15m 56.7s 9m 43.7s -39.0% Pagerank 32T 1m 58.5s 1m 12.8s -38.5% Pagerank 128T 2m 36.3s 1m 40.4s -35.7% [hsalunke@amd.com: move clear_highpages_kasan_tagged() to page_alloc.c] Link: https://lore.kernel.org/20260504063942.553438-1-hsalunke@amd.com Link: https://lore.kernel.org/20260422102729.166599-1-hsalunke@amd.com Signed-off-by: Hrushikesh Salunke Acked-by: Vlastimil Babka (SUSE) Acked-by: Zi Yan Acked-by: Pankaj Gupta Acked-by: David Hildenbrand (Arm) Acked-by: Lorenzo Stoakes Cc: Ankur Arora Cc: Bharata B Rao Cc: Brendan Jackman Cc: Johannes Weiner Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Shivank Garg Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d49c254174da..bf53242d3db7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1211,14 +1211,18 @@ static inline bool should_skip_kasan_poison(struct page *page) return page_kasan_tag(page) == KASAN_TAG_KERNEL; } -static void kernel_init_pages(struct page *page, int numpages) +static void clear_highpages_kasan_tagged(struct page *page, int numpages) { - int i; - /* 
s390's use of memset() could override KASAN redzones. */ kasan_disable_current(); - for (i = 0; i < numpages; i++) - clear_highpage_kasan_tagged(page + i); + if (!IS_ENABLED(CONFIG_HIGHMEM)) { + clear_pages(kasan_reset_tag(page_address(page)), numpages); + } else { + int i; + + for (i = 0; i < numpages; i++) + clear_highpage_kasan_tagged(page + i); + } kasan_enable_current(); } @@ -1423,7 +1427,7 @@ __always_inline bool __free_pages_prepare(struct page *page, init = false; } if (init) - kernel_init_pages(page, 1 << order); + clear_highpages_kasan_tagged(page, 1 << order); /* * arch_free_page() can make the page's contents inaccessible. s390 @@ -1848,7 +1852,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, } /* If memory is still not initialized, initialize it now. */ if (init) - kernel_init_pages(page, 1 << order); + clear_highpages_kasan_tagged(page, 1 << order); set_page_owner(page, order, gfp_flags); page_table_check_alloc(page, order); From eb92d97f7e6a325607b9c3981131067c0469bfcf Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 23 Apr 2026 09:41:42 +0800 Subject: [PATCH 004/321] Revert "tmpfs: don't enable large folios if not supported" This reverts commit 5a90c155defa684f3a21f68c3f8e40c056e6114c. Currently, when shmem mounts are initialized, they only use 'sbinfo->huge' to determine whether the shmem mount supports large folios. However, for anonymous shmem, whether it supports large folios can be dynamically configured via sysfs interfaces, so setting or not setting mapping_set_large_folios() during initialization cannot accurately reflect whether anonymous shmem actually supports large folios, which has already caused some confusion[1]. Moreover, for tmpfs mounts, relying on 'sbinfo->huge' cannot keep the mapping_set_large_folios() setting consistent across all mappings in the entire tmpfs mount. 
In other words, under the same tmpfs mount, after remount, we might end up with some mappings supporting large folios (calling mapping_set_large_folios()) while others don't. After some investigation, I found that the write performance regression addressed by commit 5a90c155defa has already been fixed by the following commit 665575cff098b ("filemap: move prefaulting out of hot write path"). See the following test data: Base: dd if=/dev/zero of=/mnt/tmpfs/test bs=400K count=10485 (3.2 GB/s) dd if=/dev/zero of=/mnt/tmpfs/test bs=800K count=5242 (3.2 GB/s) dd if=/dev/zero of=/mnt/tmpfs/test bs=1600K count=2621 (3.1 GB/s) dd if=/dev/zero of=/mnt/tmpfs/test bs=2200K count=1906 (3.0 GB/s) dd if=/dev/zero of=/mnt/tmpfs/test bs=3000K count=1398 (3.0 GB/s) dd if=/dev/zero of=/mnt/tmpfs/test bs=4500K count=932 (3.1 GB/s) Base + revert 5a90c155defa: dd if=/dev/zero of=/mnt/tmpfs/test bs=400K count=10485 (3.3 GB/s) dd if=/dev/zero of=/mnt/tmpfs/test bs=800K count=5242 (3.3 GB/s) dd if=/dev/zero of=/mnt/tmpfs/test bs=1600K count=2621 (3.2 GB/s) dd if=/dev/zero of=/mnt/tmpfs/test bs=2200K count=1906 (3.1 GB/s) dd if=/dev/zero of=/mnt/tmpfs/test bs=3000K count=1398 (3.0 GB/s) dd if=/dev/zero of=/mnt/tmpfs/test bs=4500K count=932 (3.1 GB/s) The data is basically consistent with minor fluctuation noise. So we can now safely revert commit 5a90c155defa to set mapping_set_large_folios() for all shmem mounts unconditionally. 
Link: https://lore.kernel.org/b2c7deee259a94b0d00a7c320d8d24d2c421f761.1776908112.git.baolin.wang@linux.alibaba.com Link: https://lore.kernel.org/all/ec927492-4577-4192-8fad-85eb1bb43121@linux.alibaba.com/ [1] Link: https://lore.kernel.org/all/116df9f9-4db7-40d4-a4a4-30a87c0feffa@linux.alibaba.com/ Fixes: 5a90c155defa ("tmpfs: don't enable large folios if not supported") Signed-off-by: Baolin Wang Acked-by: Zi Yan Reviewed-by: Kefeng Wang Reviewed-by: Lance Yang Acked-by: David Hildenbrand (Arm) Acked-by: Lorenzo Stoakes Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 3b5dc21b323c..bab3529af23c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3101,10 +3101,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, cache_no_acl(inode); if (sbinfo->noswap) mapping_set_unevictable(inode->i_mapping); - - /* Don't consider 'deny' for emergencies and 'force' for testing */ - if (sbinfo->huge) - mapping_set_large_folios(inode->i_mapping); + mapping_set_large_folios(inode->i_mapping); switch (mode & S_IFMT) { default: From 085a7acf732f6040fd36b002e6a49c90b76db41c Mon Sep 17 00:00:00 2001 From: "Barry Song (Xiaomi)" Date: Thu, 23 Apr 2026 11:49:17 +0800 Subject: [PATCH 005/321] mm/huge_memory: fix outdated comment about freeing subpages in __folio_split The comment appears to be outdated. add_to_swap() no longer exists, and the explanation of why we need to call put_page() after splitting could be made more general. Link: https://lore.kernel.org/20260423034917.8234-1-baohua@kernel.org Signed-off-by: Barry Song (Xiaomi) Acked-by: David Hildenbrand (Arm) Acked-by: Zi Yan Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: Liam R. 
Howlett Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Lance Yang Cc: Chris Li Cc: Kairui Song Cc: Kemeng Shi Cc: Nhat Pham Cc: Baoquan He Cc: Youngjun Park Signed-off-by: Andrew Morton --- mm/huge_memory.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 970e077019b7..4586f3ccb133 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -4190,11 +4190,10 @@ fail: folio_unlock(new_folio); /* - * Subpages may be freed if there wasn't any mapping - * like if add_to_swap() is running on a lru page that - * had its mapping zapped. And freeing these pages - * requires taking the lru_lock so we do the put_page - * of the tail pages after the split is complete. + * Subpages whose mapping has been zapped may be freed + * earlier, but freeing them requires taking the + * lru_lock, so we defer put_page() on tail pages until + * after the split completes. */ free_folio_and_swap_cache(new_folio); } From 8613803cf5d532316aa886f17066c5e5968ea21e Mon Sep 17 00:00:00 2001 From: Chengkaitao Date: Thu, 23 Apr 2026 18:14:41 +0800 Subject: [PATCH 006/321] mm: convert vmemmap_p?d_populate() to static functions Since the vmemmap_p?d_populate functions are unused outside the mm subsystem, we can remove their external declarations and convert them to static functions. 
Link: https://lore.kernel.org/20260423101441.7089-1-kaitao.cheng@linux.dev Signed-off-by: Chengkaitao Acked-by: David Hildenbrand (arm) Acked-by: Mike Rapoport (Microsoft) Acked-by: Oscar Salvador Cc: David Hildenbrand Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 ------- mm/sparse-vmemmap.c | 10 +++++----- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 06bbe9eba636..e3b6112a8d79 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4860,13 +4860,6 @@ unsigned long section_map_size(void); struct page * __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); -pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); -p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); -pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); -pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); -pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, - struct vmem_altmap *altmap, unsigned long ptpfn, - unsigned long flags); void *vmemmap_alloc_block(unsigned long size, int node); struct vmem_altmap; void *vmemmap_alloc_block_buf(unsigned long size, int node, diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 6eadb9d116e4..3c35d2303a61 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -151,7 +151,7 @@ void __meminit vmemmap_verify(pte_t *pte, int node, start, end - 1); } -pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, +static pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, struct vmem_altmap *altmap, unsigned long ptpfn, unsigned long flags) { @@ -195,7 +195,7 @@ static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node) return p; } -pmd_t * __meminit 
vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) +static pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) { pmd_t *pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { @@ -208,7 +208,7 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) return pmd; } -pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) +static pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) { pud_t *pud = pud_offset(p4d, addr); if (pud_none(*pud)) { @@ -221,7 +221,7 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) return pud; } -p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) +static p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) { p4d_t *p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) { @@ -234,7 +234,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) return p4d; } -pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) +static pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) { pgd_t *pgd = pgd_offset_k(addr); if (pgd_none(*pgd)) { From 4221aadd720bef7df1268391d6eb1ea1f0476b38 Mon Sep 17 00:00:00 2001 From: Bunyod Suvonov Date: Thu, 23 Apr 2026 18:37:53 +0800 Subject: [PATCH 007/321] mm/vmscan: add balance_pgdat begin/end tracepoints Vmscan has six main reclaim entry points: try_to_free_pages() for direct reclaim, try_to_free_mem_cgroup_pages() for memcg reclaim, mem_cgroup_shrink_node() for memcg soft limit reclaim, node_reclaim() for node reclaim, shrink_all_memory() for hibernation reclaim, and balance_pgdat() for kswapd reclaim. All of them, except for shrink_all_memory() and balance_pgdat(), already have begin/end tracepoints. 
This makes it harder to trace which reclaim path is responsible for memory reclaim activity, because kswapd reclaim cannot be identified as cleanly as other reclaim entry points, even though it is the main background reclaim path under memory pressure. There may be no need to trace shrink_all_memory() as it is primarily used during hibernation. So this patch adds the missing tracepoint pair for balance_pgdat(). The begin tracepoint records the node id, requested reclaim order, and the requested classzone bound (highest_zoneidx). The end tracepoint records the node id, the reclaim order that balance_pgdat() finished with, the requested classzone bound, and nr_reclaimed. Together, they show the requested reclaim order and classzone bound, whether reclaim fell back to a lower order, and how much reclaim work was done. The end tracepoint also records highest_zoneidx even though it does not change within a balance_pgdat() invocation. This keeps the end event self-contained, so users can analyze reclaim results directly from end events without depending on begin/end correlation, which is less convenient when tracing is filtered or records are dropped. It also makes it straightforward to relate nr_reclaimed and the final reclaim order to the requested classzone bound. 
Link: https://lore.kernel.org/20260424031418.174597-1-b.suvonov@sjtu.edu.cn Link: https://lore.kernel.org/20260423103753.546582-1-b.suvonov@sjtu.edu.cn Signed-off-by: Bunyod Suvonov Acked-by: Shakeel Butt Cc: Axel Rasmussen Cc: Barry Song Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kairui Song Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Qi Zheng Cc: Steven Rostedt Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/trace/events/vmscan.h | 52 +++++++++++++++++++++++++++++++++++ mm/vmscan.c | 5 ++++ 2 files changed, 57 insertions(+) diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 4445a8d9218d..b4bf7b8def1f 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -96,6 +96,58 @@ TRACE_EVENT(mm_vmscan_kswapd_wake, __entry->order) ); +TRACE_EVENT(mm_vmscan_balance_pgdat_begin, + + TP_PROTO(int nid, int order, int highest_zoneidx), + + TP_ARGS(nid, order, highest_zoneidx), + + TP_STRUCT__entry( + __field(int, nid) + __field(int, order) + __field(int, highest_zoneidx) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->order = order; + __entry->highest_zoneidx = highest_zoneidx; + ), + + TP_printk("nid=%d order=%d highest_zoneidx=%-8s", + __entry->nid, + __entry->order, + __print_symbolic(__entry->highest_zoneidx, ZONE_TYPE)) +); + +TRACE_EVENT(mm_vmscan_balance_pgdat_end, + + TP_PROTO(int nid, int order, int highest_zoneidx, + unsigned long nr_reclaimed), + + TP_ARGS(nid, order, highest_zoneidx, nr_reclaimed), + + TP_STRUCT__entry( + __field(int, nid) + __field(int, order) + __field(int, highest_zoneidx) + __field(unsigned long, nr_reclaimed) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->order = order; + __entry->highest_zoneidx = highest_zoneidx; + __entry->nr_reclaimed = nr_reclaimed; + ), + + TP_printk("nid=%d order=%d highest_zoneidx=%-8s nr_reclaimed=%lu", + __entry->nid, + __entry->order, + __print_symbolic(__entry->highest_zoneidx, 
ZONE_TYPE), + __entry->nr_reclaimed) +); + TRACE_EVENT(mm_vmscan_wakeup_kswapd, TP_PROTO(int nid, int zid, int order, gfp_t gfp_flags), diff --git a/mm/vmscan.c b/mm/vmscan.c index bd1b1aa12581..b2d89ed69d22 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -7121,6 +7121,8 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) .may_unmap = 1, }; + trace_mm_vmscan_balance_pgdat_begin(pgdat->node_id, order, + highest_zoneidx); set_task_reclaim_state(current, &sc.reclaim_state); psi_memstall_enter(&pflags); __fs_reclaim_acquire(_THIS_IP_); @@ -7314,6 +7316,9 @@ out: psi_memstall_leave(&pflags); set_task_reclaim_state(current, NULL); + trace_mm_vmscan_balance_pgdat_end(pgdat->node_id, sc.order, + highest_zoneidx, sc.nr_reclaimed); + /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account. If another caller From 4aa4abf1f14bd6d0748b7d35a803cc2376a8e20b Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 1 Apr 2026 11:16:19 +0100 Subject: [PATCH 008/321] mm/page_alloc: optimize free_contig_range() Patch series "mm: Free contiguous order-0 pages efficiently", v6. A recent change to vmalloc caused some performance benchmark regressions (see [1]). I'm attempting to fix that (and at the same time significantly improve beyond the baseline) by freeing a contiguous set of order-0 pages as a batch. At the same time I observed that free_contig_range() was essentially doing the same thing as vfree() so I've fixed it there too. While at it, optimize the __free_contig_frozen_range() as well. Check that the contiguous range falls in the same section. If they aren't enabled, the if conditions get optimized out by the compiler as memdesc_section() returns 0. See num_pages_contiguous() for more details about it. This patch (of 3): Decompose the range of order-0 pages to be freed into the set of largest possible power-of-2 size and aligned chunks and free them to the pcp or buddy. 
This improves on the previous approach which freed each order-0 page individually in a loop. Testing shows performance to be improved by more than 10x in some cases. Since each page is order-0, we must decrement each page's reference count individually and only consider the page for freeing as part of a high order chunk if the reference count goes to zero. Additionally free_pages_prepare() must be called for each individual order-0 page too, so that the struct page state and global accounting state can be appropriately managed. But once this is done, the resulting high order chunks can be freed as a unit to the pcp or buddy. This significantly speeds up the free operation but also has the side benefit that high order blocks are added to the pcp instead of each page ending up on the pcp order-0 list; memory remains more readily available in high orders. vmalloc will shortly become a user of this new optimized free_contig_range() since it aggressively allocates high order non-compound pages, but then calls split_page() to end up with contiguous order-0 pages. These can now be freed much more efficiently. The execution time of the following function was measured in a server class arm64 machine: static int page_alloc_high_order_test(void) { unsigned int order = HPAGE_PMD_ORDER; struct page *page; int i; for (i = 0; i < 100000; i++) { page = alloc_pages(GFP_KERNEL, order); if (!page) return -1; split_page(page, order); free_contig_range(page_to_pfn(page), 1UL << order); } return 0; } Execution time before: 4097358 usec Execution time after: 729831 usec Perf trace before: 99.63% 0.00% kthreadd [kernel.kallsyms] [.] 
kthread | ---kthread 0xffffb33c12a26af8 | |--98.13%--0xffffb33c12a26060 | | | |--97.37%--free_contig_range | | | | | |--94.93%--___free_pages | | | | | | | |--55.42%--__free_frozen_pages | | | | | | | | | --43.20%--free_frozen_page_commit | | | | | | | | | --35.37%--_raw_spin_unlock_irqrestore | | | | | | | |--11.53%--_raw_spin_trylock | | | | | | | |--8.19%--__preempt_count_dec_and_test | | | | | | | |--5.64%--_raw_spin_unlock | | | | | | | |--2.37%--__get_pfnblock_flags_mask.isra.0 | | | | | | | --1.07%--free_frozen_page_commit | | | | | --1.54%--__free_frozen_pages | | | --0.77%--___free_pages | --0.98%--0xffffb33c12a26078 alloc_pages_noprof Perf trace after: 8.42% 2.90% kthreadd [kernel.kallsyms] [k] __free_contig_range | |--5.52%--__free_contig_range | | | |--5.00%--free_prepared_contig_range | | | | | |--1.43%--__free_frozen_pages | | | | | | | --0.51%--free_frozen_page_commit | | | | | |--1.08%--_raw_spin_trylock | | | | | --0.89%--_raw_spin_unlock | | | --0.52%--free_pages_prepare | --2.90%--ret_from_fork kthread 0xffffae1c12abeaf8 0xffffae1c12abe7a0 | --2.69%--vfree __free_contig_range Link: https://lore.kernel.org/20260401101634.2868165-1-usama.anjum@arm.com Link: https://lore.kernel.org/20260401101634.2868165-2-usama.anjum@arm.com Link: https://lore.kernel.org/all/66919a28-bc81-49c9-b68f-dd7c73395a0d@arm.com [1] Signed-off-by: Ryan Roberts Co-developed-by: Muhammad Usama Anjum Signed-off-by: Muhammad Usama Anjum Acked-by: David Hildenbrand (Arm) Acked-by: Vlastimil Babka (SUSE) Reviewed-by: Zi Yan Cc: Brendan Jackman Cc: David Sterba Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Nick Terrell Cc: Suren Baghdasaryan Cc: "Uladzislau Rezki (Sony)" Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/gfp.h | 2 + mm/page_alloc.c | 112 ++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 110 insertions(+), 4 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 
51ef13ed756e..87259e309dee 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -467,6 +467,8 @@ void free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages); void free_contig_range(unsigned long pfn, unsigned long nr_pages); #endif +void __free_contig_range(unsigned long pfn, unsigned long nr_pages); + DEFINE_FREE(free_page, void *, free_page((unsigned long)_T)) #endif /* __LINUX_GFP_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bf53242d3db7..9d4fb1ea084a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -90,6 +90,9 @@ typedef int __bitwise fpi_t; /* Free the page without taking locks. Rely on trylock only. */ #define FPI_TRYLOCK ((__force fpi_t)BIT(2)) +/* free_pages_prepare() has already been called for page(s) being freed. */ +#define FPI_PREPARED ((__force fpi_t)BIT(3)) + /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) @@ -1307,8 +1310,8 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) #endif /* CONFIG_MEM_ALLOC_PROFILING */ -__always_inline bool __free_pages_prepare(struct page *page, - unsigned int order, fpi_t fpi_flags) +static __always_inline bool __free_pages_prepare(struct page *page, + unsigned int order, fpi_t fpi_flags) { int bad = 0; bool skip_kasan_poison = should_skip_kasan_poison(page); @@ -1316,6 +1319,9 @@ __always_inline bool __free_pages_prepare(struct page *page, bool compound = PageCompound(page); struct folio *folio = page_folio(page); + if (fpi_flags & FPI_PREPARED) + return true; + VM_BUG_ON_PAGE(PageTail(page), page); trace_mm_page_free(page, order); @@ -6762,6 +6768,105 @@ void __init page_alloc_sysctl_init(void) register_sysctl_init("vm", page_alloc_sysctl_table); } +static void free_prepared_contig_range(struct page *page, + unsigned long nr_pages) +{ + unsigned long pfn = page_to_pfn(page); + + while (nr_pages) { + unsigned int order; + + /* We are 
limited by the largest buddy order. */ + order = pfn ? __ffs(pfn) : MAX_PAGE_ORDER; + /* Don't exceed the number of pages to free. */ + order = min_t(unsigned int, order, ilog2(nr_pages)); + order = min_t(unsigned int, order, MAX_PAGE_ORDER); + + /* + * Free the chunk as a single block. Our caller has already + * called free_pages_prepare() for each order-0 page. + */ + __free_frozen_pages(page, order, FPI_PREPARED); + + pfn += 1UL << order; + page += 1UL << order; + nr_pages -= 1UL << order; + } +} + +static void __free_contig_range_common(unsigned long pfn, unsigned long nr_pages, + bool is_frozen) +{ + struct page *page, *start = NULL; + unsigned long nr_start = 0; + unsigned long start_sec; + unsigned long i; + + for (i = 0; i < nr_pages; i++) { + bool can_free = true; + + /* + * Contiguous PFNs might not have contiguous "struct pages" + * in some kernel configs: page++ across a section boundary + * is undefined. Use pfn_to_page() for each PFN. + */ + page = pfn_to_page(pfn + i); + + VM_WARN_ON_ONCE(PageHead(page)); + VM_WARN_ON_ONCE(PageTail(page)); + + if (!is_frozen) + can_free = put_page_testzero(page); + + if (can_free) + can_free = free_pages_prepare(page, 0); + + if (!can_free) { + if (start) { + free_prepared_contig_range(start, i - nr_start); + start = NULL; + } + continue; + } + + if (start && memdesc_section(page->flags) != start_sec) { + free_prepared_contig_range(start, i - nr_start); + start = page; + nr_start = i; + start_sec = memdesc_section(page->flags); + } else if (!start) { + start = page; + nr_start = i; + start_sec = memdesc_section(page->flags); + } + } + + if (start) + free_prepared_contig_range(start, nr_pages - nr_start); +} + +/** + * __free_contig_range - Free contiguous range of order-0 pages. + * @pfn: Page frame number of the first page in the range. + * @nr_pages: Number of pages to free. + * + * For each order-0 struct page in the physically contiguous range, put a + * reference. 
Free any page whose reference count falls to zero. The + * implementation is functionally equivalent to, but significantly faster than + * calling __free_page() for each struct page in a loop. + * + * Memory allocated with alloc_pages(order>=1) then subsequently split to + * order-0 with split_page() is an example of appropriate contiguous pages that + * can be freed with this API. + * + * Context: May be called in interrupt context or while holding a normal + * spinlock, but not in NMI context or while holding a raw spinlock. + */ +void __free_contig_range(unsigned long pfn, unsigned long nr_pages) +{ + __free_contig_range_common(pfn, nr_pages, /* is_frozen= */ false); +} + #ifdef CONFIG_CONTIG_ALLOC /* Usage: See admin-guide/dynamic-debug-howto.rst */ static void alloc_contig_dump_pages(struct list_head *page_list) @@ -7308,8 +7413,7 @@ void free_contig_range(unsigned long pfn, unsigned long nr_pages) if (WARN_ON_ONCE(PageHead(pfn_to_page(pfn)))) return; - for (; nr_pages--; pfn++) - __free_page(pfn_to_page(pfn)); + __free_contig_range(pfn, nr_pages); } EXPORT_SYMBOL(free_contig_range); #endif /* CONFIG_CONTIG_ALLOC */ From 60ced5818f64ac356620d1ad3e0d473c457dbf5b Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 1 Apr 2026 11:16:20 +0100 Subject: [PATCH 009/321] vmalloc: optimize vfree with free_pages_bulk() Whenever vmalloc allocates high order pages (e.g. for a huge mapping) it must immediately split_page() to order-0 so that it remains compatible with users that want to access the underlying struct page. Commit a06157804399 ("mm/vmalloc: request large order pages from buddy allocator") recently made it much more likely for vmalloc to allocate high order pages which are subsequently split to order-0. Unfortunately this had the side effect of causing performance regressions for tight vmalloc/vfree loops (e.g. test_vmalloc.ko benchmarks). See Closes: tag. 
This happens because the high order pages must be obtained from the buddy allocator, but because they are then split to order-0, they are freed to the order-0 pcp. Previously allocation was for order-0 pages so they were recycled from the pcp. It would be preferable if, when vmalloc allocates an (e.g.) order-3 page, it also freed that order-3 page to the order-3 pcp; then the regression could be removed. So let's do exactly that; update stats separately first as coalescing is hard to do correctly without complexity. Use free_pages_bulk() which uses the new __free_contig_range() API to batch-free contiguous ranges of pfns. This not only removes the regression, but significantly improves performance of vfree beyond the baseline. Below is a selection of test_vmalloc benchmarks run on an arm64 server class system. mm-new is the baseline. Commit a06157804399 ("mm/vmalloc: request large order pages from buddy allocator") was added in v6.19-rc1 where we see regressions. Then with this change performance is much better.
(>0 is faster, <0 is slower, (R)/(I) = statistically significant Regression/Improvement): +-----------------+----------------------------------------------------------+-------------------+--------------------+ | Benchmark | Result Class | mm-new | this series | +=================+==========================================================+===================+====================+ | micromm/vmalloc | fix_align_alloc_test: p:1, h:0, l:500000 (usec) | 1331843.33 | (I) 67.17% | | | fix_size_alloc_test: p:1, h:0, l:500000 (usec) | 415907.33 | -5.14% | | | fix_size_alloc_test: p:4, h:0, l:500000 (usec) | 755448.00 | (I) 53.55% | | | fix_size_alloc_test: p:16, h:0, l:500000 (usec) | 1591331.33 | (I) 57.26% | | | fix_size_alloc_test: p:16, h:1, l:500000 (usec) | 1594345.67 | (I) 68.46% | | | fix_size_alloc_test: p:64, h:0, l:100000 (usec) | 1071826.00 | (I) 79.27% | | | fix_size_alloc_test: p:64, h:1, l:100000 (usec) | 1018385.00 | (I) 84.17% | | | fix_size_alloc_test: p:256, h:0, l:100000 (usec) | 3970899.67 | (I) 77.01% | | | fix_size_alloc_test: p:256, h:1, l:100000 (usec) | 3821788.67 | (I) 89.44% | | | fix_size_alloc_test: p:512, h:0, l:100000 (usec) | 7795968.00 | (I) 82.67% | | | fix_size_alloc_test: p:512, h:1, l:100000 (usec) | 6530169.67 | (I) 118.09% | | | full_fit_alloc_test: p:1, h:0, l:500000 (usec) | 626808.33 | -0.98% | | | kvfree_rcu_1_arg_vmalloc_test: p:1, h:0, l:500000 (usec) | 532145.67 | -1.68% | | | kvfree_rcu_2_arg_vmalloc_test: p:1, h:0, l:500000 (usec) | 537032.67 | -0.96% | | | long_busy_list_alloc_test: p:1, h:0, l:500000 (usec) | 8805069.00 | (I) 74.58% | | | pcpu_alloc_test: p:1, h:0, l:500000 (usec) | 500824.67 | 4.35% | | | random_size_align_alloc_test: p:1, h:0, l:500000 (usec) | 1637554.67 | (I) 76.99% | | | random_size_alloc_test: p:1, h:0, l:500000 (usec) | 4556288.67 | (I) 72.23% | | | vm_map_ram_test: p:1, h:0, l:500000 (usec) | 107371.00 | -0.70% | 
+-----------------+----------------------------------------------------------+-------------------+--------------------+ Link: https://lore.kernel.org/20260401101634.2868165-3-usama.anjum@arm.com Fixes: a06157804399 ("mm/vmalloc: request large order pages from buddy allocator") Closes: https://lore.kernel.org/all/66919a28-bc81-49c9-b68f-dd7c73395a0d@arm.com/ Signed-off-by: Ryan Roberts Co-developed-by: Muhammad Usama Anjum Signed-off-by: Muhammad Usama Anjum Acked-by: Vlastimil Babka (SUSE) Acked-by: Zi Yan Acked-by: David Hildenbrand (Arm) Reviewed-by: Uladzislau Rezki (Sony) Cc: Brendan Jackman Cc: David Sterba Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Nick Terrell Cc: Suren Baghdasaryan Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/gfp.h | 2 ++ mm/page_alloc.c | 28 ++++++++++++++++++++++++++++ mm/vmalloc.c | 16 +++++----------- 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 87259e309dee..cdf95a9f0b87 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -239,6 +239,8 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, struct page **page_array); #define __alloc_pages_bulk(...) alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__)) +void free_pages_bulk(struct page **page_array, unsigned long nr_pages); + unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9d4fb1ea084a..91bef811a771 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5189,6 +5189,34 @@ failed: } EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof); +/* + * free_pages_bulk - Free an array of order-0 pages + * @page_array: Array of pages to free + * @nr_pages: The number of pages in the array + * + * Free the order-0 pages. Adjacent entries whose PFNs form a contiguous + * run are released with a single __free_contig_range() call. 
+ * + * This assumes page_array is sorted in ascending PFN order. Without that, + * the function still frees all pages, but contiguous runs may not be + * detected and the freeing pattern can degrade to freeing one page at a + * time. + * + * Context: Sleepable process context only; calls cond_resched() + */ +void free_pages_bulk(struct page **page_array, unsigned long nr_pages) +{ + while (nr_pages) { + unsigned long nr_contig = num_pages_contiguous(page_array, nr_pages); + + __free_contig_range(page_to_pfn(*page_array), nr_contig); + + nr_pages -= nr_contig; + page_array += nr_contig; + cond_resched(); + } +} + /* * This is the 'heart' of the zoned buddy allocator. */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index bb6ae08d18f5..99fce4f9f6e4 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3459,19 +3459,13 @@ void vfree(const void *addr) if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS)) vm_reset_perms(vm); - for (i = 0; i < vm->nr_pages; i++) { - struct page *page = vm->pages[i]; - BUG_ON(!page); - /* - * High-order allocs for huge vmallocs are split, so - * can be freed as an array of order-0 allocations - */ - if (!(vm->flags & VM_MAP_PUT_PAGES)) - mod_lruvec_page_state(page, NR_VMALLOC, -1); - __free_page(page); - cond_resched(); + if (!(vm->flags & VM_MAP_PUT_PAGES)) { + for (i = 0; i < vm->nr_pages; i++) + mod_lruvec_page_state(vm->pages[i], NR_VMALLOC, -1); } + free_pages_bulk(vm->pages, vm->nr_pages); + kvfree(vm->pages); kfree(vm); } From b971e47fd98f97d66ab3b1c0864916d844fa0104 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 1 Apr 2026 11:16:21 +0100 Subject: [PATCH 010/321] mm/page_alloc: optimize __free_contig_frozen_range() Apply the same batch-freeing optimization from free_contig_range() to the frozen page path. 
The previous __free_contig_frozen_range() freed each order-0 page individually via free_frozen_pages(), which is slow for the same reason the old free_contig_range() was: each page goes to the order-0 pcp list rather than being coalesced into higher-order blocks. Rewrite __free_contig_frozen_range() to call free_pages_prepare() for each order-0 page, then batch the prepared pages into the largest possible power-of-2 aligned chunks via free_prepared_contig_range(). If free_pages_prepare() fails (e.g. HWPoison, bad page) the page is deliberately not freed; it should not be returned to the allocator. I've tested CMA through debugfs. The test allocates 16384 pages per allocation for several iterations. There is 3.5x improvement. Before: 1406 usec per iteration After: 402 usec per iteration Before: 70.89% 0.69% cma [kernel.kallsyms] [.] free_contig_frozen_range | |--70.20%--free_contig_frozen_range | | | |--46.41%--__free_frozen_pages | | | | | --36.18%--free_frozen_page_commit | | | | | --29.63%--_raw_spin_unlock_irqrestore | | | |--8.76%--_raw_spin_trylock | | | |--7.03%--__preempt_count_dec_and_test | | | |--4.57%--_raw_spin_unlock | | | |--1.96%--__get_pfnblock_flags_mask.isra.0 | | | --1.15%--free_frozen_page_commit | --0.69%--el0t_64_sync After: 23.57% 0.00% cma [kernel.kallsyms] [.] 
free_contig_frozen_range | ---free_contig_frozen_range | |--20.45%--__free_contig_frozen_range | | | |--17.77%--free_pages_prepare | | | --0.72%--free_prepared_contig_range | | | --0.55%--__free_frozen_pages | --3.12%--free_pages_prepare Link: https://lore.kernel.org/20260401101634.2868165-4-usama.anjum@arm.com Signed-off-by: Muhammad Usama Anjum Acked-by: David Hildenbrand (Arm) Acked-by: Vlastimil Babka (SUSE) Reviewed-by: Zi Yan Suggested-by: Zi Yan Cc: Brendan Jackman Cc: David Sterba Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Nick Terrell Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: "Uladzislau Rezki (Sony)" Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/page_alloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 91bef811a771..a81ae5781036 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7032,8 +7032,7 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) static void __free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages) { - for (; nr_pages--; pfn++) - free_frozen_pages(pfn_to_page(pfn), 0); + __free_contig_range_common(pfn, nr_pages, /* is_frozen= */ true); } /** From 5c5bc5e326fe4bcfe1c6f5c69a0b8df809bdc2e4 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 21 Apr 2026 07:17:54 +0200 Subject: [PATCH 011/321] mm/gup: cleanup pgtable entry accessors PMD and PUD entries revalidation has the same semantics as PTE entry revalidation. Convert the remaining direct entry dereferences to the corresponding accessors. The PTE validation in gup_fast_pte_range() is inconsistent with the prior value acquisition in the sense that it drops the lockless access semantics. Use the lockless accessor not only for the PTE, but also for the PMD validation, which is likewise inconsistent with the prior value acquisition in gup_fast_pmd_range(). 
Link: https://lore.kernel.org/20260421051754.1691221-1-agordeev@linux.ibm.com Signed-off-by: Alexander Gordeev Acked-by: David Hildenbrand (Arm) Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kevin Brodsky Cc: Peter Xu Cc: Ryan Roberts Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- mm/gup.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index ad9ded39609c..0692119b7904 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2865,8 +2865,8 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, if (!folio) goto pte_unmap; - if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) || - unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { + if (unlikely(pmd_val(pmd) != pmd_val(pmdp_get_lockless(pmdp))) || + unlikely(pte_val(pte) != pte_val(ptep_get_lockless(ptep)))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } @@ -2942,7 +2942,7 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, if (!folio) return 0; - if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { + if (unlikely(pmd_val(orig) != pmd_val(pmdp_get_lockless(pmdp)))) { gup_put_folio(folio, refs, flags); return 0; } @@ -2985,7 +2985,7 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, if (!folio) return 0; - if (unlikely(pud_val(orig) != pud_val(*pudp))) { + if (unlikely(pud_val(orig) != pud_val(pudp_get(pudp)))) { gup_put_folio(folio, refs, flags); return 0; } From 214f9ab72ce6e16120c20ad670389656f059e685 Mon Sep 17 00:00:00 2001 From: Aditya Sharma Date: Fri, 24 Apr 2026 14:52:17 +0530 Subject: [PATCH 012/321] mm/memory: update stale locking comments for fault handlers Update the comments for wp_page_copy(), do_wp_page(), do_swap_page(), do_anonymous_page(), __do_fault(), do_fault(), handle_pte_fault(), __handle_mm_fault(), and handle_mm_fault() to concisely clarify that they can be entered holding either the mmap_lock or the VMA lock, and that the lock may be released upon returning 
VM_FAULT_RETRY. Additionally, make the following corrections: - In do_anonymous_page(), correct the outdated claim that the function is entered with the PTE "mapped but not yet locked". Since handle_pte_fault() unmaps the empty PTE before routing to do_pte_missing(), the comment now correctly states it is entered with the PTE unmapped and unlocked. - In __do_fault(), update the stale reference from __lock_page_retry() to __folio_lock_or_retry(). Link: https://lore.kernel.org/20260424092217.263648-1-adi.sharma@zohomail.in Signed-off-by: Aditya Sharma Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 55 ++++++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 86a973119bd4..02ec74a1273f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3837,8 +3837,8 @@ vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf) * Handle the case of a page which we actually need to copy to a new page, * either due to COW or unsharing. * - * Called with mmap_lock locked and the old page referenced, but - * without the ptl held. + * Called with either the VMA lock or the mmap_lock held (see FAULT_FLAG_VMA_LOCK) + * and the old page referenced, but without the ptl held. * * High level logic flow: * @@ -4237,9 +4237,9 @@ static bool wp_can_reuse_anon_folio(struct folio *folio, * though the page will change only once the write actually happens. This * avoids a few races, and potentially makes it more efficient. * - * We enter with non-exclusive mmap_lock (to exclude vma changes, - * but allow concurrent faults), with pte both mapped and locked. - * We return with mmap_lock still held, but pte unmapped and unlocked. + * We enter with either the VMA lock or the mmap_lock held (see + * FAULT_FLAG_VMA_LOCK) and pte both mapped and locked. 
We return with + * the same lock still held, but pte unmapped and unlocked. */ static vm_fault_t do_wp_page(struct vm_fault *vmf) __releases(vmf->ptl) @@ -4785,12 +4785,12 @@ static void check_swap_exclusive(struct folio *folio, swp_entry_t entry, } /* - * We enter with non-exclusive mmap_lock (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. + * We enter with either the VMA lock or the mmap_lock held (see + * FAULT_FLAG_VMA_LOCK), and pte mapped but not yet locked. * We return with pte unmapped and unlocked. * - * We return with the mmap_lock locked or unlocked in the same cases - * as does filemap_fault(). + * When returning, the lock may have been released in the same cases + * as done by filemap_fault(). */ vm_fault_t do_swap_page(struct vm_fault *vmf) { @@ -5330,9 +5330,10 @@ static void map_anon_folio_pte_pf(struct folio *folio, pte_t *pte, } /* - * We enter with non-exclusive mmap_lock (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_lock still held, but pte unmapped and unlocked. + * We enter with either the VMA lock or the mmap_lock held (see + * FAULT_FLAG_VMA_LOCK), and pte unmapped and unlocked. + * We return with the lock still held, but pte unmapped and unlocked. + * If VM_FAULT_RETRY is returned, the lock may have been released. */ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { @@ -5440,9 +5441,10 @@ oom: } /* - * The mmap_lock must have been held on entry, and may have been - * released depending on flags and vma->vm_ops->fault() return value. - * See filemap_fault() and __lock_page_retry(). + * Either the VMA lock or the mmap_lock must have been held on entry + * (see FAULT_FLAG_VMA_LOCK) and may have been released depending on + * flags and vma->vm_ops->fault() return value. + * See filemap_fault() and __folio_lock_or_retry(). 
*/ static vm_fault_t __do_fault(struct vm_fault *vmf) { @@ -6003,11 +6005,11 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) } /* - * We enter with non-exclusive mmap_lock (to exclude vma changes, - * but allow concurrent faults). - * The mmap_lock may have been released depending on flags and our + * We enter with either the VMA lock or the mmap_lock held (see + * FAULT_FLAG_VMA_LOCK). + * The lock may have been released depending on flags and our * return value. See filemap_fault() and __folio_lock_or_retry(). - * If mmap_lock is released, vma may become invalid (for example + * If the lock is released, vma may become invalid (for example * by other thread calling munmap()). */ static vm_fault_t do_fault(struct vm_fault *vmf) @@ -6374,10 +6376,11 @@ static void fix_spurious_fault(struct vm_fault *vmf, * with external mmu caches can use to update those (ie the Sparc or * PowerPC hashed page tables that act as extended TLBs). * - * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow - * concurrent faults). + * On entry, we hold either the VMA lock or the mmap_lock + * (see FAULT_FLAG_VMA_LOCK). * - * The mmap_lock may have been released depending on flags and our return value. + * The mmap_lock or VMA lock may have been released depending on flags + * and our return value. * See filemap_fault() and __folio_lock_or_retry(). */ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) @@ -6458,8 +6461,8 @@ unlock: /* * On entry, we hold either the VMA lock or the mmap_lock - * (FAULT_FLAG_VMA_LOCK tells you which). If VM_FAULT_RETRY is set in - * the result, the mmap_lock is not held on exit. See filemap_fault() + * (see FAULT_FLAG_VMA_LOCK). If VM_FAULT_RETRY is set in + * the result, the lock is not held on exit. See filemap_fault() * and __folio_lock_or_retry(). 
*/ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, @@ -6691,9 +6694,9 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma, /* * By the time we get here, we already hold either the VMA lock or the - * mmap_lock (FAULT_FLAG_VMA_LOCK tells you which). + * mmap_lock (see FAULT_FLAG_VMA_LOCK). * - * The mmap_lock may have been released depending on flags and our + * The lock may have been released depending on flags and our * return value. See filemap_fault() and __folio_lock_or_retry(). */ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, From 5a2d162e22bf33eb89d53e802d0fc1ec422e19b6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 27 Apr 2026 21:29:40 -0700 Subject: [PATCH 013/321] mm/damon/core: make charge_addr_from aware of end-address exclusivity A DAMON region's end address is exclusive, but charge_addr_from is assigned assuming the end address is inclusive. As a result, the DAMOS action can be skipped for up to min_region_sz of the memory that follows. The user impact is quite negligible. But a bug is a bug, and this one can be fixed very simply. Fix the wrong assignment to respect the exclusiveness of the address. The issue was discovered [1] by Sashiko.
Link: https://lore.kernel.org/20260428042942.118230-1-sj@kernel.org Link: https://lore.kernel.org/20260428032324.115663-1-sj@kernel.org [1] Fixes: 50585192bc2e ("mm/damon/schemes: skip already charged targets and regions") Signed-off-by: SeongJae Park Cc: # 5.16.x Signed-off-by: Andrew Morton --- mm/damon/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 3dbbbfdeff71..901ffdaefb7f 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2106,7 +2106,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, if (damos_quota_is_set(quota) && quota->charged_sz >= quota->esz) { quota->charge_target_from = t; - quota->charge_addr_from = r->ar.end + 1; + quota->charge_addr_from = r->ar.end; } } if (s->action != DAMOS_STAT) From 9138e27a3bc380cd88475546688f23d5eda1ad23 Mon Sep 17 00:00:00 2001 From: Ravi Jonnalagadda Date: Mon, 27 Apr 2026 20:05:20 -0700 Subject: [PATCH 014/321] mm/damon: add node_eligible_mem_bp goal metric Background and Motivation ========================= In heterogeneous memory systems, controlling memory distribution across NUMA nodes is essential for performance optimization. This patch enables system-wide page distribution with target-state goals such as "maintain 60% of scheme-eligible memory on DRAM" using PA-mode DAMON schemes. Rather than using absolute thresholds, this metric tracks the ratio of memory that matches each scheme's access pattern filters on a target node, enabling the quota system to automatically adjust migration aggressiveness to maintain the desired distribution. 
What This Metric Measures ========================= node_eligible_mem_bp: scheme_eligible_bytes_on_node / total_scheme_eligible_bytes * 10000 Two-Scheme Setup for Hot Page Distribution ========================================== For maintaining 60% of hot memory on DRAM (node 0) and 40% on CXL (node 1): PULL scheme: migrate_hot to node 0 goal: node_eligible_mem_bp, nid=0, target=6000 addr filter: node 1 address range (only migrate FROM CXL) "Move hot pages to DRAM if less than 60% of hot data is in DRAM" PUSH scheme: migrate_hot to node 1 goal: node_eligible_mem_bp, nid=1, target=4000 addr filter: node 0 address range (only migrate FROM DRAM) "Move hot pages to CXL if less than 40% of hot data is in CXL" Each scheme independently measures its own eligible memory and adjusts its quota to achieve its target ratio. The schemes work in concert through DAMON's unified monitoring context, with the quota autotuner balancing their relative aggressiveness. Implementation Details ====================== The implementation adds a new quota goal metric type DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP to the existing DAMOS quota goal framework. When this metric is configured for a scheme: 1. During each quota adjustment cycle, damos_get_node_eligible_mem_bp() is called to calculate the current memory distribution. 2. The function iterates through all regions that match the scheme's access pattern (via __damos_valid_target()) and calculates: - Total eligible bytes across all nodes - Eligible bytes specifically on the target node (goal->nid) 3. For each eligible region, damos_calc_eligible_bytes() walks through the physical address range, using damon_get_folio() to look up each folio and determine its NUMA node via folio_nid(). 4. Large folios are handled by calculating the exact overlap between the region boundaries and folio boundaries, ensuring accurate byte counts even when regions partially span folios. 5. 
The ratio (node_eligible / total_eligible * 10000) is returned as basis points, which the quota autotuner uses to adjust the scheme's effective quota size (esz). The implementation requires CONFIG_DAMON_PADDR since damon_get_folio() is only available for physical address space monitoring. Testing Results =============== Functionally tested on a two-node heterogeneous memory system with DRAM (node 0) and CXL memory (node 1). A PUSH+PULL scheme configuration using migrate_hot actions was used to reach a target hot memory ratio between the two tiers. With the TEMPORAL tuner, the system converges quickly to the target distribution. The tuner drives esz to maximum when under goal and to zero once the goal is met, forming a simple on/off feedback loop that stabilizes at the desired ratio. With the CONSIST tuner, the scheme still converges but more slowly, as it migrates and then throttles itself based on quota feedback. The time to reach the goal varies depending on workload intensity. Note: This metric works with both TEMPORAL and CONSIST goal tuners. Link: https://lore.kernel.org/20260428030520.701-1-ravis.opensrc@gmail.com Signed-off-by: Ravi Jonnalagadda Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Cc: Honggyu Kim Cc: Jonathan Corbet Cc: Yunjeong Mun Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 + mm/damon/core.c | 172 +++++++++++++++++++++++++++++++++++---- mm/damon/sysfs-schemes.c | 7 ++ 3 files changed, 167 insertions(+), 15 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index f2cdb7c3f5e6..986b8c902585 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -159,6 +159,8 @@ enum damos_action { * @DAMOS_QUOTA_NODE_MEMCG_FREE_BP: MemFree ratio of a node for a cgroup. * @DAMOS_QUOTA_ACTIVE_MEM_BP: Active to total LRU memory ratio. * @DAMOS_QUOTA_INACTIVE_MEM_BP: Inactive to total LRU memory ratio. + * @DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP: Scheme-eligible memory ratio of a + * node in basis points (0-10000). 
* @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics. * * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported. @@ -172,6 +174,7 @@ enum damos_quota_goal_metric { DAMOS_QUOTA_NODE_MEMCG_FREE_BP, DAMOS_QUOTA_ACTIVE_MEM_BP, DAMOS_QUOTA_INACTIVE_MEM_BP, + DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP, NR_DAMOS_QUOTA_GOAL_METRICS, }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 901ffdaefb7f..e4229294353e 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -13,10 +13,14 @@ #include #include #include +#include #include #include #include +/* for damon_get_folio() used by node eligible memory metrics */ +#include "ops-common.h" + #define CREATE_TRACE_POINTS #include @@ -1326,11 +1330,26 @@ static int damon_commit_targets( int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) { int err; + struct damos *scheme; + struct damos_quota_goal *goal; dst->maybe_corrupted = true; if (!is_power_of_2(src->min_region_sz)) return -EINVAL; + /* node_eligible_mem_bp metric requires PADDR ops */ + if (src->ops.id != DAMON_OPS_PADDR) { + damon_for_each_scheme(scheme, src) { + struct damos_quota *quota = &scheme->quota; + + damos_for_each_quota_goal(goal, quota) { + if (goal->metric == + DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP) + return -EINVAL; + } + } + } + err = damon_commit_schemes(dst, src); if (err) return err; @@ -2287,7 +2306,115 @@ static unsigned long damos_get_node_memcg_used_bp( numerator = i.totalram - used_pages; return mult_frac(numerator, 10000, i.totalram); } -#else + +#ifdef CONFIG_DAMON_PADDR +/* + * damos_calc_eligible_bytes() - Calculate raw eligible bytes per node. + * @c: The DAMON context. + * @s: The scheme. + * @nid: The target NUMA node id. + * @total: Output for total eligible bytes across all nodes. + * + * Iterates through each folio in eligible regions to accurately determine + * which node the memory resides on. Returns eligible bytes on the specified + * node and sets *total to the sum across all nodes. 
+ * + * Note: This function requires damon_get_folio() from ops-common.c, which is + * only available when CONFIG_DAMON_PADDR is enabled. It also requires the + * context to be using PADDR operations for meaningful results. + */ +static phys_addr_t damos_calc_eligible_bytes(struct damon_ctx *c, + struct damos *s, int nid, phys_addr_t *total) +{ + struct damon_target *t; + struct damon_region *r; + phys_addr_t total_eligible = 0; + phys_addr_t node_eligible = 0; + + damon_for_each_target(t, c) { + damon_for_each_region(r, t) { + phys_addr_t addr, end_addr; + + if (!__damos_valid_target(r, s)) + continue; + + /* Convert from core address units to physical bytes */ + addr = (phys_addr_t)r->ar.start * c->addr_unit; + end_addr = (phys_addr_t)r->ar.end * c->addr_unit; + while (addr < end_addr) { + struct folio *folio; + phys_addr_t folio_start, folio_end; + phys_addr_t overlap_start, overlap_end; + phys_addr_t counted; + + folio = damon_get_folio(PHYS_PFN(addr)); + if (!folio) { + addr = PAGE_ALIGN_DOWN(addr + + PAGE_SIZE); + if (!addr) + break; + continue; + } + + /* + * Calculate exact overlap between the region + * [addr, end_addr) and the folio range. + * The folio may start before addr if addr is + * in the middle of a large folio. 
+ */ + folio_start = PFN_PHYS(folio_pfn(folio)); + folio_end = folio_start + folio_size(folio); + + overlap_start = max(addr, folio_start); + overlap_end = min(end_addr, folio_end); + + if (overlap_end > overlap_start) { + counted = overlap_end - overlap_start; + total_eligible += counted; + if (folio_nid(folio) == nid) + node_eligible += counted; + } + + /* Advance past the entire folio */ + addr = folio_end; + folio_put(folio); + } + cond_resched(); + } + } + + *total = total_eligible; + return node_eligible; +} + +static unsigned long damos_get_node_eligible_mem_bp(struct damon_ctx *c, + struct damos *s, int nid) +{ + phys_addr_t total_eligible = 0; + phys_addr_t node_eligible; + + if (c->ops.id != DAMON_OPS_PADDR) + return 0; + + if (nid < 0 || nid >= MAX_NUMNODES || !node_online(nid)) + return 0; + + node_eligible = damos_calc_eligible_bytes(c, s, nid, &total_eligible); + + if (!(unsigned long)total_eligible) + return 0; + + return mult_frac((unsigned long)node_eligible, 10000, + (unsigned long)total_eligible); +} +#else /* CONFIG_DAMON_PADDR */ +static unsigned long damos_get_node_eligible_mem_bp(struct damon_ctx *c, + struct damos *s, int nid) +{ + return 0; +} +#endif /* CONFIG_DAMON_PADDR */ +#else /* CONFIG_NUMA */ static __kernel_ulong_t damos_get_node_mem_bp( struct damos_quota_goal *goal) { @@ -2299,7 +2426,13 @@ static unsigned long damos_get_node_memcg_used_bp( { return 0; } -#endif + +static unsigned long damos_get_node_eligible_mem_bp(struct damon_ctx *c, + struct damos *s, int nid) +{ + return 0; +} +#endif /* CONFIG_NUMA */ /* * Returns LRU-active or inactive memory to total LRU memory size ratio. 
@@ -2319,7 +2452,8 @@ static unsigned int damos_get_in_active_mem_bp(bool active_ratio) return mult_frac(inactive, 10000, total); } -static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) +static void damos_set_quota_goal_current_value(struct damon_ctx *c, + struct damos *s, struct damos_quota_goal *goal) { u64 now_psi_total; @@ -2345,19 +2479,24 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) goal->current_value = damos_get_in_active_mem_bp( goal->metric == DAMOS_QUOTA_ACTIVE_MEM_BP); break; + case DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP: + goal->current_value = damos_get_node_eligible_mem_bp(c, s, + goal->nid); + break; default: break; } } /* Return the highest score since it makes schemes least aggressive */ -static unsigned long damos_quota_score(struct damos_quota *quota) +static unsigned long damos_quota_score(struct damon_ctx *c, struct damos *s) { struct damos_quota_goal *goal; + struct damos_quota *quota = &s->quota; unsigned long highest_score = 0; damos_for_each_quota_goal(goal, quota) { - damos_set_quota_goal_current_value(goal); + damos_set_quota_goal_current_value(c, s, goal); highest_score = max(highest_score, mult_frac(goal->current_value, 10000, goal->target_value)); @@ -2366,17 +2505,20 @@ static unsigned long damos_quota_score(struct damos_quota *quota) return highest_score; } -static void damos_goal_tune_esz_bp_consist(struct damos_quota *quota) +static void damos_goal_tune_esz_bp_consist(struct damon_ctx *c, struct damos *s) { - unsigned long score = damos_quota_score(quota); + struct damos_quota *quota = &s->quota; + unsigned long score = damos_quota_score(c, s); quota->esz_bp = damon_feed_loop_next_input( max(quota->esz_bp, 10000UL), score); } -static void damos_goal_tune_esz_bp_temporal(struct damos_quota *quota) +static void damos_goal_tune_esz_bp_temporal(struct damon_ctx *c, + struct damos *s) { - unsigned long score = damos_quota_score(quota); + struct damos_quota *quota = &s->quota; + 
unsigned long score = damos_quota_score(c, s); if (score >= 10000) quota->esz_bp = 0; @@ -2389,9 +2531,9 @@ static void damos_goal_tune_esz_bp_temporal(struct damos_quota *quota) /* * Called only if quota->ms, or quota->sz are set, or quota->goals is not empty */ -static void damos_set_effective_quota(struct damos_quota *quota, - struct damon_ctx *ctx) +static void damos_set_effective_quota(struct damon_ctx *ctx, struct damos *s) { + struct damos_quota *quota = &s->quota; unsigned long throughput; unsigned long esz = ULONG_MAX; @@ -2402,9 +2544,9 @@ static void damos_set_effective_quota(struct damos_quota *quota, if (!list_empty(&quota->goals)) { if (quota->goal_tuner == DAMOS_QUOTA_GOAL_TUNER_CONSIST) - damos_goal_tune_esz_bp_consist(quota); + damos_goal_tune_esz_bp_consist(ctx, s); else if (quota->goal_tuner == DAMOS_QUOTA_GOAL_TUNER_TEMPORAL) - damos_goal_tune_esz_bp_temporal(quota); + damos_goal_tune_esz_bp_temporal(ctx, s); esz = quota->esz_bp / 10000; } @@ -2452,7 +2594,7 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) /* First charge window */ if (!quota->total_charged_sz && !quota->charged_from) { quota->charged_from = jiffies; - damos_set_effective_quota(quota, c); + damos_set_effective_quota(c, s); } /* New charge window starts */ @@ -2467,7 +2609,7 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) quota->charged_sz = 0; if (trace_damos_esz_enabled()) cached_esz = quota->esz; - damos_set_effective_quota(quota, c); + damos_set_effective_quota(c, s); if (trace_damos_esz_enabled() && quota->esz != cached_esz) damos_trace_esz(c, s, quota); } diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index a8014780edae..d12e741a47ec 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1093,6 +1093,10 @@ struct damos_sysfs_qgoal_metric_name damos_sysfs_qgoal_metric_names[] = { .metric = DAMOS_QUOTA_INACTIVE_MEM_BP, .name = "inactive_mem_bp", }, + { + .metric = DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP, 
+ .name = "node_eligible_mem_bp", + }, }; static ssize_t target_metric_show(struct kobject *kobj, @@ -2685,6 +2689,9 @@ static int damos_sysfs_add_quota_score( } goal->nid = sysfs_goal->nid; break; + case DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP: + goal->nid = sysfs_goal->nid; + break; default: break; } From c7ec7d5f6b3d1fc36d04baaabd8d2756a5e937b1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 27 Apr 2026 18:33:50 -0700 Subject: [PATCH 015/321] mm/damon/core: handle Cc: Brendan Higgins Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/core.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index e4229294353e..df51a5661d46 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2065,6 +2065,20 @@ static void damos_walk_cancel(struct damon_ctx *ctx) mutex_unlock(&ctx->walk_control_lock); } +static bool damos_quota_is_full(struct damos_quota *quota, + unsigned long min_region_sz) +{ + if (!damos_quota_is_set(quota)) + return false; + if (quota->charged_sz >= quota->esz) + return true; + /* + * DAMOS action is applied per region, so <min_region_sz remaining + * quota means the quota is effectively full. + */ + return quota->esz - quota->charged_sz < min_region_sz; +} + static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, struct damon_region *r, struct damos *s) { @@ -2122,8 +2136,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, quota->total_charged_ns += timespec64_to_ns(&end) - timespec64_to_ns(&begin); quota->charged_sz += sz; - if (damos_quota_is_set(quota) && - quota->charged_sz >= quota->esz) { + if (damos_quota_is_full(quota, c->min_region_sz)) { quota->charge_target_from = t; quota->charge_addr_from = r->ar.end; } @@ -2151,8 +2164,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, continue; /* Check the quota */ - if (damos_quota_is_set(quota) && - 
quota->charged_sz >= quota->esz) + if (damos_quota_is_full(quota, c->min_region_sz)) continue; if (damos_skip_charged_region(t, r, s, c->min_region_sz)) @@ -2601,8 +2613,7 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) if (!time_in_range_open(jiffies, quota->charged_from, quota->charged_from + msecs_to_jiffies(quota->reset_interval))) { - if (damos_quota_is_set(quota) && - quota->charged_sz >= quota->esz) + if (damos_quota_is_full(quota, c->min_region_sz)) s->stat.qt_exceeds++; quota->total_charged_sz += quota->charged_sz; quota->charged_from = jiffies; From 2423bb5fbe81f842cef10e076aeeb04004a6e15f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 27 Apr 2026 18:33:51 -0700 Subject: [PATCH 016/321] mm/damon/core: merge regions after applying DAMOS schemes damos_apply_scheme() could split the given region if applying the scheme's action to the entire region can result in violating the quota-set upper limit. Keeping regions that are created by such split operations is unnecessary overhead. The overhead would be negligible in the common case because such split operations could happen only up to the number of installed schemes per scheme apply interval. The following commit could make the impact larger, though. The following commit will allow the action-failed region to be charged in a different ratio. If both the ratio and the remaining quota is quite small while the region to apply the scheme is quite large and the action is nearly always failing, a high number of split operations could happen. Remove the unnecessary overhead by merging regions after applying schemes is done for each region. The merge operation is made only if it will not lose monitoring information and keep min_nr_regions constraint. In the worst case, the max_nr_regions could still be violated until the next per-aggregation interval merge operation is made. 
Link: https://lore.kernel.org/20260428013402.115171-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/core.c | 59 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 4 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index df51a5661d46..e59f4031d24b 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2182,6 +2182,58 @@ static void damon_do_apply_schemes(struct damon_ctx *c, } } +/* + * damos_apply_target() - Apply DAMOS schemes to a given target. + * @c: monitoring context to apply its DAMOS schemes to. + * @t: monitoring target to apply the schemes to. + * @max_region_sz: maximum region size for @c. + * + * This function could split regions for keeping the quota. To minimize + * overhead from the split operations increased number of regions, this + * function will also merge regions after the schemes applying attempt is done, + * for each region. The merge operation is made only when it doesn't lose the + * monitoring information and not violating @max_region_sz. + * + * Hence, after this function is called, the total number of regions could + * be increased or reduced. The increase could make max_nr_regions temporarily + * be violated, until the next per-aggregation interval regions merge operation + * is executed. The decrease will not violate min_nr_regions though, since it + * keeps @max_region_sz. + */ +static void damos_apply_target(struct damon_ctx *c, struct damon_target *t, + unsigned long max_region_sz) +{ + struct damon_region *r; + + damon_for_each_region(r, t) { + struct damon_region *prev_r; + + damon_do_apply_schemes(c, t, r); + /* + * damon_do_apply_schemes() could split the region for the + * quota. Keeping the new slices is an overhead.
Merge back + * the slices into the previous region if it doesn't lose any + * information and not violating the max_region_sz. + */ + if (damon_first_region(t) == r) + continue; + prev_r = damon_prev_region(r); + if (prev_r->ar.end != r->ar.start) + continue; + if (prev_r->age != r->age) + continue; + if (prev_r->last_nr_accesses != r->last_nr_accesses) + continue; + if (prev_r->nr_accesses != r->nr_accesses) + continue; + if (r->ar.end - prev_r->ar.start > max_region_sz) + continue; + prev_r->ar.end = r->ar.end; + damon_destroy_region(r, t); + r = prev_r; + } +} + /* * damon_feed_loop_next_input() - get next input to achieve a target score. * @last_input The last input. @@ -2674,9 +2726,9 @@ static void damos_trace_stat(struct damon_ctx *c, struct damos *s) static void kdamond_apply_schemes(struct damon_ctx *c) { struct damon_target *t; - struct damon_region *r; struct damos *s; bool has_schemes_to_apply = false; + unsigned long max_region_sz; damon_for_each_scheme(s, c) { if (time_before(c->passed_sample_intervals, s->next_apply_sis)) @@ -2693,13 +2745,12 @@ static void kdamond_apply_schemes(struct damon_ctx *c) if (!has_schemes_to_apply) return; + max_region_sz = damon_region_sz_limit(c); mutex_lock(&c->walk_control_lock); damon_for_each_target(t, c) { if (c->ops.target_valid && c->ops.target_valid(t) == false) continue; - - damon_for_each_region(r, t) - damon_do_apply_schemes(c, t, r); + damos_apply_target(c, t, max_region_sz); } damon_for_each_scheme(s, c) { From 4ee4fb3214a8aadf5e8d253f8a34b76baff7f37d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 27 Apr 2026 18:33:52 -0700 Subject: [PATCH 017/321] mm/damon/core: introduce failed region quota charge ratio DAMOS quota is charged to all DAMOS action application attempted memory, regardless of how much of the memory the action was successful and failed. 
This makes understanding quota behavior without DAMOS stat but only with end level metrics (e.g., increased amount of free memory for DAMOS_PAGEOUT action) difficult. Also, charging action-failed memory same as action-successful memory is somewhat unfair, as successful action application will induce more overhead in most cases. Introduce DAMON core API for setting the charge ratio for such action-failed memory. It allows API callers to specify the ratio in a flexible way, by setting the numerator and the denominator. Link: https://lore.kernel.org/20260428013402.115171-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 9 +++++++++ mm/damon/core.c | 21 ++++++++++++++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 986b8c902585..2bb43910e22e 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -236,6 +236,8 @@ enum damos_quota_goal_tuner { * @goals: Head of quota tuning goals (&damos_quota_goal) list. * @goal_tuner: Goal-based @esz tuning algorithm to use. * @esz: Effective size quota in bytes. + * @fail_charge_num: Failed regions charge rate numerator. + * @fail_charge_denom: Failed regions charge rate denominator. * * @weight_sz: Weight of the region's size for prioritization. * @weight_nr_accesses: Weight of the region's nr_accesses for prioritization. @@ -265,6 +267,10 @@ enum damos_quota_goal_tuner { * * The resulting effective size quota in bytes is set to @esz. * + * For DAMOS action applying failed amount of regions, charging those same to + * those that the action has successfully applied may be unfair. For the + * reason, 'the size * @fail_charge_num / @fail_charge_denom' is charged. 
+ * * For selecting regions within the quota, DAMON prioritizes current scheme's * target memory regions using the &struct damon_operations->get_scheme_score. * You could customize the prioritization logic by setting &weight_sz, @@ -279,6 +285,9 @@ struct damos_quota { enum damos_quota_goal_tuner goal_tuner; unsigned long esz; + unsigned int fail_charge_num; + unsigned int fail_charge_denom; + unsigned int weight_sz; unsigned int weight_nr_accesses; unsigned int weight_age; diff --git a/mm/damon/core.c b/mm/damon/core.c index e59f4031d24b..7aeaf319a18a 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -922,6 +922,8 @@ static int damos_commit_quota(struct damos_quota *dst, struct damos_quota *src) if (err) return err; dst->goal_tuner = src->goal_tuner; + dst->fail_charge_num = src->fail_charge_num; + dst->fail_charge_denom = src->fail_charge_denom; dst->weight_sz = src->weight_sz; dst->weight_nr_accesses = src->weight_nr_accesses; dst->weight_age = src->weight_age; @@ -2065,6 +2067,23 @@ static void damos_walk_cancel(struct damon_ctx *ctx) mutex_unlock(&ctx->walk_control_lock); } +static void damos_charge_quota(struct damos_quota *quota, + unsigned long sz_region, unsigned long sz_applied) +{ + /* + * sz_applied could be bigger than sz_region, depending on ops + * implementation of the action, e.g., damos_pa_pageout(). Charge only + * the region size in the case. 
+ */ + if (!quota->fail_charge_denom || sz_applied > sz_region) + quota->charged_sz += sz_region; + else + quota->charged_sz += sz_applied + mult_frac( + (sz_region - sz_applied), + quota->fail_charge_num, + quota->fail_charge_denom); +} + static bool damos_quota_is_full(struct damos_quota *quota, unsigned long min_region_sz) { @@ -2135,7 +2154,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, ktime_get_coarse_ts64(&end); quota->total_charged_ns += timespec64_to_ns(&end) - timespec64_to_ns(&begin); - quota->charged_sz += sz; + damos_charge_quota(quota, sz, sz_applied); if (damos_quota_is_full(quota, c->min_region_sz)) { quota->charge_target_from = t; quota->charge_addr_from = r->ar.end; From fad1124120d61d2c6781c9d0fcace0fdb6e24df4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 27 Apr 2026 18:33:53 -0700 Subject: [PATCH 018/321] mm/damon/sysfs-schemes: implement fail_charge_{num,denom} files Implement the user-space ABI for the DAMOS action failed region quota-charge ratio setup. For this, add two new sysfs files under the DAMON sysfs interface for DAMOS quotas. Names of the files are fail_charge_num and fail_charge_denom, and work for reading and setting the numerator and denominator of the failed regions charge ratio. Link: https://lore.kernel.org/20260428013402.115171-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 54 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index d12e741a47ec..be2b5eda84e0 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1512,6 +1512,8 @@ struct damon_sysfs_quotas { unsigned long reset_interval_ms; unsigned long effective_sz; /* Effective size quota in bytes */ enum damos_quota_goal_tuner goal_tuner; + unsigned int fail_charge_num; + unsigned int fail_charge_denom; }; static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void) @@ -1686,6 +1688,48 @@ static ssize_t goal_tuner_store(struct kobject *kobj, return -EINVAL; } +static ssize_t fail_charge_num_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%u\n", quotas->fail_charge_num); +} + +static ssize_t fail_charge_num_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtouint(buf, 0, &quotas->fail_charge_num); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t fail_charge_denom_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%u\n", quotas->fail_charge_denom); +} + +static ssize_t fail_charge_denom_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtouint(buf, 0, &quotas->fail_charge_denom); + + if (err) + return 
-EINVAL; + return count; +} + static void damon_sysfs_quotas_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_quotas, kobj)); @@ -1706,12 +1750,20 @@ static struct kobj_attribute damon_sysfs_quotas_effective_bytes_attr = static struct kobj_attribute damon_sysfs_quotas_goal_tuner_attr = __ATTR_RW_MODE(goal_tuner, 0600); +static struct kobj_attribute damon_sysfs_quotas_fail_charge_num_attr = + __ATTR_RW_MODE(fail_charge_num, 0600); + +static struct kobj_attribute damon_sysfs_quotas_fail_charge_denom_attr = + __ATTR_RW_MODE(fail_charge_denom, 0600); + static struct attribute *damon_sysfs_quotas_attrs[] = { &damon_sysfs_quotas_ms_attr.attr, &damon_sysfs_quotas_sz_attr.attr, &damon_sysfs_quotas_reset_interval_ms_attr.attr, &damon_sysfs_quotas_effective_bytes_attr.attr, &damon_sysfs_quotas_goal_tuner_attr.attr, + &damon_sysfs_quotas_fail_charge_num_attr.attr, + &damon_sysfs_quotas_fail_charge_denom_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_quotas); @@ -2803,6 +2855,8 @@ static struct damos *damon_sysfs_mk_scheme( .weight_nr_accesses = sysfs_weights->nr_accesses, .weight_age = sysfs_weights->age, .goal_tuner = sysfs_quotas->goal_tuner, + .fail_charge_num = sysfs_quotas->fail_charge_num, + .fail_charge_denom = sysfs_quotas->fail_charge_denom, }; struct damos_watermarks wmarks = { .metric = sysfs_wmarks->metric, From 776270536d9d2111aec3db54cfccae4ed5a3c5f6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 27 Apr 2026 18:33:54 -0700 Subject: [PATCH 019/321] Docs/mm/damon/design: document fail_charge_{num,denom} Update DAMON design document for the DAMOS action failed region quota charge ratio. Link: https://lore.kernel.org/20260428013402.115171-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index afc7d52bda2f..bacb457f553a 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -565,6 +565,28 @@ interface `, refer to :ref:`weights ` part of the documentation. +.. _damon_design_damos_quotas_failed_memory_charging_ratio: + +Action-failed Memory Charging Ratio +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +DAMOS action to a given region can fail for some subsets of the memory of the +region. For example, if the action is ``pageout`` and the region has some +unreclaimable pages, applying the action to the pages will fail. The amount of +system resource that is taken for such failed action applications is usually +different from that for successful action applications. For such cases, users +can set different charging ratio for such failed memory. The ratio can be +specified using ``fail_charge_num`` and ``fail_charge_denom`` parameters. The +two parameters represent the numerator and denominator of the ratio. The +feature is enabled only if ``fail_charge_denom`` is not zero. + +For example, let's suppose a DAMOS action is applied to a region of 1,000 MiB +size. The action is successfully applied to only 700 MiB of the region. +``fail_charge_num`` and ``fail_charge_denom`` are set to ``1`` and ``1024``, +respectively. Then only 700 MiB and 300 KiB of size (``700 MiB + 300 MiB * 1 / +1024``) will be charged. + + .. 
_damon_design_damos_quotas_auto_tuning: Aim-oriented Feedback-driven Auto-tuning From 59ebdeedb595116bc2d2d0bcc408994908cb3b9d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 27 Apr 2026 18:33:55 -0700 Subject: [PATCH 020/321] Docs/admin-guide/mm/damon/usage: document fail_charge_{num,denom} files Update DAMON usage document for the DAMOS action failed regions quota charge ratio control sysfs files. Link: https://lore.kernel.org/20260428013402.115171-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 534e1199cf09..e84b58731f7e 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -83,7 +83,9 @@ comma (","). │ │ │ │ │ │ │ │ sz/min,max │ │ │ │ │ │ │ │ nr_accesses/min,max │ │ │ │ │ │ │ │ age/min,max - │ │ │ │ │ │ │ :ref:`quotas `/ms,bytes,reset_interval_ms,effective_bytes,goal_tuner + │ │ │ │ │ │ │ :ref:`quotas `/ms,bytes,reset_interval_ms, + │ │ │ │ │ │ │ effective_bytes,goal_tuner, + │ │ │ │ │ │ │ fail_charge_num,fail_charge_denom │ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil │ │ │ │ │ │ │ │ :ref:`goals `/nr_goals │ │ │ │ │ │ │ │ │ 0/target_metric,target_value,current_value,nid,path @@ -377,9 +379,10 @@ schemes//quotas/ The directory for the :ref:`quotas ` of the given DAMON-based operation scheme. -Under ``quotas`` directory, five files (``ms``, ``bytes``, -``reset_interval_ms``, ``effective_bytes`` and ``goal_tuner``) and two -directories (``weights`` and ``goals``) exist. 
+Under ``quotas`` directory, seven files (``ms``, ``bytes``, +``reset_interval_ms``, ``effective_bytes``, ``goal_tuner``, ``fail_charge_num`` +and ``fail_charge_denom``) and two directories (``weights`` and ``goals``) +exist. You can set the ``time quota`` in milliseconds, ``size quota`` in bytes, and ``reset interval`` in milliseconds by writing the values to the three files, @@ -398,6 +401,13 @@ the background design of the feature and the name of the selectable algorithms. Refer to :ref:`goals directory ` for the goals setup. +You can set the action-failed memory quota charging ratio by writing the +numerator and the denominator for the ratio to ``fail_charge_num`` and +``fail_charge_denom`` files, respectively. Reading those files will return the +current set values. Refer to :ref:`design +<damon_design_damos_quotas_failed_memory_charging_ratio>` for more details of +the ratio feature. + The time quota is internally transformed to a size quota. Between the transformed size quota and user-specified size quota, smaller one is applied. Based on the user-specified :ref:`goal `, the From 1d6b8e92da39413b7780908ea3d896c4a75b9bed Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 27 Apr 2026 18:33:56 -0700 Subject: [PATCH 021/321] Docs/ABI/damon: document fail_charge_{num,denom} Update DAMON ABI document for the DAMOS action failed regions quota charge ratio control sysfs files. Link: https://lore.kernel.org/20260428013402.115171-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-mm-damon | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index 2424237ebb10..213eb87392d8 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -322,6 +322,18 @@ Contact: SeongJae Park Description: Writing to and reading from this file sets and gets the goal-based effective quota auto-tuning algorithm to use. +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/fail_charge_num +Date: Mar 2026 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the + action-failed memory quota charging ratio numerator. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/fail_charge_denom +Date: Mar 2026 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the + action-failed memory quota charging ratio denominator. + What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/weights/sz_permil Date: Mar 2022 Contact: SeongJae Park From 0a605b4b673b46c78b43b5f5e557cfdd06856267 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 27 Apr 2026 18:33:57 -0700 Subject: [PATCH 022/321] mm/damon/tests/core-kunit: test fail_charge_{num,denom} committing Extend damos_test_commit_quotas() kunit test to ensure damos_commit_quota() handles fail_charge_{num,denom} parameters. Link: https://lore.kernel.org/20260428013402.115171-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 9e5904c2beeb..6de622a2fd79 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -694,6 +694,8 @@ static void damos_test_commit_quota(struct kunit *test) .ms = 2, .sz = 3, .goal_tuner = DAMOS_QUOTA_GOAL_TUNER_CONSIST, + .fail_charge_num = 2, + .fail_charge_denom = 3, .weight_sz = 4, .weight_nr_accesses = 5, .weight_age = 6, @@ -703,6 +705,8 @@ static void damos_test_commit_quota(struct kunit *test) .ms = 8, .sz = 9, .goal_tuner = DAMOS_QUOTA_GOAL_TUNER_TEMPORAL, + .fail_charge_num = 1, + .fail_charge_denom = 1024, .weight_sz = 10, .weight_nr_accesses = 11, .weight_age = 12, @@ -717,6 +721,8 @@ static void damos_test_commit_quota(struct kunit *test) KUNIT_EXPECT_EQ(test, dst.ms, src.ms); KUNIT_EXPECT_EQ(test, dst.sz, src.sz); KUNIT_EXPECT_EQ(test, dst.goal_tuner, src.goal_tuner); + KUNIT_EXPECT_EQ(test, dst.fail_charge_num, src.fail_charge_num); + KUNIT_EXPECT_EQ(test, dst.fail_charge_denom, src.fail_charge_denom); KUNIT_EXPECT_EQ(test, dst.weight_sz, src.weight_sz); KUNIT_EXPECT_EQ(test, dst.weight_nr_accesses, src.weight_nr_accesses); KUNIT_EXPECT_EQ(test, dst.weight_age, src.weight_age); From 588f08518fa2bb3b9ef20b5fbb20e27b39e5a257 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 27 Apr 2026 18:33:58 -0700 Subject: [PATCH 023/321] selftests/damon/_damon_sysfs: support failed region quota charge ratio Extend _damon_sysfs.py for DAMOS action failed regions quota charge ratio setup, so that we can add kselftest for the new feature. Link: https://lore.kernel.org/20260428013402.115171-10-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index 2b4df655d9fd..0f13512fa5e6 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -132,14 +132,17 @@ class DamosQuota: goals = None # quota goals goal_tuner = None # quota goal tuner reset_interval_ms = None # quota reset interval + fail_charge_num = None + fail_charge_denom = None weight_sz_permil = None weight_nr_accesses_permil = None weight_age_permil = None scheme = None # owner scheme def __init__(self, sz=0, ms=0, goals=None, goal_tuner='consist', - reset_interval_ms=0, weight_sz_permil=0, - weight_nr_accesses_permil=0, weight_age_permil=0): + reset_interval_ms=0, fail_charge_num=0, fail_charge_denom=0, + weight_sz_permil=0, weight_nr_accesses_permil=0, + weight_age_permil=0): self.sz = sz self.ms = ms self.reset_interval_ms = reset_interval_ms @@ -151,6 +154,8 @@ class DamosQuota: for idx, goal in enumerate(self.goals): goal.idx = idx goal.quota = self + self.fail_charge_num = fail_charge_num + self.fail_charge_denom = fail_charge_denom def sysfs_dir(self): return os.path.join(self.scheme.sysfs_dir(), 'quotas') @@ -197,6 +202,18 @@ class DamosQuota: os.path.join(self.sysfs_dir(), 'goal_tuner'), self.goal_tuner) if err is not None: return err + + err = write_file( + os.path.join(self.sysfs_dir(), 'fail_charge_num'), + self.fail_charge_num) + if err is not None: + return err + err = write_file( + os.path.join(self.sysfs_dir(), 'fail_charge_denom'), + self.fail_charge_denom) + if err is not None: + return err + return None class DamosWatermarks: From bcd8d68c6ba1ef918294d96ab64726eeef00b37c Mon Sep 17 00:00:00 2001 From: 
SeongJae Park Date: Mon, 27 Apr 2026 18:33:59 -0700 Subject: [PATCH 024/321] selftests/damon/drgn_dump_damon_status: support failed region quota charge ratio Extend drgn_dump_damon_status.py to dump DAMON internal state for DAMOS action failed regions quota charge ratio, to be able to show if the internal state for the feature is working, with future DAMON selftests. Link: https://lore.kernel.org/20260428013402.115171-11-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/drgn_dump_damon_status.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/damon/drgn_dump_damon_status.py b/tools/testing/selftests/damon/drgn_dump_damon_status.py index af99b07a4f56..b5c56233a923 100755 --- a/tools/testing/selftests/damon/drgn_dump_damon_status.py +++ b/tools/testing/selftests/damon/drgn_dump_damon_status.py @@ -112,6 +112,8 @@ def damos_quota_to_dict(quota): ['goals', damos_quota_goals_to_list], ['goal_tuner', int], ['esz', int], + ['fail_charge_num', int], + ['fail_charge_denom', int], ['weight_sz', int], ['weight_nr_accesses', int], ['weight_age', int], From 8d21446a6c7feb2d93b3ea4f54ffd7f4eb64f2bc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 27 Apr 2026 18:34:00 -0700 Subject: [PATCH 025/321] selftests/damon/sysfs.py: test failed region quota charge ratio Extend sysfs.py DAMON selftest to setup DAMOS action failed region quota charge ratio and assert the setup is made into DAMON internal state. Link: https://lore.kernel.org/20260428013402.115171-12-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/damon/sysfs.py b/tools/testing/selftests/damon/sysfs.py index 3aa5c91548a5..9067945f16ca 100755 --- a/tools/testing/selftests/damon/sysfs.py +++ b/tools/testing/selftests/damon/sysfs.py @@ -73,6 +73,10 @@ def assert_quota_committed(quota, dump): } assert_true(dump['goal_tuner'] == tuner_val[quota.goal_tuner], 'goal_tuner', dump) + assert_true(dump['fail_charge_num'] == quota.fail_charge_num, + 'fail_charge_num', dump) + assert_true(dump['fail_charge_denom'] == quota.fail_charge_denom, + 'fail_charge_denom', dump) assert_true(dump['weight_sz'] == quota.weight_sz_permil, 'weight_sz', dump) assert_true(dump['weight_nr_accesses'] == quota.weight_nr_accesses_permil, 'weight_nr_accesses', dump) @@ -239,6 +243,8 @@ def main(): nid=1)], goal_tuner='temporal', reset_interval_ms=1500, + fail_charge_num=1, + fail_charge_denom=4096, weight_sz_permil=20, weight_nr_accesses_permil=200, weight_age_permil=1000), From 9f40c3cdf0fa3011c3a15f8acc0b9ffb3ed11171 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Fri, 24 Apr 2026 12:00:52 +0800 Subject: [PATCH 026/321] selftests/cgroup: skip test_zswap if zswap is globally disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "selftests/cgroup: improve zswap tests robustness and support large page sizes", v7. This patchset aims to fix various spurious failures and improve the overall robustness of the cgroup zswap selftests. The primary motivation is to make the tests compatible with architectures that use non-4K page sizes (such as 64K on ppc64le and arm64). Currently, the tests rely heavily on hardcoded 4K page sizes and fixed memory limits. 
On 64K page size systems, these hardcoded values lead to sub-page granularity accesses, incorrect page count calculations, and insufficient memory pressure to trigger zswap writeback, ultimately causing the tests to fail. Additionally, this series addresses OOM kills occurring in test_swapin_nozswap by dynamically scaling memory limits, and prevents spurious test failures when zswap is built into the kernel but globally disabled. This patch (of 8): test_zswap currently only checks whether zswap is present by testing /sys/module/zswap. This misses the runtime global state exposed in /sys/module/zswap/parameters/enabled. When zswap is built/loaded but globally disabled, the zswap cgroup selftests run in an invalid environment and may fail spuriously. Check the runtime enabled state before running the tests: - skip if zswap is not configured, - fail if the enabled knob cannot be read, - skip if zswap is globally disabled. Also print a hint in the skip message on how to enable zswap. Link: https://lore.kernel.org/20260424040059.12940-1-li.wang@linux.dev Link: https://lore.kernel.org/20260424040059.12940-2-li.wang@linux.dev Signed-off-by: Li Wang Acked-by: Yosry Ahmed Acked-by: Nhat Pham Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Tejun Heo Cc: Roman Gushchin Cc: Shakeel Butt Cc: Chengming Zhou Cc: Jiayuan Chen Cc: Waiman Long Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_zswap.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index a7bdcdd09d62..a94238a2e048 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -15,6 +15,9 @@ #include "kselftest.h" #include "cgroup_util.h" +#define PATH_ZSWAP "/sys/module/zswap" +#define PATH_ZSWAP_ENABLED "/sys/module/zswap/parameters/enabled" + static int read_int(const char *path, 
size_t *value) { FILE *file; @@ -725,9 +728,18 @@ struct zswap_test { }; #undef T -static bool zswap_configured(void) +static void check_zswap_enabled(void) { - return access("/sys/module/zswap", F_OK) == 0; + char value[2]; + + if (access(PATH_ZSWAP, F_OK)) + ksft_exit_skip("zswap isn't configured\n"); + + if (read_text(PATH_ZSWAP_ENABLED, value, sizeof(value)) <= 0) + ksft_exit_fail_msg("Failed to read " PATH_ZSWAP_ENABLED "\n"); + + if (value[0] == 'N') + ksft_exit_skip("zswap is disabled (hint: echo 1 > " PATH_ZSWAP_ENABLED ")\n"); } int main(int argc, char **argv) @@ -740,8 +752,7 @@ int main(int argc, char **argv) if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); - if (!zswap_configured()) - ksft_exit_skip("zswap isn't configured\n"); + check_zswap_enabled(); /* * Check that memory controller is available: From 0d38cded3c6294b0dfa38e3fc92077b5d381951e Mon Sep 17 00:00:00 2001 From: Li Wang Date: Fri, 24 Apr 2026 12:00:53 +0800 Subject: [PATCH 027/321] selftests/cgroup: avoid OOM in test_swapin_nozswap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_swapin_nozswap can hit OOM before reaching its assertions on some setups. The test currently sets memory.max=8M and then allocates/reads 32M with memory.zswap.max=0, which may over-constrain reclaim and kill the workload process. Replace hardcoded sizes with PAGE_SIZE-based values: - control_allocation_size = PAGE_SIZE * 512 - memory.max = control_allocation_size * 3 / 4 - minimum expected swap = control_allocation_size / 4 This keeps the test pressure model intact (allocate/read beyond memory.max to force swap-in/out) while making it more robust across different environments. The test intent is unchanged: confirm that swapping occurs while zswap remains unused when memory.zswap.max=0. === Error Logs === # ./test_zswap TAP version 13 1..7 ok 1 test_zswap_usage not ok 2 test_swapin_nozswap ... 
# dmesg [271641.879153] test_zswap invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0 [271641.879168] CPU: 1 UID: 0 PID: 177372 Comm: test_zswap Kdump: loaded Not tainted 6.12.0-211.el10.ppc64le #1 VOLUNTARY [271641.879171] Hardware name: IBM,9009-41A POWER9 (architected) 0x4e0202 0xf000005 of:IBM,FW940.02 (UL940_041) hv:phyp pSeries [271641.879173] Call Trace: [271641.879174] [c00000037540f730] [c00000000127ec44] dump_stack_lvl+0x88/0xc4 (unreliable) [271641.879184] [c00000037540f760] [c0000000005cc594] dump_header+0x5c/0x1e4 [271641.879188] [c00000037540f7e0] [c0000000005cb464] oom_kill_process+0x324/0x3b0 [271641.879192] [c00000037540f860] [c0000000005cbe48] out_of_memory+0x118/0x420 [271641.879196] [c00000037540f8f0] [c00000000070d8ec] mem_cgroup_out_of_memory+0x18c/0x1b0 [271641.879200] [c00000037540f990] [c000000000713888] try_charge_memcg+0x598/0x890 [271641.879204] [c00000037540fa70] [c000000000713dbc] charge_memcg+0x5c/0x110 [271641.879207] [c00000037540faa0] [c0000000007159f8] __mem_cgroup_charge+0x48/0x120 [271641.879211] [c00000037540fae0] [c000000000641914] alloc_anon_folio+0x2b4/0x5a0 [271641.879215] [c00000037540fb60] [c000000000641d58] do_anonymous_page+0x158/0x6b0 [271641.879218] [c00000037540fbd0] [c000000000642f8c] __handle_mm_fault+0x4bc/0x910 [271641.879221] [c00000037540fcf0] [c000000000643500] handle_mm_fault+0x120/0x3c0 [271641.879224] [c00000037540fd40] [c00000000014bba0] ___do_page_fault+0x1c0/0x980 [271641.879228] [c00000037540fdf0] [c00000000014c44c] hash__do_page_fault+0x2c/0xc0 [271641.879232] [c00000037540fe20] [c0000000001565d8] do_hash_fault+0x128/0x1d0 [271641.879236] [c00000037540fe50] [c000000000008be0] data_access_common_virt+0x210/0x220 [271641.879548] Tasks state (memory values in pages): ... 
[271641.879550] [ pid ] uid tgid total_vm rss rss_anon rss_file rss_shmem pgtables_bytes swapents oom_score_adj name [271641.879555] [ 177372] 0 177372 571 0 0 0 0 51200 96 0 test_zswap [271641.879562] oom-kill:constraint=CONSTRAINT_MEMCG,nodemask=(null),cpuset=/,mems_allowed=0,oom_memcg=/no_zswap_test,task_memcg=/no_zswap_test,task=test_zswap,pid=177372,uid=0 [271641.879578] Memory cgroup out of memory: Killed process 177372 (test_zswap) total-vm:36544kB, anon-rss:0kB, file-rss:0kB, shmem-rss:0kB, UID:0 pgtables:50kB oom_score_adj:0 Link: https://lore.kernel.org/20260424040059.12940-3-li.wang@linux.dev Signed-off-by: Li Wang Acked-by: Yosry Ahmed Acked-by: Nhat Pham Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Tejun Heo Cc: Roman Gushchin Cc: Shakeel Butt Cc: Chengming Zhou Cc: Jiayuan Chen Cc: Waiman Long Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_zswap.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index a94238a2e048..47709cbdcdf1 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -165,21 +165,25 @@ out: static int test_swapin_nozswap(const char *root) { int ret = KSFT_FAIL; - char *test_group; - long swap_peak, zswpout; + char *test_group, mem_max_buf[32]; + long swap_peak, zswpout, min_swap; + size_t allocation_size = sysconf(_SC_PAGESIZE) * 512; + + min_swap = allocation_size / 4; + snprintf(mem_max_buf, sizeof(mem_max_buf), "%zu", allocation_size * 3/4); test_group = cg_name(root, "no_zswap_test"); if (!test_group) goto out; if (cg_create(test_group)) goto out; - if (cg_write(test_group, "memory.max", "8M")) + if (cg_write(test_group, "memory.max", mem_max_buf)) goto out; if (cg_write(test_group, "memory.zswap.max", "0")) goto out; /* Allocate and read more than memory.max to trigger swapin */ - if 
(cg_run(test_group, allocate_and_read_bytes, (void *)MB(32))) + if (cg_run(test_group, allocate_and_read_bytes, (void *)allocation_size)) goto out; /* Verify that pages are swapped out, but no zswap happened */ @@ -189,8 +193,9 @@ static int test_swapin_nozswap(const char *root) goto out; } - if (swap_peak < MB(24)) { - ksft_print_msg("at least 24MB of memory should be swapped out\n"); + if (swap_peak < min_swap) { + ksft_print_msg("at least %ldKB of memory should be swapped out\n", + min_swap / 1024); goto out; } From b19ee588e159c71f0d314246a944dcfc3e2a6009 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Fri, 24 Apr 2026 12:00:54 +0800 Subject: [PATCH 028/321] selftests/cgroup: use runtime page size for zswpin check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_zswapin compares memory.stat:zswpin (counted in pages) against a byte threshold converted with PAGE_SIZE. In cgroup selftests, PAGE_SIZE is hardcoded to 4096, which makes the conversion wrong on systems with non-4K base pages (e.g. 64K). As a result, the test requires too many pages to pass and fails spuriously even when zswap is working. Use sysconf(_SC_PAGESIZE) for the zswpin threshold conversion so the check matches the actual system page size. 
Link: https://lore.kernel.org/20260424040059.12940-4-li.wang@linux.dev Signed-off-by: Li Wang Reviewed-by: Yosry Ahmed Acked-by: Nhat Pham Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Tejun Heo Cc: Roman Gushchin Cc: Shakeel Butt Cc: Chengming Zhou Cc: Jiayuan Chen Cc: Waiman Long Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_zswap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index 47709cbdcdf1..37aa83c2f1bf 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -245,7 +245,7 @@ static int test_zswapin(const char *root) goto out; } - if (zswpin < MB(24) / PAGE_SIZE) { + if (zswpin < MB(24) / sysconf(_SC_PAGESIZE)) { ksft_print_msg("at least 24MB should be brought back from zswap\n"); goto out; } From 6e9f5c2eecd107cf9a10fd22d311b2c49026f474 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Fri, 24 Apr 2026 12:00:55 +0800 Subject: [PATCH 029/321] selftests/cgroup: rename PAGE_SIZE to BUF_SIZE in cgroup_util MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cgroup utility code defines a local PAGE_SIZE macro hardcoded to 4096, which is used primarily as a generic buffer size for reading cgroup and proc files. This naming is misleading because the value has nothing to do with the actual page size of the system. On architectures with larger pages (e.g., 64K on arm64 or ppc64), the name suggests a relationship that does not exist. Additionally, the name can shadow or conflict with PAGE_SIZE definitions from system headers, leading to confusion or subtle bugs. To resolve this, rename the macro to BUF_SIZE to accurately reflect its purpose as a general I/O buffer size. Furthermore, test_memcontrol currently relies on this hardcoded 4K value to stride through memory and trigger page faults. 
Update this logic to use the actual system page size dynamically. This micro-optimizes the memory faulting process by ensuring it iterates correctly and efficiently based on the underlying architecture's true page size. (This part from Waiman) Link: https://lore.kernel.org/20260424040059.12940-5-li.wang@linux.dev Signed-off-by: Li Wang Signed-off-by: Waiman Long Acked-by: Nhat Pham Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Tejun Heo Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yosry Ahmed Cc: Chengming Zhou Cc: Jiayuan Chen Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- .../selftests/cgroup/lib/cgroup_util.c | 18 +++++++++--------- .../cgroup/lib/include/cgroup_util.h | 4 ++-- tools/testing/selftests/cgroup/test_core.c | 2 +- tools/testing/selftests/cgroup/test_freezer.c | 2 +- .../selftests/cgroup/test_memcontrol.c | 19 ++++++++++++------- 5 files changed, 25 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/cgroup/lib/cgroup_util.c b/tools/testing/selftests/cgroup/lib/cgroup_util.c index 42f54936f4bb..f1ec7de58ae3 100644 --- a/tools/testing/selftests/cgroup/lib/cgroup_util.c +++ b/tools/testing/selftests/cgroup/lib/cgroup_util.c @@ -141,7 +141,7 @@ int cg_read_strcmp_wait(const char *cgroup, const char *control, int cg_read_strstr(const char *cgroup, const char *control, const char *needle) { - char buf[PAGE_SIZE]; + char buf[BUF_SIZE]; if (cg_read(cgroup, control, buf, sizeof(buf))) return -1; @@ -171,7 +171,7 @@ long cg_read_long_fd(int fd) long cg_read_key_long(const char *cgroup, const char *control, const char *key) { - char buf[PAGE_SIZE]; + char buf[BUF_SIZE]; char *ptr; if (cg_read(cgroup, control, buf, sizeof(buf))) @@ -207,7 +207,7 @@ long cg_read_key_long_poll(const char *cgroup, const char *control, long cg_read_lc(const char *cgroup, const char *control) { - char buf[PAGE_SIZE]; + char buf[BUF_SIZE]; const char delim[] = "\n"; char *line; long cnt = 0; @@ -259,7 +259,7 @@ int cg_write_numeric(const char 
*cgroup, const char *control, long value) static int cg_find_root(char *root, size_t len, const char *controller, bool *nsdelegate) { - char buf[10 * PAGE_SIZE]; + char buf[10 * BUF_SIZE]; char *fs, *mount, *type, *options; const char delim[] = "\n\t "; @@ -314,7 +314,7 @@ int cg_create(const char *cgroup) int cg_wait_for_proc_count(const char *cgroup, int count) { - char buf[10 * PAGE_SIZE] = {0}; + char buf[10 * BUF_SIZE] = {0}; int attempts; char *ptr; @@ -339,7 +339,7 @@ int cg_wait_for_proc_count(const char *cgroup, int count) int cg_killall(const char *cgroup) { - char buf[PAGE_SIZE]; + char buf[BUF_SIZE]; char *ptr = buf; /* If cgroup.kill exists use it. */ @@ -549,7 +549,7 @@ int cg_run_nowait(const char *cgroup, int proc_mount_contains(const char *option) { - char buf[4 * PAGE_SIZE]; + char buf[4 * BUF_SIZE]; ssize_t read; read = read_text("/proc/mounts", buf, sizeof(buf)); @@ -561,7 +561,7 @@ int proc_mount_contains(const char *option) int cgroup_feature(const char *feature) { - char buf[PAGE_SIZE]; + char buf[BUF_SIZE]; ssize_t read; read = read_text("/sys/kernel/cgroup/features", buf, sizeof(buf)); @@ -588,7 +588,7 @@ ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t int proc_read_strstr(int pid, bool thread, const char *item, const char *needle) { - char buf[PAGE_SIZE]; + char buf[BUF_SIZE]; if (proc_read_text(pid, thread, item, buf, sizeof(buf)) < 0) return -1; diff --git a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h index 567b1082974c..febc1723d090 100644 --- a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h +++ b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h @@ -2,8 +2,8 @@ #include #include -#ifndef PAGE_SIZE -#define PAGE_SIZE 4096 +#ifndef BUF_SIZE +#define BUF_SIZE 4096 #endif #define MB(x) (x << 20) diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index 
7b83c7e7c9d4..88ca832d4fc1 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -87,7 +87,7 @@ static int test_cgcore_destroy(const char *root) int ret = KSFT_FAIL; char *cg_test = NULL; int child_pid; - char buf[PAGE_SIZE]; + char buf[BUF_SIZE]; cg_test = cg_name(root, "cg_test"); diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c index 97fae92c8387..160a9e6ad277 100644 --- a/tools/testing/selftests/cgroup/test_freezer.c +++ b/tools/testing/selftests/cgroup/test_freezer.c @@ -642,7 +642,7 @@ cleanup: */ static int proc_check_stopped(int pid) { - char buf[PAGE_SIZE]; + char buf[BUF_SIZE]; int len; len = proc_read_text(pid, 0, "stat", buf, sizeof(buf)); diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index b43da9bc20c4..44338dbaee81 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -26,6 +26,7 @@ static bool has_localevents; static bool has_recursiveprot; +static int page_size; int get_temp_fd(void) { @@ -34,7 +35,7 @@ int get_temp_fd(void) int alloc_pagecache(int fd, size_t size) { - char buf[PAGE_SIZE]; + char buf[BUF_SIZE]; struct stat st; int i; @@ -61,7 +62,7 @@ int alloc_anon(const char *cgroup, void *arg) char *buf, *ptr; buf = malloc(size); - for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) + for (ptr = buf; ptr < buf + size; ptr += page_size) *ptr = 0; free(buf); @@ -70,7 +71,7 @@ int alloc_anon(const char *cgroup, void *arg) int is_swap_enabled(void) { - char buf[PAGE_SIZE]; + char buf[BUF_SIZE]; const char delim[] = "\n"; int cnt = 0; char *line; @@ -113,7 +114,7 @@ static int test_memcg_subtree_control(const char *root) { char *parent, *child, *parent2 = NULL, *child2 = NULL; int ret = KSFT_FAIL; - char buf[PAGE_SIZE]; + char buf[BUF_SIZE]; /* Create two nested cgroups with the memory controller enabled */ 
parent = cg_name(root, "memcg_test_0"); @@ -184,7 +185,7 @@ static int alloc_anon_50M_check(const char *cgroup, void *arg) return -1; } - for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) + for (ptr = buf; ptr < buf + size; ptr += page_size) *ptr = 0; current = cg_read_long(cgroup, "memory.current"); @@ -414,7 +415,7 @@ static int alloc_anon_noexit(const char *cgroup, void *arg) return -1; } - for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) + for (ptr = buf; ptr < buf + size; ptr += page_size) *ptr = 0; while (getppid() == ppid) @@ -1000,7 +1001,7 @@ static int alloc_anon_50M_check_swap(const char *cgroup, void *arg) return -1; } - for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) + for (ptr = buf; ptr < buf + size; ptr += page_size) *ptr = 0; mem_current = cg_read_long(cgroup, "memory.current"); @@ -1791,6 +1792,10 @@ int main(int argc, char **argv) char root[PATH_MAX]; int i, proc_status; + page_size = sysconf(_SC_PAGE_SIZE); + if (page_size <= 0) + page_size = BUF_SIZE; + ksft_print_header(); ksft_set_plan(ARRAY_SIZE(tests)); if (cg_find_unified_root(root, sizeof(root), NULL)) From 43743cc516684e40faf15afbb123eccd3d90e244 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Fri, 24 Apr 2026 12:00:56 +0800 Subject: [PATCH 030/321] selftests/cgroup: replace hardcoded page size values in test_zswap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_zswap uses hardcoded values of 4095 and 4096 throughout as page stride and page size, which are only correct on systems with a 4K page size. On architectures with larger pages (e.g., 64K on arm64 or ppc64), these constants cause memory to be touched at sub-page granularity, leading to inefficient access patterns and incorrect page count calculations, which can cause test failures. 
Replace all hardcoded 4095 and 4096 values with a global pagesize variable initialized from sysconf(_SC_PAGESIZE) at startup, and remove the redundant local sysconf() calls scattered across individual functions. No functional change on 4K page size systems. Link: https://lore.kernel.org/20260424040059.12940-6-li.wang@linux.dev Signed-off-by: Li Wang Acked-by: Yosry Ahmed Reviewed-by: Jiayuan Chen Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Tejun Heo Cc: Roman Gushchin Cc: Shakeel Butt Cc: Chengming Zhou Cc: Nhat Pham Cc: Waiman Long Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_zswap.c | 45 ++++++++++++--------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index 37aa83c2f1bf..23ff11390a33 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -15,6 +15,8 @@ #include "kselftest.h" #include "cgroup_util.h" +static int page_size; + #define PATH_ZSWAP "/sys/module/zswap" #define PATH_ZSWAP_ENABLED "/sys/module/zswap/parameters/enabled" @@ -73,11 +75,11 @@ static int allocate_and_read_bytes(const char *cgroup, void *arg) if (!mem) return -1; - for (int i = 0; i < size; i += 4095) + for (int i = 0; i < size; i += page_size) mem[i] = 'a'; /* Go through the allocated memory to (z)swap in and out pages */ - for (int i = 0; i < size; i += 4095) { + for (int i = 0; i < size; i += page_size) { if (mem[i] != 'a') ret = -1; } @@ -93,7 +95,7 @@ static int allocate_bytes(const char *cgroup, void *arg) if (!mem) return -1; - for (int i = 0; i < size; i += 4095) + for (int i = 0; i < size; i += page_size) mem[i] = 'a'; free(mem); return 0; @@ -167,7 +169,7 @@ static int test_swapin_nozswap(const char *root) int ret = KSFT_FAIL; char *test_group, mem_max_buf[32]; long swap_peak, zswpout, min_swap; - size_t allocation_size = sysconf(_SC_PAGESIZE) * 512; + 
size_t allocation_size = page_size * 512; min_swap = allocation_size / 4; snprintf(mem_max_buf, sizeof(mem_max_buf), "%zu", allocation_size * 3/4); @@ -245,7 +247,7 @@ static int test_zswapin(const char *root) goto out; } - if (zswpin < MB(24) / sysconf(_SC_PAGESIZE)) { + if (zswpin < MB(24) / page_size) { ksft_print_msg("at least 24MB should be brought back from zswap\n"); goto out; } @@ -272,9 +274,8 @@ out: */ static int attempt_writeback(const char *cgroup, void *arg) { - long pagesize = sysconf(_SC_PAGESIZE); size_t memsize = MB(4); - char buf[pagesize]; + char buf[page_size]; long zswap_usage; bool wb_enabled = *(bool *) arg; int ret = -1; @@ -289,11 +290,11 @@ static int attempt_writeback(const char *cgroup, void *arg) * half empty, this will result in data that is still compressible * and ends up in zswap, with material zswap usage. */ - for (int i = 0; i < pagesize; i++) - buf[i] = i < pagesize/2 ? (char) i : 0; + for (int i = 0; i < page_size; i++) + buf[i] = i < page_size/2 ? 
(char) i : 0; - for (int i = 0; i < memsize; i += pagesize) - memcpy(&mem[i], buf, pagesize); + for (int i = 0; i < memsize; i += page_size) + memcpy(&mem[i], buf, page_size); /* Try and reclaim allocated memory */ if (cg_write_numeric(cgroup, "memory.reclaim", memsize)) { @@ -304,8 +305,8 @@ static int attempt_writeback(const char *cgroup, void *arg) zswap_usage = cg_read_long(cgroup, "memory.zswap.current"); /* zswpin */ - for (int i = 0; i < memsize; i += pagesize) { - if (memcmp(&mem[i], buf, pagesize)) { + for (int i = 0; i < memsize; i += page_size) { + if (memcmp(&mem[i], buf, page_size)) { ksft_print_msg("invalid memory\n"); goto out; } @@ -441,7 +442,7 @@ static int test_no_invasive_cgroup_shrink(const char *root) if (cg_enter_current(control_group)) goto out; control_allocation = malloc(control_allocation_size); - for (int i = 0; i < control_allocation_size; i += 4095) + for (int i = 0; i < control_allocation_size; i += page_size) control_allocation[i] = 'a'; if (cg_read_key_long(control_group, "memory.stat", "zswapped") < 1) goto out; @@ -481,7 +482,7 @@ static int no_kmem_bypass_child(const char *cgroup, void *arg) values->child_allocated = true; return -1; } - for (long i = 0; i < values->target_alloc_bytes; i += 4095) + for (long i = 0; i < values->target_alloc_bytes; i += page_size) ((char *)allocation)[i] = 'a'; values->child_allocated = true; pause(); @@ -529,7 +530,7 @@ static int test_no_kmem_bypass(const char *root) min_free_kb_low = sys_info.totalram / 500000; values->target_alloc_bytes = (sys_info.totalram - min_free_kb_high * 1000) + sys_info.totalram * 5 / 100; - stored_pages_threshold = sys_info.totalram / 5 / 4096; + stored_pages_threshold = sys_info.totalram / 5 / page_size; trigger_allocation_size = sys_info.totalram / 20; /* Set up test memcg */ @@ -556,7 +557,7 @@ static int test_no_kmem_bypass(const char *root) if (!trigger_allocation) break; - for (int i = 0; i < trigger_allocation_size; i += 4095) + for (int i = 0; i < 
trigger_allocation_size; i += page_size) trigger_allocation[i] = 'b'; usleep(100000); free(trigger_allocation); @@ -567,8 +568,8 @@ static int test_no_kmem_bypass(const char *root) /* If memory was pushed to zswap, verify it belongs to memcg */ if (stored_pages > stored_pages_threshold) { int zswapped = cg_read_key_long(test_group, "memory.stat", "zswapped "); - int delta = stored_pages * 4096 - zswapped; - int result_ok = delta < stored_pages * 4096 / 4; + int delta = stored_pages * page_size - zswapped; + int result_ok = delta < stored_pages * page_size / 4; ret = result_ok ? KSFT_PASS : KSFT_FAIL; break; @@ -622,7 +623,7 @@ static int allocate_random_and_wait(const char *cgroup, void *arg) close(fd); /* Touch all pages to ensure they're faulted in */ - for (size_t i = 0; i < size; i += PAGE_SIZE) + for (size_t i = 0; i < size; i += page_size) mem[i] = mem[i]; /* Use MADV_PAGEOUT to push pages into zswap */ @@ -752,6 +753,10 @@ int main(int argc, char **argv) char root[PATH_MAX]; int i; + page_size = sysconf(_SC_PAGE_SIZE); + if (page_size <= 0) + page_size = BUF_SIZE; + ksft_print_header(); ksft_set_plan(ARRAY_SIZE(tests)); if (cg_find_unified_root(root, sizeof(root), NULL)) From a19b474927519c8822193f8bdc010641ec6ba404 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Fri, 24 Apr 2026 12:00:57 +0800 Subject: [PATCH 031/321] selftest/cgroup: fix zswap test_no_invasive_cgroup_shrink on large pagesize system MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_no_invasive_cgroup_shrink sets up two cgroups: wb_group, which is expected to trigger zswap writeback, and a control group (renamed to zw_group), which should only have pages sitting in zswap without any writeback. There are two problems with the current test: 1) The data patterns are reversed. 
wb_group uses allocate_bytes(), which writes only a single byte per page — trivially compressible, especially by zstd — so compressed pages fit within zswap.max and writeback is never triggered. Meanwhile, the control group uses getrandom() to produce hard-to-compress data, but it is the group that does *not* need writeback. 2) The test uses fixed sizes (10K zswap.max, 10MB allocation) that are too small on systems with large PAGE_SIZE (e.g. 64K), failing to build enough memory pressure to trigger writeback reliably. Fix both issues by: - Swapping the data patterns: fill wb_group pages with partially random data (getrandom for page_size/4 bytes) to resist compression and trigger writeback, and fill zw_group pages with simple repeated data to stay compressed in zswap. - Making all size parameters PAGE_SIZE-aware: set allocation size to PAGE_SIZE * 1024, memory.zswap.max to PAGE_SIZE, and memory.max to allocation_size / 2 for both cgroups. - Allocating memory inline instead of via cg_run() so the pages remain resident throughout the test. === Error Log === # getconf PAGESIZE 65536 # ./test_zswap TAP version 13 ... 
ok 5 test_zswap_writeback_disabled ok 6 # SKIP test_no_kmem_bypass not ok 7 test_no_invasive_cgroup_shrink Link: https://lore.kernel.org/20260424040059.12940-7-li.wang@linux.dev Signed-off-by: Li Wang Acked-by: Nhat Pham Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Tejun Heo Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yosry Ahmed Cc: Chengming Zhou Cc: Jiayuan Chen Cc: Waiman Long Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_zswap.c | 70 ++++++++++++++------- 1 file changed, 49 insertions(+), 21 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index 23ff11390a33..8f0478923bd0 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "kselftest.h" #include "cgroup_util.h" @@ -426,44 +427,71 @@ static int test_zswap_writeback_disabled(const char *root) static int test_no_invasive_cgroup_shrink(const char *root) { int ret = KSFT_FAIL; - size_t control_allocation_size = MB(10); - char *control_allocation = NULL, *wb_group = NULL, *control_group = NULL; + unsigned int off; + size_t allocation_size = page_size * 1024; + unsigned int nr_pages = allocation_size / page_size; + char zswap_max_buf[32], mem_max_buf[32]; + char *zw_allocation = NULL, *wb_allocation = NULL; + char *zw_group = NULL, *wb_group = NULL; + + snprintf(zswap_max_buf, sizeof(zswap_max_buf), "%d", page_size); + snprintf(mem_max_buf, sizeof(mem_max_buf), "%zu", allocation_size / 2); wb_group = setup_test_group_1M(root, "per_memcg_wb_test1"); if (!wb_group) return KSFT_FAIL; - if (cg_write(wb_group, "memory.zswap.max", "10K")) + if (cg_write(wb_group, "memory.zswap.max", zswap_max_buf)) goto out; - control_group = setup_test_group_1M(root, "per_memcg_wb_test2"); - if (!control_group) + if (cg_write(wb_group, "memory.max", mem_max_buf)) goto out; - /* Push some 
test_group2 memory into zswap */ - if (cg_enter_current(control_group)) + zw_group = setup_test_group_1M(root, "per_memcg_wb_test2"); + if (!zw_group) goto out; - control_allocation = malloc(control_allocation_size); - for (int i = 0; i < control_allocation_size; i += page_size) - control_allocation[i] = 'a'; - if (cg_read_key_long(control_group, "memory.stat", "zswapped") < 1) + if (cg_write(zw_group, "memory.max", mem_max_buf)) goto out; - /* Allocate 10x memory.max to push wb_group memory into zswap and trigger wb */ - if (cg_run(wb_group, allocate_bytes, (void *)MB(10))) + /* Push some zw_group memory into zswap (simple data, easy to compress) */ + if (cg_enter_current(zw_group)) goto out; + zw_allocation = malloc(allocation_size); + for (int i = 0; i < nr_pages; i++) { + off = (unsigned long)i * page_size; + memset(&zw_allocation[off], 0, page_size); + memset(&zw_allocation[off], 'a', page_size/4); + } + if (cg_read_key_long(zw_group, "memory.stat", "zswapped") < 1) + goto out; + + /* Push wb_group memory into zswap with hard-to-compress data to trigger wb */ + if (cg_enter_current(wb_group)) + goto out; + wb_allocation = malloc(allocation_size); + if (!wb_allocation) + goto out; + for (int i = 0; i < nr_pages; i++) { + off = (unsigned long)i * page_size; + memset(&wb_allocation[off], 0, page_size); + getrandom(&wb_allocation[off], page_size/4, 0); + } /* Verify that only zswapped memory from gwb_group has been written back */ - if (get_cg_wb_count(wb_group) > 0 && get_cg_wb_count(control_group) == 0) + if (get_cg_wb_count(wb_group) > 0 && get_cg_wb_count(zw_group) == 0) ret = KSFT_PASS; out: cg_enter_current(root); - if (control_group) { - cg_destroy(control_group); - free(control_group); + if (zw_group) { + cg_destroy(zw_group); + free(zw_group); } - cg_destroy(wb_group); - free(wb_group); - if (control_allocation) - free(control_allocation); + if (wb_group) { + cg_destroy(wb_group); + free(wb_group); + } + if (zw_allocation) + free(zw_allocation); + if 
(wb_allocation) + free(wb_allocation); return ret; } From 883015a9c328eaeac48395db36f9e5f864f6473d Mon Sep 17 00:00:00 2001 From: Li Wang Date: Fri, 24 Apr 2026 12:00:58 +0800 Subject: [PATCH 032/321] selftest/cgroup: fix zswap attempt_writeback() on 64K pagesize system MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In attempt_writeback(), a memsize of 4M only covers 64 pages on 64K page size systems. When memory.reclaim is called, the kernel prefers reclaiming clean file pages (binary, libc, linker, etc.) over swapping anonymous pages. With only 64 pages of anonymous memory, the reclaim target can be largely or entirely satisfied by dropping file pages, resulting in very few or zero anonymous pages being pushed into zswap. This causes zswap_usage to be extremely small or zero, making zswap_usage/4 insufficient to create meaningful writeback pressure. The test then fails because no writeback is triggered. On 4K page size systems this is not an issue because 4M covers 1024 pages, and file pages are a small fraction of the reclaim target. Fix this by: - Always allocating 1024 pages regardless of page size. This ensures enough anonymous pages to reliably populate zswap and trigger writeback, while keeping the original 4M allocation on 4K systems. - Setting zswap.max to zswap_usage/4 instead of zswap_usage/2 to create stronger writeback pressure, ensuring reclaim reliably triggers writeback even on large page size systems. === Error Log === # uname -rm 6.12.0-211.el10.ppc64le ppc64le # getconf PAGESIZE 65536 # ./test_zswap TAP version 13 1..7 ok 1 test_zswap_usage ok 2 test_swapin_nozswap ok 3 test_zswapin not ok 4 test_zswap_writeback_enabled ... 
Link: https://lore.kernel.org/20260424040059.12940-8-li.wang@linux.dev Signed-off-by: Li Wang Acked-by: Yosry Ahmed Acked-by: Nhat Pham Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Tejun Heo Cc: Roman Gushchin Cc: Shakeel Butt Cc: Chengming Zhou Cc: Jiayuan Chen Cc: Waiman Long Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_zswap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index 8f0478923bd0..5fe0cffb5575 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -268,14 +268,14 @@ out: This will move it into zswap. * 3. Save current zswap usage. * 4. Move the memory allocated in step 1 back in from zswap. - * 5. Set zswap.max to half the amount that was recorded in step 3. + * 5. Set zswap.max to 1/4 of the amount that was recorded in step 3. * 6. Attempt to reclaim memory equal to the amount that was allocated, this will either trigger writeback if it's enabled, or reclamation will fail if writeback is disabled as there isn't enough zswap space. */ static int attempt_writeback(const char *cgroup, void *arg) { - size_t memsize = MB(4); + size_t memsize = page_size * 1024; char buf[page_size]; long zswap_usage; bool wb_enabled = *(bool *) arg; @@ -313,12 +313,12 @@ static int attempt_writeback(const char *cgroup, void *arg) } } - if (cg_write_numeric(cgroup, "memory.zswap.max", zswap_usage/2)) + if (cg_write_numeric(cgroup, "memory.zswap.max", zswap_usage/4)) goto out; /* * If writeback is enabled, trying to reclaim memory now will trigger a - * writeback as zswap.max is half of what was needed when reclaim ran the first time. + * writeback as zswap.max is 1/4 of what was needed when reclaim ran the first time. * If writeback is disabled, memory reclaim will fail as zswap is limited and * it can't writeback to swap. 
*/ From e5ab892d05ca1a6b032dbc4c9795372daf226415 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Fri, 24 Apr 2026 12:00:59 +0800 Subject: [PATCH 033/321] selftests/cgroup: test_zswap: wait for asynchronous writeback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit zswap writeback is asynchronous, but test_zswap.c checks writeback counters immediately after reclaim/trigger paths. On some platforms (e.g. ppc64le), this can race with background writeback and cause spurious failures even when behavior is correct. Add wait_for_writeback() to poll get_cg_wb_count() with a bounded timeout, and use it in: test_zswap_writeback_one() when writeback is expected test_no_invasive_cgroup_shrink() for the wb_group check This keeps the original before/after assertion style while making the tests robust against writeback completion latency. No test behavior change, selftest stability improvement only. Link: https://lore.kernel.org/20260424040059.12940-9-li.wang@linux.dev Signed-off-by: Li Wang Acked-by: Nhat Pham Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Tejun Heo Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yosry Ahmed Cc: Chengming Zhou Cc: Jiayuan Chen Cc: Waiman Long Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_zswap.c | 28 +++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index 5fe0cffb5575..49b36ee79160 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -120,6 +120,27 @@ fail: return NULL; } +/* + * Writeback is asynchronous; poll until at least one writeback has + * been recorded for @cg, or until @timeout_ms has elapsed. 
+ */ +static long wait_for_writeback(const char *cg, int timeout_ms) +{ + long elapsed, count; + for (elapsed = 0; elapsed < timeout_ms; elapsed += 100) { + count = get_cg_wb_count(cg); + + if (count < 0) + return -1; + if (count > 0) + return count; + + usleep(100000); + } + + return 0; +} + /* * Sanity test to check that pages are written into zswap. */ @@ -345,7 +366,10 @@ static int test_zswap_writeback_one(const char *cgroup, bool wb) return -1; /* Verify that zswap writeback occurred only if writeback was enabled */ - zswpwb_after = get_cg_wb_count(cgroup); + if (wb) + zswpwb_after = wait_for_writeback(cgroup, 5000); + else + zswpwb_after = get_cg_wb_count(cgroup); if (zswpwb_after < 0) return -1; @@ -476,7 +500,7 @@ static int test_no_invasive_cgroup_shrink(const char *root) } /* Verify that only zswapped memory from gwb_group has been written back */ - if (get_cg_wb_count(wb_group) > 0 && get_cg_wb_count(zw_group) == 0) + if (wait_for_writeback(wb_group, 5000) > 0 && get_cg_wb_count(zw_group) == 0) ret = KSFT_PASS; out: cg_enter_current(root); From 13fe5736560d6635592b77b1b490fd018af33075 Mon Sep 17 00:00:00 2001 From: Sunny Patel Date: Sun, 19 Apr 2026 23:17:43 +0530 Subject: [PATCH 034/321] mm/migrate_device: clean up PMD checks and warnings Remove the odd VM_WARN_ON_FOLIO(!folio, folio) usage and replace it with a simpler VM_WARN_ON_ONCE(!folio) check. Drop the redundant VM_WARN_ON_ONCE(!pmd_none(*pmdp) && !is_huge_zero_pmd(*pmdp)). Refactor the PMD checks, making the control flow clearer and avoiding duplicate condition checks.
Link: https://lore.kernel.org/20260419174747.10701-1-nueralspacetech@gmail.com Signed-off-by: Sunny Patel Acked-by: Zi Yan Reviewed-by: Huang Ying Acked-by: David Hildenbrand (Arm) Cc: Alistair Popple Cc: Byungchul Park Cc: Gregory Price Cc: "Huang, Ying" Cc: Joshua Hahn Cc: Matthew Brost Cc: Rakie Kim Signed-off-by: Andrew Morton --- mm/migrate_device.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 19cd14b34114..554754eb26ff 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -801,8 +801,7 @@ static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate, bool flush = false; unsigned long i; - VM_WARN_ON_FOLIO(!folio, folio); - VM_WARN_ON_ONCE(!pmd_none(*pmdp) && !is_huge_zero_pmd(*pmdp)); + VM_WARN_ON_ONCE(!folio); if (!thp_vma_suitable_order(vma, addr, HPAGE_PMD_ORDER)) return -EINVAL; @@ -859,11 +858,9 @@ static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate, if (userfaultfd_missing(vma)) goto unlock_abort; - if (!pmd_none(*pmdp)) { - if (!is_huge_zero_pmd(*pmdp)) - goto unlock_abort; + if (is_huge_zero_pmd(*pmdp)) flush = true; - } else if (!pmd_none(*pmdp)) + else if (!pmd_none(*pmdp)) goto unlock_abort; add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); From e0974347f5bbf5f869d616779684a5ed8337c27b Mon Sep 17 00:00:00 2001 From: Sang-Heon Jeon Date: Sun, 19 Apr 2026 23:42:25 +0900 Subject: [PATCH 035/321] mm/sparse: remove unnecessary NULL check before allocating mem_section Commit 850ed20539a4 ("mm: move array mem_section init code out of memory_present()") moved mem_section allocation logic into memblocks_present(). Before that move, memory_present() could be called multiple times, so unlikely() matched the common case, where most calls found mem_section already allocated. After that move, memblocks_present() is called exactly once from sparse_init(). Under CONFIG_SPARSEMEM_EXTREME, mem_section is always NULL when it is called. 
So remove unnecessary NULL check before allocating mem_section. No functional change. Link: https://lore.kernel.org/20260419144225.2875654-1-ekffu200098@gmail.com Signed-off-by: Sang-Heon Jeon Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Donet Tom Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/sparse.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/sparse.c b/mm/sparse.c index effdac6b0ab1..e13f9f5fa090 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -201,13 +201,11 @@ static void __init memblocks_present(void) int i, nid; #ifdef CONFIG_SPARSEMEM_EXTREME - if (unlikely(!mem_section)) { - unsigned long size, align; + unsigned long size, align; - size = sizeof(struct mem_section *) * NR_SECTION_ROOTS; - align = 1 << (INTERNODE_CACHE_SHIFT); - mem_section = memblock_alloc_or_panic(size, align); - } + size = sizeof(struct mem_section *) * NR_SECTION_ROOTS; + align = 1 << (INTERNODE_CACHE_SHIFT); + mem_section = memblock_alloc_or_panic(size, align); #endif for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) From eb8fc9d285f95cd14697ef3df2b0c2e41c76cbdd Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 16 Apr 2026 14:23:02 +0800 Subject: [PATCH 036/321] mm/vmscan: fix typos in comments Fix three typos in comments: - Line 112: "zome_reclaim_mode" -> "zone_reclaim_mode" - Line 6208: "prioities" -> "priorities" - Line 7067: "that that high" -> "that the high" (duplicated word) Link: https://lore.kernel.org/20260416062302.727468-1-gxxa03070307@gmail.com Signed-off-by: Xiang Gao Reviewed-by: Barry Song Reviewed-by: Donet Tom Signed-off-by: Andrew Morton --- mm/vmscan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index b2d89ed69d22..a9fd43b23a58 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -109,7 +109,7 @@ struct scan_control { /* zone_reclaim_mode */ unsigned int may_unmap:1; -
/* zome_reclaim_mode, boost reclaim, cgroup restrictions */ + /* zone_reclaim_mode, boost reclaim, cgroup restrictions */ unsigned int may_swap:1; /* Not allow cache_trim_mode to be turned on as part of reclaim? */ @@ -6359,7 +6359,7 @@ static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc) if (current_is_kswapd() || cgroup_reclaim(sc)) return; - /* Throttle if making no progress at high prioities. */ + /* Throttle if making no progress at high priorities. */ if (sc->priority == 1 && !sc->nr_reclaimed) reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS); } @@ -7224,7 +7224,7 @@ restart: /* * There should be no need to raise the scanning priority if - * enough pages are already being scanned that that high + * enough pages are already being scanned that the high * watermark would be met at 100% efficiency. */ if (kswapd_shrink_node(pgdat, &sc)) From d86c9e971af2315119a78c564a802fafcebf1b6b Mon Sep 17 00:00:00 2001 From: Anthony Yznaga Date: Wed, 15 Apr 2026 20:39:37 -0700 Subject: [PATCH 037/321] mm: fix mmap errno value when MAP_DROPPABLE is not supported Patch series "fix MAP_DROPPABLE not supported errno", v4. Mark Brown reported seeing a regression in -next on 32 bit arm with the mlock selftests. Before exiting and marking the tests failed, the following message was logged after an attempt to create a MAP_DROPPABLE mapping: Bail out! mmap error: Unknown error 524 It turns out error 524 is ENOTSUPP which is an error that userspace is not supposed to see, but it indicates in this instance that MAP_DROPPABLE is not supported. The first patch changes the errno returned to EOPNOTSUPP. The second patch is a second version of a prior patch to introduce selftests to verify locking behavior with droppable mappings with the additional change to skip the tests when MAP_DROPPABLE is not supported. The third patch fixes the MAP_DROPPABLE selftest so that it is run by the framework and skips if MAP_DROPPABLE is not supported. 
This patch (of 3): On configs where MAP_DROPPABLE is not supported (currently any 32-bit config except for PPC32), mmap fails with errno set to ENOTSUPP. However, ENOTSUPP is not a standard error value that userspace knows about. The acceptable userspace-visible errno to use is EOPNOTSUPP. checkpatch.pl has a warning to this effect. Link: https://lore.kernel.org/20260416033939.49981-1-anthony.yznaga@oracle.com Link: https://lore.kernel.org/20260416033939.49981-2-anthony.yznaga@oracle.com Fixes: 9651fcedf7b9 ("mm: add MAP_DROPPABLE for designating always lazily freeable mappings") Signed-off-by: Anthony Yznaga Acked-by: David Hildenbrand (Arm) Acked-by: Vlastimil Babka (SUSE) Reported-by: Mark Brown Reviewed-by: Pedro Falcato Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Jann Horn Cc: Jason A. Donenfeld Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index 5754d1c36462..2311ae7c2ff4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -504,7 +504,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, break; case MAP_DROPPABLE: if (VM_DROPPABLE == VM_NONE) - return -ENOTSUPP; + return -EOPNOTSUPP; /* * A locked or stack area makes no sense to be droppable. * From c02dd57c57a6ae7dd05fdf8b861f1a76e1e4f8bc Mon Sep 17 00:00:00 2001 From: Anthony Yznaga Date: Wed, 15 Apr 2026 20:39:38 -0700 Subject: [PATCH 038/321] selftests/mm: verify droppable mappings cannot be locked For configs that support MAP_DROPPABLE verify that a mapping created with MAP_DROPPABLE cannot be locked via mlock(), and that it will not be locked if it's created after mlockall(MCL_FUTURE). Link: https://lore.kernel.org/20260416033939.49981-3-anthony.yznaga@oracle.com Signed-off-by: Anthony Yznaga Acked-by: David Hildenbrand (Arm) Cc: Jann Horn Cc: Jason A. 
Donenfeld Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mark Brown Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka (SUSE) Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mlock2-tests.c | 86 ++++++++++++++++++++--- 1 file changed, 76 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/mm/mlock2-tests.c b/tools/testing/selftests/mm/mlock2-tests.c index b474f2b20def..e16e288cc7c1 100644 --- a/tools/testing/selftests/mm/mlock2-tests.c +++ b/tools/testing/selftests/mm/mlock2-tests.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #define _GNU_SOURCE #include +#include #include #include #include @@ -163,14 +164,17 @@ static int lock_check(unsigned long addr) return (vma_rss == vma_size); } -static int unlock_lock_check(char *map) +static int unlock_lock_check(char *map, bool mlock_supported) { - if (is_vmflag_set((unsigned long)map, LOCKED)) { - ksft_print_msg("VMA flag %s is present on page 1 after unlock\n", LOCKED); - return 1; - } + if (!is_vmflag_set((unsigned long)map, LOCKED)) + return 0; - return 0; + if (mlock_supported) + ksft_print_msg("VMA flag %s is present on page 1 after unlock\n", LOCKED); + else + ksft_print_msg("VMA flag %s is present on an unsupported VMA\n", LOCKED); + + return 1; } static void test_mlock_lock(void) @@ -196,7 +200,7 @@ static void test_mlock_lock(void) ksft_exit_fail_msg("munlock(): %s\n", strerror(errno)); } - ksft_test_result(!unlock_lock_check(map), "%s: Unlocked\n", __func__); + ksft_test_result(!unlock_lock_check(map, true), "%s: Unlocked\n", __func__); munmap(map, 2 * page_size); } @@ -296,7 +300,7 @@ static void test_munlockall0(void) ksft_exit_fail_msg("munlockall(): %s\n", strerror(errno)); } - ksft_test_result(!unlock_lock_check(map), "%s: No locked memory\n", __func__); + ksft_test_result(!unlock_lock_check(map, true), "%s: No locked memory\n", __func__); munmap(map, 2 * page_size); } @@ -336,7 +340,67 @@ static void 
test_munlockall1(void) ksft_exit_fail_msg("munlockall() %s\n", strerror(errno)); } - ksft_test_result(!unlock_lock_check(map), "%s: No locked memory\n", __func__); + ksft_test_result(!unlock_lock_check(map, true), "%s: No locked memory\n", __func__); + munmap(map, 2 * page_size); +} + +/* Droppable memory should not be lockable. */ +static void test_mlock_droppable(void) +{ + char *map; + unsigned long page_size = getpagesize(); + + /* Ensure MCL_FUTURE is not set. */ + if (munlockall()) { + ksft_test_result_fail("munlockall() %s\n", strerror(errno)); + return; + } + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_DROPPABLE, -1, 0); + if (map == MAP_FAILED) { + if ((errno == EOPNOTSUPP) || (errno == EINVAL)) + ksft_test_result_skip("%s: MAP_DROPPABLE not supported\n", __func__); + else + ksft_test_result_fail("mmap error: %s\n", strerror(errno)); + return; + } + + if (mlock2_(map, 2 * page_size, 0)) + ksft_test_result_fail("mlock2(0): %s\n", strerror(errno)); + else + ksft_test_result(!unlock_lock_check(map, false), + "%s: droppable memory not locked\n", __func__); + + munmap(map, 2 * page_size); +} + +static void test_mlockall_future_droppable(void) +{ + char *map; + unsigned long page_size = getpagesize(); + + if (mlockall(MCL_CURRENT | MCL_FUTURE)) { + ksft_test_result_fail("mlockall(MCL_CURRENT | MCL_FUTURE): %s\n", strerror(errno)); + return; + } + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_DROPPABLE, -1, 0); + + if (map == MAP_FAILED) { + if ((errno == EOPNOTSUPP) || (errno == EINVAL)) + ksft_test_result_skip("%s: MAP_DROPPABLE not supported\n", __func__); + else + ksft_test_result_fail("mmap error: %s\n", strerror(errno)); + munlockall(); + return; + } + + ksft_test_result(!unlock_lock_check(map, false), "%s: droppable memory not locked\n", + __func__); + + munlockall(); munmap(map, 2 * page_size); } @@ -442,7 +506,7 @@ int main(int argc, char **argv) munmap(map, size); - ksft_set_plan(13); 
+ ksft_set_plan(15); test_mlock_lock(); test_mlock_onfault(); @@ -451,6 +515,8 @@ int main(int argc, char **argv) test_lock_onfault_of_present(); test_vma_management(true); test_mlockall(); + test_mlock_droppable(); + test_mlockall_future_droppable(); ksft_finished(); } From 303c6bdfe7cb51658fe632e31ee5a5d526c88435 Mon Sep 17 00:00:00 2001 From: Anthony Yznaga Date: Wed, 15 Apr 2026 20:39:39 -0700 Subject: [PATCH 039/321] selftests/mm: run the MAP_DROPPABLE selftest The test was not being run by the selftest framework so it was never noticed that it would fail with an assertion failure on configs without support for MAP_DROPPABLE. Update the test so that it is skipped instead when MAP_DROPPABLE is not supported, and add it to the mmap category so that the test is run by the framework. Link: https://lore.kernel.org/20260416033939.49981-4-anthony.yznaga@oracle.com Signed-off-by: Anthony Yznaga Acked-by: David Hildenbrand (Arm) Cc: Jann Horn Cc: Jason A. Donenfeld Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Mark Brown Cc: Vlastimil Babka (SUSE) Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/droppable.c | 9 ++++++++- tools/testing/selftests/mm/run_vmtests.sh | 1 + 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/droppable.c b/tools/testing/selftests/mm/droppable.c index 44940f75c461..30c8be37fcb9 100644 --- a/tools/testing/selftests/mm/droppable.c +++ b/tools/testing/selftests/mm/droppable.c @@ -26,7 +26,14 @@ int main(int argc, char *argv[]) ksft_set_plan(1); alloc = mmap(0, alloc_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_DROPPABLE, -1, 0); - assert(alloc != MAP_FAILED); + if (alloc == MAP_FAILED) { + if ((errno == EOPNOTSUPP) || (errno == EINVAL)) { + ksft_test_result_skip("MAP_DROPPABLE not supported\n"); + exit(KSFT_SKIP); + } + ksft_test_result_fail("mmap error: %s\n", strerror(errno)); + exit(KSFT_FAIL); + } 
memset(alloc, 'A', alloc_size); for (size_t i = 0; i < alloc_size; i += page_size) assert(*(uint8_t *)(alloc + i)); diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index c17b133a81d2..3b61677fe984 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -382,6 +382,7 @@ else fi CATEGORY="mmap" run_test ./map_populate +CATEGORY="mmap" run_test ./droppable CATEGORY="mlock" run_test ./mlock-random-test From 781b0e74748f14b0e732eb736370bbbed181fe4d Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 14 Apr 2026 15:58:13 +0800 Subject: [PATCH 040/321] mm/page_owner: fix %pGp format specifier argument type The %pGp format specifier expects an argument of type 'unsigned long *', but page->flags is now of type 'memdesc_flags_t' (a struct containing an unsigned long member 'f') after the introduction of memdesc_flags_t. Fix the type mismatch by passing &page->flags.f instead of &page->flags, which matches the expected type. 
Link: https://lore.kernel.org/20260414075813.3425968-1-zhen.ni@easystack.cn Fixes: 53fbef56e07d ("mm: introduce memdesc_flags_t") Signed-off-by: Zhen Ni Acked-by: Vlastimil Babka (SUSE) Cc: Brendan Jackman Cc: Johannes Weiner Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_owner.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 8178e0be557f..2dddcb6510aa 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -573,7 +573,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, migratetype_names[page_mt], pfn >> pageblock_order, migratetype_names[pageblock_mt], - &page->flags); + &page->flags.f); ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0); if (ret >= count) From 8c2c7df58b5433f614d603bbdffd85f2a392b74a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 12 Apr 2026 14:19:31 -0700 Subject: [PATCH 041/321] Docs/mm/damon/maintainer-profile: add AI review usage guideline DAMON is opted-in for DAMON patches scanning [1] and email delivery [2]. Clarify how that could be used on DAMON maintainer profile. 
Link: https://lore.kernel.org/20260412211932.89038-1-sj@kernel.org Link: https://github.com/sashiko-dev/sashiko/commit/ad9f4a98f958 [1] Link: https://github.com/sashiko-dev/sashiko/commit/b554c7b6e733 [2] Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Randy Dunlap Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- Documentation/mm/damon/maintainer-profile.rst | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/Documentation/mm/damon/maintainer-profile.rst b/Documentation/mm/damon/maintainer-profile.rst index bcb9798a27a8..fb2fa00cc9aa 100644 --- a/Documentation/mm/damon/maintainer-profile.rst +++ b/Documentation/mm/damon/maintainer-profile.rst @@ -100,3 +100,24 @@ There is also a public Google `calendar `_ that has the events. Anyone can subscribe to it. DAMON maintainer will also provide periodic reminders to the mailing list (damon@lists.linux.dev). + +AI Review +--------- + +For patches that are publicly posted to DAMON mailing list +(damon@lists.linux.dev), AI reviews of the patches will be available at +sashiko.dev. The reviews could also be sent as mails to the author of the +patch. + +Patch authors are encouraged to check the AI reviews and share their opinions. +The sharing could be done as a reply to the mail thread. Consider reducing the +recipients list for such sharing, since some people are not really interested +in AI reviews. As a rule of thumb, drop stable@vger.kernel.org and individuals +except DAMON maintainer. + +`hkml` also provides a `feature +`_ +for such sharing. Please feel free to use the feature. + +It is only an optional recommendation. DAMON maintainer could also ask any +question about the AI reviews, though. 
From ffe55393137c01aa01940b528afcea8c5a108ed7 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 10 Apr 2026 17:24:19 +0800 Subject: [PATCH 042/321] mm/sparse: remove sparse buffer pre-allocation mechanism Commit 9bdac9142407 ("sparsemem: Put mem map for one node together.") introduced a mechanism to pre-allocate a large memory block to hold all memmaps for a NUMA node upfront. However, the original commit message did not clearly state the actual benefits or the necessity of explicitly pre-allocating a single chunk for all memmap areas of a given node. One of the concerns about removing this pre-allocation is that the subsequent per-section memmap allocations could become scattered around, and might turn too many memory blocks/sections into an "un-offlinable" state. However, tests show that even without the explicit node-wide pre-allocation, memblock still allocates memory closely and back-to-back. When tracing vmemmap_set_pmd allocations, the physical chunks allocated by memblock are strictly adjacent to each other in a single contiguous physical range (mapped top-down). Because they are packed tightly together naturally, they will at most consume or pollute the exact same number of memory blocks as the explicit pre-allocation did. Another concern is the boot performance impact of calling memmap_alloc() multiple times compared to one large node-wide allocation. Tests on a 256GB VM showed that memmap allocation time increased from 199,555 ns to 741,292 ns. Even though it is 3.7x slower, on a 1TB machine, the entire memory allocation time would only take a few milliseconds. This boot performance difference is completely negligible. Since no negative impact on memory offlining behavior or noticeable boot performance regression was found, this patch proposes removing the explicit node-wide memmap pre-allocation mechanism to reduce the maintenance burden. 
Link: https://lore.kernel.org/20260410092419.2446420-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - mm/sparse-vmemmap.c | 7 +----- mm/sparse.c | 58 +-------------------------------------------- 3 files changed, 2 insertions(+), 64 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index e3b6112a8d79..8a0078a4dc78 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4855,7 +4855,6 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) } #endif -void *sparse_buffer_alloc(unsigned long size); unsigned long section_map_size(void); struct page * __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 3c35d2303a61..43f82621dd92 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -87,15 +87,10 @@ static void * __meminit altmap_alloc_block_buf(unsigned long size, void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node, struct vmem_altmap *altmap) { - void *ptr; - if (altmap) return altmap_alloc_block_buf(size, altmap); - ptr = sparse_buffer_alloc(size); - if (!ptr) - ptr = vmemmap_alloc_block(size, node); - return ptr; + return vmemmap_alloc_block(size, node); } static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap) diff --git a/mm/sparse.c b/mm/sparse.c index e13f9f5fa090..16ac6df3c89f 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -239,12 +239,9 @@ struct page __init *__populate_section_memmap(unsigned long pfn, struct dev_pagemap *pgmap) { unsigned long size = section_map_size(); - struct page *map = sparse_buffer_alloc(size); + struct page *map; phys_addr_t addr = __pa(MAX_DMA_ADDRESS); - if (map) - return map; - map = memmap_alloc(size, size, 
addr, nid, false); if (!map) panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n", @@ -254,55 +251,6 @@ struct page __init *__populate_section_memmap(unsigned long pfn, } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ -static void *sparsemap_buf __meminitdata; -static void *sparsemap_buf_end __meminitdata; - -static inline void __meminit sparse_buffer_free(unsigned long size) -{ - WARN_ON(!sparsemap_buf || size == 0); - memblock_free(sparsemap_buf, size); -} - -static void __init sparse_buffer_init(unsigned long size, int nid) -{ - phys_addr_t addr = __pa(MAX_DMA_ADDRESS); - WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */ - /* - * Pre-allocated buffer is mainly used by __populate_section_memmap - * and we want it to be properly aligned to the section size - this is - * especially the case for VMEMMAP which maps memmap to PMDs - */ - sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true); - sparsemap_buf_end = sparsemap_buf + size; -} - -static void __init sparse_buffer_fini(void) -{ - unsigned long size = sparsemap_buf_end - sparsemap_buf; - - if (sparsemap_buf && size > 0) - sparse_buffer_free(size); - sparsemap_buf = NULL; -} - -void * __meminit sparse_buffer_alloc(unsigned long size) -{ - void *ptr = NULL; - - if (sparsemap_buf) { - ptr = (void *) roundup((unsigned long)sparsemap_buf, size); - if (ptr + size > sparsemap_buf_end) - ptr = NULL; - else { - /* Free redundant aligned space */ - if ((unsigned long)(ptr - sparsemap_buf) > 0) - sparse_buffer_free((unsigned long)(ptr - sparsemap_buf)); - sparsemap_buf = ptr + size; - } - } - return ptr; -} - void __weak __meminit vmemmap_populate_print_last(void) { } @@ -360,8 +308,6 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin, goto failed; } - sparse_buffer_init(map_count * section_map_size(), nid); - sparse_vmemmap_init_nid_early(nid); for_each_present_section_nr(pnum_begin, pnum) { @@ -379,7 +325,6 @@ static void __init sparse_init_nid(int nid, 
unsigned long pnum_begin, __func__, nid); pnum_begin = pnum; sparse_usage_fini(); - sparse_buffer_fini(); goto failed; } memmap_boot_pages_add(DIV_ROUND_UP(PAGES_PER_SECTION * sizeof(struct page), @@ -388,7 +333,6 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin, } } sparse_usage_fini(); - sparse_buffer_fini(); return; failed: /* From db5e2c01ca3a8fba1c0687d7eec3ac701387a31f Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Fri, 10 Apr 2026 15:47:39 +0800 Subject: [PATCH 043/321] mm/memory-failure: use bool for forcekill state 'forcekill' is used as a boolean flag to control whether processes should be forcibly killed. It is only assigned from boolean expressions and never used in arithmetic or bitmask operations. Convert it from int to bool. No functional change intended. Link: https://lore.kernel.org/20260410074740.2524718-1-ye.liu@linux.dev Signed-off-by: Ye Liu Reviewed-by: SeongJae Park Acked-by: Miaohe Lin Cc: Liu Ye Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ee42d4361309..62b547c168fc 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -459,7 +459,7 @@ void add_to_kill_ksm(struct task_struct *tsk, const struct page *p, * Only do anything when FORCEKILL is set, otherwise just free the * list (this is used for clean pages which do not need killing) */ -static void kill_procs(struct list_head *to_kill, int forcekill, +static void kill_procs(struct list_head *to_kill, bool forcekill, unsigned long pfn, int flags) { struct to_kill *tk, *next; @@ -1582,7 +1582,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p, { LIST_HEAD(tokill); bool unmap_success; - int forcekill; + bool forcekill; bool mlocked = folio_test_mlocked(folio); /* @@ -1703,7 +1703,7 @@ static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn, unmap_mapping_range(mapping, start, size, 
0); } - kill_procs(to_kill, flags & MF_MUST_KILL, pfn, flags); + kill_procs(to_kill, !!(flags & MF_MUST_KILL), pfn, flags); } /* From 98e09ce7bb67902c452d22a2a10baf3f0951f3d2 Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Thu, 9 Apr 2026 09:43:22 +0800 Subject: [PATCH 044/321] mm/khugepaged: use ALIGN helpers for PMD alignment PMD alignment in khugepaged is currently implemented using a mix of rounding helpers and open-coded bitmask operations. Use ALIGN() and ALIGN_DOWN() consistently for PMD-sized address range alignment, matching the preferred style for address and size handling. No functional change intended. Link: https://lore.kernel.org/20260409014323.2385982-1-ye.liu@linux.dev Signed-off-by: Ye Liu Reviewed-by: Zi Yan Reviewed-by: Barry Song Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand (Arm) Reviewed-by: SeongJae Park Cc: Baolin Wang Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Liu Ye Cc: Nico Pache Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/khugepaged.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b8452dbdb043..5f4e009593e0 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2528,8 +2528,8 @@ static void collapse_scan_mm_slot(unsigned int progress_max, cc->progress++; continue; } - hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE); - hend = round_down(vma->vm_end, HPAGE_PMD_SIZE); + hstart = ALIGN(vma->vm_start, HPAGE_PMD_SIZE); + hend = ALIGN_DOWN(vma->vm_end, HPAGE_PMD_SIZE); if (khugepaged_scan.address > hend) { cc->progress++; continue; @@ -2845,8 +2845,8 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, mmgrab(mm); lru_add_drain_all(); - hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; - hend = end & HPAGE_PMD_MASK; + hstart = ALIGN(start, HPAGE_PMD_SIZE); + hend = ALIGN_DOWN(end, HPAGE_PMD_SIZE); for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { enum scan_result result = SCAN_FAIL; From 
b0f3d00e15e82242d08791fea00807cb01eb1235 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 8 Apr 2026 08:47:00 -0700 Subject: [PATCH 045/321] mm: huge_memory: use sysfs_match_string() in defrag_store() Patch series "mm: huge_memory: clean up defrag sysfs with shared data tables", v2. Refactor defrag_store() and defrag_show() to use shared data tables instead of duplicated if/else chains. Patch 1 introduces an enum defrag_mode, a defrag_mode_strings[] table, and a defrag_flags[] mapping array, then rewrites defrag_store() to use sysfs_match_string() with a loop over defrag_flags[]. Patch 2 refactors defrag_show() to use the same arrays, replacing its hardcoded if/else chain of test_bit() calls and string literals. This follows the same pattern applied to anon_enabled_store() in commit 522dfb4ba71f ("mm: huge_memory: refactor anon_enabled_store() with change_anon_orders()"). This patch (of 2): Replace the if/else chain of sysfs_streq() calls in defrag_store() with sysfs_match_string() and a defrag_mode_strings[] table. Introduce enum defrag_mode and defrag_flags[] array mapping each mode to its corresponding transparent_hugepage_flag. The store function now loops over defrag_flags[], setting the bit for the selected mode and clearing the others. When mode is DEFRAG_NEVER (index 4), no index in the 4-element defrag_flags[] matches, so all flags are cleared. Note that the enum ordering (always, defer, defer+madvise, madvise, never) differs from the original if/else chain order in defrag_store() (always, defer+madvise, defer, madvise, never). This is intentional to match the display order used by defrag_show(). This is a follow-up cleanup to commit 522dfb4ba71f ("mm: huge_memory: refactor anon_enabled_store() with change_anon_orders()") which applied the same sysfs_match_string() pattern to anon_enabled_store(). 
Link: https://lore.kernel.org/20260408-thp_defrag-v2-0-bc544c1bde4e@debian.org Link: https://lore.kernel.org/20260408-thp_defrag-v2-1-bc544c1bde4e@debian.org Signed-off-by: Breno Leitao Acked-by: David Hildenbrand (Arm) Tested-by: Lance Yang Reviewed-by: Lance Yang Reviewed-by: Barry Song Reviewed-by: Lorenzo Stoakes Tested-by: Zi Yan Acked-by: Zi Yan Cc: Baolin Wang Cc: Dev Jain Cc: Liam Howlett Cc: Nico Pache Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/huge_memory.c | 60 +++++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4586f3ccb133..62e00b21fdf4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -429,6 +429,29 @@ ssize_t single_hugepage_flag_store(struct kobject *kobj, return count; } +enum defrag_mode { + DEFRAG_ALWAYS = 0, + DEFRAG_DEFER, + DEFRAG_DEFER_MADVISE, + DEFRAG_MADVISE, + DEFRAG_NEVER, +}; + +static const char * const defrag_mode_strings[] = { + [DEFRAG_ALWAYS] = "always", + [DEFRAG_DEFER] = "defer", + [DEFRAG_DEFER_MADVISE] = "defer+madvise", + [DEFRAG_MADVISE] = "madvise", + [DEFRAG_NEVER] = "never", +}; + +static const enum transparent_hugepage_flag defrag_flags[] = { + [DEFRAG_ALWAYS] = TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, + [DEFRAG_DEFER] = TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, + [DEFRAG_DEFER_MADVISE] = TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, + [DEFRAG_MADVISE] = TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, +}; + static ssize_t defrag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -456,34 +479,19 @@ static ssize_t defrag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - if (sysfs_streq(buf, "always")) { - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, 
&transparent_hugepage_flags); - set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); - } else if (sysfs_streq(buf, "defer+madvise")) { - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); - set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); - } else if (sysfs_streq(buf, "defer")) { - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); - set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); - } else if (sysfs_streq(buf, "madvise")) { - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); - set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); - } else if (sysfs_streq(buf, "never")) { - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); - } else + int mode, m; + + mode = sysfs_match_string(defrag_mode_strings, buf); + if (mode < 0) return -EINVAL; + for (m = 0; m < ARRAY_SIZE(defrag_flags); m++) { + if (m == mode) + set_bit(defrag_flags[m], &transparent_hugepage_flags); + else + clear_bit(defrag_flags[m], &transparent_hugepage_flags); + } + return count; } static struct 
kobj_attribute defrag_attr = __ATTR_RW(defrag); From 1d8274b82cd1870eba883fd20204bcd8601c3527 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 8 Apr 2026 08:47:01 -0700 Subject: [PATCH 046/321] mm: huge_memory: refactor defrag_show() to use defrag_flags[] Replace the hardcoded if/else chain of test_bit() calls and string literals in defrag_show() with a loop over defrag_flags[] and defrag_mode_strings[] arrays introduced in the previous commit. This makes defrag_show() consistent with defrag_store() and eliminates the duplicated mode name strings. Link: https://lore.kernel.org/20260408-thp_defrag-v2-2-bc544c1bde4e@debian.org Signed-off-by: Breno Leitao Acked-by: David Hildenbrand (Arm) Tested-by: Lance Yang Reviewed-by: Lance Yang Reviewed-by: Barry Song Reviewed-by: Lorenzo Stoakes Tested-by: Zi Yan Acked-by: Zi Yan Cc: Baolin Wang Cc: Dev Jain Cc: Liam Howlett Cc: Nico Pache Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/huge_memory.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 62e00b21fdf4..e9d499da0ac7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -455,24 +455,30 @@ static const enum transparent_hugepage_flag defrag_flags[] = { static ssize_t defrag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - const char *output; + int active = DEFRAG_NEVER; + int len = 0; + int i; - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, - &transparent_hugepage_flags)) - output = "[always] defer defer+madvise madvise never"; - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, - &transparent_hugepage_flags)) - output = "always [defer] defer+madvise madvise never"; - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, - &transparent_hugepage_flags)) - output = "always defer [defer+madvise] madvise never"; - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, - &transparent_hugepage_flags)) - output 
= "always defer defer+madvise [madvise] never"; - else - output = "always defer defer+madvise madvise [never]"; + for (i = 0; i < ARRAY_SIZE(defrag_flags); i++) { + if (test_bit(defrag_flags[i], &transparent_hugepage_flags)) { + active = i; + break; + } + } - return sysfs_emit(buf, "%s\n", output); + for (i = 0; i < ARRAY_SIZE(defrag_mode_strings); i++) { + if (i == active) + len += sysfs_emit_at(buf, len, "[%s] ", + defrag_mode_strings[i]); + else + len += sysfs_emit_at(buf, len, "%s ", + defrag_mode_strings[i]); + } + + /* Replace trailing space with newline */ + buf[len - 1] = '\n'; + + return len; } static ssize_t defrag_store(struct kobject *kobj, From f2a950170f7a78761c2b2e5e535716fb0f8c0813 Mon Sep 17 00:00:00 2001 From: "JP Kobryn (Meta)" Date: Mon, 6 Apr 2026 12:50:14 -0700 Subject: [PATCH 047/321] mm/vmpressure: skip socket pressure for costly order reclaim When reclaim is triggered by high order allocations on a fragmented system, vmpressure() can report poor reclaim efficiency even though the system has plenty of free memory. This is because many pages are scanned, but few are found to actually reclaim - the pages are actively in use and don't need to be freed. The resulting scan:reclaim ratio causes vmpressure() to assert socket pressure, throttling TCP throughput unnecessarily. Costly order allocations (above PAGE_ALLOC_COSTLY_ORDER) rely heavily on compaction to succeed, so poor reclaim efficiency at these orders does not necessarily indicate memory pressure. The kernel already treats this order as the boundary where reclaim is no longer expected to succeed and compaction may take over. Make vmpressure() order-aware through an additional parameter sourced from scan_control at existing call sites. Socket pressure is now only asserted when order <= PAGE_ALLOC_COSTLY_ORDER. Memcg reclaim is unaffected since try_to_free_mem_cgroup_pages() always uses order 0, which passes the filter unconditionally. 
Similarly, vmpressure_prio() now passes order 0 internally when calling vmpressure(), ensuring critical pressure from low reclaim priority is not suppressed by the order filter. The patch was motivated by a case of impacted net throughput in production. On one affected host, the memory state at the time showed ~15GB available, zero cgroup pressure, and the following buddyinfo state: Order FreePages 0: 133,970 1: 29,230 2: 17,351 3: 18,984 7+: 0 Using bpf, it was found that 94% of vmpressure calls on this host were from order-7 kswapd reclaim. TCP minimum recv window is rcv_ssthresh:19712. Before patch: 723 out of 3,843 (19%) TCP connections stuck at minimum recv window After live-patching and ~30min elapsed: 0 out of 3,470 TCP connections stuck at minimum recv window Link: https://lore.kernel.org/20260406195014.112521-1-jp.kobryn@linux.dev Signed-off-by: JP Kobryn (Meta) Reviewed-by: Rik van Riel Acked-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Jakub Kicinski Reviewed-by: Barry Song Acked-by: Vlastimil Babka (SUSE) Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Eric Dumazet Cc: Kairui Song Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Qi Zheng Cc: Suren Baghdasaryan Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/vmpressure.h | 9 +++++---- mm/vmpressure.c | 15 ++++++++++++--- mm/vmscan.c | 8 ++++---- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 6a2f51ebbfd3..faecd5522401 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -30,8 +30,8 @@ struct vmpressure { struct mem_cgroup; #ifdef CONFIG_MEMCG -extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, - unsigned long scanned, unsigned long reclaimed); +void vmpressure(gfp_t gfp, int order, struct mem_cgroup *memcg, bool tree, + unsigned long scanned, unsigned long reclaimed); extern void 
vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio); extern void vmpressure_init(struct vmpressure *vmpr); @@ -44,8 +44,9 @@ extern int vmpressure_register_event(struct mem_cgroup *memcg, extern void vmpressure_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd); #else -static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, - unsigned long scanned, unsigned long reclaimed) {} +static inline void vmpressure(gfp_t gfp, int order, struct mem_cgroup *memcg, + bool tree, unsigned long scanned, + unsigned long reclaimed) {} static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) {} #endif /* CONFIG_MEMCG */ diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 3fbb86996c4d..f053554e5826 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -218,6 +218,7 @@ static void vmpressure_work_fn(struct work_struct *work) /** * vmpressure() - Account memory pressure through scanned/reclaimed ratio * @gfp: reclaimer's gfp mask + * @order: allocation order being reclaimed for * @memcg: cgroup memory controller handle * @tree: legacy subtree mode * @scanned: number of pages scanned @@ -236,7 +237,7 @@ static void vmpressure_work_fn(struct work_struct *work) * * This function does not return any value. */ -void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, +void vmpressure(gfp_t gfp, int order, struct mem_cgroup *memcg, bool tree, unsigned long scanned, unsigned long reclaimed) { struct vmpressure *vmpr; @@ -307,7 +308,15 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, level = vmpressure_calc_level(scanned, reclaimed); - if (level > VMPRESSURE_LOW) { + /* + * Once we go above COSTLY_ORDER, reclaim relies heavily on + * compaction to make progress. Reclaim efficiency was never a + * great proxy for pressure to begin with, but it's outright + * misleading with these high orders. 
Don't throttle sockets + * because somebody is attempting something crazy like an order-7 + * and predictably struggling. + */ + if (level > VMPRESSURE_LOW && order <= PAGE_ALLOC_COSTLY_ORDER) { /* * Let the socket buffer allocator know that * we are having trouble reclaiming LRU pages. @@ -348,7 +357,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) * to the vmpressure() basically means that we signal 'critical' * level. */ - vmpressure(gfp, memcg, true, vmpressure_win, 0); + vmpressure(gfp, 0, memcg, true, vmpressure_win, 0); } #define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2) diff --git a/mm/vmscan.c b/mm/vmscan.c index a9fd43b23a58..4b0984387658 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -5071,8 +5071,8 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); if (!sc->proactive) - vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, - sc->nr_reclaimed - reclaimed); + vmpressure(sc->gfp_mask, sc->order, memcg, false, + sc->nr_scanned - scanned, sc->nr_reclaimed - reclaimed); flush_reclaim_state(sc); @@ -6175,7 +6175,7 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) /* Record the group's reclaim efficiency */ if (!sc->proactive) - vmpressure(sc->gfp_mask, memcg, false, + vmpressure(sc->gfp_mask, sc->order, memcg, false, sc->nr_scanned - scanned, sc->nr_reclaimed - reclaimed); @@ -6220,7 +6220,7 @@ again: /* Record the subtree's reclaim efficiency */ if (!sc->proactive) - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, + vmpressure(sc->gfp_mask, sc->order, sc->target_mem_cgroup, true, sc->nr_scanned - nr_scanned, nr_node_reclaimed); if (nr_node_reclaimed) From d590df11be0f18cdf817fcd20e9f3c51962df5d7 Mon Sep 17 00:00:00 2001 From: David Carlier Date: Thu, 2 Apr 2026 08:26:50 +0100 Subject: [PATCH 048/321] mm/page_io: rename swap_iocb fields for clarity swap_iocb->pages tracks the 
number of bvec entries (folios), not base pages. Rename the array from bvec to bvecs and the counter from pages to nr_bvecs to accurately reflect their purpose. Link: https://lore.kernel.org/20260402072650.48811-1-devnexen@gmail.com Signed-off-by: David Carlier Suggested-by: Matthew Wilcox (Oracle) Suggested-by: David Hildenbrand Acked-by: David Hildenbrand (Arm) Acked-by: Chris Li Cc: Baoquan He Cc: Kairui Song Cc: Kemeng Shi Cc: NeilBrown Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/page_io.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 70cea9e24d2f..7ed76592e20d 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -326,8 +326,8 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio) struct swap_iocb { struct kiocb iocb; - struct bio_vec bvec[SWAP_CLUSTER_MAX]; - int pages; + struct bio_vec bvecs[SWAP_CLUSTER_MAX]; + int nr_bvecs; int len; }; static mempool_t *sio_pool; @@ -348,7 +348,7 @@ int sio_pool_init(void) static void sio_write_complete(struct kiocb *iocb, long ret) { struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb); - struct page *page = sio->bvec[0].bv_page; + struct page *page = sio->bvecs[0].bv_page; int p; if (ret != sio->len) { @@ -362,15 +362,15 @@ static void sio_write_complete(struct kiocb *iocb, long ret) */ pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n", ret, swap_dev_pos(page_swap_entry(page))); - for (p = 0; p < sio->pages; p++) { - page = sio->bvec[p].bv_page; + for (p = 0; p < sio->nr_bvecs; p++) { + page = sio->bvecs[p].bv_page; set_page_dirty(page); ClearPageReclaim(page); } } - for (p = 0; p < sio->pages; p++) - end_page_writeback(sio->bvec[p].bv_page); + for (p = 0; p < sio->nr_bvecs; p++) + end_page_writeback(sio->bvecs[p].bv_page); mempool_free(sio, sio_pool); } @@ -397,13 +397,13 @@ static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug) 
init_sync_kiocb(&sio->iocb, swap_file); sio->iocb.ki_complete = sio_write_complete; sio->iocb.ki_pos = pos; - sio->pages = 0; + sio->nr_bvecs = 0; sio->len = 0; } - bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0); + bvec_set_folio(&sio->bvecs[sio->nr_bvecs], folio, folio_size(folio), 0); sio->len += folio_size(folio); - sio->pages += 1; - if (sio->pages == ARRAY_SIZE(sio->bvec) || !swap_plug) { + sio->nr_bvecs += 1; + if (sio->nr_bvecs == ARRAY_SIZE(sio->bvecs) || !swap_plug) { swap_write_unplug(sio); sio = NULL; } @@ -477,7 +477,7 @@ void swap_write_unplug(struct swap_iocb *sio) struct address_space *mapping = sio->iocb.ki_filp->f_mapping; int ret; - iov_iter_bvec(&from, ITER_SOURCE, sio->bvec, sio->pages, sio->len); + iov_iter_bvec(&from, ITER_SOURCE, sio->bvecs, sio->nr_bvecs, sio->len); ret = mapping->a_ops->swap_rw(&sio->iocb, &from); if (ret != -EIOCBQUEUED) sio_write_complete(&sio->iocb, ret); @@ -489,8 +489,8 @@ static void sio_read_complete(struct kiocb *iocb, long ret) int p; if (ret == sio->len) { - for (p = 0; p < sio->pages; p++) { - struct folio *folio = page_folio(sio->bvec[p].bv_page); + for (p = 0; p < sio->nr_bvecs; p++) { + struct folio *folio = page_folio(sio->bvecs[p].bv_page); count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN); count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio)); @@ -499,8 +499,8 @@ static void sio_read_complete(struct kiocb *iocb, long ret) } count_vm_events(PSWPIN, sio->len >> PAGE_SHIFT); } else { - for (p = 0; p < sio->pages; p++) { - struct folio *folio = page_folio(sio->bvec[p].bv_page); + for (p = 0; p < sio->nr_bvecs; p++) { + struct folio *folio = page_folio(sio->bvecs[p].bv_page); folio_unlock(folio); } @@ -559,13 +559,13 @@ static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug) init_sync_kiocb(&sio->iocb, sis->swap_file); sio->iocb.ki_pos = pos; sio->iocb.ki_complete = sio_read_complete; - sio->pages = 0; + sio->nr_bvecs = 0; sio->len = 0; } - 
bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0); + bvec_set_folio(&sio->bvecs[sio->nr_bvecs], folio, folio_size(folio), 0); sio->len += folio_size(folio); - sio->pages += 1; - if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) { + sio->nr_bvecs += 1; + if (sio->nr_bvecs == ARRAY_SIZE(sio->bvecs) || !plug) { swap_read_unplug(sio); sio = NULL; } @@ -666,7 +666,7 @@ void __swap_read_unplug(struct swap_iocb *sio) struct address_space *mapping = sio->iocb.ki_filp->f_mapping; int ret; - iov_iter_bvec(&from, ITER_DEST, sio->bvec, sio->pages, sio->len); + iov_iter_bvec(&from, ITER_DEST, sio->bvecs, sio->nr_bvecs, sio->len); ret = mapping->a_ops->swap_rw(&sio->iocb, &from); if (ret != -EIOCBQUEUED) sio_read_complete(&sio->iocb, ret); From c70a9f639bfd662b95b5e3e64f4b62b13c237eca Mon Sep 17 00:00:00 2001 From: wangxuewen Date: Thu, 2 Apr 2026 14:49:46 +0800 Subject: [PATCH 049/321] mm/memory-failure: replace magic number 3 with GET_PAGE_MAX_RETRY_NUM Replace the hardcoded magic number 3 in get_any_page() with the existing GET_PAGE_MAX_RETRY_NUM macro for code consistency and maintainability. This change has no functional impact, only improves code readability and unifies the retry limit configuration. Link: https://lore.kernel.org/20260402064946.1124250-1-18810879172@163.com Signed-off-by: wangxuewen Acked-by: SeongJae Park Acked-by: Miaohe Lin Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 62b547c168fc..866c4428ac7e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1418,7 +1418,7 @@ try_again: * We raced with (possibly temporary) unhandlable * page, retry. 
*/ - if (pass++ < 3) { + if (pass++ < GET_PAGE_MAX_RETRY_NUM) { shake_page(p); goto try_again; } From 1d05e1f6ac26263dc27cfb2796ef4e5e24e070f4 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Tue, 31 Mar 2026 10:52:13 +0000 Subject: [PATCH 050/321] mm/page_alloc: cleanup flag vars in alloc_pages_bulk_noprof() These two variables are redundant, squash them to align alloc_pages_bulk_noprof() with the style used in alloc_frozen_pages_nolock_noprof(). Link: https://lore.kernel.org/20260331-b4-prepare_alloc_pages-flags-v1-1-ea2416def698@google.com Signed-off-by: Brendan Jackman Reviewed-by: Suren Baghdasaryan Reviewed-by: Vishal Moola Reviewed-by: Vlastimil Babka (SUSE) Reviewed-by: Anshuman Khandual Cc: Johannes Weiner Cc: Michal Hocko Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a81ae5781036..baf41005f90e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5054,7 +5054,6 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, struct per_cpu_pages *pcp; struct list_head *pcp_list; struct alloc_context ac; - gfp_t alloc_gfp; unsigned int alloc_flags = ALLOC_WMARK_LOW; int nr_populated = 0, nr_account = 0; @@ -5095,10 +5094,8 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */ gfp &= gfp_allowed_mask; - alloc_gfp = gfp; - if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags)) + if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &gfp, &alloc_flags)) goto out; - gfp = alloc_gfp; /* Find an allowed local zone that meets the low watermark. 
*/ z = ac.preferred_zoneref; From b9fe373e7d3c5a0814fe45d8cb41f649ed5e244d Mon Sep 17 00:00:00 2001 From: Julian Braha Date: Tue, 31 Mar 2026 08:07:30 +0100 Subject: [PATCH 051/321] mm/thp: dead code cleanup in Kconfig There is already an 'if TRANSPARENT_HUGEPAGE' condition wrapping several config options e.g. 'READ_ONLY_THP_FOR_FS', making the 'depends on' statement for each of these a duplicate dependency (dead code). I propose leaving the outer 'if TRANSPARENT_HUGEPAGE...endif' and removing the individual 'depends on TRANSPARENT_HUGEPAGE' statement from each option. This dead code was found by kconfirm, a static analysis tool for Kconfig. Link: https://lore.kernel.org/20260331070730.33915-1-julianbraha@gmail.com Signed-off-by: Julian Braha Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand (Arm) Reviewed-by: Anshuman Khandual Cc: Johannes Weiner Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/Kconfig | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index e8bf1e9e6ad9..e221fa1dc54d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -810,7 +810,6 @@ if TRANSPARENT_HUGEPAGE choice prompt "Transparent Hugepage Support sysfs defaults" - depends on TRANSPARENT_HUGEPAGE default TRANSPARENT_HUGEPAGE_ALWAYS help Selects the sysfs defaults for Transparent Hugepage Support. 
@@ -840,7 +839,6 @@ endchoice choice prompt "Shmem hugepage allocation defaults" - depends on TRANSPARENT_HUGEPAGE default TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER help Selects the hugepage allocation policy defaults for @@ -886,7 +884,6 @@ endchoice choice prompt "Tmpfs hugepage allocation defaults" - depends on TRANSPARENT_HUGEPAGE default TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER help Selects the hugepage allocation policy defaults for @@ -931,7 +928,7 @@ endchoice config THP_SWAP def_bool y - depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP && 64BIT + depends on ARCH_WANTS_THP_SWAP && SWAP && 64BIT help Swap transparent huge pages in one piece, without splitting. XXX: For now, swap cluster backing transparent huge page From 94e0bcde055ee1bd758218ec3a4ff098874123ac Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 30 Mar 2026 18:20:57 -0700 Subject: [PATCH 052/321] mm, page_alloc: reintroduce page allocation stall warning Previously, we had warnings when a single page allocation took longer than reasonably expected. This was introduced in commit 63f53dea0c98 ("mm: warn about allocations which stall for too long"). The warning was subsequently reverted in commit 400e22499dd9 ("mm: don't warn about allocations which stall for too long") because it was possible to generate memory pressure that would effectively stall further progress through printk execution. Page allocation stalls in excess of 10 seconds are always useful to debug because they can result in severe userspace unresponsiveness. Adding this artifact can be used to correlate with userspace going out to lunch and to understand the state of memory at the time. There should be a reasonable expectation that this warning will never trigger given it is very passive, it will only be emitted when a page allocation takes longer than 10 seconds. 
If it does trigger, this reveals an issue that should be fixed: a single page allocation should never loop for more than 10 seconds without oom killing to make memory available. Unlike the original implementation, this implementation only reports stalls once for the system every 10 seconds. Otherwise, many concurrent reclaimers could spam the kernel log unnecessarily. Stalls are only reported when calling into direct reclaim. Link: https://lore.kernel.org/371c86c8-1d47-bd70-b74c-769842718b1f@google.com Signed-off-by: David Rientjes Acked-by: Vlastimil Babka (SUSE) Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Cc: Brendan Jackman Cc: Johannes Weiner Cc: Petr Mladek Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index baf41005f90e..d9c6313e69f3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -285,6 +285,14 @@ EXPORT_SYMBOL(nr_node_ids); EXPORT_SYMBOL(nr_online_nodes); #endif +/* + * When page allocations stall for longer than a threshold, + * ALLOC_STALL_WARN_MSECS, leave a warning in the kernel log. Only one warning + * will be printed during this duration for the entire system. 
+ */ +#define ALLOC_STALL_WARN_MSECS (10 * 1000UL) +static unsigned long alloc_stall_warn_jiffies = INITIAL_JIFFIES; + static bool page_contains_unaccepted(struct page *page, unsigned int order); static bool cond_accept_memory(struct zone *zone, unsigned int order, int alloc_flags); @@ -4688,6 +4696,40 @@ check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac) return false; } +static void check_alloc_stall_warn(gfp_t gfp_mask, nodemask_t *nodemask, + unsigned int order, unsigned long alloc_start_time) +{ + static DEFINE_SPINLOCK(alloc_stall_lock); + unsigned long stall_msecs = jiffies_to_msecs(jiffies - alloc_start_time); + + if (likely(stall_msecs < ALLOC_STALL_WARN_MSECS)) + return; + if (time_is_after_jiffies(READ_ONCE(alloc_stall_warn_jiffies))) + return; + if (gfp_mask & __GFP_NOWARN) + return; + + if (!spin_trylock(&alloc_stall_lock)) + return; + + /* Check again, this time under the lock */ + if (time_is_after_jiffies(alloc_stall_warn_jiffies)) { + spin_unlock(&alloc_stall_lock); + return; + } + + WRITE_ONCE(alloc_stall_warn_jiffies, jiffies + msecs_to_jiffies(ALLOC_STALL_WARN_MSECS)); + spin_unlock(&alloc_stall_lock); + + pr_warn("%s: page allocation stall for %lu secs: order:%d, mode:%#x(%pGg) nodemask=%*pbl", + current->comm, stall_msecs / MSEC_PER_SEC, order, gfp_mask, &gfp_mask, + nodemask_pr_args(nodemask)); + cpuset_print_current_mems_allowed(); + pr_cont("\n"); + dump_stack(); + warn_alloc_show_mem(gfp_mask, nodemask); +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) @@ -4708,6 +4750,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, int reserve_flags; bool compact_first = false; bool can_retry_reserves = true; + unsigned long alloc_start_time = jiffies; if (unlikely(nofail)) { /* @@ -4823,6 +4866,9 @@ retry: if (current->flags & PF_MEMALLOC) goto nopage; + /* If allocation has taken excessively long, warn about it */ + check_alloc_stall_warn(gfp_mask, 
ac->nodemask, order, alloc_start_time); + /* Try direct reclaim and then allocating */ if (!compact_first) { page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, From 9669b87065a6fe96198f3df2c3d125c5f5c1f210 Mon Sep 17 00:00:00 2001 From: "JP Kobryn (Meta)" Date: Fri, 24 Apr 2026 22:34:17 -0700 Subject: [PATCH 053/321] mm/lruvec: preemptively free dead folios during lru_add drain Of all observable lruvec lock contention in our fleet, we find that ~24% occurs when dead folios are present in lru_add batches at drain time. This is wasteful in the sense that the folio is added to the LRU just to be immediately removed via folios_put_refs(), incurring two unnecessary lock acquisitions. Eliminate this overhead by preemptively cleaning up dead folios before they make it into the LRU. Use folio_ref_freeze() to filter folios whose only remaining refcount is the batch ref. When dead folios are found, move them off the add batch and onto a temporary batch to be freed. PG_active may be set on a batched folio as well as PG_unevictable (via migration path). Since filtered folios bypass the normal lru_add() cleanup, both flags must be cleared before freeing. During A/B testing on one of our prod instagram workloads (high-frequency short-lived requests), the patch intercepted almost all dead folios before they entered the LRU. Data collected using the mm_lru_insertion tracepoint shows the effectiveness of the patch: Per-host LRU add averages at 95% CPU load (60 hosts each side, 3 x 60s intervals) dead folios/min total folios/min dead % unpatched: 1,297,785 19,341,986 6.7097% patched: 14 19,039,996 0.0001% Within this workload, we save ~2.6M lock acquisitions per minute per host as a result. 
System-wide memory stats improved on the patched side also at 95% CPU load: - direct reclaim scanning reduced 7% - allocation stalls reduced 5.2% - compaction stalls reduced 12.3% - page frees reduced 4.9% No regressions were observed in requests served per second or request tail latency (p99). Both metrics showed directional improvement at higher CPU utilization (comparing 85% to 95%). Note that tests were performed using classic LRU. Link: https://lore.kernel.org/20260425053417.351146-1-jp.kobryn@linux.dev Signed-off-by: JP Kobryn (Meta) Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Shakeel Butt Acked-by: Michal Hocko Cc: Axel Rasmussen Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Johannes Weiner Cc: Kairui Song Cc: Kemeng Shi Cc: Nhat Pham Cc: Rik van Riel Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- mm/swap.c | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/mm/swap.c b/mm/swap.c index 5cc44f0de987..2dd84813f4dd 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -160,14 +160,42 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) int i; struct lruvec *lruvec = NULL; unsigned long flags = 0; + struct folio_batch free_fbatch; + bool is_lru_add = (move_fn == lru_add); + + /* + * If we're adding to the LRU, preemptively filter dead folios. Use + * this dedicated folio batch for temp storage and deferred cleanup. + */ + if (is_lru_add) + folio_batch_init(&free_fbatch); for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; /* block memcg migration while the folio moves between lru */ - if (move_fn != lru_add && !folio_test_clear_lru(folio)) + if (!is_lru_add && !folio_test_clear_lru(folio)) continue; + /* + * Filter dead folios by moving them from the add batch to the temp + * batch for freeing after this loop. + * + * We're bypassing normal cleanup. Clear flags that are not + * applicable to dead folios. 
+ * + * Since the folio may be part of a huge page, unqueue from + * deferred split list to avoid a dangling list entry. + */ + if (is_lru_add && folio_ref_freeze(folio, 1)) { + __folio_clear_active(folio); + __folio_clear_unevictable(folio); + folio_unqueue_deferred_split(folio); + fbatch->folios[i] = NULL; + folio_batch_add(&free_fbatch, folio); + continue; + } + folio_lruvec_relock_irqsave(folio, &lruvec, &flags); move_fn(lruvec, folio); @@ -176,6 +204,13 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) if (lruvec) lruvec_unlock_irqrestore(lruvec, flags); + + /* Cleanup filtered dead folios. */ + if (is_lru_add) { + mem_cgroup_uncharge_folios(&free_fbatch); + free_unref_folios(&free_fbatch); + } + folios_put(fbatch); } @@ -964,6 +999,10 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) struct folio *folio = folios->folios[i]; unsigned int nr_refs = refs ? refs[i] : 1; + /* Folio batch entry may have been preemptively removed during drain. */ + if (!folio) + continue; + if (is_huge_zero_folio(folio)) continue; From 15807d0ddde37407af72859426b654f3d1972b00 Mon Sep 17 00:00:00 2001 From: Deepanshu Kartikey Date: Sat, 28 Mar 2026 12:25:34 +0530 Subject: [PATCH 054/321] mm/hugetlb: fix hugetlb cgroup rsvd charge/uncharge mismatch In alloc_hugetlb_folio(), a single h_cg pointer is used for both the rsvd and non-rsvd hugetlb cgroup charges. When map_chg is set, hugetlb_cgroup_charge_cgroup_rsvd() stores the charged cgroup in h_cg, but the immediately following hugetlb_cgroup_charge_cgroup() overwrites h_cg with the non-rsvd cgroup pointer. As a result, hugetlb_cgroup_commit_charge_rsvd() stores the wrong (non-rsvd) cgroup pointer into the folio's rsvd slot. When the folio is later freed, free_huge_folio() unconditionally calls both hugetlb_cgroup_uncharge_folio() and hugetlb_cgroup_uncharge_folio_rsvd(). 
The rsvd uncharge reads back the wrong cgroup from the folio and decrements a counter that was never charged for that cgroup, causing a page_counter underflow: page_counter underflow: -512 nr_pages=512 WARNING: mm/page_counter.c:61 at page_counter_cancel Fix this by introducing a separate h_cg_rsvd pointer exclusively for the rsvd charge path, keeping the rsvd and non-rsvd charges fully independent through their charge, commit, and error uncharge paths. Link: https://lore.kernel.org/20260328065534.346053-1-kartikey406@gmail.com Fixes: 08cf9faf7558 ("hugetlb_cgroup: support noreserve mappings") Reported-by: syzbot+226c1f947186f8fef796@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=226c1f947186f8fef796 Signed-off-by: Deepanshu Kartikey Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Oscar Salvador Cc: Mina Almasry Cc: Signed-off-by: Andrew Morton --- mm/hugetlb.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4b80b167cc9c..bcc657abbe35 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2859,6 +2859,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, map_chg_state map_chg; int ret, idx; struct hugetlb_cgroup *h_cg = NULL; + struct hugetlb_cgroup *h_cg_rsvd = NULL; gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL; idx = hstate_index(h); @@ -2909,7 +2910,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, */ if (map_chg) { ret = hugetlb_cgroup_charge_cgroup_rsvd( - idx, pages_per_huge_page(h), &h_cg); + idx, pages_per_huge_page(h), &h_cg_rsvd); if (ret) goto out_subpool_put; } @@ -2951,7 +2952,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, */ if (map_chg) { hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), - h_cg, folio); + h_cg_rsvd, folio); } spin_unlock_irq(&hugetlb_lock); @@ -3003,7 +3004,7 @@ out_uncharge_cgroup: out_uncharge_cgroup_reservation: if (map_chg) hugetlb_cgroup_uncharge_cgroup_rsvd(idx, 
pages_per_huge_page(h), - h_cg); + h_cg_rsvd); out_subpool_put: /* * put page to subpool iff the quota of subpool's rsv_hpages is used From faae0ca3628b99119c3ad9780259d25b02ddff93 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 26 Mar 2026 12:31:57 +0000 Subject: [PATCH 055/321] drm/managed: use special gfp_t format specifier Patch series "treewide: fixup gfp_t printks", v2. Use vprintf()'s special gfp_t conversion in a few places. This patch (of 3): %pGg produces nice readable output and decouples the format string from the size of gfp_t. Link: https://lore.kernel.org/20260326-gfp64-v2-0-d916021cecdf@google.com Link: https://lore.kernel.org/20260326-gfp64-v2-1-d916021cecdf@google.com Signed-off-by: Brendan Jackman Cc: Alexander Potapenko Cc: Allison Collins Cc: Allison Henderson Cc: Dave Airlie Cc: David S. Miller Cc: Dmitry Vyukov Cc: Eric Dumazet Cc: Jakub Kacinski Cc: Maarten Lankhorst Cc: Marco Elver Cc: Maxime Ripard Cc: Paolo Abeni Cc: Simon Horman Cc: Stanislaw Gruszka Cc: Thomas Zimemrmann Signed-off-by: Andrew Morton --- drivers/gpu/drm/drm_managed.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/drm_managed.c b/drivers/gpu/drm/drm_managed.c index 247f468731de..a9da94319b05 100644 --- a/drivers/gpu/drm/drm_managed.c +++ b/drivers/gpu/drm/drm_managed.c @@ -232,8 +232,8 @@ void *drmm_kmalloc(struct drm_device *dev, size_t size, gfp_t gfp) dr = alloc_dr(NULL, size, gfp, dev_to_node(dev->dev)); if (!dr) { - drm_dbg_drmres(dev, "failed to allocate %zu bytes, %u flags\n", - size, gfp); + drm_dbg_drmres(dev, "failed to allocate %zu bytes, %pGg\n", + size, &gfp); return NULL; } dr->node.name = kstrdup_const("kmalloc", gfp); From d36102a5f55321b9bdf3e40fbb7b5c482e6dfb12 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 26 Mar 2026 12:31:59 +0000 Subject: [PATCH 056/321] mm/kfence: use special gfp_t format specifier %pGg produces nice readable output and decouples the format string from the size of gfp_t. 
Link: https://lore.kernel.org/20260326-gfp64-v2-3-d916021cecdf@google.com Signed-off-by: Brendan Jackman Cc: Alexander Potapenko Cc: Allison Collins Cc: Allison Henderson Cc: Dave Airlie Cc: David S. Miller Cc: Dmitry Vyukov Cc: Eric Dumazet Cc: Jakub Kacinski Cc: Jakub Kicinski Cc: Maarten Lankhorst Cc: Marco Elver Cc: Maxime Ripard Cc: Paolo Abeni Cc: Simon Horman Cc: Stanislaw Gruszka Cc: Thomas Zimemrmann Signed-off-by: Andrew Morton --- mm/kfence/kfence_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c index 5725a367246d..10424cd25e5a 100644 --- a/mm/kfence/kfence_test.c +++ b/mm/kfence/kfence_test.c @@ -263,7 +263,7 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat break; } - kunit_info(test, "%s: size=%zu, gfp=%x, policy=%s, cache=%i\n", __func__, size, gfp, + kunit_info(test, "%s: size=%zu, gfp=%pGg, policy=%s, cache=%i\n", __func__, size, &gfp, policy_name, !!test_cache); /* From 3f994146201563c5374a5ea17486b80323120d6d Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 26 Mar 2026 12:32:00 +0000 Subject: [PATCH 057/321] net/rds: use special gfp_t format specifier %pGg produces nice readable output and decouples the format string from the size of gfp_t. Link: https://lore.kernel.org/20260326-gfp64-v2-4-d916021cecdf@google.com Signed-off-by: Brendan Jackman Acked-by: Allison Henderson Cc: Alexander Potapenko Cc: Allison Collins Cc: Dave Airlie Cc: David S. 
Miller Cc: Dmitry Vyukov Cc: Eric Dumazet Cc: Jakub Kacinski Cc: Maarten Lankhorst Cc: Marco Elver Cc: Maxime Ripard Cc: Paolo Abeni Cc: Simon Horman Cc: Stanislaw Gruszka Cc: Thomas Zimemrmann Signed-off-by: Andrew Morton --- net/rds/tcp_recv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index 49f96ee0c40f..ffe843ca219c 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -275,7 +275,7 @@ static int rds_tcp_read_sock(struct rds_conn_path *cp, gfp_t gfp) desc.count = 1; /* give more than one skb per call */ tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv); - rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp, + rdsdebug("tcp_read_sock for tc %p gfp %pGg returned %d\n", tc, &gfp, desc.error); if (skb_queue_empty_lockless(&sock->sk->sk_receive_queue) && From 8aa442cfce79e2d69e72fc8e0c0864ac2971149d Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 23 Feb 2026 12:15:16 -0800 Subject: [PATCH 058/321] dax/kmem: account for partial discontiguous resource upon removal When dev_dax_kmem_probe() partially succeeds (at least one range is mapped) but a subsequent range fails request_mem_region() or add_memory_driver_managed(), the probe silently continues, ultimately returning success, but with the corresponding range resource NULL'ed out. dev_dax_kmem_remove() iterates over all dax_device ranges regardless of whether the underlying resource exists. When remove_memory() is called later, it returns 0 because the memory was never added, which causes dev_dax_kmem_remove() to incorrectly assume the (nonexistent) resource can be removed and attempt cleanup on a NULL pointer. Fix this by skipping these ranges altogether, noting that these cases are considered success, such that the cleanup is still reached when all actually-added ranges are successfully removed.
Link: https://lore.kernel.org/20260223201516.1517657-1-dave@stgolabs.net Fixes: 60e93dc097f7 ("device-dax: add dis-contiguous resource support") Signed-off-by: Davidlohr Bueso Reviewed-by: Ben Cheatham Reviewed-by: Alison Schofield Reviewed-by: Jonathan Cameron Cc: Dan Williams Cc: Dave Jiang Cc: Vishal Verma Signed-off-by: Andrew Morton --- drivers/dax/kmem.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c index 2cc8749bc871..a18e2b968e4d 100644 --- a/drivers/dax/kmem.c +++ b/drivers/dax/kmem.c @@ -227,6 +227,12 @@ static void dev_dax_kmem_remove(struct dev_dax *dev_dax) if (rc) continue; + /* range was never added during probe */ + if (!data->res[i]) { + success++; + continue; + } + rc = remove_memory(range.start, range_len(&range)); if (rc == 0) { remove_resource(data->res[i]); From 7e8983f317ab0efd13aa573e166d7ad69e36a429 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Wed, 15 Apr 2026 10:15:09 +0530 Subject: [PATCH 059/321] selftests/mm: simplify byte pattern checking in mremap_test The original version of mremap_test (7df666253f26: "kselftests: vm: add mremap tests") validated remapped contents byte-by-byte and printed a mismatch index in case the byte streams didn't match. That was rather inefficient, especially when the test passed. Later, commit 7033c6cc9620 ("selftests/mm: mremap_test: optimize execution time from minutes to seconds using chunkwise memcmp") used memcmp() on bigger chunks, falling back to byte-wise scanning to detect the problematic index only if it discovered a problem. However, the implementation is overly complicated (e.g., get_sqrt() is currently not optimal) and we don't really have to report the exact index: whoever debugs the failing test can figure that out. Let's simplify by just comparing both byte streams with memcmp() and not detecting the exact failed index.
Link: https://lore.kernel.org/20260415044509.579428-1-dev.jain@arm.com Signed-off-by: Dev Jain Reported-by: Sarthak Sharma Tested-by: Sarthak Sharma Acked-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand (Arm) Cc: Anshuman Khandual Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Ryan Roberts Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: David Laight Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mremap_test.c | 109 +++-------------------- 1 file changed, 10 insertions(+), 99 deletions(-) diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index 308576437228..131d9d6db867 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -76,27 +76,6 @@ enum { .expect_failure = should_fail \ } -/* compute square root using binary search */ -static unsigned long get_sqrt(unsigned long val) -{ - unsigned long low = 1; - - /* assuming rand_size is less than 1TB */ - unsigned long high = (1UL << 20); - - while (low <= high) { - unsigned long mid = low + (high - low) / 2; - unsigned long temp = mid * mid; - - if (temp == val) - return mid; - if (temp < val) - low = mid + 1; - high = mid - 1; - } - return low; -} - /* * Returns false if the requested remap region overlaps with an * existing mapping (e.g text, stack) else returns true. 
@@ -995,11 +974,9 @@ static long long remap_region(struct config c, unsigned int threshold_mb, char *rand_addr) { void *addr, *tmp_addr, *src_addr, *dest_addr, *dest_preamble_addr = NULL; - unsigned long long t, d; struct timespec t_start = {0, 0}, t_end = {0, 0}; long long start_ns, end_ns, align_mask, ret, offset; unsigned long long threshold; - unsigned long num_chunks; if (threshold_mb == VALIDATION_NO_THRESHOLD) threshold = c.region_size; @@ -1068,87 +1045,21 @@ static long long remap_region(struct config c, unsigned int threshold_mb, goto clean_up_dest_preamble; } - /* - * Verify byte pattern after remapping. Employ an algorithm with a - * square root time complexity in threshold: divide the range into - * chunks, if memcmp() returns non-zero, only then perform an - * iteration in that chunk to find the mismatch index. - */ - num_chunks = get_sqrt(threshold); - for (unsigned long i = 0; i < num_chunks; ++i) { - size_t chunk_size = threshold / num_chunks; - unsigned long shift = i * chunk_size; - - if (!memcmp(dest_addr + shift, rand_addr + shift, chunk_size)) - continue; - - /* brute force iteration only over mismatch segment */ - for (t = shift; t < shift + chunk_size; ++t) { - if (((char *) dest_addr)[t] != rand_addr[t]) { - ksft_print_msg("Data after remap doesn't match at offset %llu\n", - t); - ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[t] & 0xff, - ((char *) dest_addr)[t] & 0xff); - ret = -1; - goto clean_up_dest; - } - } - } - - /* - * if threshold is not divisible by num_chunks, then check the - * last chunk - */ - for (t = num_chunks * (threshold / num_chunks); t < threshold; ++t) { - if (((char *) dest_addr)[t] != rand_addr[t]) { - ksft_print_msg("Data after remap doesn't match at offset %llu\n", - t); - ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[t] & 0xff, - ((char *) dest_addr)[t] & 0xff); - ret = -1; - goto clean_up_dest; - } + /* Verify byte pattern after remapping */ + if (memcmp(dest_addr, rand_addr, threshold)) { + 
ksft_print_msg("Data after remap doesn't match\n"); + ret = -1; + goto clean_up_dest; } /* Verify the dest preamble byte pattern after remapping */ - if (!c.dest_preamble_size) - goto no_preamble; - - num_chunks = get_sqrt(c.dest_preamble_size); - - for (unsigned long i = 0; i < num_chunks; ++i) { - size_t chunk_size = c.dest_preamble_size / num_chunks; - unsigned long shift = i * chunk_size; - - if (!memcmp(dest_preamble_addr + shift, rand_addr + shift, - chunk_size)) - continue; - - /* brute force iteration only over mismatched segment */ - for (d = shift; d < shift + chunk_size; ++d) { - if (((char *) dest_preamble_addr)[d] != rand_addr[d]) { - ksft_print_msg("Preamble data after remap doesn't match at offset %llu\n", - d); - ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[d] & 0xff, - ((char *) dest_preamble_addr)[d] & 0xff); - ret = -1; - goto clean_up_dest; - } - } + if (c.dest_preamble_size && + memcmp(dest_preamble_addr, rand_addr, c.dest_preamble_size)) { + ksft_print_msg("Preamble data after remap doesn't match\n"); + ret = -1; + goto clean_up_dest; } - for (d = num_chunks * (c.dest_preamble_size / num_chunks); d < c.dest_preamble_size; ++d) { - if (((char *) dest_preamble_addr)[d] != rand_addr[d]) { - ksft_print_msg("Preamble data after remap doesn't match at offset %llu\n", - d); - ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[d] & 0xff, - ((char *) dest_preamble_addr)[d] & 0xff); - ret = -1; - goto clean_up_dest; - } - } - -no_preamble: start_ns = t_start.tv_sec * NS_PER_SEC + t_start.tv_nsec; end_ns = t_end.tv_sec * NS_PER_SEC + t_end.tv_nsec; ret = end_ns - start_ns; From c373f7f98e6ad591c85d40548cf8b6443be69311 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 28 Apr 2026 16:18:50 +0800 Subject: [PATCH 060/321] mm/sparse-vmemmap: fix vmemmap accounting underflow Patch series "mm: Fix vmemmap optimization accounting and initialization", v8. 
The series fixes several bugs in vmemmap optimization, mainly around incorrect page accounting and memmap initialization in DAX and memory hotplug paths. It also fixes pageblock migratetype initialization and struct page initialization for ZONE_DEVICE compound pages. Patches 1-4 fix vmemmap accounting issues. Patch 1 fixes an accounting underflow in the section activation failure path by moving vmemmap page accounting into the lower-level allocation and freeing helpers. Patch 2 fixes incorrect altmap passing in the memory hotplug error path. Patch 3 passes pgmap through memory deactivation paths so the teardown side can determine whether vmemmap optimization was in effect. Patch 4 uses that information to account the optimized DAX vmemmap size correctly. Patches 5-6 fix initialization issues in mm/mm_init. One makes sure all pageblocks in ZONE_DEVICE compound pages get their migratetype initialized. The other fixes a case where DAX memory hotplug reuses an unoptimized early-section memmap while compound_nr_pages() still assumes vmemmap optimization, leaving tail struct pages uninitialized. This patch (of 6): In section_activate(), if populate_section_memmap() fails, the error handling path calls section_deactivate() to roll back the state. This causes a vmemmap accounting imbalance. Since commit c3576889d87b ("mm: fix accounting of memmap pages"), memmap pages are accounted for only after populate_section_memmap() succeeds. However, the failure path unconditionally calls section_deactivate(), which decreases the vmemmap count. Consequently, a failure in populate_section_memmap() leads to an accounting underflow, incorrectly reducing the system's tracked vmemmap usage. 
Fix this more thoroughly by moving all accounting calls into the lower level functions that actually perform the vmemmap allocation and freeing: - populate_section_memmap() accounts for newly allocated vmemmap pages - depopulate_section_memmap() unaccounts when vmemmap is freed This ensures proper accounting in all code paths, including error handling and early section cases. Link: https://lore.kernel.org/20260428081855.1249045-1-songmuchun@bytedance.com Link: https://lore.kernel.org/20260428081855.1249045-2-songmuchun@bytedance.com Fixes: c3576889d87b ("mm: fix accounting of memmap pages") Signed-off-by: Muchun Song Acked-by: Mike Rapoport (Microsoft) Acked-by: Oscar Salvador Acked-by: David Hildenbrand (Arm) Acked-by: Liam R. Howlett Cc: "Aneesh Kumar K.V" Cc: Joao Martins Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Nicholas Piggin Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/sparse-vmemmap.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 43f82621dd92..60e55e78d7ff 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -651,7 +651,12 @@ static struct page * __meminit populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { - return __populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap); + struct page *page = __populate_section_memmap(pfn, nr_pages, nid, altmap, + pgmap); + + memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)); + + return page; } static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, @@ -660,13 +665,17 @@ static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, unsigned long start = (unsigned long) pfn_to_page(pfn); unsigned long end = start + nr_pages * sizeof(struct page); + memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * 
sizeof(struct page), PAGE_SIZE))); vmemmap_free(start, end, altmap); } + static void free_map_bootmem(struct page *memmap) { unsigned long start = (unsigned long)memmap; unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); + memmap_boot_pages_add(-1L * (DIV_ROUND_UP(PAGES_PER_SECTION * sizeof(struct page), + PAGE_SIZE))); vmemmap_free(start, end, NULL); } @@ -769,14 +778,10 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, * The memmap of early sections is always fully populated. See * section_activate() and pfn_valid() . */ - if (!section_is_early) { - memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE))); + if (!section_is_early) depopulate_section_memmap(pfn, nr_pages, altmap); - } else if (memmap) { - memmap_boot_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), - PAGE_SIZE))); + else if (memmap) free_map_bootmem(memmap); - } if (empty) ms->section_mem_map = (unsigned long)NULL; @@ -821,7 +826,6 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn, section_deactivate(pfn, nr_pages, altmap); return ERR_PTR(-ENOMEM); } - memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)); return memmap; } From 2fac4afa0e2e68841334c78c1821e49f74fbc66a Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 28 Apr 2026 16:18:51 +0800 Subject: [PATCH 061/321] mm/memory_hotplug: fix incorrect altmap passing in error path In create_altmaps_and_memory_blocks(), when arch_add_memory() succeeds with memmap_on_memory enabled, the vmemmap pages are allocated from params.altmap. If create_memory_block_devices() subsequently fails, the error path calls arch_remove_memory() with a NULL altmap instead of params.altmap. This is a bug that could lead to memory corruption. Since altmap is NULL, vmemmap_free() falls back to freeing the vmemmap pages into the system buddy allocator via free_pages() instead of the altmap. 
arch_remove_memory() then immediately destroys the physical linear mapping for this memory. This injects unowned pages into the buddy allocator, causing machine checks or memory corruption if the system later attempts to allocate and use those freed pages. Fix this by passing params.altmap to arch_remove_memory() in the error path. Link: https://lore.kernel.org/20260428081855.1249045-3-songmuchun@bytedance.com Fixes: 6b8f0798b85a ("mm/memory_hotplug: split memmap_on_memory requests across memblocks") Signed-off-by: Muchun Song Acked-by: David Hildenbrand (Arm) Acked-by: Liam R. Howlett Reviewed-by: Georgi Djakov Cc: "Aneesh Kumar K.V" Cc: Joao Martins Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 40c7915dabe0..cf4f77108c43 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1470,7 +1470,7 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, ret = create_memory_block_devices(cur_start, memblock_size, nid, params.altmap, group); if (ret) { - arch_remove_memory(cur_start, memblock_size, NULL); + arch_remove_memory(cur_start, memblock_size, params.altmap); kfree(params.altmap); goto out; } From 3bbc54dd1b62f1a4b218c70aafbeceeba7c90c5d Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 28 Apr 2026 16:18:52 +0800 Subject: [PATCH 062/321] mm/sparse-vmemmap: pass @pgmap argument to memory deactivation paths Currently, the memory hot-remove call chain -- arch_remove_memory(), __remove_pages(), sparse_remove_section() and section_deactivate() -- does not carry the struct dev_pagemap pointer. 
This prevents the lower levels from knowing whether the section was originally populated with vmemmap optimizations (e.g., DAX with vmemmap optimization enabled). Without this information, we cannot call vmemmap_can_optimize() to determine if the vmemmap pages were optimized. As a result, the vmemmap page accounting during teardown will mistakenly assume a non-optimized allocation, leading to incorrect memmap statistics. To lay the groundwork for fixing the vmemmap page accounting, we need to pass the @pgmap pointer down to the deactivation location. Plumb the @pgmap argument through the APIs of arch_remove_memory(), __remove_pages() and sparse_remove_section(), mirroring the corresponding *_activate() paths. Link: https://lore.kernel.org/20260428081855.1249045-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Oscar Salvador Acked-by: David Hildenbrand (Arm) Acked-by: Liam R. Howlett Cc: "Aneesh Kumar K.V" Cc: Joao Martins Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Nicholas Piggin Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/arm64/mm/mmu.c | 5 +++-- arch/loongarch/mm/init.c | 5 +++-- arch/powerpc/mm/mem.c | 5 +++-- arch/riscv/mm/init.c | 5 +++-- arch/s390/mm/init.c | 5 +++-- arch/x86/mm/init_64.c | 5 +++-- include/linux/memory_hotplug.h | 8 +++++--- mm/memory_hotplug.c | 13 +++++++------ mm/memremap.c | 4 ++-- mm/sparse-vmemmap.c | 12 ++++++------ 10 files changed, 38 insertions(+), 29 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index dd85e093ffdb..e5a42b7a0160 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -2024,12 +2024,13 @@ err: return ret; } -void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size 
>> PAGE_SHIFT; - __remove_pages(start_pfn, nr_pages, altmap); + __remove_pages(start_pfn, nr_pages, altmap, pgmap); __remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size); } diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index 031b39eb081c..687980b6e91f 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -119,12 +119,13 @@ int arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params) return ret; } -void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - __remove_pages(start_pfn, nr_pages, altmap); + __remove_pages(start_pfn, nr_pages, altmap, pgmap); } #endif diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 648d0c5602ec..4c1afab91996 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -158,12 +158,13 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, return rc; } -void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) +void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - __remove_pages(start_pfn, nr_pages, altmap); + __remove_pages(start_pfn, nr_pages, altmap, pgmap); arch_remove_linear_mapping(start, size); } #endif diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index fa8d2f6f554b..885f1db4e9bf 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1742,9 +1742,10 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *param return ret; } -void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) +void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) { - 
__remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap); + __remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap, pgmap); remove_linear_mapping(start, size); flush_tlb_all(); } diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 1f72efc2a579..11a689423440 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -276,12 +276,13 @@ int arch_add_memory(int nid, u64 start, u64 size, return rc; } -void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - __remove_pages(start_pfn, nr_pages, altmap); + __remove_pages(start_pfn, nr_pages, altmap, pgmap); vmem_remove_mapping(start, size); } #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index df2261fa4f98..77b889b71cf3 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1288,12 +1288,13 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end) remove_pagetable(start, end, true, NULL); } -void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) +void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - __remove_pages(start_pfn, nr_pages, altmap); + __remove_pages(start_pfn, nr_pages, altmap, pgmap); kernel_physical_mapping_remove(start, start + size); } #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 815e908c4135..7c9d66729c60 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -135,9 +135,10 @@ static inline bool movable_node_is_enabled(void) return movable_node_enabled; } -extern void arch_remove_memory(u64 start, u64 size, struct 
vmem_altmap *altmap); +extern void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap); extern void __remove_pages(unsigned long start_pfn, unsigned long nr_pages, - struct vmem_altmap *altmap); + struct vmem_altmap *altmap, struct dev_pagemap *pgmap); /* reasonably generic interface to expand the physical pages */ extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, @@ -307,7 +308,8 @@ extern int sparse_add_section(int nid, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); extern void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap *altmap); + struct vmem_altmap *altmap, + struct dev_pagemap *pgmap); extern struct zone *zone_for_pfn_range(enum mmop online_type, int nid, struct memory_group *group, unsigned long start_pfn, unsigned long nr_pages); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index cf4f77108c43..462d8dcd636d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -576,6 +576,7 @@ void remove_pfn_range_from_zone(struct zone *zone, * @pfn: starting pageframe (must be aligned to start of a section) * @nr_pages: number of pages to remove (must be multiple of section size) * @altmap: alternative device page map or %NULL if default memmap is used + * @pgmap: device page map or %NULL if not ZONE_DEVICE * * Generic helper function to remove section mappings and sysfs entries * for the section of the memory we are removing. Caller needs to make @@ -583,7 +584,7 @@ void remove_pfn_range_from_zone(struct zone *zone, * calling offline_pages(). 
*/ void __remove_pages(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap *altmap) + struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { const unsigned long end_pfn = pfn + nr_pages; unsigned long cur_nr_pages; @@ -598,7 +599,7 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages, /* Select all remaining pages up to the next section boundary */ cur_nr_pages = min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn); - sparse_remove_section(pfn, cur_nr_pages, altmap); + sparse_remove_section(pfn, cur_nr_pages, altmap, pgmap); } } @@ -1427,7 +1428,7 @@ static void remove_memory_blocks_and_altmaps(u64 start, u64 size) remove_memory_block_devices(cur_start, memblock_size); - arch_remove_memory(cur_start, memblock_size, altmap); + arch_remove_memory(cur_start, memblock_size, altmap, NULL); /* Verify that all vmemmap pages have actually been freed. */ WARN(altmap->alloc, "Altmap not fully unmapped"); @@ -1470,7 +1471,7 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, ret = create_memory_block_devices(cur_start, memblock_size, nid, params.altmap, group); if (ret) { - arch_remove_memory(cur_start, memblock_size, params.altmap); + arch_remove_memory(cur_start, memblock_size, params.altmap, NULL); kfree(params.altmap); goto out; } @@ -1556,7 +1557,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) /* create memory block devices after memory was added */ ret = create_memory_block_devices(start, size, nid, NULL, group); if (ret) { - arch_remove_memory(start, size, params.altmap); + arch_remove_memory(start, size, params.altmap, NULL); goto error; } } @@ -2268,7 +2269,7 @@ static int try_remove_memory(u64 start, u64 size) * No altmaps present, do the removal directly */ remove_memory_block_devices(start, size); - arch_remove_memory(start, size, NULL); + arch_remove_memory(start, size, NULL, NULL); } else { /* all memblocks in the range have altmaps */ remove_memory_blocks_and_altmaps(start, 
size); diff --git a/mm/memremap.c b/mm/memremap.c index 053842d45cb1..81766d822400 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -97,10 +97,10 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) PHYS_PFN(range_len(range))); if (pgmap->type == MEMORY_DEVICE_PRIVATE) { __remove_pages(PHYS_PFN(range->start), - PHYS_PFN(range_len(range)), NULL); + PHYS_PFN(range_len(range)), NULL, pgmap); } else { arch_remove_memory(range->start, range_len(range), - pgmap_altmap(pgmap)); + pgmap_altmap(pgmap), pgmap); kasan_remove_zero_shadow(__va(range->start), range_len(range)); } mem_hotplug_done(); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 60e55e78d7ff..eafb7c6eb71e 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -660,7 +660,7 @@ static struct page * __meminit populate_section_memmap(unsigned long pfn, } static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap *altmap) + struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { unsigned long start = (unsigned long) pfn_to_page(pfn); unsigned long end = start + nr_pages * sizeof(struct page); @@ -741,7 +741,7 @@ static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages) * usage map, but still need to free the vmemmap range. */ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap *altmap) + struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { struct mem_section *ms = __pfn_to_section(pfn); bool section_is_early = early_section(ms); @@ -779,7 +779,7 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, * section_activate() and pfn_valid() . 
*/ if (!section_is_early) - depopulate_section_memmap(pfn, nr_pages, altmap); + depopulate_section_memmap(pfn, nr_pages, altmap, pgmap); else if (memmap) free_map_bootmem(memmap); @@ -823,7 +823,7 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn, memmap = populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap); if (!memmap) { - section_deactivate(pfn, nr_pages, altmap); + section_deactivate(pfn, nr_pages, altmap, pgmap); return ERR_PTR(-ENOMEM); } @@ -884,13 +884,13 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, } void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap *altmap) + struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { struct mem_section *ms = __pfn_to_section(pfn); if (WARN_ON_ONCE(!valid_section(ms))) return; - section_deactivate(pfn, nr_pages, altmap); + section_deactivate(pfn, nr_pages, altmap, pgmap); } #endif /* CONFIG_MEMORY_HOTPLUG */ From 721a73e30c9e3e8fcffe1725bcede1bbd20b4918 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 28 Apr 2026 16:18:53 +0800 Subject: [PATCH 063/321] mm/sparse-vmemmap: fix DAX vmemmap accounting with optimization When vmemmap optimization is enabled for DAX, the nr_memmap_pages counter in /proc/vmstat is incorrect. The current code always accounts for the full, non-optimized vmemmap size, but vmemmap optimization reduces the actual number of vmemmap pages by reusing tail pages. This causes the system to overcount vmemmap usage, leading to inaccurate page statistics in /proc/vmstat. Fix this by introducing section_nr_vmemmap_pages(), which returns the exact vmemmap page count for a given pfn range based on whether optimization is in effect. Link: https://lore.kernel.org/20260428081855.1249045-5-songmuchun@bytedance.com Fixes: 15995a352474 ("mm: report per-page metadata information") Signed-off-by: Muchun Song Acked-by: Mike Rapoport (Microsoft) Acked-by: Oscar Salvador Acked-by: David Hildenbrand (Arm) Acked-by: Liam R. 
Howlett Cc: "Aneesh Kumar K.V" Cc: Joao Martins Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Nicholas Piggin Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/sparse-vmemmap.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index eafb7c6eb71e..112ccf9c71ca 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -647,6 +647,31 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) } } +static int __meminit section_nr_vmemmap_pages(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap, struct dev_pagemap *pgmap) +{ + const unsigned int order = pgmap ? pgmap->vmemmap_shift : 0; + const unsigned long pages_per_compound = 1UL << order; + + VM_WARN_ON_ONCE(!IS_ALIGNED(pfn | nr_pages, PAGES_PER_SUBSECTION)); + VM_WARN_ON_ONCE(nr_pages > PAGES_PER_SECTION); + + if (!vmemmap_can_optimize(altmap, pgmap)) + return DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE); + + if (order < PFN_SECTION_SHIFT) { + VM_WARN_ON_ONCE(!IS_ALIGNED(pfn | nr_pages, pages_per_compound)); + return VMEMMAP_RESERVE_NR * nr_pages / pages_per_compound; + } + + VM_WARN_ON_ONCE(!IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION)); + + if (IS_ALIGNED(pfn, pages_per_compound)) + return VMEMMAP_RESERVE_NR; + + return 0; +} + static struct page * __meminit populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap) @@ -654,7 +679,7 @@ static struct page * __meminit populate_section_memmap(unsigned long pfn, struct page *page = __populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap); - memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)); + memmap_pages_add(section_nr_vmemmap_pages(pfn, nr_pages, altmap, pgmap)); return page; } @@ -665,7 +690,7 @@ static void depopulate_section_memmap(unsigned long 
pfn, unsigned long nr_pages, unsigned long start = (unsigned long) pfn_to_page(pfn); unsigned long end = start + nr_pages * sizeof(struct page); - memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE))); + memmap_pages_add(-section_nr_vmemmap_pages(pfn, nr_pages, altmap, pgmap)); vmemmap_free(start, end, altmap); } @@ -673,9 +698,10 @@ static void free_map_bootmem(struct page *memmap) { unsigned long start = (unsigned long)memmap; unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); + unsigned long pfn = page_to_pfn(memmap); - memmap_boot_pages_add(-1L * (DIV_ROUND_UP(PAGES_PER_SECTION * sizeof(struct page), - PAGE_SIZE))); + memmap_boot_pages_add(-section_nr_vmemmap_pages(pfn, PAGES_PER_SECTION, + NULL, NULL)); vmemmap_free(start, end, NULL); } From 94405c6136839f7c462249c8b4b957bcb9527a9d Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 28 Apr 2026 16:18:54 +0800 Subject: [PATCH 064/321] mm/mm_init: fix pageblock migratetype for ZONE_DEVICE compound pages The memmap_init_zone_device() function only initializes the migratetype of the first pageblock of a compound page. If the compound page size exceeds pageblock_nr_pages (e.g., 1GB hugepages with 2MB pageblocks), subsequent pageblocks in the compound page remain uninitialized. Move the migratetype initialization out of __init_zone_device_page() and into a separate pageblock_migratetype_init_range() function. This iterates over the entire PFN range of the memory, ensuring that all pageblocks are correctly initialized. Also remove the stale confusing comment about MEMINIT_HOTPLUG above the migratetype setting since it is an obsolete relic from commit 966cf44f637e ("mm: defer ZONE_DEVICE page initialization to the point where we init pgmap") and no longer makes sense here. 
Link: https://lore.kernel.org/20260428081855.1249045-6-songmuchun@bytedance.com Fixes: c4386bd8ee3a ("mm/memremap: add ZONE_DEVICE support for compound pages") Signed-off-by: Muchun Song Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Oscar Salvador Acked-by: David Hildenbrand (Arm) Acked-by: Liam R. Howlett Cc: "Aneesh Kumar K.V" Cc: Joao Martins Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Nicholas Piggin Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/mm_init.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/mm/mm_init.c b/mm/mm_init.c index f9f8e1af921c..cfc76953e249 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -674,6 +674,20 @@ static inline void fixup_hashdist(void) static inline void fixup_hashdist(void) {} #endif /* CONFIG_NUMA */ +#ifdef CONFIG_ZONE_DEVICE +static __meminit void pageblock_migratetype_init_range(unsigned long pfn, + unsigned long nr_pages, int migratetype) +{ + const unsigned long end = pfn + nr_pages; + + for (pfn = pageblock_align(pfn); pfn < end; pfn += pageblock_nr_pages) { + init_pageblock_migratetype(pfn_to_page(pfn), migratetype, false); + if (IS_ALIGNED(pfn, PAGES_PER_SECTION)) + cond_resched(); + } +} +#endif + /* * Initialize a reserved page unconditionally, finding its zone first. */ @@ -1011,21 +1025,6 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, page_folio(page)->pgmap = pgmap; page->zone_device_data = NULL; - /* - * Mark the block movable so that blocks are reserved for - * movable at startup. This will force kernel allocations - * to reserve their blocks rather than leaking throughout - * the address space during boot when many long-lived - * kernel allocations are made. 
- * - * Please note that MEMINIT_HOTPLUG path doesn't clear memmap - * because this is done early in section_activate() - */ - if (pageblock_aligned(pfn)) { - init_pageblock_migratetype(page, MIGRATE_MOVABLE, false); - cond_resched(); - } - /* * ZONE_DEVICE pages other than MEMORY_TYPE_GENERIC are released * directly to the driver page allocator which will set the page count @@ -1122,6 +1121,9 @@ void __ref memmap_init_zone_device(struct zone *zone, __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); + if (IS_ALIGNED(pfn, PAGES_PER_SECTION)) + cond_resched(); + if (pfns_per_compound == 1) continue; @@ -1129,6 +1131,8 @@ void __ref memmap_init_zone_device(struct zone *zone, compound_nr_pages(altmap, pgmap)); } + pageblock_migratetype_init_range(start_pfn, nr_pages, MIGRATE_MOVABLE); + pr_debug("%s initialised %lu pages in %ums\n", __func__, nr_pages, jiffies_to_msecs(jiffies - start)); } From cd681403a87085562499d60325b7b45d3be11217 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 28 Apr 2026 16:18:55 +0800 Subject: [PATCH 065/321] mm/mm_init: fix uninitialized struct pages for ZONE_DEVICE If DAX memory is hotplugged into an unoccupied subsection of an early section, section_activate() reuses the unoptimized boot memmap. However, compound_nr_pages() still assumes that vmemmap optimization is in effect and initializes only the reduced number of struct pages. As a result, the remaining tail struct pages are left uninitialized, which can later lead to unexpected behavior or crashes. Fix this by treating early sections as unoptimized when calculating how many struct pages to initialize. Link: https://lore.kernel.org/20260428081855.1249045-7-songmuchun@bytedance.com Fixes: 6fd3620b3428 ("mm/page_alloc: reuse tail struct pages for compound devmaps") Signed-off-by: Muchun Song Acked-by: David Hildenbrand (Arm) Acked-by: Mike Rapoport (Microsoft) Acked-by: Liam R. 
Howlett Cc: "Aneesh Kumar K.V" Cc: Joao Martins Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/mm_init.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/mm/mm_init.c b/mm/mm_init.c index cfc76953e249..bd466a3c10c8 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1055,10 +1055,17 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, * of how the sparse_vmemmap internals handle compound pages in the lack * of an altmap. See vmemmap_populate_compound_pages(). */ -static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap, +static inline unsigned long compound_nr_pages(unsigned long pfn, + struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { - if (!vmemmap_can_optimize(altmap, pgmap)) + /* + * If DAX memory is hot-plugged into an unoccupied subsection + * of an early section, the unoptimized boot memmap is reused. + * See section_activate(). + */ + if (early_section(__pfn_to_section(pfn)) || + !vmemmap_can_optimize(altmap, pgmap)) return pgmap_vmemmap_nr(pgmap); return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page)); @@ -1128,7 +1135,7 @@ void __ref memmap_init_zone_device(struct zone *zone, continue; memmap_init_compound(page, pfn, zone_idx, nid, pgmap, - compound_nr_pages(altmap, pgmap)); + compound_nr_pages(pfn, altmap, pgmap)); } pageblock_migratetype_init_range(start_pfn, nr_pages, MIGRATE_MOVABLE); From 1d258000b0f28ca34faa7ce699e873666724aaa3 Mon Sep 17 00:00:00 2001 From: Liew Rui Yan Date: Sun, 26 Apr 2026 16:16:14 -0700 Subject: [PATCH 066/321] mm/damon/ops-common: optimize damon_hot_score() using ilog2() Patch series "mm/damon: repost non-hotfix reviewed patches in damon/next tree", v2. 
The first patch from Liew Rui Yan adds a minor performance optimization using ilog2() instead of an inefficient manual implementation of the functionality. The second patch from Cheng-Han Wu fixes a minor typo: s/parametrs/parameters/. The third patch from Liew Rui Yan makes the commit_inputs operation of DAMON_RECLAIM and DAMON_LRU_SORT synchronous to improve the user experience. The fourth patch from Asier Gutierrez adds a new DAMOS action, DAMOS_COLLAPSE, for a deterministic DAMOS-based access-aware THP system. This patch (of 4): The current implementation of damon_hot_score() uses a manual for-loop to calculate the value of 'age_in_log'. This can be efficiently replaced by ilog2(), which is semantically more appropriate for calculating the logarithmic value of age. In a simulated-kernel-module performance test with 10,000,000 iterations, this optimization showed a significant reduction in latency (average latency reduced from ~12ns to ~1ns). Test results from the simulated-kernel-module: - ilog2: DAMON Perf Test: Starting 10000000 iterations ============================================= Total Iterations : 10000000 Average Latency : 1 ns P95 Latency : 41 ns P99 Latency : 41 ns --------------------------------------------- Range (ns) | Count | Percent --------------------------------------------- 0-19 | 0 | 0% 20-39 | 2625000 | 26% 40-59 | 7374000 | 73% 60-79 | 0 | 0% 80-99 | 0 | 0% 100+ | 1000 | 0% ============================================= - for-loop: DAMON Perf Test: Starting 10000000 iterations ============================================= Total Iterations : 10000000 Average Latency : 12 ns P95 Latency : 51 ns P99 Latency : 60 ns --------------------------------------------- Range (ns) | Count | Percent --------------------------------------------- 0-19 | 0 | 0% 20-39 | 0 | 0% 40-59 | 9862000 | 98% 60-79 | 135000 | 1% 80-99 | 1000 | 0% 100+ | 2000 | 0% ============================================= Full raw benchmark results can be found at [1]. 
Link: https://lore.kernel.org/20260426231619.107231-1-sj@kernel.org Link: https://lore.kernel.org/20260426231619.107231-2-sj@kernel.org Link: https://github.com/aethernet65535/damon-hot-score-fls-optimize/tree/master/result-raw [1] Signed-off-by: SeongJae Park Signed-off-by: Liew Rui Yan Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Asier Gutierrez Cc: Cheng-Han Wu Signed-off-by: Andrew Morton --- mm/damon/ops-common.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 8c6d613425c1..3a0ddc3ac719 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -117,9 +117,12 @@ int damon_hot_score(struct damon_ctx *c, struct damon_region *r, damon_max_nr_accesses(&c->attrs); age_in_sec = (unsigned long)r->age * c->attrs.aggr_interval / 1000000; - for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec; - age_in_log++, age_in_sec >>= 1) - ; + if (age_in_sec) + age_in_log = min_t(int, ilog2(age_in_sec) + 1, + DAMON_MAX_AGE_IN_LOG); + else + age_in_log = 0; + /* If frequency is 0, higher age means it's colder */ if (freq_subscore == 0) From abdca14655fe4ec821791c031d5764fdd1e9484d Mon Sep 17 00:00:00 2001 From: Cheng-Han Wu Date: Sun, 26 Apr 2026 16:16:15 -0700 Subject: [PATCH 067/321] Docs/admin-guide/mm/damon: fix 'parametrs' typo Fix the misspelling of "parametrs" as "parameters" in reclaim.rst and lru_sort.rst. Link: https://lore.kernel.org/20260426231619.107231-3-sj@kernel.org Signed-off-by: Cheng-Han Wu Signed-off-by: SeongJae Park Reviewed-by: SeongJae Park Cc: Asier Gutierrez Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Liew Rui Yan Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/lru_sort.rst | 2 +- Documentation/admin-guide/mm/damon/reclaim.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/lru_sort.rst b/Documentation/admin-guide/mm/damon/lru_sort.rst index 14cc6b2db897..25e2f042a383 100644 --- a/Documentation/admin-guide/mm/damon/lru_sort.rst +++ b/Documentation/admin-guide/mm/damon/lru_sort.rst @@ -75,7 +75,7 @@ Make DAMON_LRU_SORT reads the input parameters again, except ``enabled``. Input parameters that updated while DAMON_LRU_SORT is running are not applied by default. Once this parameter is set as ``Y``, DAMON_LRU_SORT reads values -of parametrs except ``enabled`` again. Once the re-reading is done, this +of parameters except ``enabled`` again. Once the re-reading is done, this parameter is set as ``N``. If invalid parameters are found while the re-reading, DAMON_LRU_SORT will be disabled. diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst index d7a0225b4950..01a34c215b66 100644 --- a/Documentation/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -67,7 +67,7 @@ Make DAMON_RECLAIM reads the input parameters again, except ``enabled``. Input parameters that updated while DAMON_RECLAIM is running are not applied by default. Once this parameter is set as ``Y``, DAMON_RECLAIM reads values -of parametrs except ``enabled`` again. Once the re-reading is done, this +of parameters except ``enabled`` again. Once the re-reading is done, this parameter is set as ``N``. If invalid parameters are found while the re-reading, DAMON_RECLAIM will be disabled. 
From de3c60e1c8314f3408a72836483772e17f279aca Mon Sep 17 00:00:00 2001 From: Liew Rui Yan Date: Sun, 26 Apr 2026 16:16:16 -0700 Subject: [PATCH 068/321] mm/damon: add synchronous commit for commit_inputs Problem ======= Writing invalid parameters to sysfs followed by 'commit_inputs=Y' fails silently (no error returned to shell), because the validation happens asynchronously in the kdamond. Solution ======== To fix this, the commit_inputs_store() callback now uses damon_call() to synchronously commit parameters in the kdamond thread's safe context. This ensures that validation errors are returned immediately to userspace, following the pattern used by DAMON_SYSFS. Changes ======= 1. Added commit_inputs_store() and commit_inputs_fn() to commit synchronously. 2. Removed handle_commit_inputs(). This change is motivated from another discussion [1]. Link: https://lore.kernel.org/20260426231619.107231-4-sj@kernel.org Link: https://lore.kernel.org/20260318153731.97470-1-aethernet65535@gmail.com [1] Signed-off-by: Liew Rui Yan Reviewed-by: SeongJae Park Signed-off-by: SeongJae Park Cc: Asier Gutierrez Cc: Cheng-Han Wu Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 48 +++++++++++++++++++++++++++++++++++++-------- mm/damon/reclaim.c | 48 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 80 insertions(+), 16 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 8494040b1ee4..7569e471160a 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -39,7 +39,6 @@ static bool enabled __read_mostly; * the re-reading, DAMON_LRU_SORT will be disabled. */ static bool commit_inputs __read_mostly; -module_param(commit_inputs, bool, 0600); /* * Desired active to [in]active memory ratio in bp (1/10,000). 
@@ -340,18 +339,51 @@ out: return err; } -static int damon_lru_sort_handle_commit_inputs(void) +static int damon_lru_sort_commit_inputs_fn(void *arg) { - int err; + return damon_lru_sort_apply_parameters(); +} - if (!commit_inputs) +static int damon_lru_sort_commit_inputs_store(const char *val, + const struct kernel_param *kp) +{ + bool commit_inputs_request; + int err; + struct damon_call_control control = { + .fn = damon_lru_sort_commit_inputs_fn, + }; + + if (!val) { + commit_inputs_request = true; + } else { + err = kstrtobool(val, &commit_inputs_request); + if (err) + return err; + } + + if (!commit_inputs_request) return 0; - err = damon_lru_sort_apply_parameters(); - commit_inputs = false; - return err; + /* + * Skip damon_call() if ctx is not initialized to avoid + * NULL pointer dereference. + */ + if (!ctx) + return -EINVAL; + + err = damon_call(ctx, &control); + + return err ? err : control.return_code; } +static const struct kernel_param_ops commit_inputs_param_ops = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = damon_lru_sort_commit_inputs_store, + .get = param_get_bool, +}; + +module_param_cb(commit_inputs, &commit_inputs_param_ops, &commit_inputs, 0600); + static int damon_lru_sort_damon_call_fn(void *arg) { struct damon_ctx *c = arg; @@ -365,7 +397,7 @@ static int damon_lru_sort_damon_call_fn(void *arg) damon_lru_sort_cold_stat = s->stat; } - return damon_lru_sort_handle_commit_inputs(); + return 0; } static struct damon_call_control call_control = { diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index fe7fce26cf6c..b330ff169590 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -39,7 +39,6 @@ static bool enabled __read_mostly; * re-reading, DAMON_RECLAIM will be disabled. */ static bool commit_inputs __read_mostly; -module_param(commit_inputs, bool, 0600); /* * Time threshold for cold memory regions identification in microseconds. 
@@ -246,18 +245,51 @@ out: return err; } -static int damon_reclaim_handle_commit_inputs(void) +static int damon_reclaim_commit_inputs_fn(void *arg) { - int err; + return damon_reclaim_apply_parameters(); +} - if (!commit_inputs) +static int damon_reclaim_commit_inputs_store(const char *val, + const struct kernel_param *kp) +{ + bool commit_inputs_request; + int err; + struct damon_call_control control = { + .fn = damon_reclaim_commit_inputs_fn, + }; + + if (!val) { + commit_inputs_request = true; + } else { + err = kstrtobool(val, &commit_inputs_request); + if (err) + return err; + } + + if (!commit_inputs_request) return 0; - err = damon_reclaim_apply_parameters(); - commit_inputs = false; - return err; + /* + * Skip damon_call() if ctx is not initialized to avoid + * NULL pointer dereference. + */ + if (!ctx) + return -EINVAL; + + err = damon_call(ctx, &control); + + return err ? err : control.return_code; } +static const struct kernel_param_ops commit_inputs_param_ops = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = damon_reclaim_commit_inputs_store, + .get = param_get_bool, +}; + +module_param_cb(commit_inputs, &commit_inputs_param_ops, &commit_inputs, 0600); + static int damon_reclaim_damon_call_fn(void *arg) { struct damon_ctx *c = arg; @@ -267,7 +299,7 @@ static int damon_reclaim_damon_call_fn(void *arg) damon_for_each_scheme(s, c) damon_reclaim_stat = s->stat; - return damon_reclaim_handle_commit_inputs(); + return 0; } static struct damon_call_control call_control = { From 58996503b631adc6a268a42f4624a34513c16199 Mon Sep 17 00:00:00 2001 From: Asier Gutierrez Date: Sun, 26 Apr 2026 16:16:17 -0700 Subject: [PATCH 069/321] mm/damon: support MADV_COLLAPSE via DAMOS_COLLAPSE scheme action This patch set introduces a new action: DAMOS_COLLAPSE. For DAMOS_HUGEPAGE and DAMOS_NOHUGEPAGE to work, khugepaged should be working, since it relies on hugepage_madvise to add a new slot.
This slot should be picked up by khugepaged and eventually collapse (or not, if we are using DAMOS_NOHUGEPAGE) the pages. If THP is not enabled, khugepaged will not be working, and therefore no collapse will happen. DAMOS_COLLAPSE eventually calls madvise_collapse, which will collapse the address range synchronously. In cases where there is a large VMA (databases, for example), DAMOS_COLLAPSE allows us to collapse only the hot region, and not the entire VMA. This new action may be required to support autotuning with hugepage as a goal[1]. ========= Benchmarks: ========= MySQL ===== Tests were performed in an ARM physical server with MariaDB 10.5 and sysbench. Read only benchmark was performed with gaussian row hitting, which follows a normal distribution. T n, D h: THP set to never, DAMON action set to hugepage T m, D h: THP set to madvise, DAMON action set to hugepage T n, D c: THP set to never, DAMON action set to collapse Memory consumption. Lower is better. +------------------+----------+----------+----------+ | | T n, D h | T m, D h | T n, D c | +------------------+----------+----------+----------+ | Total memory use | 2.13 | 2.20 | 2.20 | | Huge pages | 0 | 1.3 | 1.27 | +------------------+----------+----------+----------+ Performance in TPS (Transactions Per Second). Higher is better. T n, D h: 18225.58 T m, D h: 18252.93 T n, D c: 18270.21 Performance counter I got the number of L1 D/I TLB accesses and the number of D/I TLB accesses that triggered a page walk. I divided the second by the first to get the percentage of page walks per TLB access. The lower the better.
+---------------+--------------+--------------+--------------+ | | T n, D h | T m, D h | T n, D c | +---------------+--------------+--------------+--------------+ | L1 DTLB | 127248242753 | 125431020479 | 125327001821 | | L1 ITLB | 80332558619 | 79346759071 | 79298139590 | | DTLB walk | 75011087 | 52800418 | 55895794 | | ITLB walk | 71577076 | 71505137 | 67262140 | | DTLB % misses | 0.058948623 | 0.042095183 | 0.044599961 | | ITLB % misses | 0.089100954 | 0.090117275 | 0.084821839 | +---------------+--------------+--------------+--------------+ Masim ===== I used masim with the "demo" configuration, but changing the times to 100 seconds for the initial phase and 50 seconds for the rest of the phases. Memory consumption: +------------------+----------+----------+----------+ | | T n, D h | T m, D h | T n, D c | +------------------+----------+----------+----------+ | Total memory use | 2.38 GB | 2.36 GB | 2.37 GB | | Huge pages | 0 | 190 MB | 188 MB | +------------------+----------+----------+----------+ Performance: THP never, DAMOS_HUGEPAGE initial phase: 40,491 accesses/msec, 100001 msecs run low phase 0: 39,658 accesses/msec, 50002 msecs run high phase 0: 41,678 accesses/msec, 50000 msecs run low phase 1: 39,625 accesses/msec, 50003 msecs run high phase 1: 41,658 accesses/msec, 50002 msecs run low phase 2: 39,642 accesses/msec, 50002 msecs run high phase 2: 41,640 accesses/msec, 50001 msecs run THP madvise, DAMOS_HUGEPAGE initial phase: 51,977 accesses/msec, 100000 msecs run low phase 0: 86,953 accesses/msec, 50000 msecs run high phase 0: 94,812 accesses/msec, 50000 msecs run low phase 1: 101,017 accesses/msec, 50000 msecs run high phase 1: 94,841 accesses/msec, 50000 msecs run low phase 2: 100,993 accesses/msec, 50000 msecs run high phase 2: 94,791 accesses/msec, 50001 msecs run THP never, DAMOS_COLLAPSE initial phase: 93,678 accesses/msec, 100001 msecs run low phase 0: 101,475 accesses/msec, 50000 msecs run high phase 0: 98,589 accesses/msec, 50000 msecs run low 
phase 1: 101,531 accesses/msec, 50001 msecs run high phase 1: 98,506 accesses/msec, 50001 msecs run low phase 2: 101,458 accesses/msec, 50001 msecs run high phase 2: 98,555 accesses/msec, 50000 msecs run Memory consumption dynamic (how quickly collapses occur): It shows in seconds how many huge pages are allocated. +----+----------+----------+ | | T m, D h | T n, D c | +----+----------+----------+ | 5 | 32 | 188 | | 10 | 48 | 188 | | 15 | 64 | 188 | | 20 | 96 | 188 | | 30 | 112 | 188 | | 35 | 144 | 188 | | 40 | 160 | 188 | | 45 | 190 | 188 | | 50 | 190 | 188 | | 55 | 190 | 188 | | 60 | 190 | 188 | +----+----------+----------+ ========= - We can see that DAMOS "hugepage" action works only when THP is set to madvise. "collapse" action works even when THP is set to never. - Performance for "collapse" action is slightly lower than "hugepage" action and THP madvise. This is due to the fact that collapses occur synchronously. With "hugepage" they may occur during page faults. - Memory consumption is slightly lower for "collapse" than "hugepage" with THP madvise. This is because khugepaged collapses all VMAs, while "collapse" action only collapses the VMAs in the hot region. - There is an improvement in TLB utilization when collapse through "hugepage" or "collapse" actions are triggered. The amount of TLB misses is lower. - "collapse" action is performed synchronously, which means that page collapses happen earlier and more rapidly. This can be useful or not, depending on the scenario. - "hugepage" action may trigger a VMA split in some scenarios, since it needs to change the flag of the VMA to THP enabled. This may lead to additional overhead. Collapse action just adds a new option to choose the correct system balance.
Link: https://lore.kernel.org/20260426231619.107231-5-sj@kernel.org Link: https://lore.kernel.org/damon/20260313000816.79933-1-sj@kernel.org/ [1] Signed-off-by: Asier Gutierrez Signed-off-by: SeongJae Park Reviewed-by: SeongJae Park Cc: Cheng-Han Wu Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Liew Rui Yan Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 4 ++++ include/linux/damon.h | 2 ++ mm/damon/sysfs-schemes.c | 4 ++++ mm/damon/vaddr.c | 3 +++ tools/testing/selftests/damon/sysfs.py | 11 ++++++----- 5 files changed, 19 insertions(+), 5 deletions(-) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index bacb457f553a..da74ab20e289 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -474,6 +474,10 @@ that supports each action are as below. Supported by ``vaddr`` and ``fvaddr`` operations set. When TRANSPARENT_HUGEPAGE is disabled, the application of the action will just fail. + - ``collapse``: Call ``madvise()`` for the region with ``MADV_COLLAPSE``. + Supported by ``vaddr`` and ``fvaddr`` operations set. When + TRANSPARENT_HUGEPAGE is disabled, the application of the action will just + fail. - ``lru_prio``: Prioritize the region on its LRU lists. Supported by ``paddr`` operations set. - ``lru_deprio``: Deprioritize the region on its LRU lists. diff --git a/include/linux/damon.h b/include/linux/damon.h index 2bb43910e22e..d3a231275c23 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -121,6 +121,7 @@ struct damon_target { * @DAMOS_PAGEOUT: Reclaim the region. * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. + * @DAMOS_COLLAPSE: Call ``madvise()`` for the region with MADV_COLLAPSE. 
* @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists. * @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists. * @DAMOS_MIGRATE_HOT: Migrate the regions prioritizing warmer regions. @@ -140,6 +141,7 @@ enum damos_action { DAMOS_PAGEOUT, DAMOS_HUGEPAGE, DAMOS_NOHUGEPAGE, + DAMOS_COLLAPSE, DAMOS_LRU_PRIO, DAMOS_LRU_DEPRIO, DAMOS_MIGRATE_HOT, diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index be2b5eda84e0..ab2153fff9a8 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -2116,6 +2116,10 @@ static struct damos_sysfs_action_name damos_sysfs_action_names[] = { .action = DAMOS_NOHUGEPAGE, .name = "nohugepage", }, + { + .action = DAMOS_COLLAPSE, + .name = "collapse", + }, { .action = DAMOS_LRU_PRIO, .name = "lru_prio", diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index b069dbc7e3d2..dd5f2d7027ac 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -903,6 +903,9 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, case DAMOS_NOHUGEPAGE: madv_action = MADV_NOHUGEPAGE; break; + case DAMOS_COLLAPSE: + madv_action = MADV_COLLAPSE; + break; case DAMOS_MIGRATE_HOT: case DAMOS_MIGRATE_COLD: return damos_va_migrate(t, r, scheme, sz_filter_passed); diff --git a/tools/testing/selftests/damon/sysfs.py b/tools/testing/selftests/damon/sysfs.py index 9067945f16ca..7e93584ff02b 100755 --- a/tools/testing/selftests/damon/sysfs.py +++ b/tools/testing/selftests/damon/sysfs.py @@ -127,11 +127,12 @@ def assert_scheme_committed(scheme, dump): 'pageout': 2, 'hugepage': 3, 'nohugeapge': 4, - 'lru_prio': 5, - 'lru_deprio': 6, - 'migrate_hot': 7, - 'migrate_cold': 8, - 'stat': 9, + 'collapse': 5, + 'lru_prio': 6, + 'lru_deprio': 7, + 'migrate_hot': 8, + 'migrate_cold': 9, + 'stat': 10, } assert_true(dump['action'] == action_val[scheme.action], 'action', dump) assert_true(dump['apply_interval_us'] == scheme. 
apply_interval_us, From 8803f883310a886e701fa282eaae3a6658b10091 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 27 Apr 2026 13:43:14 +0200 Subject: [PATCH 070/321] sh: use folio_mapped() instead of page_mapped() in sh4_flush_cache_page() Patch series "mm: remove page_mapped()". While preparing my slides for an LSF/MM talk, I realized that I did not yet remove page_mapped(). So let's do that. In the BPF arena code it's unclear which memdesc we would want to allocate in the future: certainly something with a refcount, but likely none with a mapcount. So let's just rely on the page refcount instead to decide whether we want to try zapping the page from user page tables. This patch (of 3): We already have the folio in our hands, so let's just use folio_mapped(). Link: https://lore.kernel.org/20260427-page_mapped-v1-0-e89c3592c74c@kernel.org Link: https://lore.kernel.org/20260427-page_mapped-v1-1-e89c3592c74c@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Matthew Wilcox (Oracle) Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Eduard Zingerman Cc: Harry Yoo Cc: Jann Horn Cc: Jiri Olsa Cc: John Paul Adrian Glaubitz Cc: Kumar Kartikeya Dwivedi Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Martin KaFai Lau Cc: Michal Hocko Cc: Mike Rapoport Cc: Rich Felker Cc: Rik van Riel Cc: Song Liu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yonghong Song Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/sh/mm/cache-sh4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c index 83fb34b39ca7..8bc9ce541c14 100644 --- a/arch/sh/mm/cache-sh4.c +++ b/arch/sh/mm/cache-sh4.c @@ -248,7 +248,7 @@ static void sh4_flush_cache_page(void *args) */ map_coherent = (current_cpu_data.dcache.n_aliases && test_bit(PG_dcache_clean, folio_flags(folio, 0)) && - page_mapped(page)); + folio_mapped(folio)); if (map_coherent) vaddr = kmap_coherent(page, address); else From 88692f0c33a788072abfa1888b28bc6d7d7d1165 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 27 Apr 2026 13:43:15 +0200 Subject: [PATCH 071/321] bpf: arena: use page_ref_count() instead of page_mapped() in arena_free_pages() Pages that BPF arena code maps are allocated through bpf_map_alloc_pages(), which does not allocate folios but pages. In the future, pages will not have a mapcount, only folios will. Converting the code to use folios and rely on folio_mapped() sounds like the wrong approach. Should BPF arena code allocate folios and use folio_mapped() here? But likely we would not want to use folios here longterm, as we don't really need folio information. Hard to tell. But in the meantime, we can simply use the page refcount instead, as a heuristic whether the page might be mapped to user space and we would want to try zapping it, so we can get rid of page_mapped(). Page allocation will give us a page with a refcount of 1. Any user space mapping adds a page reference. While there can be references from other subsystems (e.g., GUP), in the common case for this test here relying on the page count is good enough. 
Link: https://lore.kernel.org/20260427-page_mapped-v1-2-e89c3592c74c@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Matthew Wilcox (Oracle) Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Eduard Zingerman Cc: Harry Yoo Cc: Jann Horn Cc: Jiri Olsa Cc: John Paul Adrian Glaubitz Cc: Kumar Kartikeya Dwivedi Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Martin KaFai Lau Cc: Michal Hocko Cc: Mike Rapoport Cc: Rich Felker Cc: Rik van Riel Cc: Song Liu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yonghong Song Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- kernel/bpf/arena.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 49a8f7b1beef..a497c5913bd4 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -729,7 +729,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, llist_for_each_safe(pos, t, __llist_del_all(&free_pages)) { page = llist_entry(pos, struct page, pcp_llist); - if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */ + if (page_cnt == 1 && page_ref_count(page) > 1) /* maybe mapped by user space */ /* Optimization for the common case of page_cnt==1: * If page wasn't mapped into some user vma there * is no need to call zap_pages which is slow. When From 90f01f5d6ba57d93363289b3247314b7fd5e8d49 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 27 Apr 2026 13:43:16 +0200 Subject: [PATCH 072/321] mm: remove page_mapped() Let's replace the last user of page_mapped() by folio_mapped() so we can get rid of page_mapped(). Replace the remaining occurrences of page_mapped() in rmap documentation by folio_mapped(). 
Link: https://lore.kernel.org/20260427-page_mapped-v1-3-e89c3592c74c@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Matthew Wilcox (Oracle) Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Eduard Zingerman Cc: Harry Yoo Cc: Jann Horn Cc: Jiri Olsa Cc: John Paul Adrian Glaubitz Cc: Kumar Kartikeya Dwivedi Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Martin KaFai Lau Cc: Michal Hocko Cc: Mike Rapoport Cc: Rich Felker Cc: Rik van Riel Cc: Song Liu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yonghong Song Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 ---------- mm/memory.c | 2 +- mm/rmap.c | 8 ++++---- 3 files changed, 5 insertions(+), 15 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 8a0078a4dc78..9cedc5e75aa9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1888,16 +1888,6 @@ static inline bool folio_mapped(const struct folio *folio) return folio_mapcount(folio) >= 1; } -/* - * Return true if this page is mapped into pagetables. - * For compound page it returns true if any sub-page of compound page is mapped, - * even if this particular sub-page is not itself mapped by any PTE or PMD. - */ -static inline bool page_mapped(const struct page *page) -{ - return folio_mapped(page_folio(page)); -} - static inline struct page *virt_to_head_page(const void *x) { struct page *page = virt_to_page(x); diff --git a/mm/memory.c b/mm/memory.c index 02ec74a1273f..0c9d9c2cbf0e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5482,7 +5482,7 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) if (unlikely(PageHWPoison(vmf->page))) { vm_fault_t poisonret = VM_FAULT_HWPOISON; if (ret & VM_FAULT_LOCKED) { - if (page_mapped(vmf->page)) + if (folio_mapped(folio)) unmap_mapping_folio(folio); /* Retry if a clean folio was removed from the cache. 
*/ if (mapping_evict_folio(folio->mapping, folio)) diff --git a/mm/rmap.c b/mm/rmap.c index 99e1b3dc390b..1c77d5dc06e9 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -571,7 +571,7 @@ void __init anon_vma_init(void) * In case it was remapped to a different anon_vma, the new anon_vma will be a * child of the old anon_vma, and the anon_vma lifetime rules will therefore * ensure that any anon_vma obtained from the page will still be valid for as - * long as we observe page_mapped() [ hence all those page_mapped() tests ]. + * long as we observe folio_mapped() [ hence all those folio_mapped() tests ]. * * All users of this function must be very careful when walking the anon_vma * chain and verify that the page in question is indeed mapped in it @@ -1999,7 +1999,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, /* * When racing against e.g. zap_pte_range() on another cpu, * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(), - * try_to_unmap() may return before page_mapped() has become false, + * try_to_unmap() may return before folio_mapped() has become false, * if page table locking is skipped: use TTU_SYNC to wait for that. */ if (flags & TTU_SYNC) @@ -2428,7 +2428,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, /* * When racing against e.g. zap_pte_range() on another cpu, * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(), - * try_to_migrate() may return before page_mapped() has become false, + * try_to_migrate() may return before folio_mapped() has become false, * if page table locking is skipped: use TTU_SYNC to wait for that. 
*/ if (flags & TTU_SYNC) @@ -2929,7 +2929,7 @@ static struct anon_vma *rmap_walk_anon_lock(const struct folio *folio, /* * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read() - * because that depends on page_mapped(); but not all its usages + * because that depends on folio_mapped(); but not all its usages * are holding mmap_lock. Users without mmap_lock are required to * take a reference count to prevent the anon_vma disappearing */ From 0b20c36c118d2122f57982c644e526c0fcd4a947 Mon Sep 17 00:00:00 2001 From: fujunjie Date: Mon, 4 May 2026 10:39:57 +0000 Subject: [PATCH 073/321] mm/madvise: reject invalid process_madvise() advice for zero-length vectors process_madvise() used to validate the advice while walking each imported iovec. If the vector has zero total length, vector_madvise() does not enter the loop and can return success without checking whether the advice value is valid. For a local mm, such as process_madvise(PIDFD_SELF, ...), the remote-only process_madvise_remote_valid() check is skipped. As a result, an invalid advice can be reported as success when the vector has zero total length. This differs from madvise(), which rejects an invalid advice before returning success for a zero-length range. Validate the generic madvise behavior at the syscall-facing entry points before any vector walk. In process_madvise(), do this before the remote-only advice restriction so unsupported advice is rejected with the same priority for local and remote mm. Use an errno-returning helper for address/length validation, and handle zero-length ranges explicitly at the call sites. Requests with valid advice and zero total length remain a noop and continue to return 0. Add a selftest that covers invalid advice with a zero-length iovec and an empty vector, while also checking that a request with valid advice and zero length still succeeds. 
Link: https://lore.kernel.org/tencent_C3AEB0E769C5F4F9370F9411B69B7F8B2907@qq.com Fixes: 021781b01275 ("mm/madvise: unrestrict process_madvise() for current process") Signed-off-by: fujunjie Acked-by: David Hildenbrand (Arm) Reviewed-by: SeongJae Park Cc: Christian Brauner Cc: Jann Horn Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/madvise.c | 60 ++++++++++------------- tools/testing/selftests/mm/process_madv.c | 28 +++++++++++ 2 files changed, 53 insertions(+), 35 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 69708e953cf5..cd9bb077072c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1834,50 +1834,29 @@ static void madvise_finish_tlb(struct madvise_behavior *madv_behavior) tlb_finish_mmu(madv_behavior->tlb); } -static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior) +/** + * check_input_range() - Check if the requested range is valid. + * @start: Start address of madvise-requested address range. + * @len_in: Length of madvise-requested address range. + * + * Returns: 0 if the input range is valid, otherwise an error code. + */ +static int check_input_range(unsigned long start, size_t len_in) { size_t len; - if (!madvise_behavior_valid(behavior)) - return false; - if (!PAGE_ALIGNED(start)) - return false; + return -EINVAL; len = PAGE_ALIGN(len_in); /* Check to see whether len was rounded up from small -ve to zero */ if (len_in && !len) - return false; + return -EINVAL; if (start + len < start) - return false; + return -EINVAL; - return true; -} - -/* - * madvise_should_skip() - Return if the request is invalid or nothing. - * @start: Start address of madvise-requested address range. - * @len_in: Length of madvise-requested address range. - * @behavior: Requested madvise behavior. - * @err: Pointer to store an error code from the check. - * - * If the specified behaviour is invalid or nothing would occur, we skip the - * operation. 
This function returns true in the cases, otherwise false. In - * the former case we store an error on @err. - */ -static bool madvise_should_skip(unsigned long start, size_t len_in, - int behavior, int *err) -{ - if (!is_valid_madvise(start, len_in, behavior)) { - *err = -EINVAL; - return true; - } - if (start + PAGE_ALIGN(len_in) == start) { - *err = 0; - return true; - } - return false; + return 0; } static bool is_madvise_populate(struct madvise_behavior *madv_behavior) @@ -2013,8 +1992,13 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh .tlb = &tlb, }; - if (madvise_should_skip(start, len_in, behavior, &error)) + if (!madvise_behavior_valid(behavior)) + return -EINVAL; + + error = check_input_range(start, len_in); + if (error || !len_in) return error; + error = madvise_lock(&madv_behavior); if (error) return error; @@ -2056,7 +2040,8 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, size_t len_in = iter_iov_len(iter); int error; - if (madvise_should_skip(start, len_in, behavior, &error)) + error = check_input_range(start, len_in); + if (error || !len_in) ret = error; else ret = madvise_do_behavior(start, len_in, &madv_behavior); @@ -2131,6 +2116,11 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, goto release_task; } + if (!madvise_behavior_valid(behavior)) { + ret = -EINVAL; + goto release_mm; + } + /* * We need only perform this check if we are attempting to manipulate a * remote process's address space. diff --git a/tools/testing/selftests/mm/process_madv.c b/tools/testing/selftests/mm/process_madv.c index cd4610baf5d7..3fffd5f7e6fb 100644 --- a/tools/testing/selftests/mm/process_madv.c +++ b/tools/testing/selftests/mm/process_madv.c @@ -309,6 +309,34 @@ TEST_F(process_madvise, invalid_vlen) ASSERT_EQ(munmap(map, pagesize), 0); } +/* + * Test that invalid advice is rejected even when the iovec has zero total + * length. 
A request with valid advice and zero length is a noop, but + * invalid advice should still fail with EINVAL. + */ +TEST_F(process_madvise, invalid_advice_zero_length) +{ + struct iovec vec = { + .iov_base = NULL, + .iov_len = 0, + }; + int pidfd = self->pidfd; + ssize_t ret; + + errno = 0; + ret = sys_process_madvise(pidfd, &vec, 1, -1, 0); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EINVAL); + + errno = 0; + ret = sys_process_madvise(pidfd, &vec, 1, MADV_DONTNEED, 0); + ASSERT_EQ(ret, 0); + + ret = sys_process_madvise(pidfd, NULL, 0, -1, 0); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EINVAL); +} + /* * Test process_madvise() with an invalid flag value. Currently, only a flag * value of 0 is supported. This test is reserved for the future, e.g., if From 7b32f64bc512b40b268776c5ac4d354b325b3197 Mon Sep 17 00:00:00 2001 From: Frederick Mayle Date: Sun, 26 Apr 2026 20:01:47 -0700 Subject: [PATCH 074/321] mm: limit filemap_fault readahead to VMA boundaries When a file mapping covers a strict subset of a file, an access to the mapping can trigger readahead of file pages outside the mapped region. Readahead is meant to prefetch pages likely to be accessed soon, but these pages aren't accessible via the same means, so it is fair to say we don't have a good indicator they'll be accessed soon. Take an ELF file for example: an access to the end of a program's read-only segment isn't a sign that nearby file contents will be accessed next (they are likely to be mapped discontiguously, or not at all). The pressure from loading these pages into the cache can evict more useful pages. To improve the behavior, make three changes: * Introduce a new readahead_control field, max_index, as a hard limit on the readahead. The existing file_ra_state->size can't be used as a limit, it is more of a hint and can be increased by various heuristics.
* Set readahead_control->max_index to the end of the VMA in all of the readahead paths that can be triggered from a fault on a file mapping (both "sync" and "async" readahead). * Limit the read-around range start to the VMA's start. Note that these changes only affect readahead triggered in the context of a fault, they do not affect readahead triggered by read syscalls. If a user mixes the two types of accesses, the behavior is expected to be the following: if a fault causes readahead and places a PG_readahead marker and then a read(2) syscall hits the PG_readahead marker, the resulting async readahead *will not* be limited to the VMA end. Conversely, if a read(2) syscall places a PG_readahead marker and then a fault hits the marker, the async readahead *will* be limited to the VMA end. There is an edge case that the above motivation glosses over: A single file mapping might be backed by multiple VMAs. For example, a whole file could be mapped RW, then part of the mapping made RO using mprotect. This patch would hurt performance of a sequential faulted read of such a mapping, the degree depending on how fragmented the VMAs are. A usage pattern like that is likely rare and already suffering from sub-optimal performance because, e.g., the fragmented VMAs limit the fault-around, so each VMA boundary in a sequential faulted read would cause a minor fault. Still, this patch would make it worse. See a previous discussion of this topic at [1]. Tested by mapping and reading a small subset of a large file, then using the cachestat syscall to verify the number of cached pages didn't exceed the mapping size. In practical scenarios, the effect depends on the specific file and usage. Sometimes there is no effect at all, but, for some ELF files in Android, we see ~20% fewer pages pulled into the cache. 
A comprehensive performance evaluation hasn't been done, but, in addition to the anecdotal memory savings mentioned above, a benchmark was run with fio 3.38, showing neutral-looking results: /data/local/tmp/fio --version fio --name=mmap_test --ioengine=mmap --rw=read --bs=4k \ --offset=1G --size=1G --filesize=3G --numjobs=1 \ --filename=testfile.bin Before: 4366.6 MiB/s (avg of 3459, 4592, 4613, 4697, 4472) After: 4444.0 MiB/s (avg of 4633, 4655, 4511, 4571, 3850) +1.7% Same, with --ioengine=mmap --rw=randread Before: 445.6 MiB/s (avg of 446, 447, 442, 452, 441) After: 447.0 MiB/s (avg of 447, 446, 446, 451, 445) +0.3% Same, with --ioengine=psync --rw=read Before: 3086.6 MiB/s (avg of 3122, 3094, 3066, 3094, 3057) After: 3084.6 MiB/s (avg of 3039, 3103, 3103, 3084, 3094) -0.06% Same, with --ioengine=psync --rw=randread Before: 2226.4 MiB/s (avg of 2256, 2183, 2207, 2265, 2221) After: 2231.4 MiB/s (avg of 2236, 2241, 2236, 2193, 2251) +0.2% Link: https://lore.kernel.org/20260427030148.653228-1-fmayle@google.com Link: https://lore.kernel.org/all/ivnv2crd3et76p2nx7oszuqhzzah756oecn5yuykzqfkqzoygw@yvnlkhjjssoz/ [1] Signed-off-by: Frederick Mayle Reviewed-by: Jan Kara Reviewed-by: Kalesh Singh Cc: David Hildenbrand Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 2 ++ mm/filemap.c | 4 ++++ mm/readahead.c | 6 +++++- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 31a848485ad9..1f50991b43e3 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1350,6 +1350,7 @@ struct readahead_control { struct file_ra_state *ra; /* private: use the readahead_* accessors instead */ pgoff_t _index; + pgoff_t _max_index; /* limit readahead to _max_index, inclusive */ unsigned int _nr_pages; unsigned int _batch_count; bool dropbehind; @@ -1363,6 +1364,7 @@ struct readahead_control { .mapping = m, \ .ra = r, \ ._index = i, \ +
._max_index = ULONG_MAX, \ } #define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) diff --git a/mm/filemap.c b/mm/filemap.c index 4e636647100c..97772a05a18e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3314,6 +3314,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) bool force_thp_readahead = false; unsigned short mmap_miss; + ractl._max_index = vmf->vma->vm_pgoff + vma_pages(vmf->vma) - 1; + /* Use the readahead code, even if readahead is disabled */ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && (vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) @@ -3396,6 +3398,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) * mmap read-around */ ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); + ra->start = max(ra->start, vmf->vma->vm_pgoff); ra->size = ra->ra_pages; ra->async_size = ra->ra_pages / 4; ra->order = 0; @@ -3438,6 +3441,7 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, } if (folio_test_readahead(folio)) { + ractl._max_index = vmf->vma->vm_pgoff + vma_pages(vmf->vma) - 1; fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_async_ra(&ractl, folio, ra->ra_pages); } diff --git a/mm/readahead.c b/mm/readahead.c index 7b05082c89ea..8c12b63ccd4a 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -324,6 +324,8 @@ static void do_page_cache_ra(struct readahead_control *ractl, return; end_index = (isize - 1) >> PAGE_SHIFT; + if (end_index > ractl->_max_index) + end_index = ractl->_max_index; if (index > end_index) return; /* Don't read past the page containing the last byte of the file */ @@ -471,7 +473,7 @@ void page_cache_ra_order(struct readahead_control *ractl, pgoff_t start = readahead_index(ractl); pgoff_t index = start; unsigned int min_order = mapping_min_folio_order(mapping); - pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; + pgoff_t limit; pgoff_t mark = index + ra->size - ra->async_size; unsigned int nofs; int err = 0; @@ -484,6 +486,8 @@ void 
page_cache_ra_order(struct readahead_control *ractl, goto fallback; } + limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; + limit = min(limit, ractl->_max_index); limit = min(limit, index + ra->size - 1); new_order = min(mapping_max_folio_order(mapping), new_order); From 3b9e3cc0405b422db884054ea2417b7b85220c56 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 27 Apr 2026 08:12:20 -0700 Subject: [PATCH 075/321] mm/damon/core: introduce damon_ctx->pause Patch series "mm/damon: let DAMON be paused and resumed", v2. DAMON utilizes a few mechanisms that enhance itself over time. Adaptive regions adjustment, goal-based DAMOS quota auto-tuning and monitoring intervals auto-tuning like self-training mechanisms are such examples. It also adds access frequency stability information (age) to the monitoring results, which makes it enhanced over time. Sometimes users have to stop DAMON. In this case, DAMON's internal state that was enhanced over the time of the last execution simply goes away. A restarted DAMON has to train itself and enhance its output from scratch. This makes DAMON less useful in such cases. Three such use cases are introduced below. Investigation of DAMON. It is best to do the investigation online, especially when it is a production environment. DAMON therefore provides features for such online investigations, including DAMOS stats, monitoring result snapshot exposure, and multiple tracepoints. When those are insufficient, and there are additional clues that could be interfered with by DAMON, users have to temporarily stop DAMON to collect the additional clues. It is not very useful since many of DAMON's internal clues are gone when DAMON is stopped. The loss of the monitoring results that improved over time is also problematic, especially in production environments. Monitoring of workloads that have different user-known phases. 
For example, in Android, applications are known to have very different access patterns and behaviors when they are running in the foreground and in the background. It can therefore be useful to separate monitoring of apps based on whether they are running in the foreground or in the background. Having two DAMON threads per application that are paused and resumed on the app's foreground/background switches can be useful for the purpose. But such pause/resume of the execution is not supported. Tests of DAMON. A few DAMON selftests are using drgn to dump the internal DAMON status. The tests show if the dumped status is the same as what the test code expected. Because DAMON keeps running and modifying its internal status, there are chances of data races that can cause false test results. Stopping DAMON can avoid the race. But, since the internal state of DAMON is dropped, the test coverage will be limited. Let DAMON execution be paused and resumed without loss of the internal state, to overcome the limitations. For this, introduce a new DAMON context parameter, namely 'pause'. API callers can update it while the context is running, using the online parameters update functions (damon_commit_ctx() and damon_call()). Once it is set, kdamond_fn() main loop will do only limited work, excluding the monitoring and DAMOS work, while sleeping for one sampling interval per iteration. The limited work includes handling of the online parameters update. Hence users can unset the 'pause' parameter again. Once it is unset, kdamond_fn() main loop will do all the work again (resumed). Under the paused state, it also does stop condition checks and handling of them, so that paused DAMON can also be stopped if needed. Expose the feature to the user space via DAMON sysfs interface. Also, update existing drgn-based tests to test and use the feature. Tests ===== I confirmed the feature functionality using real-time tracing ('perf trace' or 'trace-cmd stream') of damon:damon_aggregated DAMON tracepoint. 
By pausing and resuming the DAMON execution, I was able to see the trace stop and continue as expected. Note that the pause feature support is added to DAMON user-space tool (damo) after v3.1.9. Users can use '--pause_ctx' command line option of damo for that, and I actually used it for my test. The extended drgn-based selftests are also testing a part of the functionality. Patches Sequence ================ Patch 1 introduces the new core API for the pause feature. Patch 2 extends DAMON sysfs interface for the new parameter. Patches 3-5 update design, usage and ABI documents for the new sysfs file, respectively. The following five patches are for tests. Patch 6 implements a new kunit test for the pause parameter online commitment. Patches 7 and 8 extend DAMON selftest helpers to support the new feature. Patch 9 extends selftest to test the commitment of the feature. Finally, patch 10 updates existing selftest to be safe from the race condition using the pause/resume feature. This patch (of 10): DAMON supports only start and stop of the execution. When it is stopped, its internal data that it self-trained goes away. It will be useful if the execution can be paused and resumed with the previous self-trained data. Introduce per-context API parameter, 'pause', for the purpose. The parameter can be set and unset while DAMON is running and paused, using the online parameters commit helper functions (damon_commit_ctx() and damon_call()). Once 'pause' is set, the kdamond_fn() main loop does only limited work, sleeping for a sampling interval on each iteration. The limited work includes the handling of the online parameters update, so that users can unset 'pause' and resume the execution when they want. It also keeps checking DAMON stop conditions and handling them, so that DAMON can be stopped while paused if needed. 
Link: https://lore.kernel.org/20260427151231.113429-1-sj@kernel.org Link: https://lore.kernel.org/20260427151231.113429-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ mm/damon/core.c | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index d3a231275c23..f2370a3a4a9a 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -801,6 +801,7 @@ struct damon_attrs { * @ops: Set of monitoring operations for given use cases. * @addr_unit: Scale factor for core to ops address conversion. * @min_region_sz: Minimum region size. + * @pause: Pause kdamond main loop. * @adaptive_targets: Head of monitoring targets (&damon_target) list. * @schemes: Head of schemes (&damos) list. */ @@ -854,6 +855,7 @@ struct damon_ctx { struct damon_operations ops; unsigned long addr_unit; unsigned long min_region_sz; + bool pause; struct list_head adaptive_targets; struct list_head schemes; diff --git a/mm/damon/core.c b/mm/damon/core.c index 7aeaf319a18a..05e4bef367db 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1370,6 +1370,7 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) if (err) return err; } + dst->pause = src->pause; dst->ops = src->ops; dst->addr_unit = src->addr_unit; dst->min_region_sz = src->min_region_sz; @@ -3237,6 +3238,14 @@ static int kdamond_fn(void *data) kdamond_call(ctx, false); if (ctx->maybe_corrupted) break; + while (ctx->pause) { + damos_walk_cancel(ctx); + kdamond_usleep(ctx->attrs.sample_interval); + /* allow caller unset pause via damon_call() */ + kdamond_call(ctx, false); + if (kdamond_need_stop(ctx) || ctx->maybe_corrupted) + goto done; + } if (!list_empty(&ctx->schemes)) kdamond_apply_schemes(ctx); else From 
3375284944ead898236652bd68a8dac66b65792d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 27 Apr 2026 08:12:21 -0700 Subject: [PATCH 076/321] mm/damon/sysfs: add pause file under context dir Add pause DAMON sysfs file under the context directory. It exposes the damon_ctx->pause API parameter to the users so that they can use the pause/resume feature. Link: https://lore.kernel.org/20260427151231.113429-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index eefa959aa30a..d5863cc33d23 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -866,6 +866,7 @@ struct damon_sysfs_context { struct damon_sysfs_attrs *attrs; struct damon_sysfs_targets *targets; struct damon_sysfs_schemes *schemes; + bool pause; }; static struct damon_sysfs_context *damon_sysfs_context_alloc( @@ -878,6 +879,7 @@ static struct damon_sysfs_context *damon_sysfs_context_alloc( context->kobj = (struct kobject){}; context->ops_id = ops_id; context->addr_unit = 1; + context->pause = false; return context; } @@ -1053,6 +1055,30 @@ static ssize_t addr_unit_store(struct kobject *kobj, return count; } +static ssize_t pause_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_context *context = container_of(kobj, + struct damon_sysfs_context, kobj); + + return sysfs_emit(buf, "%c\n", context->pause ? 
'Y' : 'N'); +} + +static ssize_t pause_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_context *context = container_of(kobj, + struct damon_sysfs_context, kobj); + bool pause; + int err = kstrtobool(buf, &pause); + + if (err) + return err; + context->pause = pause; + return count; +} + + static void damon_sysfs_context_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_context, kobj)); @@ -1067,10 +1093,14 @@ static struct kobj_attribute damon_sysfs_context_operations_attr = static struct kobj_attribute damon_sysfs_context_addr_unit_attr = __ATTR_RW_MODE(addr_unit, 0600); +static struct kobj_attribute damon_sysfs_context_pause_attr = + __ATTR_RW_MODE(pause, 0600); + static struct attribute *damon_sysfs_context_attrs[] = { &damon_sysfs_context_avail_operations_attr.attr, &damon_sysfs_context_operations_attr.attr, &damon_sysfs_context_addr_unit_attr.attr, + &damon_sysfs_context_pause_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_context); @@ -1470,6 +1500,7 @@ static int damon_sysfs_apply_inputs(struct damon_ctx *ctx, if (sys_ctx->ops_id == DAMON_OPS_PADDR) ctx->min_region_sz = max( DAMON_MIN_REGION_SZ / sys_ctx->addr_unit, 1); + ctx->pause = sys_ctx->pause; err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); if (err) return err; From 60bee40e30d047356a118bd637ba4960baadcd46 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 27 Apr 2026 08:12:22 -0700 Subject: [PATCH 077/321] Docs/mm/damon/design: update for context pause/resume feature Update DAMON design document for the context execution pause/resume feature. Link: https://lore.kernel.org/20260427151231.113429-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index da74ab20e289..fa7392b5a331 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -19,6 +19,13 @@ types of monitoring. To know how user-space can do the configurations and start/stop DAMON, refer to :ref:`DAMON sysfs interface ` documentation. +Users can also request each context execution to be paused and resumed. When +it is paused, the kdamond does nothing other than applying online parameter +update. + +To know how user-space can pause/resume each context, refer to :ref:`DAMON +sysfs context ` usage documentation. + Overall Architecture ==================== From ade1a22a8bf612c4e9fd8fabd5b103dae4d6a0c6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 27 Apr 2026 08:12:23 -0700 Subject: [PATCH 078/321] Docs/admin-guide/mm/damon/usage: update for pause file Update DAMON usage document for the DAMON context execution pause/resume feature. Link: https://lore.kernel.org/20260427151231.113429-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index e84b58731f7e..d5548e460857 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -66,7 +66,8 @@ comma (","). 
│ :ref:`kdamonds `/nr_kdamonds │ │ :ref:`0 `/state,pid,refresh_ms │ │ │ :ref:`contexts `/nr_contexts - │ │ │ │ :ref:`0 `/avail_operations,operations,addr_unit + │ │ │ │ :ref:`0 `/avail_operations,operations,addr_unit, + │ │ │ │ pause │ │ │ │ │ :ref:`monitoring_attrs `/ │ │ │ │ │ │ intervals/sample_us,aggr_us,update_us │ │ │ │ │ │ │ intervals_goal/access_bp,aggrs,min_sample_us,max_sample_us @@ -196,9 +197,9 @@ details). At the moment, only one context per kdamond is supported, so only contexts// ------------- -In each context directory, three files (``avail_operations``, ``operations`` -and ``addr_unit``) and three directories (``monitoring_attrs``, ``targets``, -and ``schemes``) exist. +In each context directory, four files (``avail_operations``, ``operations``, +``addr_unit`` and ``pause``) and three directories (``monitoring_attrs``, +``targets``, and ``schemes``) exist. DAMON supports multiple types of :ref:`monitoring operations `, including those for virtual address @@ -216,6 +217,9 @@ reading from the ``operations`` file. ``addr_unit`` file is for setting and getting the :ref:`address unit ` parameter of the operations set. +``pause`` file is for setting and getting the :ref:`pause request +` parameter of the context. + .. _sysfs_monitoring_attrs: contexts//monitoring_attrs/ From f0cefc367686a5fb1de0b9b0a3bcd179ef5e67ee Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 27 Apr 2026 08:12:24 -0700 Subject: [PATCH 079/321] Docs/ABI/damon: update for pause sysfs file Update DAMON ABI document for the DAMON context execution pause/resume feature. Link: https://lore.kernel.org/20260427151231.113429-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-mm-damon | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index 213eb87392d8..971c22e34e72 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -84,6 +84,13 @@ Description: Writing an integer to this file sets the 'address unit' parameter of the given operations set of the context. Reading the file returns the last-written 'address unit' value. +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//pause +Date: Mar 2026 +Contact: SeongJae Park +Description: Writing a boolean keyword to this file sets the 'pause' request + parameter for the context. Reading the file returns the + last-written 'pause' value. + What: /sys/kernel/mm/damon/admin/kdamonds//contexts//monitoring_attrs/intervals/sample_us Date: Mar 2022 Contact: SeongJae Park From eb1ae61075f3c9e4e395f23993b5f3593a2e8ff1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 27 Apr 2026 08:12:25 -0700 Subject: [PATCH 080/321] mm/damon/tests/core-kunit: test pause commitment Add a kunit test for commitment of damon_ctx->pause parameter that can be done using damon_commit_ctx(). Link: https://lore.kernel.org/20260427151231.113429-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 6de622a2fd79..1b23a22ac04c 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -1083,6 +1083,10 @@ static void damon_test_commit_ctx(struct kunit *test) KUNIT_EXPECT_EQ(test, damon_commit_ctx(dst, src), 0); src->min_region_sz = 4095; KUNIT_EXPECT_EQ(test, damon_commit_ctx(dst, src), -EINVAL); + src->min_region_sz = 4096; + src->pause = true; + KUNIT_EXPECT_EQ(test, damon_commit_ctx(dst, src), 0); + KUNIT_EXPECT_TRUE(test, dst->pause); damon_destroy_ctx(src); damon_destroy_ctx(dst); } From 5d8585a1d7f689a6fee5a497d83017c5a8a4acfc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 27 Apr 2026 08:12:26 -0700 Subject: [PATCH 081/321] selftests/damon/_damon_sysfs: support pause file staging DAMON test-purpose sysfs interface control Python module, _damon_sysfs, is not supporting the newly added pause file. Add the support of the file, for future test and use of the feature. Link: https://lore.kernel.org/20260427151231.113429-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index 0f13512fa5e6..8b12cc048440 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -621,10 +621,11 @@ class DamonCtx: targets = None schemes = None kdamond = None + pause = None idx = None def __init__(self, ops='paddr', monitoring_attrs=DamonAttrs(), targets=[], - schemes=[]): + schemes=[], pause=False): self.ops = ops self.monitoring_attrs = monitoring_attrs self.monitoring_attrs.context = self @@ -639,6 +640,8 @@ class DamonCtx: scheme.idx = idx scheme.context = self + self.pause=pause + def sysfs_dir(self): return os.path.join(self.kdamond.sysfs_dir(), 'contexts', '%d' % self.idx) @@ -679,6 +682,11 @@ class DamonCtx: err = scheme.stage() if err is not None: return err + + err = write_file(os.path.join(self.sysfs_dir(), 'pause'), self.pause) + if err is not None: + return err + return None class Kdamond: From d0e3f902aef881dab99111b59897dd045d932e47 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 27 Apr 2026 08:12:27 -0700 Subject: [PATCH 082/321] selftests/damon/drgn_dump_damon_status: dump pause drgn_dump_damon_status is not dumping the damon_ctx->pause parameter value, so it cannot be tested. Dump it for future tests. Link: https://lore.kernel.org/20260427151231.113429-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/drgn_dump_damon_status.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/damon/drgn_dump_damon_status.py b/tools/testing/selftests/damon/drgn_dump_damon_status.py index b5c56233a923..972948e6215f 100755 --- a/tools/testing/selftests/damon/drgn_dump_damon_status.py +++ b/tools/testing/selftests/damon/drgn_dump_damon_status.py @@ -202,6 +202,7 @@ def damon_ctx_to_dict(ctx): ['attrs', attrs_to_dict], ['adaptive_targets', targets_to_list], ['schemes', schemes_to_list], + ['pause', bool], ]) def main(): From e88be73275e9bff727977499066606e35fa8db13 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 27 Apr 2026 08:12:28 -0700 Subject: [PATCH 083/321] selftests/damon/sysfs.py: check pause on assert_ctx_committed() Extend sysfs.py tests to confirm damon_ctx->pause can be set using the pause sysfs file. Link: https://lore.kernel.org/20260427151231.113429-10-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/damon/sysfs.py b/tools/testing/selftests/damon/sysfs.py index 7e93584ff02b..eb56c19cd3f9 100755 --- a/tools/testing/selftests/damon/sysfs.py +++ b/tools/testing/selftests/damon/sysfs.py @@ -195,6 +195,7 @@ def assert_ctx_committed(ctx, dump): assert_monitoring_attrs_committed(ctx.monitoring_attrs, dump['attrs']) assert_monitoring_targets_committed(ctx.targets, dump['adaptive_targets']) assert_schemes_committed(ctx.schemes, dump['schemes']) + assert_true(dump['pause'] == ctx.pause, 'pause', dump) def assert_ctxs_committed(kdamonds): status, err = dump_damon_status_dict(kdamonds.kdamonds[0].pid) From cb1a7622c90c169b1dabdd680711f85b6fde7319 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 27 Apr 2026 08:12:29 -0700 Subject: [PATCH 084/321] selftests/damon/sysfs.py: pause DAMON before dumping status The sysfs.py test commits DAMON parameters, dumps the internal DAMON state, and shows if the parameters are committed as expected using the dumped state. While the dumping is ongoing, DAMON is alive. It can make internal changes including addition and removal of regions. It can therefore make a race that can result in false test results. Pause DAMON execution during the state dumping to avoid such races. Link: https://lore.kernel.org/20260427151231.113429-11-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.py | 38 ++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tools/testing/selftests/damon/sysfs.py b/tools/testing/selftests/damon/sysfs.py index eb56c19cd3f9..cd4d82c85211 100755 --- a/tools/testing/selftests/damon/sysfs.py +++ b/tools/testing/selftests/damon/sysfs.py @@ -198,18 +198,55 @@ def assert_ctx_committed(ctx, dump): assert_true(dump['pause'] == ctx.pause, 'pause', dump) def assert_ctxs_committed(kdamonds): + ctxs_paused_for_dump = [] + kdamonds_paused_for_dump = [] + # pause for safe state dumping + for kd in kdamonds.kdamonds: + for ctx in kd.contexts: + if ctx.pause is False: + ctx.pause = True + ctxs_paused_for_dump.append(ctx) + if not kd in kdamonds_paused_for_dump: + kdamonds_paused_for_dump.append(kd) + if kd in kdamonds_paused_for_dump: + err = kd.commit() + if err is not None: + print('pause fail (%s)' % err) + kdamonds.stop() + exit(1) + status, err = dump_damon_status_dict(kdamonds.kdamonds[0].pid) if err is not None: print(err) kdamonds.stop() exit(1) + # resume contexts paused for safe state dumping + for ctx in ctxs_paused_for_dump: + ctx.pause = False + for kd in kdamonds_paused_for_dump: + err = kd.commit() + if err is not None: + print('resume fail (%s)' % err) + kdamonds.stop() + exit(1) + + # restore for comparison + for ctx in ctxs_paused_for_dump: + ctx.pause = True + ctxs = kdamonds.kdamonds[0].contexts dump = status['contexts'] assert_true(len(ctxs) == len(dump), 'ctxs length', dump) for idx, ctx in enumerate(ctxs): assert_ctx_committed(ctx, dump[idx]) + # restore for the caller + for kd in kdamonds.kdamonds: + for ctx in kd.contexts: + if ctx in ctxs_paused_for_dump: + ctx.pause = False + def main(): kdamonds = _damon_sysfs.Kdamonds( [_damon_sysfs.Kdamond( @@ -309,6 +346,7 @@ def main(): print('kdamond start failed: %s' % 
err) exit(1) kdamonds.kdamonds[0].contexts[0].targets[1].obsolete = True + kdamonds.kdamonds[0].contexts[0].pause = True kdamonds.kdamonds[0].commit() del kdamonds.kdamonds[0].contexts[0].targets[1] assert_ctxs_committed(kdamonds) From d94d0f9c153f8d9a234171d1ff1c48e513254e7a Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Tue, 24 Mar 2026 19:07:09 +0000 Subject: [PATCH 085/321] mm/migrate: rename PAGE_ migration flags to FOLIO_ These flags only track folio-specific state during migration and are not used for movable_ops pages. Rename the enum values and the old_page_state variable to match. No functional change. Link: https://lore.kernel.org/20260324190706.964555-4-shivankg@amd.com Signed-off-by: Shivank Garg Suggested-by: David Hildenbrand Acked-by: David Hildenbrand (Arm) Reviewed-by: Zi Yan Reviewed-by: Baolin Wang Reviewed-by: Lance Yang Reviewed-by: Huang Ying Cc: Alistair Popple Cc: Byungchul Park Cc: Gregory Price Cc: Joshua Hahn Cc: Matthew Brost Cc: Matthew Wilcox (Oracle) Cc: Rakie Kim Cc: Shivank Garg Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/migrate.c | 48 +++++++++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 8a64291ab5b4..0c6a0ab6ecce 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1135,26 +1135,24 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, * This is safe because nobody is using it except us. 
*/ enum { - PAGE_WAS_MAPPED = BIT(0), - PAGE_WAS_MLOCKED = BIT(1), - PAGE_OLD_STATES = PAGE_WAS_MAPPED | PAGE_WAS_MLOCKED, + FOLIO_WAS_MAPPED = BIT(0), + FOLIO_WAS_MLOCKED = BIT(1), + FOLIO_OLD_STATES = FOLIO_WAS_MAPPED | FOLIO_WAS_MLOCKED, }; static void __migrate_folio_record(struct folio *dst, - int old_page_state, - struct anon_vma *anon_vma) + int old_folio_state, struct anon_vma *anon_vma) { - dst->private = (void *)anon_vma + old_page_state; + dst->private = (void *)anon_vma + old_folio_state; } static void __migrate_folio_extract(struct folio *dst, - int *old_page_state, - struct anon_vma **anon_vmap) + int *old_folio_state, struct anon_vma **anon_vmap) { unsigned long private = (unsigned long)dst->private; - *anon_vmap = (struct anon_vma *)(private & ~PAGE_OLD_STATES); - *old_page_state = private & PAGE_OLD_STATES; + *anon_vmap = (struct anon_vma *)(private & ~FOLIO_OLD_STATES); + *old_folio_state = private & FOLIO_OLD_STATES; dst->private = NULL; } @@ -1209,7 +1207,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, { struct folio *dst; int rc = -EAGAIN; - int old_page_state = 0; + int old_folio_state = 0; struct anon_vma *anon_vma = NULL; bool locked = false; bool dst_locked = false; @@ -1253,7 +1251,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, } locked = true; if (folio_test_mlocked(src)) - old_page_state |= PAGE_WAS_MLOCKED; + old_folio_state |= FOLIO_WAS_MLOCKED; if (folio_test_writeback(src)) { /* @@ -1302,7 +1300,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, dst_locked = true; if (unlikely(page_has_movable_ops(&src->page))) { - __migrate_folio_record(dst, old_page_state, anon_vma); + __migrate_folio_record(dst, old_folio_state, anon_vma); return 0; } @@ -1328,11 +1326,11 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, VM_BUG_ON_FOLIO(folio_test_anon(src) && !folio_test_ksm(src) && !anon_vma, src); try_to_migrate(src, mode == MIGRATE_ASYNC ? 
TTU_BATCH_FLUSH : 0); - old_page_state |= PAGE_WAS_MAPPED; + old_folio_state |= FOLIO_WAS_MAPPED; } if (!folio_mapped(src)) { - __migrate_folio_record(dst, old_page_state, anon_vma); + __migrate_folio_record(dst, old_folio_state, anon_vma); return 0; } @@ -1344,7 +1342,7 @@ out: if (rc == -EAGAIN) ret = NULL; - migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED, + migrate_folio_undo_src(src, old_folio_state & FOLIO_WAS_MAPPED, anon_vma, locked, ret); migrate_folio_undo_dst(dst, dst_locked, put_new_folio, private); @@ -1358,13 +1356,13 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, struct list_head *ret) { int rc; - int old_page_state = 0; + int old_folio_state = 0; struct anon_vma *anon_vma = NULL; bool src_deferred_split = false; bool src_partially_mapped = false; struct list_head *prev; - __migrate_folio_extract(dst, &old_page_state, &anon_vma); + __migrate_folio_extract(dst, &old_folio_state, &anon_vma); prev = dst->lru.prev; list_del(&dst->lru); @@ -1404,10 +1402,10 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, * isolated from the unevictable LRU: but this case is the easiest. 
*/ folio_add_lru(dst); - if (old_page_state & PAGE_WAS_MLOCKED) + if (old_folio_state & FOLIO_WAS_MLOCKED) lru_add_drain(); - if (old_page_state & PAGE_WAS_MAPPED) + if (old_folio_state & FOLIO_WAS_MAPPED) remove_migration_ptes(src, dst, 0); out_unlock_both: @@ -1439,11 +1437,11 @@ out: */ if (rc == -EAGAIN) { list_add(&dst->lru, prev); - __migrate_folio_record(dst, old_page_state, anon_vma); + __migrate_folio_record(dst, old_folio_state, anon_vma); return rc; } - migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED, + migrate_folio_undo_src(src, old_folio_state & FOLIO_WAS_MAPPED, anon_vma, true, ret); migrate_folio_undo_dst(dst, true, put_new_folio, private); @@ -1777,11 +1775,11 @@ static void migrate_folios_undo(struct list_head *src_folios, dst = list_first_entry(dst_folios, struct folio, lru); dst2 = list_next_entry(dst, lru); list_for_each_entry_safe(folio, folio2, src_folios, lru) { - int old_page_state = 0; + int old_folio_state = 0; struct anon_vma *anon_vma = NULL; - __migrate_folio_extract(dst, &old_page_state, &anon_vma); - migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED, + __migrate_folio_extract(dst, &old_folio_state, &anon_vma); + migrate_folio_undo_src(folio, old_folio_state & FOLIO_WAS_MAPPED, anon_vma, true, ret_folios); list_del(&dst->lru); migrate_folio_undo_dst(dst, true, put_new_folio, private); From 838376c60df0f28b5b3659a3ef649f07d0eeadf6 Mon Sep 17 00:00:00 2001 From: Hui Zhu Date: Wed, 29 Apr 2026 16:42:16 +0800 Subject: [PATCH 086/321] mm/memcontrol: hoist pstatc_pcpu assignment out of CPU loop In mem_cgroup_alloc(), the assignment of pstatc_pcpu is invariant with respect to the for_each_possible_cpu() loop: both the 'parent' pointer and 'parent->vmstats_percpu' remain constant throughout all iterations. The original code redundantly re-evaluated the 'if (parent)' condition and reassigned pstatc_pcpu on every CPU iteration, then repeated the same ternary check 'parent ? 
pstatc_pcpu : NULL' when storing into statc->parent_pcpu. Move the single conditional assignment of pstatc_pcpu to before the loop, resolving both the loop-invariant placement issue and the duplicated null check. On systems with a large number of possible CPUs, this eliminates repeated branch evaluation with no functional change. No functional change intended. Link: https://lore.kernel.org/20260429084216.186238-1-hui.zhu@linux.dev Signed-off-by: Hui Zhu Reviewed-by: SeongJae Park Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/memcontrol.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 177732fef010..2bc9a7238939 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4002,11 +4002,10 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) if (!memcg1_alloc_events(memcg)) goto fail; + pstatc_pcpu = parent ? parent->vmstats_percpu : NULL; for_each_possible_cpu(cpu) { - if (parent) - pstatc_pcpu = parent->vmstats_percpu; statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); - statc->parent_pcpu = parent ? pstatc_pcpu : NULL; + statc->parent_pcpu = pstatc_pcpu; statc->vmstats = memcg->vmstats; } From b56ca146a2b2750172f91f6db960a37a1a546efd Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 29 Apr 2026 15:57:02 +0530 Subject: [PATCH 087/321] vmalloc: add __GFP_SKIP_KASAN support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "kasan: hw_tags: Disable tagging for stack and page-tables", v4. Stacks and page tables are always accessed with the match-all tag, so assigning a new random tag every time at allocation and setting invalid tag at deallocation time, just adds overhead without improving the detection. 
With __GFP_SKIP_KASAN the page keeps its poison tag and KASAN_TAG_KERNEL (match-all tag) is stored in the page flags while keeping the poison tag in the hardware. The benefit of it is that 256 tag-setting instructions per 4 kB page aren't needed at allocation and deallocation time. Thus match-all pointers still work, while non-match tags (other than poison tag) still fault. __GFP_SKIP_KASAN only skips for KASAN_HW_TAGS mode, so coverage is unchanged. Benchmark: The benchmark has two modes. In thread mode, the child process forks and creates N threads. In pgtable mode, the parent maps and faults a specified memory size and then forks repeatedly with children exiting immediately. Thread benchmark: 2000 iterations, 2000 threads: 2.575 s → 2.229 s (~13.4% faster) The pgtable samples: - 2048 MB, 2000 iters 19.08 s → 17.62 s (~7.6% faster) This patch (of 3): For allocations that will be accessed only with match-all pointers (e.g., kernel stacks), setting tags is wasted work. If the caller already set __GFP_SKIP_KASAN, skip tag setting of vmalloc pages. Before this patch, __GFP_SKIP_KASAN wasn't being used with vmalloc APIs. So it wasn't being checked. Now it is being checked and acted upon. Other KASAN modes are unchanged because __GFP_SKIP_KASAN is ignored for them in the page allocator, and in vmalloc too we ignore this flag for them. This is a preparatory patch for optimizing kernel stack allocations. 
Link: https://lore.kernel.org/20260429102704.680174-1-dev.jain@arm.com Link: https://lore.kernel.org/20260429102704.680174-2-dev.jain@arm.com Signed-off-by: Muhammad Usama Anjum Co-developed-by: Ryan Roberts Signed-off-by: Ryan Roberts Co-developed-by: Dev Jain Signed-off-by: Dev Jain Reviewed-by: Catalin Marinas Cc: Arnd Bergmann Cc: Ben Segall Cc: David Hildenbrand Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: Juri Lelli Cc: Kees Cook Cc: K Prateek Nayak Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mathieu Desnoyers Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: "Uladzislau Rezki (Sony)" Cc: Valentin Schneider Cc: Vincent Guittot Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/gfp_types.h | 6 +++--- mm/vmalloc.c | 13 +++++++++---- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index cd4972a7c97c..54ca0c88bab6 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -281,9 +281,9 @@ enum { * * %__GFP_SKIP_KASAN makes KASAN skip unpoisoning on page allocation. * Used for userspace and vmalloc pages; the latter are unpoisoned by - * kasan_unpoison_vmalloc instead. For userspace pages, results in - * poisoning being skipped as well, see should_skip_kasan_poison for - * details. Only effective in HW_TAGS mode. + * kasan_unpoison_vmalloc instead. If passed to vmalloc, kasan_unpoison_vmalloc + * is skipped too. For userspace pages, results in poisoning being skipped as + * well, see should_skip_kasan_poison for details. Only effective in HW_TAGS mode. 
*/ #define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) #define __GFP_COMP ((__force gfp_t)___GFP_COMP) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 99fce4f9f6e4..eabb86b13b7e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3933,7 +3933,7 @@ fail: __GFP_NOFAIL | __GFP_ZERO |\ __GFP_NORETRY | __GFP_RETRY_MAYFAIL |\ GFP_NOFS | GFP_NOIO | GFP_KERNEL_ACCOUNT |\ - GFP_USER | __GFP_NOLOCKDEP) + GFP_USER | __GFP_NOLOCKDEP | __GFP_SKIP_KASAN) static gfp_t vmalloc_fix_flags(gfp_t flags) { @@ -3974,6 +3974,9 @@ static gfp_t vmalloc_fix_flags(gfp_t flags) * * %__GFP_NOWARN can be used to suppress failure messages. * + * %__GFP_SKIP_KASAN can be used to skip unpoisoning of mapped pages + * (when prot=%PAGE_KERNEL). + * * Can not be called from interrupt nor NMI contexts. * Return: the address of the area or %NULL on failure */ @@ -3987,6 +3990,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE; unsigned long original_align = align; unsigned int shift = PAGE_SHIFT; + bool skip_vmalloc_kasan = kasan_hw_tags_enabled() && (gfp_mask & __GFP_SKIP_KASAN); if (WARN_ON_ONCE(!size)) return NULL; @@ -4017,7 +4021,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, again: area = __get_vm_area_node(size, align, shift, VM_ALLOC | VM_UNINITIALIZED | vm_flags, start, end, node, - gfp_mask, caller); + gfp_mask & ~__GFP_SKIP_KASAN, caller); if (!area) { bool nofail = gfp_mask & __GFP_NOFAIL; warn_alloc(gfp_mask, NULL, @@ -4035,7 +4039,7 @@ again: * kasan_unpoison_vmalloc(). */ if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) { - if (kasan_hw_tags_enabled()) { + if (kasan_hw_tags_enabled() && !skip_vmalloc_kasan) { /* * Modify protection bits to allow tagging. * This must be done before mapping. @@ -4072,7 +4076,8 @@ again: (gfp_mask & __GFP_SKIP_ZERO)) kasan_flags |= KASAN_VMALLOC_INIT; /* KASAN_VMALLOC_PROT_NORMAL already set if required. 
*/ - area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags); + if (!skip_vmalloc_kasan) + area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags); /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED From 6ae51adb084a9d87a8b9501d2231e20271dece87 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 29 Apr 2026 15:57:03 +0530 Subject: [PATCH 088/321] kasan: skip HW tagging for all kernel thread stacks HW-tag KASAN never checks kernel stacks because stack pointers carry the match-all tag, so setting/poisoning tags is pure overhead. - Add __GFP_SKIP_KASAN to THREADINFO_GFP so every stack allocator that uses it skips tagging (fork path plus arch users) - Add __GFP_SKIP_KASAN to GFP_VMAP_STACK for the fork-specific vmap stacks. - When reusing cached vmap stacks, skip kasan_unpoison_range() if HW tags are enabled. Software KASAN is unchanged; this only affects tag-based KASAN. Link: https://lore.kernel.org/20260429102704.680174-3-dev.jain@arm.com Signed-off-by: Muhammad Usama Anjum Signed-off-by: Dev Jain Reviewed-by: Catalin Marinas Cc: Arnd Bergmann Cc: Ben Segall Cc: David Hildenbrand (Arm) Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: Juri Lelli Cc: Kees Cook Cc: K Prateek Nayak Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mathieu Desnoyers Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: "Uladzislau Rezki (Sony)" Cc: Valentin Schneider Cc: Vincent Guittot Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/thread_info.h | 2 +- kernel/fork.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 051e42902690..307b8390fc67 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -92,7 +92,7 @@ static inline long set_restart_fn(struct restart_block *restart, #define THREAD_ALIGN THREAD_SIZE #endif -#define THREADINFO_GFP 
(GFP_KERNEL_ACCOUNT | __GFP_ZERO) +#define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_SKIP_KASAN) /* * flag set/clear/test wrappers diff --git a/kernel/fork.c b/kernel/fork.c index 8ac38beae360..ec6a120291e5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -204,7 +204,7 @@ static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); * accounting is performed by the code assigning/releasing stacks to tasks. * We need a zeroed memory without __GFP_ACCOUNT. */ -#define GFP_VMAP_STACK (GFP_KERNEL | __GFP_ZERO) +#define GFP_VMAP_STACK (GFP_KERNEL | __GFP_ZERO | __GFP_SKIP_KASAN) struct vm_stack { struct rcu_head rcu; @@ -342,7 +342,8 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) } /* Reset stack metadata. */ - kasan_unpoison_range(vm_area->addr, THREAD_SIZE); + if (!kasan_hw_tags_enabled()) + kasan_unpoison_range(vm_area->addr, THREAD_SIZE); stack = kasan_reset_tag(vm_area->addr); From d46644af7636c4cb876110c8ff7f1efbbb815bfe Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 29 Apr 2026 15:57:04 +0530 Subject: [PATCH 089/321] mm: skip KASAN tagging for page-allocated page tables Page tables are always accessed via the linear mapping with a match-all tag, so HW-tag KASAN never checks them. For page-allocated tables (PTEs and PGDs etc), avoid the tag setup and poisoning overhead by using __GFP_SKIP_KASAN. SLUB-backed page tables are unchanged for now. (They aren't widely used and require more SLUB related skip logic. Leave it later.) 
Link: https://lore.kernel.org/20260429102704.680174-4-dev.jain@arm.com Signed-off-by: Muhammad Usama Anjum Signed-off-by: Dev Jain Reviewed-by: Ryan Roberts Reviewed-by: Catalin Marinas Acked-by: David Hildenbrand (Arm) Cc: Arnd Bergmann Cc: Ben Segall Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: Juri Lelli Cc: Kees Cook Cc: K Prateek Nayak Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mathieu Desnoyers Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: "Uladzislau Rezki (Sony)" Cc: Valentin Schneider Cc: Vincent Guittot Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/asm-generic/pgalloc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 57137d3ac159..051aa1331051 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -4,7 +4,7 @@ #ifdef CONFIG_MMU -#define GFP_PGTABLE_KERNEL (GFP_KERNEL | __GFP_ZERO) +#define GFP_PGTABLE_KERNEL (GFP_KERNEL | __GFP_ZERO | __GFP_SKIP_KASAN) #define GFP_PGTABLE_USER (GFP_PGTABLE_KERNEL | __GFP_ACCOUNT) /** From 70d8797c15d640982365e96e34e93a3aa38e82da Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 28 Apr 2026 21:12:23 -0700 Subject: [PATCH 090/321] mm/damon: introduce damon_set_region_system_rams_default() Patch series "mm/damon/reclaim,lru_sort: monitor all system rams by default". DAMON_RECLAIM and DAMON_LRU_SORT set the biggest 'System RAM' resource of the system as the default monitoring target address range. The main intention behind the design is to minimize the overhead coming from monitoring of non-System RAM areas. This could result in an odd setup when there are multiple discrete System RAMs of considerable sizes. For example, there are System RAMs each having 500 GiB size. In this case, only the first 500 GiB will be set as the monitoring region by default. This is particularly common on NUMA systems. 
Hence the modules allow users to set the monitoring target address range using the module parameters if the default setup doesn't work for them. In other words, the current design trades ease of setup for lower overhead. However, because DAMON utilizes the sampling based access check and the adaptive regions adjustment mechanisms, the overhead from the monitoring of non-System RAM areas should be negligible in most setups. Meanwhile, the setup complexity is causing real headaches for users who need to run those modules on various types of systems. That is, the current tradeoff is not a good deal. Set the physical address range that can cover all System RAM areas of the system as the default monitoring regions for DAMON_RECLAIM and DAMON_LRU_SORT. Technically speaking, this is changing documented behavior. However, it makes no sense to believe there is a real use case that really depends on the old weird default behavior. If the old default behavior was working for them in a reasonable way, this change will only add a negligible amount of monitoring overhead. If it didn't work, the users may already be using manual monitoring regions setup, and they will not be affected by this change. Patches Sequence ================ Patch 1 introduces a new core function that will be used for the new default monitoring target region setup. Patches 2 and 3 update DAMON_RECLAIM and DAMON_LRU_SORT to use the new function instead of the old one, respectively. Patch 4 removes the old core function that was replaced by the new one, as there is no more user of it. Patch 5 updates DAMON_STAT to use the new one instead of its in-house nearly-duplicate self implementation of the functionality. Finally patches 6 and 7 update the DAMON_RECLAIM and DAMON_LRU_SORT user documentation for the new behaviors, respectively. This patch (of 7): damon_set_region_biggest_system_ram_default() sets the monitoring target region as the caller requested. 
If the caller didn't specify the region, it finds the biggest System RAM of the system and sets it as the target region. When there are more than one considerable size of System RAM resources in the system, the default target setup makes no sense. Introduce a variant, namely damon_set_region_system_rams_default(). It sets a physical address range that covers all System RAM resources as the default target region. Link: https://lore.kernel.org/20260429041232.90257-1-sj@kernel.org Link: https://lore.kernel.org/20260429041232.90257-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 5 +++ mm/damon/core.c | 79 +++++++++++++++++++++++++++++++++++++++---- 2 files changed, 78 insertions(+), 6 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index f2370a3a4a9a..f656908b2d38 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -1010,6 +1010,11 @@ int damon_kdamond_pid(struct damon_ctx *ctx); int damon_call(struct damon_ctx *ctx, struct damon_call_control *control); int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control); +int damon_set_region_system_rams_default(struct damon_target *t, + unsigned long *start, unsigned long *end, + unsigned long addr_unit, + unsigned long min_region_sz); + int damon_set_region_biggest_system_ram_default(struct damon_target *t, unsigned long *start, unsigned long *end, unsigned long addr_unit, diff --git a/mm/damon/core.c b/mm/damon/core.c index 05e4bef367db..980a31cd3498 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -3328,14 +3328,20 @@ done: return 0; } -static int walk_system_ram(struct resource *res, void *arg) -{ - struct resource *a = arg; +struct damon_system_ram_range_walk_arg { + bool walked; + struct resource res; +}; - if (resource_size(a) < resource_size(res)) { - 
a->start = res->start; - a->end = res->end; +static int damon_system_ram_walk_fn(struct resource *res, void *arg) +{ + struct damon_system_ram_range_walk_arg *a = arg; + + if (!a->walked) { + a->walked = true; + a->res.start = res->start; } + a->res.end = res->end; return 0; } @@ -3352,6 +3358,67 @@ static unsigned long damon_res_to_core_addr(resource_size_t ra, return ra / addr_unit; } +static bool damon_find_system_rams_range(unsigned long *start, + unsigned long *end, unsigned long addr_unit) +{ + struct damon_system_ram_range_walk_arg arg = {}; + + walk_system_ram_res(0, -1, &arg, damon_system_ram_walk_fn); + if (!arg.walked) + return false; + *start = damon_res_to_core_addr(arg.res.start, addr_unit); + *end = damon_res_to_core_addr(arg.res.end + 1, addr_unit); + if (*end <= *start) + return false; + return true; +} + +/** + * damon_set_region_system_rams_default() - Set the region of the given + * monitoring target as requested, or to cover all 'System RAM' resources. + * @t: The monitoring target to set the region. + * @start: The pointer to the start address of the region. + * @end: The pointer to the end address of the region. + * @addr_unit: The address unit for the damon_ctx of @t. + * @min_region_sz: Minimum region size. + * + * This function sets the region of @t as requested by @start and @end. If the + * values of @start and @end are zero, however, this function finds 'System + * RAM' resources and sets the region to cover all the resource. In the latter + * case, this function saves the start and the end addresseses of the first and + * the last resources in @start and @end, respectively. + * + * Return: 0 on success, negative error code otherwise. 
+ */ +int damon_set_region_system_rams_default(struct damon_target *t, + unsigned long *start, unsigned long *end, + unsigned long addr_unit, unsigned long min_region_sz) +{ + struct damon_addr_range addr_range; + + if (*start > *end) + return -EINVAL; + + if (!*start && !*end && + !damon_find_system_rams_range(start, end, addr_unit)) + return -EINVAL; + + addr_range.start = *start; + addr_range.end = *end; + return damon_set_regions(t, &addr_range, 1, min_region_sz); +} + +static int walk_system_ram(struct resource *res, void *arg) +{ + struct resource *a = arg; + + if (resource_size(a) < resource_size(res)) { + a->start = res->start; + a->end = res->end; + } + return 0; +} + /* * Find biggest 'System RAM' resource and store its start and end address in * @start and @end, respectively. If no System RAM is found, returns false. From 99976875c9e59b975c85d73386d76944ce74f598 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 28 Apr 2026 21:12:24 -0700 Subject: [PATCH 091/321] mm/damon/reclaim: cover all system rams DAMON_RECLAIM allows users to set the physical address range to monitor and do the work on. When users don't explicitly set the range, the biggest System RAM resource of the system is selected as the monitoring target address range. The intention was to reduce the overhead from monitoring non-System RAM areas because monitoring of non-System RAM may be meaningless. However, because of the sampling based access check and adaptive regions adjustment, the overhead should be negligible. It makes more sense to just cover all system rams of the system. Do so. Link: https://lore.kernel.org/20260429041232.90257-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index b330ff169590..a60ee800d63e 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -113,7 +113,8 @@ DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_reclaim_mon_attrs); * Start of the target memory region in physical address. * * The start physical address of memory region that DAMON_RECLAIM will do work - * against. By default, biggest System RAM is used as the region. + * against. By default, the system's entire physical memory is used as the + * region. */ static unsigned long monitor_region_start __read_mostly; module_param(monitor_region_start, ulong, 0600); @@ -122,7 +123,8 @@ module_param(monitor_region_start, ulong, 0600); * End of the target memory region in physical address. * * The end physical address of memory region that DAMON_RECLAIM will do work - * against. By default, biggest System RAM is used as the region. + * against. By default, the system's entire physical memory is used as the + * region. 
*/ static unsigned long monitor_region_end __read_mostly; module_param(monitor_region_end, ulong, 0600); @@ -232,11 +234,9 @@ static int damon_reclaim_apply_parameters(void) damos_add_filter(scheme, filter); } - err = damon_set_region_biggest_system_ram_default(param_target, - &monitor_region_start, - &monitor_region_end, - param_ctx->addr_unit, - param_ctx->min_region_sz); + err = damon_set_region_system_rams_default(param_target, + &monitor_region_start, &monitor_region_end, + param_ctx->addr_unit, param_ctx->min_region_sz); if (err) goto out; err = damon_commit_ctx(ctx, param_ctx); From e17741ad08451e652924abe6277362d2ae19dd4a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 28 Apr 2026 21:12:25 -0700 Subject: [PATCH 092/321] mm/damon/lru_sort: cover all system rams DAMON_LRU_SORT allows users to set the physical address range to monitor and do the work on. When users don't explicitly set the range, the biggest system ram resource of the system is selected as the monitoring target address range. The intention was to reduce the overhead from monitoring non-System RAM areas because monitoring non-System RAM may be meaningless. However, because of the sampling based access check and adaptive regions adjustment, the overhead should be negligible. It makes more sense to just cover all system rams of the system. Do so. Link: https://lore.kernel.org/20260429041232.90257-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 7569e471160a..2eb559d913b6 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -139,7 +139,8 @@ DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_lru_sort_mon_attrs); * Start of the target memory region in physical address. * * The start physical address of memory region that DAMON_LRU_SORT will do work - * against. By default, biggest System RAM is used as the region. + * against. By default, the system's entire physical memory is used as the + * region. */ static unsigned long monitor_region_start __read_mostly; module_param(monitor_region_start, ulong, 0600); @@ -148,7 +149,8 @@ module_param(monitor_region_start, ulong, 0600); * End of the target memory region in physical address. * * The end physical address of memory region that DAMON_LRU_SORT will do work - * against. By default, biggest System RAM is used as the region. + * against. By default, the system's entire physical memory is used as the + * region. */ static unsigned long monitor_region_end __read_mostly; module_param(monitor_region_end, ulong, 0600); @@ -326,7 +328,7 @@ static int damon_lru_sort_apply_parameters(void) if (err) goto out; - err = damon_set_region_biggest_system_ram_default(param_target, + err = damon_set_region_system_rams_default(param_target, &monitor_region_start, &monitor_region_end, param_ctx->addr_unit, From 3a870b43776c0c9740a087eb0d831cd6cb8016f7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 28 Apr 2026 21:12:26 -0700 Subject: [PATCH 093/321] mm/damon/core: remove damon_set_region_biggest_system_ram_default() Now nobody is using damon_set_region_biggest_system_ram_default(). Remove it. 
Link: https://lore.kernel.org/20260429041232.90257-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 5 ---- mm/damon/core.c | 64 ------------------------------------------- 2 files changed, 69 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index f656908b2d38..c7a31572689b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -1015,11 +1015,6 @@ int damon_set_region_system_rams_default(struct damon_target *t, unsigned long addr_unit, unsigned long min_region_sz); -int damon_set_region_biggest_system_ram_default(struct damon_target *t, - unsigned long *start, unsigned long *end, - unsigned long addr_unit, - unsigned long min_region_sz); - #endif /* CONFIG_DAMON */ #endif /* _DAMON_H */ diff --git a/mm/damon/core.c b/mm/damon/core.c index 980a31cd3498..9f38deddcb30 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -3408,70 +3408,6 @@ int damon_set_region_system_rams_default(struct damon_target *t, return damon_set_regions(t, &addr_range, 1, min_region_sz); } -static int walk_system_ram(struct resource *res, void *arg) -{ - struct resource *a = arg; - - if (resource_size(a) < resource_size(res)) { - a->start = res->start; - a->end = res->end; - } - return 0; -} - -/* - * Find biggest 'System RAM' resource and store its start and end address in - * @start and @end, respectively. If no System RAM is found, returns false. 
- */ -static bool damon_find_biggest_system_ram(unsigned long *start, - unsigned long *end, unsigned long addr_unit) - -{ - struct resource res = {}; - - walk_system_ram_res(0, -1, &res, walk_system_ram); - *start = damon_res_to_core_addr(res.start, addr_unit); - *end = damon_res_to_core_addr(res.end + 1, addr_unit); - if (*end <= *start) - return false; - return true; -} - -/** - * damon_set_region_biggest_system_ram_default() - Set the region of the given - * monitoring target as requested, or biggest 'System RAM'. - * @t: The monitoring target to set the region. - * @start: The pointer to the start address of the region. - * @end: The pointer to the end address of the region. - * @addr_unit: The address unit for the damon_ctx of @t. - * @min_region_sz: Minimum region size. - * - * This function sets the region of @t as requested by @start and @end. If the - * values of @start and @end are zero, however, this function finds the biggest - * 'System RAM' resource and sets the region to cover the resource. In the - * latter case, this function saves the start and end addresses of the resource - * in @start and @end, respectively. - * - * Return: 0 on success, negative error code otherwise. - */ -int damon_set_region_biggest_system_ram_default(struct damon_target *t, - unsigned long *start, unsigned long *end, - unsigned long addr_unit, unsigned long min_region_sz) -{ - struct damon_addr_range addr_range; - - if (*start > *end) - return -EINVAL; - - if (!*start && !*end && - !damon_find_biggest_system_ram(start, end, addr_unit)) - return -EINVAL; - - addr_range.start = *start; - addr_range.end = *end; - return damon_set_regions(t, &addr_range, 1, min_region_sz); -} - /* * damon_moving_sum() - Calculate an inferred moving sum value. * @mvsum: Inferred sum of the last @len_window values. 
From 122dff8c22eafcdb3adeaf7bdf1c63adeb9457e2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 28 Apr 2026 21:12:27 -0700 Subject: [PATCH 094/321] mm/damon/stat: use damon_set_region_system_rams_default() damon_stat_set_monitoring_region() is nearly a duplicate of the core function, damon_set_region_system_rams_default(). Use the core implementation. Link: https://lore.kernel.org/20260429041232.90257-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/stat.c | 53 +++---------------------------------------------- 1 file changed, 3 insertions(+), 50 deletions(-) diff --git a/mm/damon/stat.c b/mm/damon/stat.c index 3951b762cbdd..f4d3203e9263 100644 --- a/mm/damon/stat.c +++ b/mm/damon/stat.c @@ -148,59 +148,12 @@ static int damon_stat_damon_call_fn(void *data) return 0; } -struct damon_stat_system_ram_range_walk_arg { - bool walked; - struct resource res; -}; - -static int damon_stat_system_ram_walk_fn(struct resource *res, void *arg) -{ - struct damon_stat_system_ram_range_walk_arg *a = arg; - - if (!a->walked) { - a->walked = true; - a->res.start = res->start; - } - a->res.end = res->end; - return 0; -} - -static unsigned long damon_stat_res_to_core_addr(resource_size_t ra, - unsigned long addr_unit) -{ - /* - * Use div_u64() for avoiding linking errors related with __udivdi3, - * __aeabi_uldivmod, or similar problems. This should also improve the - * performance optimization (read div_u64() comment for the detail). 
- */ - if (sizeof(ra) == 8 && sizeof(addr_unit) == 4) - return div_u64(ra, addr_unit); - return ra / addr_unit; -} - -static int damon_stat_set_monitoring_region(struct damon_target *t, - unsigned long addr_unit, unsigned long min_region_sz) -{ - struct damon_addr_range addr_range; - struct damon_stat_system_ram_range_walk_arg arg = {}; - - walk_system_ram_res(0, -1, &arg, damon_stat_system_ram_walk_fn); - if (!arg.walked) - return -EINVAL; - addr_range.start = damon_stat_res_to_core_addr( - arg.res.start, addr_unit); - addr_range.end = damon_stat_res_to_core_addr( - arg.res.end + 1, addr_unit); - if (addr_range.end <= addr_range.start) - return -EINVAL; - return damon_set_regions(t, &addr_range, 1, min_region_sz); -} - static struct damon_ctx *damon_stat_build_ctx(void) { struct damon_ctx *ctx; struct damon_attrs attrs; struct damon_target *target; + unsigned long start = 0, end = 0; ctx = damon_new_ctx(); if (!ctx) @@ -230,8 +183,8 @@ static struct damon_ctx *damon_stat_build_ctx(void) if (!target) goto free_out; damon_add_target(ctx, target); - if (damon_stat_set_monitoring_region(target, ctx->addr_unit, - ctx->min_region_sz)) + if (damon_set_region_system_rams_default(target, &start, &end, + ctx->addr_unit, ctx->min_region_sz)) goto free_out; return ctx; free_out: From 2262a915615ba308a87e8cf05acf1b16c01ca04b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 28 Apr 2026 21:12:28 -0700 Subject: [PATCH 095/321] Docs/admin-guide/mm/damon/reclaim: update for entire memory monitoring Update DAMON_RECLAIM usage document for the changed default monitoring target region selection. Link: https://lore.kernel.org/20260429041232.90257-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/reclaim.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst index 01a34c215b66..57ab8b187650 100644 --- a/Documentation/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -229,7 +229,8 @@ Start of target memory region in physical address. The start physical address of memory region that DAMON_RECLAIM will do work against. That is, DAMON_RECLAIM will find cold memory regions in this region -and reclaims. By default, biggest System RAM is used as the region. +and reclaims. By default, the system's entire physical memory is used as the +region. monitor_region_end ------------------ @@ -238,7 +239,8 @@ End of target memory region in physical address. The end physical address of memory region that DAMON_RECLAIM will do work against. That is, DAMON_RECLAIM will find cold memory regions in this region -and reclaims. By default, biggest System RAM is used as the region. +and reclaims. By default, the system's entire physical memory is used as the +region. addr_unit --------- From 77289dcfa973d4a9984abaa2093e739038e1d94d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 28 Apr 2026 21:12:29 -0700 Subject: [PATCH 096/321] Docs/admin-guide/mm/damon/lru_sort: update for entire memory monitoring Update DAMON_LRU_SORT usage document for the changed default monitoring target region selection. Link: https://lore.kernel.org/20260429041232.90257-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/lru_sort.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/lru_sort.rst b/Documentation/admin-guide/mm/damon/lru_sort.rst index 25e2f042a383..b93ca9b0853d 100644 --- a/Documentation/admin-guide/mm/damon/lru_sort.rst +++ b/Documentation/admin-guide/mm/damon/lru_sort.rst @@ -246,7 +246,8 @@ monitor_region_start Start of target memory region in physical address. The start physical address of memory region that DAMON_LRU_SORT will do work -against. By default, biggest System RAM is used as the region. +against. By default, the system's entire physical memory is used as the +region. monitor_region_end ------------------ @@ -254,7 +255,8 @@ monitor_region_end End of target memory region in physical address. The end physical address of memory region that DAMON_LRU_SORT will do work -against. By default, biggest System RAM is used as the region. +against. By default, the system's entire physical memory is used as the +region. addr_unit --------- From 9f7ff45e99d322077af7f53f4a0a2b0907816531 Mon Sep 17 00:00:00 2001 From: Vineet Agarwal Date: Wed, 29 Apr 2026 17:28:16 +0530 Subject: [PATCH 097/321] selftests/mm: khugepaged: initialize file contents via mmap file_setup_area() currently allocates anonymous memory, fills it, and writes it into the backing file used for collapse testing. Instead of copying data through write(), resize the file with ftruncate(), map it directly with MAP_SHARED, and initialize the mapped area in place. This simplifies the setup path and avoids the need for explicit partial write handling. 
Link: https://lore.kernel.org/20260429115816.98824-1-agarwal.vineet2006@gmail.com Signed-off-by: Vineet Agarwal Reviewed-by: Zi Yan Tested-by: Zi Yan Acked-by: David Hildenbrand (Arm) Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Lance Yang Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/khugepaged.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c index 3fe7ef04ac62..c8393ca52cab 100644 --- a/tools/testing/selftests/mm/khugepaged.c +++ b/tools/testing/selftests/mm/khugepaged.c @@ -373,7 +373,7 @@ static void *file_setup_area(int nr_hpages) unlink(finfo.path); /* Cleanup from previous failed tests */ printf("Creating %s for collapse%s...", finfo.path, finfo.type == VMA_SHMEM ? " (tmpfs)" : ""); - fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL, + fd = open(finfo.path, O_CREAT | O_RDWR | O_TRUNC | O_EXCL, 777); if (fd < 0) { perror("open()"); @@ -381,9 +381,21 @@ static void *file_setup_area(int nr_hpages) } size = nr_hpages * hpage_pmd_size; - p = alloc_mapping(nr_hpages); + if (ftruncate(fd, size)) { + perror("ftruncate()"); + exit(EXIT_FAILURE); + } + p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (p != BASE_ADDR) { + perror("mmap()"); + exit(EXIT_FAILURE); + } fill_memory(p, 0, size); - write(fd, p, size); + if (msync(p, size, MS_SYNC)) { + perror("msync()"); + exit(EXIT_FAILURE); + } close(fd); munmap(p, size); success("OK"); From ab3fad1b1cdc7aab95c49f389642c4fb88a4f35e Mon Sep 17 00:00:00 2001 From: Vineet Agarwal Date: Wed, 29 Apr 2026 19:34:34 +0530 Subject: [PATCH 098/321] mm/khugepaged: return -EAGAIN for SCAN_PAGE_HAS_PRIVATE in MADV_COLLAPSE MADV_COLLAPSE uses errno values to provide actionable feedback to 
userspace. Temporary resource constraints are mapped to -EAGAIN so the caller may retry, while intrinsic failures of the specified range are mapped to -EINVAL. collapse_file() returns SCAN_PAGE_HAS_PRIVATE when filemap_release_folio() fails while isolating file-backed folios for collapse. This currently falls through the default case in madvise_collapse_errno() and is reported to userspace as -EINVAL. However, filemap_release_folio() failure commonly reflects temporary folio state rather than a permanently uncollapsible range. For example, ext4 returns false when a folio still has dirty journalled data, btrfs returns false for dirty or writeback folios before extent state release, and NFS may return false while reclaiming filesystem-private folio state. In such cases, retrying MADV_COLLAPSE after writeback, reclaim or journal progress may succeed. This matches the existing -EAGAIN handling for SCAN_PAGE_DIRTY_OR_WRITEBACK and other transient collapse failures more closely than -EINVAL. Therefore, map SCAN_PAGE_HAS_PRIVATE to -EAGAIN so userspace receives retryable feedback for this temporary failure path. 
Link: https://lore.kernel.org/20260429140434.439456-1-agarwal.vineet2006@gmail.com Signed-off-by: Vineet Agarwal Reviewed-by: Dev Jain Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Nico Pache Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 5f4e009593e0..28a843f30b32 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2808,6 +2808,7 @@ static int madvise_collapse_errno(enum scan_result r) case SCAN_PAGE_LRU: case SCAN_DEL_PAGE_LRU: case SCAN_PAGE_FILLED: + case SCAN_PAGE_HAS_PRIVATE: case SCAN_PAGE_DIRTY_OR_WRITEBACK: return -EAGAIN; /* From 7e6cc9f954aa3455cd6ef4dfcbd4102265c30884 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 29 Apr 2026 08:03:05 -0700 Subject: [PATCH 099/321] Docs/admin-guide/mm/damon/usage: mark scheme filters sysfs dir as deprecated Patch series "mm/damon/sysfs: document filters/ directory as deprecated". Commit ab71d2d30121 ("mm/damon/sysfs-schemes: let damon_sysfs_scheme_set_filters() be used for different named directories") introduced alternatives of 'filters' directory, namely core_filters/ and 'ops_filters/ directories. Now the alternatives are well stabilized and ready for all users. All filters/ directory use cases are expected to be able to be migrated to the alternatives. An LTS kernel having the alternatives, namely 6.18.y, is also released. Existence of filters/ directory is only confusing. It would be better not immediately removing the directory, though. There could be users that need time before migrating to the alternatives. There might be unexpected use cases that the alternatives cannot support. Doing the deprecation step by step across multiple years like DAMON debugfs deprecation would be safer. Start the deprecation changes by announcing the deprecation on the documents. 
Every year, one more action toward completely removing the directory will follow, as was done for the DAMON debugfs deprecation. The following yearly actions are currently expected. In 2027, deprecation warning kernel messages will be printed once, for use of the filters/ directory. In 2028, the filters/ directory will be renamed to filters_DEPRECATED/. In 2029, the filters_DEPRECATED/ directory will be removed. This patch (of 2): The alternatives of the 'filters/' directory, namely 'core_filters/' and 'ops_filters/', can fully support all the features of the 'filters/' directory, and provide a better user experience. Having the 'filters/' directory is only confusing to users. Announce it as deprecated on the usage document. Link: https://lore.kernel.org/20260429150309.82282-1-sj@kernel.org Link: https://lore.kernel.org/20260429150309.82282-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index d5548e460857..11c75a598393 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -485,10 +485,10 @@ directory can be used for installing filters regardless of their handled layers. Filters that requested by ``core_filters`` and ``ops_filters`` will be installed before those of ``filters``. All three directories have same files. -Use of ``filters`` directory can make expecting evaluation orders of given -filters with the files under directory bit confusing. Users are hence -recommended to use ``core_filters`` and ``ops_filters`` directories. The -``filters`` directory could be deprecated in future. 
+Use of ``filters`` directory can make filters evaluation orders confusing to +expect. For this reason, ``filters`` directory is deprecated. It is still +functioning, but is scheduled for removal in the near future. Users should use +``core_filters`` and ``ops_filters`` directories instead. In the beginning, the directory has only one file, ``nr_filters``. Writing a number (``N``) to the file creates the number of child directories named ``0`` From 4c53a9fdb6f83f261a6e2d433602ed0189408f82 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 29 Apr 2026 08:03:06 -0700 Subject: [PATCH 100/321] Docs/ABI/damon: mark schemes//filters/ deprecated Now the 'filters/' directory is deprecated. Update ABI document to also announce the fact. Also update the descriptions of the files to be based on 'core_filter/' directory, to make the old descriptions ready to be removed when the time arrives. Link: https://lore.kernel.org/20260429150309.82282-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- .../ABI/testing/sysfs-kernel-mm-damon | 62 ++++++++++--------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index 971c22e34e72..ee29d4e204ff 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -396,15 +396,20 @@ Contact: SeongJae Park Description: Writing to and reading from this file sets and gets the low watermark of the scheme in permil. -What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters/nr_filters -Date: Dec 2022 +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//core_filters +Date: Feb 2025 +Contact: SeongJae Park +Description: Directory for DAMON core layer-handled DAMOS filters. 
+ +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//core_filters/nr_filters +Date: Feb 2025 Contact: SeongJae Park Description: Writing a number 'N' to this file creates the number of directories for setting filters of the scheme named '0' to - 'N-1' under the filters/ directory. + 'N-1' under the core_filters/ directory. -What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//type -Date: Dec 2022 +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//core_filters//type +Date: Feb 2025 Contact: SeongJae Park Description: Writing to and reading from this file sets and gets the type of the memory of the interest. 'anon' for anonymous pages, @@ -412,77 +417,78 @@ Description: Writing to and reading from this file sets and gets the type of 'addr' for address range (an open-ended interval), or 'target' for DAMON monitoring target can be written and read. -What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//memcg_path -Date: Dec 2022 +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//core_filters//memcg_path +Date: Feb 2025 Contact: SeongJae Park Description: If 'memcg' is written to the 'type' file, writing to and reading from this file sets and gets the path to the memory cgroup of the interest. -What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//addr_start -Date: Jul 2023 +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//core_filters//addr_start +Date: Feb 2025 Contact: SeongJae Park Description: If 'addr' is written to the 'type' file, writing to or reading from this file sets or gets the start address of the address range for the filter. 
-What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//addr_end -Date: Jul 2023 +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//core_filters//addr_end +Date: Feb 2025 Contact: SeongJae Park Description: If 'addr' is written to the 'type' file, writing to or reading from this file sets or gets the end address of the address range for the filter. -What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//min +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//core_filters//min Date: Feb 2025 Contact: SeongJae Park Description: If 'hugepage_size' is written to the 'type' file, writing to or reading from this file sets or gets the minimum size of the hugepage for the filter. -What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//max +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//core_filters//max Date: Feb 2025 Contact: SeongJae Park Description: If 'hugepage_size' is written to the 'type' file, writing to or reading from this file sets or gets the maximum size of the hugepage for the filter. -What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//target_idx -Date: Dec 2022 +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//core_filters//target_idx +Date: Feb 2025 Contact: SeongJae Park Description: If 'target' is written to the 'type' file, writing to or reading from this file sets or gets the index of the DAMON monitoring target of the interest. -What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//matching -Date: Dec 2022 +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//core_filters//matching +Date: Feb 2025 Contact: SeongJae Park Description: Writing 'Y' or 'N' to this file sets whether the filter is for the memory of the 'type', or all except the 'type'. 
-What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//allow -Date: Jan 2025 +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//core_filters//allow +Date: Feb 2025 Contact: SeongJae Park Description: Writing 'Y' or 'N' to this file sets whether to allow or reject applying the scheme's action to the memory that satisfies the 'type' and the 'matching' of the directory. -What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//core_filters -Date: Feb 2025 -Contact: SeongJae Park -Description: Directory for DAMON core layer-handled DAMOS filters. Files - under this directory works same to those of - /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters - directory. - What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//ops_filters Date: Feb 2025 Contact: SeongJae Park Description: Directory for DAMON operations set layer-handled DAMOS filters. Files under this directory works same to those of - /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters + /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//core_filters directory. +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters +Date: Dec 2022 +Contact: SeongJae Park +Description: Directory for DAMOS filters. Files under this directory works + same to those of + /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//{core,ops}_filters + directory. This is deprecated. Use the core_filters and + ops_filters instead. + What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//dests/nr_dests Date: Jul 2025 Contact: SeongJae Park From 5ebb2064da361ca860c052bca9ae37962adef3f7 Mon Sep 17 00:00:00 2001 From: Dmitry Ilvokhin Date: Wed, 29 Apr 2026 12:02:06 +0000 Subject: [PATCH 101/321] mm: use zone lock guard in reserve_highatomic_pageblock() Patch series "mm: use spinlock guards for zone lock", v3. 
This series uses spinlock guards for the zone lock across several mm functions to replace explicit lock/unlock patterns with automatic scope-based cleanup. This simplifies the control flow by removing 'flags' variables, goto labels, and redundant unlock calls. Patches are ordered by decreasing value. The first six patches simplify the control flow by removing gotos, multiple unlock paths, or 'ret' variables. The last two are simpler lock/unlock pair conversions that only remove 'flags' and can be dropped if considered unnecessary churn. The binary size increase is +39 bytes, with Peter Zijlstra's fix for guards [1] applied. This is due to the compiler not being able to deduplicate the epilogue and eliminate the redundant NULL check. See discussion [2] for more details. I proposed a patch [3] that fixes this, but until it is merged we need to assume the +39 bytes will stay (though it is compiler dependent). This patch (of 8): Use the spinlock_irqsave zone lock guard in reserve_highatomic_pageblock() to replace the explicit lock/unlock and goto out_unlock pattern with automatic scope-based cleanup. 
Link: https://lore.kernel.org/cover.1777462630.git.d@ilvokhin.com Link: https://lore.kernel.org/3657e1144e2ffc1ca0eb57d57d89bfec4073d8c6.1777462630.git.d@ilvokhin.com Link: https://lore.kernel.org/all/20260309164516.GE606826@noisy.programming.kicks-ass.net/ [1] Link: https://lore.kernel.org/all/afC5C6fylF4AsITV@shell.ilvokhin.com/ [2] Link: https://lore.kernel.org/all/20260427165037.205337-1-d@ilvokhin.com/ [3] Signed-off-by: Dmitry Ilvokhin Suggested-by: Steven Rostedt Acked-by: Michal Hocko Cc: Brendan Jackman Cc: Johannes Weiner Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Cc: David Hildenbrand Cc: Lorenzo Stoakes Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- mm/page_alloc.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d9c6313e69f3..36d37e9ff3b9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3442,7 +3442,7 @@ static void reserve_highatomic_pageblock(struct page *page, int order, struct zone *zone) { int mt; - unsigned long max_managed, flags; + unsigned long max_managed; /* * The number reserved as: minimum is 1 pageblock, maximum is @@ -3456,29 +3456,26 @@ static void reserve_highatomic_pageblock(struct page *page, int order, if (zone->nr_reserved_highatomic >= max_managed) return; - spin_lock_irqsave(&zone->lock, flags); + guard(spinlock_irqsave)(&zone->lock); /* Recheck the nr_reserved_highatomic limit under the lock */ if (zone->nr_reserved_highatomic >= max_managed) - goto out_unlock; + return; /* Yoink! 
*/ mt = get_pageblock_migratetype(page); /* Only reserve normal pageblocks (i.e., they can merge with others) */ if (!migratetype_is_mergeable(mt)) - goto out_unlock; + return; if (order < pageblock_order) { if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1) - goto out_unlock; + return; zone->nr_reserved_highatomic += pageblock_nr_pages; } else { change_pageblock_range(page, order, MIGRATE_HIGHATOMIC); zone->nr_reserved_highatomic += 1 << order; } - -out_unlock: - spin_unlock_irqrestore(&zone->lock, flags); } /* From 3a92b4e99b7429f98625a08f3dd2aea92754aa99 Mon Sep 17 00:00:00 2001 From: Dmitry Ilvokhin Date: Wed, 29 Apr 2026 12:02:07 +0000 Subject: [PATCH 102/321] mm: use zone lock guard in unset_migratetype_isolate() Use spinlock_irqsave zone lock guard in unset_migratetype_isolate() to replace the explicit lock/unlock and goto pattern with automatic scope-based cleanup. Link: https://lore.kernel.org/815c0905ea77828ed32bf56ff0a6d3c6548eb3a2.1777462630.git.d@ilvokhin.com Signed-off-by: Dmitry Ilvokhin Suggested-by: Steven Rostedt Acked-by: Michal Hocko Acked-by: Zi Yan Cc: Brendan Jackman Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Peter Zijlstra Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_isolation.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c48ff5c00244..9d606052dd80 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -223,15 +223,14 @@ static int set_migratetype_isolate(struct page *page, enum pb_isolate_mode mode, static void unset_migratetype_isolate(struct page *page) { struct zone *zone; - unsigned long flags; bool isolated_page = false; unsigned int order; struct page *buddy; zone = page_zone(page); - spin_lock_irqsave(&zone->lock, flags); + guard(spinlock_irqsave)(&zone->lock); if (!is_migrate_isolate_page(page)) - goto out; + return; /* * Because freepage with more than pageblock_order on 
isolated @@ -279,8 +278,6 @@ static void unset_migratetype_isolate(struct page *page) __putback_isolated_page(page, order, get_pageblock_migratetype(page)); } zone->nr_isolate_pageblock--; -out: - spin_unlock_irqrestore(&zone->lock, flags); } static inline struct page * From 055526c21e2dc802389435bc684cf17cdf507909 Mon Sep 17 00:00:00 2001 From: Dmitry Ilvokhin Date: Wed, 29 Apr 2026 12:02:08 +0000 Subject: [PATCH 103/321] mm: use zone lock guard in unreserve_highatomic_pageblock() Use spinlock_irqsave zone lock guard in unreserve_highatomic_pageblock() to replace the explicit lock/unlock pattern with automatic scope-based cleanup. Link: https://lore.kernel.org/69db814cd178915cb5615334a29304678f960963.1777462630.git.d@ilvokhin.com Signed-off-by: Dmitry Ilvokhin Suggested-by: Steven Rostedt Acked-by: Michal Hocko Cc: Brendan Jackman Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Peter Zijlstra Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 36d37e9ff3b9..56ba22e1a816 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3491,7 +3491,6 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, bool force) { struct zonelist *zonelist = ac->zonelist; - unsigned long flags; struct zoneref *z; struct zone *zone; struct page *page; @@ -3508,7 +3507,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, pageblock_nr_pages) continue; - spin_lock_irqsave(&zone->lock, flags); + guard(spinlock_irqsave)(&zone->lock); for (order = 0; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &(zone->free_area[order]); unsigned long size; @@ -3555,12 +3554,9 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * so this should not fail on zone boundaries. 
*/ WARN_ON_ONCE(ret == -1); - if (ret > 0) { - spin_unlock_irqrestore(&zone->lock, flags); + if (ret > 0) return ret; - } } - spin_unlock_irqrestore(&zone->lock, flags); } return false; From feb0df835fde31ba6af7ab2b7b05751cadc97472 Mon Sep 17 00:00:00 2001 From: Dmitry Ilvokhin Date: Wed, 29 Apr 2026 12:02:09 +0000 Subject: [PATCH 104/321] mm: use zone lock guard in set_migratetype_isolate() Use spinlock_irqsave scoped lock guard in set_migratetype_isolate() to replace the explicit lock/unlock pattern with automatic scope-based cleanup. The scoped variant is used to keep dump_page() outside the locked section to avoid a lockdep splat. Link: https://lore.kernel.org/6883351ad7f74d20875fff30e0e3214a089cea97.1777462630.git.d@ilvokhin.com Signed-off-by: Dmitry Ilvokhin Suggested-by: Steven Rostedt Acked-by: Michal Hocko Acked-by: Zi Yan Cc: Brendan Jackman Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Peter Zijlstra Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_isolation.c | 62 ++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 35 deletions(-) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 9d606052dd80..7a9d631945a3 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -167,48 +167,40 @@ static int set_migratetype_isolate(struct page *page, enum pb_isolate_mode mode, { struct zone *zone = page_zone(page); struct page *unmovable; - unsigned long flags; unsigned long check_unmovable_start, check_unmovable_end; if (PageUnaccepted(page)) accept_page(page); - spin_lock_irqsave(&zone->lock, flags); - - /* - * We assume the caller intended to SET migrate type to isolate. - * If it is already set, then someone else must have raced and - * set it before us. - */ - if (is_migrate_isolate_page(page)) { - spin_unlock_irqrestore(&zone->lock, flags); - return -EBUSY; - } - - /* - * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. 
- * We just check MOVABLE pages. - * - * Pass the intersection of [start_pfn, end_pfn) and the page's pageblock - * to avoid redundant checks. - */ - check_unmovable_start = max(page_to_pfn(page), start_pfn); - check_unmovable_end = min(pageblock_end_pfn(page_to_pfn(page)), - end_pfn); - - unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end, - mode); - if (!unmovable) { - if (!pageblock_isolate_and_move_free_pages(zone, page)) { - spin_unlock_irqrestore(&zone->lock, flags); + scoped_guard(spinlock_irqsave, &zone->lock) { + /* + * We assume the caller intended to SET migrate type to + * isolate. If it is already set, then someone else must have + * raced and set it before us. + */ + if (is_migrate_isolate_page(page)) return -EBUSY; - } - zone->nr_isolate_pageblock++; - spin_unlock_irqrestore(&zone->lock, flags); - return 0; - } - spin_unlock_irqrestore(&zone->lock, flags); + /* + * FIXME: Now, memory hotplug doesn't call shrink_slab() by + * itself. We just check MOVABLE pages. + * + * Pass the intersection of [start_pfn, end_pfn) and the page's + * pageblock to avoid redundant checks. + */ + check_unmovable_start = max(page_to_pfn(page), start_pfn); + check_unmovable_end = min(pageblock_end_pfn(page_to_pfn(page)), + end_pfn); + + unmovable = has_unmovable_pages(check_unmovable_start, + check_unmovable_end, mode); + if (!unmovable) { + if (!pageblock_isolate_and_move_free_pages(zone, page)) + return -EBUSY; + zone->nr_isolate_pageblock++; + return 0; + } + } if (mode == PB_ISOLATE_MODE_MEM_OFFLINE) { /* * printk() with zone->lock held will likely trigger a From ee8a0c15c26f6cf67e4e5207cb18e6262d7e886e Mon Sep 17 00:00:00 2001 From: Dmitry Ilvokhin Date: Wed, 29 Apr 2026 12:02:10 +0000 Subject: [PATCH 105/321] mm: use zone lock guard in take_page_off_buddy() Use spinlock_irqsave zone lock guard in take_page_off_buddy() to replace the explicit lock/unlock pattern with automatic scope-based cleanup. 
This also allows returning directly from the loop, removing the 'ret' variable. Link: https://lore.kernel.org/a981721632a981f148c63e3f7df3d1116a0c3f6d.1777462630.git.d@ilvokhin.com Signed-off-by: Dmitry Ilvokhin Suggested-by: Steven Rostedt Acked-by: Michal Hocko Cc: Brendan Jackman Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Peter Zijlstra Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 56ba22e1a816..f5ad74490c5d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7644,11 +7644,9 @@ bool take_page_off_buddy(struct page *page) { struct zone *zone = page_zone(page); unsigned long pfn = page_to_pfn(page); - unsigned long flags; unsigned int order; - bool ret = false; - spin_lock_irqsave(&zone->lock, flags); + guard(spinlock_irqsave)(&zone->lock); for (order = 0; order < NR_PAGE_ORDERS; order++) { struct page *page_head = page - (pfn & ((1 << order) - 1)); int page_order = buddy_order(page_head); @@ -7663,14 +7661,12 @@ bool take_page_off_buddy(struct page *page) break_down_buddy_pages(zone, page_head, page, 0, page_order, migratetype); SetPageHWPoisonTakenOff(page); - ret = true; - break; + return true; } if (page_count(page_head) > 0) break; } - spin_unlock_irqrestore(&zone->lock, flags); - return ret; + return false; } /* From 5e22451096cd65ded8a7550fb324c8e6dc3b2b22 Mon Sep 17 00:00:00 2001 From: Dmitry Ilvokhin Date: Wed, 29 Apr 2026 12:02:11 +0000 Subject: [PATCH 106/321] mm: use zone lock guard in put_page_back_buddy() Use spinlock_irqsave zone lock guard in put_page_back_buddy() to replace the explicit lock/unlock pattern with automatic scope-based cleanup. 
Link: https://lore.kernel.org/b0fceedca37139da36aa626ac72eb9840b641021.1777462630.git.d@ilvokhin.com Signed-off-by: Dmitry Ilvokhin Suggested-by: Steven Rostedt Acked-by: Michal Hocko Cc: Brendan Jackman Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Peter Zijlstra Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f5ad74490c5d..49711916703e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7675,23 +7675,19 @@ bool take_page_off_buddy(struct page *page) bool put_page_back_buddy(struct page *page) { struct zone *zone = page_zone(page); - unsigned long flags; - bool ret = false; - spin_lock_irqsave(&zone->lock, flags); + guard(spinlock_irqsave)(&zone->lock); if (put_page_testzero(page)) { unsigned long pfn = page_to_pfn(page); int migratetype = get_pfnblock_migratetype(page, pfn); ClearPageHWPoisonTakenOff(page); __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE); - if (TestClearPageHWPoison(page)) { - ret = true; - } + if (TestClearPageHWPoison(page)) + return true; } - spin_unlock_irqrestore(&zone->lock, flags); - return ret; + return false; } #endif From 5ad64655dde8e5416fc0fff51a189879fe3235fd Mon Sep 17 00:00:00 2001 From: Dmitry Ilvokhin Date: Wed, 29 Apr 2026 12:02:12 +0000 Subject: [PATCH 107/321] mm: use zone lock guard in free_pcppages_bulk() Use spinlock_irqsave zone lock guard in free_pcppages_bulk() to replace the explicit lock/unlock pattern with automatic scope-based cleanup. 
Link: https://lore.kernel.org/aafc2d660057a91eb40417f8ff4645b0a8c525e2.1777462630.git.d@ilvokhin.com Signed-off-by: Dmitry Ilvokhin Suggested-by: Steven Rostedt Acked-by: Michal Hocko Cc: Brendan Jackman Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Peter Zijlstra Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 49711916703e..8835064aaa8c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1469,7 +1469,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, struct per_cpu_pages *pcp, int pindex) { - unsigned long flags; unsigned int order; struct page *page; @@ -1482,7 +1481,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* Ensure requested pindex is drained first. */ pindex = pindex - 1; - spin_lock_irqsave(&zone->lock, flags); + guard(spinlock_irqsave)(&zone->lock); while (count > 0) { struct list_head *list; @@ -1514,8 +1513,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, trace_mm_page_pcpu_drain(page, order, mt); } while (count > 0 && !list_empty(list)); } - - spin_unlock_irqrestore(&zone->lock, flags); } /* Split a multi-block free page into its individual pageblocks. */ From 95b8e432265f61bd9ecdce07d76be6182289ac2a Mon Sep 17 00:00:00 2001 From: Dmitry Ilvokhin Date: Wed, 29 Apr 2026 12:02:13 +0000 Subject: [PATCH 108/321] mm: use zone lock guard in __offline_isolated_pages() Use spinlock_irqsave zone lock guard in __offline_isolated_pages() to replace the explicit lock/unlock pattern with automatic scope-based cleanup. 
Link: https://lore.kernel.org/13149be4f8151e18eb5f1eb4f3241ab3cffb373e.1777462630.git.d@ilvokhin.com Signed-off-by: Dmitry Ilvokhin Suggested-by: Steven Rostedt Acked-by: Michal Hocko Cc: Brendan Jackman Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Peter Zijlstra Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8835064aaa8c..69a99af77777 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7531,7 +7531,7 @@ void zone_pcp_reset(struct zone *zone) unsigned long __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) { - unsigned long already_offline = 0, flags; + unsigned long already_offline = 0; unsigned long pfn = start_pfn; struct page *page; struct zone *zone; @@ -7539,7 +7539,7 @@ unsigned long __offline_isolated_pages(unsigned long start_pfn, offline_mem_sections(pfn, end_pfn); zone = page_zone(pfn_to_page(pfn)); - spin_lock_irqsave(&zone->lock, flags); + guard(spinlock_irqsave)(&zone->lock); while (pfn < end_pfn) { page = pfn_to_page(pfn); /* @@ -7569,7 +7569,6 @@ unsigned long __offline_isolated_pages(unsigned long start_pfn, del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE); pfn += (1 << order); } - spin_unlock_irqrestore(&zone->lock, flags); return end_pfn - start_pfn - already_offline; } From 0b9c0aeba938aad9964f855df00bf929b83a484d Mon Sep 17 00:00:00 2001 From: fujunjie Date: Tue, 28 Apr 2026 01:59:43 +0000 Subject: [PATCH 109/321] mm/filemap: count only the faulting address as a mmap hit Patch series "mm/filemap: tighten mmap_miss hit accounting", v3. mmap_miss is increased when synchronous mmap readahead is needed, and decreased when filemap_map_pages() maps folios that are already in the page cache. 
The decrease side can over-credit hits in two cases: - fault-around installs nearby PTEs even though the fault only proves that the faulting address was accessed; - after synchronous mmap readahead returns VM_FAULT_RETRY, the retry can find the folio brought in by the same miss and immediately cancel that miss. Current evidence comes from a local KVM/data-disk microbenchmark using mmap_miss_probe, with an 8 GiB guest, 2 vCPUs, 8192 KiB read_ahead_kb, cold page cache before each run, 1% of the file accessed, and medians of 3 runs. mmap_miss_probe mmap()s a prepared file with MADV_NORMAL and then touches one byte at selected base-page offsets. The access order is random, sequential, or a fixed page stride. The harness drops caches before each run and samples /proc/vmstat around that access loop. The 20 GiB case below is a larger-than-memory file case in an 8 GiB guest. No separate memory hog was used. The 4 GiB case uses the same 8 GiB guest but keeps the file fit-in-memory. Each case used a fresh temporary qcow2 data disk, seen by the guest as /dev/vda, formatted as ext4 and mounted at /mnt/mmap-matrix. Each result is "pgpgin GiB / elapsed seconds". "pgpgin GiB" is the delta of the guest /proc/vmstat pgpgin counter, converted from KiB to GiB; it is used here as an approximate block input counter, not as resident memory or exact application IO. "Elapsed seconds" is the wall-clock runtime of the whole mmap_miss_probe access pass, not per-access latency. 
For the 20 GiB larger-than-memory case: workload before after random 223.377 GiB/101.293s 1.010 GiB/4.790s stride1021 204.214 GiB/97.557s 204.208 GiB/108.086s stride2053 409.584 GiB/193.700s 0.970 GiB/3.685s stride4099 406.452 GiB/134.241s 0.975 GiB/3.499s sequential 0.212 GiB/0.050s 0.212 GiB/0.057s For the 4 GiB fit-in-memory case: workload before after random 3.987 GiB/1.960s 0.980 GiB/1.221s stride1021 4.002 GiB/1.838s 4.002 GiB/1.851s stride2053 3.991 GiB/1.835s 0.811 GiB/0.985s stride4099 4.001 GiB/1.836s 0.819 GiB/1.037s sequential 0.056 GiB/0.013s 0.056 GiB/0.018s The 20 GiB setup also has an ablation. P1 is only the faulting-address hit accounting change. P2-only is only the FAULT_FLAG_TRIED retry filter. P1+P2 is the combined accounting change: workload variant result random baseline 223.377 GiB/101.293s random P1 223.268 GiB/98.481s random P2-only 223.257 GiB/100.091s random P1+P2 1.010 GiB/4.790s stride2053 baseline 409.584 GiB/193.700s stride2053 P1 409.584 GiB/197.645s stride2053 P2-only 15.722 GiB/5.485s stride2053 P1+P2 0.970 GiB/3.685s sequential baseline 0.212 GiB/0.050s sequential P1 0.212 GiB/0.046s sequential P2-only 0.212 GiB/0.050s sequential P1+P2 0.212 GiB/0.057s After the v2 implementation refactor, only the final P1+P2 shape was rerun in the same setup. The numbers stayed in line with the v1 P1+P2 rows above: workload larger-than-memory case fit-in-memory case 20 GiB file, 1% access 4 GiB file, 1% access random 1.010 GiB/4.383s 0.980 GiB/1.088s stride1021 204.216 GiB/105.601s 4.001 GiB/1.783s stride2053 0.970 GiB/3.760s 0.810 GiB/0.908s stride4099 0.975 GiB/3.410s 0.818 GiB/0.870s sequential 0.212 GiB/0.060s 0.056 GiB/0.016s This does not claim to solve every sparse pattern. The stride1021 rows are intentionally shown as a boundary: with 8192 KiB read_ahead_kb, file->f_ra.ra_pages is 2048 base pages, and synchronous mmap read-around uses a 2048-page window centered around the fault, roughly [index - 1024, index + 1023]. 
stride1021 is 1021 * 4 KiB = 4084 KiB, so the next access lands inside the previous read-around window. About every other access can be a real faulting-address page-cache hit, and the other half can each read about 8 MiB. For about 52k accesses in the 20 GiB/1% run, half of them times 8 MiB is about 205 GiB, matching the observed 204 GiB. This patch (of 2): filemap_map_pages() reduces file->f_ra.mmap_miss when fault-around maps folios that are already present in the page cache. That hit accounting is too generous because fault-around can install PTEs around the faulting address even though the fault only proves that the faulting address was accessed. Move the mmap_miss update back into filemap_map_pages(), drop the mmap_miss argument from the helper functions, and decrement mmap_miss only when the helper return value shows that the faulting address was mapped. Keep the existing workingset-folio behavior unchanged. Link: https://lore.kernel.org/tencent_AA501E9A238337BD167E5C2ACF948A1AF308@qq.com Link: https://lore.kernel.org/tencent_756F151FE66F3D80479A6F982C0AB8569F09@qq.com Signed-off-by: fujunjie Reviewed-by: Jan Kara Reviewed-by: Vishal Moola Cc: Matthew Wilcox (Oracle) Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/filemap.c | 62 ++++++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 97772a05a18e..816eabb22e19 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3751,8 +3751,7 @@ skip: static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, struct folio *folio, unsigned long start, unsigned long addr, unsigned int nr_pages, - unsigned long *rss, unsigned short *mmap_miss, - pgoff_t file_end) + unsigned long *rss, pgoff_t file_end) { struct address_space *mapping = folio->mapping; unsigned int ref_from_caller = 1; @@ -3784,16 +3783,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, if (PageHWPoison(page + count)) goto skip; - /* - * If 
there are too many folios that are recently evicted - * in a file, they will probably continue to be evicted. - * In such situation, read-ahead is only a waste of IO. - * Don't decrease mmap_miss in this scenario to make sure - * we can stop read-ahead. - */ - if (!folio_test_workingset(folio)) - (*mmap_miss)++; - /* * NOTE: If there're PTE markers, we'll leave them to be * handled in the specific fault path, and it'll prohibit the @@ -3840,7 +3829,7 @@ skip: static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, struct folio *folio, unsigned long addr, - unsigned long *rss, unsigned short *mmap_miss) + unsigned long *rss) { vm_fault_t ret = 0; struct page *page = &folio->page; @@ -3848,10 +3837,6 @@ static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, if (PageHWPoison(page)) goto out; - /* See comment of filemap_map_folio_range() */ - if (!folio_test_workingset(folio)) - (*mmap_miss)++; - /* * NOTE: If there're PTE markers, we'll leave them to be * handled in the specific fault path, and it'll prohibit @@ -3886,7 +3871,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, vm_fault_t ret = 0; unsigned long rss = 0; unsigned int nr_pages = 0, folio_type; - unsigned short mmap_miss = 0, mmap_miss_saved; /* * Recalculate end_pgoff based on file_end before calling @@ -3925,6 +3909,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, folio_type = mm_counter_file(folio); do { unsigned long end; + vm_fault_t map_ret; addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT; vmf->pte += xas.xa_index - last_pgoff; @@ -3932,13 +3917,34 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, end = folio_next_index(folio) - 1; nr_pages = min(end, end_pgoff) - xas.xa_index + 1; - if (!folio_test_large(folio)) - ret |= filemap_map_order0_folio(vmf, - folio, addr, &rss, &mmap_miss); - else - ret |= filemap_map_folio_range(vmf, folio, - xas.xa_index - folio->index, addr, - nr_pages, &rss, &mmap_miss, file_end); + if (!folio_test_large(folio)) { + map_ret = 
filemap_map_order0_folio(vmf, folio, addr, + &rss); + } else { + unsigned long start = xas.xa_index - folio->index; + + map_ret = filemap_map_folio_range(vmf, folio, start, + addr, nr_pages, &rss, + file_end); + } + ret |= map_ret; + + /* + * If there are too many folios that are recently evicted + * in a file, they will probably continue to be evicted. + * In such situation, read-ahead is only a waste of IO. + * Don't decrease mmap_miss in this scenario to make sure + * we can stop read-ahead. + */ + if ((map_ret & VM_FAULT_NOPAGE) && + !folio_test_workingset(folio)) { + unsigned short mmap_miss; + + mmap_miss = READ_ONCE(file->f_ra.mmap_miss); + if (mmap_miss) + WRITE_ONCE(file->f_ra.mmap_miss, + mmap_miss - 1); + } folio_unlock(folio); } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL); @@ -3948,12 +3954,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, out: rcu_read_unlock(); - mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss); - if (mmap_miss >= mmap_miss_saved) - WRITE_ONCE(file->f_ra.mmap_miss, 0); - else - WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss); - return ret; } EXPORT_SYMBOL(filemap_map_pages); From 9b0fcac3cfe7ffeb3da78bfee765072861c81ce2 Mon Sep 17 00:00:00 2001 From: fujunjie Date: Tue, 28 Apr 2026 01:59:44 +0000 Subject: [PATCH 110/321] mm/filemap: do not count FAULT_FLAG_TRIED retries as mmap hits A fault that starts synchronous mmap readahead can return VM_FAULT_RETRY after dropping mmap_lock. The retry may then map the folio brought in by that same miss. Do not let this retry decrement mmap_miss. The retry still maps the folio from the page cache; it just does not count as a useful mmap readahead hit. 
Link: https://lore.kernel.org/tencent_22E6B8849EC1141FE7773C64467E6F1E2C09@qq.com Signed-off-by: fujunjie Reviewed-by: Jan Kara Reviewed-by: Vishal Moola Cc: Matthew Wilcox (Oracle) Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/filemap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/filemap.c b/mm/filemap.c index 816eabb22e19..ab34cab2416a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3937,6 +3937,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, * we can stop read-ahead. */ if ((map_ret & VM_FAULT_NOPAGE) && + !(vmf->flags & FAULT_FLAG_TRIED) && !folio_test_workingset(folio)) { unsigned short mmap_miss; From ca9caa098f70e25c0edd812a640c6367e711c886 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Fri, 1 May 2026 10:20:57 +0800 Subject: [PATCH 111/321] selftests/cgroup: fix hardcoded page size in test_percpu_basic Patch series "selftests/cgroup: Fix false positive failures in test_percpu_basic", v2. This patch series addresses two separate issues that cause false positive failures in the test_percpu_basic test within the cgroup kmem selftests. The first issue stems from a hardcoded assumption about the system page size, which breaks the test on architectures with larger page sizes. The second issue is an overly strict memory check that fails to account for the slab metadata allocated during cgroup creation. This patch (of 2): MAX_VMSTAT_ERROR uses a hardcoded page size of 4096, which assumes 4K pages. This causes test_percpu_basic to fail on systems where the kernel is configured with a larger page size, such as aarch64 systems using 16K or 64K pages, where the maximum permissible discrepancy between memory.current and percpu charges is proportionally larger. Replace the hardcoded 4096 with sysconf(_SC_PAGESIZE) to correctly derive the page size at runtime regardless of the underlying architecture or kernel configuration. 
Link: https://lore.kernel.org/20260501022058.18024-1-li.wang@linux.dev Link: https://lore.kernel.org/20260501022058.18024-2-li.wang@linux.dev Signed-off-by: Li Wang Acked-by: Waiman Long Reviewed-by: Sayali Patil Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Cc: Tejun Heo Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_kmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index 12f59925500b..69cb1b50988c 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -24,7 +24,7 @@ * the maximum discrepancy between charge and vmstat entries is number * of cpus multiplied by 64 pages. */ -#define MAX_VMSTAT_ERROR (4096 * 64 * get_nprocs()) +#define MAX_VMSTAT_ERROR (sysconf(_SC_PAGESIZE) * 64 * get_nprocs()) #define KMEM_DEAD_WAIT_RETRIES 80 From 5de852b6e7cddc75d7182d5503e54d3ad50d109a Mon Sep 17 00:00:00 2001 From: Li Wang Date: Fri, 1 May 2026 10:20:58 +0800 Subject: [PATCH 112/321] selftests/cgroup: include slab in test_percpu_basic memory check test_percpu_basic() currently compares memory.current against only memory.stat:percpu after creating 1000 child cgroups. Observed failure: #./test_kmem ok 1 test_kmem_basic ok 2 test_kmem_memcg_deletion ok 3 test_kmem_proc_kpagecgroup ok 4 test_kmem_kernel_stacks ok 5 test_kmem_dead_cgroups memory.current 11530240 percpu 8440000 not ok 6 test_percpu_basic That assumption is too strict: child cgroup creation also allocates slab-backed metadata, so memory.current is expected to be larger than percpu alone. One visible path is: cgroup_mkdir() cgroup_create() cgroup_addrm_file() cgroup_add_file() __kernfs_create_file() __kernfs_new_node() kmem_cache_zalloc() These kernfs allocations are charged as slab and show up in memory.stat:slab. 
Update the check to compare memory.current against (percpu + slab) within MAX_VMSTAT_ERROR, and print slab/delta in the failure message to improve diagnostics. Link: https://lore.kernel.org/20260501022058.18024-3-li.wang@linux.dev Signed-off-by: Li Wang Reviewed-by: Waiman Long Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Cc: Tejun Heo Cc: Vlastimil Babka Cc: Sayali Patil Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_kmem.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index 69cb1b50988c..1db0ba1226b9 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -353,7 +353,7 @@ static int test_percpu_basic(const char *root) { int ret = KSFT_FAIL; char *parent, *child; - long current, percpu; + long current, percpu, slab; int i; parent = cg_name(root, "percpu_basic_test"); @@ -383,13 +383,14 @@ static int test_percpu_basic(const char *root) current = cg_read_long(parent, "memory.current"); percpu = cg_read_key_long(parent, "memory.stat", "percpu "); + slab = cg_read_key_long(parent, "memory.stat", "slab "); - if (current > 0 && percpu > 0 && labs(current - percpu) < - MAX_VMSTAT_ERROR) + if (current > 0 && percpu > 0 && slab >= 0 && + labs(current - (percpu + slab)) < MAX_VMSTAT_ERROR) ret = KSFT_PASS; else - printf("memory.current %ld\npercpu %ld\n", - current, percpu); + printf("memory.current %ld\npercpu %ld\nslab %ld\ndelta %ld\n", + current, percpu, slab, current - (percpu + slab)); cleanup_children: for (i = 0; i < 1000; i++) { From 0453f857eb32c11d8cc48988911fc5905d054319 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 30 Apr 2026 18:17:38 -0700 Subject: [PATCH 113/321] mm/damon/reclaim: add autotune_monitoring_intervals parameter Patch series "mm/damon/reclaim: support monitoring intervals auto-tuning". 
The monitoring intervals auto-tuning feature of DAMON has proven to be useful in multiple environments. Add a new DAMON_RECLAIM parameter for supporting the feature, and update the document for the new parameter. This patch (of 2): DAMON's monitoring intervals auto-tuning feature has proven to be useful in multiple environments. DAMON_RECLAIM is still asking users to do the manual tuning of the intervals. Add a module parameter for utilizing the auto-tuning feature with the suggested default setup. Note that use of the auto-tuning overrides the manually entered monitoring intervals. Also, note that the 'min_age' will be dynamically changed proportional to auto-tuned intervals. It is recommended to use 'min_age' short enough and use 'quota_mem_pressure_us' like coldness threshold auto-tuning features together. Link: https://lore.kernel.org/20260501011740.81988-1-sj@kernel.org Link: https://lore.kernel.org/20260501011740.81988-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index a60ee800d63e..7126d47fb8b2 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -91,6 +91,20 @@ module_param(quota_mem_pressure_us, ulong, 0600); static unsigned long quota_autotune_feedback __read_mostly; module_param(quota_autotune_feedback, ulong, 0600); +/* + * Auto-tune monitoring intervals. + * + * If this parameter is set as ``Y``, DAMON_RECLAIM automatically tunes DAMON's + * sampling and aggregation intervals. The auto-tuning aims to capture + * meaningful amount of access events in each DAMON-snapshot, while keeping the + * sampling intervals 5 milliseconds in minimum, and 10 seconds in maximum.
+ * Setting this as ``N`` disables the auto-tuning. + * + * Disabled by default. + */ +static bool autotune_monitoring_intervals __read_mostly; +module_param(autotune_monitoring_intervals, bool, 0600); + static struct damos_watermarks damon_reclaim_wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = 5000000, /* 5 seconds */ @@ -152,7 +166,7 @@ DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_reclaim_stat, static struct damon_ctx *ctx; static struct damon_target *target; -static struct damos *damon_reclaim_new_scheme(void) +static struct damos *damon_reclaim_new_scheme(unsigned long aggr_interval) { struct damos_access_pattern pattern = { /* Find regions having PAGE_SIZE or larger size */ @@ -162,8 +176,7 @@ static struct damos *damon_reclaim_new_scheme(void) .min_nr_accesses = 0, .max_nr_accesses = 0, /* for min_age or more micro-seconds */ - .min_age_region = min_age / - damon_reclaim_mon_attrs.aggr_interval, + .min_age_region = min_age / aggr_interval, .max_age_region = UINT_MAX, }; @@ -184,6 +197,7 @@ static int damon_reclaim_apply_parameters(void) { struct damon_ctx *param_ctx; struct damon_target *param_target; + struct damon_attrs attrs; struct damos *scheme; struct damos_quota_goal *goal; struct damos_filter *filter; @@ -201,12 +215,21 @@ static int damon_reclaim_apply_parameters(void) goto out; } - err = damon_set_attrs(param_ctx, &damon_reclaim_mon_attrs); + attrs = damon_reclaim_mon_attrs; + if (autotune_monitoring_intervals) { + attrs.sample_interval = 5000; + attrs.aggr_interval = 100000; + attrs.intervals_goal.access_bp = 40; + attrs.intervals_goal.aggrs = 3; + attrs.intervals_goal.min_sample_us = 5000; + attrs.intervals_goal.max_sample_us = 10 * 1000 * 1000; + } + err = damon_set_attrs(param_ctx, &attrs); if (err) goto out; err = -ENOMEM; - scheme = damon_reclaim_new_scheme(); + scheme = damon_reclaim_new_scheme(attrs.aggr_interval); if (!scheme) goto out; damon_set_schemes(param_ctx, &scheme, 1); From 1794454a3bf66974f806301fa2952aed719780fb Mon 
Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 30 Apr 2026 18:17:39 -0700 Subject: [PATCH 114/321] Docs/admin-guide/mm/damon/reclaim: update for autotune_monitoring_intervals Update DAMON_RECLAIM usage document for the newly added monitoring intervals auto-tuning enablement parameter. Link: https://lore.kernel.org/20260501011740.81988-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/reclaim.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst index 57ab8b187650..ec7e3e32b4ac 100644 --- a/Documentation/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -85,6 +85,17 @@ identifies the region as cold, and reclaims it. 120 seconds by default. +autotune_monitoring_intervals +----------------------------- + +If this parameter is set as ``Y``, DAMON_RECLAIM automatically tunes DAMON's +sampling and aggregation intervals. The auto-tuning aims to capture meaningful +amount of access events in each DAMON-snapshot, while keeping the sampling +interval 5 milliseconds in minimum, and 10 seconds in maximum. Setting this as +``N`` disables the auto-tuning. + +Disabled by default. + quota_ms -------- From 3a0bc9568c354357546557d8b969785bc27fd260 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 1 May 2026 19:05:03 -0700 Subject: [PATCH 115/321] mm/damon/stat: add a parameter for reading kdamond pid Patch series "mm/damon/stat: add kdamond_pid parameter". DAMON_STAT doesn't provide the pid of its kdamond, unlike DAMON_RECLAIM and DAMON_LRU_SORT. This makes user-space management of DAMON_STAT unnecessarily complicated. Provide the information via a new parameter, namely kdamond_pid, and document it. 
This patch (of 2): Knowing the pid of the kdamonds can help user-space management including monitoring of DAMON's system resource consumption. To make it easier, DAMON_SYSFS, DAMON_RECLAIM and DAMON_LRU_SORT provide the pid information. DAMON_STAT is not providing it, though. Expose the pid of DAMON_STAT kdamond via a new read-only module parameter, namely kdamond_pid. This also makes DAMON modules usage more standardized, because DAMON_RECLAIM and DAMON_LRU_SORT also provide the information via their read-only parameters of the same name. Link: https://lore.kernel.org/20260502020505.80822-1-sj@kernel.org Link: https://lore.kernel.org/20260502020505.80822-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/stat.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/mm/damon/stat.c b/mm/damon/stat.c index f4d3203e9263..0e14f5bb8f75 100644 --- a/mm/damon/stat.c +++ b/mm/damon/stat.c @@ -266,6 +266,45 @@ static int damon_stat_enabled_load(char *buffer, const struct kernel_param *kp) return sprintf(buffer, "%c\n", damon_stat_enabled() ? 'Y' : 'N'); } +static int damon_stat_kdamond_pid_store( + const char *val, const struct kernel_param *kp) +{ + /* + * kdamond_pid is read-only, but kernel command line could write it. + * Do nothing here. 
+ */ + return 0; +} + +static int damon_stat_kdamond_pid_load( + char *buffer, const struct kernel_param *kp) +{ + int pid; + + if (!damon_stat_context) { + pid = -1; + } else { + pid = damon_kdamond_pid(damon_stat_context); + if (pid < 1) + pid = -1; + } + return sprintf(buffer, "%d\n", pid); +} + +static const struct kernel_param_ops kdamond_pid_param_ops = { + .set = damon_stat_kdamond_pid_store, + .get = damon_stat_kdamond_pid_load, +}; + +/* + * PID of the DAMON thread + * + * If DAMON_STAT is enabled, this becomes the PID of the worker thread. + * Else, -1. + */ +module_param_cb(kdamond_pid, &kdamond_pid_param_ops, NULL, 0400); +MODULE_PARM_DESC(kdamond_pid, "pid of the kdamond"); + static int __init damon_stat_init(void) { int err = 0; From f27d56b4f2aa0ffeda7113df3443448bc907acaf Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 1 May 2026 19:05:04 -0700 Subject: [PATCH 116/321] Docs/admin-guide/mm/damon/stat: document kdamond_pid parameter Update DAMON_STAT usage document for newly added kdamond_pid parameter. Link: https://lore.kernel.org/20260502020505.80822-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/stat.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/admin-guide/mm/damon/stat.rst b/Documentation/admin-guide/mm/damon/stat.rst index c4b14daeb2dd..46c5dd96aa2e 100644 --- a/Documentation/admin-guide/mm/damon/stat.rst +++ b/Documentation/admin-guide/mm/damon/stat.rst @@ -89,3 +89,10 @@ percentiles of the idle time values via this read-only parameter. Reading the parameter returns 101 idle time values in milliseconds, separated by comma. Each value represents 0-th, 1st, 2nd, 3rd, ..., 99th and 100th percentile idle times. + +kdamond_pid +----------- + +PID of the DAMON thread. 
+ +If DAMON_STAT is enabled, this becomes the PID of the worker thread. Else, -1. From 9d5685286aa8c5ef70d8e1a34cef5daf518ae237 Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Tue, 5 May 2026 02:11:25 +0000 Subject: [PATCH 117/321] highmem-internal.h: fix typo in the comment for kunmap_atomic() Replace `PREEMP_RT` with `PREEMPT_RT` in the header comment to match the correct kernel configuration name. Link: https://lore.kernel.org/20260505021125.1941691-1-zhouzhouyi@gmail.com Signed-off-by: Zhouyi Zhou Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- include/linux/highmem-internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 0574c21ca45d..bb71e7dba4f7 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -262,7 +262,7 @@ static inline bool is_kmap_addr(const void *x) * @__addr: Virtual address to be unmapped * * Unmaps an address previously mapped by kmap_atomic() and re-enables - * pagefaults. Depending on PREEMP_RT configuration, re-enables also + * pagefaults. Depending on PREEMPT_RT configuration, re-enables also * migration and preemption. Users should not count on these side effects. * * Mappings should be unmapped in the reverse order that they were mapped. From 66366d291f666ddeda5f8c84f253e308de3e6b55 Mon Sep 17 00:00:00 2001 From: Zijiang Huang Date: Wed, 6 May 2026 21:09:19 +0800 Subject: [PATCH 118/321] mm/swap: add cond_resched() in swap_reclaim_full_clusters to prevent softlockup We hit a real softlockup in an internal stress test environment. The workload was LTP memory/swap stress on a large arm64 machine, with 320 CPUs, about 1TB memory and an 8.6GB swap device. The system was under heavy load and the swap device had a large number of full clusters. The softlockup was triggered during a stress test after about 3 days. 
So, add periodic cond_resched() calls during large full_clusters reclaim operations to prevent softlockup issues. Detailed call trace as follows: PID: 3817773 TASK: ffff0883bb28b780 CPU: 48 COMMAND: "kworker/48:7" #0 [ffff800080183d10] __crash_kexec at ffffa4c1361e5de4 #1 [ffff800080183d90] panic at ffffa4c1360d5e9c #2 [ffff800080183e20] watchdog_timer_fn at ffffa4c136231fa8 ... #16 [ffff8000c4ad3cb0] swap_cache_del_folio at ffffa4c1363e1614 #17 [ffff8000c4ad3ce0] __try_to_reclaim_swap at ffffa4c1363e4bfc #18 [ffff8000c4ad3d40] swap_reclaim_full_clusters at ffffa4c1363e5474 #19 [ffff8000c4ad3da0] swap_reclaim_work at ffffa4c1363e550c #20 [ffff8000c4ad3dc0] process_one_work at ffffa4c136102edc #21 [ffff8000c4ad3e10] worker_thread at ffffa4c136103398 #22 [ffff8000c4ad3e70] kthread at ffffa4c13610d95c Link: https://lore.kernel.org/20260506130919.2298807-1-kerayhuang@tencent.com Fixes: 5168a68eb78f ("mm, swap: avoid over reclaim of full clusters") Signed-off-by: Zijiang Huang Reviewed-by: Kairui Song Reviewed-by: Hao Peng Reviewed-by: albinwyang Reviewed-by: Baoquan He Acked-by: Chris Li Cc: Barry Song Cc: Kairui Song Cc: Kemeng Shi Cc: Nhat Pham Cc: Youngjun Park Cc: Signed-off-by: Andrew Morton --- mm/swapfile.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/swapfile.c b/mm/swapfile.c index 9174f1eeffb0..74a1e324449d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1054,6 +1054,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) swap_cluster_unlock(ci); if (to_scan <= 0) break; + cond_resched(); } } From 77d100d11c87e62010fe65a9a4d117ca0a05f8d0 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 6 May 2026 05:58:24 -0700 Subject: [PATCH 119/321] mm/kmemleak: dedupe verbose scan output by allocation backtrace Patch series "mm/kmemleak: dedupe verbose scan output", v3.
I am starting to run kmemleak with verbose mode enabled at some "probe points" across my employer's fleet so that suspected leaks land in dmesg without needing a separate read of /sys/kernel/debug/kmemleak. The downside is that workloads which leak many objects from a single allocation site flood the console with byte-for-byte identical backtraces. Hundreds of duplicates per scan are common, drowning out distinct leaks and unrelated kernel messages, while adding no signal beyond the first occurrence. This series collapses those duplicates inside kmemleak itself. Each unique stackdepot trace_handle prints once per scan, followed by a short summary line when more than one object shares it: kmemleak: unreferenced object 0xff110001083beb00 (size 192): kmemleak: comm "modprobe", pid 974, jiffies 4294754196 kmemleak: ... kmemleak: backtrace (crc 6f361828): kmemleak: __kmalloc_cache_noprof+0x1af/0x650 kmemleak: ... kmemleak: ... and 71 more object(s) with the same backtrace The "N new suspected memory leaks" tally and the contents of /sys/kernel/debug/kmemleak are unchanged - the per-object detail is still available on demand, only the verbose (dmesg) output is collapsed. Patch 1 is the kmemleak change. Patch 2 adds a selftest that loads samples/kmemleak's CONFIG_SAMPLE kmemleak-test module to generate ten leaks sharing one call site and checks that the printed count is strictly less than the reported leak total. Not sure if Patch 2 is useful or not; if not, it is easy to discard. This patch (of 2): In kmemleak's verbose mode, every unreferenced object found during a scan is logged with its full header, hex dump and 16-frame backtrace. Workloads that leak many objects from a single allocation site flood dmesg with byte-for-byte identical backtraces, drowning out distinct leaks and other kernel messages.
Dedupe within each scan using stackdepot's trace_handle as the key: for every leaked object with a recorded stack trace, look up the representative kmemleak_object in a per-scan xarray keyed by trace_handle. The first sighting stores the object pointer (with a get_object() reference) and sets object->dup_count to 1; later sightings just bump dup_count on the representative. After the scan, walk the xarray once and emit each unique backtrace, followed by a single summary line when more than one object shares it. Leaks whose trace_handle is 0 (early-boot allocations tracked before kmemleak_init() set up object_cache, or stack_depot_save() failures under memory pressure) cannot be deduped, so they are still printed inline via the same locked OBJECT_ALLOCATED-checked helper. The contents of /sys/kernel/debug/kmemleak are unchanged - only the verbose console output is collapsed. Safety notes: - The xarray store happens outside object->lock: object->lock is a raw spinlock, while xa_store() may grab xa_node slab locks at a higher wait-context level which lockdep flags as invalid. trace_handle is captured under object->lock (which serialises with kmemleak_update_trace()'s writer), so it is safe to use after dropping the lock. - get_object() pins the kmemleak_object metadata across rcu_read_unlock(), but the underlying tracked allocation can still be freed concurrently. The deferred print path therefore re-acquires object->lock and re-checks OBJECT_ALLOCATED via print_leak_locked() before touching object->pointer; __delete_object() clears that flag under the same lock before the user memory goes away. The same helper is used by the trace_handle == 0 and xa_store() failure fallbacks, so every printer in the new path has identical safety guarantees. 
- If get_object() fails after we set OBJECT_REPORTED, the object is already being torn down (use_count hit zero); the leak count is still accurate but the verbose line is dropped, which is correct - the memory was freed concurrently and is no longer a leak. - If xa_store() fails to allocate an xa_node under memory pressure, we fall back to printing inline via print_leak_locked() instead of silently dropping the leak. - The hex dump is skipped for coalesced entries (dup_count > 1): bytes would differ across objects sharing a backtrace anyway, and skipping it removes the only remaining read of object->pointer's contents in the deferred path. The representative's reported size may also differ from the coalesced objects' sizes; the printed trace_handle reflects the representative's current value rather than the value used as the dedup key, which is normally - but not strictly - identical. Link: https://lore.kernel.org/20260506-kmemleak_dedup-v3-0-2d36aafc34da@debian.org Link: https://lore.kernel.org/20260506-kmemleak_dedup-v3-1-2d36aafc34da@debian.org Signed-off-by: Breno Leitao Reviewed-by: Catalin Marinas Cc: David Hildenbrand Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/kmemleak.c | 148 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 140 insertions(+), 8 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 2eff0d6b622b..7c7ba17ce7af 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -92,6 +92,7 @@ #include #include #include +#include #include #include @@ -157,6 +158,8 @@ struct kmemleak_object { struct hlist_head area_list; unsigned long jiffies; /* creation timestamp */ pid_t pid; /* pid of the current task */ + /* per-scan dedup count, valid only while in scan-local dedup xarray */ + unsigned int dup_count; char comm[TASK_COMM_LEN]; /* executable name */ }; @@ -360,8 +363,9 @@ static const char *__object_type_str(struct kmemleak_object *object) * Printing of the unreferenced objects information to the seq file. The * print_unreferenced function must be called with the object->lock held. */ -static void print_unreferenced(struct seq_file *seq, - struct kmemleak_object *object) +static void __print_unreferenced(struct seq_file *seq, + struct kmemleak_object *object, + bool hex_dump) { int i; unsigned long *entries; @@ -373,7 +377,8 @@ static void print_unreferenced(struct seq_file *seq, object->pointer, object->size); warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", object->comm, object->pid, object->jiffies); - hex_dump_object(seq, object); + if (hex_dump) + hex_dump_object(seq, object); warn_or_seq_printf(seq, " backtrace (crc %x):\n", object->checksum); for (i = 0; i < nr_entries; i++) { @@ -382,6 +387,12 @@ static void print_unreferenced(struct seq_file *seq, } } +static void print_unreferenced(struct seq_file *seq, + struct kmemleak_object *object) +{ + __print_unreferenced(seq, object, true); +} + /* * Print the kmemleak_object information. This function is used mainly for * debugging special cases when kmemleak operations. 
It must be called with @@ -1684,6 +1695,103 @@ unlock_put: put_object(object); } +/* + * Print one leak inline. The hex dump is gated on OBJECT_ALLOCATED so it + * does not touch user memory that was freed concurrently; the rest of the + * report (backtrace, comm, pid) is always emitted since the kmemleak_object + * metadata is pinned by the caller. + */ +static void print_leak_locked(struct kmemleak_object *object, bool hex_dump) +{ + raw_spin_lock_irq(&object->lock); + __print_unreferenced(NULL, object, + hex_dump && (object->flags & OBJECT_ALLOCATED)); + raw_spin_unlock_irq(&object->lock); +} + +/* + * Per-scan dedup table for verbose leak printing. The xarray is keyed by + * stackdepot trace_handle and stores a pointer to the representative + * kmemleak_object. The per-scan repeat count lives in object->dup_count. + * + * dedup_record() must run outside object->lock: xa_store() may take + * mutexes (xa_node slab allocation) which lockdep would flag against the + * raw spinlock object->lock. + */ +static void dedup_record(struct xarray *dedup, struct kmemleak_object *object, + depot_stack_handle_t trace_handle) +{ + struct kmemleak_object *rep; + void *old; + + /* + * No stack trace to dedup against: early-boot allocation tracked + * before kmemleak_init() set up object_cache, or stack_depot_save() + * failure under memory pressure. + */ + if (!trace_handle) { + print_leak_locked(object, true); + return; + } + + /* stack is available, now we can de-dup */ + rep = xa_load(dedup, trace_handle); + if (rep) { + rep->dup_count++; + return; + } + + /* + * Object is being torn down (use_count already hit zero); the + * tracked memory at object->pointer is unsafe to read, so skip. + */ + if (!get_object(object)) + return; + + object->dup_count = 1; + old = xa_store(dedup, trace_handle, object, GFP_ATOMIC); + if (xa_is_err(old)) { + /* xa_node allocation failed; fall back to inline print. 
*/ + print_leak_locked(object, true); + put_object(object); + return; + } + /* + * scan_mutex serialises all writers to the dedup xarray, so xa_store() + * after a NULL xa_load() must always overwrite an empty slot. + */ + WARN_ON_ONCE(old); +} + +/* + * Drain the dedup table. Re-acquires object->lock and re-checks + * OBJECT_ALLOCATED before printing: while get_object() pins the + * kmemleak_object metadata, the underlying tracked allocation may have + * been freed since the scan walked it (kmemleak_free clears + * OBJECT_ALLOCATED under object->lock before the user memory goes away). + * The hex dump is skipped for coalesced entries since the bytes would + * differ across objects anyway. + */ +static void dedup_flush(struct xarray *dedup) +{ + struct kmemleak_object *object; + unsigned long idx; + unsigned int dup; + bool coalesced; + + xa_for_each(dedup, idx, object) { + dup = object->dup_count; + coalesced = dup > 1; + + print_leak_locked(object, !coalesced); + if (coalesced) + pr_warn(" ... and %u more object(s) with the same backtrace\n", + dup - 1); + put_object(object); + xa_erase(dedup, idx); + } +} + /* * Scan data sections and all the referenced memory blocks allocated via the * kernel's standard allocators. This function must be called with the @@ -1694,6 +1802,7 @@ static void kmemleak_scan(void) struct kmemleak_object *object; struct zone *zone; int __maybe_unused i; + struct xarray dedup; int new_leaks = 0; jiffies_last_scan = jiffies; @@ -1834,10 +1943,18 @@ static void kmemleak_scan(void) return; /* - * Scanning result reporting. + * Scanning result reporting. When verbose printing is enabled, dedupe + * by stackdepot trace_handle so each unique backtrace is logged once + * per scan, annotated with the number of objects that share it. The + * per-leak count below still reflects every object, and + * /sys/kernel/debug/kmemleak still lists them individually. 
*/ + xa_init(&dedup); rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) { + depot_stack_handle_t trace_handle; + bool dedup_print; + if (need_resched()) kmemleak_cond_resched(object); @@ -1849,18 +1966,33 @@ static void kmemleak_scan(void) if (!color_white(object)) continue; raw_spin_lock_irq(&object->lock); + trace_handle = 0; + dedup_print = false; if (unreferenced_object(object) && !(object->flags & OBJECT_REPORTED)) { object->flags |= OBJECT_REPORTED; - - if (kmemleak_verbose) - print_unreferenced(NULL, object); - + if (kmemleak_verbose) { + trace_handle = object->trace_handle; + dedup_print = true; + } new_leaks++; } raw_spin_unlock_irq(&object->lock); + + /* + * Defer the verbose print outside object->lock: xa_store() + * may take xa_node slab locks at a higher wait-context level + * which lockdep would flag against the raw_spinlock_t + * object->lock. rcu_read_lock() keeps the kmemleak_object + * alive across the call. + */ + if (dedup_print) + dedup_record(&dedup, object, trace_handle); } rcu_read_unlock(); + /* Flush'em all */ + dedup_flush(&dedup); + xa_destroy(&dedup); if (new_leaks) { kmemleak_found_leaks = true; From cfaef29c20e86738aec28641b6de1e078298999e Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 6 May 2026 05:58:25 -0700 Subject: [PATCH 120/321] selftests/mm: add kmemleak verbose dedup test Add a regression test for the per-scan verbose dedup added in the preceding commit. The test loads samples/kmemleak's helper module (CONFIG_SAMPLE_KMEMLEAK=m) to generate orphan allocations, several of which share an allocation backtrace, runs four kmemleak scans with verbose printing enabled, then walks dmesg looking for two "unreferenced object" reports within a single scan that share an identical backtrace - which would mean dedup failed to collapse them. The test is intentionally permissive on detection but strict on regressions: - PASS when no duplicates are observed, regardless of whether the dedup summary line ("... 
and N more object(s) with the same backtrace") was actually emitted. Per-CPU chunk reuse, slab freelist pointers, kernel stack residue and CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN can all keep most of the orphans "still referenced" or reported across many separate scans, so the dedup path may have nothing to fold within one scan. That is not a regression. - PASS reports whether dedup actually fired, so a passing run on a well-behaved environment is still informative. - FAIL when two same-backtrace reports land in a single scan (clear dedup regression). - FAIL when kmemleak's own per-scan tally counts leaks but the verbose path emits zero "unreferenced object" lines - that catches a regression in the verbose printer itself, which would otherwise pass the duplicate check trivially. - SKIP when kmemleak is absent, disabled at runtime, or the helper module is not built. The dmesg parser anchors stack-frame matching to the indentation kmemleak uses for them (4+ spaces under "kmemleak: ") so unrelated kmemleak warnings landing between reports do not get lumped into the backtrace key and mask a duplicate. Link: https://lore.kernel.org/20260506-kmemleak_dedup-v3-2-2d36aafc34da@debian.org Signed-off-by: Breno Leitao Cc: Catalin Marinas Cc: David Hildenbrand Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 1 + .../selftests/mm/ksft_kmemleak_dedup.sh | 222 ++++++++++++++++++ 2 files changed, 223 insertions(+) create mode 100755 tools/testing/selftests/mm/ksft_kmemleak_dedup.sh diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 18779045b7f6..41053fdaad88 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -151,6 +151,7 @@ TEST_PROGS += ksft_gup_test.sh TEST_PROGS += ksft_hmm.sh TEST_PROGS += ksft_hugetlb.sh TEST_PROGS += ksft_hugevm.sh +TEST_PROGS += ksft_kmemleak_dedup.sh TEST_PROGS += ksft_ksm.sh TEST_PROGS += ksft_ksm_numa.sh TEST_PROGS += ksft_madv_guard.sh diff --git a/tools/testing/selftests/mm/ksft_kmemleak_dedup.sh b/tools/testing/selftests/mm/ksft_kmemleak_dedup.sh new file mode 100755 index 000000000000..d01950244490 --- /dev/null +++ b/tools/testing/selftests/mm/ksft_kmemleak_dedup.sh @@ -0,0 +1,222 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Regression test for kmemleak's per-scan verbose dedup. +# +# Loads samples/kmemleak's helper module to generate orphan allocations +# (some of which share an allocation backtrace), runs a few kmemleak +# scans with verbose printing enabled, and verifies that no two +# "unreferenced object" reports within a single scan share the same +# backtrace - which would mean dedup failed to collapse them. +# +# This test is intentionally permissive: the kmemleak-test module's +# leaks frequently get reported across many separate scans (per-CPU +# chunk reuse, slab freelist pointers, kernel stack residue), so dedup +# may never have anything to fold within one scan. That is not a +# regression. The test only fails when it actually catches dedup not +# happening on input that should have triggered it - i.e. 
two reports +# with identical backtraces in the same scan. +# +# Author: Breno Leitao + +ksft_skip=4 +KMEMLEAK=/sys/kernel/debug/kmemleak +VERBOSE_PARAM=/sys/module/kmemleak/parameters/verbose +MODULE=kmemleak-test + +skip() { + echo "SKIP: $*" + exit $ksft_skip +} + +fail() { + echo "FAIL: $*" + exit 1 +} + +pass() { + echo "PASS: $*" + exit 0 +} + +[ "$(id -u)" -eq 0 ] || skip "must run as root" +[ -r "$KMEMLEAK" ] || skip "no kmemleak debugfs (CONFIG_DEBUG_KMEMLEAK)" +[ -w "$VERBOSE_PARAM" ] || skip "kmemleak verbose param missing" +modinfo "$MODULE" >/dev/null 2>&1 || + skip "$MODULE not built (CONFIG_SAMPLE_KMEMLEAK)" + +# The verdict depends entirely on dmesg contents, so a silently-empty +# dmesg (dmesg_restrict=1 with CAP_SYSLOG dropped, restricted container, +# etc.) would let the script report PASS without parsing anything. Probe +# both read and clear up front and skip cleanly if either is denied. +dmesg >/dev/null 2>&1 || + skip "cannot read dmesg (need CAP_SYSLOG or dmesg_restrict=0)" +dmesg -C >/dev/null 2>&1 || + skip "cannot clear dmesg (need CAP_SYSLOG or dmesg_restrict=0)" + +# kmemleak can be present but disabled at runtime (boot arg kmemleak=off, +# or it self-disabled after an internal error). In that state writes other +# than "clear" return EPERM, so probe once and skip if so. +if ! echo scan > "$KMEMLEAK" 2>/dev/null; then + skip "kmemleak is disabled (check dmesg or kmemleak= boot arg)" +fi + +prev_verbose=$(cat "$VERBOSE_PARAM") +# shellcheck disable=SC2317 # invoked indirectly via trap +cleanup() { + echo "$prev_verbose" > "$VERBOSE_PARAM" 2>/dev/null + rmmod "$MODULE" 2>/dev/null + # Drain the leak set we generated. Subsequent selftests (e.g. + # tools/testing/selftests/net/netfilter/nft_interface_stress.sh) + # fail on any non-empty kmemleak report, so leaving the helper + # module's intentional leaks behind would poison the rest of a + # kselftest run. 
+ # + # Caveat: kmemleak_clear() only greys objects that have already + # been reported (OBJECT_REPORTED && unreferenced_object()). Helper + # allocations that stayed "still referenced" throughout the test + # (stale pointers in per-CPU chunks, slab freelists, kernel stacks) + # were never reported and are therefore not greyed by this clear - + # they remain tracked and a later scan can still surface them. Such + # leftovers are inherent to the kmemleak-test sample module and are + # not specific to this test; consumers that fail on any kmemleak + # output (rather than on the test-specific backtraces) need to be + # robust to that, or this test should be excluded from the run. + echo clear > "$KMEMLEAK" 2>/dev/null +} +trap cleanup EXIT + +echo 1 > "$VERBOSE_PARAM" + +# Drain the existing leak set so the next scan only reports our objects. +echo clear > "$KMEMLEAK" + +# Re-clear dmesg now (the up-front probe also cleared it, but anything +# logged between then and here - module unload chatter, the probe scan, +# the verbose-param write - would otherwise pollute the parse window). +dmesg -C >/dev/null + +# If the module was left loaded by a previous aborted run, modprobe would +# be a no-op and the init function would not run, so no new leaks would be +# generated. Force a clean state first. +rmmod "$MODULE" 2>/dev/null +modprobe "$MODULE" || skip "failed to load $MODULE" +# Removing the module orphans the list elements without freeing them. +rmmod "$MODULE" || skip "failed to unload $MODULE" + +# Run a handful of scans so kmemleak has the chance to age and report +# the orphans. We do not require any particular number to be reported: +# the regression check below operates on whatever lands in dmesg. +# +# Note: with CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN=y the kernel's own scan +# thread can report and mark these orphans (OBJECT_REPORTED) before our +# manual scans run, after which our scans will see nothing. 
The +# lower-bound check below catches the case where that happens and the +# manual scans also produce nothing. +SCAN_COUNT=4 +SCAN_SLEEP=6 +for _ in $(seq 1 "$SCAN_COUNT"); do + echo scan > "$KMEMLEAK" + sleep "$SCAN_SLEEP" +done + +# Strip the leading "[ nnn.nnnnnn] " dmesg timestamp prefix. Without +# this, two identical stack frames printed from two reports in the same +# scan would produce different per-frame strings (different timestamps) +# and the duplicate-backtrace check below would not match them, silently +# passing a real dedup regression. Doing the strip here makes the rest +# of the parser timestamp-agnostic regardless of what dmesg defaults to. +log=$(dmesg | sed 's/^\[[^]]*\] //') + +# After running the workload (modprobe + scans), dmesg should contain at +# least the helper module's pr_info lines and our manual-scan output. An +# empty capture here means dmesg succeeded earlier but is now denying us +# the buffer (race with dmesg_restrict toggling, etc.); refuse to give a +# verdict on no evidence. +[ -n "$log" ] || skip "dmesg returned empty after running workload" + +# Lower bound: if kmemleak's own per-scan tally counted leaks but the +# verbose path emitted no "unreferenced object" line, the verbose printer +# itself is regressed - fail rather than silently passing on no input. +new_leaks=$(echo "$log" | + sed -n 's/.*kmemleak: \([0-9]\+\) new suspected.*/\1/p' | + awk '{s+=$1} END{print s+0}') +printed=$(echo "$log" | grep -c 'kmemleak: unreferenced object') +if [ "$new_leaks" -gt 0 ] && [ "$printed" -eq 0 ]; then + fail "verbose path broken: $new_leaks leaks counted, 0 printed in $SCAN_COUNT scans" +fi + +# Walk the log: split into per-scan chunks at "N new suspected memory +# leaks" boundaries; within each chunk, capture each "unreferenced +# object" report's backtrace and check that no backtrace is reported +# more than once. A duplicate within a single scan means dedup failed +# to collapse two leaks that share an allocation site. 
+violations=$(echo "$log" | awk ' + function flush_block() { + if (in_block) { + # Skip empty backtraces: leaks with trace_handle == 0 + # (early-boot allocations or stack_depot_save() failures + # under memory pressure) are intentionally not deduped, + # so multiple such reports in one scan are expected and + # must not be flagged as a regression. + if (bt != "") + seen[bt]++ + in_block = 0 + collecting = 0 + bt = "" + } + } + function check_and_reset( b) { + for (b in seen) + if (seen[b] > 1) + printf("backtrace seen %d times in one scan:\n%s\n", + seen[b], b) + delete seen + } + # Scan boundary: the per-scan summary line. + /kmemleak: [0-9]+ new suspected memory leaks/ { + flush_block() + check_and_reset() + next + } + # Start of a new "unreferenced object" report. + /kmemleak: unreferenced object/ { + flush_block() + in_block = 1 + next + } + # Inside a report, the "backtrace (crc ...):" line switches us to + # backtrace-collecting mode. + in_block && /kmemleak:[[:space:]]+backtrace \(crc/ { + collecting = 1 + next + } + # Once collecting, capture only deeply-indented "kmemleak: " lines + # (stack frames have 4+ spaces of indentation under "kmemleak: "; + # headers and the "... and N more" tail line have less). This stops + # unrelated kmemleak warns landing between reports from being lumped + # into the backtrace key, which would mask a genuine duplicate. + in_block && collecting && /kmemleak:[[:space:]]{4,}/ { + bt = bt $0 "\n" + next + } + END { + flush_block() + check_and_reset() + } +') + +if [ -n "$violations" ]; then + echo "$violations" + fail "kmemleak dedup regression: same backtrace reported more than once in a single scan" +fi + +# Count the dedup summary lines so the report distinguishes "dedup +# actually fired" from "no same-backtrace leaks turned up to dedup". 
+dedup_lines=$(echo "$log" | grep -c 'more object(s) with the same backtrace') + +if [ "$dedup_lines" -gt 0 ]; then + pass "no dedup violations across $SCAN_COUNT scans; dedup fired ($dedup_lines summary line(s) observed)" +else + pass "no dedup violations across $SCAN_COUNT scans; dedup had nothing to collapse" +fi From 9012c4e647df9a3c5450dcccd766877a3efebc46 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Fri, 8 May 2026 18:18:15 -0700 Subject: [PATCH 121/321] mm/damon: replace damon_rand() with a per-ctx lockless PRNG damon_rand() on the sampling_addr hot path called get_random_u32_below(), which takes a local_lock_irqsave() around a per-CPU batched entropy pool and periodically refills it with ChaCha20. At elevated nr_regions counts (20k+), the lock_acquire / local_lock pair plus __get_random_u32_below() dominate kdamond perf profiles. Replace the helper with a lockless lfsr113 generator (struct rnd_state) held per damon_ctx and seeded from get_random_u64() in damon_new_ctx(). kdamond is the single consumer of a given ctx, so no synchronization is required. Range mapping uses traditional reciprocal multiplication, similar as get_random_u32_below(); for spans larger than U32_MAX (only reachable on 64-bit) the slow path combines two u32 outputs and uses mul_u64_u64_shr() at 64-bit width. On 32-bit the slow path is dead code and gets eliminated by the compiler. The new helper takes a ctx parameter; damon_split_regions_of() and the kunit tests that call it directly are updated accordingly. lfsr113 is a linear PRNG and MUST NOT be used for anything security-sensitive. DAMON's sampling_addr is not exposed to userspace and is only consumed as a probe point for PTE accessed-bit sampling, so a non-cryptographic PRNG is appropriate here. Tested with paddr monitoring and max_nr_regions=20000: kdamond CPU usage reduced from ~72% to ~50% of one core. 
Link: https://lore.kernel.org/20260505145212.108644-1-jiayuan.chen@linux.dev Link: https://lore.kernel.org/damon/20260426173346.86238-1-sj@kernel.org/T/#m4f1fd74112728f83a41511e394e8c3fef703039c Link: https://lore.kernel.org/20260509011816.85145-1-sj@kernel.org Signed-off-by: Jiayuan Chen Signed-off-by: SeongJae Park Reviewed-by: SeongJae Park Cc: Shu Anzai Cc: Quanmin Yan Signed-off-by: Andrew Morton --- include/linux/damon.h | 28 +++++++++++++++++++++------- mm/damon/core.c | 12 ++++++++---- mm/damon/paddr.c | 8 ++++---- mm/damon/tests/core-kunit.h | 28 ++++++++++++++++++++++------ mm/damon/vaddr.c | 7 ++++--- 5 files changed, 59 insertions(+), 24 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index c7a31572689b..4d4f031bcb45 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -8,23 +8,18 @@ #ifndef _DAMON_H_ #define _DAMON_H_ +#include #include #include +#include #include #include -#include /* Minimal region size. Every damon_region is aligned by this. */ #define DAMON_MIN_REGION_SZ PAGE_SIZE /* Max priority score for DAMON-based operation schemes */ #define DAMOS_MAX_SCORE (99) -/* Get a random number in [l, r) */ -static inline unsigned long damon_rand(unsigned long l, unsigned long r) -{ - return l + get_random_u32_below(r - l); -} - /** * struct damon_addr_range - Represents an address region of [@start, @end). * @start: Start address of the region (inclusive). @@ -859,8 +854,27 @@ struct damon_ctx { struct list_head adaptive_targets; struct list_head schemes; + + /* Per-ctx PRNG state for damon_rand(); kdamond is the sole consumer. */ + struct rnd_state rnd_state; }; +/* Get a random number in [@l, @r) using @ctx's lockless PRNG. 
*/ +static inline unsigned long damon_rand(struct damon_ctx *ctx, + unsigned long l, unsigned long r) +{ + unsigned long span = r - l; + u64 rnd; + + if (span <= U32_MAX) { + rnd = prandom_u32_state(&ctx->rnd_state); + return l + (unsigned long)((rnd * span) >> 32); + } + rnd = ((u64)prandom_u32_state(&ctx->rnd_state) << 32) | + prandom_u32_state(&ctx->rnd_state); + return l + mul_u64_u64_shr(rnd, span, 64); +} + static inline struct damon_region *damon_next_region(struct damon_region *r) { return container_of(r->list.next, struct damon_region, list); diff --git a/mm/damon/core.c b/mm/damon/core.c index 9f38deddcb30..3a8725e400c6 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -611,6 +611,8 @@ struct damon_ctx *damon_new_ctx(void) INIT_LIST_HEAD(&ctx->adaptive_targets); INIT_LIST_HEAD(&ctx->schemes); + prandom_seed_state(&ctx->rnd_state, get_random_u64()); + return ctx; } @@ -2939,8 +2941,9 @@ static void damon_split_region_at(struct damon_target *t, } /* Split every region in the given target into 'nr_subs' regions */ -static void damon_split_regions_of(struct damon_target *t, int nr_subs, - unsigned long min_region_sz) +static void damon_split_regions_of(struct damon_ctx *ctx, + struct damon_target *t, int nr_subs, + unsigned long min_region_sz) { struct damon_region *r, *next; unsigned long sz_region, sz_sub = 0; @@ -2955,7 +2958,7 @@ static void damon_split_regions_of(struct damon_target *t, int nr_subs, * Randomly select size of left sub-region to be at * least 10 percent and at most 90% of original region */ - sz_sub = ALIGN_DOWN(damon_rand(1, 10) * + sz_sub = ALIGN_DOWN(damon_rand(ctx, 1, 10) * sz_region / 10, min_region_sz); /* Do not allow blank region */ if (sz_sub == 0 || sz_sub >= sz_region) @@ -2996,7 +2999,8 @@ static void kdamond_split_regions(struct damon_ctx *ctx) nr_subregions = 3; damon_for_each_target(t, ctx) - damon_split_regions_of(t, nr_subregions, ctx->min_region_sz); + damon_split_regions_of(ctx, t, nr_subregions, + 
ctx->min_region_sz); last_nr_regions = nr_regions; } diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 5cdcc5037cbc..c4738cd5e221 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -49,11 +49,11 @@ static void damon_pa_mkold(phys_addr_t paddr) } static void __damon_pa_prepare_access_check(struct damon_region *r, - unsigned long addr_unit) + struct damon_ctx *ctx) { - r->sampling_addr = damon_rand(r->ar.start, r->ar.end); + r->sampling_addr = damon_rand(ctx, r->ar.start, r->ar.end); - damon_pa_mkold(damon_pa_phys_addr(r->sampling_addr, addr_unit)); + damon_pa_mkold(damon_pa_phys_addr(r->sampling_addr, ctx->addr_unit)); } static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) @@ -63,7 +63,7 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { damon_for_each_region(r, t) - __damon_pa_prepare_access_check(r, ctx->addr_unit); + __damon_pa_prepare_access_check(r, ctx); } } diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 1b23a22ac04c..866f716e5760 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -273,54 +273,70 @@ static void damon_test_merge_regions_of(struct kunit *test) static void damon_test_split_regions_of(struct kunit *test) { + struct damon_ctx *c; struct damon_target *t; struct damon_region *r; unsigned long sa[] = {0, 300, 500}; unsigned long ea[] = {220, 400, 700}; int i; + c = damon_new_ctx(); + if (!c) + kunit_skip(test, "ctx alloc fail"); + t = damon_new_target(); - if (!t) + if (!t) { + damon_destroy_ctx(c); kunit_skip(test, "target alloc fail"); + } r = damon_new_region(0, 22); if (!r) { damon_free_target(t); + damon_destroy_ctx(c); kunit_skip(test, "region alloc fail"); } damon_add_region(r, t); - damon_split_regions_of(t, 2, 1); + damon_split_regions_of(c, t, 2, 1); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u); damon_free_target(t); t = damon_new_target(); - if (!t) + if (!t) { + damon_destroy_ctx(c); kunit_skip(test, "second 
target alloc fail"); + } r = damon_new_region(0, 220); if (!r) { damon_free_target(t); + damon_destroy_ctx(c); kunit_skip(test, "second region alloc fail"); } damon_add_region(r, t); - damon_split_regions_of(t, 4, 1); + damon_split_regions_of(c, t, 4, 1); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u); damon_free_target(t); t = damon_new_target(); - if (!t) + if (!t) { + damon_destroy_ctx(c); kunit_skip(test, "third target alloc fail"); + } for (i = 0; i < ARRAY_SIZE(sa); i++) { r = damon_new_region(sa[i], ea[i]); if (!r) { damon_free_target(t); + damon_destroy_ctx(c); kunit_skip(test, "region alloc fail"); } damon_add_region(r, t); } - damon_split_regions_of(t, 4, 5); + damon_split_regions_of(c, t, 4, 5); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 12u); damon_for_each_region(r, t) KUNIT_EXPECT_GE(test, damon_sz_region(r) % 5ul, 0ul); damon_free_target(t); + + damon_destroy_ctx(c); } static void damon_test_ops_registration(struct kunit *test) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index dd5f2d7027ac..1b0ebe3b6951 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -333,9 +333,10 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) */ static void __damon_va_prepare_access_check(struct mm_struct *mm, - struct damon_region *r) + struct damon_region *r, + struct damon_ctx *ctx) { - r->sampling_addr = damon_rand(r->ar.start, r->ar.end); + r->sampling_addr = damon_rand(ctx, r->ar.start, r->ar.end); damon_va_mkold(mm, r->sampling_addr); } @@ -351,7 +352,7 @@ static void damon_va_prepare_access_checks(struct damon_ctx *ctx) if (!mm) continue; damon_for_each_region(r, t) - __damon_va_prepare_access_check(mm, r); + __damon_va_prepare_access_check(mm, r, ctx); mmput(mm); } } From a1e6b0968833c2dd6193d05daf5700f9e0492126 Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Sat, 9 May 2026 08:56:31 +0800 Subject: [PATCH 122/321] proc/meminfo: expose per-node balloon pages in node meminfo Commit 835de37603ef ("meminfo: add a per node counter for balloon 
drivers") added NR_BALLOON_PAGES and exposed it in /proc/meminfo. However, the per-node view at /sys/devices/system/node/nodeX/meminfo was not updated, even though the counter is already tracked per-node. Add it to node_read_meminfo() so users can see balloon usage per NUMA node without having to parse the raw vmstat file. Link: https://lore.kernel.org/20260509005631.17183-1-hao.ge@linux.dev Signed-off-by: Hao Ge Acked-by: David Hildenbrand (Arm) Cc: Danilo Krummrich Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton --- drivers/base/node.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/base/node.c b/drivers/base/node.c index 126f66aa2c3e..f4d9a21cc24e 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -523,6 +523,7 @@ static ssize_t node_read_meminfo(struct device *dev, #ifdef CONFIG_UNACCEPTED_MEMORY "Node %d Unaccepted: %8lu kB\n" #endif + "Node %d Balloon: %8lu kB\n" "Node %d GPUActive: %8lu kB\n" "Node %d GPUReclaim: %8lu kB\n" , @@ -559,6 +560,7 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(sum_zone_node_page_state(nid, NR_UNACCEPTED)) #endif , + nid, K(node_page_state(pgdat, NR_BALLOON_PAGES)), nid, K(node_page_state(pgdat, NR_GPU_ACTIVE)), nid, K(node_page_state(pgdat, NR_GPU_RECLAIM)) ); From ce872d5a5955bc0e8a9f5c7d3fad85212c13030d Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 11 May 2026 16:43:07 +0800 Subject: [PATCH 123/321] mm/memory_hotplug: factor out altmap freeing checks Use a small helper to centralize altmap freeing after verifying that all vmemmap pages were released. This keeps the check consistent between the normal teardown path and the memory hotplug error paths. 
Link: https://lore.kernel.org/20260511084307.1827127-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Suggested-by: David Hildenbrand (Arm) Acked-by: David Hildenbrand (Arm) Acked-by: Oscar Salvador Reviewed-by: Donet Tom Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 462d8dcd636d..af5489f03771 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1403,6 +1403,12 @@ bool mhp_supports_memmap_on_memory(void) } EXPORT_SYMBOL_GPL(mhp_supports_memmap_on_memory); +static void altmap_free(struct vmem_altmap *altmap) +{ + WARN_ONCE(altmap->alloc, "Altmap not fully unmapped"); + kfree(altmap); +} + static void remove_memory_blocks_and_altmaps(u64 start, u64 size) { unsigned long memblock_size = memory_block_size_bytes(); @@ -1427,12 +1433,8 @@ static void remove_memory_blocks_and_altmaps(u64 start, u64 size) put_device(&mem->dev); remove_memory_block_devices(cur_start, memblock_size); - arch_remove_memory(cur_start, memblock_size, altmap, NULL); - - /* Verify that all vmemmap pages have actually been freed. 
*/ - WARN(altmap->alloc, "Altmap not fully unmapped"); - kfree(altmap); + altmap_free(altmap); } } @@ -1463,7 +1465,7 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, /* call arch's memory hotadd */ ret = arch_add_memory(nid, cur_start, memblock_size, &params); if (ret < 0) { - kfree(params.altmap); + altmap_free(params.altmap); goto out; } @@ -1472,7 +1474,7 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, params.altmap, group); if (ret) { arch_remove_memory(cur_start, memblock_size, params.altmap, NULL); - kfree(params.altmap); + altmap_free(params.altmap); goto out; } } From 2b117897d5c7c5ffdaca3ea40aa7658c54ae7cb8 Mon Sep 17 00:00:00 2001 From: Hongfu Li Date: Tue, 12 May 2026 18:13:05 +0800 Subject: [PATCH 124/321] selftests/mm: fix mmap() return value check in run_migration_benchmark mmap() returns MAP_FAILED on error, not NULL. The current check uses !buffer->ptr, which evaluates to false when mmap() fails (since MAP_FAILED is (void *)-1, not 0), so the error path is never taken. 
Link: https://lore.kernel.org/20260512101305.139509-1-lihongfu@kylinos.cn Signed-off-by: Hongfu Li Acked-by: David Hildenbrand (Arm) Reviewed-by: Dev Jain Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Donet Tom Reviewed-by: Lorenzo Stoakes Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hmm-tests.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c index 77fb4c5d871b..7a4daadfb0c8 100644 --- a/tools/testing/selftests/mm/hmm-tests.c +++ b/tools/testing/selftests/mm/hmm-tests.c @@ -2738,7 +2738,7 @@ static inline int run_migration_benchmark(int fd, int use_thp, size_t buffer_siz buffer->ptr = mmap(NULL, buffer_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (!buffer->ptr) + if (buffer->ptr == MAP_FAILED) return -1; /* Apply THP hint if requested */ From 42791eddab096b67e368ff0c1f3e331b4b72971a Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 11 May 2026 16:05:29 +0200 Subject: [PATCH 125/321] sparc/mm: remove register_page_bootmem_info() Patch series "mm: remove CONFIG_HAVE_BOOTMEM_INFO_NODE (Part 1)". We want to remove CONFIG_HAVE_BOOTMEM_INFO_NODE. As a first step, let's limit the remaining harm to x86 and core code, removing sparc, ppc and s390 leftovers, starting the stepwise removal by removing and simplifying some code. Once a related x86 vmemmap fix [1] is in, we can merge part 2 that will remove CONFIG_HAVE_BOOTMEM_INFO_NODE entirely. Tested on x86-64 with hugetlb vmemmap optimization in combination with KMEMLEAK, making sure that the problem reported in dd0ff4d12dd2 ("bootmem: remove the vmemmap pages from kmemleak in put_page_bootmem") does not reappear -- hoping I managed to trigger the original problem. This patch (of 8): sparc does not select CONFIG_HAVE_BOOTMEM_INFO_NODE, therefore, register_page_bootmem_info_node() is a nop. Let's just get rid of register_page_bootmem_info(). 
Link: https://lore.kernel.org/20260511-bootmem_info_prep-v1-0-3fb0be6fc688@kernel.org Link: https://lore.kernel.org/20260511-bootmem_info_prep-v1-1-3fb0be6fc688@kernel.org Link: https://lore.kernel.org/r/20260429-vmemmap-v2-1-8dfcacffd877@kernel.org [1] Signed-off-by: David Hildenbrand (Arm) Acked-by: Oscar Salvador Acked-by: Michal Hocko Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Lance Yang Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Christian Borntraeger Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Ritesh Harjani (IBM) Signed-off-by: Andrew Morton --- arch/sparc/mm/init_64.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 367c269305e5..3b679b1d1d72 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -2477,17 +2476,6 @@ int page_in_phys_avail(unsigned long paddr) return 0; } -static void __init register_page_bootmem_info(void) -{ -#ifdef CONFIG_NUMA - int i; - - for_each_online_node(i) - if (NODE_DATA(i)->node_spanned_pages) - register_page_bootmem_info_node(NODE_DATA(i)); -#endif -} - void __init arch_setup_zero_pages(void) { phys_addr_t zero_page_pa = kern_base + @@ -2498,14 +2486,6 @@ void __init arch_setup_zero_pages(void) void __init mem_init(void) { - /* - * Must be done after boot memory is put on freelist, because here we - * might set fields in deferred struct pages that have not yet been - * initialized, and memblock_free_all() initializes all the reserved - * deferred pages for us. 
- */ - register_page_bootmem_info(); - if (tlb_type == cheetah || tlb_type == cheetah_plus) cheetah_ecache_flush_init(); } From bf45fe08b0685435320ffa5179714559024ec302 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 11 May 2026 16:05:30 +0200 Subject: [PATCH 126/321] mm/bootmem_info: drop initialization of page->lru In the past, we used to store the type in page->lru.next, introduced by commit 5f24ce5fd34c ("thp: remove PG_buddy"). The location changed over the years; ever since commit 0386aaa6e9c8 ("bootmem: stop using page->index"), we store it alongside the info in page->private. Consequently, there is no need to reset page->lru anymore. Link: https://lore.kernel.org/20260511-bootmem_info_prep-v1-2-3fb0be6fc688@kernel.org Signed-off-by: David Hildenbrand (Arm) Acked-by: Oscar Salvador Acked-by: Michal Hocko Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Lance Yang Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Christian Borntraeger Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Ritesh Harjani (IBM) Signed-off-by: Andrew Morton --- mm/bootmem_info.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c index 3d7675a3ae04..a0a1ecdec8d0 100644 --- a/mm/bootmem_info.c +++ b/mm/bootmem_info.c @@ -34,7 +34,6 @@ void put_page_bootmem(struct page *page) if (page_ref_dec_return(page) == 1) { ClearPagePrivate(page); set_page_private(page, 0); - INIT_LIST_HEAD(&page->lru); kmemleak_free_part_phys(PFN_PHYS(page_to_pfn(page)), PAGE_SIZE); free_reserved_page(page); } From 7cb87e71e55bb8f3b234ea173964cd53278af11e Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 11 May 2026 16:05:31 +0200 Subject: [PATCH 127/321] mm/bootmem_info: stop using PG_private Nobody checks PG_private for these pages, and we can happily use set_page_private() without setting PG_private. So let's just stop setting/clearing PG_private. Link: https://lore.kernel.org/20260511-bootmem_info_prep-v1-3-3fb0be6fc688@kernel.org Signed-off-by: David Hildenbrand (Arm) Acked-by: Oscar Salvador Acked-by: Michal Hocko Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Lance Yang Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Christian Borntraeger Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Ritesh Harjani (IBM) Signed-off-by: Andrew Morton --- mm/bootmem_info.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c index a0a1ecdec8d0..6e2aaab3dca9 100644 --- a/mm/bootmem_info.c +++ b/mm/bootmem_info.c @@ -19,7 +19,6 @@ void get_page_bootmem(unsigned long info, struct page *page, { BUG_ON(type > 0xf); BUG_ON(info > (ULONG_MAX >> 4)); - SetPagePrivate(page); set_page_private(page, info << 4 | type); page_ref_inc(page); } @@ -32,7 +31,6 @@ void put_page_bootmem(struct page *page) type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); if (page_ref_dec_return(page) == 1) { - ClearPagePrivate(page); set_page_private(page, 0); kmemleak_free_part_phys(PFN_PHYS(page_to_pfn(page)), PAGE_SIZE); free_reserved_page(page); From cf49b4ebd2ae13554c780eb482e7900447f29ce9 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 11 May 2026 16:05:32 +0200 Subject: [PATCH 128/321] mm/bootmem_info: remove call to kmemleak_free_part_phys() The call to kmemleak_free_part_phys() was added in 2022 in commit dd0ff4d12dd2 ("bootmem: remove the vmemmap pages from kmemleak in put_page_bootmem"). In 2025, commit b2aad24b5333 ("mm/memmap: prevent double scanning of memmap by kmemleak") started to use MEMBLOCK_ALLOC_NOLEAKTRACE when allocating the memmap to skip the kmemleak_alloc_phys() in the buddy. So remove the call to kmemleak_free_part_phys(). If this would still be required for other purposes, either free_reserved_page() should take care of it, or selected users. Link: https://lore.kernel.org/20260511-bootmem_info_prep-v1-4-3fb0be6fc688@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Oscar Salvador Reviewed-by: Mike Rapoport (Microsoft) Tested-by: Lance Yang Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Christian Borntraeger Cc: David S. 
Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Nicholas Piggin Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Ritesh Harjani (IBM) Signed-off-by: Andrew Morton --- include/linux/bootmem_info.h | 1 - mm/bootmem_info.c | 1 - 2 files changed, 2 deletions(-) diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h index 492ceeb1cdf8..f724340755e5 100644 --- a/include/linux/bootmem_info.h +++ b/include/linux/bootmem_info.h @@ -82,7 +82,6 @@ static inline void get_page_bootmem(unsigned long info, struct page *page, static inline void free_bootmem_page(struct page *page) { - kmemleak_free_part_phys(PFN_PHYS(page_to_pfn(page)), PAGE_SIZE); free_reserved_page(page); } #endif diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c index 6e2aaab3dca9..74c1116626c8 100644 --- a/mm/bootmem_info.c +++ b/mm/bootmem_info.c @@ -32,7 +32,6 @@ void put_page_bootmem(struct page *page) if (page_ref_dec_return(page) == 1) { set_page_private(page, 0); - kmemleak_free_part_phys(PFN_PHYS(page_to_pfn(page)), PAGE_SIZE); free_reserved_page(page); } } From 0928e9050da334f629d0e4b97c5462aa90023c65 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 11 May 2026 16:05:33 +0200 Subject: [PATCH 129/321] mm/bootmem_info: stop marking the pgdat as NODE_INFO We removed the last user of NODE_INFO in commit 119c31caa59e ("mm/sparse: remove !CONFIG_SPARSEMEM_VMEMMAP leftovers for CONFIG_MEMORY_HOTPLUG"). But it really was never used for anything besides safety-checks ever since it was introduced in commit 04753278769f ("memory hotplug: register section/node id to free"), where we had the comment: 5) The node information like pgdat has similar issues. But, this will be able to be solved too by this. (Not implemented yet, but, remembering node id in the pages.) 
Of course, that never happened, and we are not planning on freeing the node data (pgdat/pglist_data), during memory hotunplug. So let's just stop marking the pgdat as NODE_INFO. Link: https://lore.kernel.org/20260511-bootmem_info_prep-v1-5-3fb0be6fc688@kernel.org Signed-off-by: David Hildenbrand (Arm) Acked-by: Oscar Salvador Acked-by: Michal Hocko Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Lance Yang Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Christian Borntraeger Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Ritesh Harjani (IBM) Signed-off-by: Andrew Morton --- mm/bootmem_info.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c index 74c1116626c8..cce1d560f094 100644 --- a/mm/bootmem_info.c +++ b/mm/bootmem_info.c @@ -62,15 +62,8 @@ static void __init register_page_bootmem_info_section(unsigned long start_pfn) void __init register_page_bootmem_info_node(struct pglist_data *pgdat) { - unsigned long i, pfn, end_pfn, nr_pages; + unsigned long pfn, end_pfn; int node = pgdat->node_id; - struct page *page; - - nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; - page = virt_to_page(pgdat); - - for (i = 0; i < nr_pages; i++, page++) - get_page_bootmem(node, page, NODE_INFO); pfn = pgdat->node_start_pfn; end_pfn = pgdat_end_pfn(pgdat); From ae751d567baa08342e5e34b378b72a6f9b2cfada Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 11 May 2026 16:05:34 +0200 Subject: [PATCH 130/321] mm/bootmem_info: stop marking mem_section_usage as MIX_SECTION_INFO We never free the ms->usage data for boot memory sections (see section_deactivate()). And to identify whether ms->usage was allocated from memblock, we simply identify it by looking at PG_reserved. 
Consequently, there is no need to mark ms->usage as MIX_SECTION_INFO. Let's just stop doing that. Link: https://lore.kernel.org/20260511-bootmem_info_prep-v1-6-3fb0be6fc688@kernel.org Signed-off-by: David Hildenbrand (Arm) Acked-by: Oscar Salvador Acked-by: Michal Hocko Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Lance Yang Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Christian Borntraeger Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Ritesh Harjani (IBM) Signed-off-by: Andrew Morton --- mm/bootmem_info.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c index cce1d560f094..0fa78db7fbc0 100644 --- a/mm/bootmem_info.c +++ b/mm/bootmem_info.c @@ -38,10 +38,8 @@ void put_page_bootmem(struct page *page) static void __init register_page_bootmem_info_section(unsigned long start_pfn) { - unsigned long mapsize, section_nr, i; + unsigned long section_nr; struct mem_section *ms; - struct mem_section_usage *usage; - struct page *page; start_pfn = SECTION_ALIGN_DOWN(start_pfn); section_nr = pfn_to_section_nr(start_pfn); @@ -50,14 +48,6 @@ static void __init register_page_bootmem_info_section(unsigned long start_pfn) if (!preinited_vmemmap_section(ms)) register_page_bootmem_memmap(section_nr, pfn_to_page(start_pfn), PAGES_PER_SECTION); - - usage = ms->usage; - page = virt_to_page(usage); - - mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT; - - for (i = 0; i < mapsize; i++, page++) - get_page_bootmem(section_nr, page, MIX_SECTION_INFO); } void __init register_page_bootmem_info_node(struct pglist_data *pgdat) From 0b7bf4bd1a7440e1c74c725984f4e20990854b37 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 11 May 2026 16:05:35 +0200 Subject: [PATCH 131/321] s390/mm: use 
free_reserved_page() in vmem_free_pages() We never select CONFIG_HAVE_BOOTMEM_INFO_NODE on s390. Therefore, free_bootmem_page() nowadays always translates to free_reserved_page(). Let's use free_reserved_page() to replace the free_bootmem_page() loop. We can stop including bootmem_info.h. Likely, vmemmap freeing code could be factored out into the core in the future. Link: https://lore.kernel.org/20260511-bootmem_info_prep-v1-7-3fb0be6fc688@kernel.org Signed-off-by: David Hildenbrand (Arm) Acked-by: Heiko Carstens Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Lance Yang Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Christian Borntraeger Cc: David S. Miller Cc: Gerald Schaefer Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Ritesh Harjani (IBM) Signed-off-by: Andrew Morton --- arch/s390/mm/vmem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index eeadff45e0e1..d8b2a60e0c33 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -4,7 +4,6 @@ */ #include -#include #include #include #include @@ -51,7 +50,7 @@ static void vmem_free_pages(unsigned long addr, int order, struct vmem_altmap *a if (PageReserved(page)) { /* allocated from memblock */ while (nr_pages--) - free_bootmem_page(page++); + free_reserved_page(page++); } else { free_pages(addr, order); } From 14d1948fa2384d0208f32be4a046d6edbf9fcc43 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 11 May 2026 16:05:36 +0200 Subject: [PATCH 132/321] powerpc/mm: remove CONFIG_HAVE_BOOTMEM_INFO_NODE register_page_bootmem_info_node() essentially only calls register_page_bootmem_memmap(). However, on powerpc that function is a nop. So there is no benefit in using CONFIG_HAVE_BOOTMEM_INFO_NODE anymore, let's just drop it. 
We can stop including bootmem_info.h. Link: https://lore.kernel.org/20260511-bootmem_info_prep-v1-8-3fb0be6fc688@kernel.org Signed-off-by: David Hildenbrand (Arm) Acked-by: Oscar Salvador Acked-by: Michal Hocko Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Lance Yang Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Christian Borntraeger Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/powerpc/mm/init_64.c | 8 -------- mm/Kconfig | 2 +- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index b6f3ae03ca9e..64f0df5bb5cd 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -41,7 +41,6 @@ #include #include #include -#include #include #include @@ -388,13 +387,6 @@ void __ref vmemmap_free(unsigned long start, unsigned long end, #endif -#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE -void register_page_bootmem_memmap(unsigned long section_nr, - struct page *start_page, unsigned long size) -{ -} -#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ - #endif /* CONFIG_SPARSEMEM_VMEMMAP */ #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/mm/Kconfig b/mm/Kconfig index e221fa1dc54d..97b079372325 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -537,7 +537,7 @@ endchoice config MEMORY_HOTREMOVE bool "Allow for memory hot remove" - select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64) + select HAVE_BOOTMEM_INFO_NODE if X86_64 depends on MEMORY_HOTPLUG select MIGRATION From 6d544529f97167162486e93bea035e08daa4d053 Mon Sep 17 00:00:00 2001 From: Vineet Agarwal Date: Tue, 12 May 2026 13:19:24 +0530 Subject: [PATCH 133/321] selftests/mm: check file initialization writes in split_huge_page_test create_pagecache_thp_and_fd() fills the backing file for the pagecache THP 
tests using repeated write() calls, but the return value is never checked. If a write fails or completes only partially, the test may continue with an incompletely initialized file and produce misleading results. Check the result of write() and fail the test if the expected number of bytes was not written. [akpm@linux-foundation.org: remove unneeded local, per David] Link: https://lore.kernel.org/da82de92-29d8-457c-9f65-40fc4900b922@kernel.org Link: https://lore.kernel.org/20260512074924.27721-1-agarwal.vineet2006@gmail.com Signed-off-by: Vineet Agarwal Acked-by: David Hildenbrand (Arm) Cc: Lorenzo Stoakes Cc: Wei Yang Cc: Vineet Agarwal Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/split_huge_page_test.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index 500d07c4938b..a8725942ee51 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -609,9 +609,13 @@ static int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, assert(fd_size % sizeof(buf) == 0); for (i = 0; i < sizeof(buf); i++) buf[i] = (unsigned char)i; - for (i = 0; i < fd_size; i += sizeof(buf)) - write(*fd, buf, sizeof(buf)); - + for (i = 0; i < fd_size; i += sizeof(buf)) { + if (write(*fd, buf, sizeof(buf)) != sizeof(buf)) { + ksft_perror("write testfile"); + close(*fd); + goto err_out_unlink; + } + } close(*fd); sync(); *fd = open("/proc/sys/vm/drop_caches", O_WRONLY); From 9b1b295e9fd354b2263aee80a1ef3605d1eee32e Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 12 May 2026 15:26:35 +0800 Subject: [PATCH 134/321] drivers/base/memory: make memory block get/put explicit Rename the memory block lookup helper to make the acquired reference explicit, add memory_block_put() to wrap put_device(), remove find_memory_block(), and use memory_block_get() as the 
single block-id based lookup interface. This makes it clearer to callers that a successful lookup holds a reference that must be dropped, reducing the chance of forgetting the matching put and leaking the memory block device reference. Link: https://lore.kernel.org/linux-mm/7887915D-E598-42B3-9AFE-BFFBACE8DE2D@linux.dev/#t Link: https://lore.kernel.org/20260512072635.3969576-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Oscar Salvador Acked-by: David Hildenbrand (Arm) Acked-by: Michal Hocko Tested-by: Donet Tom Reviewed-by: Lorenzo Stoakes Tested-by: Sumanth Korikkar #s390 Cc: Richard Cheng Acked-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Danilo Krummrich Cc: Doug Anderson Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: Kees Cook Cc: Liam R. Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Nicholas Piggin Cc: "Rafael J. Wysocki" Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- .../platforms/pseries/hotplug-memory.c | 14 ++----- drivers/base/memory.c | 38 +++++++------------ drivers/base/node.c | 4 +- drivers/s390/char/sclp_mem.c | 17 ++++----- include/linux/memory.h | 7 +++- mm/memory_hotplug.c | 5 +-- 6 files changed, 35 insertions(+), 50 deletions(-) diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index b2f14db59034..5d3b51081ff3 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -164,13 +164,7 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb) static struct memory_block *lmb_to_memblock(struct drmem_lmb *lmb) { - unsigned long section_nr; - struct memory_block *mem_block; - - section_nr = pfn_to_section_nr(PFN_DOWN(lmb->base_addr)); - - mem_block = find_memory_block(section_nr); - return mem_block; + return memory_block_get(phys_to_block_id(lmb->base_addr)); } static int get_lmb_range(u32 
drc_index, int n_lmbs, @@ -220,7 +214,7 @@ static int dlpar_change_lmb_state(struct drmem_lmb *lmb, bool online) else rc = 0; - put_device(&mem_block->dev); + memory_block_put(mem_block); return rc; } @@ -319,12 +313,12 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb) rc = dlpar_offline_lmb(lmb); if (rc) { - put_device(&mem_block->dev); + memory_block_put(mem_block); return rc; } __remove_memory(lmb->base_addr, memory_block_size); - put_device(&mem_block->dev); + memory_block_put(mem_block); /* Update memory regions for memory remove */ memblock_remove(lmb->base_addr, memory_block_size); diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 6981b55d582a..d31a421f7483 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -649,7 +649,7 @@ int __weak arch_get_memory_phys_device(unsigned long start_pfn) * * Called under device_hotplug_lock. */ -struct memory_block *find_memory_block_by_id(unsigned long block_id) +struct memory_block *memory_block_get(unsigned long block_id) { struct memory_block *mem; @@ -659,16 +659,6 @@ struct memory_block *find_memory_block_by_id(unsigned long block_id) return mem; } -/* - * Called under device_hotplug_lock. 
- */ -struct memory_block *find_memory_block(unsigned long section_nr) -{ - unsigned long block_id = memory_block_id(section_nr); - - return find_memory_block_by_id(block_id); -} - static struct attribute *memory_memblk_attrs[] = { &dev_attr_phys_index.attr, &dev_attr_state.attr, @@ -701,7 +691,7 @@ static int __add_memory_block(struct memory_block *memory) ret = device_register(&memory->dev); if (ret) { - put_device(&memory->dev); + memory_block_put(memory); return ret; } ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory, @@ -795,9 +785,9 @@ static int add_memory_block(unsigned long block_id, int nid, unsigned long state struct memory_block *mem; int ret = 0; - mem = find_memory_block_by_id(block_id); + mem = memory_block_get(block_id); if (mem) { - put_device(&mem->dev); + memory_block_put(mem); return -EEXIST; } mem = kzalloc_obj(*mem); @@ -845,8 +835,8 @@ static void remove_memory_block(struct memory_block *memory) memory->group = NULL; } - /* drop the ref. we got via find_memory_block() */ - put_device(&memory->dev); + /* drop the ref. 
we got via memory_block_get() */ + memory_block_put(memory); device_unregister(&memory->dev); } @@ -880,7 +870,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size, end_block_id = block_id; for (block_id = start_block_id; block_id != end_block_id; block_id++) { - mem = find_memory_block_by_id(block_id); + mem = memory_block_get(block_id); if (WARN_ON_ONCE(!mem)) continue; remove_memory_block(mem); @@ -908,7 +898,7 @@ void remove_memory_block_devices(unsigned long start, unsigned long size) return; for (block_id = start_block_id; block_id != end_block_id; block_id++) { - mem = find_memory_block_by_id(block_id); + mem = memory_block_get(block_id); if (WARN_ON_ONCE(!mem)) continue; num_poisoned_pages_sub(-1UL, memblk_nr_poison(mem)); @@ -1015,12 +1005,12 @@ int walk_memory_blocks(unsigned long start, unsigned long size, return 0; for (block_id = start_block_id; block_id <= end_block_id; block_id++) { - mem = find_memory_block_by_id(block_id); + mem = memory_block_get(block_id); if (!mem) continue; ret = func(mem, arg); - put_device(&mem->dev); + memory_block_put(mem); if (ret) break; } @@ -1228,22 +1218,22 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, void memblk_nr_poison_inc(unsigned long pfn) { const unsigned long block_id = pfn_to_block_id(pfn); - struct memory_block *mem = find_memory_block_by_id(block_id); + struct memory_block *mem = memory_block_get(block_id); if (mem) { atomic_long_inc(&mem->nr_hwpoison); - put_device(&mem->dev); + memory_block_put(mem); } } void memblk_nr_poison_sub(unsigned long pfn, long i) { const unsigned long block_id = pfn_to_block_id(pfn); - struct memory_block *mem = find_memory_block_by_id(block_id); + struct memory_block *mem = memory_block_get(block_id); if (mem) { atomic_long_sub(i, &mem->nr_hwpoison); - put_device(&mem->dev); + memory_block_put(mem); } } diff --git a/drivers/base/node.c b/drivers/base/node.c index f4d9a21cc24e..3da91929ad4e 100644 --- a/drivers/base/node.c +++ 
b/drivers/base/node.c @@ -849,13 +849,13 @@ static void register_memory_blocks_under_nodes(void) for (block_id = start_block_id; block_id <= end_block_id; block_id++) { struct memory_block *mem; - mem = find_memory_block_by_id(block_id); + mem = memory_block_get(block_id); if (!mem) continue; memory_block_add_nid_early(mem, nid); do_register_memory_block_under_node(nid, mem); - put_device(&mem->dev); + memory_block_put(mem); } } diff --git a/drivers/s390/char/sclp_mem.c b/drivers/s390/char/sclp_mem.c index 78c054e26d17..6df1926d4c62 100644 --- a/drivers/s390/char/sclp_mem.c +++ b/drivers/s390/char/sclp_mem.c @@ -204,7 +204,7 @@ static ssize_t sclp_config_mem_store(struct kobject *kobj, struct kobj_attribute addr = sclp_mem->id * block_size; /* * Hold device_hotplug_lock when adding/removing memory blocks. - * Additionally, also protect calls to find_memory_block() and + * Additionally, also protect calls to memory_block_get() and * sclp_attach_storage(). */ rc = lock_device_hotplug_sysfs(); @@ -231,20 +231,19 @@ static ssize_t sclp_config_mem_store(struct kobject *kobj, struct kobj_attribute sclp_mem_change_state(addr, block_size, 0); goto out_unlock; } - mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(addr))); - put_device(&mem->dev); + mem = memory_block_get(phys_to_block_id(addr)); + memory_block_put(mem); WRITE_ONCE(sclp_mem->config, 1); } else { if (!sclp_mem->config) goto out_unlock; - mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(addr))); + mem = memory_block_get(phys_to_block_id(addr)); if (mem->state != MEM_OFFLINE) { - put_device(&mem->dev); + memory_block_put(mem); rc = -EBUSY; goto out_unlock; } - /* drop the ref just got via find_memory_block() */ - put_device(&mem->dev); + memory_block_put(mem); sclp_mem_change_state(addr, block_size, 0); __remove_memory(addr, block_size); #ifdef CONFIG_KASAN @@ -294,11 +293,11 @@ static ssize_t sclp_memmap_on_memory_store(struct kobject *kobj, struct kobj_att return rc; block_size = memory_block_size_bytes(); 
sclp_mem = container_of(kobj, struct sclp_mem, kobj); - mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(sclp_mem->id * block_size))); + mem = memory_block_get(phys_to_block_id(sclp_mem->id * block_size)); if (!mem) { WRITE_ONCE(sclp_mem->memmap_on_memory, value); } else { - put_device(&mem->dev); + memory_block_put(mem); rc = -EBUSY; } unlock_device_hotplug(); diff --git a/include/linux/memory.h b/include/linux/memory.h index 5bb5599c6b2b..463dc02f6cff 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -158,7 +158,11 @@ int create_memory_block_devices(unsigned long start, unsigned long size, void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); extern int memory_notify(enum memory_block_state state, void *v); -extern struct memory_block *find_memory_block(unsigned long section_nr); +struct memory_block *memory_block_get(unsigned long block_id); +static inline void memory_block_put(struct memory_block *mem) +{ + put_device(&mem->dev); +} typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); extern int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, walk_memory_blocks_func_t func); @@ -171,7 +175,6 @@ struct memory_group *memory_group_find_by_id(int mgid); typedef int (*walk_memory_groups_func_t)(struct memory_group *, void *); int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, struct memory_group *excluded, void *arg); -struct memory_block *find_memory_block_by_id(unsigned long block_id); #define hotplug_memory_notifier(fn, pri) ({ \ static __meminitdata struct notifier_block fn##_mem_nb =\ { .notifier_call = fn, .priority = pri };\ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index af5489f03771..7ac19fab2263 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1423,14 +1423,13 @@ static void remove_memory_blocks_and_altmaps(u64 start, u64 size) struct vmem_altmap *altmap = NULL; struct memory_block *mem; - 
mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(cur_start))); + mem = memory_block_get(phys_to_block_id(cur_start)); if (WARN_ON_ONCE(!mem)) continue; altmap = mem->altmap; mem->altmap = NULL; - /* drop the ref. we got via find_memory_block() */ - put_device(&mem->dev); + memory_block_put(mem); remove_memory_block_devices(cur_start, memblock_size); arch_remove_memory(cur_start, memblock_size, altmap, NULL); From d200cfc81c069e2192c6cc082c38d1c8b0427989 Mon Sep 17 00:00:00 2001 From: Vineet Agarwal Date: Tue, 12 May 2026 09:41:57 +0530 Subject: [PATCH 135/321] mm/damon/sysfs-schemes: fix double increment of nr_regions damos_sysfs_populate_region_dir() increments sysfs_regions->nr_regions twice when adding a new region: once explicitly before kobject_init_and_add(), and once again through the post-increment used for the kobject name. As a result, nr_regions no longer matches the actual number of live regions, and region directory names skip numbers (1, 3, 5, ...). Use the already incremented value for naming instead of incrementing nr_regions a second time. 
Link: https://lore.kernel.org/20260512041157.109845-1-agarwal.vineet2006@gmail.com Fixes: 66178e4ec30a ("mm/damon/sysfs: use damos_walk() for update_schemes_tried_{bytes,regions}") Signed-off-by: Vineet Agarwal Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index ab2153fff9a8..0d3021db0b99 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -2995,7 +2995,7 @@ void damos_sysfs_populate_region_dir(struct damon_sysfs_schemes *sysfs_schemes, if (kobject_init_and_add(&region->kobj, &damon_sysfs_scheme_region_ktype, &sysfs_regions->kobj, "%d", - sysfs_regions->nr_regions++)) { + sysfs_regions->nr_regions)) { kobject_put(&region->kobj); return; } From da7bfa6a39fd4d72e03b6bc5f01148ac22fd216e Mon Sep 17 00:00:00 2001 From: Liew Rui Yan Date: Fri, 1 May 2026 09:37:49 +0800 Subject: [PATCH 136/321] mm/damon/lru_sort: validate min_region_size to be power of 2 Patch series "mm/damon: validate min_region_size to be power of 2", v5. Problem ======= When a user sets an invalid 'addr_unit' (e.g., 3) via DAMON_LRU_SORT or DAMON_RECLAIM, 'min_region_sz' becomes a non-power-of-2 value. While damon_commit_ctx() correctly detects this and returns -EINVAL, it sets the 'maybe_corrupted' flag during this process. This flag causes the running kdamond to terminate. While the termination is a safety measure, it is suboptimal in this case because the error is just a simple invalid input from the user, which shouldn't necessitate stopping the kdamond. Solution ======== Add an early validation in damon_lru_sort_apply_parameters() and damon_reclaim_apply_parameters() to check 'min_region_sz' before any state change occurs. If it is non-power-of-2, return -EINVAL immediately, preventing 'maybe_corrupted' from being set. Patch 1 fixes the issue for DAMON_LRU_SORT. Patch 2 fixes the issue for DAMON_RECLAIM. 
This patch (of 2): Problem ======= When a user sets an invalid 'addr_unit' (e.g., 3) via DAMON_LRU_SORT, 'min_region_sz' becomes a non-power-of-2 value. While damon_commit_ctx() correctly detects this and returns -EINVAL, it sets the 'maybe_corrupted' flag during this process. This flag causes the running kdamond to terminate. While the termination is a safety measure, it is suboptimal in this case because the error is just a simple invalid input from the user, which shouldn't necessitate stopping the kdamond. Reproduction ============ 1. Enable DAMON_LRU_SORT 2. Set addr_unit=3 3. Commit inputs via 'commit_inputs' 4. Observe kdamond termination Solution ======== Add an early validation in damon_lru_sort_apply_parameters() to check 'min_region_sz' before any state change occurs. If it is non-power-of-2, return -EINVAL immediately, preventing 'maybe_corrupted' from being set. Link: https://lore.kernel.org/20260501013750.71704-1-aethernet65535@gmail.com Link: https://lore.kernel.org/20260501013750.71704-2-aethernet65535@gmail.com Signed-off-by: Liew Rui Yan Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 2eb559d913b6..eca88ed941b3 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -286,6 +286,11 @@ static int damon_lru_sort_apply_parameters(void) param_ctx->addr_unit = addr_unit; param_ctx->min_region_sz = max(DAMON_MIN_REGION_SZ / addr_unit, 1); + if (!is_power_of_2(param_ctx->min_region_sz)) { + err = -EINVAL; + goto out; + } + if (!damon_lru_sort_mon_attrs.sample_interval) { err = -EINVAL; goto out; From d52c1331d28f7a1ae1255cb64d652e86431e6a03 Mon Sep 17 00:00:00 2001 From: Liew Rui Yan Date: Fri, 1 May 2026 09:37:50 +0800 Subject: [PATCH 137/321] mm/damon/reclaim: validate min_region_size to be power of 2 Problem ======= When a user sets an invalid 'addr_unit' (e.g., 3) via DAMON_RECLAIM, 'min_region_sz' becomes a 
non-power-of-2 value. While damon_commit_ctx() correctly detects this and returns -EINVAL, it sets the 'maybe_corrupted' flag during this process. This flag causes the running kdamond to terminate. While the termination is a safety measure, it is suboptimal in this case because the error is just a simple invalid input from the user, which shouldn't necessitate stopping the kdamond. Reproduction ============ 1. Enable DAMON_RECLAIM 2. Set addr_unit=3 3. Commit inputs via 'commit_inputs' 4. Observe kdamond termination Solution ======== Add an early validation in damon_reclaim_apply_parameters() to check 'min_region_sz' before any state change occurs. If it is non-power-of-2, return -EINVAL immediately, preventing 'maybe_corrupted' from being set. Link: https://lore.kernel.org/20260501013750.71704-3-aethernet65535@gmail.com Signed-off-by: Liew Rui Yan Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 7126d47fb8b2..ed446d00ef1c 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -210,6 +210,11 @@ static int damon_reclaim_apply_parameters(void) param_ctx->addr_unit = addr_unit; param_ctx->min_region_sz = max(DAMON_MIN_REGION_SZ / addr_unit, 1); + if (!is_power_of_2(param_ctx->min_region_sz)) { + err = -EINVAL; + goto out; + } + if (!damon_reclaim_mon_attrs.aggr_interval) { err = -EINVAL; goto out; From 7201b96495522c9ed6fcd9a6b95c08e96c6b3df8 Mon Sep 17 00:00:00 2001 From: zenghongling Date: Mon, 11 May 2026 15:03:09 +0800 Subject: [PATCH 138/321] mm/percpu-internal.h: optimise pcpu_chunk struct to save memory Using pahole, we can see that there are some padding holes in the current pcpu_chunk structure. Adjusting the layout of pcpu_chunk can reduce these holes, decreasing its size from 192 bytes to 128 bytes and eliminating a wasted cache line.
With allmodconfig (CONFIG_PERCPU_STATS + NEED_PCPUOBJ_EXT) Before: /* size: 256, cachelines: 4, members: 19 */ After: /* size: 192, cachelines: 3, members: 19 */ with NEED_PCPUOBJ_EXT Before: struct pcpu_chunk { struct list_head list; /* 0 16 */ int free_bytes; /* 16 4 */ struct pcpu_block_md chunk_md; /* 20 32 */ /* XXX 4 bytes hole, try to pack */ long unsigned int * bound_map; /* 56 8 */ /* --- cacheline 1 boundary (64 bytes) --- */ void * base_addr __attribute__((__aligned__(64))); /* 64 8 */ long unsigned int * alloc_map; /* 72 8 */ struct pcpu_block_md * md_blocks; /* 80 8 */ void * data; /* 88 8 */ bool immutable; /* 96 1 */ bool isolated; /* 97 1 */ /* XXX 2 bytes hole, try to pack */ int start_offset; /* 100 4 */ int end_offset; /* 104 4 */ /* XXX 4 bytes hole, try to pack */ struct obj_cgroup * * obj_cgroups; /* 112 8 */ int nr_pages; /* 120 4 */ int nr_populated; /* 124 4 */ /* --- cacheline 2 boundary (128 bytes) --- */ int nr_empty_pop_pages; /* 128 4 */ /* XXX 4 bytes hole, try to pack */ long unsigned int populated[]; /* 136 0 */ /* size: 192, cachelines: 3, members: 17 */ /* sum members: 122, holes: 4, sum holes: 14 */ /* padding: 56 */ /* forced alignments: 1 */ } __attribute__((__aligned__(64))); After: struct pcpu_chunk { struct list_head list; /* 0 16 */ int free_bytes; /* 16 4 */ struct pcpu_block_md chunk_md; /* 20 32 */ /* XXX 4 bytes hole, try to pack */ long unsigned int * bound_map; /* 56 8 */ /* --- cacheline 1 boundary (64 bytes) --- */ void * base_addr __attribute__((__aligned__(64))); /* 64 8 */ long unsigned int * alloc_map; /* 72 8 */ struct pcpu_block_md * md_blocks; /* 80 8 */ void * data; /* 88 8 */ bool immutable; /* 96 1 */ bool isolated; /* 97 1 */ /* XXX 2 bytes hole, try to pack */ int start_offset; /* 100 4 */ int end_offset; /* 104 4 */ int nr_pages; /* 108 4 */ int nr_populated; /* 112 4 */ int nr_empty_pop_pages; /* 116 4 */ struct obj_cgroup * * obj_cgroups; /* 120 8 */ /* --- cacheline 2 boundary (128 bytes) --- */ long 
unsigned int populated[]; /* 128 0 */ /* size: 128, cachelines: 2, members: 17 */ /* sum members: 122, holes: 2, sum holes: 6 */ /* forced alignments: 1 */ } __attribute__((__aligned__(64))); Link: https://lore.kernel.org/20260511070309.44044-1-zenghongling@kylinos.cn Signed-off-by: zenghongling Suggested-by: Dennis Zhou Acked-by: Dennis Zhou Cc: Tejun Heo Signed-off-by: Andrew Morton --- mm/percpu-internal.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index 4b3d6ec43703..8cbe039bf847 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -77,13 +77,13 @@ struct pcpu_chunk { int end_offset; /* additional area required to have the region end page aligned */ + int nr_pages; /* # of pages served by this chunk */ + int nr_populated; /* # of populated pages */ + int nr_empty_pop_pages; /* # of empty populated pages */ #ifdef NEED_PCPUOBJ_EXT struct pcpuobj_ext *obj_exts; /* vector of object cgroups */ #endif - int nr_pages; /* # of pages served by this chunk */ - int nr_populated; /* # of populated pages */ - int nr_empty_pop_pages; /* # of empty populated pages */ unsigned long populated[]; /* populated bitmap */ }; From a9920428f19481d1227992ecbf1c73efd5b93001 Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Mon, 11 May 2026 10:54:07 +0800 Subject: [PATCH 139/321] mm/khugepaged: fix inconsistent MMF_VM_HUGEPAGE flag due to allocation failure order __khugepaged_enter() sets MMF_VM_HUGEPAGE before allocating the corresponding mm_slot. If mm_slot_alloc() fails, the function returns with the flag set but without inserting the mm into the khugepaged tracking structures, leaving the mm in an inconsistent state where future registration attempts are skipped. Fix this by reordering: allocate the mm_slot first, then check and set the flag. If the flag is already set, free the allocated slot and return. 
This ensures the flag is only set when the mm is successfully registered in the khugepaged tracking structures. Link: https://lore.kernel.org/20260511025408.54035-1-ye.liu@linux.dev Fixes: 16618670276a ("mm: khugepaged: avoid pointless allocation for "struct mm_slot"") Signed-off-by: Ye Liu Suggested-by: David Hildenbrand Reviewed-by: Lance Yang Acked-by: David Hildenbrand (Arm) Reviewed-by: Dev Jain Reviewed-by: Lorenzo Stoakes Reviewed-by: Baolin Wang Cc: Barry Song Cc: Liam R. Howlett Cc: Nico Pache Cc: Ryan Roberts Cc: Xin Hao Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 28a843f30b32..a4b97ec8ce56 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -437,13 +437,16 @@ void __khugepaged_enter(struct mm_struct *mm) /* __khugepaged_exit() must not run from under us */ VM_BUG_ON_MM(collapse_test_exit(mm), mm); - if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm))) - return; slot = mm_slot_alloc(mm_slot_cache); if (!slot) return; + if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm))) { + mm_slot_free(mm_slot_cache, slot); + return; + } + spin_lock(&khugepaged_mm_lock); mm_slot_insert(mm_slots_hash, mm, slot); /* From 62b21c6f1d88c66a200c2c54b704e503e2e5a60f Mon Sep 17 00:00:00 2001 From: David Carlier Date: Sun, 10 May 2026 19:37:00 +0100 Subject: [PATCH 140/321] mm/shrinker: avoid out-of-bounds read in set_shrinker_bit() set_shrinker_bit() reads info->unit[shrinker_id_to_index(shrinker_id)] before checking shrinker_id against info->map_nr_max, so an id past the currently visible map_nr_max reads past the unit[] array before the WARN_ON_ONCE() catches it. Determined from code inspection. Move the load into the bounded branch. 
Link: https://lore.kernel.org/20260510183700.102475-1-devnexen@gmail.com Fixes: 307bececcd12 ("mm: shrinker: add a secondary array for shrinker_info::{map, nr_deferred}") Signed-off-by: David Carlier Reviewed-by: Qi Zheng Acked-by: Muchun Song Cc: Dave Chinner Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/shrinker.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/shrinker.c b/mm/shrinker.c index 76b3f750cf65..49256f81199f 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -197,12 +197,13 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { struct shrinker_info *info; - struct shrinker_info_unit *unit; rcu_read_lock(); info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); - unit = info->unit[shrinker_id_to_index(shrinker_id)]; if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { + struct shrinker_info_unit *unit; + + unit = info->unit[shrinker_id_to_index(shrinker_id)]; /* Pairs with smp mb in shrink_slab() */ smp_mb__before_atomic(); set_bit(shrinker_id_to_offset(shrinker_id), unit->map); From fd003fac7cc7d98a942a0778de76683ab731dd9c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 12 May 2026 17:56:23 -0400 Subject: [PATCH 141/321] maple_tree: document that "last" in mtree_insert_range() is inclusive The kernel doc of mtree_insert_range() does not state if the address represented by the "last" parameter is inclusive or exclusive. This can lead to bugs by code that assumes it is exclusive. Explicitly state that the parameter is inclusive. Link: https://lore.kernel.org/20260512175623.4c5ca8d2@gandalf.local.home Signed-off-by: Steven Rostedt Reviewed-by: "Liam R. 
Howlett" Acked-by: SeongJae Park Cc: Alice Ryhl Cc: Andrew Ballance Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- lib/maple_tree.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 60ae5e6fc1ee..e52876435b77 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5727,13 +5727,16 @@ int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, EXPORT_SYMBOL(mtree_store); /** - * mtree_insert_range() - Insert an entry at a given range if there is no value. + * mtree_insert_range() - Insert an entry from [first, last] at a given range + * if there is no value. * @mt: The maple tree * @first: The start of the range - * @last: The end of the range + * @last: The end of the range (inclusive) * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations. * + * Note that @last is inclusive. That is, @last = @first + length - 1; + * * Return: 0 on success, -EEXISTS if the range is occupied, -EINVAL on invalid * request, -ENOMEM if memory could not be allocated. */ From c516c365d9915bafc3d2cdeac50a984da22729b5 Mon Sep 17 00:00:00 2001 From: Frederick Mayle Date: Tue, 12 May 2026 13:31:35 -0700 Subject: [PATCH 142/321] mm/readahead: add kerneldoc for read_pages Patch series "mm: document read_pages and simplify usage". Add a kerneldoc for read_pages() to formalize an invariant and then use it to simplify the callers in page_cache_ra_unbounded(). This patch (of 2): Formalize one of the invariants provided by the current implementation so that callers can depend on it, as discussed in [1]. 
Link: https://lore.kernel.org/all/20260501061146.6e61392d125cf1847d7cc181@linux-foundation.org/ [1] Link: https://lore.kernel.org/20260512203154.754075-2-fmayle@google.com Signed-off-by: Frederick Mayle Cc: Jan Kara Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/readahead.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mm/readahead.c b/mm/readahead.c index 8c12b63ccd4a..23bec5497308 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -146,6 +146,17 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) } EXPORT_SYMBOL_GPL(file_ra_state_init); +/** + * read_pages() - Start IO for a contiguous range of allocated folios in the + * page cache. + * @rac: Readahead control. + * + * When read_pages() returns, it is guaranteed that all of the folios will have + * been processed or removed so that ``readahead_count(rac) == 0``. However, + * that does not imply that ``readahead_index(rac)`` will be updated to point + * to the end of the originally requested range because, for example, the + * filesystem may expand the range upwards. + */ static void read_pages(struct readahead_control *rac) { const struct address_space_operations *aops = rac->mapping->a_ops; From 418bffb6ba2474f445305dd2a5173d8a9ce446b3 Mon Sep 17 00:00:00 2001 From: Frederick Mayle Date: Tue, 12 May 2026 13:31:36 -0700 Subject: [PATCH 143/321] mm/readahead: simplify page_cache_ra_unbounded loop counter reset Minor cleanup, no behavior change intended. `read_pages` ensures that `ractl->_nr_pages` is zero before it returns, so the `ractl->_nr_pages` term in these expressions contributes nothing. This seems to have been true since the statements were introduced in commit f615bd5c4725f ("mm/readahead: Handle ractl nr_pages being modified"). The new expression has an intuitive explanation. 
When filesystems perform readahead, they increment `ractl->_index` by the number of pages processed, so, after `read_pages` returns, `ractl->_index` points to the first page after those already processed. `index` points to the first page considered in the loop. So, `ractl->_index - index` is the number of pages processed by the loop so far. Link: https://lore.kernel.org/20260512203154.754075-3-fmayle@google.com Signed-off-by: Frederick Mayle Cc: Jan Kara Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/readahead.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 23bec5497308..42f2f20633b0 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -281,7 +281,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, */ read_pages(ractl); ractl->_index += min_nrpages; - i = ractl->_index + ractl->_nr_pages - index; + i = ractl->_index - index; continue; } @@ -297,7 +297,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, break; read_pages(ractl); ractl->_index += min_nrpages; - i = ractl->_index + ractl->_nr_pages - index; + i = ractl->_index - index; continue; } if (i == mark) From 8e0c2085c978ed6d9764d79fc785920360096f21 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 4 May 2026 12:06:37 +0200 Subject: [PATCH 144/321] lib/test_meminit: use && for bools As pointed out by Dan Carpenter, test_kmemcache() was using a bitwise AND on two bools instead of a boolean AND. Fix this for the sake of code cleanliness. 
Link: https://lore.kernel.org/20260504100637.1535762-1-glider@google.com Fixes: 5015a300a522 ("lib: introduce test_meminit module") Signed-off-by: Alexander Potapenko Reported-by: Dan Carpenter Closes: https://lore.kernel.org/kernel-janitors/afOcIan1ap9kD26M@stanley.mountain/ Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton --- lib/test_meminit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/test_meminit.c b/lib/test_meminit.c index 6298f66c964b..d028a6552cd6 100644 --- a/lib/test_meminit.c +++ b/lib/test_meminit.c @@ -387,7 +387,7 @@ static int __init test_kmemcache(int *total_failures) ctor = flags & 1; rcu = flags & 2; zero = flags & 4; - if (ctor & zero) + if (ctor && zero) continue; num_tests += do_kmem_cache_size(size, ctor, rcu, zero, &failures); From 13f263b60fee0c463f3a9a6c728cd010d8802d69 Mon Sep 17 00:00:00 2001 From: Vineet Agarwal Date: Mon, 4 May 2026 13:43:13 +0530 Subject: [PATCH 145/321] selftests/mm: ksm-functional-tests: fix partial write handling Update write() checks to properly detect and handle partial writes. Previously, the write() calls used <= 0 to detect failure. This condition is never true for partial writes (ret > 0 but ret < len), so partial writes were silently treated as success. Fix this by verifying that write() returns the full expected length and treating any mismatch as failure. 
Link: https://lore.kernel.org/20260504081638.683223-1-agarwal.vineet2006@gmail.com Signed-off-by: Vineet Agarwal Acked-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- .../selftests/mm/ksm_functional_tests.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index 8d874c4754f3..31c06c72203f 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -498,6 +498,7 @@ static void test_prctl_fork(void) static int start_ksmd_and_set_frequency(char *pages_to_scan, char *sleep_ms) { int ksm_fd; + size_t len; ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR); if (ksm_fd < 0) @@ -506,11 +507,13 @@ static int start_ksmd_and_set_frequency(char *pages_to_scan, char *sleep_ms) if (write(ksm_fd, "1", 1) != 1) return -errno; - if (write(pages_to_scan_fd, pages_to_scan, strlen(pages_to_scan)) <= 0) - return -errno; + len = strlen(pages_to_scan); + if (write(pages_to_scan_fd, pages_to_scan, len) != len) + return -1; - if (write(sleep_millisecs_fd, sleep_ms, strlen(sleep_ms)) <= 0) - return -errno; + len = strlen(sleep_ms); + if (write(sleep_millisecs_fd, sleep_ms, len) != len) + return -1; return 0; } @@ -526,11 +529,11 @@ static int stop_ksmd_and_restore_frequency(void) if (write(ksm_fd, "2", 1) != 1) return -errno; - if (write(pages_to_scan_fd, "100", 3) <= 0) - return -errno; + if (write(pages_to_scan_fd, "100", 3) != 3) + return -1; - if (write(sleep_millisecs_fd, "20", 2) <= 0) - return -errno; + if (write(sleep_millisecs_fd, "20", 2) != 2) + return -1; return 0; } From 7d40e6b66d97d7feef8ca3c096827fd24c6d623d Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sun, 3 May 2026 13:59:16 +0200 Subject: [PATCH 146/321] mm/mseal: 
use min/max in mseal_apply Use the type-checked min()/max() macros instead of MIN()/MAX(), which are supposed to be used "for obvious constants only". Link: https://lore.kernel.org/20260503115915.18680-3-thorsten.blum@linux.dev Signed-off-by: Thorsten Blum Reviewed-by: Pedro Falcato Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand (Arm) Reviewed-by: SeongJae Park Cc: Jann Horn Cc: Liam R. Howlett Cc: Thorsten Blum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/mseal.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/mseal.c b/mm/mseal.c index e2093ae3d25c..9781647483d1 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include @@ -65,8 +66,8 @@ static int mseal_apply(struct mm_struct *mm, prev = vma; for_each_vma_range(vmi, vma, end) { - const unsigned long curr_start = MAX(vma->vm_start, start); - const unsigned long curr_end = MIN(vma->vm_end, end); + const unsigned long curr_start = max(vma->vm_start, start); + const unsigned long curr_end = min(vma->vm_end, end); if (!vma_test(vma, VMA_SEALED_BIT)) { vma_flags_t vma_flags = vma->flags; From fb95c50921f0a65ef9fd734ae712e416db949d91 Mon Sep 17 00:00:00 2001 From: Sang-Heon Jeon Date: Sun, 3 May 2026 17:42:25 +0900 Subject: [PATCH 147/321] mm/hugetlb_cma: restrict hugetlb_cma parameter to gigantic-page alignment Existing hugetlb_cma parameter handling logic rejects sizes smaller than one gigantic page, but rounds up larger sizes that are not a multiple of it. The two behaviors are inconsistent and neither is documented. To remove existing inconsistent and undefined behavior, restrict hugetlb_cma parameter to only accept multiples of the gigantic page size. After this restriction, the redundant round_up() in the allocation loop can be removed. The new restriction is also documented in kernel-parameters.txt. Also, including other minor changes for readability improvement with no functional change. 
Link: https://lore.kernel.org/20260503084225.415980-1-ekffu200098@gmail.com Signed-off-by: Sang-Heon Jeon Suggested-by: Muchun Song Acked-by: Muchun Song Acked-by: Oscar Salvador Cc: David Hildenbrand Signed-off-by: Andrew Morton --- .../admin-guide/kernel-parameters.txt | 4 +++ mm/hugetlb_cma.c | 35 +++++++++---------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 4d0f545fb3ec..23be2f64439c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2100,6 +2100,10 @@ Kernel parameters Format: nn[KMGTPE] or (node format) :nn[KMGTPE][,:nn[KMGTPE]] + The size must be a multiple of the gigantic page size. + When using node format, this applies to each per-node size. + Misaligned values are dropped with a warning. + Reserve a CMA area of given size and allocate gigantic hugepages using the CMA allocator. If enabled, the boot-time allocation of gigantic hugepages is skipped. diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c index 7693ccefd0c6..39344d6c78d8 100644 --- a/mm/hugetlb_cma.c +++ b/mm/hugetlb_cma.c @@ -142,7 +142,7 @@ unsigned int __weak arch_hugetlb_cma_order(void) void __init hugetlb_cma_reserve(void) { - unsigned long size, reserved, per_node, order; + unsigned long size, reserved, per_node, order, gigantic_page_size; bool node_specific_cma_alloc = false; int nid; @@ -162,37 +162,36 @@ void __init hugetlb_cma_reserve(void) * breaking this assumption.
*/ VM_WARN_ON(order <= MAX_PAGE_ORDER); + gigantic_page_size = PAGE_SIZE << order; hugetlb_bootmem_set_nodes(); for (nid = 0; nid < MAX_NUMNODES; nid++) { - if (hugetlb_cma_size_in_node[nid] == 0) + size = hugetlb_cma_size_in_node[nid]; + if (size == 0) continue; if (!node_isset(nid, hugetlb_bootmem_nodes)) { pr_warn("hugetlb_cma: invalid node %d specified\n", nid); - hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; - hugetlb_cma_size_in_node[nid] = 0; + } else if (!IS_ALIGNED(size, gigantic_page_size)) { + pr_warn("hugetlb_cma: cma area of node %d must be a multiple of %lu MiB\n", + nid, gigantic_page_size / SZ_1M); + } else { + node_specific_cma_alloc = true; continue; } - if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) { - pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n", - nid, (PAGE_SIZE << order) / SZ_1M); - hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; - hugetlb_cma_size_in_node[nid] = 0; - } else { - node_specific_cma_alloc = true; - } + hugetlb_cma_size -= size; + hugetlb_cma_size_in_node[nid] = 0; } /* Validate the CMA size again in case some invalid nodes specified. 
*/ if (!hugetlb_cma_size) return; - if (hugetlb_cma_size < (PAGE_SIZE << order)) { - pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n", - (PAGE_SIZE << order) / SZ_1M); + if (!IS_ALIGNED(hugetlb_cma_size, gigantic_page_size)) { + pr_warn("hugetlb_cma: cma area must be a multiple of %lu MiB\n", + gigantic_page_size / SZ_1M); hugetlb_cma_size = 0; return; } @@ -204,7 +203,7 @@ void __init hugetlb_cma_reserve(void) */ per_node = DIV_ROUND_UP(hugetlb_cma_size, nodes_weight(hugetlb_bootmem_nodes)); - per_node = round_up(per_node, PAGE_SIZE << order); + per_node = round_up(per_node, gigantic_page_size); pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n", hugetlb_cma_size / SZ_1M, per_node / SZ_1M); } @@ -223,15 +222,13 @@ void __init hugetlb_cma_reserve(void) size = min(per_node, hugetlb_cma_size - reserved); } - size = round_up(size, PAGE_SIZE << order); - snprintf(name, sizeof(name), "hugetlb%d", nid); /* * Note that 'order per bit' is based on smallest size that * may be returned to CMA allocator in the case of * huge page demotion. */ - res = cma_declare_contiguous_multi(size, PAGE_SIZE << order, + res = cma_declare_contiguous_multi(size, gigantic_page_size, HUGETLB_PAGE_ORDER, name, &hugetlb_cma[nid], nid); if (res) { From 80eacd489a50ab2a560bc233b26b94ad9df68410 Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Wed, 13 May 2026 09:35:46 -0700 Subject: [PATCH 148/321] mm/mmu_notifier: fix a begin vs. start typo in the invalidate range comment Fix a goof in the block comment for invalidate_range_{start,end}() where start() is incorrectly referred to as begin(). No functional change intended. [seanjc@google.com: split to separate patch, write changelog] Link: https://lore.kernel.org/20260513163546.1176742-1-seanjc@google.com Signed-off-by: Takahiro Itazuri Signed-off-by: Sean Christopherson Reviewed-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes Reviewed-by: Mike Rapoport (Microsoft) Cc: Liam R. 
Howlett Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 69c304b467df..a11a44eef521 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -134,8 +134,8 @@ struct mmu_notifier_ops { * Invalidation of multiple concurrent ranges may be * optionally permitted by the driver. Either way the * establishment of sptes is forbidden in the range passed to - * invalidate_range_begin/end for the whole duration of the - * invalidate_range_begin/end critical section. + * invalidate_range_start/end for the whole duration of the + * invalidate_range_start/end critical section. * * invalidate_range_start() is called when all pages in the * range are still mapped and have at least a refcount of one. From de97ae6222c1326db5475467879887d0dd2c62a6 Mon Sep 17 00:00:00 2001 From: Frederick Mayle Date: Fri, 8 May 2026 11:12:31 -0700 Subject: [PATCH 149/321] mm/readahead: no PG_readahead on EOF When readahead pulls in all the remaining pages for a file, setting the readahead bit is counter productive. The async readahead it would trigger would almost certainly be a no-op. Additionally, for mmap'd file IO, the readahead bit limits the fault around [1], causing an extra minor fault when the page is accessed. This was discovered when looking at /sys/kernel/tracing/events/readahead traces for a simple program. With the patch applied, fewer page_cache_ra_unbounded calls are observed. [1] do_fault_around calls filemap_map_pages, which finds eligible pages by calling next_uptodate_folio [2]. next_uptodate_folio skips pages with PG_readahead set [3]. 
Link: https://github.com/torvalds/linux/blob/v7.0/mm/filemap.c#L3921-L3939 [2] Link: https://github.com/torvalds/linux/blob/v7.0/mm/filemap.c#L3721-L3722 [3] Link: https://lore.kernel.org/20260508181237.670645-1-fmayle@google.com Signed-off-by: Frederick Mayle Reviewed-by: Jan Kara Cc: Kalesh Singh Cc: Suren Baghdasaryan Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/readahead.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 42f2f20633b0..38ce16e3fcbd 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -340,8 +340,11 @@ static void do_page_cache_ra(struct readahead_control *ractl, if (index > end_index) return; /* Don't read past the page containing the last byte of the file */ - if (nr_to_read > end_index - index) + if (nr_to_read > end_index - index) { nr_to_read = end_index - index + 1; + /* We've reached the end, so don't set a readahead marker. */ + lookahead_size = 0; + } filemap_invalidate_lock_shared(mapping); page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size); @@ -485,7 +488,7 @@ void page_cache_ra_order(struct readahead_control *ractl, pgoff_t index = start; unsigned int min_order = mapping_min_folio_order(mapping); pgoff_t limit; - pgoff_t mark = index + ra->size - ra->async_size; + pgoff_t mark; unsigned int nofs; int err = 0; gfp_t gfp = readahead_gfp_mask(mapping); @@ -499,7 +502,13 @@ void page_cache_ra_order(struct readahead_control *ractl, limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; limit = min(limit, ractl->_max_index); - limit = min(limit, index + ra->size - 1); + if (limit > index + ra->size - 1) { + limit = index + ra->size - 1; + mark = index + ra->size - ra->async_size; + } else { + /* We've reached the end, so don't set a readahead marker. 
*/ + mark = ULONG_MAX; + } new_order = min(mapping_max_folio_order(mapping), new_order); new_order = min_t(unsigned int, new_order, ilog2(ra->size)); From 395085eacdfa37a64b37ae16a6dc467fb8670faf Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 13 May 2026 17:21:11 +0800 Subject: [PATCH 150/321] mm, swap: avoid leaving unused extend table after alloc race Allocating an extend table requires dropping the ci lock first. While the lock is dropped, a concurrent put can decrease the slot's swap count to a value that is no longer maxed out, so the extend table is no longer required. The current allocation path still attaches the new extend table to the cluster anyway, leaving it unused. The next maxed out count on the same cluster may still reuse the table, and frees it properly. But swapoff could leak it indeed. To eliminate the waste, re-check under the ci lock that the extend table is still needed before publishing it, and free the local allocation otherwise. Also close the check window by ensuring every count decrement that brings a slot below SWP_TB_COUNT_MAX - 1 runs swap_extend_table_try_free(), not just the MAX to MAX - 1 transition. With this, a freshly published extend table that becomes redundant due to a racing put is freed on the very next decrement, restoring the invariant that an empty cluster never has a non-NULL ci->extend_table. The added overhead is negligible.
[kasong@tencent.com: v2] Link: https://lore.kernel.org/20260515-swap-extend-table-fix-v2-1-833d72ad53e5@tencent.com Link: https://lore.kernel.org/20260513-swap-extend-table-fix-v1-1-a71dea851fb3@tencent.com Fixes: 0d6af9bcf383 ("mm, swap: use the swap table to track the swap count") Signed-off-by: Kairui Song Reported-by: Breno Leitao Closes: https://lore.kernel.org/linux-mm/agG6Dp0umhs6O1SY@gmail.com/ Tested-by: Breno Leitao Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Kemeng Shi Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/swapfile.c | 42 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 74a1e324449d..ee515a6fbccd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1443,8 +1443,10 @@ start_over: } static int swap_extend_table_alloc(struct swap_info_struct *si, - struct swap_cluster_info *ci, gfp_t gfp) + struct swap_cluster_info *ci, + unsigned int ci_off, gfp_t gfp) { + int count; void *table; table = kzalloc(sizeof(ci->extend_table[0]) * SWAPFILE_CLUSTER, gfp); @@ -1452,12 +1454,28 @@ static int swap_extend_table_alloc(struct swap_info_struct *si, return -ENOMEM; spin_lock(&ci->lock); - if (!ci->extend_table) - ci->extend_table = table; - else - kfree(table); + /* + * Extend table allocation requires releasing ci lock first so it's + * possible that the slot has been freed, no longer overflowed, or + * a concurrent extend table allocation has already succeeded, so + * the allocation is no longer needed. 
+ */ + if (!cluster_table_is_alloced(ci)) + goto out_free; + count = swp_tb_get_count(__swap_table_get(ci, ci_off)); + if (count < (SWP_TB_COUNT_MAX - 1)) + goto out_free; + if (ci->extend_table) + goto out_free; + + ci->extend_table = table; spin_unlock(&ci->lock); return 0; + +out_free: + spin_unlock(&ci->lock); + kfree(table); + return 0; } int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp) @@ -1472,7 +1490,7 @@ int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp) return 0; ci = __swap_offset_to_cluster(si, offset); - ret = swap_extend_table_alloc(si, ci, gfp); + ret = swap_extend_table_alloc(si, ci, swp_cluster_offset(entry), gfp); put_swap_device(si); return ret; @@ -1519,13 +1537,21 @@ static void __swap_cluster_put_entry(struct swap_cluster_info *ci, if (count == (SWP_TB_COUNT_MAX - 1)) { ci->extend_table[ci_off] = 0; __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count)); - swap_extend_table_try_free(ci); } else { ci->extend_table[ci_off] = count; } } else { __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, --count)); } + + /* + * `SWP_TB_COUNT_MAX - 1` triggers extend table allocation. If the + * count was above that, then the extend table is no longer needed, + * so free it. And if we just put the count value from MAX - 1, it's + * also possible that a pending dup just attached an extend table. 
+ */ + if (unlikely(count == SWP_TB_COUNT_MAX - 2 || count == SWP_TB_COUNT_MAX - 1)) + swap_extend_table_try_free(ci); } /** @@ -1665,7 +1691,7 @@ restart: if (unlikely(err)) { if (err == -ENOMEM) { spin_unlock(&ci->lock); - err = swap_extend_table_alloc(si, ci, GFP_ATOMIC); + err = swap_extend_table_alloc(si, ci, ci_off, GFP_ATOMIC); spin_lock(&ci->lock); if (!err) goto restart; From 59f19bf6f119eecfa16355186b593abba8eb5198 Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Wed, 13 May 2026 16:25:25 +0800 Subject: [PATCH 151/321] lib/test_hmm: use kvfree() to free kvcalloc() allocations Coccinelle scripts/coccinelle/api/kfree_mismatch.cocci reports the following warnings: lib/test_hmm.c:1256:15-16: WARNING kvmalloc is used to allocate this memory at line 1191 lib/test_hmm.c:1257:15-16: WARNING kvmalloc is used to allocate this memory at line 1196 Fix this by replacing kfree() with kvfree() to correctly handle the vmalloc() fallback path of kvcalloc(). Link: https://lore.kernel.org/20260513082525.154036-1-hao.ge@linux.dev Fixes: 775465fd26a3 ("lib/test_hmm: add zone device private THP test infrastructure") Signed-off-by: Hao Ge Acked-by: Balbir Singh Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Signed-off-by: Andrew Morton --- lib/test_hmm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 213504915737..38996c4baa40 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -1253,8 +1253,8 @@ out: mmap_read_unlock(mm); mmput(mm); free_mem: - kfree(src_pfns); - kfree(dst_pfns); + kvfree(src_pfns); + kvfree(dst_pfns); return ret; } From 0496a59745b0723ea74274db16fd5c8b1379b9a9 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 13 May 2026 11:14:16 +0300 Subject: [PATCH 152/321] userfaultfd: ensure mremap_userfaultfd_fail() releases mmap_changing Sashiko says: mremap_userfaultfd_prep() increments ctx->mmap_changing to stall concurrent operations, but mremap_userfaultfd_fail() does not decrement it before 
dropping the context reference. If an mremap operation fails, ctx->mmap_changing remains elevated. This will cause subsequent userfaultfd operations like a UFFDIO_COPY to fail with -EAGAIN. Decrement ctx->mmap_changing in mremap_userfaultfd_fail(). Link: https://sashiko.dev/#/patchset/20260430113512.115938-1-rppt@kernel.org Link: https://lore.kernel.org/20260513081416.495963-1-rppt@kernel.org Fixes: df2cc96e7701 ("userfaultfd: prevent non-cooperative events vs mcopy_atomic races") Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: David Hildenbrand (Arm) Cc: Al Viro Cc: Christian Brauner Cc: Jan Kara Cc: Peter Xu Cc: Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 4b53dc4a3266..390e4b7d9cb9 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -786,6 +786,8 @@ void mremap_userfaultfd_fail(struct vm_userfaultfd_ctx *vm_ctx) if (!ctx) return; + atomic_dec(&ctx->mmap_changing); + VM_WARN_ON_ONCE(atomic_read(&ctx->mmap_changing) < 0); userfaultfd_ctx_put(ctx); } From 12ccf2bef35c4f42f7bf433f7ab699ec103e7f53 Mon Sep 17 00:00:00 2001 From: wangxuewen <18810879172@163.com> Date: Wed, 13 May 2026 15:52:14 +0800 Subject: [PATCH 153/321] mm/shrinker: simplify shrinker_memcg_alloc() using guard() Use guard(mutex) to automatically handle shrinker_mutex locking and unlocking in shrinker_memcg_alloc(). This removes the explicit mutex_unlock() call, the goto-based error path, and the redundant ret variable, resulting in cleaner and more concise code.
Link: https://lore.kernel.org/20260513075214.2655710-1-18810879172@163.com Signed-off-by: wangxuewen Acked-by: Muchun Song Cc: Dave Chinner Cc: Roman Gushchin Cc: Xuewen Wang Signed-off-by: Andrew Morton --- mm/shrinker.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/mm/shrinker.c b/mm/shrinker.c index 49256f81199f..7082d01c8c9d 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -216,29 +216,26 @@ static DEFINE_IDR(shrinker_idr); static int shrinker_memcg_alloc(struct shrinker *shrinker) { - int id, ret = -ENOMEM; + int id; if (mem_cgroup_disabled()) return -ENOSYS; if (mem_cgroup_kmem_disabled() && !(shrinker->flags & SHRINKER_NONSLAB)) return -ENOSYS; - mutex_lock(&shrinker_mutex); + guard(mutex)(&shrinker_mutex); id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) - goto unlock; + return id; if (id >= shrinker_nr_max) { if (expand_shrinker_info(id)) { idr_remove(&shrinker_idr, id); - goto unlock; + return -ENOMEM; } } shrinker->id = id; - ret = 0; -unlock: - mutex_unlock(&shrinker_mutex); - return ret; + return 0; } static void shrinker_memcg_remove(struct shrinker *shrinker) From 96f9fb92126a4fb5b24a54964eaef8f82cc2ab7f Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Wed, 13 May 2026 10:21:16 +0800 Subject: [PATCH 154/321] tools/mm/page-types: fix typo in madvise() error message Patch series "tools/mm/page-types: Fix misc bugs". This series fixes three issues in tools/mm/page-types.c: 1. Fix two typos in madvise() error messages ("madvice" -> "madvise") 2. Fix operator precedence bug in the sigbus handler where the ternary operator binds looser than addition, producing incorrect offset calculation when sigbus_addr is non-NULL 3. Fix --kpageflags option declaration in getopt_long: has_arg should be 1 (required_argument) since the option requires a file path This patch (of 3): Two error messages incorrectly spelled the madvise() function name as "madvice". Fix the typo in both occurrences. 
Link: https://lore.kernel.org/20260513022120.58033-1-ye.liu@linux.dev Link: https://lore.kernel.org/20260513022120.58033-2-ye.liu@linux.dev Signed-off-by: Ye Liu Acked-by: David Hildenbrand (Arm) Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- tools/mm/page-types.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/mm/page-types.c b/tools/mm/page-types.c index d7e5e8902af8..6594245217a8 100644 --- a/tools/mm/page-types.c +++ b/tools/mm/page-types.c @@ -997,7 +997,7 @@ static void walk_file_range(const char *name, int fd, /* turn off readahead */ if (madvise(ptr, len, MADV_RANDOM)) - fatal("madvice failed: %s", name); + fatal("madvise failed: %s", name); if (sigsetjmp(sigbus_jmp, 1)) { end = off + sigbus_addr ? sigbus_addr - ptr : 0; @@ -1015,7 +1015,7 @@ got_sigbus: /* turn off harvesting reference bits */ if (madvise(ptr, len, MADV_SEQUENTIAL)) - fatal("madvice failed: %s", name); + fatal("madvise failed: %s", name); if (pagemap_read(buf, (unsigned long)ptr / page_size, nr_pages) != nr_pages) From e696ff06db374c1adb877d20e56085abe1d109a3 Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Wed, 13 May 2026 10:21:17 +0800 Subject: [PATCH 155/321] tools/mm/page-types: fix ternary operator precedence in sigbus handler The ternary operator (?:) has lower precedence than addition (+), so the expression `off + sigbus_addr ? sigbus_addr - ptr : 0` was parsed as `(off + sigbus_addr) ? (sigbus_addr - ptr) : 0` rather than the intended `off + (sigbus_addr ? sigbus_addr - ptr : 0)`. Add explicit parentheses to ensure the correct evaluation order. 
Link: https://lore.kernel.org/20260513022120.58033-3-ye.liu@linux.dev Signed-off-by: Ye Liu Acked-by: SeongJae Park Cc: David Hildenbrand (Arm) Signed-off-by: Andrew Morton --- tools/mm/page-types.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/mm/page-types.c b/tools/mm/page-types.c index 6594245217a8..66f429f2b698 100644 --- a/tools/mm/page-types.c +++ b/tools/mm/page-types.c @@ -1000,7 +1000,7 @@ static void walk_file_range(const char *name, int fd, fatal("madvise failed: %s", name); if (sigsetjmp(sigbus_jmp, 1)) { - end = off + sigbus_addr ? sigbus_addr - ptr : 0; + end = off + (sigbus_addr ? sigbus_addr - ptr : 0); fprintf(stderr, "got sigbus at offset %lld: %s\n", (long long)end, name); goto got_sigbus; From 3fb355431eb864a95be3b832605d0575f43d6971 Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Wed, 13 May 2026 10:21:18 +0800 Subject: [PATCH 156/321] tools/mm/page-types: fix kpageflags option argument in getopt_long The --kpageflags option requires an argument to specify the kpageflags file path, but has_arg was set to 0 (no_argument) in the long options table. Change it to 1 (required_argument) so getopt_long correctly parses the argument. 
Link: https://lore.kernel.org/20260513022120.58033-4-ye.liu@linux.dev Signed-off-by: Ye Liu Acked-by: David Hildenbrand (Arm) Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- tools/mm/page-types.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/mm/page-types.c b/tools/mm/page-types.c index 66f429f2b698..7fc5a8be5997 100644 --- a/tools/mm/page-types.c +++ b/tools/mm/page-types.c @@ -1261,7 +1261,7 @@ static const struct option opts[] = { { "no-summary", 0, NULL, 'N' }, { "hwpoison" , 0, NULL, 'X' }, { "unpoison" , 0, NULL, 'x' }, - { "kpageflags", 0, NULL, 'F' }, + { "kpageflags", 1, NULL, 'F' }, { "help" , 0, NULL, 'h' }, { NULL , 0, NULL, 0 } }; From 88e09fffeef5825931e6374b9e88d4b1a1d5f6f8 Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Tue, 12 May 2026 16:45:59 -0400 Subject: [PATCH 157/321] mm/filemap: fix page_cache_prev_miss() when no hole is found page_cache_prev_miss() is documented to return a value outside the searched range when no gap is found. However, the no-gap-found path returns xas.xa_index, which after a successful loop is the first index in the range. As such, that index is misreported as a gap. The sole caller, page_cache_sync_ra(), uses the return value to estimate the cached run preceding a sequential read. In some cases, the buggy return value can undercount the contiguous range by one, shrinking the readahead window or pushing borderline requests into the small-random-read branch. Fix this by returning the start of the range - 1 when no hole is found. Update page_cache_next_miss() for clarity as well. Both helpers were previously fixed together in commit 9425c591e06a ("page cache: fix page_cache_next/prev_miss off by one"), but the fix was reverted because it caused a hugetlb performance regression. 
hugetlb no longer uses these functions and next_miss was subsequently refixed in commit 901a269ff3d5 ("filemap: fix page_cache_next_miss() when no hole found") and commit bbcaee20e03e ("readahead: fix return value of page_cache_next_miss() when no hole is found"), but prev_miss was not addressed. This was found by pointing Claude Opus 4.7 at mm/filemap.c. Link: https://lore.kernel.org/20260512-prev_miss_fix-v2-1-4af8e5c1ae62@columbia.edu Fixes: 0d3f92966629 ("page cache: Convert hole search to XArray") Assisted-by: Claude:claude-opus-4-7 Signed-off-by: Tal Zussman Reviewed-by: Jan Kara Reviewed-by: Vishal Moola Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index ab34cab2416a..4263d9775998 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1808,9 +1808,8 @@ pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { XA_STATE(xas, &mapping->i_pages, index); - unsigned long nr = max_scan; - while (nr--) { + while (max_scan--) { void *entry = xas_next(&xas); if (!entry || xa_is_value(entry)) return xas.xa_index; @@ -1818,7 +1817,8 @@ pgoff_t page_cache_next_miss(struct address_space *mapping, return 0; } - return index + max_scan; + /* Return end of the range + 1 when no hole is found */ + return xas.xa_index + 1; } EXPORT_SYMBOL(page_cache_next_miss); @@ -1849,12 +1849,13 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, while (max_scan--) { void *entry = xas_prev(&xas); if (!entry || xa_is_value(entry)) - break; + return xas.xa_index; if (xas.xa_index == ULONG_MAX) - break; + return ULONG_MAX; } - return xas.xa_index; + /* Return start of the range - 1 when no hole is found */ + return xas.xa_index - 1; } EXPORT_SYMBOL(page_cache_prev_miss); From 9c860d1d5d69f9cb19eb7c36573ee14065a9c85a Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Wed, 13 May 2026 12:35:13 +0000 Subject: 
[PATCH 158/321] mm: introduce for_each_free_list() Patch series "mm: misc cleanups from __GFP_UNMAPPED series". In v2 of the __GFP_UNMAPPED series [0], we realised that some of the patches could potentially be merged as independent cleanups. These are all independent of one another, if you think some are useful cleanups and others are pointless churn, it should be fine to just pick whatever subset you prefer. No functional change intended. This patch (of 4): There are a couple of places that iterate over the freelists with awareness of the data structures' layout. It seems ideally, code outside of mm should not be aware of the page allocator's freelists at all. But, this patch just doesn't hide them completely, it's just a meek incremental step in that direction: provide a macro to iterate over it without needing to be aware of the actual struct fields. Link: https://lore.kernel.org/20260513-page_alloc-unmapped-prep-v1-0-dacdf5402be8@google.com Link: https://lore.kernel.org/20260513-page_alloc-unmapped-prep-v1-1-dacdf5402be8@google.com Link: https://lore.kernel.org/all/20260320-page_alloc-unmapped-v2-0-28bf1bd54f41@google.com/ [0] Signed-off-by: Brendan Jackman Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka (SUSE) Cc: Axel Rasmussen Cc: Barry Song Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kairui Song Cc: Len Brown Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: "Rafael J. 
Wysocki" Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Wei Xu Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 9 ++++++--- kernel/power/snapshot.c | 8 ++++---- mm/mm_init.c | 11 +++++++---- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9adb2ad21da5..1331a7b93f33 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -177,9 +177,12 @@ static inline bool migratetype_is_mergeable(int mt) return mt < MIGRATE_PCPTYPES; } -#define for_each_migratetype_order(order, type) \ - for (order = 0; order < NR_PAGE_ORDERS; order++) \ - for (type = 0; type < MIGRATE_TYPES; type++) +#define for_each_free_list(list, zone, order) \ + for (order = 0; order < NR_PAGE_ORDERS; order++) \ + for (unsigned int __type = 0; \ + __type < MIGRATE_TYPES && \ + (list = &(zone)->free_area[order].free_list[__type], 1); \ + __type++) extern int page_group_by_mobility_disabled; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index a564650734dc..d933b5b2c05d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1244,8 +1244,9 @@ unsigned int snapshot_additional_pages(struct zone *zone) static void mark_free_pages(struct zone *zone) { unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT; + struct list_head *free_list; unsigned long flags; - unsigned int order, t; + unsigned int order; struct page *page; if (zone_is_empty(zone)) @@ -1269,9 +1270,8 @@ static void mark_free_pages(struct zone *zone) swsusp_unset_page_free(page); } - for_each_migratetype_order(order, t) { - list_for_each_entry(page, - &zone->free_area[order].free_list[t], buddy_list) { + for_each_free_list(free_list, zone, order) { + list_for_each_entry(page, free_list, buddy_list) { unsigned long i; pfn = page_to_pfn(page); diff --git a/mm/mm_init.c b/mm/mm_init.c index bd466a3c10c8..db5568cf36e1 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1429,11 +1429,14 @@ static void __meminit 
zone_init_internals(struct zone *zone, enum zone_type idx, static void __meminit zone_init_free_lists(struct zone *zone) { - unsigned int order, t; - for_each_migratetype_order(order, t) { - INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); + struct list_head *list; + unsigned int order; + + for_each_free_list(list, zone, order) + INIT_LIST_HEAD(list); + + for (order = 0; order < NR_PAGE_ORDERS; order++) zone->free_area[order].nr_free = 0; - } #ifdef CONFIG_UNACCEPTED_MEMORY INIT_LIST_HEAD(&zone->unaccepted_pages); From 23378be820a3f094607f0dca16032ba6c48a8577 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Wed, 13 May 2026 12:35:14 +0000 Subject: [PATCH 159/321] mm/page_alloc: don't overload migratetype in find_suitable_fallback() This function currently returns a signed integer that encodes status in-band, as negative numbers, along with a migratetype. Switch to a more explicit/verbose style that encodes the status and migratetype separately. In the spirit of making things more explicit, also create an enum to avoid using magic integer literals with special meanings. This enables documenting the values at their definition instead of in one of the callers. Link: https://lore.kernel.org/20260513-page_alloc-unmapped-prep-v1-2-dacdf5402be8@google.com Signed-off-by: Brendan Jackman Reviewed-by: Vlastimil Babka (SUSE) Cc: Axel Rasmussen Cc: Barry Song Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kairui Song Cc: Len Brown Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: "Rafael J. 
Wysocki" Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Wei Xu Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/compaction.c | 3 ++- mm/internal.h | 14 +++++++++++--- mm/page_alloc.c | 40 +++++++++++++++++++++++----------------- 3 files changed, 36 insertions(+), 21 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 3648ce22c807..168e63940b78 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2340,7 +2340,8 @@ static enum compact_result __compact_finished(struct compact_control *cc) * Job done if allocation would steal freepages from * other migratetype buddy lists. */ - if (find_suitable_fallback(area, order, migratetype, true) >= 0) + if (find_suitable_fallback(area, order, migratetype, true, NULL) + == FALLBACK_FOUND) /* * Movable pages are OK in any pageblock. If we are * stealing for a non-movable allocation, make sure diff --git a/mm/internal.h b/mm/internal.h index 5a2ddcf68e0b..09931b1e535f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1104,9 +1104,17 @@ static inline void init_cma_pageblock(struct page *page) } #endif - -int find_suitable_fallback(struct free_area *area, unsigned int order, - int migratetype, bool claimable); +enum fallback_result { + /* Found suitable migratetype, *mt_out is valid. */ + FALLBACK_FOUND, + /* No fallback found in requested order. */ + FALLBACK_EMPTY, + /* Passed @claimable, but claiming whole block is a bad idea. */ + FALLBACK_NOCLAIM, +}; +enum fallback_result +find_suitable_fallback(struct free_area *area, unsigned int order, + int migratetype, bool claimable, int *mt_out); static inline bool free_area_empty(struct free_area *area, int migratetype) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 69a99af77777..3e4c4af06f37 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2259,25 +2259,29 @@ static bool should_try_claim_block(unsigned int order, int start_mt) * we would do this whole-block claiming. 
This would help to reduce * fragmentation due to mixed migratetype pages in one pageblock. */ -int find_suitable_fallback(struct free_area *area, unsigned int order, - int migratetype, bool claimable) +enum fallback_result +find_suitable_fallback(struct free_area *area, unsigned int order, + int migratetype, bool claimable, int *mt_out) { int i; if (claimable && !should_try_claim_block(order, migratetype)) - return -2; + return FALLBACK_NOCLAIM; if (area->nr_free == 0) - return -1; + return FALLBACK_EMPTY; for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) { int fallback_mt = fallbacks[migratetype][i]; - if (!free_area_empty(area, fallback_mt)) - return fallback_mt; + if (!free_area_empty(area, fallback_mt)) { + if (mt_out) + *mt_out = fallback_mt; + return FALLBACK_FOUND; + } } - return -1; + return FALLBACK_EMPTY; } /* @@ -2387,16 +2391,16 @@ __rmqueue_claim(struct zone *zone, int order, int start_migratetype, */ for (current_order = MAX_PAGE_ORDER; current_order >= min_order; --current_order) { - area = &(zone->free_area[current_order]); - fallback_mt = find_suitable_fallback(area, current_order, - start_migratetype, true); + enum fallback_result result; - /* No block in that order */ - if (fallback_mt == -1) + area = &(zone->free_area[current_order]); + result = find_suitable_fallback(area, current_order, + start_migratetype, true, &fallback_mt); + + if (result == FALLBACK_EMPTY) continue; - /* Advanced into orders too low to claim, abort */ - if (fallback_mt == -2) + if (result == FALLBACK_NOCLAIM) break; page = get_page_from_free_area(area, fallback_mt); @@ -2426,10 +2430,12 @@ __rmqueue_steal(struct zone *zone, int order, int start_migratetype) int fallback_mt; for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { + enum fallback_result result; + area = &(zone->free_area[current_order]); - fallback_mt = find_suitable_fallback(area, current_order, - start_migratetype, false); - if (fallback_mt == -1) + result = find_suitable_fallback(area, 
current_order, start_migratetype, + false, &fallback_mt); + if (result == FALLBACK_EMPTY) continue; page = get_page_from_free_area(area, fallback_mt); From 3687c0fd67249cb971990b382a47f02f19ed9f67 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Wed, 13 May 2026 12:35:15 +0000 Subject: [PATCH 160/321] mm: rejig pageblock mask definitions - Add a PAGEBLOCK_ prefix to the names to avoid polluting the "global namespace" too much. - This new prefix makes MIGRATETYPE_AND_ISO_MASK look pretty long. Well, that global mask only exists for quite a specific purpose, and is quite a weird thing to have a name for anyway. So drop it and take advantage of the newly-defined PAGEBLOCK_ISO_MASK. Link: https://lore.kernel.org/20260513-page_alloc-unmapped-prep-v1-3-dacdf5402be8@google.com Signed-off-by: Brendan Jackman Reviewed-by: Vlastimil Babka (SUSE) Cc: Axel Rasmussen Cc: Barry Song Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kairui Song Cc: Len Brown Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: "Rafael J. 
Wysocki" Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Wei Xu Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pageblock-flags.h | 6 +++--- mm/page_alloc.c | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index e046278a01fa..9a6c3ea17684 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -36,12 +36,12 @@ enum pageblock_bits { #define NR_PAGEBLOCK_BITS (roundup_pow_of_two(__NR_PAGEBLOCK_BITS)) -#define MIGRATETYPE_MASK (BIT(PB_migrate_0)|BIT(PB_migrate_1)|BIT(PB_migrate_2)) +#define PAGEBLOCK_MIGRATETYPE_MASK (BIT(PB_migrate_0)|BIT(PB_migrate_1)|BIT(PB_migrate_2)) #ifdef CONFIG_MEMORY_ISOLATION -#define MIGRATETYPE_AND_ISO_MASK (MIGRATETYPE_MASK | BIT(PB_migrate_isolate)) +#define PAGEBLOCK_ISO_MASK BIT(PB_migrate_isolate) #else -#define MIGRATETYPE_AND_ISO_MASK MIGRATETYPE_MASK +#define PAGEBLOCK_ISO_MASK 0 #endif #if defined(CONFIG_HUGETLB_PAGE) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3e4c4af06f37..0278d642445a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -364,7 +364,7 @@ get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn, #else BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); #endif - BUILD_BUG_ON(__MIGRATE_TYPE_END > MIGRATETYPE_MASK); + BUILD_BUG_ON(__MIGRATE_TYPE_END > PAGEBLOCK_MIGRATETYPE_MASK); VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); bitmap = get_pageblock_bitmap(page, pfn); @@ -437,7 +437,7 @@ bool get_pfnblock_bit(const struct page *page, unsigned long pfn, __always_inline enum migratetype get_pfnblock_migratetype(const struct page *page, unsigned long pfn) { - unsigned long mask = MIGRATETYPE_AND_ISO_MASK; + unsigned long mask = PAGEBLOCK_MIGRATETYPE_MASK | PAGEBLOCK_ISO_MASK; unsigned long flags; flags = __get_pfnblock_flags_mask(page, pfn, mask); @@ -446,7 +446,7 @@ get_pfnblock_migratetype(const struct page *page, unsigned long pfn) if 
(flags & BIT(PB_migrate_isolate)) return MIGRATE_ISOLATE; #endif - return flags & MIGRATETYPE_MASK; + return flags & PAGEBLOCK_MIGRATETYPE_MASK; } /** @@ -534,11 +534,11 @@ static void set_pageblock_migratetype(struct page *page, } VM_WARN_ONCE(get_pageblock_isolate(page), "Use clear_pageblock_isolate() to unisolate pageblock"); - /* MIGRATETYPE_AND_ISO_MASK clears PB_migrate_isolate if it is set */ + /* PAGEBLOCK_ISO_MASK clears PB_migrate_isolate if it is set */ #endif __set_pfnblock_flags_mask(page, page_to_pfn(page), (unsigned long)migratetype, - MIGRATETYPE_AND_ISO_MASK); + PAGEBLOCK_MIGRATETYPE_MASK | PAGEBLOCK_ISO_MASK); } void __meminit init_pageblock_migratetype(struct page *page, @@ -564,7 +564,7 @@ void __meminit init_pageblock_migratetype(struct page *page, flags |= BIT(PB_migrate_isolate); #endif __set_pfnblock_flags_mask(page, page_to_pfn(page), flags, - MIGRATETYPE_AND_ISO_MASK); + PAGEBLOCK_MIGRATETYPE_MASK | PAGEBLOCK_ISO_MASK); } #ifdef CONFIG_DEBUG_VM @@ -2140,15 +2140,15 @@ static bool __move_freepages_block_isolate(struct zone *zone, } move: - /* Use MIGRATETYPE_MASK to get non-isolate migratetype */ + /* Use PAGEBLOCK_MIGRATETYPE_MASK to get non-isolate migratetype */ if (isolate) { from_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page), - MIGRATETYPE_MASK); + PAGEBLOCK_MIGRATETYPE_MASK); to_mt = MIGRATE_ISOLATE; } else { from_mt = MIGRATE_ISOLATE; to_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page), - MIGRATETYPE_MASK); + PAGEBLOCK_MIGRATETYPE_MASK); } __move_freepages_block(zone, start_pfn, from_mt, to_mt); From 248b144a8a6dc534d8bc1c1470efe571de5b7ae6 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Wed, 13 May 2026 12:35:16 +0000 Subject: [PATCH 161/321] mm/page_alloc: remove ifdefs from pindex helpers The ifdefs are not technically needed here, everything used here is always defined. Switching to IS_ENABLED() makes the code a bit less tiresome to read. 
Link: https://lore.kernel.org/20260513-page_alloc-unmapped-prep-v1-4-dacdf5402be8@google.com Signed-off-by: Brendan Jackman Reviewed-by: Vlastimil Babka (SUSE) Cc: Axel Rasmussen Cc: Barry Song Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kairui Song Cc: Len Brown Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: "Rafael J. Wysocki" Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Wei Xu Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0278d642445a..dc09a2520313 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -650,19 +650,17 @@ out: static inline unsigned int order_to_pindex(int migratetype, int order) { + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + bool movable = migratetype == MIGRATE_MOVABLE; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - bool movable; - if (order > PAGE_ALLOC_COSTLY_ORDER) { - VM_BUG_ON(!is_pmd_order(order)); + if (order > PAGE_ALLOC_COSTLY_ORDER) { + VM_BUG_ON(!is_pmd_order(order)); - movable = migratetype == MIGRATE_MOVABLE; - - return NR_LOWORDER_PCP_LISTS + movable; + return NR_LOWORDER_PCP_LISTS + movable; + } + } else { + VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); } -#else - VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); -#endif return (MIGRATE_PCPTYPES * order) + migratetype; } @@ -671,12 +669,12 @@ static inline int pindex_to_order(unsigned int pindex) { int order = pindex / MIGRATE_PCPTYPES; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pindex >= NR_LOWORDER_PCP_LISTS) - order = HPAGE_PMD_ORDER; -#else - VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); -#endif + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + if (pindex >= NR_LOWORDER_PCP_LISTS) + order = HPAGE_PMD_ORDER; + } else { + VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); + } return order; } From d231522bf07287c5bcf7c6af6960f476663324b5 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: 
Sun, 17 May 2026 23:37:05 +0000 Subject: [PATCH 162/321] mm/page_alloc: drop a misleading __always_inline get_pfnblock_migratetype() is called from outside page_alloc.c, so it cannot always be inlined. Remove the annotation to avoid misleading readers. At least in my minimal config, with GCC, this doesn't change mm/page_alloc.o at all. Link: https://lore.kernel.org/all/20260517-b4-drop-always-inline-v1-1-97b90930e8b8@google.com/ Signed-off-by: Brendan Jackman Suggested-by: Vlastimil Babka Link: https://lore.kernel.org/all/016c8bef-57ef-44ef-bf60-86dbfd368dcd@kernel.org/ Acked-by: Johannes Weiner Reviewed-by: SeongJae Park Reviewed-by: Vishal Moola Reviewed-by: Vlastimil Babka (SUSE) Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dc09a2520313..d7b7f9504bd8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -434,7 +434,7 @@ bool get_pfnblock_bit(const struct page *page, unsigned long pfn, * Use get_pfnblock_migratetype() if caller already has both @page and @pfn * to save a call to page_to_pfn(). */ -__always_inline enum migratetype +enum migratetype get_pfnblock_migratetype(const struct page *page, unsigned long pfn) { unsigned long mask = PAGEBLOCK_MIGRATETYPE_MASK | PAGEBLOCK_ISO_MASK; From 47166f2199557e57cbab2882b033fb2949818fbb Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Tue, 19 May 2026 14:17:58 +0000 Subject: [PATCH 163/321] mm/page_alloc: document that alloc_pages_nolock() uses RCU The allocator interacts with cgroups which rely on RCU. RCU does not work everywhere, so the "any context" claim is slightly overstated here. This should already be enforced by objtool, since this function is not marked noinstr the x86 build should fail if you call it from a place where RCU is not watching. 
But, expecting readers to make that connection for themselves seems a bit cruel (I don't think there is even any documentation of what noinstr means at all, let alone the connection with RCU). Note this is not claiming that any cgroup code called from the allocator would actually break if this restriction was violated, it could very well be that there's no real way for the allocator to act on a cgroup that can disappear concurrently. But, since it's likely nobody has verified this one way or another, better to just be safe and declare that RCU is required. Allocating from an RCU-unsafe context seems a bit crazy anyway. Link: https://lore.kernel.org/20260519-nolock-rcu-comment-v1-1-4a630c8794e5@google.com Signed-off-by: Brendan Jackman Suggested-by: Junaid Shahid Acked-by: Harry Yoo (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexei Starovoitov Cc: Johannes Weiner Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d7b7f9504bd8..0ebffb0bb98b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7937,8 +7937,8 @@ struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned * @order: allocation order size * * Allocates pages of a given order from the given node. This is safe to - * call from any context (from atomic, NMI, and also reentrant - * allocator -> tracepoint -> alloc_pages_nolock_noprof). + * call from any context where RCU is watching (from atomic, NMI, and also + * reentrant allocator -> tracepoint -> alloc_pages_nolock_noprof). * Allocation is best effort and to be expected to fail easily so nobody should * rely on the success. Failures are not reported via warn_alloc(). * See always fail conditions below. 
From 63b02a9409cb5180398491b093e48bcb5315f5fb Mon Sep 17 00:00:00 2001 From: "Jose Fernandez (Anthropic)" Date: Mon, 4 May 2026 12:55:17 +0000 Subject: [PATCH 164/321] mm: swap_cgroup: fix NULL deref in lookup_swap_cgroup_id on swapless host lookup_swap_cgroup_id() passes swap_cgroup_ctrl[type].map to __swap_cgroup_id_lookup() without checking that the type was ever registered via swap_cgroup_swapon(). On a swapless host every ctrl->map is NULL, so __swap_cgroup_id_lookup() dereferences NULL + a scaled swp_offset(). Since commit bea67dcc5eea ("mm: attempt to batch free swap entries for zap_pte_range()"), zap_pte_range() -> swap_pte_batch() calls lookup_swap_cgroup_id() on any non-present, non-none PTE that decodes as a real swap entry, without first validating it against swap_info[]. A single PTE corrupted into a type-0 swap entry takes the host down at process exit. We hit this in production on a swapless 6.12.58 host: ~1s of "get_swap_device: Bad swap file entry 3f800204222bb" (do_swap_page() being correctly defensive about the same entry) followed by BUG: unable to handle page fault for address: 000003f800204220 RIP: 0010:lookup_swap_cgroup_id+0x2b/0x60 Call Trace: swap_pte_batch+0xbf/0x230 zap_pte_range+0x4c8/0x780 unmap_page_range+0x190/0x3e0 exit_mmap+0xd9/0x3c0 do_exit+0x20c/0x4b0 syzbot has reported the identical stack. The source of the PTE corruption is a separate bug; this change makes the teardown path as robust as the fault path already is. Every other caller of lookup_swap_cgroup_id() is downstream of a get_swap_device() that has already validated the entry, so the new branch is cold. 
Link: https://lore.kernel.org/20260504-swap-cgroup-fix-7-0-v1-1-f53ff41ee553@linux.dev Fixes: bea67dcc5eea ("mm: attempt to batch free swap entries for zap_pte_range()") Signed-off-by: Jose Fernandez (Anthropic) Reported-by: syzbot+e12bd9ca48157add237a@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/69859728.050a0220.3b3015.0033.GAE@google.com Assisted-by: Claude:unspecified Cc: Barry Song Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kairui Song Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Signed-off-by: Andrew Morton --- mm/swap_cgroup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index de779fed8c21..95c38e54dd58 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -124,6 +124,8 @@ unsigned short lookup_swap_cgroup_id(swp_entry_t ent) return 0; ctrl = &swap_cgroup_ctrl[swp_type(ent)]; + if (unlikely(!ctrl->map)) + return 0; return __swap_cgroup_id_lookup(ctrl->map, swp_offset(ent)); } From a2e61ffb47493ff009b24105792318b3b62e18e2 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 17 May 2026 23:39:40 +0800 Subject: [PATCH 165/321] mm, swap: simplify swap cache allocation helper Patch series "mm, swap: swap table phase IV: unify allocation", v5. This series unifies the allocation and charging of anon and shmem swap in folios, provides better synchronization, consolidates the metadata management, hence dropping the static array and map, and improves the performance. The static metadata overhead is now close to zero, and workload performance is slightly improved. For example, mounting a 1TB swap device saves about 512MB of memory: Before: free -m total used free shared buff/cache available Mem: 1464 805 346 1 382 658 Swap: 1048575 0 1048575 After: free -m total used free shared buff/cache available Mem: 1464 277 899 1 356 1187 Swap: 1048575 0 1048575 Memory usage is ~512M lower, and we now have a close to 0 static overhead. 
It was about 2 bytes per slot before, now roughly 0.09375 bytes per slot (48 bytes ci info per cluster, which is 512 slots). Performance tests are also looking good. Testing Redis in a 2G VM using 6G ZRAM as swap: valkey-server --maxmemory 2560M redis-benchmark -r 3000000 -n 3000000 -d 1024 -c 12 -P 32 -t get Before: 3385017.283654 RPS After: 3433309.307292 RPS (1.42% better) Testing a kernel build under global pressure on a 48c96t system, limiting the total memory to 8G, using 12G ZRAM, 24 test runs, enabling THP: make -j96, using defconfig Before: user time 2904.59s system time 4773.99s After: user time 2909.38s system time 4641.55s (2.77% better) Testing with usemem on a 32c machine using 48G brd ramdisk and 16G RAM, 12 test runs: usemem --init-time -O -y -x -n 48 1G Before: Throughput (Sum): 6482.58 MB/s Free Latency: 371371.67us After: Throughput (Sum): 6539.28 MB/s Free Latency: 363059.88us Seems similar, or slightly better. This series also reduces memory thrashing: I no longer see any "Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF" messages, which were shown several times during stress testing before this series when under great pressure: Before: grep -Ri VM_FAULT_OOM | wc -l => 18 After: grep -Ri VM_FAULT_OOM | wc -l => 0 This patch (of 12): Instead of trying to return the existing folio if the entry is already cached in swap_cache_alloc_folio, simply return an error pointer if the allocation failed, and drop the output argument that indicates what kind of folio is actually returned. Also add a proper wrapper, swap_cache_read_folio, that decouples and handles the actual requirement - read in the folio, or return the already read folio in cache. This is what async swapin and readahead actually require. As for zswap swap out, the caller just needs to abort if the allocation fails because the entry is gone or already cached, so removing the output argument simplifies the interface, making it cleaner. No feature change.
Link: https://lore.kernel.org/20260517-swap-table-p4-v5-0-88ae43e064c7@tencent.com Link: https://lore.kernel.org/20260517-swap-table-p4-v5-1-88ae43e064c7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chengming Zhou Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Shakeel Butt Cc: Zi Yan Cc: Youngjun Park Signed-off-by: Andrew Morton --- mm/swap.h | 3 +- mm/swap_state.c | 180 +++++++++++++++++++++++++----------------------- mm/zswap.c | 23 +++---- 3 files changed, 103 insertions(+), 103 deletions(-) diff --git a/mm/swap.h b/mm/swap.h index a77016f2423b..ad8b17a93758 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -281,8 +281,7 @@ struct folio *swap_cache_get_folio(swp_entry_t entry); void *swap_cache_get_shadow(swp_entry_t entry); void swap_cache_del_folio(struct folio *folio); struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_flags, - struct mempolicy *mpol, pgoff_t ilx, - bool *alloced); + struct mempolicy *mpol, pgoff_t ilx); /* Below helpers require the caller to lock and pass in the swap cluster. */ void __swap_cache_add_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry); diff --git a/mm/swap_state.c b/mm/swap_state.c index 1415a5c54a43..3bba82f6dc79 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -459,54 +459,38 @@ void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, * All swap slots covered by the folio must have a non-zero swap count. * * Context: Caller must protect the swap device with reference count or locks. - * Return: Returns the folio being added on success. Returns the existing folio - * if @entry is already cached. Returns NULL if raced with swapin or swapoff. + * Return: 0 if success, error code if failed. 
*/ -static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry, - struct folio *folio, - gfp_t gfp, bool charged) +static int __swap_cache_prepare_and_add(swp_entry_t entry, + struct folio *folio, + gfp_t gfp, bool charged) { - struct folio *swapcache = NULL; void *shadow; int ret; __folio_set_locked(folio); __folio_set_swapbacked(folio); - if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) + if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) { + ret = -ENOMEM; goto failed; - - for (;;) { - ret = swap_cache_add_folio(folio, entry, &shadow); - if (!ret) - break; - - /* - * Large order allocation needs special handling on - * race: if a smaller folio exists in cache, swapin needs - * to fallback to order 0, and doing a swap cache lookup - * might return a folio that is irrelevant to the faulting - * entry because @entry is aligned down. Just return NULL. - */ - if (ret != -EEXIST || folio_test_large(folio)) - goto failed; - - swapcache = swap_cache_get_folio(entry); - if (swapcache) - goto failed; } + ret = swap_cache_add_folio(folio, entry, &shadow); + if (ret) + goto failed; + memcg1_swapin(entry, folio_nr_pages(folio)); if (shadow) workingset_refault(folio, shadow); /* Caller will initiate read into locked folio */ folio_add_lru(folio); - return folio; + return 0; failed: folio_unlock(folio); - return swapcache; + return ret; } /** @@ -515,7 +499,6 @@ failed: * @gfp_mask: memory allocation flags * @mpol: NUMA memory allocation policy to be applied * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE - * @new_page_allocated: sets true if allocation happened, false otherwise * * Allocate a folio in the swap cache for one swap slot, typically before * doing IO (e.g. swap in or zswap writeback). The swap slot indicated by @@ -523,18 +506,40 @@ failed: * Currently only supports order 0. * * Context: Caller must protect the swap device with reference count or locks. 
- * Return: Returns the existing folio if @entry is cached already. Returns - * NULL if failed due to -ENOMEM or @entry have a swap count < 1. + * Return: Returns the folio if allocation succeeded and folio is added to + * swap cache. Returns error code if allocation failed due to race or OOM. */ struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask, - struct mempolicy *mpol, pgoff_t ilx, - bool *new_page_allocated) + struct mempolicy *mpol, pgoff_t ilx) +{ + int err; + struct folio *folio; + + /* Allocate a new folio to be added into the swap cache. */ + folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id()); + if (!folio) + return ERR_PTR(-ENOMEM); + + /* + * Try to add the new folio to the swap cache. It returns + * -EEXIST if the entry is already cached. + */ + err = __swap_cache_prepare_and_add(entry, folio, gfp_mask, false); + if (err) { + folio_put(folio); + return ERR_PTR(err); + } + + return folio; +} + +static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp, + struct mempolicy *mpol, pgoff_t ilx, + struct swap_iocb **plug, bool readahead) { struct swap_info_struct *si = __swap_entry_to_info(entry); struct folio *folio; - struct folio *result = NULL; - *new_page_allocated = false; /* Check the swap cache again for readahead path. */ folio = swap_cache_get_folio(entry); if (folio) @@ -544,17 +549,24 @@ struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask, if (!swap_entry_swapped(si, entry)) return NULL; - /* Allocate a new folio to be added into the swap cache. */ - folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id()); - if (!folio) + do { + folio = swap_cache_get_folio(entry); + if (folio) + return folio; + + folio = swap_cache_alloc_folio(entry, gfp, mpol, ilx); + } while (PTR_ERR(folio) == -EEXIST); + + if (IS_ERR_OR_NULL(folio)) return NULL; - /* Try add the new folio, returns existing folio or NULL on failure. 
*/ - result = __swap_cache_prepare_and_add(entry, folio, gfp_mask, false); - if (result == folio) - *new_page_allocated = true; - else - folio_put(folio); - return result; + + swap_read_folio(folio, plug); + if (readahead) { + folio_set_readahead(folio); + count_vm_event(SWAP_RA); + } + + return folio; } /** @@ -573,15 +585,35 @@ struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask, */ struct folio *swapin_folio(swp_entry_t entry, struct folio *folio) { + int ret; struct folio *swapcache; pgoff_t offset = swp_offset(entry); unsigned long nr_pages = folio_nr_pages(folio); entry = swp_entry(swp_type(entry), round_down(offset, nr_pages)); - swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true); - if (swapcache == folio) - swap_read_folio(folio, NULL); - return swapcache; + for (;;) { + ret = __swap_cache_prepare_and_add(entry, folio, 0, true); + if (!ret) { + swap_read_folio(folio, NULL); + break; + } + + /* + * Large order allocation needs special handling on + * race: if a smaller folio exists in cache, swapin needs + * to fall back to order 0, and doing a swap cache lookup + * might return a folio that is irrelevant to the faulting + * entry because @entry is aligned down. Just return NULL. 
+ */ + if (ret != -EEXIST || nr_pages > 1) + return NULL; + + swapcache = swap_cache_get_folio(entry); + if (swapcache) + return swapcache; + } + + return folio; } /* @@ -595,7 +627,6 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct swap_iocb **plug) { struct swap_info_struct *si; - bool page_allocated; struct mempolicy *mpol; pgoff_t ilx; struct folio *folio; @@ -605,13 +636,9 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return NULL; mpol = get_vma_policy(vma, addr, 0, &ilx); - folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx, - &page_allocated); + folio = swap_cache_read_folio(entry, gfp_mask, mpol, ilx, plug, false); mpol_cond_put(mpol); - if (page_allocated) - swap_read_folio(folio, plug); - put_swap_device(si); return folio; } @@ -696,7 +723,7 @@ static unsigned long swapin_nr_pages(unsigned long offset) * are fairly likely to have been swapped out from the same node. */ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, - struct mempolicy *mpol, pgoff_t ilx) + struct mempolicy *mpol, pgoff_t ilx) { struct folio *folio; unsigned long entry_offset = swp_offset(entry); @@ -706,7 +733,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct swap_info_struct *si = __swap_entry_to_info(entry); struct blk_plug plug; struct swap_iocb *splug = NULL; - bool page_allocated; + swp_entry_t ra_entry; mask = swapin_nr_pages(offset) - 1; if (!mask) @@ -723,18 +750,11 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, blk_start_plug(&plug); for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ - folio = swap_cache_alloc_folio( - swp_entry(swp_type(entry), offset), gfp_mask, mpol, ilx, - &page_allocated); + ra_entry = swp_entry(swp_type(entry), offset); + folio = swap_cache_read_folio(ra_entry, gfp_mask, mpol, ilx, + &splug, offset != entry_offset); if (!folio) continue; - if 
(page_allocated) { - swap_read_folio(folio, &splug); - if (offset != entry_offset) { - folio_set_readahead(folio); - count_vm_event(SWAP_RA); - } - } folio_put(folio); } blk_finish_plug(&plug); @@ -742,11 +762,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, lru_add_drain(); /* Push any new pages onto the LRU now */ skip: /* The page was likely read above, so no need for plugging here */ - folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx, - &page_allocated); - if (unlikely(page_allocated)) - swap_read_folio(folio, NULL); - return folio; + return swap_cache_read_folio(entry, gfp_mask, mpol, ilx, NULL, false); } static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start, @@ -812,8 +828,7 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, pte_t *pte = NULL, pentry; int win; unsigned long start, end, addr; - pgoff_t ilx; - bool page_allocated; + pgoff_t ilx = targ_ilx; win = swap_vma_ra_win(vmf, &start, &end); if (win == 1) @@ -847,19 +862,12 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, if (!si) continue; } - folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx, - &page_allocated); + folio = swap_cache_read_folio(entry, gfp_mask, mpol, ilx, + &splug, addr != vmf->address); if (si) put_swap_device(si); if (!folio) continue; - if (page_allocated) { - swap_read_folio(folio, &splug); - if (addr != vmf->address) { - folio_set_readahead(folio); - count_vm_event(SWAP_RA); - } - } folio_put(folio); } if (pte) @@ -869,10 +877,8 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, lru_add_drain(); skip: /* The folio was likely read above, so no need for plugging here */ - folio = swap_cache_alloc_folio(targ_entry, gfp_mask, mpol, targ_ilx, - &page_allocated); - if (unlikely(page_allocated)) - swap_read_folio(folio, NULL); + folio = swap_cache_read_folio(targ_entry, gfp_mask, mpol, targ_ilx, + NULL, false); return folio; } diff 
--git a/mm/zswap.c b/mm/zswap.c index 4b5149173b0e..e27f6e96f003 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -991,7 +991,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry, pgoff_t offset = swp_offset(swpentry); struct folio *folio; struct mempolicy *mpol; - bool folio_was_allocated; struct swap_info_struct *si; int ret = 0; @@ -1002,22 +1001,18 @@ static int zswap_writeback_entry(struct zswap_entry *entry, mpol = get_task_policy(current); folio = swap_cache_alloc_folio(swpentry, GFP_KERNEL, mpol, - NO_INTERLEAVE_INDEX, &folio_was_allocated); + NO_INTERLEAVE_INDEX); put_swap_device(si); - if (!folio) - return -ENOMEM; /* - * Found an existing folio, we raced with swapin or concurrent - * shrinker. We generally writeback cold folios from zswap, and - * swapin means the folio just became hot, so skip this folio. - * For unlikely concurrent shrinker case, it will be unlinked - * and freed when invalidated by the concurrent shrinker anyway. + * Swap cache allocation might fail due to OOM, or the entry + * may already be cached due to concurrent swapin or have been + * freed. If already cached, a concurrent swapin made the folio + * hot, so skip it. For the unlikely concurrent shrinker case, + * it will be unlinked and freed when invalidated anyway. */ - if (!folio_was_allocated) { - ret = -EEXIST; - goto out; - } + if (IS_ERR(folio)) + return PTR_ERR(folio); /* * folio is locked, and the swapcache is now secured against @@ -1057,7 +1052,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, __swap_writepage(folio, NULL); out: - if (ret && ret != -EEXIST) { + if (ret) { swap_cache_del_folio(folio); folio_unlock(folio); } From bebee474c1c1a3e9db2e1079639da1cd6e3ab0ba Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 17 May 2026 23:39:41 +0800 Subject: [PATCH 166/321] mm, swap: move common swap cache operations into standalone helpers Move a few swap cache checking, adding, and deletion operations into standalone helpers to be used later. 
And while at it, add proper kernel doc. No feature or behavior change. Link: https://lore.kernel.org/20260517-swap-table-p4-v5-2-88ae43e064c7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chengming Zhou Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Shakeel Butt Cc: Youngjun Park Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/swap_state.c | 146 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 100 insertions(+), 46 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 3bba82f6dc79..89fa19ec13f6 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -137,8 +137,47 @@ void *swap_cache_get_shadow(swp_entry_t entry) return NULL; } -void __swap_cache_add_folio(struct swap_cluster_info *ci, - struct folio *folio, swp_entry_t entry) +/** + * __swap_cache_add_check - Check if a range is suitable for adding a folio. + * @ci: The locked swap cluster. + * @ci_off: Range start offset. + * @nr: Number of slots to check. + * @shadow: Returns the shadow value if one exists in the range. + * + * Check if all slots covered by given range have a swap count >= 1. + * Retrieves the shadow if there is one. + * + * Context: Caller must lock the cluster. + * Return: 0 if success, error code if failed. 
+ */ +static int __swap_cache_add_check(struct swap_cluster_info *ci, + unsigned int ci_off, unsigned int nr, + void **shadow) +{ + unsigned int ci_end = ci_off + nr; + unsigned long old_tb; + + lockdep_assert_held(&ci->lock); + if (WARN_ON_ONCE(ci_off >= SWAPFILE_CLUSTER)) + return -EINVAL; + + if (unlikely(!ci->table)) + return -ENOENT; + do { + old_tb = __swap_table_get(ci, ci_off); + if (unlikely(swp_tb_is_folio(old_tb))) + return -EEXIST; + if (unlikely(!__swp_tb_get_count(old_tb))) + return -ENOENT; + if (swp_tb_is_shadow(old_tb)) + *shadow = swp_tb_to_shadow(old_tb); + } while (++ci_off < ci_end); + + return 0; +} + +static void __swap_cache_do_add_folio(struct swap_cluster_info *ci, + struct folio *folio, swp_entry_t entry) { unsigned int ci_off = swp_cluster_offset(entry), ci_end; unsigned long nr_pages = folio_nr_pages(folio); @@ -159,7 +198,28 @@ void __swap_cache_add_folio(struct swap_cluster_info *ci, folio_ref_add(folio, nr_pages); folio_set_swapcache(folio); folio->swap = entry; +} +/** + * __swap_cache_add_folio - Add a folio to the swap cache and update stats. + * @ci: The locked swap cluster. + * @folio: The folio to be added. + * @entry: The swap entry corresponding to the folio. + * + * Unconditionally add a folio to the swap cache. The caller must ensure + * all slots are usable and have no conflicts. This assigns entry to + * @folio->swap, increases folio refcount by the number of pages, and + * updates swap cache stats. + * + * Context: Caller must ensure the folio is locked and lock the cluster + * that holds the entries. 
+ */ +void __swap_cache_add_folio(struct swap_cluster_info *ci, + struct folio *folio, swp_entry_t entry) +{ + unsigned long nr_pages = folio_nr_pages(folio); + + __swap_cache_do_add_folio(ci, folio, entry); node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages); } @@ -168,9 +228,11 @@ void __swap_cache_add_folio(struct swap_cluster_info *ci, * swap_cache_add_folio - Add a folio into the swap cache. * @folio: The folio to be added. * @entry: The swap entry corresponding to the folio. - * @gfp: gfp_mask for XArray node allocation. * @shadowp: If a shadow is found, return the shadow. * + * Add a folio into the swap cache. Will return error if any slot is no + * longer a valid swapped out slot or already occupied by another folio. + * * Context: Caller must ensure @entry is valid and protect the swap device * with reference count or locks. */ @@ -179,60 +241,31 @@ static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, { int err; void *shadow = NULL; - unsigned long old_tb; + unsigned int ci_off; struct swap_info_struct *si; struct swap_cluster_info *ci; - unsigned int ci_start, ci_off, ci_end; unsigned long nr_pages = folio_nr_pages(folio); si = __swap_entry_to_info(entry); - ci_start = swp_cluster_offset(entry); - ci_end = ci_start + nr_pages; - ci_off = ci_start; ci = swap_cluster_lock(si, swp_offset(entry)); - if (unlikely(!ci->table)) { - err = -ENOENT; - goto failed; + ci_off = swp_cluster_offset(entry); + err = __swap_cache_add_check(ci, ci_off, nr_pages, &shadow); + if (err) { + swap_cluster_unlock(ci); + return err; } - do { - old_tb = __swap_table_get(ci, ci_off); - if (unlikely(swp_tb_is_folio(old_tb))) { - err = -EEXIST; - goto failed; - } - if (unlikely(!__swp_tb_get_count(old_tb))) { - err = -ENOENT; - goto failed; - } - if (swp_tb_is_shadow(old_tb)) - shadow = swp_tb_to_shadow(old_tb); - } while (++ci_off < ci_end); + __swap_cache_add_folio(ci, folio, entry); swap_cluster_unlock(ci); if 
(shadowp) *shadowp = shadow; - return 0; -failed: - swap_cluster_unlock(ci); - return err; + return 0; } -/** - * __swap_cache_del_folio - Removes a folio from the swap cache. - * @ci: The locked swap cluster. - * @folio: The folio. - * @entry: The first swap entry that the folio corresponds to. - * @shadow: shadow value to be filled in the swap cache. - * - * Removes a folio from the swap cache and fills a shadow in place. - * This won't put the folio's refcount. The caller has to do that. - * - * Context: Caller must ensure the folio is locked and in the swap cache - * using the index of @entry, and lock the cluster that holds the entries. - */ -void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, - swp_entry_t entry, void *shadow) +static void __swap_cache_do_del_folio(struct swap_cluster_info *ci, + struct folio *folio, + swp_entry_t entry, void *shadow) { int count; unsigned long old_tb; @@ -259,14 +292,12 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, folio_swapped = true; else need_free = true; - /* If shadow is NULL, we sets an empty shadow. */ + /* If shadow is NULL, we set an empty shadow. */ __swap_table_set(ci, ci_off, shadow_to_swp_tb(shadow, count)); } while (++ci_off < ci_end); folio->swap.val = 0; folio_clear_swapcache(folio); - node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages); - lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages); if (!folio_swapped) { __swap_cluster_free_entries(si, ci, ci_start, nr_pages); @@ -279,6 +310,29 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, } } +/** + * __swap_cache_del_folio - Removes a folio from the swap cache. + * @ci: The locked swap cluster. + * @folio: The folio. + * @entry: The first swap entry that the folio corresponds to. + * @shadow: shadow value to be filled in the swap cache. + * + * Removes a folio from the swap cache and fills a shadow in place. + * This won't put the folio's refcount. 
The caller has to do that. + * + * Context: Caller must ensure the folio is locked and in the swap cache + * using the index of @entry, and lock the cluster that holds the entries. + */ +void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, + swp_entry_t entry, void *shadow) +{ + unsigned long nr_pages = folio_nr_pages(folio); + + __swap_cache_do_del_folio(ci, folio, entry, shadow); + node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages); + lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages); +} + /** * swap_cache_del_folio - Removes a folio from the swap cache. * @folio: The folio. From 1dfbe92e702675964da45847ffe022a41bf4045e Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 17 May 2026 23:39:42 +0800 Subject: [PATCH 167/321] mm/huge_memory: move THP gfp limit helper into header Shmem has some special requirements for THP GFP and has to limit it in certain zones or provide a more lenient fallback. We'll use this helper for generic swap THP allocation, which needs to support shmem. For a typical GFP_HIGHUSER_MOVABLE swap-in, this helper is basically a no-op. But it's necessary for certain shmem users, mostly drivers. No feature change. 
Link: https://lore.kernel.org/20260517-swap-table-p4-v5-3-88ae43e064c7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Reviewed-by: Zi Yan Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chengming Zhou Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Shakeel Butt Cc: Youngjun Park Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 30 ++++++++++++++++++++++++++++++ mm/shmem.c | 30 +++--------------------------- 2 files changed, 33 insertions(+), 27 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 2949e5acff35..58382e97a66d 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -237,6 +237,31 @@ static inline bool thp_vma_suitable_order(struct vm_area_struct *vma, return true; } +/* + * Make sure huge_gfp is always more limited than limit_gfp. + * Some shmem users want THP allocation to be done less aggressively + * and only in certain zone. + */ +static inline gfp_t thp_shmem_limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) +{ + gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM; + gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY; + gfp_t zoneflags = limit_gfp & GFP_ZONEMASK; + gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK); + + /* Allow allocations only from the originally specified zones. */ + result |= zoneflags; + + /* + * Minimize the result gfp by taking the union with the deny flags, + * and the intersection of the allow flags. + */ + result |= (limit_gfp & denyflags); + result |= (huge_gfp & limit_gfp) & allowflags; + + return result; +} + /* * Filter the bitfield of input orders to the ones suitable for use in the vma. * See thp_vma_suitable_order(). 
@@ -581,6 +606,11 @@ static inline bool thp_vma_suitable_order(struct vm_area_struct *vma, return false; } +static inline gfp_t thp_shmem_limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) +{ + return huge_gfp; +} + static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, unsigned long addr, unsigned long orders) { diff --git a/mm/shmem.c b/mm/shmem.c index bab3529af23c..6edb23b41bac 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1791,30 +1791,6 @@ static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp, return folio; } -/* - * Make sure huge_gfp is always more limited than limit_gfp. - * Some of the flags set permissions, while others set limitations. - */ -static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) -{ - gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM; - gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY; - gfp_t zoneflags = limit_gfp & GFP_ZONEMASK; - gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK); - - /* Allow allocations only from the originally specified zones. */ - result |= zoneflags; - - /* - * Minimize the result gfp by taking the union with the deny flags, - * and the intersection of the allow flags. 
- */ - result |= (limit_gfp & denyflags); - result |= (huge_gfp & limit_gfp) & allowflags; - - return result; -} - #ifdef CONFIG_TRANSPARENT_HUGEPAGE bool shmem_hpage_pmd_enabled(void) { @@ -2065,7 +2041,7 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode, non_swapcache_batch(entry, nr_pages) != nr_pages) goto fallback; - alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); + alloc_gfp = thp_shmem_limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); } retry: new = shmem_alloc_folio(alloc_gfp, order, info, index); @@ -2141,7 +2117,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, if (nr_pages > 1) { gfp_t huge_gfp = vma_thp_gfp_mask(vma); - gfp = limit_gfp_mask(huge_gfp, gfp); + gfp = thp_shmem_limit_gfp_mask(huge_gfp, gfp); } #endif @@ -2548,7 +2524,7 @@ repeat: gfp_t huge_gfp; huge_gfp = vma_thp_gfp_mask(vma); - huge_gfp = limit_gfp_mask(huge_gfp, gfp); + huge_gfp = thp_shmem_limit_gfp_mask(huge_gfp, gfp); folio = shmem_alloc_and_add_folio(vmf, huge_gfp, inode, index, fault_mm, orders); if (!IS_ERR(folio)) { From e1e6750df3b47380a5c1ba9f517e634a8328283f Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 17 May 2026 23:39:43 +0800 Subject: [PATCH 168/321] mm, swap: add support for stable large allocation in swap cache directly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To make it possible to allocate large folios directly in swap cache, provide a new infrastructure helper to handle the swap cache status check, allocation, and order fallback in the swap cache layer The new helper replaces the existing swap_cache_alloc_folio. Based on this, all the separate swap folio allocation that is being done by anon / shmem before is converted to use this helper directly, unifying folio allocation for anon, shmem, and readahead. This slightly consolidates how allocation is synchronized, making it more stable and less prone to errors. 
The slot-count and cache-conflict check is now always performed with the cluster lock held before allocation, and repeated under the same lock right before cache insertion. This double check produces a more stable result than the previous anon and shmem mTHP allocation implementation, avoids the false-negative conflict checks that the lockless path can return — large allocations no longer have to be unwound because the range turned out to be occupied — and aborts early for already-freed slots, which helps ordinary swapin and especially readahead, with only a marginal increase in cluster-lock contention (the lock is very lightly contended and stays local in the first place). Hence, callers of swap_cache_alloc_folio() no longer need to check the swap slot count or swap cache status themselves. And now whoever first successfully allocates a folio in the swap cache will be the one who charges it and performs the swap-in. The race window of swapping is also reduced since the loop is much more compact.
Link: https://lore.kernel.org/20260517-swap-table-p4-v5-4-88ae43e064c7@tencent.com Signed-off-by: Kairui Song Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chengming Zhou Cc: Chris Li Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Shakeel Butt Cc: Youngjun Park Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/swap.h | 3 +- mm/swap_state.c | 236 ++++++++++++++++++++++++++++++++++-------------- mm/zswap.c | 2 +- 3 files changed, 170 insertions(+), 71 deletions(-) diff --git a/mm/swap.h b/mm/swap.h index ad8b17a93758..6774af10a943 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -280,7 +280,8 @@ bool swap_cache_has_folio(swp_entry_t entry); struct folio *swap_cache_get_folio(swp_entry_t entry); void *swap_cache_get_shadow(swp_entry_t entry); void swap_cache_del_folio(struct folio *folio); -struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_flags, +struct folio *swap_cache_alloc_folio(swp_entry_t target_entry, gfp_t gfp_mask, + unsigned long orders, struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx); /* Below helpers require the caller to lock and pass in the swap cluster. */ void __swap_cache_add_folio(struct swap_cluster_info *ci, diff --git a/mm/swap_state.c b/mm/swap_state.c index 89fa19ec13f6..0adb0565bbb1 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -139,10 +139,10 @@ void *swap_cache_get_shadow(swp_entry_t entry) /** * __swap_cache_add_check - Check if a range is suitable for adding a folio. - * @ci: The locked swap cluster. - * @ci_off: Range start offset. - * @nr: Number of slots to check. - * @shadow: Returns the shadow value if one exists in the range. + * @ci: The locked swap cluster + * @targ_entry: The target swap entry to check, will be rounded down by @nr + * @nr: Number of slots to check, must be a power of 2 + * @shadowp: Returns the shadow value if one exists in the range. 
* * Check if all slots covered by given range have a swap count >= 1. * Retrieves the shadow if there is one. @@ -151,26 +151,40 @@ void *swap_cache_get_shadow(swp_entry_t entry) * Return: 0 if success, error code if failed. */ static int __swap_cache_add_check(struct swap_cluster_info *ci, - unsigned int ci_off, unsigned int nr, - void **shadow) + swp_entry_t targ_entry, + unsigned long nr, void **shadowp) { - unsigned int ci_end = ci_off + nr; + unsigned int ci_off, ci_end; unsigned long old_tb; lockdep_assert_held(&ci->lock); - if (WARN_ON_ONCE(ci_off >= SWAPFILE_CLUSTER)) - return -EINVAL; + /* + * If the target slot is not swapped out or already cached, return + * -ENOENT or -EEXIST. If the batch is not suitable, could be a + * race with concurrent free or cache add, return -EBUSY. + */ if (unlikely(!ci->table)) return -ENOENT; + ci_off = swp_cluster_offset(targ_entry); + old_tb = __swap_table_get(ci, ci_off); + if (swp_tb_is_folio(old_tb)) + return -EEXIST; + if (!__swp_tb_get_count(old_tb)) + return -ENOENT; + if (swp_tb_is_shadow(old_tb) && shadowp) + *shadowp = swp_tb_to_shadow(old_tb); + + if (nr == 1) + return 0; + + ci_off = round_down(ci_off, nr); + ci_end = ci_off + nr; do { old_tb = __swap_table_get(ci, ci_off); - if (unlikely(swp_tb_is_folio(old_tb))) - return -EEXIST; - if (unlikely(!__swp_tb_get_count(old_tb))) - return -ENOENT; - if (swp_tb_is_shadow(old_tb)) - *shadow = swp_tb_to_shadow(old_tb); + if (unlikely(swp_tb_is_folio(old_tb) || + !__swp_tb_get_count(old_tb))) + return -EBUSY; } while (++ci_off < ci_end); return 0; @@ -241,15 +255,13 @@ static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, { int err; void *shadow = NULL; - unsigned int ci_off; struct swap_info_struct *si; struct swap_cluster_info *ci; unsigned long nr_pages = folio_nr_pages(folio); si = __swap_entry_to_info(entry); ci = swap_cluster_lock(si, swp_offset(entry)); - ci_off = swp_cluster_offset(entry); - err = __swap_cache_add_check(ci, ci_off, nr_pages, 
&shadow); + err = __swap_cache_add_check(ci, entry, nr_pages, &shadow); if (err) { swap_cluster_unlock(ci); return err; @@ -404,6 +416,142 @@ void __swap_cache_replace_folio(struct swap_cluster_info *ci, } } +/* + * Try to allocate a folio of given order in the swap cache. + * + * This helper resolves the potential races of swap allocation + * and prepares a folio to be used for swap IO. May return following + * value: + * + * -ENOMEM / -EBUSY: Order is too large or in conflict with sub slot, + * caller should shrink the order and retry + * -ENOENT / -EEXIST: Target swap entry is unavailable or cached, the caller + * should abort or try to use the cached folio instead + */ +static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci, + swp_entry_t targ_entry, gfp_t gfp, + unsigned int order, struct vm_fault *vmf, + struct mempolicy *mpol, pgoff_t ilx) +{ + int err; + swp_entry_t entry; + struct folio *folio; + void *shadow = NULL; + unsigned long address, nr_pages = 1UL << order; + struct vm_area_struct *vma = vmf ? vmf->vma : NULL; + + VM_WARN_ON_ONCE(nr_pages > SWAPFILE_CLUSTER); + entry.val = round_down(targ_entry.val, nr_pages); + + /* Check if the slot and range are available, skip allocation if not */ + spin_lock(&ci->lock); + err = __swap_cache_add_check(ci, targ_entry, nr_pages, NULL); + spin_unlock(&ci->lock); + if (unlikely(err)) + return ERR_PTR(err); + + /* + * Limit THP gfp. The limitation is a no-op for typical + * GFP_HIGHUSER_MOVABLE but matters for shmem. 
+ */ + if (order) + gfp = thp_shmem_limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); + + if (mpol || !vmf) { + folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id()); + } else { + address = round_down(vmf->address, PAGE_SIZE << order); + folio = vma_alloc_folio(gfp, order, vmf->vma, address); + } + if (unlikely(!folio)) + return ERR_PTR(-ENOMEM); + + /* Double check the range is still not in conflict */ + spin_lock(&ci->lock); + err = __swap_cache_add_check(ci, targ_entry, nr_pages, &shadow); + if (unlikely(err)) { + spin_unlock(&ci->lock); + folio_put(folio); + return ERR_PTR(err); + } + + __folio_set_locked(folio); + __folio_set_swapbacked(folio); + __swap_cache_do_add_folio(ci, folio, entry); + spin_unlock(&ci->lock); + + if (mem_cgroup_swapin_charge_folio(folio, vmf ? vmf->vma->vm_mm : NULL, + gfp, entry)) { + spin_lock(&ci->lock); + __swap_cache_do_del_folio(ci, folio, entry, shadow); + spin_unlock(&ci->lock); + folio_unlock(folio); + /* nr_pages refs from swap cache, 1 from allocation */ + folio_put_refs(folio, nr_pages + 1); + count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE); + return ERR_PTR(-ENOMEM); + } + + /* For memsw accounting, swap is uncharged when folio is added to swap cache */ + memcg1_swapin(entry, 1 << order); + if (shadow) + workingset_refault(folio, shadow); + + node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); + lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages); + + /* Caller will initiate read into locked new_folio */ + folio_add_lru(folio); + return folio; +} + +/** + * swap_cache_alloc_folio - Allocate folio for swapped out slot in swap cache. + * @targ_entry: swap entry indicating the target slot + * @gfp: memory allocation flags + * @orders: allocation orders, must be non zero + * @vmf: fault information + * @mpol: NUMA memory allocation policy to be applied + * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE + * + * Allocate a folio in the swap cache for one swap slot, typically before + * doing IO (e.g. 
swap in or zswap writeback). The swap slot indicated by + * @targ_entry must have a non-zero swap count (swapped out). + * + * Context: Caller must protect the swap device with reference count or locks. + * Return: Returns the folio if allocation succeeded and folio is in the swap + * cache. Returns error code if failed due to race, OOM or invalid arguments. + */ +struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp, + unsigned long orders, struct vm_fault *vmf, + struct mempolicy *mpol, pgoff_t ilx) +{ + int order, err; + struct folio *ret; + struct swap_cluster_info *ci; + + ci = __swap_entry_to_cluster(targ_entry); + order = highest_order(orders); + + /* orders must be non-zero, and must not exceed cluster size. */ + if (WARN_ON_ONCE(!orders || (1UL << order) > SWAPFILE_CLUSTER)) + return ERR_PTR(-EINVAL); + + do { + ret = __swap_cache_alloc(ci, targ_entry, gfp, order, + vmf, mpol, ilx); + if (!IS_ERR(ret)) + break; + err = PTR_ERR(ret); + if (!order || (err && err != -EBUSY && err != -ENOMEM)) + break; + count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK); + order = next_order(&orders, order); + } while (orders); + + return ret; +} + /* * If we are the only user, then try to free up the swap cache. * @@ -547,68 +695,18 @@ failed: return ret; } -/** - * swap_cache_alloc_folio - Allocate folio for swapped out slot in swap cache. - * @entry: the swapped out swap entry to be binded to the folio. - * @gfp_mask: memory allocation flags - * @mpol: NUMA memory allocation policy to be applied - * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE - * - * Allocate a folio in the swap cache for one swap slot, typically before - * doing IO (e.g. swap in or zswap writeback). The swap slot indicated by - * @entry must have a non-zero swap count (swapped out). - * Currently only supports order 0. - * - * Context: Caller must protect the swap device with reference count or locks. 
- * Return: Returns the folio if allocation succeeded and folio is added to - * swap cache. Returns error code if allocation failed due to race or OOM. - */ -struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask, - struct mempolicy *mpol, pgoff_t ilx) -{ - int err; - struct folio *folio; - - /* Allocate a new folio to be added into the swap cache. */ - folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id()); - if (!folio) - return ERR_PTR(-ENOMEM); - - /* - * Try to add the new folio to the swap cache. It returns - * -EEXIST if the entry is already cached. - */ - err = __swap_cache_prepare_and_add(entry, folio, gfp_mask, false); - if (err) { - folio_put(folio); - return ERR_PTR(err); - } - - return folio; -} - static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp, struct mempolicy *mpol, pgoff_t ilx, struct swap_iocb **plug, bool readahead) { - struct swap_info_struct *si = __swap_entry_to_info(entry); struct folio *folio; - /* Check the swap cache again for readahead path. */ - folio = swap_cache_get_folio(entry); - if (folio) - return folio; - - /* Skip allocation for unused and bad swap slot for readahead. 
*/ - if (!swap_entry_swapped(si, entry)) - return NULL; - do { folio = swap_cache_get_folio(entry); if (folio) return folio; - folio = swap_cache_alloc_folio(entry, gfp, mpol, ilx); + folio = swap_cache_alloc_folio(entry, gfp, BIT(0), NULL, mpol, ilx); } while (PTR_ERR(folio) == -EEXIST); if (IS_ERR_OR_NULL(folio)) diff --git a/mm/zswap.c b/mm/zswap.c index e27f6e96f003..761cd699e0a3 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1000,7 +1000,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, return -EEXIST; mpol = get_task_policy(current); - folio = swap_cache_alloc_folio(swpentry, GFP_KERNEL, mpol, + folio = swap_cache_alloc_folio(swpentry, GFP_KERNEL, BIT(0), NULL, mpol, NO_INTERLEAVE_INDEX); put_swap_device(si); From 02d733a7ec1d751ddb624cf5d1eb953d0bf2f704 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 17 May 2026 23:39:44 +0800 Subject: [PATCH 169/321] mm, swap: unify large folio allocation Now that direct large order allocation is supported in the swap cache, both anon and shmem can use it instead of implementing their own methods. This unifies the fallback and swap cache check, which also reduces the TOCTOU race window of swap cache state: previously, high order swapin required checking swap cache states first, then allocating and falling back separately. Now all these steps happen in the same compact loop. Order fallback and statistics are also unified, callers just need to check and pass the acceptable order bitmask. There is basically no behavior change. This only makes things more unified and prepares for later commits. 
Cgroup and zero map checks can also be moved into the compact loop, further reducing race windows and redundancy Link: https://lore.kernel.org/20260517-swap-table-p4-v5-5-88ae43e064c7@tencent.com Signed-off-by: Kairui Song Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chengming Zhou Cc: Chris Li Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Shakeel Butt Cc: Youngjun Park Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 80 ++++++--------------------- mm/shmem.c | 102 +++++++++++----------------------- mm/swap.h | 30 ++-------- mm/swap_state.c | 143 ++++++++---------------------------------------- mm/swapfile.c | 3 +- 5 files changed, 79 insertions(+), 279 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 0c9d9c2cbf0e..da891bcce59c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4609,26 +4609,6 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -static struct folio *__alloc_swap_folio(struct vm_fault *vmf) -{ - struct vm_area_struct *vma = vmf->vma; - struct folio *folio; - softleaf_t entry; - - folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address); - if (!folio) - return NULL; - - entry = softleaf_from_pte(vmf->orig_pte); - if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, - GFP_KERNEL, entry)) { - folio_put(folio); - return NULL; - } - - return folio; -} - #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * Check if the PTEs within a range are contiguous swap entries @@ -4658,8 +4638,6 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) */ if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages)) return false; - if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages)) - return false; return true; } @@ -4687,16 +4665,14 @@ static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset, return orders; } -static struct folio *alloc_swap_folio(struct 
vm_fault *vmf) +static unsigned long thp_swapin_suitable_orders(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; unsigned long orders; - struct folio *folio; unsigned long addr; softleaf_t entry; spinlock_t *ptl; pte_t *pte; - gfp_t gfp; int order; /* @@ -4704,7 +4680,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) * maintain the uffd semantics. */ if (unlikely(userfaultfd_armed(vma))) - goto fallback; + return 0; /* * A large swapped out folio could be partially or fully in zswap. We @@ -4712,7 +4688,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) * folio. */ if (!zswap_never_enabled()) - goto fallback; + return 0; entry = softleaf_from_pte(vmf->orig_pte); /* @@ -4726,12 +4702,12 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) vmf->address, orders); if (!orders) - goto fallback; + return 0; pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address & PMD_MASK, &ptl); if (unlikely(!pte)) - goto fallback; + return 0; /* * For do_swap_page, find the highest order where the aligned range is @@ -4747,29 +4723,12 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) pte_unmap_unlock(pte, ptl); - /* Try allocating the highest of the remaining orders. 
*/ - gfp = vma_thp_gfp_mask(vma); - while (orders) { - addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); - folio = vma_alloc_folio(gfp, order, vma, addr); - if (folio) { - if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, - gfp, entry)) - return folio; - count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE); - folio_put(folio); - } - count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK); - order = next_order(&orders, order); - } - -fallback: - return __alloc_swap_folio(vmf); + return orders; } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ -static struct folio *alloc_swap_folio(struct vm_fault *vmf) +static unsigned long thp_swapin_suitable_orders(struct vm_fault *vmf) { - return __alloc_swap_folio(vmf); + return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -4875,23 +4834,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (folio) swap_update_readahead(folio, vma, vmf->address); if (!folio) { - if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { - folio = alloc_swap_folio(vmf); - if (folio) { - /* - * folio is charged, so swapin can only fail due - * to raced swapin and return NULL. - */ - swapcache = swapin_folio(entry, folio); - if (swapcache != folio) - folio_put(folio); - folio = swapcache; - } - } else { + /* Swapin bypasses readahead for SWP_SYNCHRONOUS_IO devices */ + if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) + folio = swapin_sync(entry, GFP_HIGHUSER_MOVABLE, + thp_swapin_suitable_orders(vmf) | BIT(0), + vmf, NULL, 0); + else folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf); - } - if (!folio) { + if (IS_ERR_OR_NULL(folio)) { /* * Back out if somebody else faulted in this pte * while we released the pte lock. 
@@ -4901,6 +4852,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) ret = VM_FAULT_OOM; + folio = NULL; goto unlock; } diff --git a/mm/shmem.c b/mm/shmem.c index 6edb23b41bac..77a3e28e5160 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -159,7 +159,7 @@ static unsigned long shmem_default_max_inodes(void) static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, - struct vm_area_struct *vma, vm_fault_t *fault_type); + struct vm_fault *vmf, vm_fault_t *fault_type); static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { @@ -2017,68 +2017,32 @@ unlock: } static struct folio *shmem_swap_alloc_folio(struct inode *inode, - struct vm_area_struct *vma, pgoff_t index, + struct vm_fault *vmf, pgoff_t index, swp_entry_t entry, int order, gfp_t gfp) { + pgoff_t ilx; + struct folio *folio; + struct mempolicy *mpol; struct shmem_inode_info *info = SHMEM_I(inode); - struct folio *new, *swapcache; - int nr_pages = 1 << order; - gfp_t alloc_gfp = gfp; - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - if (WARN_ON_ONCE(order)) - return ERR_PTR(-EINVAL); - } else if (order) { - /* - * If uffd is active for the vma, we need per-page fault - * fidelity to maintain the uffd semantics, then fallback - * to swapin order-0 folio, as well as for zswap case. - * Any existing sub folio in the swap cache also blocks - * mTHP swapin. 
- */ - if ((vma && unlikely(userfaultfd_armed(vma))) || - !zswap_never_enabled() || - non_swapcache_batch(entry, nr_pages) != nr_pages) - goto fallback; + if ((vmf && unlikely(userfaultfd_armed(vmf->vma))) || + !zswap_never_enabled()) + order = 0; - alloc_gfp = thp_shmem_limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); - } -retry: - new = shmem_alloc_folio(alloc_gfp, order, info, index); - if (!new) { - new = ERR_PTR(-ENOMEM); - goto fallback; +again: + mpol = shmem_get_pgoff_policy(info, index, order, &ilx); + folio = swapin_sync(entry, gfp, BIT(order), vmf, mpol, ilx); + mpol_cond_put(mpol); + + if (!IS_ERR(folio)) + return folio; + + if (order) { + order = 0; + goto again; } - if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL, - alloc_gfp, entry)) { - folio_put(new); - new = ERR_PTR(-ENOMEM); - goto fallback; - } - - swapcache = swapin_folio(entry, new); - if (swapcache != new) { - folio_put(new); - if (!swapcache) { - /* - * The new folio is charged already, swapin can - * only fail due to another raced swapin. - */ - new = ERR_PTR(-EEXIST); - goto fallback; - } - } - return swapcache; -fallback: - /* Order 0 swapin failed, nothing to fallback to, abort */ - if (!order) - return new; - entry.val += index - round_down(index, nr_pages); - alloc_gfp = gfp; - nr_pages = 1; - order = 0; - goto retry; + return folio; } /* @@ -2265,11 +2229,12 @@ unlock: */ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, - gfp_t gfp, struct vm_area_struct *vma, + gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type) { struct address_space *mapping = inode->i_mapping; - struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL; + struct vm_area_struct *vma = vmf ? vmf->vma : NULL; + struct mm_struct *fault_mm = vmf ? 
vmf->vma->vm_mm : NULL; struct shmem_inode_info *info = SHMEM_I(inode); swp_entry_t swap; softleaf_t index_entry; @@ -2310,20 +2275,19 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { /* Direct swapin skipping swap cache & readahead */ - folio = shmem_swap_alloc_folio(inode, vma, index, - index_entry, order, gfp); - if (IS_ERR(folio)) { - error = PTR_ERR(folio); - folio = NULL; - goto failed; - } + folio = shmem_swap_alloc_folio(inode, vmf, index, + swap, order, gfp); } else { /* Cached swapin only supports order 0 folio */ folio = shmem_swapin_cluster(swap, gfp, info, index); - if (!folio) { + } + if (IS_ERR_OR_NULL(folio)) { + if (IS_ERR(folio)) + error = PTR_ERR(folio); + else error = -ENOMEM; - goto failed; - } + folio = NULL; + goto failed; } if (fault_type) { *fault_type |= VM_FAULT_MAJOR; @@ -2471,7 +2435,7 @@ repeat: if (xa_is_value(folio)) { error = shmem_swapin_folio(inode, index, &folio, - sgp, gfp, vma, fault_type); + sgp, gfp, vmf, fault_type); if (error == -EEXIST) goto repeat; diff --git a/mm/swap.h b/mm/swap.h index 6774af10a943..8e57e9431624 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -300,7 +300,8 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, struct mempolicy *mpol, pgoff_t ilx); struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag, struct vm_fault *vmf); -struct folio *swapin_folio(swp_entry_t entry, struct folio *folio); +struct folio *swapin_sync(swp_entry_t entry, gfp_t flag, unsigned long orders, + struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx); void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, unsigned long addr); @@ -334,24 +335,6 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, return find_next_bit(sis->zeromap, end, start) - start; } -static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) -{ - int i; - - /* - * While allocating a large folio and doing 
mTHP swapin, we need to - * ensure all entries are not cached, otherwise, the mTHP folio will - * be in conflict with the folio in swap cache. - */ - for (i = 0; i < max_nr; i++) { - if (swap_cache_has_folio(entry)) - return i; - entry.val++; - } - - return i; -} - #else /* CONFIG_SWAP */ struct swap_iocb; static inline struct swap_cluster_info *swap_cluster_lock( @@ -433,7 +416,9 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, return NULL; } -static inline struct folio *swapin_folio(swp_entry_t entry, struct folio *folio) +static inline struct folio *swapin_sync( + swp_entry_t entry, gfp_t flag, unsigned long orders, + struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx) { return NULL; } @@ -493,10 +478,5 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, { return 0; } - -static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) -{ - return 0; -} #endif /* CONFIG_SWAP */ #endif /* _MM_SWAP_H */ diff --git a/mm/swap_state.c b/mm/swap_state.c index 0adb0565bbb1..98c8691826fb 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -238,43 +238,6 @@ void __swap_cache_add_folio(struct swap_cluster_info *ci, lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages); } -/** - * swap_cache_add_folio - Add a folio into the swap cache. - * @folio: The folio to be added. - * @entry: The swap entry corresponding to the folio. - * @shadowp: If a shadow is found, return the shadow. - * - * Add a folio into the swap cache. Will return error if any slot is no - * longer a valid swapped out slot or already occupied by another folio. - * - * Context: Caller must ensure @entry is valid and protect the swap device - * with reference count or locks. 
- */ -static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, - void **shadowp) -{ - int err; - void *shadow = NULL; - struct swap_info_struct *si; - struct swap_cluster_info *ci; - unsigned long nr_pages = folio_nr_pages(folio); - - si = __swap_entry_to_info(entry); - ci = swap_cluster_lock(si, swp_offset(entry)); - err = __swap_cache_add_check(ci, entry, nr_pages, &shadow); - if (err) { - swap_cluster_unlock(ci); - return err; - } - - __swap_cache_add_folio(ci, folio, entry); - swap_cluster_unlock(ci); - if (shadowp) - *shadowp = shadow; - - return 0; -} - static void __swap_cache_do_del_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry, void *shadow) @@ -650,51 +613,6 @@ void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, } } -/** - * __swap_cache_prepare_and_add - Prepare the folio and add it to swap cache. - * @entry: swap entry to be bound to the folio. - * @folio: folio to be added. - * @gfp: memory allocation flags for charge, can be 0 if @charged if true. - * @charged: if the folio is already charged. - * - * Update the swap_map and add folio as swap cache, typically before swapin. - * All swap slots covered by the folio must have a non-zero swap count. - * - * Context: Caller must protect the swap device with reference count or locks. - * Return: 0 if success, error code if failed. 
- */ -static int __swap_cache_prepare_and_add(swp_entry_t entry, - struct folio *folio, - gfp_t gfp, bool charged) -{ - void *shadow; - int ret; - - __folio_set_locked(folio); - __folio_set_swapbacked(folio); - - if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) { - ret = -ENOMEM; - goto failed; - } - - ret = swap_cache_add_folio(folio, entry, &shadow); - if (ret) - goto failed; - - memcg1_swapin(entry, folio_nr_pages(folio)); - if (shadow) - workingset_refault(folio, shadow); - - /* Caller will initiate read into locked folio */ - folio_add_lru(folio); - return 0; - -failed: - folio_unlock(folio); - return ret; -} - static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp, struct mempolicy *mpol, pgoff_t ilx, struct swap_iocb **plug, bool readahead) @@ -705,7 +623,6 @@ static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp, folio = swap_cache_get_folio(entry); if (folio) return folio; - folio = swap_cache_alloc_folio(entry, gfp, BIT(0), NULL, mpol, ilx); } while (PTR_ERR(folio) == -EEXIST); @@ -722,49 +639,37 @@ static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp, } /** - * swapin_folio - swap-in one or multiple entries skipping readahead. - * @entry: starting swap entry to swap in - * @folio: a new allocated and charged folio + * swapin_sync - swap-in one or multiple entries skipping readahead. + * @entry: swap entry indicating the target slot + * @gfp: memory allocation flags + * @orders: allocation orders + * @vmf: fault information + * @mpol: NUMA memory allocation policy to be applied + * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * - * Reads @entry into @folio, @folio will be added to the swap cache. - * If @folio is a large folio, the @entry will be rounded down to align - * with the folio size. + * This allocates a folio suitable for given @orders, or returns the + * existing folio in the swap cache for @entry. This initiates the IO, too, + * if needed. 
@entry is rounded down if @orders allow large allocation. * - * Return: returns pointer to @folio on success. If folio is a large folio - * and this raced with another swapin, NULL will be returned to allow fallback - * to order 0. Else, if another folio was already added to the swap cache, - * return that swap cache folio instead. + * Context: Caller must ensure @entry is valid and pin the swap device with refcount. + * Return: Returns the folio on success, error code if failed. */ -struct folio *swapin_folio(swp_entry_t entry, struct folio *folio) +struct folio *swapin_sync(swp_entry_t entry, gfp_t gfp, unsigned long orders, + struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx) { - int ret; - struct folio *swapcache; - pgoff_t offset = swp_offset(entry); - unsigned long nr_pages = folio_nr_pages(folio); + struct folio *folio; - entry = swp_entry(swp_type(entry), round_down(offset, nr_pages)); - for (;;) { - ret = __swap_cache_prepare_and_add(entry, folio, 0, true); - if (!ret) { - swap_read_folio(folio, NULL); - break; - } + do { + folio = swap_cache_get_folio(entry); + if (folio) + return folio; + folio = swap_cache_alloc_folio(entry, gfp, orders, vmf, mpol, ilx); + } while (PTR_ERR(folio) == -EEXIST); - /* - * Large order allocation needs special handling on - * race: if a smaller folio exists in cache, swapin needs - * to fall back to order 0, and doing a swap cache lookup - * might return a folio that is irrelevant to the faulting - * entry because @entry is aligned down. Just return NULL. - */ - if (ret != -EEXIST || nr_pages > 1) - return NULL; - - swapcache = swap_cache_get_folio(entry); - if (swapcache) - return swapcache; - } + if (IS_ERR(folio)) + return folio; + swap_read_folio(folio, NULL); return folio; } diff --git a/mm/swapfile.c b/mm/swapfile.c index ee515a6fbccd..4ffd491cacca 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1853,8 +1853,7 @@ void folio_put_swap(struct folio *folio, struct page *subpage) * do_swap_page() * ... 
swapoff+swapon * swap_cache_alloc_folio() - * swap_cache_add_folio() - * // check swap_map + * // check swap_map * // verify PTE not changed * * In __swap_duplicate(), the swap_map need to be checked before From 945578fee2ec17bebdec067371214d3cbed48822 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 17 May 2026 23:39:45 +0800 Subject: [PATCH 170/321] mm/memcg, swap: tidy up cgroup v1 memsw swap helpers The cgroup v1 swap helpers always operate on swap cache folios whose swap entry is stable: the folio is locked and in the swap cache. There is no need to pass the swap entry or page count as separate parameters when they can be derived from the folio itself. Simplify the redundant parameters and add sanity checks to document the required preconditions. Also rename memcg1_swapout to __memcg1_swapout to indicate it requires special calling context: the folio must be isolated and dying, and the call must be made with interrupts disabled. No functional change. Link: https://lore.kernel.org/20260517-swap-table-p4-v5-6-88ae43e064c7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chengming Zhou Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Shakeel Butt Cc: Youngjun Park Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 8 ++++---- include/linux/swap.h | 10 ++++------ mm/huge_memory.c | 2 +- mm/memcontrol-v1.c | 33 ++++++++++++++++++++------------- mm/memcontrol.c | 9 ++++----- mm/swap_state.c | 4 ++-- mm/swapfile.c | 2 +- mm/vmscan.c | 2 +- 8 files changed, 37 insertions(+), 33 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index dc3fa687759b..7d08128de1fd 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1899,8 +1899,8 @@ static inline void mem_cgroup_exit_user_fault(void) current->in_user_fault = 0; } -void 
memcg1_swapout(struct folio *folio, swp_entry_t entry); -void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages); +void __memcg1_swapout(struct folio *folio); +void memcg1_swapin(struct folio *folio); #else /* CONFIG_MEMCG_V1 */ static inline @@ -1929,11 +1929,11 @@ static inline void mem_cgroup_exit_user_fault(void) { } -static inline void memcg1_swapout(struct folio *folio, swp_entry_t entry) +static inline void __memcg1_swapout(struct folio *folio) { } -static inline void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages) +static inline void memcg1_swapin(struct folio *folio) { } diff --git a/include/linux/swap.h b/include/linux/swap.h index 7a09df6977a5..f907d3df52d0 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -571,13 +571,12 @@ static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) #endif #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) -int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry); -static inline int mem_cgroup_try_charge_swap(struct folio *folio, - swp_entry_t entry) +int __mem_cgroup_try_charge_swap(struct folio *folio); +static inline int mem_cgroup_try_charge_swap(struct folio *folio) { if (mem_cgroup_disabled()) return 0; - return __mem_cgroup_try_charge_swap(folio, entry); + return __mem_cgroup_try_charge_swap(folio); } extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); @@ -591,8 +590,7 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_p extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); extern bool mem_cgroup_swap_full(struct folio *folio); #else -static inline int mem_cgroup_try_charge_swap(struct folio *folio, - swp_entry_t entry) +static inline int mem_cgroup_try_charge_swap(struct folio *folio) { return 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b7df167f7acb..1f14c5c48b4a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -4446,7 +4446,7 @@ void 
deferred_split_folio(struct folio *folio, bool partially_mapped) /* * Exclude swapcache: originally to avoid a corrupt deferred split - * queue. Nowadays that is fully prevented by memcg1_swapout(); + * queue. Nowadays that is fully prevented by __memcg1_swapout(); * but if page reclaim is already handling the same folio, it is * unnecessary to handle it again in the shrinker, so excluding * swapcache here may still be a useful optimization. diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 433bba9dfe71..36c507d81dc5 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -604,18 +604,23 @@ void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg) } /** - * memcg1_swapout - transfer a memsw charge to swap + * __memcg1_swapout - transfer a memsw charge to swap * @folio: folio whose memsw charge to transfer - * @entry: swap entry to move the charge to * - * Transfer the memsw charge of @folio to @entry. + * Transfer the memsw charge of @folio to the swap entry stored in + * folio->swap. + * + * Context: folio must be isolated, unmapped, locked and is just about + * to be freed, and caller must disable IRQs. 
*/ -void memcg1_swapout(struct folio *folio, swp_entry_t entry) +void __memcg1_swapout(struct folio *folio) { struct mem_cgroup *memcg, *swap_memcg; struct obj_cgroup *objcg; unsigned int nr_entries; + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); @@ -641,7 +646,7 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry) swap_memcg = mem_cgroup_private_id_get_online(memcg, nr_entries); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); - swap_cgroup_record(folio, mem_cgroup_private_id(swap_memcg), entry); + swap_cgroup_record(folio, mem_cgroup_private_id(swap_memcg), folio->swap); folio_unqueue_deferred_split(folio); folio->memcg_data = 0; @@ -671,18 +676,20 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry) obj_cgroup_put(objcg); } -/* +/** * memcg1_swapin - uncharge swap slot - * @entry: the first swap entry for which the pages are charged - * @nr_pages: number of pages which will be uncharged + * @folio: folio being swapped in * - * Call this function after successfully adding the charged page to swapcache. + * Call this function after successfully adding the charged + * folio to swapcache. * - * Note: This function assumes the page for which swap slot is being uncharged - * is order 0 page. + * Context: The folio has to be in swap cache and locked. */ -void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages) +void memcg1_swapin(struct folio *folio) { + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + /* * Cgroup1's unified memory+swap counter has been charged with the * new swapcache page, finish the transfer by uncharging the swap @@ -701,7 +708,7 @@ void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages) * let's not wait for it. 
The page already received a * memory+swap charge, drop the swap entry duplicate. */ - mem_cgroup_uncharge_swap(entry, nr_pages); + mem_cgroup_uncharge_swap(folio->swap, folio_nr_pages(folio)); } } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 431cad99189f..c3d0f79dc84e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5473,13 +5473,12 @@ int __init mem_cgroup_init(void) /** * __mem_cgroup_try_charge_swap - try charging swap space for a folio * @folio: folio being added to swap - * @entry: swap entry to charge * - * Try to charge @folio's memcg for the swap space at @entry. + * Try to charge @folio's memcg for the swap space at folio->swap. * * Returns 0 on success, -ENOMEM on failure. */ -int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) +int __mem_cgroup_try_charge_swap(struct folio *folio) { unsigned int nr_pages = folio_nr_pages(folio); struct page_counter *counter; @@ -5496,7 +5495,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) rcu_read_lock(); memcg = obj_cgroup_memcg(objcg); - if (!entry.val) { + if (!folio_test_swapcache(folio)) { memcg_memory_event(memcg, MEMCG_SWAP_FAIL); rcu_read_unlock(); return 0; @@ -5515,7 +5514,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) } mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); - swap_cgroup_record(folio, mem_cgroup_private_id(memcg), entry); + swap_cgroup_record(folio, mem_cgroup_private_id(memcg), folio->swap); return 0; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 98c8691826fb..7a80494fa37f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -455,8 +455,8 @@ static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci, return ERR_PTR(-ENOMEM); } - /* For memsw accounting, swap is uncharged when folio is added to swap cache */ - memcg1_swapin(entry, 1 << order); + /* memsw uncharges swap when folio is added to swap cache */ + memcg1_swapin(folio); if (shadow) workingset_refault(folio, shadow); diff --git 
a/mm/swapfile.c b/mm/swapfile.c index 4ffd491cacca..4875b3d3e658 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1757,7 +1757,7 @@ again: } /* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */ - if (unlikely(mem_cgroup_try_charge_swap(folio, folio->swap))) + if (unlikely(mem_cgroup_try_charge_swap(folio))) swap_cache_del_folio(folio); if (unlikely(!folio_test_swapcache(folio))) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4b0984387658..3231af682fa7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -739,7 +739,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); - memcg1_swapout(folio, swap); + __memcg1_swapout(folio); __swap_cache_del_folio(ci, folio, swap, shadow); swap_cluster_unlock_irq(ci); } else { From c1fd92589c0dc15f4daa798c3a83d190a1ce674a Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 17 May 2026 23:39:46 +0800 Subject: [PATCH 171/321] mm, swap: support flexible batch freeing of slots in different memcgs Instead of requiring the caller to ensure all slots are in the same memcg, make the function handle different memcgs at once. This is both a micro optimization and required for removing the memcg lookup in the page table layer, so it can be unified at the swap layer. We are not removing the memcg lookup in the page table in this commit. It has to be done after the memcg lookup is deferred to the swap layer. 
Link: https://lore.kernel.org/20260517-swap-table-p4-v5-7-88ae43e064c7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chengming Zhou Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Shakeel Butt Cc: Youngjun Park Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/swapfile.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 4875b3d3e658..60d8f0df3f32 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1899,21 +1899,46 @@ void __swap_cluster_free_entries(struct swap_info_struct *si, unsigned int ci_start, unsigned int nr_pages) { unsigned long old_tb; + unsigned int type = si->type; + unsigned short batch_id = 0, id_cur; unsigned int ci_off = ci_start, ci_end = ci_start + nr_pages; - unsigned long offset = cluster_offset(si, ci) + ci_start; + unsigned long ci_head = cluster_offset(si, ci); + unsigned int batch_off = ci_off; + swp_entry_t entry; VM_WARN_ON(ci->count < nr_pages); ci->count -= nr_pages; do { old_tb = __swap_table_get(ci, ci_off); - /* Release the last ref, or after swap cache is dropped */ + /* + * Freeing is done after release of the last swap count + * ref, or after swap cache is dropped + */ VM_WARN_ON(!swp_tb_is_shadow(old_tb) || __swp_tb_get_count(old_tb) > 1); __swap_table_set(ci, ci_off, null_to_swp_tb()); + + /* + * Uncharge swap slots by memcg in batches. Consecutive + * slots with the same cgroup id are uncharged together. 
+ */ + entry = swp_entry(type, ci_head + ci_off); + id_cur = lookup_swap_cgroup_id(entry); + if (batch_id != id_cur) { + if (batch_id) + mem_cgroup_uncharge_swap(swp_entry(type, ci_head + batch_off), + ci_off - batch_off); + batch_id = id_cur; + batch_off = ci_off; + } } while (++ci_off < ci_end); - mem_cgroup_uncharge_swap(swp_entry(si->type, offset), nr_pages); - swap_range_free(si, offset, nr_pages); + if (batch_id) { + mem_cgroup_uncharge_swap(swp_entry(type, ci_head + batch_off), + ci_off - batch_off); + } + + swap_range_free(si, ci_head + ci_start, nr_pages); swap_cluster_assert_empty(ci, ci_start, nr_pages, false); if (!ci->count) From bc34e87a51d9e51d398ef6d8c2c35cf1a4ff38b9 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 17 May 2026 23:39:47 +0800 Subject: [PATCH 172/321] mm, swap: delay and unify memcg lookup and charging for swapin Instead of checking the cgroup private ID during page table walk in swap_pte_batch(), move the memcg lookup into __swap_cache_add_check() under the cluster lock. The first pre-alloc check is speculative and skips the memcg check since the post-alloc stable check ensures all slots covered by the folio belong to the same memcg. It is very rare for contiguous and aligned entries across a contiguous region of a page table of the same process or shmem mapping to belong to different memcgs. This also prepares for recording the memcg info in the cluster's table. Also make the order check and fallback more compact. There should be no user-observable behavior change. 
Link: https://lore.kernel.org/20260517-swap-table-p4-v5-8-88ae43e064c7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chengming Zhou Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Shakeel Butt Cc: Youngjun Park Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 6 +++--- mm/internal.h | 10 +--------- mm/memcontrol.c | 10 ++++------ mm/swap_state.c | 28 +++++++++++++++++++--------- 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7d08128de1fd..a013f37f24aa 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -646,8 +646,8 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp); -int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, - gfp_t gfp, swp_entry_t entry); +int mem_cgroup_swapin_charge_folio(struct folio *folio, unsigned short id, + struct mm_struct *mm, gfp_t gfp); void __mem_cgroup_uncharge(struct folio *folio); @@ -1137,7 +1137,7 @@ static inline int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp) } static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, - struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) + unsigned short id, struct mm_struct *mm, gfp_t gfp) { return 0; } diff --git a/mm/internal.h b/mm/internal.h index 09931b1e535f..9dbd8e3c991f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -451,24 +451,16 @@ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte) { pte_t expected_pte = pte_next_swp_offset(pte); const pte_t *end_ptep = start_ptep + max_nr; - const softleaf_t entry = softleaf_from_pte(pte); pte_t *ptep = start_ptep + 1; - unsigned short cgroup_id; VM_WARN_ON(max_nr < 1); - 
VM_WARN_ON(!softleaf_is_swap(entry)); + VM_WARN_ON(!softleaf_is_swap(softleaf_from_pte(pte))); - cgroup_id = lookup_swap_cgroup_id(entry); while (ptep < end_ptep) { - softleaf_t entry; - pte = ptep_get(ptep); if (!pte_same(pte, expected_pte)) break; - entry = softleaf_from_pte(pte); - if (lookup_swap_cgroup_id(entry) != cgroup_id) - break; expected_pte = pte_next_swp_offset(expected_pte); ptep++; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c3d0f79dc84e..1b58b314cb18 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5079,27 +5079,25 @@ out: /** * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin. - * @folio: folio to charge. + * @folio: the folio to charge + * @id: memory cgroup id * @mm: mm context of the victim * @gfp: reclaim mode - * @entry: swap entry for which the folio is allocated * * This function charges a folio allocated for swapin. Please call this before * adding the folio to the swapcache. * * Returns 0 on success. Otherwise, an error code is returned. */ -int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, - gfp_t gfp, swp_entry_t entry) +int mem_cgroup_swapin_charge_folio(struct folio *folio, unsigned short id, + struct mm_struct *mm, gfp_t gfp) { struct mem_cgroup *memcg; - unsigned short id; int ret; if (mem_cgroup_disabled()) return 0; - id = lookup_swap_cgroup_id(entry); rcu_read_lock(); memcg = mem_cgroup_from_private_id(id); if (!memcg || !css_tryget_online(&memcg->css)) diff --git a/mm/swap_state.c b/mm/swap_state.c index 7a80494fa37f..bdd949ae0044 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -142,17 +142,21 @@ void *swap_cache_get_shadow(swp_entry_t entry) * @ci: The locked swap cluster * @targ_entry: The target swap entry to check, will be rounded down by @nr * @nr: Number of slots to check, must be a power of 2 - * @shadowp: Returns the shadow value if one exists in the range. 
+ * @shadowp: Returns the shadow value if one exists in the range + * @memcg_id: Returns the memory cgroup id, NULL to ignore cgroup check * * Check if all slots covered by given range have a swap count >= 1. - * Retrieves the shadow if there is one. + * Retrieves the shadow if there is one. If @memcg_id is not NULL, also + * checks if all slots belong to the same cgroup and return the cgroup + * private id. * * Context: Caller must lock the cluster. * Return: 0 if success, error code if failed. */ static int __swap_cache_add_check(struct swap_cluster_info *ci, swp_entry_t targ_entry, - unsigned long nr, void **shadowp) + unsigned long nr, void **shadowp, + unsigned short *memcg_id) { unsigned int ci_off, ci_end; unsigned long old_tb; @@ -172,19 +176,24 @@ static int __swap_cache_add_check(struct swap_cluster_info *ci, return -EEXIST; if (!__swp_tb_get_count(old_tb)) return -ENOENT; - if (swp_tb_is_shadow(old_tb) && shadowp) + if (shadowp && swp_tb_is_shadow(old_tb)) *shadowp = swp_tb_to_shadow(old_tb); + if (memcg_id) + *memcg_id = lookup_swap_cgroup_id(targ_entry); if (nr == 1) return 0; + targ_entry.val = round_down(targ_entry.val, nr); ci_off = round_down(ci_off, nr); ci_end = ci_off + nr; do { old_tb = __swap_table_get(ci, ci_off); if (unlikely(swp_tb_is_folio(old_tb) || - !__swp_tb_get_count(old_tb))) + !__swp_tb_get_count(old_tb) || + (memcg_id && *memcg_id != lookup_swap_cgroup_id(targ_entry)))) return -EBUSY; + targ_entry.val++; } while (++ci_off < ci_end); return 0; @@ -400,6 +409,7 @@ static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci, swp_entry_t entry; struct folio *folio; void *shadow = NULL; + unsigned short memcg_id; unsigned long address, nr_pages = 1UL << order; struct vm_area_struct *vma = vmf ? 
vmf->vma : NULL; @@ -408,7 +418,7 @@ static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci, /* Check if the slot and range are available, skip allocation if not */ spin_lock(&ci->lock); - err = __swap_cache_add_check(ci, targ_entry, nr_pages, NULL); + err = __swap_cache_add_check(ci, targ_entry, nr_pages, NULL, NULL); spin_unlock(&ci->lock); if (unlikely(err)) return ERR_PTR(err); @@ -431,7 +441,7 @@ static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci, /* Double check the range is still not in conflict */ spin_lock(&ci->lock); - err = __swap_cache_add_check(ci, targ_entry, nr_pages, &shadow); + err = __swap_cache_add_check(ci, targ_entry, nr_pages, &shadow, &memcg_id); if (unlikely(err)) { spin_unlock(&ci->lock); folio_put(folio); @@ -443,8 +453,8 @@ static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci, __swap_cache_do_add_folio(ci, folio, entry); spin_unlock(&ci->lock); - if (mem_cgroup_swapin_charge_folio(folio, vmf ? vmf->vma->vm_mm : NULL, - gfp, entry)) { + if (mem_cgroup_swapin_charge_folio(folio, memcg_id, + vmf ? vmf->vma->vm_mm : NULL, gfp)) { spin_lock(&ci->lock); __swap_cache_do_del_folio(ci, folio, entry, shadow); spin_unlock(&ci->lock); From cdd77f84d96675c9e8c776073df8d58d2af10607 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 17 May 2026 23:39:48 +0800 Subject: [PATCH 173/321] mm, swap: consolidate cluster allocation helpers Swap cluster table management is spread across several narrow helpers. As a result, the allocation and fallback sequences are open-coded in multiple places. A few more per-cluster tables will be added soon, so avoid duplicating these sequences per table type. Fold the existing pairs into cluster-oriented helpers, and rename for consistency. No functional change, only a few sanity checks are slightly adjusted. 
Link: https://lore.kernel.org/20260517-swap-table-p4-v5-9-88ae43e064c7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chengming Zhou Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Shakeel Butt Cc: Youngjun Park Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/swapfile.c | 110 ++++++++++++++++++++++---------------------------- 1 file changed, 49 insertions(+), 61 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 60d8f0df3f32..2ddabc0f3a88 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -411,20 +411,7 @@ static inline unsigned int cluster_offset(struct swap_info_struct *si, return cluster_index(si, ci) * SWAPFILE_CLUSTER; } -static struct swap_table *swap_table_alloc(gfp_t gfp) -{ - struct folio *folio; - - if (!SWP_TABLE_USE_PAGE) - return kmem_cache_zalloc(swap_table_cachep, gfp); - - folio = folio_alloc(gfp | __GFP_ZERO, 0); - if (folio) - return folio_address(folio); - return NULL; -} - -static void swap_table_free_folio_rcu_cb(struct rcu_head *head) +static void swap_cluster_free_table_folio_rcu_cb(struct rcu_head *head) { struct folio *folio; @@ -432,15 +419,46 @@ static void swap_table_free_folio_rcu_cb(struct rcu_head *head) folio_put(folio); } -static void swap_table_free(struct swap_table *table) +static void swap_cluster_free_table(struct swap_cluster_info *ci) { + struct swap_table *table; + + table = (struct swap_table *)rcu_dereference_protected(ci->table, true); + if (!table) + return; + + rcu_assign_pointer(ci->table, NULL); if (!SWP_TABLE_USE_PAGE) { kmem_cache_free(swap_table_cachep, table); return; } call_rcu(&(folio_page(virt_to_folio(table), 0)->rcu_head), - swap_table_free_folio_rcu_cb); + swap_cluster_free_table_folio_rcu_cb); +} + +static int swap_cluster_alloc_table(struct swap_cluster_info *ci, gfp_t gfp) +{ + struct swap_table *table = NULL; + struct folio *folio; + + /* 
The cluster must be empty and not on any list during allocation. */ + VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci)); + if (rcu_access_pointer(ci->table)) + return 0; + + if (SWP_TABLE_USE_PAGE) { + folio = folio_alloc(gfp | __GFP_ZERO, 0); + if (folio) + table = folio_address(folio); + } else { + table = kmem_cache_zalloc(swap_table_cachep, gfp); + } + if (!table) + return -ENOMEM; + + rcu_assign_pointer(ci->table, table); + return 0; } /* @@ -471,27 +489,15 @@ static void swap_cluster_assert_empty(struct swap_cluster_info *ci, WARN_ON_ONCE(nr == SWAPFILE_CLUSTER && ci->extend_table); } -static void swap_cluster_free_table(struct swap_cluster_info *ci) -{ - struct swap_table *table; - - /* Only empty cluster's table is allow to be freed */ - lockdep_assert_held(&ci->lock); - table = (void *)rcu_dereference_protected(ci->table, true); - rcu_assign_pointer(ci->table, NULL); - - swap_table_free(table); -} - /* * Allocate swap table for one cluster. Attempt an atomic allocation first, * then fallback to sleeping allocation. */ static struct swap_cluster_info * -swap_cluster_alloc_table(struct swap_info_struct *si, +swap_cluster_populate(struct swap_info_struct *si, struct swap_cluster_info *ci) { - struct swap_table *table; + int ret; /* * Only cluster isolation from the allocator does table allocation. @@ -502,14 +508,9 @@ swap_cluster_alloc_table(struct swap_info_struct *si, lockdep_assert_held(&si->global_cluster_lock); lockdep_assert_held(&ci->lock); - /* The cluster must be free and was just isolated from the free list. */ - VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci)); - - table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); - if (table) { - rcu_assign_pointer(ci->table, table); + if (!swap_cluster_alloc_table(ci, __GFP_HIGH | __GFP_NOMEMALLOC | + __GFP_NOWARN)) return ci; - } /* * Try a sleep allocation. 
Each isolated free cluster may cause @@ -521,7 +522,8 @@ swap_cluster_alloc_table(struct swap_info_struct *si, spin_unlock(&si->global_cluster_lock); local_unlock(&percpu_swap_cluster.lock); - table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL); + ret = swap_cluster_alloc_table(ci, __GFP_HIGH | __GFP_NOMEMALLOC | + GFP_KERNEL); /* * Back to atomic context. We might have migrated to a new CPU with a @@ -536,20 +538,11 @@ swap_cluster_alloc_table(struct swap_info_struct *si, spin_lock(&si->global_cluster_lock); spin_lock(&ci->lock); - /* Nothing except this helper should touch a dangling empty cluster. */ - if (WARN_ON_ONCE(cluster_table_is_alloced(ci))) { - if (table) - swap_table_free(table); - return ci; - } - - if (!table) { + if (ret) { move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE); spin_unlock(&ci->lock); return NULL; } - - rcu_assign_pointer(ci->table, table); return ci; } @@ -621,12 +614,11 @@ static struct swap_cluster_info *isolate_lock_cluster( } spin_unlock(&si->lock); - if (found && !cluster_table_is_alloced(found)) { - /* Only an empty free cluster's swap table can be freed. */ - VM_WARN_ON_ONCE(flags != CLUSTER_FLAG_FREE); + /* Cluster's table is freed when and only when it's on the free list. 
*/ + if (found && flags == CLUSTER_FLAG_FREE) { VM_WARN_ON_ONCE(list != &si->free_clusters); - VM_WARN_ON_ONCE(!cluster_is_empty(found)); - return swap_cluster_alloc_table(si, found); + VM_WARN_ON_ONCE(cluster_table_is_alloced(found)); + return swap_cluster_populate(si, found); } return found; @@ -769,7 +761,6 @@ static int swap_cluster_setup_bad_slot(struct swap_info_struct *si, unsigned int ci_off = offset % SWAPFILE_CLUSTER; unsigned long idx = offset / SWAPFILE_CLUSTER; struct swap_cluster_info *ci; - struct swap_table *table; int ret = 0; /* si->max may got shrunk by swap swap_activate() */ @@ -790,12 +781,9 @@ static int swap_cluster_setup_bad_slot(struct swap_info_struct *si, } ci = cluster_info + idx; - if (!ci->table) { - table = swap_table_alloc(GFP_KERNEL); - if (!table) - return -ENOMEM; - rcu_assign_pointer(ci->table, table); - } + /* Need to allocate swap table first for initial bad slot marking. */ + if (!ci->count && swap_cluster_alloc_table(ci, GFP_KERNEL)) + return -ENOMEM; spin_lock(&ci->lock); /* Check for duplicated bad swap slots. */ if (__swap_table_xchg(ci, ci_off, SWP_TB_BAD) != SWP_TB_NULL) { @@ -2920,7 +2908,7 @@ static void free_swap_cluster_info(struct swap_cluster_info *cluster_info, ci = cluster_info + i; /* Cluster with bad marks count will have a remaining table */ spin_lock(&ci->lock); - if (rcu_dereference_protected(ci->table, true)) { + if (cluster_table_is_alloced(ci)) { swap_cluster_assert_empty(ci, 0, SWAPFILE_CLUSTER, true); swap_cluster_free_table(ci); } From b197d41462c2076bc88c79fead7f400e48881c19 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 17 May 2026 23:39:49 +0800 Subject: [PATCH 174/321] mm/memcg, swap: store cgroup id in cluster table directly Drop the usage of the swap_cgroup_ctrl, and use the dynamic cluster table instead. The per-cluster memcg table is 1024 / 512 bytes on most archs, and does not need RCU protection: the cgroup data is only read and written under the cluster lock. 
That keeps things simple, lets the allocation use plain kmalloc with immediate kfree (no deferred free), and keeps fragmentation acceptable. [akpm@linux-foundation.org: memcgv1: don't compile swap functions when CONFIG_SWAP=n] Link: https://lore.kernel.org/202605281711.bSeZlErK-lkp@intel.com [akpm@linux-foundation.org: fix CONFIG_SWAP=n build] Link: https://lore.kernel.org/20260517-swap-table-p4-v5-10-88ae43e064c7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chengming Zhou Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Shakeel Butt Cc: Youngjun Park Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 19 +++++++---- include/linux/swap.h | 8 ++--- mm/memcontrol-v1.c | 44 +++++++++++++++++--------- mm/memcontrol.c | 13 +++++--- mm/swap.h | 4 +++ mm/swap_state.c | 6 ++-- mm/swap_table.h | 64 ++++++++++++++++++++++++++++++++++++++ mm/swapfile.c | 37 +++++++++++++++------- mm/vmscan.c | 2 +- 9 files changed, 150 insertions(+), 47 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a013f37f24aa..8f2662db166b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -29,6 +29,7 @@ struct obj_cgroup; struct page; struct mm_struct; struct kmem_cache; +struct swap_cluster_info; /* Cgroup-specific page state, on top of universal node page state */ enum memcg_stat_item { @@ -1899,9 +1900,6 @@ static inline void mem_cgroup_exit_user_fault(void) current->in_user_fault = 0; } -void __memcg1_swapout(struct folio *folio); -void memcg1_swapin(struct folio *folio); - #else /* CONFIG_MEMCG_V1 */ static inline unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, @@ -1929,14 +1927,23 @@ static inline void mem_cgroup_exit_user_fault(void) { } -static inline void __memcg1_swapout(struct folio *folio) +#endif /* CONFIG_MEMCG_V1 */ + +#if 
defined(CONFIG_MEMCG_V1) && defined(CONFIG_SWAP) + +void __memcg1_swapout(struct folio *folio, struct swap_cluster_info *ci); +void memcg1_swapin(struct folio *folio); + +#else + +static inline void __memcg1_swapout(struct folio *folio, + struct swap_cluster_info *ci) { } static inline void memcg1_swapin(struct folio *folio) { } - -#endif /* CONFIG_MEMCG_V1 */ +#endif #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/swap.h b/include/linux/swap.h index f907d3df52d0..200e7c345f26 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -579,12 +579,12 @@ static inline int mem_cgroup_try_charge_swap(struct folio *folio) return __mem_cgroup_try_charge_swap(folio); } -extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); -static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) +extern void __mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages); +static inline void mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages) { if (mem_cgroup_disabled()) return; - __mem_cgroup_uncharge_swap(entry, nr_pages); + __mem_cgroup_uncharge_swap(id, nr_pages); } extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); @@ -595,7 +595,7 @@ static inline int mem_cgroup_try_charge_swap(struct folio *folio) return 0; } -static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, +static inline void mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages) { } diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 36c507d81dc5..517b21236672 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -14,6 +14,7 @@ #include "internal.h" #include "swap.h" +#include "swap_table.h" #include "memcontrol-v1.h" /* @@ -603,17 +604,19 @@ void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg) local_irq_restore(flags); } +#ifdef CONFIG_SWAP /** * __memcg1_swapout - transfer a memsw charge to swap * @folio: folio whose memsw charge to transfer + * @ci: the 
locked swap cluster holding the swap entries * * Transfer the memsw charge of @folio to the swap entry stored in * folio->swap. * - * Context: folio must be isolated, unmapped, locked and is just about - * to be freed, and caller must disable IRQs. + * Context: folio must be isolated, unmapped, locked and is just about to + * be freed, and caller must disable IRQs and hold the swap cluster lock. */ -void __memcg1_swapout(struct folio *folio) +void __memcg1_swapout(struct folio *folio, struct swap_cluster_info *ci) { struct mem_cgroup *memcg, *swap_memcg; struct obj_cgroup *objcg; @@ -646,7 +649,8 @@ void __memcg1_swapout(struct folio *folio) swap_memcg = mem_cgroup_private_id_get_online(memcg, nr_entries); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); - swap_cgroup_record(folio, mem_cgroup_private_id(swap_memcg), folio->swap); + __swap_cgroup_set(ci, swp_cluster_offset(folio->swap), nr_entries, + mem_cgroup_private_id(swap_memcg)); folio_unqueue_deferred_split(folio); folio->memcg_data = 0; @@ -661,8 +665,7 @@ void __memcg1_swapout(struct folio *folio) } /* - * Interrupts should be disabled here because the caller holds the - * i_pages lock which is taken with interrupts-off. It is + * The caller must hold the swap cluster lock with IRQ off. It is * important here to have the interrupts disabled because it is the * only synchronisation we have for updating the per-CPU variables. 
*/ @@ -677,7 +680,7 @@ void __memcg1_swapout(struct folio *folio) } /** - * memcg1_swapin - uncharge swap slot + * memcg1_swapin - uncharge swap slot on swapin * @folio: folio being swapped in * * Call this function after successfully adding the charged @@ -687,6 +690,10 @@ void __memcg1_swapout(struct folio *folio) */ void memcg1_swapin(struct folio *folio) { + struct swap_cluster_info *ci; + unsigned long nr_pages; + unsigned short id; + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); @@ -702,15 +709,22 @@ void memcg1_swapin(struct folio *folio) * correspond 1:1 to page and swap slot lifetimes: we charge the * page to memory here, and uncharge swap when the slot is freed. */ - if (do_memsw_account()) { - /* - * The swap entry might not get freed for a long time, - * let's not wait for it. The page already received a - * memory+swap charge, drop the swap entry duplicate. - */ - mem_cgroup_uncharge_swap(folio->swap, folio_nr_pages(folio)); - } + if (!do_memsw_account()) + return; + + /* + * The swap entry might not get freed for a long time, + * let's not wait for it. The page already received a + * memory+swap charge, drop the swap entry duplicate. 
+ */ + nr_pages = folio_nr_pages(folio); + ci = swap_cluster_get_and_lock(folio); + id = __swap_cgroup_clear(ci, swp_cluster_offset(folio->swap), + nr_pages); + swap_cluster_unlock(ci); + mem_cgroup_uncharge_swap(id, nr_pages); } +#endif void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, unsigned long nr_memory, int nid) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1b58b314cb18..beecfc6f376d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -64,6 +64,7 @@ #include #include #include "internal.h" +#include "swap_table.h" #include #include #include "slab.h" @@ -5479,6 +5480,7 @@ int __init mem_cgroup_init(void) int __mem_cgroup_try_charge_swap(struct folio *folio) { unsigned int nr_pages = folio_nr_pages(folio); + struct swap_cluster_info *ci; struct page_counter *counter; struct mem_cgroup *memcg; struct obj_cgroup *objcg; @@ -5512,22 +5514,23 @@ int __mem_cgroup_try_charge_swap(struct folio *folio) } mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); - swap_cgroup_record(folio, mem_cgroup_private_id(memcg), folio->swap); + ci = swap_cluster_get_and_lock(folio); + __swap_cgroup_set(ci, swp_cluster_offset(folio->swap), nr_pages, + mem_cgroup_private_id(memcg)); + swap_cluster_unlock(ci); return 0; } /** * __mem_cgroup_uncharge_swap - uncharge swap space - * @entry: swap entry to uncharge + * @id: cgroup id to uncharge * @nr_pages: the amount of swap space to uncharge */ -void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) +void __mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages) { struct mem_cgroup *memcg; - unsigned short id; - id = swap_cgroup_clear(entry, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_private_id(id); if (memcg) { diff --git a/mm/swap.h b/mm/swap.h index 8e57e9431624..5b2f095fff6e 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -5,6 +5,7 @@ #include /* for atomic_long_t */ struct mempolicy; struct swap_iocb; +struct swap_memcg_table; extern int page_cluster; @@ -38,6 +39,9 @@ 
struct swap_cluster_info { u8 order; atomic_long_t __rcu *table; /* Swap table entries, see mm/swap_table.h */ unsigned int *extend_table; /* For large swap count, protected by ci->lock */ +#ifdef CONFIG_MEMCG + struct swap_memcg_table *memcg_table; /* Swap table entries' cgroup record */ +#endif struct list_head list; }; diff --git a/mm/swap_state.c b/mm/swap_state.c index bdd949ae0044..873cb3f26337 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -179,21 +179,19 @@ static int __swap_cache_add_check(struct swap_cluster_info *ci, if (shadowp && swp_tb_is_shadow(old_tb)) *shadowp = swp_tb_to_shadow(old_tb); if (memcg_id) - *memcg_id = lookup_swap_cgroup_id(targ_entry); + *memcg_id = __swap_cgroup_get(ci, ci_off); if (nr == 1) return 0; - targ_entry.val = round_down(targ_entry.val, nr); ci_off = round_down(ci_off, nr); ci_end = ci_off + nr; do { old_tb = __swap_table_get(ci, ci_off); if (unlikely(swp_tb_is_folio(old_tb) || !__swp_tb_get_count(old_tb) || - (memcg_id && *memcg_id != lookup_swap_cgroup_id(targ_entry)))) + (memcg_id && *memcg_id != __swap_cgroup_get(ci, ci_off)))) return -EBUSY; - targ_entry.val++; } while (++ci_off < ci_end); return 0; diff --git a/mm/swap_table.h b/mm/swap_table.h index 8415ffbe2b9c..b4e1100f8296 100644 --- a/mm/swap_table.h +++ b/mm/swap_table.h @@ -11,6 +11,11 @@ struct swap_table { atomic_long_t entries[SWAPFILE_CLUSTER]; }; +/* For storing memcg private id */ +struct swap_memcg_table { + unsigned short id[SWAPFILE_CLUSTER]; +}; + #define SWP_TABLE_USE_PAGE (sizeof(struct swap_table) == PAGE_SIZE) /* @@ -247,4 +252,63 @@ static inline unsigned long swap_table_get(struct swap_cluster_info *ci, return swp_tb; } + +#ifdef CONFIG_MEMCG +static inline void __swap_cgroup_set(struct swap_cluster_info *ci, + unsigned int ci_off, unsigned long nr, unsigned short id) +{ + lockdep_assert_held(&ci->lock); + VM_WARN_ON_ONCE(ci_off >= SWAPFILE_CLUSTER); + if (WARN_ON_ONCE(!ci->memcg_table)) + return; + do { + ci->memcg_table->id[ci_off++] = 
id; + } while (--nr); +} + +static inline unsigned short __swap_cgroup_get(struct swap_cluster_info *ci, + unsigned int ci_off) +{ + lockdep_assert_held(&ci->lock); + VM_WARN_ON_ONCE(ci_off >= SWAPFILE_CLUSTER); + if (unlikely(!ci->memcg_table)) + return 0; + return ci->memcg_table->id[ci_off]; +} + +static inline unsigned short __swap_cgroup_clear(struct swap_cluster_info *ci, + unsigned int ci_off, + unsigned long nr) +{ + unsigned short old = __swap_cgroup_get(ci, ci_off); + + if (!old) + return 0; + do { + VM_WARN_ON_ONCE(ci->memcg_table->id[ci_off] != old); + ci->memcg_table->id[ci_off++] = 0; + } while (--nr); + + return old; +} +#else +static inline void __swap_cgroup_set(struct swap_cluster_info *ci, + unsigned int ci_off, unsigned long nr, unsigned short id) +{ +} + +static inline unsigned short __swap_cgroup_get(struct swap_cluster_info *ci, + unsigned int ci_off) +{ + return 0; +} + +static inline unsigned short __swap_cgroup_clear(struct swap_cluster_info *ci, + unsigned int ci_off, + unsigned long nr) +{ + return 0; +} +#endif + #endif diff --git a/mm/swapfile.c b/mm/swapfile.c index 2ddabc0f3a88..bd141eb9ef10 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -423,7 +423,12 @@ static void swap_cluster_free_table(struct swap_cluster_info *ci) { struct swap_table *table; - table = (struct swap_table *)rcu_dereference_protected(ci->table, true); +#ifdef CONFIG_MEMCG + kfree(ci->memcg_table); + ci->memcg_table = NULL; +#endif + + table = (struct swap_table *)rcu_access_pointer(ci->table); if (!table) return; @@ -441,6 +446,7 @@ static int swap_cluster_alloc_table(struct swap_cluster_info *ci, gfp_t gfp) { struct swap_table *table = NULL; struct folio *folio; + int ret = 0; /* The cluster must be empty and not on any list during allocation. 
*/ VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci)); @@ -458,7 +464,19 @@ static int swap_cluster_alloc_table(struct swap_cluster_info *ci, gfp_t gfp) return -ENOMEM; rcu_assign_pointer(ci->table, table); - return 0; + +#ifdef CONFIG_MEMCG + if (!mem_cgroup_disabled()) { + VM_WARN_ON_ONCE(ci->memcg_table); + ci->memcg_table = kzalloc_obj(*ci->memcg_table, gfp); + if (!ci->memcg_table) + ret = -ENOMEM; + } +#endif + if (ret) + swap_cluster_free_table(ci); + + return ret; } /* @@ -483,6 +501,7 @@ static void swap_cluster_assert_empty(struct swap_cluster_info *ci, bad_slots++; else WARN_ON_ONCE(!swp_tb_is_null(swp_tb)); + WARN_ON_ONCE(__swap_cgroup_get(ci, ci_off)); } while (++ci_off < ci_end); WARN_ON_ONCE(bad_slots != (swapoff ? ci->count : 0)); @@ -1887,12 +1906,10 @@ void __swap_cluster_free_entries(struct swap_info_struct *si, unsigned int ci_start, unsigned int nr_pages) { unsigned long old_tb; - unsigned int type = si->type; unsigned short batch_id = 0, id_cur; unsigned int ci_off = ci_start, ci_end = ci_start + nr_pages; unsigned long ci_head = cluster_offset(si, ci); unsigned int batch_off = ci_off; - swp_entry_t entry; VM_WARN_ON(ci->count < nr_pages); @@ -1910,21 +1927,17 @@ void __swap_cluster_free_entries(struct swap_info_struct *si, * Uncharge swap slots by memcg in batches. Consecutive * slots with the same cgroup id are uncharged together. 
*/ - entry = swp_entry(type, ci_head + ci_off); - id_cur = lookup_swap_cgroup_id(entry); + id_cur = __swap_cgroup_clear(ci, ci_off, 1); if (batch_id != id_cur) { if (batch_id) - mem_cgroup_uncharge_swap(swp_entry(type, ci_head + batch_off), - ci_off - batch_off); + mem_cgroup_uncharge_swap(batch_id, ci_off - batch_off); batch_id = id_cur; batch_off = ci_off; } } while (++ci_off < ci_end); - if (batch_id) { - mem_cgroup_uncharge_swap(swp_entry(type, ci_head + batch_off), - ci_off - batch_off); - } + if (batch_id) + mem_cgroup_uncharge_swap(batch_id, ci_off - batch_off); swap_range_free(si, ci_head + ci_start, nr_pages); swap_cluster_assert_empty(ci, ci_start, nr_pages, false); diff --git a/mm/vmscan.c b/mm/vmscan.c index 3231af682fa7..3c856a78c0a5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -739,7 +739,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); - __memcg1_swapout(folio); + __memcg1_swapout(folio, ci); __swap_cache_del_folio(ci, folio, swap, shadow); swap_cluster_unlock_irq(ci); } else { From 4e8e1c498de9f97628207d1ef84506058b06bb51 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 17 May 2026 23:39:50 +0800 Subject: [PATCH 175/321] mm/memcg: remove no longer used swap cgroup array Now all swap cgroup records are stored in the swap cluster directly, the static array is no longer needed. 
Link: https://lore.kernel.org/20260517-swap-table-p4-v5-11-88ae43e064c7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chengming Zhou Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Shakeel Butt Cc: Youngjun Park Cc: Zi Yan Signed-off-by: Andrew Morton --- MAINTAINERS | 1 - include/linux/swap_cgroup.h | 47 ---------- mm/Makefile | 3 - mm/internal.h | 1 - mm/memcontrol-v1.c | 1 - mm/memcontrol.c | 1 - mm/swap_cgroup.c | 174 ------------------------------------ mm/swapfile.c | 8 -- 8 files changed, 236 deletions(-) delete mode 100644 include/linux/swap_cgroup.h delete mode 100644 mm/swap_cgroup.c diff --git a/MAINTAINERS b/MAINTAINERS index 461a3eed6129..782ed63e4e67 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6573,7 +6573,6 @@ F: mm/memcontrol.c F: mm/memcontrol-v1.c F: mm/memcontrol-v1.h F: mm/page_counter.c -F: mm/swap_cgroup.c F: samples/cgroup/* F: tools/testing/selftests/cgroup/memcg_protection.m F: tools/testing/selftests/cgroup/test_hugetlb_memcg.c diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h deleted file mode 100644 index 91cdf12190a0..000000000000 --- a/include/linux/swap_cgroup.h +++ /dev/null @@ -1,47 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __LINUX_SWAP_CGROUP_H -#define __LINUX_SWAP_CGROUP_H - -#include - -#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) - -extern void swap_cgroup_record(struct folio *folio, unsigned short id, swp_entry_t ent); -extern unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents); -extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent); -extern int swap_cgroup_swapon(int type, unsigned long max_pages); -extern void swap_cgroup_swapoff(int type); - -#else - -static inline -void swap_cgroup_record(struct folio *folio, unsigned short id, swp_entry_t ent) -{ -} - -static inline -unsigned short 
swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents) -{ - return 0; -} - -static inline -unsigned short lookup_swap_cgroup_id(swp_entry_t ent) -{ - return 0; -} - -static inline int -swap_cgroup_swapon(int type, unsigned long max_pages) -{ - return 0; -} - -static inline void swap_cgroup_swapoff(int type) -{ - return; -} - -#endif - -#endif /* __LINUX_SWAP_CGROUP_H */ diff --git a/mm/Makefile b/mm/Makefile index 8ad2ab08244e..eff9f9e7e061 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -103,9 +103,6 @@ obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_LIVEUPDATE_MEMFD) += memfd_luo.o obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o -ifdef CONFIG_SWAP -obj-$(CONFIG_MEMCG) += swap_cgroup.o -endif ifdef CONFIG_BPF_SYSCALL obj-$(CONFIG_MEMCG) += bpf_memcontrol.o endif diff --git a/mm/internal.h b/mm/internal.h index 9dbd8e3c991f..5602393054f3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -17,7 +17,6 @@ #include #include #include -#include #include /* Internal core VMA manipulation functions. 
*/ diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 517b21236672..765069211567 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/memcontrol.c b/mm/memcontrol.c index beecfc6f376d..92269740eef1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -54,7 +54,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c deleted file mode 100644 index 95c38e54dd58..000000000000 --- a/mm/swap_cgroup.c +++ /dev/null @@ -1,174 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include - -#include /* depends on mm.h include */ - -static DEFINE_MUTEX(swap_cgroup_mutex); - -/* Pack two cgroup id (short) of two entries in one swap_cgroup (atomic_t) */ -#define ID_PER_SC (sizeof(struct swap_cgroup) / sizeof(unsigned short)) -#define ID_SHIFT (BITS_PER_TYPE(unsigned short)) -#define ID_MASK (BIT(ID_SHIFT) - 1) -struct swap_cgroup { - atomic_t ids; -}; - -struct swap_cgroup_ctrl { - struct swap_cgroup *map; -}; - -static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; - -static unsigned short __swap_cgroup_id_lookup(struct swap_cgroup *map, - pgoff_t offset) -{ - unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT; - unsigned int old_ids = atomic_read(&map[offset / ID_PER_SC].ids); - - BUILD_BUG_ON(!is_power_of_2(ID_PER_SC)); - BUILD_BUG_ON(sizeof(struct swap_cgroup) != sizeof(atomic_t)); - - return (old_ids >> shift) & ID_MASK; -} - -static unsigned short __swap_cgroup_id_xchg(struct swap_cgroup *map, - pgoff_t offset, - unsigned short new_id) -{ - unsigned short old_id; - struct swap_cgroup *sc = &map[offset / ID_PER_SC]; - unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT; - unsigned int new_ids, old_ids = atomic_read(&sc->ids); - - do { - old_id = (old_ids >> shift) & ID_MASK; - new_ids = (old_ids & ~(ID_MASK << shift)); - new_ids |= ((unsigned int)new_id) << shift; - } while 
(!atomic_try_cmpxchg(&sc->ids, &old_ids, new_ids)); - - return old_id; -} - -/** - * swap_cgroup_record - record mem_cgroup for a set of swap entries. - * These entries must belong to one single folio, and that folio - * must be being charged for swap space (swap out), and these - * entries must not have been charged - * - * @folio: the folio that the swap entry belongs to - * @id: mem_cgroup ID to be recorded - * @ent: the first swap entry to be recorded - */ -void swap_cgroup_record(struct folio *folio, unsigned short id, - swp_entry_t ent) -{ - unsigned int nr_ents = folio_nr_pages(folio); - struct swap_cgroup *map; - pgoff_t offset, end; - unsigned short old; - - offset = swp_offset(ent); - end = offset + nr_ents; - map = swap_cgroup_ctrl[swp_type(ent)].map; - - do { - old = __swap_cgroup_id_xchg(map, offset, id); - VM_BUG_ON(old); - } while (++offset != end); -} - -/** - * swap_cgroup_clear - clear mem_cgroup for a set of swap entries. - * These entries must be being uncharged from swap. They either - * belongs to one single folio in the swap cache (swap in for - * cgroup v1), or no longer have any users (slot freeing). - * - * @ent: the first swap entry to be recorded into - * @nr_ents: number of swap entries to be recorded - * - * Returns the existing old value. - */ -unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents) -{ - pgoff_t offset, end; - struct swap_cgroup *map; - unsigned short old, iter = 0; - - offset = swp_offset(ent); - end = offset + nr_ents; - map = swap_cgroup_ctrl[swp_type(ent)].map; - - do { - old = __swap_cgroup_id_xchg(map, offset, 0); - if (!iter) - iter = old; - VM_BUG_ON(iter != old); - } while (++offset != end); - - return old; -} - -/** - * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry - * @ent: swap entry to be looked up. - * - * Returns ID of mem_cgroup at success. 0 at failure. 
(0 is invalid ID) - */ -unsigned short lookup_swap_cgroup_id(swp_entry_t ent) -{ - struct swap_cgroup_ctrl *ctrl; - - if (mem_cgroup_disabled()) - return 0; - - ctrl = &swap_cgroup_ctrl[swp_type(ent)]; - if (unlikely(!ctrl->map)) - return 0; - return __swap_cgroup_id_lookup(ctrl->map, swp_offset(ent)); -} - -int swap_cgroup_swapon(int type, unsigned long max_pages) -{ - struct swap_cgroup *map; - struct swap_cgroup_ctrl *ctrl; - - if (mem_cgroup_disabled()) - return 0; - - BUILD_BUG_ON(sizeof(unsigned short) * ID_PER_SC != - sizeof(struct swap_cgroup)); - map = vzalloc(DIV_ROUND_UP(max_pages, ID_PER_SC) * - sizeof(struct swap_cgroup)); - if (!map) - goto nomem; - - ctrl = &swap_cgroup_ctrl[type]; - mutex_lock(&swap_cgroup_mutex); - ctrl->map = map; - mutex_unlock(&swap_cgroup_mutex); - - return 0; -nomem: - pr_info("couldn't allocate enough memory for swap_cgroup\n"); - pr_info("swap_cgroup can be disabled by swapaccount=0 boot option\n"); - return -ENOMEM; -} - -void swap_cgroup_swapoff(int type) -{ - struct swap_cgroup *map; - struct swap_cgroup_ctrl *ctrl; - - if (mem_cgroup_disabled()) - return; - - mutex_lock(&swap_cgroup_mutex); - ctrl = &swap_cgroup_ctrl[type]; - map = ctrl->map; - ctrl->map = NULL; - mutex_unlock(&swap_cgroup_mutex); - - vfree(map); -} diff --git a/mm/swapfile.c b/mm/swapfile.c index bd141eb9ef10..992e77b7105d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -45,7 +45,6 @@ #include #include -#include #include "swap_table.h" #include "internal.h" #include "swap.h" @@ -3058,8 +3057,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->global_cluster = NULL; kvfree(zeromap); free_swap_cluster_info(cluster_info, maxpages); - /* Destroy swap account information */ - swap_cgroup_swapoff(p->type); inode = mapping->host; @@ -3590,10 +3587,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (error) goto bad_swap_unlock_inode; - error = swap_cgroup_swapon(si->type, maxpages); - if (error) - goto 
bad_swap_unlock_inode; - /* * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might * be above MAX_PAGE_ORDER incase of a large swap file. @@ -3704,7 +3697,6 @@ bad_swap: si->global_cluster = NULL; inode = NULL; destroy_swap_extents(si, swap_file); - swap_cgroup_swapoff(si->type); free_swap_cluster_info(si->cluster_info, si->max); si->cluster_info = NULL; kvfree(si->zeromap); From d9ceded101a142cd56f1e88fc7e893560ee59f4d Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 17 May 2026 23:39:51 +0800 Subject: [PATCH 176/321] mm, swap: merge zeromap into swap table By allocating one additional bit in the swap table entry's flags field alongside the count, we can store the zeromap inline. For 64-bit systems, the zeromap is stored in the swap table, avoiding zeromap allocation. It reduces the allocated memory. That is the happy path. For certain 32-bit archs, there might not be enough bits in the swap table to contain both PFN and flags. Therefore, conditionally let each cluster have a zeromap field at build time, and use that instead. If the swapfile cluster is not fully used, it will still save memory for zeromap. The empty cluster does not allocate a zeromap. In the worst case, all clusters are fully populated. We will use memory similar to the previous zeromap implementation. A few macros were moved to different headers for build-time struct definition.
[akpm@linux-foundation.org: swap_cluster_alloc_table(): remove unused local `ret] [akpm@linux-foundation.org: fix unused label `err_free'] Link: https://lore.kernel.org/20260517-swap-table-p4-v5-12-88ae43e064c7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Reviewed-by: Youngjun Park Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chengming Zhou Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: Lorenzo Stoakes Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Shakeel Butt Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 - mm/memory.c | 11 +---- mm/page_io.c | 61 +++++++++++++++++++---- mm/swap.h | 50 ++++++++----------- mm/swap_state.c | 14 +++--- mm/swap_table.h | 115 ++++++++++++++++++++++++++++++++----------- mm/swapfile.c | 57 ++++++++++----------- 7 files changed, 192 insertions(+), 117 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 200e7c345f26..8c43bc3055c9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -252,7 +252,6 @@ struct swap_info_struct { struct plist_node list; /* entry in swap_active_head */ signed char type; /* strange name for an index */ unsigned int max; /* size of this swap device */ - unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ struct list_head free_clusters; /* free clusters list */ struct list_head full_clusters; /* full clusters list */ diff --git a/mm/memory.c b/mm/memory.c index da891bcce59c..7c020995eafc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4611,13 +4611,11 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* - * Check if the PTEs within a range are contiguous swap entries - * and have consistent swapcache, zeromap. + * Check if the PTEs within a range are contiguous swap entries. 
*/ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) { unsigned long addr; - softleaf_t entry; int idx; pte_t pte; @@ -4627,18 +4625,13 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx))) return false; - entry = softleaf_from_pte(pte); - if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages) - return false; - /* * swap_read_folio() can't handle the case a large folio is hybridly * from different backends. And they are likely corner cases. Similar * things might be added once zswap support large folios. */ - if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages)) + if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages) return false; - return true; } diff --git a/mm/page_io.c b/mm/page_io.c index 7ed76592e20d..f2d8fe7fd057 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -26,6 +26,7 @@ #include #include #include "swap.h" +#include "swap_table.h" static void __end_swap_bio_write(struct bio *bio) { @@ -204,15 +205,20 @@ static bool is_folio_zero_filled(struct folio *folio) static void swap_zeromap_folio_set(struct folio *folio) { struct obj_cgroup *objcg = get_obj_cgroup_from_folio(folio); - struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); int nr_pages = folio_nr_pages(folio); + struct swap_cluster_info *ci; swp_entry_t entry; unsigned int i; + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + + ci = swap_cluster_get_and_lock(folio); for (i = 0; i < folio_nr_pages(folio); i++) { entry = page_swap_entry(folio_page(folio, i)); - set_bit(swp_offset(entry), sis->zeromap); + __swap_table_set_zero(ci, swp_cluster_offset(entry)); } + swap_cluster_unlock(ci); count_vm_events(SWPOUT_ZERO, nr_pages); if (objcg) { @@ -223,14 +229,19 @@ static void swap_zeromap_folio_set(struct folio *folio) static void swap_zeromap_folio_clear(struct folio *folio) { - struct 
swap_info_struct *sis = __swap_entry_to_info(folio->swap); + struct swap_cluster_info *ci; swp_entry_t entry; unsigned int i; + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + + ci = swap_cluster_get_and_lock(folio); for (i = 0; i < folio_nr_pages(folio); i++) { entry = page_swap_entry(folio_page(folio, i)); - clear_bit(swp_offset(entry), sis->zeromap); + __swap_table_clear_zero(ci, swp_cluster_offset(entry)); } + swap_cluster_unlock(ci); } /* @@ -255,10 +266,9 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug) } /* - * Use a bitmap (zeromap) to avoid doing IO for zero-filled pages. - * The bits in zeromap are protected by the locked swapcache folio - * and atomic updates are used to protect against read-modify-write - * corruption due to other zero swap entries seeing concurrent updates. + * Use the swap table zero mark to avoid doing IO for zero-filled + * pages. The zero mark is protected by the cluster lock, which is + * acquired internally by swap_zeromap_folio_set/clear. */ if (is_folio_zero_filled(folio)) { swap_zeromap_folio_set(folio); @@ -509,19 +519,52 @@ static void sio_read_complete(struct kiocb *iocb, long ret) mempool_free(sio, sio_pool); } +/* + * Return the count of contiguous swap entries that share the same + * zeromap status as the starting entry. If is_zerop is not NULL, + * it will return the zeromap status of the starting entry. + * + * Context: Caller must ensure the cluster containing the entries + * that will be checked won't be freed. 
+ */ +static int swap_zeromap_batch(swp_entry_t entry, int max_nr, + bool *is_zerop) +{ + int i; + bool is_zero; + unsigned int ci_start = swp_cluster_offset(entry); + struct swap_cluster_info *ci = __swap_entry_to_cluster(entry); + + VM_WARN_ON_ONCE(ci_start + max_nr > SWAPFILE_CLUSTER); + + rcu_read_lock(); + is_zero = __swap_table_test_zero(ci, ci_start); + for (i = 1; i < max_nr; i++) + if (is_zero != __swap_table_test_zero(ci, ci_start + i)) + break; + rcu_read_unlock(); + if (is_zerop) + *is_zerop = is_zero; + + return i; +} + static bool swap_read_folio_zeromap(struct folio *folio) { int nr_pages = folio_nr_pages(folio); struct obj_cgroup *objcg; bool is_zeromap; + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + /* * Swapping in a large folio that is partially in the zeromap is not * currently handled. Return true without marking the folio uptodate so * that an IO error is emitted (e.g. do_swap_page() will sigbus). + * Folio lock stabilizes the cluster and map, so the check is safe. 
*/ if (WARN_ON_ONCE(swap_zeromap_batch(folio->swap, nr_pages, - &is_zeromap) != nr_pages)) + &is_zeromap) != nr_pages)) return true; if (!is_zeromap) diff --git a/mm/swap.h b/mm/swap.h index 5b2f095fff6e..77d2d14eda42 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -3,12 +3,29 @@ #define _MM_SWAP_H #include /* for atomic_long_t */ +#include /* for PAGE_SHIFT */ struct mempolicy; struct swap_iocb; struct swap_memcg_table; extern int page_cluster; +#if defined(MAX_POSSIBLE_PHYSMEM_BITS) +#define SWAP_CACHE_PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT) +#elif defined(MAX_PHYSMEM_BITS) +#define SWAP_CACHE_PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) +#else +#define SWAP_CACHE_PFN_BITS (BITS_PER_LONG - PAGE_SHIFT) +#endif + +/* Swap table marker, 0x1 means shadow, 0x2 means PFN (SWP_TB_PFN_MARK) */ +#define SWAP_CACHE_PFN_MARK_BITS 2 +/* At least 2 bits are needed to distinguish SWP_TB_COUNT_MAX, 1 and 0 */ +#define SWAP_COUNT_MIN_BITS 2 +/* If there are enough bits besides PFN and marker, store zero flag inline */ +#define SWAP_TABLE_HAS_ZEROFLAG ((BITS_PER_LONG - SWAP_CACHE_PFN_MARK_BITS - \ + SWAP_CACHE_PFN_BITS) > SWAP_COUNT_MIN_BITS) + #ifdef CONFIG_THP_SWAP #define SWAPFILE_CLUSTER HPAGE_PMD_NR #define swap_entry_order(order) (order) @@ -41,6 +58,9 @@ struct swap_cluster_info { unsigned int *extend_table; /* For large swap count, protected by ci->lock */ #ifdef CONFIG_MEMCG struct swap_memcg_table *memcg_table; /* Swap table entries' cgroup record */ +#endif +#if !SWAP_TABLE_HAS_ZEROFLAG + unsigned long *zero_bitmap; #endif struct list_head list; }; @@ -314,31 +334,6 @@ static inline unsigned int folio_swap_flags(struct folio *folio) return __swap_entry_to_info(folio->swap)->flags; } -/* - * Return the count of contiguous swap entries that share the same - * zeromap status as the starting entry. If is_zeromap is not NULL, - * it will return the zeromap status of the starting entry. 
- */ -static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, - bool *is_zeromap) -{ - struct swap_info_struct *sis = __swap_entry_to_info(entry); - unsigned long start = swp_offset(entry); - unsigned long end = start + max_nr; - bool first_bit; - - first_bit = test_bit(start, sis->zeromap); - if (is_zeromap) - *is_zeromap = first_bit; - - if (max_nr <= 1) - return max_nr; - if (first_bit) - return find_next_zero_bit(sis->zeromap, end, start) - start; - else - return find_next_bit(sis->zeromap, end, start) - start; -} - #else /* CONFIG_SWAP */ struct swap_iocb; static inline struct swap_cluster_info *swap_cluster_lock( @@ -477,10 +472,5 @@ static inline unsigned int folio_swap_flags(struct folio *folio) return 0; } -static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, - bool *has_zeromap) -{ - return 0; -} #endif /* CONFIG_SWAP */ #endif /* _MM_SWAP_H */ diff --git a/mm/swap_state.c b/mm/swap_state.c index 873cb3f26337..04f5ce992401 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -160,6 +160,7 @@ static int __swap_cache_add_check(struct swap_cluster_info *ci, { unsigned int ci_off, ci_end; unsigned long old_tb; + bool is_zero; lockdep_assert_held(&ci->lock); @@ -184,12 +185,14 @@ static int __swap_cache_add_check(struct swap_cluster_info *ci, if (nr == 1) return 0; + is_zero = __swap_table_test_zero(ci, ci_off); ci_off = round_down(ci_off, nr); ci_end = ci_off + nr; do { old_tb = __swap_table_get(ci, ci_off); if (unlikely(swp_tb_is_folio(old_tb) || !__swp_tb_get_count(old_tb) || + is_zero != __swap_table_test_zero(ci, ci_off) || (memcg_id && *memcg_id != __swap_cgroup_get(ci, ci_off)))) return -EBUSY; } while (++ci_off < ci_end); @@ -213,7 +216,7 @@ static void __swap_cache_do_add_folio(struct swap_cluster_info *ci, do { old_tb = __swap_table_get(ci, ci_off); VM_WARN_ON_ONCE(swp_tb_is_folio(old_tb)); - __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_tb))); + __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, 
__swp_tb_get_flags(old_tb))); } while (++ci_off < ci_end); folio_ref_add(folio, nr_pages); @@ -249,7 +252,6 @@ static void __swap_cache_do_del_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry, void *shadow) { - int count; unsigned long old_tb; struct swap_info_struct *si; unsigned int ci_start, ci_off, ci_end; @@ -269,13 +271,13 @@ static void __swap_cache_do_del_folio(struct swap_cluster_info *ci, old_tb = __swap_table_get(ci, ci_off); WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != folio); - count = __swp_tb_get_count(old_tb); - if (count) + if (__swp_tb_get_count(old_tb)) folio_swapped = true; else need_free = true; /* If shadow is NULL, we set an empty shadow. */ - __swap_table_set(ci, ci_off, shadow_to_swp_tb(shadow, count)); + __swap_table_set(ci, ci_off, shadow_to_swp_tb(shadow, + __swp_tb_get_flags(old_tb))); } while (++ci_off < ci_end); folio->swap.val = 0; @@ -369,7 +371,7 @@ void __swap_cache_replace_folio(struct swap_cluster_info *ci, do { old_tb = __swap_table_get(ci, ci_off); WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old); - __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_tb))); + __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_flags(old_tb))); } while (++ci_off < ci_end); /* diff --git a/mm/swap_table.h b/mm/swap_table.h index b4e1100f8296..e6613e62f8d0 100644 --- a/mm/swap_table.h +++ b/mm/swap_table.h @@ -26,12 +26,14 @@ struct swap_memcg_table { * Swap table entry type and bits layouts: * * NULL: |---------------- 0 ---------------| - Free slot - * Shadow: | SWAP_COUNT |---- SHADOW_VAL ---|1| - Swapped out slot - * PFN: | SWAP_COUNT |------ PFN -------|10| - Cached slot + * Shadow: |SWAP_COUNT|Z|---- SHADOW_VAL ---|1| - Swapped out slot + * PFN: |SWAP_COUNT|Z|------ PFN -------|10| - Cached slot * Pointer: |----------- Pointer ----------|100| - (Unused) * Bad: |------------- 1 -------------|1000| - Bad slot * - * SWAP_COUNT is 
`SWP_TB_COUNT_BITS` long, each entry is an atomic long. + * COUNT is `SWP_TB_COUNT_BITS` long, Z is the `SWP_TB_ZERO_FLAG` bit, + * and together they form the `SWP_TB_FLAGS_BITS` wide flags field. + * Each entry is an atomic long. * * Usages: * @@ -54,14 +56,6 @@ struct swap_memcg_table { * - Bad: Swap slot is reserved, protects swap header or holes on swap devices. */ -#if defined(MAX_POSSIBLE_PHYSMEM_BITS) -#define SWAP_CACHE_PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT) -#elif defined(MAX_PHYSMEM_BITS) -#define SWAP_CACHE_PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) -#else -#define SWAP_CACHE_PFN_BITS (BITS_PER_LONG - PAGE_SHIFT) -#endif - /* NULL Entry, all 0 */ #define SWP_TB_NULL 0UL @@ -69,22 +63,26 @@ struct swap_memcg_table { #define SWP_TB_SHADOW_MARK 0b1UL /* Cached: PFN */ -#define SWP_TB_PFN_BITS (SWAP_CACHE_PFN_BITS + SWP_TB_PFN_MARK_BITS) +#define SWP_TB_PFN_BITS (SWAP_CACHE_PFN_BITS + SWAP_CACHE_PFN_MARK_BITS) #define SWP_TB_PFN_MARK 0b10UL -#define SWP_TB_PFN_MARK_BITS 2 -#define SWP_TB_PFN_MARK_MASK (BIT(SWP_TB_PFN_MARK_BITS) - 1) +#define SWP_TB_PFN_MARK_MASK (BIT(SWAP_CACHE_PFN_MARK_BITS) - 1) -/* SWAP_COUNT part for PFN or shadow, the width can be shrunk or extended */ -#define SWP_TB_COUNT_BITS min(4, BITS_PER_LONG - SWP_TB_PFN_BITS) +/* Flags: For PFN or shadow, contains SWAP_COUNT, width changes */ +#define SWP_TB_FLAGS_BITS min(5, BITS_PER_LONG - SWP_TB_PFN_BITS) +#define SWP_TB_COUNT_BITS (SWP_TB_FLAGS_BITS - SWAP_TABLE_HAS_ZEROFLAG) +#define SWP_TB_FLAGS_MASK (~((~0UL) >> SWP_TB_FLAGS_BITS)) #define SWP_TB_COUNT_MASK (~((~0UL) >> SWP_TB_COUNT_BITS)) +#define SWP_TB_FLAGS_SHIFT (BITS_PER_LONG - SWP_TB_FLAGS_BITS) #define SWP_TB_COUNT_SHIFT (BITS_PER_LONG - SWP_TB_COUNT_BITS) #define SWP_TB_COUNT_MAX ((1 << SWP_TB_COUNT_BITS) - 1) +/* The first flag is zero bit (SWAP_TABLE_HAS_ZEROFLAG) */ +#define SWP_TB_ZERO_FLAG BIT(BITS_PER_LONG - SWP_TB_FLAGS_BITS) /* Bad slot: ends with 0b1000 and rests of bits are all 1 */ #define SWP_TB_BAD ((~0UL) 
<< 3) /* Macro for shadow offset calculation */ -#define SWAP_COUNT_SHIFT SWP_TB_COUNT_BITS +#define SWAP_COUNT_SHIFT SWP_TB_FLAGS_BITS /* * Helpers for casting one type of info into a swap table entry. @@ -102,40 +100,47 @@ static inline unsigned long __count_to_swp_tb(unsigned char count) * used (count > 0 && count < SWP_TB_COUNT_MAX), and * overflow (count == SWP_TB_COUNT_MAX). */ - BUILD_BUG_ON(SWP_TB_COUNT_MAX < 2 || SWP_TB_COUNT_BITS < 2); + BUILD_BUG_ON(SWP_TB_COUNT_BITS < SWAP_COUNT_MIN_BITS); VM_WARN_ON(count > SWP_TB_COUNT_MAX); return ((unsigned long)count) << SWP_TB_COUNT_SHIFT; } -static inline unsigned long pfn_to_swp_tb(unsigned long pfn, unsigned int count) +static inline unsigned long __flags_to_swp_tb(unsigned char flags) +{ + BUILD_BUG_ON(SWP_TB_FLAGS_BITS > BITS_PER_BYTE); + VM_WARN_ON(flags >> SWP_TB_FLAGS_BITS); + return ((unsigned long)flags) << SWP_TB_FLAGS_SHIFT; +} + +static inline unsigned long pfn_to_swp_tb(unsigned long pfn, unsigned char flags) { unsigned long swp_tb; BUILD_BUG_ON(sizeof(unsigned long) != sizeof(void *)); BUILD_BUG_ON(SWAP_CACHE_PFN_BITS > - (BITS_PER_LONG - SWP_TB_PFN_MARK_BITS - SWP_TB_COUNT_BITS)); + (BITS_PER_LONG - SWAP_CACHE_PFN_MARK_BITS - SWP_TB_FLAGS_BITS)); - swp_tb = (pfn << SWP_TB_PFN_MARK_BITS) | SWP_TB_PFN_MARK; - VM_WARN_ON_ONCE(swp_tb & SWP_TB_COUNT_MASK); + swp_tb = (pfn << SWAP_CACHE_PFN_MARK_BITS) | SWP_TB_PFN_MARK; + VM_WARN_ON_ONCE(swp_tb & SWP_TB_FLAGS_MASK); - return swp_tb | __count_to_swp_tb(count); + return swp_tb | __flags_to_swp_tb(flags); } -static inline unsigned long folio_to_swp_tb(struct folio *folio, unsigned int count) +static inline unsigned long folio_to_swp_tb(struct folio *folio, unsigned char flags) { - return pfn_to_swp_tb(folio_pfn(folio), count); + return pfn_to_swp_tb(folio_pfn(folio), flags); } -static inline unsigned long shadow_to_swp_tb(void *shadow, unsigned int count) +static inline unsigned long shadow_to_swp_tb(void *shadow, unsigned char flags) { 
BUILD_BUG_ON((BITS_PER_XA_VALUE + 1) != BITS_PER_BYTE * sizeof(unsigned long)); BUILD_BUG_ON((unsigned long)xa_mk_value(0) != SWP_TB_SHADOW_MARK); VM_WARN_ON_ONCE(shadow && !xa_is_value(shadow)); - VM_WARN_ON_ONCE(shadow && ((unsigned long)shadow & SWP_TB_COUNT_MASK)); + VM_WARN_ON_ONCE(shadow && ((unsigned long)shadow & SWP_TB_FLAGS_MASK)); - return (unsigned long)shadow | __count_to_swp_tb(count) | SWP_TB_SHADOW_MARK; + return (unsigned long)shadow | SWP_TB_SHADOW_MARK | __flags_to_swp_tb(flags); } /* @@ -173,14 +178,14 @@ static inline bool swp_tb_is_countable(unsigned long swp_tb) static inline struct folio *swp_tb_to_folio(unsigned long swp_tb) { VM_WARN_ON(!swp_tb_is_folio(swp_tb)); - return pfn_folio((swp_tb & ~SWP_TB_COUNT_MASK) >> SWP_TB_PFN_MARK_BITS); + return pfn_folio((swp_tb & ~SWP_TB_FLAGS_MASK) >> SWAP_CACHE_PFN_MARK_BITS); } static inline void *swp_tb_to_shadow(unsigned long swp_tb) { VM_WARN_ON(!swp_tb_is_shadow(swp_tb)); /* No shift needed, xa_value is stored as it is in the lower bits. 
*/ - return (void *)(swp_tb & ~SWP_TB_COUNT_MASK); + return (void *)(swp_tb & ~SWP_TB_FLAGS_MASK); } static inline unsigned char __swp_tb_get_count(unsigned long swp_tb) @@ -189,6 +194,12 @@ static inline unsigned char __swp_tb_get_count(unsigned long swp_tb) return ((swp_tb & SWP_TB_COUNT_MASK) >> SWP_TB_COUNT_SHIFT); } +static inline unsigned char __swp_tb_get_flags(unsigned long swp_tb) +{ + VM_WARN_ON(!swp_tb_is_countable(swp_tb)); + return ((swp_tb & SWP_TB_FLAGS_MASK) >> SWP_TB_FLAGS_SHIFT); +} + static inline int swp_tb_get_count(unsigned long swp_tb) { if (swp_tb_is_countable(swp_tb)) @@ -253,6 +264,50 @@ static inline unsigned long swap_table_get(struct swap_cluster_info *ci, return swp_tb; } +static inline void __swap_table_set_zero(struct swap_cluster_info *ci, + unsigned int ci_off) +{ +#if SWAP_TABLE_HAS_ZEROFLAG + unsigned long swp_tb = __swap_table_get(ci, ci_off); + + BUILD_BUG_ON(SWP_TB_ZERO_FLAG & ~SWP_TB_FLAGS_MASK); + VM_WARN_ON(!swp_tb_is_countable(swp_tb)); + swp_tb |= SWP_TB_ZERO_FLAG; + __swap_table_set(ci, ci_off, swp_tb); +#else + lockdep_assert_held(&ci->lock); + __set_bit(ci_off, ci->zero_bitmap); +#endif +} + +static inline bool __swap_table_test_zero(struct swap_cluster_info *ci, + unsigned int ci_off) +{ +#if SWAP_TABLE_HAS_ZEROFLAG + unsigned long swp_tb = __swap_table_get(ci, ci_off); + + VM_WARN_ON(!swp_tb_is_countable(swp_tb)); + return !!(swp_tb & SWP_TB_ZERO_FLAG); +#else + return test_bit(ci_off, ci->zero_bitmap); +#endif +} + +static inline void __swap_table_clear_zero(struct swap_cluster_info *ci, + unsigned int ci_off) +{ +#if SWAP_TABLE_HAS_ZEROFLAG + unsigned long swp_tb = __swap_table_get(ci, ci_off); + + VM_WARN_ON(!swp_tb_is_countable(swp_tb)); + swp_tb &= ~SWP_TB_ZERO_FLAG; + __swap_table_set(ci, ci_off, swp_tb); +#else + lockdep_assert_held(&ci->lock); + __clear_bit(ci_off, ci->zero_bitmap); +#endif +} + #ifdef CONFIG_MEMCG static inline void __swap_cgroup_set(struct swap_cluster_info *ci, unsigned int ci_off, 
unsigned long nr, unsigned short id) diff --git a/mm/swapfile.c b/mm/swapfile.c index 992e77b7105d..615d90867111 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -427,6 +427,11 @@ static void swap_cluster_free_table(struct swap_cluster_info *ci) ci->memcg_table = NULL; #endif +#if !SWAP_TABLE_HAS_ZEROFLAG + kfree(ci->zero_bitmap); + ci->zero_bitmap = NULL; +#endif + table = (struct swap_table *)rcu_access_pointer(ci->table); if (!table) return; @@ -445,7 +450,6 @@ static int swap_cluster_alloc_table(struct swap_cluster_info *ci, gfp_t gfp) { struct swap_table *table = NULL; struct folio *folio; - int ret = 0; /* The cluster must be empty and not on any list during allocation. */ VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci)); @@ -468,14 +472,22 @@ static int swap_cluster_alloc_table(struct swap_cluster_info *ci, gfp_t gfp) if (!mem_cgroup_disabled()) { VM_WARN_ON_ONCE(ci->memcg_table); ci->memcg_table = kzalloc_obj(*ci->memcg_table, gfp); - if (!ci->memcg_table) - ret = -ENOMEM; + if (!ci->memcg_table) { + swap_cluster_free_table(ci); + return -ENOMEM; + } } #endif - if (ret) - swap_cluster_free_table(ci); - return ret; +#if !SWAP_TABLE_HAS_ZEROFLAG + VM_WARN_ON_ONCE(ci->zero_bitmap); + ci->zero_bitmap = bitmap_zalloc(SWAPFILE_CLUSTER, gfp); + if (!ci->zero_bitmap) { + swap_cluster_free_table(ci); + return -ENOMEM; + } +#endif + return 0; } /* @@ -928,8 +940,8 @@ static bool __swap_cluster_alloc_entries(struct swap_info_struct *si, order = 0; nr_pages = 1; swap_cluster_assert_empty(ci, ci_off, 1, false); - /* Sets a fake shadow as placeholder */ - __swap_table_set(ci, ci_off, shadow_to_swp_tb(NULL, 1)); + /* Fake shadow placeholder with no flag, hibernation does not use the zeromap */ + __swap_table_set(ci, ci_off, __swp_tb_mk_count(shadow_to_swp_tb(NULL, 0), 1)); } else { /* Allocation without folio is only possible with hibernation */ WARN_ON_ONCE(1); @@ -1302,14 +1314,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, void 
(*swap_slot_free_notify)(struct block_device *, unsigned long); unsigned int i; - /* - * Use atomic clear_bit operations only on zeromap instead of non-atomic - * bitmap_clear to prevent adjacent bits corruption due to simultaneous writes. - */ - for (i = 0; i < nr_entries; i++) { - clear_bit(offset + i, si->zeromap); + for (i = 0; i < nr_entries; i++) zswap_invalidate(swp_entry(si->type, offset + i)); - } if (si->flags & SWP_BLKDEV) swap_slot_free_notify = @@ -1920,7 +1926,11 @@ void __swap_cluster_free_entries(struct swap_info_struct *si, * ref, or after swap cache is dropped */ VM_WARN_ON(!swp_tb_is_shadow(old_tb) || __swp_tb_get_count(old_tb) > 1); + + /* Resetting the slot to NULL also clears the inline flags. */ __swap_table_set(ci, ci_off, null_to_swp_tb()); + if (!SWAP_TABLE_HAS_ZEROFLAG) + __swap_table_clear_zero(ci, ci_off); /* * Uncharge swap slots by memcg in batches. Consecutive @@ -2954,7 +2964,6 @@ static void flush_percpu_swap_cluster(struct swap_info_struct *si) SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; - unsigned long *zeromap; struct swap_cluster_info *cluster_info; struct file *swap_file, *victim; struct address_space *mapping; @@ -3042,8 +3051,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) swap_file = p->swap_file; p->swap_file = NULL; - zeromap = p->zeromap; - p->zeromap = NULL; maxpages = p->max; cluster_info = p->cluster_info; p->max = 0; @@ -3055,7 +3062,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mutex_unlock(&swapon_mutex); kfree(p->global_cluster); p->global_cluster = NULL; - kvfree(zeromap); free_swap_cluster_info(cluster_info, maxpages); inode = mapping->host; @@ -3587,17 +3593,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (error) goto bad_swap_unlock_inode; - /* - * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might - * be above MAX_PAGE_ORDER incase of a large swap file. 
- */ - si->zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long), - GFP_KERNEL | __GFP_ZERO); - if (!si->zeromap) { - error = -ENOMEM; - goto bad_swap_unlock_inode; - } - if (si->bdev && bdev_stable_writes(si->bdev)) si->flags |= SWP_STABLE_WRITES; @@ -3699,8 +3694,6 @@ bad_swap: destroy_swap_extents(si, swap_file); free_swap_cluster_info(si->cluster_info, si->max); si->cluster_info = NULL; - kvfree(si->zeromap); - si->zeromap = NULL; /* * Clear the SWP_USED flag after all resources are freed so * alloc_swap_info can reuse this si safely. From 0c946c54a7013742157b8f79b241140b0c670764 Mon Sep 17 00:00:00 2001 From: Sakurai Shun Date: Sun, 17 May 2026 19:36:35 +0900 Subject: [PATCH 177/321] docs/mm: fix typo in process_addrs.rst Replace "presense" with "presence" Link: https://lore.kernel.org/20260517103640.45444-1-ssh1326@icloud.com Signed-off-by: Sakurai Shun Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton --- Documentation/mm/process_addrs.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/mm/process_addrs.rst b/Documentation/mm/process_addrs.rst index 851680ead45f..042d64d72421 100644 --- a/Documentation/mm/process_addrs.rst +++ b/Documentation/mm/process_addrs.rst @@ -775,7 +775,7 @@ lock, releasing or downgrading the mmap write lock also releases the VMA write lock so there is no :c:func:`!vma_end_write` function. Note that when write-locking a VMA lock, the :c:member:`!vma.vm_refcnt` is temporarily -modified so that readers can detect the presense of a writer. The reference counter is +modified so that readers can detect the presence of a writer. The reference counter is restored once the vma sequence number used for serialisation is updated. 
This ensures the semantics we require - VMA write locks provide exclusive write From 6fa411adff39e4955f11aa3e854a876680444d2a Mon Sep 17 00:00:00 2001 From: Qiang Liu Date: Fri, 15 May 2026 15:03:11 +0800 Subject: [PATCH 178/321] lib/test_hmm: fix error path in dmirror_devmem_fault() Handle migrate_vma_setup() failure via goto err for unified cleanup. Link: https://lore.kernel.org/20260515070312.130435-1-liuqiangneo@163.com Signed-off-by: Qiang Liu Reviewed-by: Alistair Popple Cc: Jason Gunthorpe Cc: Leon Romanovsky Signed-off-by: Andrew Morton --- lib/test_hmm.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 38996c4baa40..63bf77dee987 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -1679,8 +1679,14 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) if (order) args.flags |= MIGRATE_VMA_SELECT_COMPOUND; - if (migrate_vma_setup(&args)) - return VM_FAULT_SIGBUS; + /* + * In practice migrate_vma_setup() should never fail unless the + * test is wrong as it just tests some static VMA properties. + */ + if (migrate_vma_setup(&args)) { + ret = VM_FAULT_SIGBUS; + goto err; + } ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror); if (ret) From 5ee5ff9dbce0b80297794fb77857bd80782b92db Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Fri, 15 May 2026 10:01:43 +0800 Subject: [PATCH 179/321] mm/memory-failure: remove hugetlb output parameter from try_memory_failure_hugetlb() Use -ENOENT return value to distinguish "not a hugetlb page" from "hugetlb handled", instead of carrying an extra output parameter. 
Link: https://lore.kernel.org/20260515020144.164941-1-ye.liu@linux.dev Signed-off-by: Ye Liu Suggested-by: Oscar Salvador Acked-by: Miaohe Lin Acked-by: Oscar Salvador (SUSE) Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index eff405a21c68..1b8d0bade04a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2027,13 +2027,14 @@ out_unlock: * So some of prechecks for hwpoison (pinning, and testing/setting * PageHWPoison) should be done in single hugetlb_lock range. * Returns: - * 0 - not hugetlb, or recovered + * 0 - recovered + * -ENOENT - no hugetlb page * -EBUSY - not recovered * -EOPNOTSUPP - hwpoison_filter'ed * -EHWPOISON - folio or exact page already poisoned * -EFAULT - kill_accessing_process finds current->mm null */ -static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb) +static int try_memory_failure_hugetlb(unsigned long pfn, int flags) { int res, rv; struct page *p = pfn_to_page(pfn); @@ -2041,13 +2042,11 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb unsigned long page_flags; bool migratable_cleared = false; - *hugetlb = 1; retry: res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared); switch (res) { case MF_HUGETLB_NON_HUGEPAGE: /* fallback to normal page handling */ - *hugetlb = 0; - return 0; + return -ENOENT; case MF_HUGETLB_RETRY: if (!(flags & MF_NO_RETRY)) { flags |= MF_NO_RETRY; @@ -2108,9 +2107,9 @@ retry: } #else -static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb) +static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags) { - return 0; + return -ENOENT; } static inline unsigned long folio_free_raw_hwp(struct folio *folio, bool flag) @@ -2348,7 +2347,6 @@ int memory_failure(unsigned long pfn, int flags) int res = 0; unsigned long page_flags; bool retry 
= true; - int hugetlb = 0; if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn); @@ -2387,8 +2385,11 @@ int memory_failure(unsigned long pfn, int flags) } try_again: - res = try_memory_failure_hugetlb(pfn, flags, &hugetlb); - if (hugetlb) + res = try_memory_failure_hugetlb(pfn, flags); + /* + * -ENOENT means the page we found is not hugetlb, so proceed with normal page handling + */ + if (res != -ENOENT) goto unlock_mutex; if (TestSetPageHWPoison(p)) { From 45c49d9fd6089e344663176b8488c97d905ca3ac Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:40:49 -0700 Subject: [PATCH 180/321] mm/damon/core: introduce struct damon_probe Patch series "mm/damon: introduce data attributes monitoring". TL; DR ====== Extend DAMON for monitoring general data attributes other than accesses. The short term motivation is lightweight page type (e.g., belonging cgroup) aware monitoring. In the long term, this will help extending DAMON for multiple access events capture primitives (e.g., page faults and PMU) and eventually pivoting DAMON to a "Data Attributes Monitoring and Operations eNgine". Background: High Cost of Page Level Properties Monitoring ========================================================= DAMON was initially introduced as a Data Access MONitor. It has been extended for not only access monitoring but also data access-aware system operations (DAMOS). But still the monitoring part is only for data accesses. Data access patterns are good information, but some users need more holistic views. Particularly, users want to show the access pattern information together with the types of the memory. For example, users who work for making huge pages efficiently want to know how much of DAMON-found hot/cold regions are backed by huge pages. Users who run multiple workloads with different cgroups want to know how much of DAMON-found hot/cold regions belong to specific cgroups.
For the user demand, we developed a DAMOS extension for page level properties based monitoring [1], which has landed in 6.14. Using the feature, users can specify the page level data properties that they are interested in, in a flexible format that uses DAMOS filters. Then, DAMON applies the filters to each folio of the entire DAMON region and lets users know how many bytes of memory in each DAMON region passed the given filters. This gives page level detailed and deterministic information to users. But, because the operation is done at page level, the overhead is proportional to the memory size. It was useful for test or debugging purposes on a small number of machines. But it was obviously too heavy to be enabled always on all machines running the real user workloads. For real world workloads, it was recommended to use the feature with user-space controlled sampling approaches. For example, users could do the page level monitoring only once per hour, on randomly selected one percent of machines of their fleet. If the runtime is long enough and the fleet is big enough, it should provide statistically meaningful data. But users are too busy to implement such controls on their own. Data Attributes Monitoring ========================== Extend DAMON to monitor not only data accesses, but also general data attributes. Do the extension while keeping the main promise of DAMON, the bounded and best-effort minimum overhead. Allow users to specify what data attributes in addition to the data access they want to monitor. Users can install one 'data probe' per data attribute of their interest for this purpose. The 'data probe' should be able to be applied to any memory, and determine if the given memory has the appropriate data attribute. E.g., whether the memory at physical address 42 belongs to cgroup A. Each 'data probe' is configured with filters that are very similar to the DAMOS filters.
When DAMON checks if each sampling address memory of each region is accessed since the last check, it applies data probes if registered. Similar to the number of access check-positive samples accounting (nr_accesses), it accounts the number of each data probe-positive samples in another per-region counters array, namely 'probe_hits'. When DAMON resets nr_accesses every aggregation interval, it resets 'probe_hits' together. Users can read 'probe_hits' just before the values are reset. In this way, users can know how many hot/cold memory regions have data attributes of their interest. E.g., 30 percent of this system's hot memory belongs to cgroup A, and 80 percent of the cgroup A-belonging hot memory is backed by huge pages. Patches Sequence ================ First eight patches implement the core feature, interface and the working support. Patch 1 introduces data probe data structure, namely damon_probe. Patch 2 extends damon_ctx for installing data probes. Patch 3 introduces another data structure for filters of each data probe, namely damon_filter. Patch 4 updates damon_ctx commit function to handle the probes. Patch 5 extends damon_region for the per-region per-probe positive samples counter, namely probe_hits. Patch 6 extends damon_operations for applying probes on the underlying DAMON operations implementation. Patch 7 updates kdamond_fn() to invoke the probes applying callback. Patch 8 finally implements the probes support on paddr ops. Ten changes for user interface (patches 9-18) come next. Patches 9-13 implement sysfs directories and files for setting data probes, namely probes directory, probe directory, filters directory, filter directory and filter directory internal files, respectively. Patch 14 connects the user inputs that are made via the sysfs files to DAMON core. Following three patches (patches 15-17) implement sysfs directories and files for showing the probe_hits to users, namely probes directory, probe directory and hits files, respectively.
Patch 18 introduces a new tracepoint for showing the probe_hits via tracefs. Patch 19 adds a selftest for the sysfs files. Patches 20 and 21 document the design and usage of the new feature, respectively. Seven additional patches (patches 22-28) for monitoring belonging memory cgroup follow. Depending on the feedback, this part might be separated to another series in future. Patch 22 defines the DAMON filter type for the new attribute, namely DAMON_FILTER_TYPE_MEMCG. Patch 23 adds the support on paddr ops. Patch 24 updates the sysfs interface for setup of the target memcg. Patch 25 moves code for easy reuse of the filter target memcg setup. Patch 26 connects the user input to the core layer. Finally, patches 27 and 28 update the design and usage documents for the memcg attribute monitoring support. Discussion ========== This allows the page properties monitoring with overhead that is low enough to be enabled always on real world workloads. Because the sampling time for access check is reused for data attributes check, the upper-bounded and best-effort minimum overhead of DAMON is kept. Because the sampling memory for access check is reused for data attributes check, additional overhead is minimum. Still DAMOS-based page level properties monitoring should be useful, because it provides deterministic page level information. When in doubt of the sampling based information, running DAMOS-based one together and comparing the results would be useful, for debugging and tuning. Future Works: Mid Term ======================== This version of implementation is limiting the maximum number of data probes to four. I will try to find a way to remove the limit in future. I personally think it should be enough for common use cases, though, and therefore not giving high priority at the moment. Future Works: Long Term ======================= There are user requests for extending DAMON with detailed access information, for example, per-CPUs/threads/read/writes monitoring.
For that, I was working [2] on extending DAMON to use page fault events as another access check primitive, and making the infrastructure flexible for future use of yet another access check primitive. Actually there is another ongoing work [3] for extending DAMON with PMU events. The motivation of the work is reducing the overhead, though. In my work [2], I was introducing a new interface for access sampling primitives control. Now I think this data probe interface can be used for that, too. That is, data access becomes just one type of data attribute. Also, pg_idle-confirmed access, page fault-confirmed access, and PMU event-confirmed access will be different types of data attributes. The regions adjustment mechanism is currently working based on the access information. That's because DAMON is designed for data access monitoring. That is, data access information is the primary interest, and therefore DAMON adjusts regions in a way that can best-present the information. Once data access becomes just one of data attributes, there is no reason to treat data access as that special. There might be some users not interested in access at all but want to know the location of memory of specific type. Data probes interface will allow doing that. Further, we could extend the interface to let users set any data attribute as the 'primary' attribute. Then, DAMON will split and merge regions in a way that can best-present the 'primary' attributes. DAMOS will also be extended, to specify targets based on not only the data access pattern, but all user-registered data attributes. From this stage, we may be able to call DAMON a "Data Attributes Monitoring and Operations eNgine". This patch (of 28): Introduce a data structure for data attribute probe. It is just a linked list header at this step. It will be extended in a way that it can determine if a given memory has a specific data attribute.
Link: https://lore.kernel.org/20260518234119.97569-1-sj@kernel.org Link: https://lore.kernel.org/20260518234119.97569-2-sj@kernel.org Link: https://lore.kernel.org/20250106193401.109161-1-sj@kernel.org [1] Link: https://lore.kernel.org/20251208062943.68824-1-sj@kernel.org/ [2] Link: https://lore.kernel.org/20260423004211.7037-1-akinobu.mita@gmail.com [3] Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 4d4f031bcb45..4794931fa2ea 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -730,6 +730,15 @@ struct damon_intervals_goal { unsigned long max_sample_us; }; +/** + * struct damon_probe - Data region attribute probe. + * + * @list: Siblings list. + */ +struct damon_probe { + struct list_head list; +}; + /** * struct damon_attrs - Monitoring attributes for accuracy/overhead control. * From 18c777859f28d5e9b65d94c4fdc64f240250df3a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:40:50 -0700 Subject: [PATCH 181/321] mm/damon/core: embed damon_probe objects in damon_ctx Let damon_probe objects be able to be installed on a given damon_ctx, by adding a linked list header for storing the objects. Add initialization and cleanup of the new field with helper functions, too. Link: https://lore.kernel.org/20260518234119.97569-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 9 +++++++++ mm/damon/core.c | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 4794931fa2ea..43d71eb24ccb 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -857,6 +857,7 @@ struct damon_ctx { /* public: */ struct damon_operations ops; + struct list_head probes; unsigned long addr_unit; unsigned long min_region_sz; bool pause; @@ -909,6 +910,11 @@ static inline unsigned long damon_sz_region(struct damon_region *r) return r->ar.end - r->ar.start; } +#define damon_for_each_probe(p, ctx) \ + list_for_each_entry(p, &(ctx)->probes, list) + +#define damon_for_each_probe_safe(p, next, ctx) \ + list_for_each_entry_safe(p, next, &(ctx)->probes, list) #define damon_for_each_region(r, t) \ list_for_each_entry(r, &t->regions_list, list) @@ -951,6 +957,9 @@ static inline unsigned long damon_sz_region(struct damon_region *r) #ifdef CONFIG_DAMON +struct damon_probe *damon_new_probe(void); +void damon_add_probe(struct damon_ctx *ctx, struct damon_probe *probe); + struct damon_region *damon_new_region(unsigned long start, unsigned long end); /* diff --git a/mm/damon/core.c b/mm/damon/core.c index 3a8725e400c6..8a55cc61d297 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -113,6 +113,38 @@ int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id) return err; } +struct damon_probe *damon_new_probe(void) +{ + struct damon_probe *p; + + p = kmalloc_obj(*p); + if (!p) + return NULL; + INIT_LIST_HEAD(&p->list); + return p; +} + +void damon_add_probe(struct damon_ctx *ctx, struct damon_probe *probe) +{ + list_add_tail(&probe->list, &ctx->probes); +} + +static void damon_del_probe(struct damon_probe *p) +{ + 
list_del(&p->list); +} + +static void damon_free_probe(struct damon_probe *p) +{ + kfree(p); +} + +static void damon_destroy_probe(struct damon_probe *p) +{ + damon_del_probe(p); + damon_free_probe(p); +} + #ifdef CONFIG_DAMON_DEBUG_SANITY static void damon_verify_new_region(unsigned long start, unsigned long end) { @@ -605,6 +637,8 @@ struct damon_ctx *damon_new_ctx(void) ctx->attrs.min_nr_regions = 10; ctx->attrs.max_nr_regions = 1000; + INIT_LIST_HEAD(&ctx->probes); + ctx->addr_unit = 1; ctx->min_region_sz = DAMON_MIN_REGION_SZ; @@ -627,12 +661,16 @@ static void damon_destroy_targets(struct damon_ctx *ctx) void damon_destroy_ctx(struct damon_ctx *ctx) { struct damos *s, *next_s; + struct damon_probe *p, *next_p; damon_destroy_targets(ctx); damon_for_each_scheme_safe(s, next_s, ctx) damon_destroy_scheme(s); + damon_for_each_probe_safe(p, next_p, ctx) + damon_destroy_probe(p); + kfree(ctx); } From f557693dd8ac9cd87d2a1ae1025ee9f568e916e6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:40:51 -0700 Subject: [PATCH 182/321] mm/damon/core: introduce damon_filter Define a data structure for constructing damon_probe's attributes check, namely damon_filter. It is very similar to damos_filter but works only for monitoring purposes. Also embed that into damon_probe, implement essential handling of the link, with fundamental helpers. Link: https://lore.kernel.org/20260518234119.97569-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 36 ++++++++++++++++++++++++++++++++++++ mm/damon/core.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 43d71eb24ccb..f8b679dd944d 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -730,12 +730,38 @@ struct damon_intervals_goal { unsigned long max_sample_us; }; +/** + * enum damon_filter_type - Type of &struct damon_filter + * + * @DAMON_FILTER_TYPE_ANON: Anonymous pages. + */ +enum damon_filter_type { + DAMON_FILTER_TYPE_ANON, +}; + +/** + * struct damon_filter - DAMON region filter for &struct damon_probe. + * + * @type: Type of the region. + * @matching: Whether this filter is for the type-matching ones. + * @allow: Whether the @type-@matching ones should pass this filter. + * @list: Siblings list. + */ +struct damon_filter { + enum damon_filter_type type; + bool matching; + bool allow; + struct list_head list; +}; + /** * struct damon_probe - Data region attribute probe. * + * @filters: Filters for assessing if a given region is for this probe. * @list: Siblings list. 
*/ struct damon_probe { + struct list_head filters; struct list_head list; }; @@ -910,6 +936,12 @@ static inline unsigned long damon_sz_region(struct damon_region *r) return r->ar.end - r->ar.start; } +#define damon_for_each_filter(f, p) \ + list_for_each_entry(f, &(p)->filters, list) + +#define damon_for_each_filter_safe(f, next, p) \ + list_for_each_entry_safe(f, next, &(p)->filters, list) + #define damon_for_each_probe(p, ctx) \ list_for_each_entry(p, &(ctx)->probes, list) @@ -957,6 +989,10 @@ static inline unsigned long damon_sz_region(struct damon_region *r) #ifdef CONFIG_DAMON +struct damon_filter *damon_new_filter(enum damon_filter_type type, + bool matching, bool allow); +void damon_add_filter(struct damon_probe *probe, struct damon_filter *f); + struct damon_probe *damon_new_probe(void); void damon_add_probe(struct damon_ctx *ctx, struct damon_probe *probe); diff --git a/mm/damon/core.c b/mm/damon/core.c index 8a55cc61d297..d01417955a3b 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -113,6 +113,31 @@ int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id) return err; } +struct damon_filter *damon_new_filter(enum damon_filter_type type, + bool matching, bool allow) +{ + struct damon_filter *filter; + + filter = kmalloc_obj(*filter); + if (!filter) + return NULL; + filter->type = type; + filter->matching = matching; + filter->allow = allow; + INIT_LIST_HEAD(&filter->list); + return filter; +} + +void damon_add_filter(struct damon_probe *p, struct damon_filter *f) +{ + list_add_tail(&f->list, &p->filters); +} + +static void damon_free_filter(struct damon_filter *f) +{ + kfree(f); +} + struct damon_probe *damon_new_probe(void) { struct damon_probe *p; @@ -120,6 +145,7 @@ struct damon_probe *damon_new_probe(void) p = kmalloc_obj(*p); if (!p) return NULL; + INIT_LIST_HEAD(&p->filters); INIT_LIST_HEAD(&p->list); return p; } @@ -136,6 +162,10 @@ static void damon_del_probe(struct damon_probe *p) static void damon_free_probe(struct damon_probe *p) 
{ + struct damon_filter *f, *next; + + damon_for_each_filter_safe(f, next, p) + damon_free_filter(f); kfree(p); } From d0de4b29c722d903e3b82dfc035cb78c015b46e0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:40:52 -0700 Subject: [PATCH 183/321] mm/damon/core: commit probes Update damon_commit_ctx() to commit installed data probes, too. Link: https://lore.kernel.org/20260518234119.97569-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/core.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index d01417955a3b..240cae1420c1 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -133,11 +133,34 @@ void damon_add_filter(struct damon_probe *p, struct damon_filter *f) list_add_tail(&f->list, &p->filters); } +static void damon_del_filter(struct damon_filter *f) +{ + list_del(&f->list); +} + static void damon_free_filter(struct damon_filter *f) { kfree(f); } +static void damon_destroy_filter(struct damon_filter *f) +{ + damon_del_filter(f); + damon_free_filter(f); +} + +static struct damon_filter *damon_nth_filter(int n, struct damon_probe *p) +{ + struct damon_filter *f; + int i = 0; + + damon_for_each_filter(f, p) { + if (i++ == n) + return f; + } + return NULL; +} + struct damon_probe *damon_new_probe(void) { struct damon_probe *p; @@ -175,6 +198,18 @@ static void damon_destroy_probe(struct damon_probe *p) damon_free_probe(p); } +static struct damon_probe *damon_nth_probe(int n, struct damon_ctx *ctx) +{ + struct damon_probe *p; + int i = 0; + + damon_for_each_probe(p, ctx) { + if (i++ == n) + return p; + } + return NULL; +} + #ifdef CONFIG_DAMON_DEBUG_SANITY static void 
damon_verify_new_region(unsigned long start, unsigned long end) { @@ -1386,6 +1421,72 @@ static int damon_commit_targets( return 0; } +static void damon_commit_filter(struct damon_filter *dst, + struct damon_filter *src) +{ + dst->type = src->type; + dst->matching = src->matching; + dst->allow = src->allow; +} + +static int damon_commit_filters(struct damon_probe *dst, + struct damon_probe *src) +{ + struct damon_filter *dst_filter, *next, *src_filter, *new_filter; + int i = 0, j = 0; + + damon_for_each_filter_safe(dst_filter, next, dst) { + src_filter = damon_nth_filter(i++, src); + if (src_filter) + damon_commit_filter(dst_filter, src_filter); + else + damon_destroy_filter(dst_filter); + } + + damon_for_each_filter_safe(src_filter, next, src) { + if (j++ < i) + continue; + + new_filter = damon_new_filter(src_filter->type, + src_filter->matching, src_filter->allow); + if (!new_filter) + return -ENOMEM; + damon_add_filter(dst, new_filter); + } + return 0; +} + +static int damon_commit_probes(struct damon_ctx *dst, struct damon_ctx *src) +{ + struct damon_probe *dst_probe, *next, *src_probe, *new_probe; + int i = 0, j = 0, err; + + damon_for_each_probe_safe(dst_probe, next, dst) { + src_probe = damon_nth_probe(i++, src); + if (src_probe) { + err = damon_commit_filters(dst_probe, src_probe); + if (err) + return err; + } else { + damon_destroy_probe(dst_probe); + } + } + + damon_for_each_probe_safe(src_probe, next, src) { + if (j++ < i) + continue; + + new_probe = damon_new_probe(); + if (!new_probe) + return -ENOMEM; + damon_add_probe(dst, new_probe); + err = damon_commit_filters(new_probe, src_probe); + if (err) + return err; + } + return 0; +} + /** * damon_commit_ctx() - Commit parameters of a DAMON context to another. * @dst: The commit destination DAMON context. 
@@ -1442,6 +1543,9 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) } dst->pause = src->pause; dst->ops = src->ops; + err = damon_commit_probes(dst, src); + if (err) + return err; dst->addr_unit = src->addr_unit; dst->min_region_sz = src->min_region_sz; From 57c6332f2548d94f137f51bd18111e4316fd1ba4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:40:53 -0700 Subject: [PATCH 184/321] mm/damon/core: introduce damon_region->probe_hits Add an array for the per-region per-probe positive samples count. For simple and efficient implementation, add a limit to the number of data probes and set the array to support only the limited number of counters. Link: https://lore.kernel.org/20260518234119.97569-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 4 ++++ mm/damon/core.c | 10 ++++++++++ 2 files changed, 14 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index f8b679dd944d..3a30af119ac6 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -17,6 +17,8 @@ /* Minimal region size. Every damon_region is aligned by this. */ #define DAMON_MIN_REGION_SZ PAGE_SIZE +/* Maximum number of monitoring probes. */ +#define DAMON_MAX_PROBES (4) /* Max priority score for DAMON-based operation schemes */ #define DAMOS_MAX_SCORE (99) @@ -47,6 +49,7 @@ struct damon_size_range { * @nr_accesses: Access frequency of this region. * @nr_accesses_bp: @nr_accesses in basis point (0.01%) that updated for * each sampling interval. + * @probe_hits: Number of probe-positive region samples. * @list: List head for siblings. * @age: Age of this region. 
* @@ -75,6 +78,7 @@ struct damon_region { unsigned long sampling_addr; unsigned int nr_accesses; unsigned int nr_accesses_bp; + unsigned char probe_hits[DAMON_MAX_PROBES]; struct list_head list; unsigned int age; diff --git a/mm/damon/core.c b/mm/damon/core.c index 240cae1420c1..0f6b3b66d1de 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -229,6 +229,7 @@ static void damon_verify_new_region(unsigned long start, unsigned long end) struct damon_region *damon_new_region(unsigned long start, unsigned long end) { struct damon_region *region; + int i; damon_verify_new_region(start, end); region = kmem_cache_alloc(damon_region_cache, GFP_KERNEL); @@ -239,6 +240,8 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) region->ar.end = end; region->nr_accesses = 0; region->nr_accesses_bp = 0; + for (i = 0; i < DAMON_MAX_PROBES; i++) + region->probe_hits[i] = 0; INIT_LIST_HEAD(®ion->list); region->age = 0; @@ -2980,12 +2983,17 @@ static void damon_merge_two_regions(struct damon_target *t, struct damon_region *l, struct damon_region *r) { unsigned long sz_l = damon_sz_region(l), sz_r = damon_sz_region(r); + int i; l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) / (sz_l + sz_r); l->nr_accesses_bp = l->nr_accesses * 10000; l->age = (l->age * sz_l + r->age * sz_r) / (sz_l + sz_r); l->ar.end = r->ar.end; + /* todo: do this for only installed probes */ + for (i = 0; i < DAMON_MAX_PROBES; i++) + l->probe_hits[i] = (l->probe_hits[i] * sz_l + r->probe_hits[i] + * sz_r) / (sz_l + sz_r); damon_verify_merge_two_regions(l, r); damon_destroy_region(r, t); } @@ -3108,6 +3116,8 @@ static void damon_split_region_at(struct damon_target *t, new->last_nr_accesses = r->last_nr_accesses; new->nr_accesses_bp = r->nr_accesses_bp; new->nr_accesses = r->nr_accesses; + /* todo: do this for only installed probes */ + memcpy(new->probe_hits, r->probe_hits, sizeof(r->probe_hits)); damon_insert_region(new, r, damon_next_region(r), t); } From 
1a9e847589180359be4198c7d2a3d2ea15b2ddd0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:40:54 -0700 Subject: [PATCH 185/321] mm/damon/core: introduce damon_ops->apply_probes Extend damon_operations struct with a new callback, namely apply_probes. The callback will be invoked for data attributes monitoring. More specifically, the callback will apply damon_probe objects to each region and update the per-region per-probe counters for the number of encountered probe-positive samples. Link: https://lore.kernel.org/20260518234119.97569-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 3a30af119ac6..1fb271a35e98 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -630,6 +630,7 @@ enum damon_ops_id { * @update: Update operations-related data structures. * @prepare_access_checks: Prepare next access check of target regions. * @check_accesses: Check the accesses to target regions. + * @apply_probes: Apply probes for each region. * @get_scheme_score: Get the score of a region for a scheme. * @apply_scheme: Apply a DAMON-based operation scheme. * @target_valid: Determine if the target is valid. @@ -656,6 +657,8 @@ enum damon_ops_id { * last preparation and update the number of observed accesses of each region. * It should also return max number of observed accesses that made as a result * of its update. The value will be used for regions adjustment threshold. + * @apply_probes should apply the data attribute probes to each region and + * accordingly update the probe hits counter of the region. 
* @get_scheme_score should return the priority score of a region for a scheme * as an integer in [0, &DAMOS_MAX_SCORE]. * @apply_scheme is called from @kdamond when a region for user provided @@ -673,6 +676,7 @@ struct damon_operations { void (*update)(struct damon_ctx *context); void (*prepare_access_checks)(struct damon_ctx *context); unsigned int (*check_accesses)(struct damon_ctx *context); + void (*apply_probes)(struct damon_ctx *context); int (*get_scheme_score)(struct damon_ctx *context, struct damon_region *r, struct damos *scheme); unsigned long (*apply_scheme)(struct damon_ctx *context, From 9b1f8c8d015bc92cab358f1395ee053fd01d7b89 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:40:55 -0700 Subject: [PATCH 186/321] mm/damon/core: do data attributes monitoring Implement the data attributes monitoring execution. Update kdamond to invoke the probes application callback, and reset the aggregated number of per-region per-probe positive samples for every aggregation interval. Link: https://lore.kernel.org/20260518234119.97569-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index 0f6b3b66d1de..500e8b08d441 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1910,10 +1910,14 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) struct damon_region *r; damon_for_each_region(r, t) { + int i; + trace_damon_aggregated(ti, r, damon_nr_regions(t)); damon_warn_fix_nr_accesses_corruption(r); r->last_nr_accesses = r->nr_accesses; r->nr_accesses = 0; + for (i = 0; i < DAMON_MAX_PROBES; i++) + r->probe_hits[i] = 0; damon_verify_reset_aggregated(r, c); } ti++; @@ -3407,6 +3411,8 @@ static int kdamond_fn(void *data) if (ctx->ops.check_accesses) max_nr_accesses = ctx->ops.check_accesses(ctx); + if (ctx->ops.apply_probes) + ctx->ops.apply_probes(ctx); if (time_after_eq(ctx->passed_sample_intervals, next_aggregation_sis)) { From 09acfaced2d45b4f6d70e3999783d6e8ccec0ea7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:40:56 -0700 Subject: [PATCH 187/321] mm/damon/paddr: support data attributes monitoring Implement and register damon_operations->apply_probes() callback to support data attributes monitoring. Link: https://lore.kernel.org/20260518234119.97569-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index c4738cd5e221..9997c5174ef1 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -120,6 +120,67 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) return max_nr_accesses; } +static bool damon_pa_filter_match(struct damon_filter *filter, + struct folio *folio) +{ + bool matched = false; + + switch (filter->type) { + case DAMON_FILTER_TYPE_ANON: + if (!folio) { + matched = false; + break; + } + matched = folio_test_anon(folio); + break; + default: + break; + } + return matched == filter->matching; +} + +static bool damon_pa_filter_pass(phys_addr_t pa, struct folio *folio, + struct damon_probe *p) +{ + struct damon_filter *f; + bool pass = true; + + damon_for_each_filter(f, p) { + if (damon_pa_filter_match(f, folio)) { + pass = f->allow; + break; + } + pass = !f->allow; + } + return pass; +} + +static void damon_pa_apply_probes(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct damon_region *r; + struct damon_probe *p; + + damon_for_each_target(t, ctx) { + damon_for_each_region(r, t) { + int i = 0; + phys_addr_t pa; + struct folio *folio; + + pa = damon_pa_phys_addr(r->sampling_addr, + ctx->addr_unit); + folio = damon_get_folio(PHYS_PFN(pa)); + damon_for_each_probe(p, ctx) { + if (damon_pa_filter_pass(pa, folio, p)) + r->probe_hits[i]++; + i++; + } + if (folio) + folio_put(folio); + } + } +} + /* * damos_pa_filter_out - Return true if the page should be filtered out. 
*/ @@ -371,6 +432,7 @@ static int __init damon_pa_initcall(void) .update = NULL, .prepare_access_checks = damon_pa_prepare_access_checks, .check_accesses = damon_pa_check_accesses, + .apply_probes = damon_pa_apply_probes, .target_valid = NULL, .apply_scheme = damon_pa_apply_scheme, .get_scheme_score = damon_pa_scheme_score, From 90a8322934ae8ab4b3e9418ed006e81df0d33dfc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:40:57 -0700 Subject: [PATCH 188/321] mm/damon/sysfs: implement probes dir Implement sysfs directory that can be used by the users to install data probes. Link: https://lore.kernel.org/20260518234119.97569-10-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index d5863cc33d23..ccd19fc062f3 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -747,6 +747,35 @@ static const struct kobj_type damon_sysfs_intervals_ktype = { .default_groups = damon_sysfs_intervals_groups, }; +/* + * probes directory + */ + +struct damon_sysfs_probes { + struct kobject kobj; +}; + +static struct damon_sysfs_probes *damon_sysfs_probes_alloc(void) +{ + return kzalloc_obj(struct damon_sysfs_probes); +} + +static void damon_sysfs_probes_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_probes, kobj)); +} + +static struct attribute *damon_sysfs_probes_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_probes); + +static const struct kobj_type damon_sysfs_probes_ktype = { + .release = damon_sysfs_probes_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_probes_groups, +}; + /* * 
monitoring_attrs directory */ @@ -755,6 +784,7 @@ struct damon_sysfs_attrs { struct kobject kobj; struct damon_sysfs_intervals *intervals; struct damon_sysfs_ul_range *nr_regions_range; + struct damon_sysfs_probes *probes; }; static struct damon_sysfs_attrs *damon_sysfs_attrs_alloc(void) @@ -771,6 +801,7 @@ static int damon_sysfs_attrs_add_dirs(struct damon_sysfs_attrs *attrs) { struct damon_sysfs_intervals *intervals; struct damon_sysfs_ul_range *nr_regions_range; + struct damon_sysfs_probes *probes; int err; intervals = damon_sysfs_intervals_alloc(5000, 100000, 60000000); @@ -799,8 +830,22 @@ static int damon_sysfs_attrs_add_dirs(struct damon_sysfs_attrs *attrs) if (err) goto put_nr_regions_intervals_out; attrs->nr_regions_range = nr_regions_range; + + probes = damon_sysfs_probes_alloc(); + if (!probes) { + err = -ENOMEM; + goto put_nr_regions_intervals_out; + } + err = kobject_init_and_add(&probes->kobj, + &damon_sysfs_probes_ktype, &attrs->kobj, "probes"); + if (err) + goto put_probes_out; + attrs->probes = probes; return 0; +put_probes_out: + kobject_put(&probes->kobj); + attrs->probes = NULL; put_nr_regions_intervals_out: kobject_put(&nr_regions_range->kobj); attrs->nr_regions_range = NULL; @@ -817,6 +862,7 @@ static void damon_sysfs_attrs_rm_dirs(struct damon_sysfs_attrs *attrs) kobject_put(&attrs->nr_regions_range->kobj); damon_sysfs_intervals_rm_dirs(attrs->intervals); kobject_put(&attrs->intervals->kobj); + kobject_put(&attrs->probes->kobj); } static void damon_sysfs_attrs_release(struct kobject *kobj) From 7d49f5aaee63bddded9e8f2fd15949596f69ae6b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:40:58 -0700 Subject: [PATCH 189/321] mm/damon/sysfs: implement probe dir Implement sysfs directory for letting users install each data probe. Link: https://lore.kernel.org/20260518234119.97569-11-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index ccd19fc062f3..6cef3eaa4431 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -747,12 +747,43 @@ static const struct kobj_type damon_sysfs_intervals_ktype = { .default_groups = damon_sysfs_intervals_groups, }; +/* + * probe directory + */ + +struct damon_sysfs_probe { + struct kobject kobj; +}; + +static struct damon_sysfs_probe *damon_sysfs_probe_alloc(void) +{ + return kzalloc_obj(struct damon_sysfs_probe); +} + +static void damon_sysfs_probe_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_probe, kobj)); +} + +static struct attribute *damon_sysfs_probe_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_probe); + +static const struct kobj_type damon_sysfs_probe_ktype = { + .release = damon_sysfs_probe_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_probe_groups, +}; + /* * probes directory */ struct damon_sysfs_probes { struct kobject kobj; + struct damon_sysfs_probe **probes_arr; + int nr; }; static struct damon_sysfs_probes *damon_sysfs_probes_alloc(void) @@ -760,12 +791,99 @@ static struct damon_sysfs_probes *damon_sysfs_probes_alloc(void) return kzalloc_obj(struct damon_sysfs_probes); } +static void damon_sysfs_probes_rm_dirs( + struct damon_sysfs_probes *probes) +{ + struct damon_sysfs_probe **probes_arr = probes->probes_arr; + int i; + + for (i = 0; i < probes->nr; i++) + kobject_put(&probes_arr[i]->kobj); + probes->nr = 0; + kfree(probes_arr); + probes->probes_arr = NULL; +} + +static int damon_sysfs_probes_add_dirs( + struct damon_sysfs_probes *probes, int nr_probes) +{ + struct damon_sysfs_probe **probes_arr, 
*probe; + int err, i; + + damon_sysfs_probes_rm_dirs(probes); + if (!nr_probes) + return 0; + + probes_arr = kmalloc_objs(*probes_arr, nr_probes, + GFP_KERNEL | __GFP_NOWARN); + if (!probes_arr) + return -ENOMEM; + probes->probes_arr = probes_arr; + + for (i = 0; i < nr_probes; i++) { + probe = damon_sysfs_probe_alloc(); + if (!probe) { + damon_sysfs_probes_rm_dirs(probes); + return -ENOMEM; + } + + err = kobject_init_and_add(&probe->kobj, + &damon_sysfs_probe_ktype, &probes->kobj, + "%d", i); + if (err) { + kobject_put(&probe->kobj); + damon_sysfs_probes_rm_dirs(probes); + return err; + } + + probes_arr[i] = probe; + probes->nr++; + } + return 0; +} + +static ssize_t nr_probes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_probes *probes = container_of(kobj, + struct damon_sysfs_probes, kobj); + + return sysfs_emit(buf, "%d\n", probes->nr); +} + +static ssize_t nr_probes_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_probes *probes; + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0 || nr > DAMON_MAX_PROBES) + return -EINVAL; + + probes = container_of(kobj, struct damon_sysfs_probes, kobj); + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_probes_add_dirs(probes, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + + return count; +} + static void damon_sysfs_probes_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_probes, kobj)); } +static struct kobj_attribute damon_sysfs_probes_nr_probes = + __ATTR_RW_MODE(nr_probes, 0600); + static struct attribute *damon_sysfs_probes_attrs[] = { + &damon_sysfs_probes_nr_probes.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_probes); @@ -862,6 +980,7 @@ static void damon_sysfs_attrs_rm_dirs(struct damon_sysfs_attrs *attrs) kobject_put(&attrs->nr_regions_range->kobj); damon_sysfs_intervals_rm_dirs(attrs->intervals); 
kobject_put(&attrs->intervals->kobj); + damon_sysfs_probes_rm_dirs(attrs->probes); kobject_put(&attrs->probes->kobj); } From af7cb41af9a9310a6e654942199d2bb29f4f0021 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:40:59 -0700 Subject: [PATCH 190/321] mm/damon/sysfs: implement filters directory Implement a directory for letting users to install data probe filters. Link: https://lore.kernel.org/20260518234119.97569-12-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 6cef3eaa4431..dad4985a826d 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -747,12 +747,42 @@ static const struct kobj_type damon_sysfs_intervals_ktype = { .default_groups = damon_sysfs_intervals_groups, }; +/* + * filters directory + */ + +struct damon_sysfs_filters { + struct kobject kobj; +}; + +static struct damon_sysfs_filters *damon_sysfs_filters_alloc(void) +{ + return kzalloc_obj(struct damon_sysfs_filters); +} + +static void damon_sysfs_filters_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_filters, kobj)); +} + +static struct attribute *damon_sysfs_filters_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_filters); + +static const struct kobj_type damon_sysfs_filters_ktype = { + .release = damon_sysfs_filters_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_filters_groups, +}; + /* * probe directory */ struct damon_sysfs_probe { struct kobject kobj; + struct damon_sysfs_filters *filters; }; static struct damon_sysfs_probe *damon_sysfs_probe_alloc(void) @@ -760,6 
+790,30 @@ static struct damon_sysfs_probe *damon_sysfs_probe_alloc(void) return kzalloc_obj(struct damon_sysfs_probe); } +static int damon_sysfs_probe_add_dirs(struct damon_sysfs_probe *attr) +{ + struct damon_sysfs_filters *filters; + int err; + + filters = damon_sysfs_filters_alloc(); + if (!filters) + return -ENOMEM; + attr->filters = filters; + + err = kobject_init_and_add(&filters->kobj, &damon_sysfs_filters_ktype, + &attr->kobj, "filters"); + if (err) { + kobject_put(&filters->kobj); + attr->filters = NULL; + } + return err; +} + +static void damon_sysfs_probe_rm_dirs(struct damon_sysfs_probe *attr) +{ + kobject_put(&attr->filters->kobj); +} + static void damon_sysfs_probe_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_probe, kobj)); @@ -797,8 +851,10 @@ static void damon_sysfs_probes_rm_dirs( struct damon_sysfs_probe **probes_arr = probes->probes_arr; int i; - for (i = 0; i < probes->nr; i++) + for (i = 0; i < probes->nr; i++) { + damon_sysfs_probe_rm_dirs(probes_arr[i]); kobject_put(&probes_arr[i]->kobj); + } probes->nr = 0; kfree(probes_arr); probes->probes_arr = NULL; @@ -836,6 +892,13 @@ static int damon_sysfs_probes_add_dirs( return err; } + err = damon_sysfs_probe_add_dirs(probe); + if (err) { + kobject_put(&probe->kobj); + damon_sysfs_probes_rm_dirs(probes); + return err; + } + probes_arr[i] = probe; probes->nr++; } From 956bf44e4576121a7aa2d9c7f4a9e065edd293f8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:41:00 -0700 Subject: [PATCH 191/321] mm/damon/sysfs: implement filter dir Implement a sysfs directory for letting the users to configure each data probe filter. Link: https://lore.kernel.org/20260518234119.97569-13-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 124 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index dad4985a826d..2dc475ea0f0f 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -747,12 +747,46 @@ static const struct kobj_type damon_sysfs_intervals_ktype = { .default_groups = damon_sysfs_intervals_groups, }; +/* + * filter directory + */ + +struct damon_sysfs_filter { + struct kobject kobj; +}; + +static struct damon_sysfs_filter *damon_sysfs_filter_alloc(void) +{ + return kzalloc_obj(struct damon_sysfs_filter); +} + +static void damon_sysfs_filter_release(struct kobject *kobj) +{ + struct damon_sysfs_filter *filter = container_of(kobj, + struct damon_sysfs_filter, kobj); + + kfree(filter); +} + +static struct attribute *damon_sysfs_filter_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_filter); + +static const struct kobj_type damon_sysfs_filter_ktype = { + .release = damon_sysfs_filter_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_filter_groups, +}; + /* * filters directory */ struct damon_sysfs_filters { struct kobject kobj; + struct damon_sysfs_filter **filters_arr; + int nr; }; static struct damon_sysfs_filters *damon_sysfs_filters_alloc(void) @@ -760,12 +794,98 @@ static struct damon_sysfs_filters *damon_sysfs_filters_alloc(void) return kzalloc_obj(struct damon_sysfs_filters); } +static void damon_sysfs_filters_rm_dirs(struct damon_sysfs_filters *filters) +{ + struct damon_sysfs_filter **filters_arr = filters->filters_arr; + int i; + + for (i = 0; i < filters->nr; i++) + kobject_put(&filters_arr[i]->kobj); + filters->nr = 0; + kfree(filters_arr); + filters->filters_arr = NULL; +} + +static int damon_sysfs_filters_add_dirs( + 
struct damon_sysfs_filters *filters, int nr_filters) +{ + struct damon_sysfs_filter **filters_arr, *filter; + int err, i; + + damon_sysfs_filters_rm_dirs(filters); + if (!nr_filters) + return 0; + + filters_arr = kmalloc_objs(*filters_arr, nr_filters, + GFP_KERNEL | __GFP_NOWARN); + if (!filters_arr) + return -ENOMEM; + filters->filters_arr = filters_arr; + + for (i = 0; i < nr_filters; i++) { + filter = damon_sysfs_filter_alloc(); + if (!filter) { + damon_sysfs_filters_rm_dirs(filters); + return -ENOMEM; + } + + err = kobject_init_and_add(&filter->kobj, + &damon_sysfs_filter_ktype, &filters->kobj, + "%d", i); + if (err) { + kobject_put(&filter->kobj); + damon_sysfs_filters_rm_dirs(filters); + return err; + } + + filters_arr[i] = filter; + filters->nr++; + } + return 0; +} + +static ssize_t nr_filters_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_filters *filters = container_of(kobj, + struct damon_sysfs_filters, kobj); + + return sysfs_emit(buf, "%d\n", filters->nr); +} + +static ssize_t nr_filters_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_filters *filters; + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + filters = container_of(kobj, struct damon_sysfs_filters, kobj); + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_filters_add_dirs(filters, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + + return count; +} + static void damon_sysfs_filters_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_filters, kobj)); } +static struct kobj_attribute damon_sysfs_filters_nr_attr = + __ATTR_RW_MODE(nr_filters, 0600); + static struct attribute *damon_sysfs_filters_attrs[] = { + &damon_sysfs_filters_nr_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_filters); @@ -811,7 +931,10 @@ static int damon_sysfs_probe_add_dirs(struct 
damon_sysfs_probe *attr) static void damon_sysfs_probe_rm_dirs(struct damon_sysfs_probe *attr) { - kobject_put(&attr->filters->kobj); + if (attr->filters) { + damon_sysfs_filters_rm_dirs(attr->filters); + kobject_put(&attr->filters->kobj); + } } static void damon_sysfs_probe_release(struct kobject *kobj) From 8caba144827849293a65169c9e138b6353156285 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:41:01 -0700 Subject: [PATCH 192/321] mm/damon/sysfs: implement filter dir files Implement sysfs files under the data probe filter directory for letting users to configure each filter. Link: https://lore.kernel.org/20260518234119.97569-14-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 2dc475ea0f0f..51a4f05c9275 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -753,6 +753,9 @@ static const struct kobj_type damon_sysfs_intervals_ktype = { struct damon_sysfs_filter { struct kobject kobj; + enum damon_filter_type type; + bool matching; + bool allow; }; static struct damon_sysfs_filter *damon_sysfs_filter_alloc(void) @@ -760,6 +763,105 @@ static struct damon_sysfs_filter *damon_sysfs_filter_alloc(void) return kzalloc_obj(struct damon_sysfs_filter); } +struct damon_sysfs_filter_type_name { + enum damon_filter_type type; + char *name; +}; + +static const struct damon_sysfs_filter_type_name +damon_sysfs_filter_type_names[] = { + { + .type = DAMON_FILTER_TYPE_ANON, + .name = "anon", + }, +}; + +static ssize_t type_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_filter *filter = 
container_of(kobj, + struct damon_sysfs_filter, kobj); + int i; + + for (i = 0; i < ARRAY_SIZE(damon_sysfs_filter_type_names); i++) { + const struct damon_sysfs_filter_type_name *type_name; + + type_name = &damon_sysfs_filter_type_names[i]; + if (type_name->type == filter->type) + return sysfs_emit(buf, "%s\n", type_name->name); + } + return -EINVAL; +} + +static ssize_t type_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_filter *filter = container_of(kobj, + struct damon_sysfs_filter, kobj); + ssize_t ret = -EINVAL; + int i; + + for (i = 0; i < ARRAY_SIZE(damon_sysfs_filter_type_names); i++) { + const struct damon_sysfs_filter_type_name *type_name; + + type_name = &damon_sysfs_filter_type_names[i]; + if (sysfs_streq(buf, type_name->name)) { + filter->type = type_name->type; + ret = count; + break; + } + } + return ret; +} + +static ssize_t matching_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_filter *filter = container_of(kobj, + struct damon_sysfs_filter, kobj); + + return sysfs_emit(buf, "%c\n", filter->matching ? 'Y' : 'N'); +} + +static ssize_t matching_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_filter *filter = container_of(kobj, + struct damon_sysfs_filter, kobj); + bool matching; + int err = kstrtobool(buf, &matching); + + if (err) + return err; + + filter->matching = matching; + return count; +} + +static ssize_t allow_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_filter *filter = container_of(kobj, + struct damon_sysfs_filter, kobj); + + return sysfs_emit(buf, "%c\n", filter->allow ? 
'Y' : 'N'); +} + +static ssize_t allow_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_filter *filter = container_of(kobj, + struct damon_sysfs_filter, kobj); + bool allow; + int err = kstrtobool(buf, &allow); + + if (err) + return err; + + filter->allow = allow; + return count; +} + static void damon_sysfs_filter_release(struct kobject *kobj) { struct damon_sysfs_filter *filter = container_of(kobj, @@ -768,7 +870,19 @@ static void damon_sysfs_filter_release(struct kobject *kobj) kfree(filter); } +static struct kobj_attribute damon_sysfs_filter_type_attr = + __ATTR_RW_MODE(type, 0600); + +static struct kobj_attribute damon_sysfs_filter_matching_attr = + __ATTR_RW_MODE(matching, 0600); + +static struct kobj_attribute damon_sysfs_filter_allow_attr = + __ATTR_RW_MODE(allow, 0600); + static struct attribute *damon_sysfs_filter_attrs[] = { + &damon_sysfs_filter_type_attr.attr, + &damon_sysfs_filter_matching_attr.attr, + &damon_sysfs_filter_allow_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_filter); From 24e969aa296c1b02b797420a428315a525540420 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:41:02 -0700 Subject: [PATCH 193/321] mm/damon/sysfs: setup probes on DAMON core API parameters Add user-installed data probes to DAMON core API parameters, so that user inputs for data probes are passed to DAMON core. Link: https://lore.kernel.org/20260518234119.97569-15-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 51a4f05c9275..eeb7fdd030cf 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1855,6 +1855,40 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx, return damon_set_attrs(ctx, &attrs); } +static int damon_sysfs_set_probes(struct damon_ctx *ctx, + struct damon_sysfs_probes *sys_probes) +{ + int i; + + for (i = 0; i < sys_probes->nr; i++) { + struct damon_sysfs_filters *sys_filters = + sys_probes->probes_arr[i]->filters; + struct damon_probe *c; + int j; + + if (!sys_filters) + continue; + c = damon_new_probe(); + if (!c) + return -ENOMEM; + damon_add_probe(ctx, c); + + for (j = 0; j < sys_filters->nr; j++) { + struct damon_sysfs_filter *sys_filter = + sys_filters->filters_arr[j]; + struct damon_filter *filter; + + filter = damon_new_filter(sys_filter->type, + sys_filter->matching, + sys_filter->allow); + if (!filter) + return -ENOMEM; + damon_add_filter(c, filter); + } + } + return 0; +} + static int damon_sysfs_set_regions(struct damon_target *t, struct damon_sysfs_regions *sysfs_regions, unsigned long min_region_sz) @@ -1967,6 +2001,9 @@ static int damon_sysfs_apply_inputs(struct damon_ctx *ctx, DAMON_MIN_REGION_SZ / sys_ctx->addr_unit, 1); ctx->pause = sys_ctx->pause; err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); + if (err) + return err; + err = damon_sysfs_set_probes(ctx, sys_ctx->attrs->probes); if (err) return err; err = damon_sysfs_add_targets(ctx, sys_ctx->targets); From b574a82d10de9c32ddc005c6a5d92e037f35ed43 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:41:03 -0700 Subject: [PATCH 194/321] mm/damon/sysfs-schemes: implement tried_regions/<N>/probes/ 
Implement a sysfs directory for showing the per-region probe hit counts. It is named 'probes/' and located under the DAMOS tried region directory. Link: https://lore.kernel.org/20260518234119.97569-16-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 67 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 4 deletions(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 0d3021db0b99..3b66c3a757b2 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -10,6 +10,32 @@ #include "sysfs-common.h" +/* + * probes directory + */ + +struct damos_sysfs_probes { + struct kobject kobj; +}; + +static struct damos_sysfs_probes *damos_sysfs_probes_alloc(void) +{ + return kzalloc_obj(struct damos_sysfs_probes); +} + +static void damos_sysfs_probes_release(struct kobject *kobj) +{ + struct damos_sysfs_probes *probes = container_of(kobj, + struct damos_sysfs_probes, kobj); + + kfree(probes); +} + +static const struct kobj_type damos_sysfs_probes_ktype = { + .release = damos_sysfs_probes_release, + .sysfs_ops = &kobj_sysfs_ops, +}; + /* * scheme region directory */ @@ -20,6 +46,7 @@ struct damon_sysfs_scheme_region { unsigned int nr_accesses; unsigned int age; unsigned long sz_filter_passed; + struct damos_sysfs_probes *probes; struct list_head list; }; @@ -34,10 +61,36 @@ static struct damon_sysfs_scheme_region *damon_sysfs_scheme_region_alloc( sysfs_region->ar = region->ar; sysfs_region->nr_accesses = region->nr_accesses_bp / 10000; sysfs_region->age = region->age; + sysfs_region->probes = NULL; INIT_LIST_HEAD(&sysfs_region->list); return sysfs_region; } +static int damos_sysfs_region_add_dirs( + struct damon_sysfs_scheme_region 
*region) +{ + struct damos_sysfs_probes *probes = damos_sysfs_probes_alloc(); + int err; + + if (!probes) + return -ENOMEM; + err = kobject_init_and_add(&probes->kobj, &damos_sysfs_probes_ktype, + &region->kobj, "probes"); + if (err) { + kobject_put(&probes->kobj); + return err; + } + + region->probes = probes; + return 0; +} + +static void damos_sysfs_region_rm_dirs( + struct damon_sysfs_scheme_region *region) +{ + kobject_put(&region->probes->kobj); +} + static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -163,6 +216,7 @@ static void damon_sysfs_scheme_regions_rm_dirs( struct damon_sysfs_scheme_region *r, *next; list_for_each_entry_safe(r, next, &regions->regions_list, list) { + damos_sysfs_region_rm_dirs(r); list_del(&r->list); kobject_put(&r->kobj); regions->nr_regions--; @@ -2995,12 +3049,17 @@ void damos_sysfs_populate_region_dir(struct damon_sysfs_schemes *sysfs_schemes, if (kobject_init_and_add(&region->kobj, &damon_sysfs_scheme_region_ktype, &sysfs_regions->kobj, "%d", - sysfs_regions->nr_regions)) { - kobject_put(&region->kobj); - return; - } + sysfs_regions->nr_regions)) + goto out; + if (damos_sysfs_region_add_dirs(region)) + goto out; + list_add_tail(&region->list, &sysfs_regions->regions_list); sysfs_regions->nr_regions++; + return; + +out: + kobject_put(&region->kobj); } int damon_sysfs_schemes_clear_regions( From a1536db4dc8b9045e4ab13da4fe44b3d2b68f8ed Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:41:04 -0700 Subject: [PATCH 195/321] mm/damon/sysfs-schemes: implement probe dir Implement sysfs directory for showing per-probe hits count of each region. Link: https://lore.kernel.org/20260518234119.97569-17-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R.
Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 101 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 95 insertions(+), 6 deletions(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 3b66c3a757b2..7e21e78d7751 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -10,12 +10,40 @@ #include "sysfs-common.h" +/* + * probe directory + */ + +struct damos_sysfs_probe { + struct kobject kobj; +}; + +static struct damos_sysfs_probe *damos_sysfs_probe_alloc(void) +{ + return kzalloc_obj(struct damos_sysfs_probe); +} + +static void damos_sysfs_probe_release(struct kobject *kobj) +{ + struct damos_sysfs_probe *probe = container_of(kobj, + struct damos_sysfs_probe, kobj); + + kfree(probe); +} + +static const struct kobj_type damos_sysfs_probe_ktype = { + .release = damos_sysfs_probe_release, + .sysfs_ops = &kobj_sysfs_ops, +}; + /* * probes directory */ struct damos_sysfs_probes { struct kobject kobj; + struct damos_sysfs_probe **probes_arr; + int nr; }; static struct damos_sysfs_probes *damos_sysfs_probes_alloc(void) @@ -23,6 +51,60 @@ static struct damos_sysfs_probes *damos_sysfs_probes_alloc(void) return kzalloc_obj(struct damos_sysfs_probes); } +static void damos_sysfs_probes_rm_dirs(struct damos_sysfs_probes *probes) +{ + struct damos_sysfs_probe **probes_arr = probes->probes_arr; + int i; + + for (i = 0; i < probes->nr; i++) + kobject_put(&probes_arr[i]->kobj); + probes->nr = 0; + kfree(probes_arr); + probes->probes_arr = NULL; +} + +static int damos_sysfs_probes_add_dirs(struct damos_sysfs_probes *probes, + struct damon_ctx *ctx) +{ + struct damon_probe *probe; + struct damos_sysfs_probe **probes_arr; + int i = 0; + + damon_for_each_probe(probe, ctx) + i++; + + if (!i) + return 0; + + probes_arr = 
kmalloc_objs(*probes_arr, i); + if (!probes_arr) + return -ENOMEM; + probes->probes_arr = probes_arr; + + i = 0; + damon_for_each_probe(probe, ctx) { + struct damos_sysfs_probe *sys_probe; + int err; + + sys_probe = damos_sysfs_probe_alloc(); + if (!sys_probe) { + damos_sysfs_probes_rm_dirs(probes); + return -ENOMEM; + } + err = kobject_init_and_add(&sys_probe->kobj, + &damos_sysfs_probe_ktype, &probes->kobj, "%d", + i); + if (err) { + kobject_put(&sys_probe->kobj); + damos_sysfs_probes_rm_dirs(probes); + return err; + } + probes_arr[i++] = sys_probe; + probes->nr++; + } + return 0; +} + static void damos_sysfs_probes_release(struct kobject *kobj) { struct damos_sysfs_probes *probes = container_of(kobj, @@ -67,7 +149,8 @@ static struct damon_sysfs_scheme_region *damon_sysfs_scheme_region_alloc( } static int damos_sysfs_region_add_dirs( - struct damon_sysfs_scheme_region *region) + struct damon_sysfs_scheme_region *region, + struct damon_ctx *ctx) { struct damos_sysfs_probes *probes = damos_sysfs_probes_alloc(); int err; @@ -76,18 +159,24 @@ static int damos_sysfs_region_add_dirs( return -ENOMEM; err = kobject_init_and_add(&probes->kobj, &damos_sysfs_probes_ktype, &region->kobj, "probes"); - if (err) { - kobject_put(&probes->kobj); - return err; - } + if (err) + goto fail; + err = damos_sysfs_probes_add_dirs(probes, ctx); + if (err) + goto fail; region->probes = probes; return 0; + +fail: + kobject_put(&probes->kobj); + return err; } static void damos_sysfs_region_rm_dirs( struct damon_sysfs_scheme_region *region) { + damos_sysfs_probes_rm_dirs(region->probes); kobject_put(&region->probes->kobj); } @@ -3051,7 +3140,7 @@ void damos_sysfs_populate_region_dir(struct damon_sysfs_schemes *sysfs_schemes, &sysfs_regions->kobj, "%d", sysfs_regions->nr_regions)) goto out; - if (damos_sysfs_region_add_dirs(region)) + if (damos_sysfs_region_add_dirs(region, ctx)) goto out; list_add_tail(&region->list, &sysfs_regions->regions_list); From 5b0de1bc3325c34e341fe0f5314292c57b4616b9 Mon Sep
17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:41:05 -0700 Subject: [PATCH 196/321] mm/damon/sysfs-schemes: implement probe/hits file Implement sysfs file for showing the per-region per-probe hits count. Link: https://lore.kernel.org/20260518234119.97569-18-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 41 +++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 7e21e78d7751..e25f4824b72f 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -16,11 +16,27 @@ struct damos_sysfs_probe { struct kobject kobj; + unsigned char hits; }; -static struct damos_sysfs_probe *damos_sysfs_probe_alloc(void) +static struct damos_sysfs_probe *damos_sysfs_probe_alloc(unsigned char hits) { - return kzalloc_obj(struct damos_sysfs_probe); + struct damos_sysfs_probe *probe; + + probe = kzalloc_obj(*probe); + if (!probe) + return NULL; + probe->hits = hits; + return probe; +} + +static ssize_t hits_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damos_sysfs_probe *probe = container_of(kobj, + struct damos_sysfs_probe, kobj); + + return sysfs_emit(buf, "%hhu\n", probe->hits); } static void damos_sysfs_probe_release(struct kobject *kobj) @@ -31,9 +47,19 @@ static void damos_sysfs_probe_release(struct kobject *kobj) kfree(probe); } +static struct kobj_attribute damos_sysfs_probe_hits_attr = + __ATTR_RO_MODE(hits, 0400); + +static struct attribute *damos_sysfs_probe_attrs[] = { + &damos_sysfs_probe_hits_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damos_sysfs_probe); + static const struct kobj_type damos_sysfs_probe_ktype = { 
.release = damos_sysfs_probe_release, .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damos_sysfs_probe_groups, }; /* @@ -64,7 +90,7 @@ static void damos_sysfs_probes_rm_dirs(struct damos_sysfs_probes *probes) } static int damos_sysfs_probes_add_dirs(struct damos_sysfs_probes *probes, - struct damon_ctx *ctx) + struct damon_ctx *ctx, struct damon_region *region) { struct damon_probe *probe; struct damos_sysfs_probe **probes_arr; @@ -86,7 +112,7 @@ static int damos_sysfs_probes_add_dirs(struct damos_sysfs_probes *probes, struct damos_sysfs_probe *sys_probe; int err; - sys_probe = damos_sysfs_probe_alloc(); + sys_probe = damos_sysfs_probe_alloc(region->probe_hits[i]); if (!sys_probe) { damos_sysfs_probes_rm_dirs(probes); return -ENOMEM; @@ -150,7 +176,8 @@ static struct damon_sysfs_scheme_region *damon_sysfs_scheme_region_alloc( static int damos_sysfs_region_add_dirs( struct damon_sysfs_scheme_region *region, - struct damon_ctx *ctx) + struct damon_ctx *ctx, + struct damon_region *dregion) { struct damos_sysfs_probes *probes = damos_sysfs_probes_alloc(); int err; @@ -161,7 +188,7 @@ static int damos_sysfs_region_add_dirs( &region->kobj, "probes"); if (err) goto fail; - err = damos_sysfs_probes_add_dirs(probes, ctx); + err = damos_sysfs_probes_add_dirs(probes, ctx, dregion); if (err) goto fail; @@ -3140,7 +3167,7 @@ void damos_sysfs_populate_region_dir(struct damon_sysfs_schemes *sysfs_schemes, &sysfs_regions->kobj, "%d", sysfs_regions->nr_regions)) goto out; - if (damos_sysfs_region_add_dirs(region, ctx)) + if (damos_sysfs_region_add_dirs(region, ctx, r)) goto out; list_add_tail(&region->list, &sysfs_regions->regions_list); From b9b7bad279de29294c4d3314fe90fca345c38ea6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:41:06 -0700 Subject: [PATCH 197/321] mm/damon: trace probe_hits Introduce a new tracepoint for exposing the per-region per-probe positive sample count via tracefs.
Link: https://lore.kernel.org/20260518234119.97569-19-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/trace/events/damon.h | 38 ++++++++++++++++++++++++++++++++++++ mm/damon/core.c | 9 +++++++++ 2 files changed, 47 insertions(+) diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h index 7e25f4469b81..78388538acf4 100644 --- a/include/trace/events/damon.h +++ b/include/trace/events/damon.h @@ -130,6 +130,44 @@ TRACE_EVENT(damon_monitor_intervals_tune, TP_printk("sample_us=%lu", __entry->sample_us) ); +TRACE_EVENT_CONDITION(damon_region_aggregated, + + TP_PROTO(unsigned int target_id, struct damon_region *r, + unsigned int nr_regions, unsigned int nr_probes), + + TP_ARGS(target_id, r, nr_regions, nr_probes), + + TP_CONDITION(nr_probes > 0), + + TP_STRUCT__entry( + __field(unsigned long, target_id) + __field(unsigned long, start) + __field(unsigned long, end) + __field(unsigned int, nr_regions) + __field(unsigned int, nr_accesses) + __field(unsigned int, age) + __dynamic_array(unsigned char, probe_hits, nr_probes) + ), + + TP_fast_assign( + __entry->target_id = target_id; + __entry->start = r->ar.start; + __entry->end = r->ar.end; + __entry->nr_regions = nr_regions; + __entry->nr_accesses = r->nr_accesses; + __entry->age = r->age; + memcpy(__get_dynamic_array(probe_hits), r->probe_hits, + sizeof(*r->probe_hits) * nr_probes); + ), + + TP_printk("target_id=%lu nr_regions=%u %lu-%lu: %u %u probe_hits=%s", + __entry->target_id, __entry->nr_regions, + __entry->start, __entry->end, + __entry->nr_accesses, __entry->age, + __print_hex(__get_dynamic_array(probe_hits), + __get_dynamic_array_len(probe_hits))) +); + TRACE_EVENT(damon_aggregated, TP_PROTO(unsigned int target_id, 
struct damon_region *r, diff --git a/mm/damon/core.c b/mm/damon/core.c index 500e8b08d441..903fd6fc9789 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1905,6 +1905,13 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) { struct damon_target *t; unsigned int ti = 0; /* target's index */ + unsigned int nr_probes = 0; + struct damon_probe *probe; + + if (trace_damon_region_aggregated_enabled()) { + damon_for_each_probe(probe, c) + nr_probes++; + } damon_for_each_target(t, c) { struct damon_region *r; @@ -1913,6 +1920,8 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) int i; trace_damon_aggregated(ti, r, damon_nr_regions(t)); + trace_damon_region_aggregated(ti, r, + damon_nr_regions(t), nr_probes); damon_warn_fix_nr_accesses_corruption(r); r->last_nr_accesses = r->nr_accesses; r->nr_accesses = 0; From 14885da09b0f3350004c80202fbe533d50336c8c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:41:07 -0700 Subject: [PATCH 198/321] selftests/damon/sysfs.sh: test probes dir Add simple existence tests for data probes sysfs directories and files. Link: https://lore.kernel.org/20260518234119.97569-20-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.sh | 48 ++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh index 83e3b7f63d81..1ac3e2ce8e44 100755 --- a/tools/testing/selftests/damon/sysfs.sh +++ b/tools/testing/selftests/damon/sysfs.sh @@ -291,11 +291,59 @@ test_intervals() ensure_file "$intervals_dir/update_us" "exist" "600" } +test_damon_filter() +{ + damon_filter_dir=$1 + ensure_file "$damon_filter_dir/type" "exist" "600" + ensure_write_succ "$damon_filter_dir/type" "anon" "valid input" + ensure_write_fail "$damon_filter_dir/type" "foo" "invalid input" + ensure_file "$damon_filter_dir/matching" "exist" "600" + ensure_file "$damon_filter_dir/allow" "exist" "600" +} + +test_damon_filters() +{ + filters_dir=$1 + ensure_dir "$filters_dir" "exist" + ensure_file "$filters_dir/nr_filters" "exist" "600" + ensure_write_succ "$filters_dir/nr_filters" "1" "valid input" + test_damon_filter "$filters_dir/0" + + ensure_write_succ "$filters_dir/nr_filters" "2" "valid input" + test_damon_filter "$filters_dir/0" + test_damon_filter "$filters_dir/1" + + ensure_write_succ "$filters_dir/nr_filters" "0" "valid input" + ensure_dir "$filters_dir/0" "not_exist" + ensure_dir "$filters_dir/1" "not_exist" +} + +test_probe() +{ + probe_dir=$1 + ensure_dir "$probe_dir" "exist" + test_damon_filters "$probe_dir/filters" +} + +test_probes() +{ + probes_dir=$1 + ensure_dir "$probes_dir" "exist" + ensure_file "$probes_dir/nr_probes" "exist" "600" + + ensure_write_succ "$probes_dir/nr_probes" "1" "valid input" + test_probe "$probes_dir/0" + + ensure_write_succ "$probes_dir/nr_probes" "0" "valid input" + ensure_dir "$probes_dir/0" "not_exist" +} + test_monitoring_attrs() { 
monitoring_attrs_dir=$1 ensure_dir "$monitoring_attrs_dir" "exist" test_intervals "$monitoring_attrs_dir/intervals" + test_probes "$monitoring_attrs_dir/probes" test_range "$monitoring_attrs_dir/nr_regions" } From f4e98954234b104c23902ee5bb4e59be6f9904a7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:41:08 -0700 Subject: [PATCH 199/321] Docs/mm/damon/design: document data attributes monitoring Update DAMON design document for newly added data attributes monitoring feature. Link: https://lore.kernel.org/20260518234119.97569-21-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 37 +++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index fa7392b5a331..6731c3102d0f 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -276,6 +276,43 @@ interval``, DAMON checks if the region's size and access frequency (``nr_accesses``) has significantly changed. If so, the counter is reset to zero. Otherwise, the counter is increased. +Data Attributes Monitoring +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Data access pattern is only one type of data attributes. In some use cases, +users need to know more data attributes information. For example, users may +need to know how much of a given hot or cold memory region is backed by +anonymous pages, or belong to a specific cgroup. For such use case, data +attributes monitoring feature is provided. + +Using the feature, users can register data attributes of their interest to the +DAMON :ref:`context `. The +registration is made by specifying a probe per attribute. 
Each of the probes +specifies a rule to determine if a given memory region has the related +attribute. The rule is constructed with multiple filters. The filters work +the same as :ref:`DAMOS filters ` except the supported +filter types. Currently only ``anon`` filter type is supported for data +attributes monitoring. + +If such probes are registered, DAMON executes the probes for each region's +sampling memory when it does the access :ref:`sampling +`. The number of samples that are identified +as having the data attribute (hitting the probe) per :ref:`aggregation interval +` is accounted in a per-region per-probe counter. +Users can therefore know how much of a given DAMON region has a specific data +attribute by reading the per-region per-probe probe hits counter after each +aggregation interval. + +This is a sampling based mechanism. Hence, it is lightweight but the output +may include some measurement errors. The output should be used with good +understanding of statistics. + +Another way to do this for higher accuracy is using :ref:`DAMOS filter +` with ``stat`` :ref:`action +` and ``sz_ops_filter_passed`` :ref:`stat +`. This approach provides the data attributes +information in page level. But, because it is operated in page level, the +overhead is proportional to the size of the memory. Dynamic Target Space Updates Handling ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 69a743520114b2a9c47db37059d25abe2a84e8f5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:41:09 -0700 Subject: [PATCH 200/321] Docs/admin-guide/mm/damon/usage: document data attributes monitoring Update DAMON usage document for the newly added data attributes monitoring feature. Link: https://lore.kernel.org/20260518234119.97569-22-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R.
Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 44 ++++++++++++++++++-- Documentation/mm/damon/design.rst | 2 + 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 11c75a598393..5cf55ff6de31 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -72,6 +72,11 @@ comma (","). │ │ │ │ │ │ intervals/sample_us,aggr_us,update_us │ │ │ │ │ │ │ intervals_goal/access_bp,aggrs,min_sample_us,max_sample_us │ │ │ │ │ │ nr_regions/min,max + │ │ │ │ │ │ :ref:`probes `/nr_probes + │ │ │ │ │ │ │ 0/filters/nr_filters + │ │ │ │ │ │ │ │ 0/type,matching,allow + │ │ │ │ │ │ │ │ ... + │ │ │ │ │ │ │ ... │ │ │ │ │ :ref:`targets `/nr_targets │ │ │ │ │ │ :ref:`0 `/pid_target,obsolete_target │ │ │ │ │ │ │ :ref:`regions `/nr_regions @@ -98,6 +103,9 @@ comma (","). │ │ │ │ │ │ │ :ref:`stats `/nr_tried,sz_tried,nr_applied,sz_applied,sz_ops_filter_passed,qt_exceeds,nr_snapshots,max_nr_snapshots │ │ │ │ │ │ │ :ref:`tried_regions `/total_bytes │ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age,sz_filter_passed + │ │ │ │ │ │ │ │ │ probes + │ │ │ │ │ │ │ │ │ │ 0/hits + │ │ │ │ │ │ │ │ │ │ ... │ │ │ │ │ │ │ │ ... │ │ │ │ │ │ ... │ │ │ │ ... @@ -227,8 +235,8 @@ contexts//monitoring_attrs/ Files for specifying attributes of the monitoring including required quality and efficiency of the monitoring are in ``monitoring_attrs`` directory. -Specifically, two directories, ``intervals`` and ``nr_regions`` exist in this -directory. +Specifically, three directories, ``intervals``, ``nr_regions`` and ``probes`` +exist in this directory. 
Under ``intervals`` directory, three files for DAMON's sampling interval (``sample_us``), aggregation interval (``aggr_us``), and update interval @@ -262,6 +270,27 @@ tuning-applied current values of the two intervals can be read from the ``sample_us`` and ``aggr_us`` files after writing ``update_tuned_intervals`` to the ``state`` file. +.. _damon_usage_sysfs_probes: + +contexts//monitoring_attrs/probes/ +------------------------------------- + +A directory for registering :ref:`data attributes monitoring +` probes. + +In the beginning, this directory has only one file, ``nr_probes``. Writing a +number (``N``) to the file creates the number of child directories named ``0`` +to ``N-1``. Each directory represents each monitoring probe. + +In each probe directory, one directory, ``filters`` exists. The directory +contains files for installing filters for the probe, that are used to determine +the data attribute for the probe. + +In the beginning, ``filters`` directory has only one file, ``nr_filters``. +Writing a number (``N``) to the file creates the number of child directories +named ``0`` to ``N-1``. Each directory represents each filter and works in a +way similar to that for :ref:`DAMOS filter `. + .. _sysfs_targets: contexts//targets/ @@ -615,10 +644,19 @@ tried_regions// ------------------ In each region directory, you will find five files (``start``, ``end``, -``nr_accesses``, ``age``, and ``sz_filter_passed``). Reading the files will +``nr_accesses``, ``age`` and ``sz_filter_passed``). Reading the files will show the properties of the region that corresponding DAMON-based operation scheme ``action`` has tried to be applied. +tried_regions//probes/ +------------------------- + +In each region directory, one directory (``probes``) also exists. In the +directory, subdirectories named ``0`` to ``N-1`` exist. ``N`` is the number +of installed probes. In each number-named directory, a file (``hits``) exists.
+Reading the file shows the number of data attributes monitoring probe-hit +positive samples of the region. + Example ~~~~~~~ diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 6731c3102d0f..887b45cbeb71 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -276,6 +276,8 @@ interval``, DAMON checks if the region's size and access frequency (``nr_accesses``) has significantly changed. If so, the counter is reset to zero. Otherwise, the counter is increased. +.. _damon_design_data_attrs_monitoring: + Data Attributes Monitoring ~~~~~~~~~~~~~~~~~~~~~~~~~~ From d9f23f2f822a59771fdc3cab648785d4f651e1b2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:41:10 -0700 Subject: [PATCH 201/321] mm/damon/core: introduce DAMON_FILTER_TYPE_MEMCG Belonging memory cgroup is another data attribute that can be useful to monitor. Introduce a new DAMON filter type, namely DAMON_FILTER_TYPE_MEMCG, for monitoring of this attribute. Link: https://lore.kernel.org/20260518234119.97569-23-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++++ mm/damon/core.c | 14 ++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 1fb271a35e98..6a54c601889b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -742,9 +742,11 @@ struct damon_intervals_goal { * enum damon_filter_type - Type of &struct damon_filter * * @DAMON_FILTER_TYPE_ANON: Anonymous pages. + * @DAMON_FILTER_TYPE_MEMCG: Specific memcg's pages.
*/ enum damon_filter_type { DAMON_FILTER_TYPE_ANON, + DAMON_FILTER_TYPE_MEMCG, }; /** @@ -753,12 +755,16 @@ enum damon_filter_type { * @type: Type of the region. * @matching: Whether this filter is for the type-matching ones. * @allow: Whether the @type-@matching ones should pass this filter. + * @memcg_id: Memcg id of the question if @type is DAMON_FILTER_TYPE_MEMCG. * @list: Siblings list. */ struct damon_filter { enum damon_filter_type type; bool matching; bool allow; + union { + u64 memcg_id; + }; struct list_head list; }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 903fd6fc9789..9a5a835a4d3f 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1430,6 +1430,13 @@ static void damon_commit_filter(struct damon_filter *dst, dst->type = src->type; dst->matching = src->matching; dst->allow = src->allow; + switch (dst->type) { + case DAMON_FILTER_TYPE_MEMCG: + dst->memcg_id = src->memcg_id; + break; + default: + break; + } } static int damon_commit_filters(struct damon_probe *dst, @@ -1454,6 +1461,13 @@ static int damon_commit_filters(struct damon_probe *dst, src_filter->matching, src_filter->allow); if (!new_filter) return -ENOMEM; + switch (src_filter->type) { + case DAMON_FILTER_TYPE_MEMCG: + new_filter->memcg_id = src_filter->memcg_id; + break; + default: + break; + } damon_add_filter(dst, new_filter); } return 0; From ba3be5430ffa7e5debec2e0fe61518a2db0489ca Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:41:11 -0700 Subject: [PATCH 202/321] mm/damon/paddr: support DAMON_FILTER_TYPE_MEMCG Implement the support of DAMON_FILTER_TYPE_MEMCG on the DAMON operation set implementation for the physical address space. Link: https://lore.kernel.org/20260518234119.97569-24-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R.
Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 9997c5174ef1..d0598f5f2688 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -124,6 +124,7 @@ static bool damon_pa_filter_match(struct damon_filter *filter, struct folio *folio) { bool matched = false; + struct mem_cgroup *memcg; switch (filter->type) { case DAMON_FILTER_TYPE_ANON: @@ -133,6 +134,19 @@ static bool damon_pa_filter_match(struct damon_filter *filter, } matched = folio_test_anon(folio); break; + case DAMON_FILTER_TYPE_MEMCG: + if (!folio) { + matched = false; + break; + } + rcu_read_lock(); + memcg = folio_memcg_check(folio); + if (!memcg) + matched = false; + else + matched = filter->memcg_id == mem_cgroup_id(memcg); + rcu_read_unlock(); + break; default: break; } From c71f8e13462d6eab9928f579c15c0a4b16abab84 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:41:12 -0700 Subject: [PATCH 203/321] mm/damon/sysfs: add filters//path file Introduce a new DAMON sysfs file for letting users setup the target memory cgroup of the belonging memory cgroup attribute monitoring. The file is named 'path', located under the probe filter directory. Users can set the target memory cgroup by writing the path to the memory cgroup from the cgroup mount point to the file. Link: https://lore.kernel.org/20260518234119.97569-25-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index eeb7fdd030cf..0f6379caf481 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -756,6 +756,7 @@ struct damon_sysfs_filter { enum damon_filter_type type; bool matching; bool allow; + char *path; }; static struct damon_sysfs_filter *damon_sysfs_filter_alloc(void) @@ -774,6 +775,10 @@ damon_sysfs_filter_type_names[] = { .type = DAMON_FILTER_TYPE_ANON, .name = "anon", }, + { + .type = DAMON_FILTER_TYPE_MEMCG, + .name = "memcg", + }, }; static ssize_t type_show(struct kobject *kobj, @@ -862,11 +867,46 @@ static ssize_t allow_store(struct kobject *kobj, return count; } +static ssize_t path_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_filter *filter = container_of(kobj, + struct damon_sysfs_filter, kobj); + int len; + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + len = sysfs_emit(buf, "%s\n", filter->path ? 
filter->path : ""); + mutex_unlock(&damon_sysfs_lock); + return len; +} + +static ssize_t path_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_filter *filter = container_of(kobj, + struct damon_sysfs_filter, kobj); + char *path = kmalloc_objs(*path, size_add(count, 1)); + + if (!path) + return -ENOMEM; + strscpy(path, buf, size_add(count, 1)); + if (!mutex_trylock(&damon_sysfs_lock)) { + kfree(path); + return -EBUSY; + } + kfree(filter->path); + filter->path = path; + mutex_unlock(&damon_sysfs_lock); + return count; +} + static void damon_sysfs_filter_release(struct kobject *kobj) { struct damon_sysfs_filter *filter = container_of(kobj, struct damon_sysfs_filter, kobj); + kfree(filter->path); kfree(filter); } @@ -879,10 +919,14 @@ static struct kobj_attribute damon_sysfs_filter_matching_attr = static struct kobj_attribute damon_sysfs_filter_allow_attr = __ATTR_RW_MODE(allow, 0600); +static struct kobj_attribute damon_sysfs_filter_path_attr = + __ATTR_RW_MODE(path, 0600); + static struct attribute *damon_sysfs_filter_attrs[] = { &damon_sysfs_filter_type_attr.attr, &damon_sysfs_filter_matching_attr.attr, &damon_sysfs_filter_allow_attr.attr, + &damon_sysfs_filter_path_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_filter); From b2025ce0662b186b3158c25f7f9c25b4e6931acc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:41:13 -0700 Subject: [PATCH 204/321] mm/damon/sysfs-schemes: move memcg_path_to_id() to sysfs-common The next commit will need to find the memcg id from the user-passed path to the memory cgroup, from sysfs.c. memcg_path_to_id() is doing that, but defined in sysfs-schemes.c as a static function. Move the function to sysfs-common.c and mark it as non-static, so that the next commit can reuse the function. Link: https://lore.kernel.org/20260518234119.97569-26-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/sysfs-common.c | 41 ++++++++++++++++++++++++++++++++++++++++ mm/damon/sysfs-common.h | 2 ++ mm/damon/sysfs-schemes.c | 41 ---------------------------------------- 3 files changed, 43 insertions(+), 41 deletions(-) diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c index 83e24a9b5a0d..bdc6ae2639e4 100644 --- a/mm/damon/sysfs-common.c +++ b/mm/damon/sysfs-common.c @@ -104,3 +104,44 @@ const struct kobj_type damon_sysfs_ul_range_ktype = { .default_groups = damon_sysfs_ul_range_groups, }; + +static bool damon_sysfs_memcg_path_eq(struct mem_cgroup *memcg, + char *memcg_path_buf, char *path) +{ +#ifdef CONFIG_MEMCG + cgroup_path(memcg->css.cgroup, memcg_path_buf, PATH_MAX); + if (sysfs_streq(memcg_path_buf, path)) + return true; +#endif /* CONFIG_MEMCG */ + return false; +} + +int damon_sysfs_memcg_path_to_id(char *memcg_path, u64 *id) +{ + struct mem_cgroup *memcg; + char *path; + bool found = false; + + if (!memcg_path) + return -EINVAL; + + path = kmalloc_array(PATH_MAX, sizeof(*path), GFP_KERNEL); + if (!path) + return -ENOMEM; + + for (memcg = mem_cgroup_iter(NULL, NULL, NULL); memcg; + memcg = mem_cgroup_iter(NULL, memcg, NULL)) { + /* skip offlined memcg */ + if (!mem_cgroup_online(memcg)) + continue; + if (damon_sysfs_memcg_path_eq(memcg, path, memcg_path)) { + *id = mem_cgroup_id(memcg); + found = true; + mem_cgroup_iter_break(NULL, memcg); + break; + } + } + + kfree(path); + return found ? 
0 : -EINVAL; +} diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 2099adee11d0..3079306966a9 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -59,3 +59,5 @@ int damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes, void damos_sysfs_update_effective_quotas( struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx); + +int damon_sysfs_memcg_path_to_id(char *memcg_path, u64 *id); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index e25f4824b72f..329cfd0bbe9f 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -2791,47 +2791,6 @@ const struct kobj_type damon_sysfs_schemes_ktype = { .default_groups = damon_sysfs_schemes_groups, }; -static bool damon_sysfs_memcg_path_eq(struct mem_cgroup *memcg, - char *memcg_path_buf, char *path) -{ -#ifdef CONFIG_MEMCG - cgroup_path(memcg->css.cgroup, memcg_path_buf, PATH_MAX); - if (sysfs_streq(memcg_path_buf, path)) - return true; -#endif /* CONFIG_MEMCG */ - return false; -} - -static int damon_sysfs_memcg_path_to_id(char *memcg_path, u64 *id) -{ - struct mem_cgroup *memcg; - char *path; - bool found = false; - - if (!memcg_path) - return -EINVAL; - - path = kmalloc_array(PATH_MAX, sizeof(*path), GFP_KERNEL); - if (!path) - return -ENOMEM; - - for (memcg = mem_cgroup_iter(NULL, NULL, NULL); memcg; - memcg = mem_cgroup_iter(NULL, memcg, NULL)) { - /* skip offlined memcg */ - if (!mem_cgroup_online(memcg)) - continue; - if (damon_sysfs_memcg_path_eq(memcg, path, memcg_path)) { - *id = mem_cgroup_id(memcg); - found = true; - mem_cgroup_iter_break(NULL, memcg); - break; - } - } - - kfree(path); - return found ? 
0 : -EINVAL; -} - static int damon_sysfs_add_scheme_filters(struct damos *scheme, struct damon_sysfs_scheme_filters *sysfs_filters) { From 543ab01db7ace5bb28972ac70f321d55cc4f0214 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:41:14 -0700 Subject: [PATCH 205/321] mm/damon/sysfs: setup damon_filter->memcg_id from path Find and set the memcg_id for damon_filter from the user-passed memory cgroup path when updating the DAMON input parameters. Link: https://lore.kernel.org/20260518234119.97569-27-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + mm/damon/core.c | 2 +- mm/damon/sysfs.c | 11 +++++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 6a54c601889b..4014fd0d463c 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -1006,6 +1006,7 @@ static inline unsigned long damon_sz_region(struct damon_region *r) struct damon_filter *damon_new_filter(enum damon_filter_type type, bool matching, bool allow); void damon_add_filter(struct damon_probe *probe, struct damon_filter *f); +void damon_destroy_filter(struct damon_filter *f); struct damon_probe *damon_new_probe(void); void damon_add_probe(struct damon_ctx *ctx, struct damon_probe *probe); diff --git a/mm/damon/core.c b/mm/damon/core.c index 9a5a835a4d3f..4e223857a0f9 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -143,7 +143,7 @@ static void damon_free_filter(struct damon_filter *f) kfree(f); } -static void damon_destroy_filter(struct damon_filter *f) +void damon_destroy_filter(struct damon_filter *f) { damon_del_filter(f); damon_free_filter(f); diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 
0f6379caf481..2e95e3bac774 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1927,6 +1927,17 @@ static int damon_sysfs_set_probes(struct damon_ctx *ctx, sys_filter->allow); if (!filter) return -ENOMEM; + if (filter->type == DAMON_FILTER_TYPE_MEMCG) { + int err; + + err = damon_sysfs_memcg_path_to_id( + sys_filter->path, + &filter->memcg_id); + if (err) { + damon_destroy_filter(filter); + return err; + } + } damon_add_filter(c, filter); } } From 2fd777ebdfaafaead833a04882cbe8b1cdc5bdf1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:41:15 -0700 Subject: [PATCH 206/321] Docs/mm/damon/design: update for memcg damon filter Update DAMON design document for the newly added belonging memory cgroup attribute monitoring feature. Link: https://lore.kernel.org/20260518234119.97569-28-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 887b45cbeb71..a24f9f00d183 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -293,8 +293,8 @@ registration is made by specifying a probe per attribute. Each of the probe specifies a rule to determine if a given memory region has the related attribute. The rule is constructed with multiple filters. The filters work same to :ref:`DAMOS filters ` except the supported -filter types. Currently only ``anon`` filter type is supported for data -attributes monitoring. +filter types. Currently only ``anon`` and ``memcg`` filter types are supported +for data attributes monitoring. 
If such probes are registered, DAMON executes the probes for each region's sampling memory when it does the access :ref:`sampling From 9d3678808a3e575088f22db306a000c4f4458dfe Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 18 May 2026 16:41:16 -0700 Subject: [PATCH 207/321] Docs/admin-guide/mm/damon/usage: update for memcg damon filter Update DAMON usage document for the newly added belonging memory cgroup attribute monitoring feature. Link: https://lore.kernel.org/20260518234119.97569-29-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 5cf55ff6de31..0d6a27dc97b0 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -74,7 +74,7 @@ comma (","). │ │ │ │ │ │ nr_regions/min,max │ │ │ │ │ │ :ref:`probes `/nr_probes │ │ │ │ │ │ │ 0/filters/nr_filters - │ │ │ │ │ │ │ │ 0/type,matching,allow + │ │ │ │ │ │ │ │ 0/type,matching,allow,path │ │ │ │ │ │ │ │ ... │ │ │ │ │ │ │ ... │ │ │ │ │ :ref:`targets `/nr_targets @@ -289,7 +289,9 @@ the data attribute for the probe. In the beginning, ``filters`` directory has only one file, ``nr_filters``. Writing a number (``N``) to the file creates the number of child directories named ``0`` to ``N-1``. Each directory represents each filter and works in a -way similar to that for :ref:`DAMOS filter `. +way similar to that for :ref:`DAMOS filter `. When the filter +``type`` is ``memcg``, ``path`` file acts as ``memcg_path`` for :ref:`DAMOS +filter `. .. 
_sysfs_targets: From 4f1839e22527f1621767721a90fa00425fbb0877 Mon Sep 17 00:00:00 2001 From: Shivam Kalra Date: Tue, 19 May 2026 17:42:14 +0530 Subject: [PATCH 208/321] mm/vmalloc: extract vm_area_free_pages() helper from vfree() Patch series "mm/vmalloc: free unused pages on vrealloc() shrink", v14. This series implements the TODO in vrealloc() to unmap and free unused pages when shrinking across a page boundary. Problem: When vrealloc() shrinks an allocation, it updates bookkeeping (requested_size, KASAN shadow) but does not free the underlying physical pages. This wastes memory for the lifetime of the allocation. Solution: - Patch 1: Extracts a vm_area_free_pages(vm, start_idx, end_idx) helper from vfree() that frees a range of pages with memcg and nr_vmalloc_pages accounting. Freed page pointers are set to NULL to prevent stale references. - Patch 2: Update the grow-in-place check in vrealloc() to compare the requested size against the actual physical page count (vm->nr_pages) rather than the virtual area sizes. This is a prerequisite for shrinking. - Patch 3: For VM_ALLOC areas in vread_iter(), derive the vm area size from vm->nr_pages rather than get_vm_area_size(), which would overestimate the mapped range after a shrink. Other mapping types (vmap, ioremap) don't set nr_pages and keep using get_vm_area_size(). - Patch 4: Uses the helper to free tail pages when vrealloc() shrinks across a page boundary. - Patch 5: Adds a vrealloc test case to lib/test_vmalloc that exercises grow-realloc, shrink-across-boundary, shrink-within-page, and grow-in-place paths. The virtual address reservation is kept intact to preserve the range for potential future grow-in-place support. A concrete user is the Rust binder driver's KVVec::shrink_to [1], which performs explicit vrealloc() shrinks for memory reclamation. This patch (of 5): Extract page freeing and NR_VMALLOC stat accounting from vfree() into a reusable vm_area_free_pages() helper. 
The helper operates on a range [start_idx, end_idx) of pages from a vm_struct, making it suitable for both full free (vfree) and partial free (upcoming vrealloc shrink). Freed page pointers in vm->pages[] are set to NULL to prevent stale references when the vm_struct outlives the free (as in vrealloc shrink). Link: https://lore.kernel.org/20260519-vmalloc-shrink-v14-0-70b96ee3e9c9@zohomail.in Link: https://lore.kernel.org/20260519-vmalloc-shrink-v14-1-70b96ee3e9c9@zohomail.in Link: https://lore.kernel.org/all/20260216-binder-shrink-vec-v3-v6-0-ece8e8593e53@zohomail.in/ [1] Signed-off-by: Shivam Kalra Reviewed-by: Uladzislau Rezki (Sony) Cc: Alice Ryhl Cc: Danilo Krummrich Signed-off-by: Andrew Morton --- mm/vmalloc.c | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index eabb86b13b7e..5555601b9529 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3416,6 +3416,32 @@ void vfree_atomic(const void *addr) schedule_work(&p->wq); } +/* + * vm_area_free_pages - free a range of pages from a vmalloc allocation + * @vm: the vm_struct containing the pages + * @start_idx: first page index to free (inclusive) + * @end_idx: last page index to free (exclusive) + * + * Free pages [start_idx, end_idx) updating NR_VMALLOC stat accounting. + * Freed vm->pages[] entries are set to NULL. + * Caller is responsible for unmapping (vunmap_range) and KASAN + * poisoning before calling this. 
+ */ +static void vm_area_free_pages(struct vm_struct *vm, unsigned int start_idx, + unsigned int end_idx) +{ + unsigned int i; + + if (!(vm->flags & VM_MAP_PUT_PAGES)) { + for (i = start_idx; i < end_idx; i++) + mod_lruvec_page_state(vm->pages[i], NR_VMALLOC, -1); + } + free_pages_bulk(vm->pages + start_idx, end_idx - start_idx); + + for (i = start_idx; i < end_idx; i++) + vm->pages[i] = NULL; +} + /** * vfree - Release memory allocated by vmalloc() * @addr: Memory base address @@ -3436,7 +3462,6 @@ void vfree_atomic(const void *addr) void vfree(const void *addr) { struct vm_struct *vm; - int i; if (unlikely(in_interrupt())) { vfree_atomic(addr); @@ -3460,12 +3485,7 @@ void vfree(const void *addr) if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS)) vm_reset_perms(vm); - if (!(vm->flags & VM_MAP_PUT_PAGES)) { - for (i = 0; i < vm->nr_pages; i++) - mod_lruvec_page_state(vm->pages[i], NR_VMALLOC, -1); - } - free_pages_bulk(vm->pages, vm->nr_pages); - + vm_area_free_pages(vm, 0, vm->nr_pages); kvfree(vm->pages); kfree(vm); } From d57ac904ffdce6c06e9a113fce603420c041b48c Mon Sep 17 00:00:00 2001 From: Shivam Kalra Date: Tue, 19 May 2026 17:42:15 +0530 Subject: [PATCH 209/321] mm/vmalloc: use physical page count for vrealloc() grow-in-place check Update the grow-in-place check in vrealloc() to compare the requested size against the actual physical page count (vm->nr_pages) rather than the virtual area size (alloced_size, derived from get_vm_area_size()). Currently both values are equivalent, but the upcoming vrealloc() shrink functionality will free pages without reducing the virtual reservation size. After such a shrink, the old alloced_size-based comparison would incorrectly allow a grow-in-place operation to succeed and attempt to access freed pages. Switch to vm->nr_pages now so the check remains correct once shrink support is added. 
Link: https://lore.kernel.org/20260519-vmalloc-shrink-v14-2-70b96ee3e9c9@zohomail.in Signed-off-by: Shivam Kalra Reviewed-by: Uladzislau Rezki (Sony) Cc: Alice Ryhl Cc: Danilo Krummrich Signed-off-by: Andrew Morton --- mm/vmalloc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5555601b9529..3e159b74cfab 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4343,6 +4343,12 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE && nid != page_to_nid(vmalloc_to_page(p))) goto need_realloc; + } else { + /* + * If p is NULL, vrealloc behaves exactly like vmalloc. + * Skip the shrink and in-place grow paths. + */ + goto need_realloc; } /* @@ -4361,7 +4367,7 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align /* * We already have the bytes available in the allocation; use them. */ - if (size <= alloced_size) { + if (size <= vm->nr_pages << PAGE_SHIFT) { /* * No need to zero memory here, as unused memory will have * already been zeroed at initial allocation time or during From 0bca23804632cc7275fc5f67191b6be58993cd28 Mon Sep 17 00:00:00 2001 From: Shivam Kalra Date: Tue, 19 May 2026 17:42:16 +0530 Subject: [PATCH 210/321] mm/vmalloc: use physical page count in vread_iter() for VM_ALLOC areas For VM_ALLOC areas in vread_iter(), derive the vm area size from vm->nr_pages rather than get_vm_area_size(). Only VM_ALLOC areas are subject to vrealloc() shrinking, which frees pages without reducing the virtual reservation size. Switch to using vm->nr_pages for VM_ALLOC areas so the reader remains correct once shrink support is added. Other mapping types (vmap, ioremap) do not initialize nr_pages and will continue using get_vm_area_size(). 
[shivamkalra98@zohomail.in: add an nr_pages check] Link: https://lore.kernel.org/aff47da5-4fd5-481d-be18-e1eb99639490@zohomail.in Link: https://lore.kernel.org/20260519-vmalloc-shrink-v14-3-70b96ee3e9c9@zohomail.in Signed-off-by: Shivam Kalra Reviewed-by: Uladzislau Rezki (Sony) Cc: Alice Ryhl Cc: Danilo Krummrich Signed-off-by: Andrew Morton --- mm/vmalloc.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3e159b74cfab..bc21bf8e188b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4666,7 +4666,18 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) smp_rmb(); vaddr = (char *) va->va_start; - size = vm ? get_vm_area_size(vm) : va_size(va); + if (vm) + /* + * For VM_ALLOC areas, use nr_pages rather than + * get_vm_area_size() because vrealloc() may shrink + * the mapping without updating area->size. Other + * mapping types (vmap, ioremap) don't set nr_pages. + */ + size = (vm->flags & VM_ALLOC && vm->nr_pages) ? + (vm->nr_pages << PAGE_SHIFT) : + get_vm_area_size(vm); + else + size = va_size(va); if (addr >= vaddr + size) goto next_va; From 5ea8ec74c57c0c920c26530ba586391a9a3f3e5f Mon Sep 17 00:00:00 2001 From: Shivam Kalra Date: Tue, 19 May 2026 17:42:17 +0530 Subject: [PATCH 211/321] mm/vmalloc: free unused pages on vrealloc() shrink When vrealloc() shrinks an allocation and the new size crosses a page boundary, unmap and free the tail pages that are no longer needed. This reclaims physical memory that was previously wasted for the lifetime of the allocation. The heuristic is simple: always free when at least one full page becomes unused. Huge page allocations (page_order > 0) are skipped, as partial freeing would require splitting. Allocations with VM_FLUSH_RESET_PERMS are also skipped, as their direct-map permissions must be reset before pages are returned to the page allocator, which is handled by vm_reset_perms() during vfree(). 
Additionally, allocations with VM_USERMAP are skipped because remap_vmalloc_range_partial() validates mapping requests against the unchanged vm->size; freeing tail pages would cause vmalloc_to_page() to return NULL for the unmapped range. To protect concurrent readers, the shrink path uses Node lock to synchronize before freeing the pages. Finally, we notify kmemleak of the reduced allocation size using kmemleak_free_part() to prevent the kmemleak scanner from faulting on the newly unmapped virtual addresses. The virtual address reservation (vm->size / vmap_area) is intentionally kept unchanged, preserving the address for potential future grow-in-place support. Link: https://lore.kernel.org/20260519-vmalloc-shrink-v14-4-70b96ee3e9c9@zohomail.in Signed-off-by: Shivam Kalra Suggested-by: Danilo Krummrich Reviewed-by: Uladzislau Rezki (Sony) Cc: Alice Ryhl Signed-off-by: Andrew Morton --- mm/vmalloc.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index bc21bf8e188b..1afca3568b9b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4351,14 +4351,62 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align goto need_realloc; } - /* - * TODO: Shrink the vm_area, i.e. unmap and free unused pages. What - * would be a good heuristic for when to shrink the vm_area? - */ if (size <= old_size) { + unsigned int new_nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; + /* Zero out "freed" memory, potentially for future realloc. */ if (want_init_on_free() || want_init_on_alloc(flags)) memset((void *)p + size, 0, old_size - size); + + /* + * Free tail pages when shrink crosses a page boundary. + * + * Skip huge page allocations (page_order > 0) as partial + * freeing would require splitting. + * + * Skip VM_FLUSH_RESET_PERMS, as direct-map permissions must + * be reset before pages are returned to the allocator. 
+ * + * Skip VM_USERMAP, as remap_vmalloc_range_partial() validates + * mapping requests against the unchanged vm->size; freeing + * tail pages would cause vmalloc_to_page() to return NULL for + * the unmapped range. + * + * Skip if either GFP_NOFS or GFP_NOIO are used. + * kmemleak_free_part() internally allocates with + * GFP_KERNEL, which could trigger a recursive deadlock + * if we are under filesystem or I/O reclaim. + */ + if (new_nr_pages < vm->nr_pages && !vm_area_page_order(vm) && + !(vm->flags & (VM_FLUSH_RESET_PERMS | VM_USERMAP)) && + gfp_has_io_fs(flags)) { + unsigned long addr = (unsigned long)kasan_reset_tag(p); + unsigned int old_nr_pages = vm->nr_pages; + + /* + * Use the node lock to synchronize with concurrent + * readers (vmalloc_info_show). + */ + struct vmap_node *vn = addr_to_node(addr); + + spin_lock(&vn->busy.lock); + vm->nr_pages = new_nr_pages; + spin_unlock(&vn->busy.lock); + + /* Notify kmemleak of the reduced allocation size before unmapping. */ + kmemleak_free_part( + (void *)addr + ((unsigned long)new_nr_pages + << PAGE_SHIFT), + (unsigned long)(old_nr_pages - new_nr_pages) + << PAGE_SHIFT); + + vunmap_range(addr + ((unsigned long)new_nr_pages + << PAGE_SHIFT), + addr + ((unsigned long)old_nr_pages + << PAGE_SHIFT)); + + vm_area_free_pages(vm, new_nr_pages, old_nr_pages); + } vm->requested_size = size; kasan_vrealloc(p, old_size, size); return (void *)p; From 3c3daeafcdb60e182554679fc32d2c912d1b0b6a Mon Sep 17 00:00:00 2001 From: Shivam Kalra Date: Tue, 19 May 2026 17:42:18 +0530 Subject: [PATCH 212/321] lib/test_vmalloc: add vrealloc test case Introduce a new test case "vrealloc_test" that exercises the vrealloc() shrink and in-place grow paths: - Grow beyond allocated pages (triggers full reallocation). - Shrink crossing a page boundary (frees tail pages). - Shrink within the same page (no page freeing). - Grow within the already allocated page count (in-place). 
Data integrity is validated after each realloc step by checking that the first byte of the original allocation is preserved. The test is gated behind run_test_mask bit 12 (id 4096). Link: https://lore.kernel.org/20260519-vmalloc-shrink-v14-5-70b96ee3e9c9@zohomail.in Signed-off-by: Shivam Kalra Reviewed-by: Uladzislau Rezki (Sony) Cc: Alice Ryhl Cc: Danilo Krummrich Signed-off-by: Andrew Morton --- lib/test_vmalloc.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index 876c72c18a0c..b23f85e8f8ca 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -55,6 +55,7 @@ __param(int, run_test_mask, 7, "\t\tid: 512, name: kvfree_rcu_2_arg_vmalloc_test\n" "\t\tid: 1024, name: vm_map_ram_test\n" "\t\tid: 2048, name: no_block_alloc_test\n" + "\t\tid: 4096, name: vrealloc_test\n" /* Add a new test case description here. */ ); @@ -421,6 +422,66 @@ cleanup: return nr_allocated != map_nr_pages; } +static int vrealloc_test(void) +{ + void *ptr, *tmp; + int i; + + for (i = 0; i < test_loop_count; i++) { + int err = -1; + + ptr = vrealloc(NULL, PAGE_SIZE, GFP_KERNEL); + if (!ptr) + return -1; + + *((__u8 *)ptr) = 'a'; + + /* Grow: beyond allocated pages, triggers full realloc. */ + tmp = vrealloc(ptr, 4 * PAGE_SIZE, GFP_KERNEL); + if (!tmp) + goto error; + ptr = tmp; + + if (*((__u8 *)ptr) != 'a') + goto error; + + /* Shrink: crosses page boundary, frees tail pages. */ + tmp = vrealloc(ptr, PAGE_SIZE, GFP_KERNEL); + if (!tmp) + goto error; + ptr = tmp; + + if (*((__u8 *)ptr) != 'a') + goto error; + + /* Shrink: within same page, no page freeing. */ + tmp = vrealloc(ptr, PAGE_SIZE / 2, GFP_KERNEL); + if (!tmp) + goto error; + ptr = tmp; + + if (*((__u8 *)ptr) != 'a') + goto error; + + /* Grow: within allocated page, in-place, no realloc. 
*/ + tmp = vrealloc(ptr, PAGE_SIZE, GFP_KERNEL); + if (!tmp) + goto error; + ptr = tmp; + + if (*((__u8 *)ptr) != 'a') + goto error; + + err = 0; +error: + vfree(ptr); + if (err) + return err; + } + + return 0; +} + struct test_case_desc { const char *test_name; int (*test_func)(void); @@ -440,6 +501,7 @@ static struct test_case_desc test_case_array[] = { { "kvfree_rcu_2_arg_vmalloc_test", kvfree_rcu_2_arg_vmalloc_test, }, { "vm_map_ram_test", vm_map_ram_test, }, { "no_block_alloc_test", no_block_alloc_test, true }, + { "vrealloc_test", vrealloc_test, }, /* Add a new test case here. */ }; From a2b8d7827f48ee54a686cb80e4a1d0ff954ec42a Mon Sep 17 00:00:00 2001 From: Georgi Djakov Date: Thu, 14 May 2026 02:26:57 -0700 Subject: [PATCH 213/321] drivers/base/memory: set mem->altmap after successful device registration If __add_memory_block() fails at xa_store() (under memory pressure for example), device_unregister() is called, which eventually triggers memory_block_release() with mem->altmap still set, causing a WARN_ON(mem->altmap). This was triggered by modifying virtio-mem driver. Fix this by delaying the assignment of mem->altmap until after __add_memory_block() has succeeded. Link: https://lore.kernel.org/20260514092657.3057141-1-georgi.djakov@oss.qualcomm.com Fixes: 1a8c64e11043 ("mm/memory_hotplug: embed vmem_altmap details in memory block") Signed-off-by: Georgi Djakov Acked-by: Oscar Salvador (SUSE) Cc: Vishal Verma Cc: Mike Rapoport Cc: Richard Cheng Cc: David Hildenbrand Cc: Georgi Djakov Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Cc: Signed-off-by: Andrew Morton --- drivers/base/memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index d31a421f7483..b318344426fa 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -797,7 +797,6 @@ static int add_memory_block(unsigned long block_id, int nid, unsigned long state mem->start_section_nr = block_id * sections_per_block; mem->state = state; mem->nid = nid; - mem->altmap = altmap; INIT_LIST_HEAD(&mem->group_next); #ifndef CONFIG_NUMA @@ -815,6 +814,8 @@ static int add_memory_block(unsigned long block_id, int nid, unsigned long state if (ret) return ret; + mem->altmap = altmap; + if (group) { mem->group = group; list_add(&mem->group_next, &group->memory_blocks); From a10848d98ec9c5372a979d7aa91a8b8fe50fd8f8 Mon Sep 17 00:00:00 2001 From: Kaitao Cheng Date: Thu, 14 May 2026 16:57:54 +0800 Subject: [PATCH 214/321] mm/memory-failure: use zone_pcp_disable() for poison handling __page_handle_poison() used drain_all_pages() instead of zone_pcp_disable() because dissolve_free_hugetlb_folio() could restore HVO vmemmap pages and decrement hugetlb_optimize_vmemmap_key. That static key update took cpu_hotplug_lock through static_key_slow_dec(), while zone_pcp_disable() holds pcp_batch_high_lock. CPU hotplug takes the locks in the opposite order through page_alloc_cpu_online/dead(), so the combination could deadlock. That dependency no longer exists. Commit da3e2d1ca43d ("mm/hugetlb: remove hugetlb_optimize_vmemmap_key static key") removed the HVO static key and the static_branch_dec() from hugetlb_vmemmap_restore_folio(). The dissolve_free_hugetlb_folio() path no longer reaches static_key_slow_dec(). Use zone_pcp_disable() again while dissolving the hugetlb folio and taking the target page off the buddy allocator. This prevents the drained PCP lists from being refilled before take_page_off_buddy() runs, making the page isolation deterministic. 
Link: https://lore.kernel.org/20260514085754.84097-1-kaitao.cheng@linux.dev Signed-off-by: Kaitao Cheng Reviewed-by: Oscar Salvador Acked-by: Miaohe Lin Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1b8d0bade04a..51508a55c405 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -172,23 +172,11 @@ static int __page_handle_poison(struct page *page) { int ret; - /* - * zone_pcp_disable() can't be used here. It will - * hold pcp_batch_high_lock and dissolve_free_hugetlb_folio() might hold - * cpu_hotplug_lock via static_key_slow_dec() when hugetlb vmemmap - * optimization is enabled. This will break current lock dependency - * chain and leads to deadlock. - * Disabling pcp before dissolving the page was a deterministic - * approach because we made sure that those pages cannot end up in any - * PCP list. Draining PCP lists expels those pages to the buddy system, - * but nothing guarantees that those pages do not get back to a PCP - * queue if we need to refill those. - */ + zone_pcp_disable(page_zone(page)); ret = dissolve_free_hugetlb_folio(page_folio(page)); - if (!ret) { - drain_all_pages(page_zone(page)); + if (!ret) ret = take_page_off_buddy(page); - } + zone_pcp_enable(page_zone(page)); return ret; } From f30462fc7d2370761b84eaf5b3ed84a03bdf3266 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 12 May 2026 23:15:23 +0800 Subject: [PATCH 215/321] mm/damon/vaddr: attempt per-vma lock during page table walk Currently, DAMON virtual address operations use mmap_read_lock during page table walks, which can cause unnecessary contention under high concurrency. Introduce damon_va_walk_page_range() to first attempt acquiring a per-vma lock. If the VMA is found and the range is fully contained within it, the page table walk proceeds with the per-vma lock instead of mmap_read_lock. 
This optimization is expected to be particularly effective for damon_va_young() and damon_va_mkold(), which are frequently called and typically operate within a single VMA. Link: https://lore.kernel.org/20260512151523.2092638-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: SeongJae Park Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 69 ++++++++++++++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 26 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 1b0ebe3b6951..d27147603564 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -237,6 +237,35 @@ static void damon_va_update(struct damon_ctx *ctx) } } +static void damon_va_walk_page_range(struct mm_struct *mm, unsigned long start, + unsigned long end, struct mm_walk_ops *ops, void *private) +{ + struct vm_area_struct *vma; + + vma = lock_vma_under_rcu(mm, start); + if (!vma) + goto lock_mmap; + + if (end > vma->vm_end) { + vma_end_read(vma); + goto lock_mmap; + } + + if (!(vma->vm_flags & VM_PFNMAP)) { + ops->walk_lock = PGWALK_VMA_RDLOCK_VERIFY; + walk_page_range_vma(vma, start, end, ops, private); + } + + vma_end_read(vma); + return; + +lock_mmap: + mmap_read_lock(mm); + ops->walk_lock = PGWALK_RDLOCK; + walk_page_range(mm, start, end, ops, private); + mmap_read_unlock(mm); +} + static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { @@ -315,17 +344,14 @@ out: #define damon_mkold_hugetlb_entry NULL #endif /* CONFIG_HUGETLB_PAGE */ -static const struct mm_walk_ops damon_mkold_ops = { - .pmd_entry = damon_mkold_pmd_entry, - .hugetlb_entry = damon_mkold_hugetlb_entry, - .walk_lock = PGWALK_RDLOCK, -}; - static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) { - mmap_read_lock(mm); - walk_page_range(mm, addr, addr + 1, &damon_mkold_ops, NULL); - mmap_read_unlock(mm); + struct mm_walk_ops damon_mkold_ops = { + .pmd_entry = damon_mkold_pmd_entry, + .hugetlb_entry = 
damon_mkold_hugetlb_entry, + }; + + damon_va_walk_page_range(mm, addr, addr + 1, &damon_mkold_ops, NULL); } /* @@ -445,12 +471,6 @@ out: #define damon_young_hugetlb_entry NULL #endif /* CONFIG_HUGETLB_PAGE */ -static const struct mm_walk_ops damon_young_ops = { - .pmd_entry = damon_young_pmd_entry, - .hugetlb_entry = damon_young_hugetlb_entry, - .walk_lock = PGWALK_RDLOCK, -}; - static bool damon_va_young(struct mm_struct *mm, unsigned long addr, unsigned long *folio_sz) { @@ -459,9 +479,12 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr, .young = false, }; - mmap_read_lock(mm); - walk_page_range(mm, addr, addr + 1, &damon_young_ops, &arg); - mmap_read_unlock(mm); + struct mm_walk_ops damon_young_ops = { + .pmd_entry = damon_young_pmd_entry, + .hugetlb_entry = damon_young_hugetlb_entry, + }; + + damon_va_walk_page_range(mm, addr, addr + 1, &damon_young_ops, &arg); return arg.young; } @@ -750,7 +773,6 @@ static unsigned long damos_va_migrate(struct damon_target *target, struct mm_walk_ops walk_ops = { .pmd_entry = damos_va_migrate_pmd_entry, .pte_entry = NULL, - .walk_lock = PGWALK_RDLOCK, }; use_target_nid = dests->nr_dests == 0; @@ -768,9 +790,7 @@ static unsigned long damos_va_migrate(struct damon_target *target, if (!mm) goto free_lists; - mmap_read_lock(mm); - walk_page_range(mm, r->ar.start, r->ar.end, &walk_ops, &priv); - mmap_read_unlock(mm); + damon_va_walk_page_range(mm, r->ar.start, r->ar.end, &walk_ops, &priv); mmput(mm); for (int i = 0; i < nr_dests; i++) { @@ -862,7 +882,6 @@ static unsigned long damos_va_stat(struct damon_target *target, struct mm_struct *mm; struct mm_walk_ops walk_ops = { .pmd_entry = damos_va_stat_pmd_entry, - .walk_lock = PGWALK_RDLOCK, }; priv.scheme = s; @@ -875,9 +894,7 @@ static unsigned long damos_va_stat(struct damon_target *target, if (!mm) return 0; - mmap_read_lock(mm); - walk_page_range(mm, r->ar.start, r->ar.end, &walk_ops, &priv); - mmap_read_unlock(mm); + damon_va_walk_page_range(mm, 
r->ar.start, r->ar.end, &walk_ops, &priv); mmput(mm); return 0; } From c33afe6f972d7bfada751c9ee83d9875ea38d6dc Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 20 May 2026 13:17:51 +0800 Subject: [PATCH 216/321] Documentation/admin-guide/mm: fix typos in transhuge.rst Fix these two typos: 1. approporiately -> appropriately 2. presure -> pressure Link: https://lore.kernel.org/20260520051751.74396-1-leon.hwang@linux.dev Signed-off-by: Leon Hwang Reviewed-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand (Arm) Reviewed-by: Lance Yang Reviewed-by: SeongJae Park Reviewed-by: Lorenzo Stoakes Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Jonathan Corbet Cc: Leon Hwang Cc: Liam R. Howlett Cc: Michal Hocko Cc: Nico Pache Cc: Ryan Roberts Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 5fbc3d89bb07..76f4eb14e262 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -57,7 +57,7 @@ prominent because the size of each page isn't as huge as the PMD-sized variant and there is less memory to clear in each page fault. Some architectures also employ TLB compression mechanisms to squeeze more entries in when a set of PTEs are virtually and physically contiguous -and approporiately aligned. In this case, TLB misses will occur less +and appropriately aligned. In this case, TLB misses will occur less often. 
THP can be enabled system wide or restricted to certain tasks or even @@ -210,7 +210,7 @@ PMD-mappable transparent hugepage:: cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size All THPs at fault and collapse time will be added to _deferred_list, -and will therefore be split under memory presure if they are considered +and will therefore be split under memory pressure if they are considered "underused". A THP is underused if the number of zero-filled pages in the THP is above max_ptes_none (see below). It is possible to disable this behaviour by writing 0 to shrink_underused, and enable it by writing From e186709b0a2d5a05c3cb38e46d13b399f5f5d3f9 Mon Sep 17 00:00:00 2001 From: niecheng Date: Tue, 19 May 2026 18:20:59 -0700 Subject: [PATCH 217/321] mm/damon/core: clarify next_intervals_tune_sis update path Patch series "mm/damon: documentation and comment fixes". This patch (of 3): damon_set_attrs() updates next_aggregation_sis and next_ops_update_sis for online attrs updates, but it does not update next_intervals_tune_sis there. This can look like a missing update when reading damon_set_attrs() alone, while next_intervals_tune_sis is actually updated in kdamond_fn(). Add a short comment to make this explicit. Link: https://lore.kernel.org/20260520012104.93602-1-sj@kernel.org Link: https://lore.kernel.org/20260520012104.93602-2-sj@kernel.org Suggested-by: SeongJae Park Signed-off-by: niecheng Signed-off-by: SeongJae Park Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Sakurai Shun Cc: Zenghui Yu Signed-off-by: Andrew Morton --- mm/damon/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index 4e223857a0f9..68b3b4bbc8fc 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -909,6 +909,9 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) attrs->aggr_interval / sample_interval; ctx->next_ops_update_sis = ctx->passed_sample_intervals + attrs->ops_update_interval / sample_interval; + /* + * next_intervals_tune_sis will be updated inside kdamond_fn(). + */ damon_update_monitoring_results(ctx, attrs, aggregating); ctx->attrs = *attrs; From de5480aeffc59d0792855c59c98624038ce67b67 Mon Sep 17 00:00:00 2001 From: Sakurai Shun Date: Tue, 19 May 2026 18:21:00 -0700 Subject: [PATCH 218/321] Docs/mm/damon/design: fix three typos L140: "unsinged" -> "unsigned" L371: "sampleing" -> "sampling" L387: "multipled" -> "multiplied" Link: https://lore.kernel.org/20260520012104.93602-3-sj@kernel.org Signed-off-by: Sakurai Shun Signed-off-by: SeongJae Park Reviewed-by: Lorenzo Stoakes Acked-by: Mike Rapoport (Microsoft) Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Michal Hocko Cc: niecheng Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zenghui Yu Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index a24f9f00d183..2da7ca0d3d17 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -147,7 +147,7 @@ as Idle page tracking does. Address Unit ------------ -DAMON core layer uses ``unsinged long`` type for monitoring target address +DAMON core layer uses ``unsigned long`` type for monitoring target address ranges. 
In some cases, the address space for a given operations set could be too large to be handled with the type. ARM (32-bit) with large physical address extension is an example. For such cases, a per-operations set @@ -417,7 +417,7 @@ with theoretical maximum ``nr_accesses``, which can be calculated as ``aggregation interval / sampling interval``. The mechanism calculates the ratio of access events for ``aggrs`` aggregations, -and increases or decrease the ``sampleing interval`` and ``aggregation +and increases or decrease the ``sampling interval`` and ``aggregation interval`` in same ratio, if the observed access ratio is lower or higher than the target, respectively. The ratio of the intervals change is decided in proportion to the distance between current samples ratio and the target ratio. @@ -433,7 +433,7 @@ The tuning is turned off by default, and need to be set explicitly by the user. As a rule of thumbs and the Parreto principle, 4% access samples ratio target is recommended. Note that Parreto principle (80/20 rule) has applied twice. That is, assumes 4% (20% of 20%) DAMON-observed access events ratio (source) -to capture 64% (80% multipled by 80%) real access events (outcomes). +to capture 64% (80% multiplied by 80%) real access events (outcomes). To know how user-space can use this feature via :ref:`DAMON sysfs interface `, refer to :ref:`intervals_goal From 12e4d4bb6e5a4845f0c22e5d8820fdc6244653a4 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Tue, 19 May 2026 18:21:01 -0700 Subject: [PATCH 219/321] Docs/{ABI,admin-guide}/damon: fix various typos ``damon_target_idx`` was wrongly written as ``target_idx`` in the docs. Fix it all over the place, as well as the wrong directory count, grammar, etc. Link: https://lore.kernel.org/20260520012104.93602-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Zenghui Yu Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R.
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: niecheng Cc: Sakurai Shun Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- .../ABI/testing/sysfs-kernel-mm-damon | 2 +- Documentation/admin-guide/mm/damon/usage.rst | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index ee29d4e204ff..b73e6bc28ea5 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -452,7 +452,7 @@ Description: If 'hugepage_size' is written to the 'type' file, writing to or reading from this file sets or gets the maximum size of the hugepage for the filter. -What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//core_filters//target_idx +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//core_filters//damon_target_idx Date: Feb 2025 Contact: SeongJae Park Description: If 'target' is written to the 'type' file, writing to or diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 0d6a27dc97b0..d46875e603d8 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -97,7 +97,7 @@ comma (","). │ │ │ │ │ │ │ │ │ 0/target_metric,target_value,current_value,nid,path │ │ │ │ │ │ │ :ref:`watermarks `/metric,interval_us,high,mid,low │ │ │ │ │ │ │ :ref:`{core_,ops_,}filters `/nr_filters - │ │ │ │ │ │ │ │ 0/type,matching,allow,memcg_path,addr_start,addr_end,target_idx,min,max + │ │ │ │ │ │ │ │ 0/type,matching,allow,memcg_path,addr_start,addr_end,damon_target_idx,min,max │ │ │ │ │ │ │ :ref:`dests `/nr_dests │ │ │ │ │ │ │ │ 0/id,weight │ │ │ │ │ │ │ :ref:`stats `/nr_tried,sz_tried,nr_applied,sz_applied,sz_ops_filter_passed,qt_exceeds,nr_snapshots,max_nr_snapshots @@ -374,7 +374,7 @@ to ``N-1``. 
Each directory represents each DAMON-based operation scheme. schemes// ------------ -In each scheme directory, eight directories (``access_pattern``, ``quotas``, +In each scheme directory, nine directories (``access_pattern``, ``quotas``, ``watermarks``, ``core_filters``, ``ops_filters``, ``filters``, ``dests``, ``stats``, and ``tried_regions``) and three files (``action``, ``target_nid`` and ``apply_interval``) exist. @@ -492,7 +492,7 @@ given DAMON-based operation scheme. Under the watermarks directory, five files (``metric``, ``interval_us``, ``high``, ``mid``, and ``low``) for setting the metric, the time interval between check of the metric, and the three watermarks exist. You can set and -get the five values by writing to the files, respectively. +get the five values by writing to and reading from the files, respectively. Keywords and meanings of those that can be written to the ``metric`` file are as below. @@ -500,7 +500,7 @@ as below. - none: Ignore the watermarks - free_mem_rate: System's free memory rate (per thousand) -The ``interval`` should written in microseconds unit. +The ``interval_us`` should be written in microseconds unit. .. _sysfs_filters: @@ -528,9 +528,9 @@ in the numeric order. Each filter directory contains nine files, namely ``type``, ``matching``, ``allow``, ``memcg_path``, ``addr_start``, ``addr_end``, ``min``, ``max`` -and ``target_idx``. To ``type`` file, you can write the type of the filter. -Refer to :ref:`the design doc ` for available type -names, their meaning and on what layer those are handled. +and ``damon_target_idx``. To ``type`` file, you can write the type of the +filter. Refer to :ref:`the design doc ` for +available type names, their meaning and on what layer those are handled. For ``memcg`` type, you can specify the memory cgroup of the interest by writing the path of the memory cgroup from the cgroups mount point to @@ -540,7 +540,7 @@ files, respectively. 
For ``hugepage_size`` type, you can specify the minimum and maximum size of the range (closed interval) to ``min`` and ``max`` files, respectively. For ``target`` type, you can specify the index of the target between the list of the DAMON context's monitoring targets list to -``target_idx`` file. +``damon_target_idx`` file. You can write ``Y`` or ``N`` to ``matching`` file to specify whether the filter is for memory that matches the ``type``. You can write ``Y`` or ``N`` to @@ -731,7 +731,7 @@ show results using tracepoint supporting tools like ``perf``. For example:: Each line of the perf script output represents each monitoring region. The first five fields are as usual other tracepoint outputs. The sixth field -(``target_id=X``) shows the ide of the monitoring target of the region. The +(``target_id=X``) shows the id of the monitoring target of the region. The seventh field (``nr_regions=X``) shows the total number of monitoring regions for the target. The eighth field (``X-Y:``) shows the start (``X``) and end (``Y``) addresses of the region in bytes. The ninth field (``X``) shows the From 6414f790f21d2ba648d4d2a713d61f9014123fcf Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Wed, 20 May 2026 17:17:12 -0400 Subject: [PATCH 220/321] MAINTAINERS: add more files to PAGE CACHE section Add include/linux/writeback.h and include/trace/events/{filemap.h,readahead.h,writeback.h}. 
Link: https://lore.kernel.org/20260520-page-cache-maintainers-v1-1-f93438d2186d@columbia.edu Signed-off-by: Tal Zussman Cc: Jan Kara Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- MAINTAINERS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 782ed63e4e67..1e94e8cc6ad1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20174,6 +20174,10 @@ T: git git://git.infradead.org/users/willy/pagecache.git F: Documentation/filesystems/locking.rst F: Documentation/filesystems/vfs.rst F: include/linux/pagemap.h +F: include/linux/writeback.h +F: include/trace/events/filemap.h +F: include/trace/events/readahead.h +F: include/trace/events/writeback.h F: mm/filemap.c F: mm/page-writeback.c F: mm/readahead.c From 4c0ed883e0516aee79496b6277cbea63a08b2676 Mon Sep 17 00:00:00 2001 From: Dmitry Ilvokhin Date: Wed, 20 May 2026 12:22:28 +0000 Subject: [PATCH 221/321] mm/page_alloc: fix defrag_mode for non-reclaimable allocations When defrag_mode is enabled, ALLOC_NOFRAGMENT is enforced to prevent migratetype fallbacks and keep pageblocks clean. The allocator relies on reclaim and compaction to free pages of the correct type before allowing fallback as a last resort. However, non-reclaimable allocations such as GFP_ATOMIC cannot invoke direct reclaim or compaction. With defrag_mode=1, these allocations hit the !can_direct_reclaim bailout in __alloc_pages_slowpath() with ALLOC_NOFRAGMENT still set, and fail without ever attempting a fallback. This causes a large number of SLUB allocation failures for skbuff_head_cache under network-heavy workloads, despite free memory being available in other migratetype freelists. We observed it on a few of the Meta workloads that adopted defrag_mode=1. For the service under load there were 85509 SLUB allocation failure messages in dmesg within 2 hours. All of them are GFP_ATOMIC allocations for skbuff_head_cache, despite free pages being available in other migratetype freelists (~13 GB free).
Since this is a networking path, from a practical point of view this means dropped packets, failed RPC requests, tail latency spikes and overall service degradation. Clear ALLOC_NOFRAGMENT and retry for allocations that request kswapd reclaim but cannot do direct reclaim themselves (GFP_ATOMIC). Purely speculative allocations like GFP_TRANSHUGE_LIGHT that don't set __GFP_KSWAPD_RECLAIM are left to fail, since they have reasonable fallbacks and should not cause fragmentation. Link: https://lore.kernel.org/20260520122228.201550-1-d@ilvokhin.com Fixes: e3aa7df331bc ("mm: page_alloc: defrag_mode") Signed-off-by: Dmitry Ilvokhin Acked-by: Johannes Weiner Acked-by: Vlastimil Babka (SUSE) Cc: Brendan Jackman Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0ebffb0bb98b..7e3c79e79e5b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4853,8 +4853,19 @@ retry: } /* Caller is not willing to reclaim, we can't balance anything */ - if (!can_direct_reclaim) + if (!can_direct_reclaim) { + /* + * Reclaim/compaction cannot run, so defrag_mode's strategy + * of enforcing ALLOC_NOFRAGMENT cannot be fulfilled. Allow + * fallbacks rather than failing the allocation outright. + */ + if (defrag_mode && (alloc_flags & ALLOC_NOFRAGMENT) && + (gfp_mask & __GFP_KSWAPD_RECLAIM)) { + alloc_flags &= ~ALLOC_NOFRAGMENT; + goto retry; + } goto nopage; + } /* Avoid recursion of direct reclaim */ if (current->flags & PF_MEMALLOC) From df0d6a6d4b33b4d9468538954bd2fc2a69b40ea3 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 20 May 2026 02:03:36 +0000 Subject: [PATCH 222/321] selftests/mm/split_huge_page_test.c: close fd on write error When the write to /proc/sys/vm/drop_caches in create_pagecache_thp_and_fd() fails, it just does "goto err_out_unlink", which leaves the fd open. Use "goto err_out_close" to close the fd.
Link: https://lore.kernel.org/20260520020336.28914-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Dev Jain Reviewed-by: SeongJae Park Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Lance Yang Cc: "Liam R. Howlett" Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Yang Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/split_huge_page_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index a8725942ee51..40a5093917e7 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -625,7 +625,7 @@ static int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, } if (write(*fd, "3", 1) != 1) { ksft_perror("write to drop_caches"); - goto err_out_unlink; + goto err_out_close; } close(*fd); From 17986198a7b99485d7b2bc4eb8d700fbf8c8629e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 2 Jun 2026 12:06:25 +0100 Subject: [PATCH 223/321] drivers/char/mem: eliminate unnecessary use of success_hook Patch series "remove mmap_action success, error hooks", v3. The mmap_action->success_hook was a strange beast added to enable code which appeared to absolutely require access to a VMA pointer to work correctly. Primarily this was for hugetlb, however a different approach will be taken there, as clearly more work is required to figure out a sensible way of converting hugetlb to use mmap_prepare. The other user was the memory char driver, specifically /dev/zero which has the unusual property of explicitly setting file-backed VMAs anonymous. 
Providing the success hook was always foolish, as it allowed drivers a way to workaround the restriction that they should not access a pointer to a not-yet-correctly-initialised VMA - which defeats the purpose of the mmap_prepare work. We can achieve the same thing in memory char driver without needing the success hook, so this series removes that, then removes the success hook altogether. The error hook is also unnecessary - the motivation for this was for functions which need to override the error code when performing an mmap action in order to avoid breaking userspace. We can achieve this by just providing a field for the error code. Doing this means we don't have to worry about the hook doing anything odd. We also add a check to ensure the error code is in fact valid. Again the memory char driver is the only current user of this, so this series updates it to use that. After this change mmap_action has no custom hooks at all, which seems rather more cromulent than before. This patch (of 3): /dev/zero, uniquely, marks memory mapped there as anonymous. This is currently achieved using the mmap_action->success_hook. However this hook circumvents the abstraction of VMA initialisation so it's preferable to do things a different way. To achieve this, this patch firstly defaults the VMA descriptor's vm_ops field to the dummy VMA operations, which is what file-backed VMAs default this field to. That way, we can detect whether a driver sets this field to NULL in order to mark it anonymous. We then introduce vma_desc_set_anonymous() to do this explicitly, and invoke it in mmap_zero_prepare(). This way, any driver which does not explicitly set desc->vm_ops, retains the dummy vm_ops as they would previously. We also update set_vma_user_defined_fields() to make clear that we are either setting vma->vm_ops to what is provided by the driver (or defaulting to dummy_vm_ops if not set), or setting the VMA anonymous. This lays the groundwork for removing the success hook. 
Link: https://lore.kernel.org/cover.1780397980.git.ljs@kernel.org Link: https://lore.kernel.org/010579cca6787cf7bb057ab1f7228978b10601c8.1780397980.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand (Arm) Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Jann Horn Cc: Liam R. Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/char/mem.c | 17 +++++------------ include/linux/mm.h | 5 +++++ mm/util.c | 1 + mm/vma.c | 3 +++ tools/testing/vma/include/dup.h | 1 + 5 files changed, 15 insertions(+), 12 deletions(-) diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 5fd421e48c04..a4297eb39887 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -504,17 +504,6 @@ static ssize_t read_zero(struct file *file, char __user *buf, return cleared; } -static int mmap_zero_private_success(const struct vm_area_struct *vma) -{ - /* - * This is a highly unique situation where we mark a MAP_PRIVATE mapping - * of /dev/zero anonymous, despite it not being. - */ - vma_set_anonymous((struct vm_area_struct *)vma); - - return 0; -} - static int mmap_zero_prepare(struct vm_area_desc *desc) { #ifndef CONFIG_MMU @@ -523,7 +512,11 @@ static int mmap_zero_prepare(struct vm_area_desc *desc) if (vma_desc_test(desc, VMA_SHARED_BIT)) return shmem_zero_setup_desc(desc); - desc->action.success_hook = mmap_zero_private_success; + /* + * This is a highly unique situation where we mark a MAP_PRIVATE mapping + * of /dev/zero anonymous, despite it not being. 
+ */ + vma_desc_set_anonymous(desc); return 0; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 11f440e9d7cd..0f2612a70fb1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1489,6 +1489,11 @@ static inline void vma_set_anonymous(struct vm_area_struct *vma) vma->vm_ops = NULL; } +static inline void vma_desc_set_anonymous(struct vm_area_desc *desc) +{ + desc->vm_ops = NULL; +} + static inline bool vma_is_anonymous(struct vm_area_struct *vma) { return !vma->vm_ops; diff --git a/mm/util.c b/mm/util.c index 3cc949a0b7ed..2b2a9df689d7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1192,6 +1192,7 @@ void compat_set_desc_from_vma(struct vm_area_desc *desc, desc->vm_file = vma->vm_file; desc->vma_flags = vma->flags; desc->page_prot = vma->vm_page_prot; + desc->vm_ops = vma->vm_ops; /* Default. */ desc->action.type = MMAP_NOTHING; diff --git a/mm/vma.c b/mm/vma.c index d90791b00a7b..9eea2850818a 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2697,6 +2697,8 @@ static void set_vma_user_defined_fields(struct vm_area_struct *vma, { if (map->vm_ops) vma->vm_ops = map->vm_ops; + else /* Only /dev/zero should do this. */ + vma_set_anonymous(vma); vma->vm_private_data = map->vm_private_data; } @@ -2744,6 +2746,7 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, .action = { .type = MMAP_NOTHING, /* Default to no further action. */ }, + .vm_ops = &vma_dummy_vm_ops, }; bool allocated_new = false; int error; diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 9e0dfd3a85b0..306171d061e7 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -1303,6 +1303,7 @@ static inline void compat_set_desc_from_vma(struct vm_area_desc *desc, desc->vm_file = vma->vm_file; desc->vma_flags = vma->flags; desc->page_prot = vma->vm_page_prot; + desc->vm_ops = vma->vm_ops; /* Default. 
*/ desc->action.type = MMAP_NOTHING; From 8876dc0780f23eb499b42cc84df2dd795aada6be Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 2 Jun 2026 12:06:26 +0100 Subject: [PATCH 224/321] mm/vma: remove mmap_action->success_hook This hook was introduced to work around code that seemed to absolutely require access to a VMA pointer upon mmap(). However, providing this hook leaves a backdoor to drivers getting access to the very thing mmap_prepare eliminates - a pointer to the VMA. Let's solve this contradiction by removing it. The key intended user was hugetlb, however it seems that the best course now is to avoid allowing all drivers the ability to work around mmap_prepare, and find a different solution there. Link: https://lore.kernel.org/f79434e6d30af6d92999be6b76e197f1847105fa.1780397980.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand (Arm) Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Jann Horn Cc: Liam R. Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 10 ---------- mm/util.c | 2 -- tools/testing/vma/include/dup.h | 10 ---------- 3 files changed, 22 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a308e2c23b82..945c0a5386d6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -843,16 +843,6 @@ struct mmap_action { }; enum mmap_action_type type; - /* - * If specified, this hook is invoked after the selected action has been - * successfully completed. Note that the VMA write lock still held. - * - * The absolute minimum ought to be done here. - * - * Returns 0 on success, or an error code. - */ - int (*success_hook)(const struct vm_area_struct *vma); - /* * If specified, this hook is invoked when an error occurred when * attempting the selected action. 
diff --git a/mm/util.c b/mm/util.c index 2b2a9df689d7..4e172990afcd 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1397,8 +1397,6 @@ static int mmap_action_finish(struct vm_area_struct *vma, if (!err) err = call_vma_mapped(vma); - if (!err && action->success_hook) - err = action->success_hook(vma); /* do_munmap() might take rmap lock, so release if held. */ maybe_rmap_unlock_action(vma, action); diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 306171d061e7..fddfd1b57c09 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -482,16 +482,6 @@ struct mmap_action { }; enum mmap_action_type type; - /* - * If specified, this hook is invoked after the selected action has been - * successfully completed. Note that the VMA write lock still held. - * - * The absolute minimum ought to be done here. - * - * Returns 0 on success, or an error code. - */ - int (*success_hook)(const struct vm_area_struct *vma); - /* * If specified, this hook is invoked when an error occurred when * attempting the selection action. From 4f5b8759262e5e65373638346307836de1290b22 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 2 Jun 2026 12:06:27 +0100 Subject: [PATCH 225/321] mm/vma: eliminate mmap_action->error_hook, introduce error_override Rather than providing a hook, simplify things by providing the ability to override mmap action errors. This allows us to more carefully validate the value provided and thus ensure only a valid error code is specified, and simplifies the interface. This way, we eliminate all hooks but mmap_prepare and allow only mmap actions to be specified (which core mm controls). This significantly improves robustness and eliminates any unnecessary code duplication in driver mmap hooks. We also update the /dev/mem logic (the only user) to use mmap_action->error_override instead. 
Link: https://lore.kernel.org/55d13f7d016b827c459946d46a56105635be111c.1780397980.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand (Arm) Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Jann Horn Cc: Liam R. Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/char/mem.c | 8 +------- include/linux/mm_types.h | 9 +++------ mm/util.c | 29 +++++++++++++++++++++-------- tools/testing/vma/include/dup.h | 9 +++------ 4 files changed, 28 insertions(+), 27 deletions(-) diff --git a/drivers/char/mem.c b/drivers/char/mem.c index a4297eb39887..63253d1de5d7 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -322,11 +322,6 @@ static const struct vm_operations_struct mmap_mem_ops = { #endif }; -static int mmap_filter_error(int err) -{ - return -EAGAIN; -} - static int mmap_mem_prepare(struct vm_area_desc *desc) { struct file *file = desc->file; @@ -362,8 +357,7 @@ static int mmap_mem_prepare(struct vm_area_desc *desc) /* Remap-pfn-range will mark the range with the I/O flag. */ mmap_action_remap_full(desc, desc->pgoff); - /* We filter remap errors to -EAGAIN. */ - desc->action.error_hook = mmap_filter_error; + desc->action.error_override = -EAGAIN; return 0; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 945c0a5386d6..5ef78617ce93 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -844,13 +844,10 @@ struct mmap_action { enum mmap_action_type type; /* - * If specified, this hook is invoked when an error occurred when - * attempting the selected action. - * - * The hook can return an error code in order to filter the error, but - * it is not valid to clear the error here. + * If non-zero, replace errors that arise from mmap actions with this + * value instead. Only valid error codes may be specified. 
*/ - int (*error_hook)(int err); + int error_override; /* * This should be set in rare instances where the operation required diff --git a/mm/util.c b/mm/util.c index 4e172990afcd..af2c2103f0d9 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1414,16 +1414,22 @@ static int mmap_action_finish(struct vm_area_struct *vma, */ len = vma_pages(vma) << PAGE_SHIFT; do_munmap(current->mm, vma->vm_start, len, NULL); - if (action->error_hook) { - /* We may want to filter the error. */ - err = action->error_hook(err); - /* The caller should not clear the error. */ - VM_WARN_ON_ONCE(!err); - } - return err; + + return action->error_override ?: err; } #ifdef CONFIG_MMU + +static int check_mmap_action(struct mmap_action *action) +{ + const unsigned long override = action->error_override; + + if (WARN_ON_ONCE(override && !IS_ERR_VALUE(override))) + return -EINVAL; + + return 0; +} + /** * mmap_action_prepare - Perform preparatory setup for an VMA descriptor * action which need to be performed. @@ -1433,7 +1439,14 @@ static int mmap_action_finish(struct vm_area_struct *vma, */ int mmap_action_prepare(struct vm_area_desc *desc) { - switch (desc->action.type) { + struct mmap_action *action = &desc->action; + int err; + + err = check_mmap_action(action); + if (err) + return err; + + switch (action->type) { case MMAP_NOTHING: return 0; case MMAP_REMAP_PFN: diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index fddfd1b57c09..bf26b3f48d3a 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -483,13 +483,10 @@ struct mmap_action { enum mmap_action_type type; /* - * If specified, this hook is invoked when an error occurred when - * attempting the selection action. - * - * The hook can return an error code in order to filter the error, but - * it is not valid to clear the error here. + * If non-zero, replace errors that arise from mmap actions with this + * value instead. Only valid error codes may be specified. 
*/ - int (*error_hook)(int err); + int error_override; /* * This should be set in rare instances where the operation required From ce71e5aa8dc83e703a5301644cef57dfc3caaf44 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 22 May 2026 08:40:12 -0700 Subject: [PATCH 226/321] mm/damon/core: safely handle no region case in damon_set_regions() Patch series "mm/damon: minor improvements for code readability and tests". Implement minor improvements on code readability and tests for DAMON. The first seven patches are for DAMON code readability and resulting maintenance. Patches 1 and 2 make damon_set_regions() safer and easier to read. Patches 3 and 4 remove fragmented DAMON API use cases. Patches 5-7 hide unused core functions that are unnecessarily exposed to API callers. The following seven patches are for DAMON tests improvement. Patches 8 and 9 add and remove DAMON_DEBUG_SANITY verifications to ensure reasonable test coverage without too high overhead. Patch 10 adds a new kunit test for damon_set_regions(). Patch 11 makes the sysfs.py selftest finish more gracefully under test failures. Patches 12-13 add simple sysfs.sh test cases for the monitoring intervals goal directory, the addr_unit file and the pause file. This patch (of 14): damon_set_regions() calls damon_first_region() regardless of the number of DAMON regions in a given DAMON target. damon_first_region() internally uses list_first_entry(), which clearly documents the list is expected to be not empty. Due to the internal implementation of the macro, damon_set_regions() is safe for now. But the internal implementation of the macro can be changed in future. Refactor the function to explicitly and safely handle the empty region list case without depending on the internal implementation. No behavioral change is intended.
Link: https://lore.kernel.org/20260522154026.80546-1-sj@kernel.org Link: https://lore.kernel.org/20260522154026.80546-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/core.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index 68b3b4bbc8fc..8360cb4c506e 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -356,6 +356,19 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, damon_destroy_region(r, t); } + if (!damon_nr_regions(t)) { + for (i = 0; i < nr_ranges; i++) { + r = damon_new_region( + ALIGN_DOWN(ranges[i].start, + min_region_sz), + ALIGN(ranges[i].end, min_region_sz)); + if (!r) + return -ENOMEM; + damon_add_region(r, t); + } + return 0; + } + r = damon_first_region(t); /* Add new regions or resize existing regions to fit in the ranges */ for (i = 0; i < nr_ranges; i++) { From b23dbda659b645482e3234ad10773d991a288e2f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 22 May 2026 08:40:13 -0700 Subject: [PATCH 227/321] mm/damon/core: do not use region out of a loop in damon_set_regions() damon_set_regions() assumes the DAMON region iterator is referencing the last region after the region iteration loop is completed. The code is indeed implemented in the way, but that is not a documented safe behavior. Hence it is unreliable and difficult to read. Cleanup the code to avoid the case. No behavioral change is intended. 
Link: https://lore.kernel.org/20260522154026.80546-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/core.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 8360cb4c506e..c9946ac8e279 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -374,6 +374,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, for (i = 0; i < nr_ranges; i++) { struct damon_region *first = NULL, *last, *newr; struct damon_addr_range *range; + bool insert_before_r = false; range = &ranges[i]; /* Get the first/last regions intersecting with the range */ @@ -383,8 +384,10 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, first = r; last = r; } - if (r->ar.start >= range->end) + if (r->ar.start >= range->end) { + insert_before_r = true; break; + } } if (!first) { /* no region intersects with this range */ @@ -394,7 +397,11 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, ALIGN(range->end, min_region_sz)); if (!newr) return -ENOMEM; - damon_insert_region(newr, damon_prev_region(r), r, t); + if (insert_before_r) + damon_insert_region(newr, damon_prev_region(r), + r, t); + else + damon_add_region(newr, t); } else { /* resize intersecting regions to fit in this range */ first->ar.start = ALIGN_DOWN(range->start, From cd036cc8c384b8593b5020a82b4b73ee3d10e22c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 22 May 2026 08:40:14 -0700 Subject: [PATCH 228/321] samples/damon/mtier: replace damon_add_region() with damon_set_regions() mtier DAMON sample module and DAMON virtual address operation set (vaddr) unit tests are using damon_add_region() for setup of DAMON monitoring target region boundaries setup. But, damon_set_regions() is designed for exactly the purpose. All other DAMON API callers use the function for the purpose. 
Replace damon_add_region() usage in mtier sample module with damon_set_regions(), for unifying the use case and reducing the maintenance cost. Link: https://lore.kernel.org/20260522154026.80546-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton --- samples/damon/mtier.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/samples/damon/mtier.c b/samples/damon/mtier.c index 775838a23d93..eb1143de8df1 100644 --- a/samples/damon/mtier.c +++ b/samples/damon/mtier.c @@ -75,11 +75,11 @@ static struct damon_ctx *damon_sample_mtier_build_ctx(bool promote) struct damon_ctx *ctx; struct damon_attrs attrs; struct damon_target *target; - struct damon_region *region; struct damos *scheme; struct damos_quota_goal *quota_goal; struct damos_filter *filter; struct region_range addr; + struct damon_addr_range range; int ret; ctx = damon_new_ctx(); @@ -120,10 +120,12 @@ static struct damon_ctx *damon_sample_mtier_build_ctx(bool promote) addr.end = promote ? node1_end_addr : node0_end_addr; } - region = damon_new_region(addr.start, addr.end); - if (!region) + range.start = addr.start; + range.end = addr.end; + + ret = damon_set_regions(target, &range, 1, DAMON_MIN_REGION_SZ); + if (ret) goto free_out; - damon_add_region(region, target); scheme = damon_new_scheme( /* access pattern */ From 9ace949ad8f58f7eb175b88cc20a1d1c11a2d40f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 22 May 2026 08:40:15 -0700 Subject: [PATCH 229/321] mm/damon/tests/vaddr-kunit: replace damon_add_region() with damon_set_regions() DAMON virtual address operation set (vaddr) unit tests are using damon_add_region() to set up DAMON monitoring target region boundaries. But, damon_set_regions() is designed for exactly the purpose. All other DAMON API callers use the function for the purpose.
Replace damon_add_region() usage in the unit tests with damon_set_regions(), for unifying the use case and reducing the maintenance cost. Link: https://lore.kernel.org/20260522154026.80546-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/tests/vaddr-kunit.h | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index 98e734d77d51..563fbc7e3f44 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -132,22 +132,35 @@ static void damon_do_test_apply_three_regions(struct kunit *test, unsigned long *expected, int nr_expected) { struct damon_target *t; + struct damon_addr_range *ranges; struct damon_region *r; int i; t = damon_new_target(); if (!t) kunit_skip(test, "target alloc fail"); - for (i = 0; i < nr_regions / 2; i++) { - r = damon_new_region(regions[i * 2], regions[i * 2 + 1]); - if (!r) { - damon_destroy_target(t, NULL); - kunit_skip(test, "region alloc fail"); - } - damon_add_region(r, t); - } - damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION_SZ); + ranges = kmalloc_array(nr_regions / 2, sizeof(*ranges), GFP_KERNEL); + if (!ranges) { + damon_destroy_target(t, NULL); + kunit_skip(test, "ranges alloc fail"); + } + for (i = 0; i < nr_regions / 2; i++) { + ranges[i].start = regions[i * 2]; + ranges[i].end = regions[i * 2 + 1]; + } + if (damon_set_regions(t, ranges, nr_regions / 2, + DAMON_MIN_REGION_SZ)) { + kfree(ranges); + damon_destroy_target(t, NULL); + kunit_skip(test, "damon_set_regions() fail"); + } + kfree(ranges); + + if (damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION_SZ)) { + damon_destroy_target(t, NULL); + kunit_skip(test, "second damon_set_regions() fail"); + } for (i = 0; i < nr_expected / 2; i++) { r = __nth_region_of(t, i); From 9cf7ef2d6665dff35b3b522c84509c2d256bf3aa Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 22 May 
2026 08:40:16 -0700 Subject: [PATCH 230/321] mm/damon/core: hide damon_add_region() damon_add_region() is being used by only DAMON core, but exposed to DAMON API callers. Exposing something that is not really being used by others will only increase the maintenance cost. Hide it. Link: https://lore.kernel.org/20260522154026.80546-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 - mm/damon/core.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 4014fd0d463c..b9370c1779cb 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -1024,7 +1024,6 @@ static inline void damon_insert_region(struct damon_region *r, t->nr_regions++; } -void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, unsigned int nr_ranges, unsigned long min_region_sz); diff --git a/mm/damon/core.c b/mm/damon/core.c index c9946ac8e279..1dd900814ae8 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -250,7 +250,7 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) return region; } -void damon_add_region(struct damon_region *r, struct damon_target *t) +static void damon_add_region(struct damon_region *r, struct damon_target *t) { list_add_tail(&r->list, &t->regions_list); t->nr_regions++; From 26d6f6960ff91ebb267cd80efe7772c6427b4cc1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 22 May 2026 08:40:17 -0700 Subject: [PATCH 231/321] mm/damon/core: hide damon_insert_region() damon_insert_region() is being used by only DAMON core, but exposed to DAMON API callers. Exposing something that is not really being used by others will only increase the maintenance cost. Hide it. 
Link: https://lore.kernel.org/20260522154026.80546-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 11 ----------- mm/damon/core.c | 11 +++++++++++ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index b9370c1779cb..3acca7deb169 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -1013,17 +1013,6 @@ void damon_add_probe(struct damon_ctx *ctx, struct damon_probe *probe); struct damon_region *damon_new_region(unsigned long start, unsigned long end); -/* - * Add a region between two other regions - */ -static inline void damon_insert_region(struct damon_region *r, - struct damon_region *prev, struct damon_region *next, - struct damon_target *t) -{ - __list_add(&r->list, &prev->list, &next->list); - t->nr_regions++; -} - void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, unsigned int nr_ranges, unsigned long min_region_sz); diff --git a/mm/damon/core.c b/mm/damon/core.c index 1dd900814ae8..d1e7b441f2bf 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -256,6 +256,17 @@ static void damon_add_region(struct damon_region *r, struct damon_target *t) t->nr_regions++; } +/* + * Add a region between two other regions + */ +static inline void damon_insert_region(struct damon_region *r, + struct damon_region *prev, struct damon_region *next, + struct damon_target *t) +{ + __list_add(&r->list, &prev->list, &next->list); + t->nr_regions++; +} + #ifdef CONFIG_DAMON_DEBUG_SANITY static void damon_verify_del_region(struct damon_target *t) { From 50d2dec8af1a09056b6b29b54a30e32281b30e2c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 22 May 2026 08:40:18 -0700 Subject: [PATCH 232/321] mm/damon/core: hide damon_destroy_region() damon_destroy_region() is being used by only DAMON core, but exposed to DAMON API 
callers. Exposing something that is not really being used by others will only increase the maintenance cost. Hide it. Link: https://lore.kernel.org/20260522154026.80546-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 - mm/damon/core.c | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 3acca7deb169..638ee65f88dc 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -1013,7 +1013,6 @@ void damon_add_probe(struct damon_ctx *ctx, struct damon_probe *probe); struct damon_region *damon_new_region(unsigned long start, unsigned long end); -void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, unsigned int nr_ranges, unsigned long min_region_sz); void damon_update_region_access_rate(struct damon_region *r, bool accessed, diff --git a/mm/damon/core.c b/mm/damon/core.c index d1e7b441f2bf..d816679dd702 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -291,7 +291,8 @@ static void damon_free_region(struct damon_region *r) kmem_cache_free(damon_region_cache, r); } -void damon_destroy_region(struct damon_region *r, struct damon_target *t) +static void damon_destroy_region(struct damon_region *r, + struct damon_target *t) { damon_del_region(r, t); damon_free_region(r); From 8f793f1ad5bd9f4f7e9c4fa734a7995ef2a2401f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 22 May 2026 08:40:19 -0700 Subject: [PATCH 233/321] mm/damon/core: add kdamond_call() debug_sanity check kdamond_call() is the place where DAMON API callers are allowed to access the DAMON context's public internal state including the monitoring results. Hence it is important to ensure it is called with the expected DAMON context state. Do the check under DAMON_DEBUG_SANITY. 
Link: https://lore.kernel.org/20260522154026.80546-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/core.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index d816679dd702..00e2997524ec 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -3329,6 +3329,37 @@ static void kdamond_usleep(unsigned long usecs) usleep_range_idle(usecs, usecs + 1); } +#ifdef CONFIG_DAMON_DEBUG_SANITY +static void damon_verify_ctx(struct damon_ctx *c) +{ + struct damon_target *t; + struct damon_region *r; + + damon_for_each_target(t, c) { + struct damon_region *prev_r = NULL; + unsigned int nr_regions = 0; + + damon_for_each_region(r, t) { + WARN_ONCE(r->ar.start >= r->ar.end, + "region start (%lu) >= end (%lu)\n", + r->ar.start, r->ar.end); + WARN_ONCE(prev_r && prev_r->ar.end > r->ar.start, + "region overlap (%lu > %lu)\n", + prev_r->ar.end, r->ar.start); + prev_r = r; + nr_regions++; + } + WARN_ONCE(damon_nr_regions(t) != nr_regions, + "nr_regions mismatch: %u != %u\n", + damon_nr_regions(t), nr_regions); + } +} +#else +static void damon_verify_ctx(struct damon_ctx *c) +{ +} +#endif + /* * kdamond_call() - handle damon_call_control objects. * @ctx: The &struct damon_ctx of the kdamond. @@ -3344,6 +3375,8 @@ static void kdamond_call(struct damon_ctx *ctx, bool cancel) struct damon_call_control *control, *next; LIST_HEAD(controls); + damon_verify_ctx(ctx); + mutex_lock(&ctx->call_controls_lock); list_splice_tail_init(&ctx->call_controls, &controls); mutex_unlock(&ctx->call_controls_lock); From b8db646fe9a77845c37680ca416847ced5763c06 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 22 May 2026 08:40:20 -0700 Subject: [PATCH 234/321] mm/damon/core: remove damon_verify_nr_regions() When CONFIG_DAMON_DEBUG_SANITY is enabled, damon_verify_nr_regions() is called for each damon_nr_regions() invocation. 
damon_verify_nr_regions() iterates all regions. damon_nr_regions() is called for each region in kdamond_reset_aggregated() and damos_apply_scheme(). Hence it imposes O(n**2) overhead where n is the number of regions. Though the verification is enabled only under DAMON_DEBUG_SANITY, which is not for production use cases, it could be too high overhead. Meanwhile, damon_verify_ctx() is doing the damon_nr_regions() test. Because damon_verify_ctx() is called for each kdamond_call(), the test coverage from damon_verify_ctx() could be sufficient. Remove damon_nr_regions() verification. Link: https://lore.kernel.org/20260522154026.80546-10-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/core.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 00e2997524ec..b33920873871 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -686,27 +686,8 @@ void damon_destroy_target(struct damon_target *t, struct damon_ctx *ctx) damon_free_target(t); } -#ifdef CONFIG_DAMON_DEBUG_SANITY -static void damon_verify_nr_regions(struct damon_target *t) -{ - struct damon_region *r; - unsigned int count = 0; - - damon_for_each_region(r, t) - count++; - WARN_ONCE(count != t->nr_regions, "t->nr_regions (%u) != count (%u)\n", - t->nr_regions, count); -} -#else -static void damon_verify_nr_regions(struct damon_target *t) -{ -} -#endif - unsigned int damon_nr_regions(struct damon_target *t) { - damon_verify_nr_regions(t); - return t->nr_regions; } From 2ceda82a15c1bc8c6b1f1915743e938703b57e4c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 22 May 2026 08:40:21 -0700 Subject: [PATCH 235/321] mm/damon/tests/core-kunit: add damon_set_regions() test cases damon_set_regions() is one of the main DAMON kernel API functions that set up the monitoring target memory region boundaries. Implement unit tests for verifying its basic functionalities.
Link: https://lore.kernel.org/20260522154026.80546-11-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 142 ++++++++++++++++++++++++++++++------ 1 file changed, 120 insertions(+), 22 deletions(-) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 866f716e5760..1cfb8c176b87 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -390,41 +390,139 @@ static void damon_test_ops_registration(struct kunit *test) } } -static void damon_test_set_regions(struct kunit *test) +static void damon_test_set_regions_for(struct kunit *test, + struct damon_addr_range *old_ranges, int sz_old_ranges, + struct damon_addr_range *new_ranges, int sz_new_ranges, + unsigned long min_region_sz, + struct damon_addr_range *expect_ranges, int sz_expect_ranges) { - struct damon_target *t = damon_new_target(); - struct damon_region *r1, *r2; - struct damon_addr_range range = {.start = 8, .end = 28}; - unsigned long expects[] = {8, 16, 16, 24, 24, 28}; - int expect_idx = 0; + struct damon_target *t; struct damon_region *r; + int i; + t = damon_new_target(); if (!t) kunit_skip(test, "target alloc fail"); - r1 = damon_new_region(4, 16); - if (!r1) { - damon_free_target(t); - kunit_skip(test, "region alloc fail"); - } - r2 = damon_new_region(24, 32); - if (!r2) { - damon_free_target(t); - damon_free_region(r1); - kunit_skip(test, "second region alloc fail"); + for (i = 0; i < sz_old_ranges; i++) { + r = damon_new_region(old_ranges[i].start, old_ranges[i].end); + if (!r) { + damon_destroy_target(t, NULL); + kunit_skip(test, "%d-th r alloc fail\n", i); + } + damon_add_region(r, t); } - damon_add_region(r1, t); - damon_add_region(r2, t); - damon_set_regions(t, &range, 1, 1); + damon_set_regions(t, new_ranges, sz_new_ranges, min_region_sz); - KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 3); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), sz_expect_ranges); + if 
(damon_nr_regions(t) != sz_expect_ranges) { + damon_destroy_target(t, NULL); + return; + } + i = 0; damon_for_each_region(r, t) { - KUNIT_EXPECT_EQ(test, r->ar.start, expects[expect_idx++]); - KUNIT_EXPECT_EQ(test, r->ar.end, expects[expect_idx++]); + KUNIT_EXPECT_EQ(test, r->ar.start, expect_ranges[i].start); + KUNIT_EXPECT_EQ(test, r->ar.end, expect_ranges[i++].end); } + damon_destroy_target(t, NULL); } +static void damon_test_set_regions(struct kunit *test) +{ + /* Initial build up on empty target. */ + damon_test_set_regions_for(test, + (struct damon_addr_range[]){}, 0, + (struct damon_addr_range[]){ + {.start = 5, .end = 15}, + {.start = 15, .end = 25}, + }, 2, + 1, + (struct damon_addr_range[]){ + {.start = 5, .end = 15}, + {.start = 15, .end = 25}, + }, 2); + /* Un-intersecting regions should be removed. */ + damon_test_set_regions_for(test, + (struct damon_addr_range[]){ + {.start = 4, .end = 16}, + {.start = 24, .end = 32}, + }, 2, + (struct damon_addr_range[]){ + {.start = 18, .end = 23}, + }, 1, + 1, + (struct damon_addr_range[]){ + {.start = 18, .end = 23}, + }, 1); + /* + * Holes should be filled up with new regions. + * + * old: [4, 16) [24, 32) + * new: [8, 28) + * expect: [8, 16)[16,24),[24, 28) + */ + damon_test_set_regions_for(test, + (struct damon_addr_range[]){ + {.start = 4, .end = 16}, + {.start = 24, .end = 32}, + }, 2, + (struct damon_addr_range[]){ + {.start = 8, .end = 28}, + }, 1, + 1, + (struct damon_addr_range[]){ + {.start = 8, .end = 16}, + {.start = 16, .end = 24}, + {.start = 24, .end = 28}, + }, 3); + /* + * New regions should be able to be appended. 
+ * + * old: [0, 4)[4, 17) + * new: [0, 15) [25, 40) + * expect: [0, 4)[4, 15) [25, 40) + */ + damon_test_set_regions_for(test, + (struct damon_addr_range[]){ + {.start = 0, .end = 4}, + {.start = 4, .end = 17}, + }, 2, + (struct damon_addr_range[]){ + {.start = 0, .end = 15}, + {.start = 25, .end = 40}, + }, 2, + 1, + (struct damon_addr_range[]){ + {.start = 0, .end = 4}, + {.start = 4, .end = 15}, + {.start = 25, .end = 40}, + }, 3); + /* + * New regions should be able to be inserted. + * + * old: [0, 4) [42, 52) + * new: [0, 15) [25, 40) [44, 50) + * expect: [0, 15) [25, 40) [44, 50) + */ + damon_test_set_regions_for(test, + (struct damon_addr_range[]){ + {.start = 0, .end = 4}, + {.start = 42, .end = 52}, + }, 2, + (struct damon_addr_range[]){ + {.start = 0, .end = 15}, + {.start = 25, .end = 40}, + {.start = 44, .end = 50}, + }, 3, + 1, + (struct damon_addr_range[]){ + {.start = 0, .end = 15}, + {.start = 25, .end = 40}, + {.start = 44, .end = 50}, + }, 3); +} + static void damon_test_nr_accesses_to_accesses_bp(struct kunit *test) { struct damon_attrs attrs = { From ae819edb97012b29db0a78f314d80d20959d77c9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 22 May 2026 08:40:22 -0700 Subject: [PATCH 236/321] selftests/damon/sysfs.py: stop kdamonds before failing When an assertion is failed, sysfs.py DAMON selftest immediately exits the test program leaving the DAMON running behind. Many of the following tests need to start DAMON on their own. But because DAMON that was started by sysfs.py is still running, those start attempts fail, and the tests are failed or skipped. Update sysfs.py to stop DAMON before exiting the test program due to the assertion failure. 
Link: https://lore.kernel.org/20260522154026.80546-12-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/damon/sysfs.py b/tools/testing/selftests/damon/sysfs.py index cd4d82c85211..aa03a1187489 100755 --- a/tools/testing/selftests/damon/sysfs.py +++ b/tools/testing/selftests/damon/sysfs.py @@ -24,9 +24,12 @@ def dump_damon_status_dict(pid): except Exception as e: return None, 'json.load fail (%s)' % e +kdamonds = None def fail(expectation, status): print('unexpected %s' % expectation) print(json.dumps(status, indent=4)) + if kdamonds is not None: + kdamonds.stop() exit(1) def assert_true(condition, expectation, status): @@ -248,6 +251,7 @@ def assert_ctxs_committed(kdamonds): ctx.pause = False def main(): + global kdamonds kdamonds = _damon_sysfs.Kdamonds( [_damon_sysfs.Kdamond( contexts=[_damon_sysfs.DamonCtx( From b6404e44aac2e51c552691d8861c7686be762d42 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 22 May 2026 08:40:23 -0700 Subject: [PATCH 237/321] selftests/damon/sysfs.sh: test monitoring intervals goal dir sysfs.sh DAMON selftest is not testing monitoring intervals goal directory. Add the test. 
Link: https://lore.kernel.org/20260522154026.80546-13-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh index 1ac3e2ce8e44..b3418214ed35 100755 --- a/tools/testing/selftests/damon/sysfs.sh +++ b/tools/testing/selftests/damon/sysfs.sh @@ -282,6 +282,17 @@ test_targets() ensure_dir "$targets_dir/1" "not_exist" } + +test_intervals_goal() +{ + goal_dir=$1 + ensure_dir "$goal_dir" "exist" + ensure_file "$goal_dir/access_bp" "exist" "600" + ensure_file "$goal_dir/aggrs" "exist" "600" + ensure_file "$goal_dir/min_sample_us" "exist" "600" + ensure_file "$goal_dir/max_sample_us" "exist" "600" +} + test_intervals() { intervals_dir=$1 @@ -289,6 +300,7 @@ test_intervals() ensure_file "$intervals_dir/aggr_us" "exist" "600" ensure_file "$intervals_dir/sample_us" "exist" "600" ensure_file "$intervals_dir/update_us" "exist" "600" + test_intervals_goal "$intervals_dir/intervals_goal" } test_damon_filter() From a8f30ccf23f520bb071657ece5dcf534fda8e53b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 22 May 2026 08:40:24 -0700 Subject: [PATCH 238/321] selftests/damon/sysfs.sh: test addr_unit file existence sysfs.sh DAMON selftest is not testing the existence of addr_unit sysfs file. Add the test. 
Link: https://lore.kernel.org/20260522154026.80546-14-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh index b3418214ed35..92b44c86818a 100755 --- a/tools/testing/selftests/damon/sysfs.sh +++ b/tools/testing/selftests/damon/sysfs.sh @@ -365,6 +365,7 @@ test_context() ensure_dir "$context_dir" "exist" ensure_file "$context_dir/avail_operations" "exit" 400 ensure_file "$context_dir/operations" "exist" 600 + ensure_file "$context_dir/addr_unit" "exist" 600 test_monitoring_attrs "$context_dir/monitoring_attrs" test_targets "$context_dir/targets" test_schemes "$context_dir/schemes" From 1f9f7e72da1b3262616b7e191db8bae8225f2435 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 22 May 2026 08:40:25 -0700 Subject: [PATCH 239/321] selftests/damon/sysfs.sh: test pause file existence sysfs.sh DAMON selftest is not testing the existence of the 'pause' sysfs file. Add the test. 
Link: https://lore.kernel.org/20260522154026.80546-15-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh index 92b44c86818a..78f4badb5beb 100755 --- a/tools/testing/selftests/damon/sysfs.sh +++ b/tools/testing/selftests/damon/sysfs.sh @@ -366,6 +366,7 @@ test_context() ensure_file "$context_dir/avail_operations" "exit" 400 ensure_file "$context_dir/operations" "exist" 600 ensure_file "$context_dir/addr_unit" "exist" 600 + ensure_file "$context_dir/pause" "exist" 600 test_monitoring_attrs "$context_dir/monitoring_attrs" test_targets "$context_dir/targets" test_schemes "$context_dir/schemes" From d63e9d829e42b29201ebbcf0a070acea8ed40b2c Mon Sep 17 00:00:00 2001 From: Maksym Shcherba Date: Thu, 21 May 2026 23:20:19 +0300 Subject: [PATCH 240/321] mm/damon: fix missing parens in macro arguments Patch series "mm/damon: fix macro arguments and clarify quota goals doc", v2. This patch (of 2): The DAMON iterator macros do not wrap their pointer arguments with parentheses. This can cause build failures when the argument is a complex expression due to operator precedence issues. 
Add missing parentheses around the arguments in the following macros to prevent potential build failures: - damon_for_each_region() - damon_for_each_region_from() - damon_for_each_region_safe() - damos_for_each_quota_goal() Link: https://lore.kernel.org/20260521202020.126500-1-maksym.shcherba@lnu.edu.ua Link: https://lore.kernel.org/20260521202020.126500-2-maksym.shcherba@lnu.edu.ua Signed-off-by: Maksym Shcherba Reviewed-by: SeongJae Park Assisted-by: Antigravity:Gemini-3.1-Pro Signed-off-by: Andrew Morton --- include/linux/damon.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 638ee65f88dc..6f7edb3590ef 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -963,13 +963,13 @@ static inline unsigned long damon_sz_region(struct damon_region *r) list_for_each_entry_safe(p, next, &(ctx)->probes, list) #define damon_for_each_region(r, t) \ - list_for_each_entry(r, &t->regions_list, list) + list_for_each_entry(r, &(t)->regions_list, list) #define damon_for_each_region_from(r, t) \ - list_for_each_entry_from(r, &t->regions_list, list) + list_for_each_entry_from(r, &(t)->regions_list, list) #define damon_for_each_region_safe(r, next, t) \ - list_for_each_entry_safe(r, next, &t->regions_list, list) + list_for_each_entry_safe(r, next, &(t)->regions_list, list) #define damon_for_each_target(t, ctx) \ list_for_each_entry(t, &(ctx)->adaptive_targets, list) @@ -984,7 +984,7 @@ static inline unsigned long damon_sz_region(struct damon_region *r) list_for_each_entry_safe(s, next, &(ctx)->schemes, list) #define damos_for_each_quota_goal(goal, quota) \ - list_for_each_entry(goal, &quota->goals, list) + list_for_each_entry(goal, &(quota)->goals, list) #define damos_for_each_quota_goal_safe(goal, next, quota) \ list_for_each_entry_safe(goal, next, &(quota)->goals, list) From 83b25befc1ab1f6e331691c4caf41f919fd082d2 Mon Sep 17 00:00:00 2001 From: Maksym Shcherba Date: Thu, 21 May 2026 23:20:20 +0300
Subject: [PATCH 241/321] Docs/admin-guide/mm/damon/usage: clarify current_value of quota goals The sysfs interface for DAMON quota goals includes a `current_value` file. This file is not updated by the kernel and only serves to receive user input. Clarify in the documentation that the kernel does not update `current_value`, and that reading it only has meaning when `target_metric` is set to `user_input`. While at it, fix missing commas in the goal files list. Link: https://lore.kernel.org/20260521202020.126500-3-maksym.shcherba@lnu.edu.ua Signed-off-by: Maksym Shcherba Reviewed-by: SeongJae Park Assisted-by: Antigravity:Gemini-3.1-Pro Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index d46875e603d8..011296f1e7c2 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -474,10 +474,12 @@ to ``N-1``. Each directory represents each goal and current achievement. Among the multiple feedback, the best one is used. Each goal directory contains five files, namely ``target_metric``, -``target_value``, ``current_value`` ``nid`` and ``path``. Users can set and +``target_value``, ``current_value``, ``nid``, and ``path``. Users can set and get the five parameters for the quota auto-tuning goals that specified on the :ref:`design doc ` by writing to and -reading from each of the files. Note that users should further write +reading from each of the files. Because the kernel does not update +``current_value``, reading it only makes sense when ``target_metric`` is +``user_input``. Note that users should further write ``commit_schemes_quota_goals`` to the ``state`` file of the :ref:`kdamond directory ` to pass the feedback to DAMON. 
From 7e6cc35f5283eab81a14231a64ecd640b690c48c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 20 May 2026 08:03:10 -0700 Subject: [PATCH 242/321] mm/damon/core: trace esz at first setup DAMON traces effective size quota from the second update, only if a change has been made by the update. Tracing only changed updates was an intentional decision to avoid unnecessary same value tracing. Always skipping the first value is just an unintended mistake. The mistake makes the tracepoint based investigation incomplete, because the first effective size quota is never traced. It is not a big issue when the 'consist' quota tuner is used, because it keeps changing the quota in the usual setup. However, when the 'temporal' tuner is used, the quota value is not changed before the goal achievement status is completely changed. For example, if the DAMOS scheme is started with an under-achieved goal, the quota is set to the maximum value, and kept the same value until the goal is achieved. Because DAMON skips the first value, the user cannot know what effective quota the current scheme is using. Only after the goal is achieved, the effective quota is changed to zero, and traced. Unconditionally trace the initial quota value to fix this problem. Note that the 'temporal' quota tuner was introduced by commit af738a6a00c1 ("mm/damon/core: introduce DAMOS_QUOTA_GOAL_TUNER_TEMPORAL"), which was added to 7.1-rc1. But even with the 'consist' quota tuner, the tracing is unintentionally incomplete. Hence this commit marks the introduction of the trace event as the broken commit. 
Link: https://lore.kernel.org/20260520150311.80925-1-sj@kernel.org Fixes: a86d695193bf ("mm/damon: add trace event for effective size quota") Signed-off-by: SeongJae Park Cc: # 6.17.x Signed-off-by: Andrew Morton --- mm/damon/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index b33920873871..265d51ade25b 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2899,6 +2899,8 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) if (!quota->total_charged_sz && !quota->charged_from) { quota->charged_from = jiffies; damos_set_effective_quota(c, s); + if (trace_damos_esz_enabled()) + damos_trace_esz(c, s, quota); } /* New charge window starts */ From 7c2ebe0fe06e84a5a1fcbc358111735080bdb141 Mon Sep 17 00:00:00 2001 From: Wang Wensheng Date: Sun, 24 May 2026 11:10:53 +0800 Subject: [PATCH 243/321] kasan/test: only do kmalloc_double_kzfree for generic mode kmalloc_double_kzfree() would corrupt kernel memory when the just-freed memory was allocated by another thread before the second call to kfree_sensitive() and the new allocation tag happened to match the old one. This could not happen in GENERIC mode as it uses quarantine.
Link: https://lore.kernel.org/20260524031053.381776-1-wsw9603@163.com Signed-off-by: Wang Wensheng Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/kasan_test_c.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index 32d06cbf6a31..3f4ed29178b3 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -874,6 +874,16 @@ static void kmalloc_double_kzfree(struct kunit *test) char *ptr; size_t size = 16; + /* + * With the tag-based KASAN modes, if the memory happens to be + * reallocated between the two frees and the new allocation tag happens + * to match the old one, the second free will cause a memory corruption. + * Resolving https://bugzilla.kernel.org/show_bug.cgi?id=212177 would + * help to deal with this. With Generic KASAN, it's effectively + * impossible for the memory to get reallocated due to the quarantine. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); + ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); From 6cbdd9726fb50d749b06ab45a8ef81dff02e69b8 Mon Sep 17 00:00:00 2001 From: "Barry Song (Xiaomi)" Date: Tue, 26 May 2026 21:09:38 +0800 Subject: [PATCH 244/321] mm/mglru: use folio_mark_accessed to replace folio_set_active MGLRU gives high priority to folios mapped in page tables. As a result, folio_set_active() is invoked for all folios read during page faults. In practice, however, readahead can bring in many folios that are never accessed via page tables. A previous attempt by Lei Liu proposed introducing a separate LRU for readahead[1] to make readahead pages easier to reclaim, but that approach is likely over-engineered. Before commit 4d5d14a01e2c ("mm/mglru: rework workingset protection"), folios with PG_active were always placed in the youngest generation, leading to over-protection and increased refaults. 
After that commit, PG_active folios are placed in the second youngest generation, which is still too optimistic given the presence of readahead. In contrast, the classic active/inactive scheme is more conservative. This patch switches to using folio_mark_accessed() and begins prefaulted file folios from the second oldest generation instead of active generations. We should also adjust the following accordingly: - WORKINGSET_ACTIVATE: aligned with setting active for refaulted workingset folios; - lru_gen_folio_seq(): place (pre)faulted file folios into the second oldest generation; - promote second-scanned folios to workingset in folio_check_references(): we now have to depend on folio_lru_refs() > 1, since we previously relied on PG_referenced being set during the first scan, but PG_referenced is now set earlier. On x86, running a kernel build inside a memcg with a 1GB memory limit using 20 threads. w/o patch: real 1m50.764s user 25m32.305s sys 4m0.012s pswpin: 1333245 pswpout: 4366443 pgpgin: 6962592 pgpgout: 17780712 swpout_zero: 1019603 swpin_zero: 14764 refault_file: 287794 refault_anon: 1347963 w/ patch: real 1m48.879s user 25m29.224s sys 3m37.421s pswpin: 568480 pswpout: 2322657 pgpgin: 4073416 pgpgout: 9613408 swpout_zero: 593275 swpin_zero: 9118 refault_file: 262505 refault_anon: 577550 active/inactive LRU: real 1m49.928s user 25m28.196s sys 3m40.740s pswpin: 463452 pswpout: 2309119 pgpgin: 4438856 pgpgout: 9568628 swpout_zero: 743704 swpin_zero: 7244 refault_file: 562555 refault_anon: 470694 Lance and Xueyuan made a huge contribution to this patch through testing. 
Link: https://lore.kernel.org/20260526130938.66253-1-baohua@kernel.org Link: https://lore.kernel.org/linux-mm/20250916072226.220426-1-liulei.rjpt@vivo.com/ [1] Signed-off-by: Barry Song (Xiaomi) Tested-by: Lance Yang Tested-by: Xueyuan Chen Cc: Pedro Falcato Cc: Kairui Song Cc: Qi Zheng Cc: Shakeel Butt Cc: wangzicheng Cc: Suren Baghdasaryan Cc: Lei Liu Cc: Matthew Wilcox (Oracle) Cc: Axel Rasmussen Cc: Yuanchu Xie Cc: Wei Xu Cc: Will Deacon Cc: Kalesh Singh Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 2 +- mm/swap.c | 16 +++++++++++++--- mm/vmscan.c | 6 +++++- mm/workingset.c | 10 ++++++---- 4 files changed, 25 insertions(+), 9 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index a171070e15f0..a8430a7ae054 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -247,7 +247,7 @@ static inline unsigned long lru_gen_folio_seq(const struct lruvec *lruvec, (folio_test_dirty(folio) || folio_test_writeback(folio)))) gen = MIN_NR_GENS; else - gen = MAX_NR_GENS - folio_test_workingset(folio); + gen = MAX_NR_GENS - (folio_test_workingset(folio) || folio_test_referenced(folio)); return max(READ_ONCE(lrugen->max_seq) - gen + 1, READ_ONCE(lrugen->min_seq[type])); } diff --git a/mm/swap.c b/mm/swap.c index 2dd84813f4dd..588f50d8f1a8 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -544,10 +544,20 @@ void folio_add_lru(struct folio *folio) folio_test_unevictable(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); - /* see the comment in lru_gen_folio_seq() */ + /* + * For refaulted workingset folios, set PG_active so they + * can be added to active generations. + * For prefaulted file folios, folio_mark_accessed() sets + * PG_referenced so lru_gen_folio_seq() places them into + * the second oldest generation. 
+ */ if (lru_gen_enabled() && !folio_test_unevictable(folio) && - lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) - folio_set_active(folio); + lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) { + if (folio_test_workingset(folio)) + folio_set_active(folio); + else if (!folio_test_referenced(folio)) + folio_mark_accessed(folio); + } folio_batch_add_and_move(folio, lru_add); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 3c856a78c0a5..76193a84a2af 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -850,7 +850,11 @@ static bool lru_gen_set_refs(struct folio *folio) return false; } - set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_workingset)); + /* Promote on second access */ + if (folio_lru_refs(folio) > 1) + set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_workingset)); + else + folio_mark_accessed(folio); return true; } #else diff --git a/mm/workingset.c b/mm/workingset.c index 07e6836d0502..f351798e723a 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -319,11 +319,13 @@ static void lru_gen_refault(struct folio *folio, void *shadow) atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); - /* see folio_add_lru() where folio_set_active() will be called */ - if (lru_gen_in_fault()) - mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); - if (workingset) { + /* + * see folio_add_lru(), where folio_set_active() is + * called for workingset folios + */ + if (lru_gen_in_fault()) + mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); folio_set_workingset(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); } else From 62f272d2fbffa7494e4d01c35a3a7b30d71b30a1 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Tue, 26 May 2026 11:28:36 +0000 Subject: [PATCH 245/321] mm/page_alloc: remove VM_BUG_ON()s from pindex helpers Vlastimil pointed out that the VM_BUG_ON()s have fallen out of favour, so remove them. 
Link: https://lore.kernel.org/20260526-page_alloc-unmapped-prep-v2-1-412f4d486115@google.com Signed-off-by: Brendan Jackman Suggested-by: Vlastimil Babka (SUSE) Link: https://lore.kernel.org/all/4074a816-9e75-45a6-8141-25459bcc106b@kernel.org/ Reviewed-by: Vlastimil Babka (SUSE) Cc: Johannes Weiner Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7e3c79e79e5b..97cb95820592 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -653,13 +653,8 @@ static inline unsigned int order_to_pindex(int migratetype, int order) if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { bool movable = migratetype == MIGRATE_MOVABLE; - if (order > PAGE_ALLOC_COSTLY_ORDER) { - VM_BUG_ON(!is_pmd_order(order)); - + if (order > PAGE_ALLOC_COSTLY_ORDER) return NR_LOWORDER_PCP_LISTS + movable; - } - } else { - VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); } return (MIGRATE_PCPTYPES * order) + migratetype; @@ -672,8 +667,6 @@ static inline int pindex_to_order(unsigned int pindex) if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (pindex >= NR_LOWORDER_PCP_LISTS) order = HPAGE_PMD_ORDER; - } else { - VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); } return order; From 7bc5e747bba29d5b77b2278e8c696eea0c796706 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sat, 23 May 2026 20:37:58 +0300 Subject: [PATCH 246/321] userfaultfd: merge fs/userfaultfd.c into mm/userfaultfd.c Patch series "userfaultfd: merge fs/userfaultfd.c into mm/userfaultfd.c", v3. These patches merge fs/userfaultfd.c into mm/userfaultfd.c and make functions used only inside mm/userfaultfd.c static. This patch (of 2): Historically userfaultfd implementation has been split between fs/userfaultfd.c and mm/userfaultfd.c. 
The mm/ part implemented memory management operations, while the fs/ part implemented file descriptor handling and called into the mm/ part for the actual memory management work. This separation is quite artificial and fs/userfaultfd.c does not seem to belong to fs/ because it's only a user of vfs APIs and, like for other such users, for example, memfd and secretmem, the file descriptor handling could live in mm/ as well. "Append" fs/userfaultfd.c to mm/userfaultfd.c and update fs/Makefile and MAINTAINERS accordingly. No intended functional changes. Link: https://lore.kernel.org/20260523173759.3964908-1-rppt@kernel.org Link: https://lore.kernel.org/20260523173759.3964908-2-rppt@kernel.org Assisted-by: Copilot:claude-opus-4-6 Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Christian Brauner (Amutable) Cc: Al Viro Cc: David Hildenbrand Cc: Jan Kara Cc: "Kirill A. Shutemov" Cc: Peter Xu Signed-off-by: Andrew Morton --- MAINTAINERS | 1 - fs/Makefile | 1 - fs/userfaultfd.c | 2233 ---------------------------------------------- mm/userfaultfd.c | 2215 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 2215 insertions(+), 2235 deletions(-) delete mode 100644 fs/userfaultfd.c diff --git a/MAINTAINERS b/MAINTAINERS index 1e94e8cc6ad1..48c2265f00a9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17085,7 +17085,6 @@ R: Peter Xu L: linux-mm@kvack.org S: Maintained F: Documentation/admin-guide/mm/userfaultfd.rst -F: fs/userfaultfd.c F: include/asm-generic/pgtable_uffd.h F: include/linux/userfaultfd_k.h F: include/uapi/linux/userfaultfd.h diff --git a/fs/Makefile b/fs/Makefile index ae1b07f9c6a0..89a8a9d207d1 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -27,7 +27,6 @@ obj-y += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o -obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_AIO) += aio.o obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FS_ENCRYPTION) += crypto/ diff --git a/fs/userfaultfd.c
b/fs/userfaultfd.c deleted file mode 100644 index 390e4b7d9cb9..000000000000 --- a/fs/userfaultfd.c +++ /dev/null @@ -1,2233 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * fs/userfaultfd.c - * - * Copyright (C) 2007 Davide Libenzi - * Copyright (C) 2008-2009 Red Hat, Inc. - * Copyright (C) 2015 Red Hat, Inc. - * - * Some part derived from fs/eventfd.c (anon inode setup) and - * mm/ksm.c (mm hashing). - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int sysctl_unprivileged_userfaultfd __read_mostly; - -#ifdef CONFIG_SYSCTL -static const struct ctl_table vm_userfaultfd_table[] = { - { - .procname = "unprivileged_userfaultfd", - .data = &sysctl_unprivileged_userfaultfd, - .maxlen = sizeof(sysctl_unprivileged_userfaultfd), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -}; -#endif - -static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init; - -struct userfaultfd_fork_ctx { - struct userfaultfd_ctx *orig; - struct userfaultfd_ctx *new; - struct list_head list; -}; - -struct userfaultfd_unmap_ctx { - struct userfaultfd_ctx *ctx; - unsigned long start; - unsigned long end; - struct list_head list; -}; - -struct userfaultfd_wait_queue { - struct uffd_msg msg; - wait_queue_entry_t wq; - struct userfaultfd_ctx *ctx; - bool waken; -}; - -struct userfaultfd_wake_range { - unsigned long start; - unsigned long len; -}; - -/* internal indication that UFFD_API ioctl was successfully executed */ -#define UFFD_FEATURE_INITIALIZED (1u << 31) - -static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) -{ - return ctx->features & UFFD_FEATURE_INITIALIZED; -} - -static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx) -{ - return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC); -} - -/* - * Whether 
WP_UNPOPULATED is enabled on the uffd context. It is only - * meaningful when userfaultfd_wp()==true on the vma and when it's - * anonymous. - */ -bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) -{ - struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; - - if (!ctx) - return false; - - return ctx->features & UFFD_FEATURE_WP_UNPOPULATED; -} - -static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, - int wake_flags, void *key) -{ - struct userfaultfd_wake_range *range = key; - int ret; - struct userfaultfd_wait_queue *uwq; - unsigned long start, len; - - uwq = container_of(wq, struct userfaultfd_wait_queue, wq); - ret = 0; - /* len == 0 means wake all */ - start = range->start; - len = range->len; - if (len && (start > uwq->msg.arg.pagefault.address || - start + len <= uwq->msg.arg.pagefault.address)) - goto out; - WRITE_ONCE(uwq->waken, true); - /* - * The Program-Order guarantees provided by the scheduler - * ensure uwq->waken is visible before the task is woken. - */ - ret = wake_up_state(wq->private, mode); - if (ret) { - /* - * Wake only once, autoremove behavior. - * - * After the effect of list_del_init is visible to the other - * CPUs, the waitqueue may disappear from under us, see the - * !list_empty_careful() in handle_userfault(). - * - * try_to_wake_up() has an implicit smp_mb(), and the - * wq->private is read before calling the extern function - * "wake_up_state" (which in turns calls try_to_wake_up). - */ - list_del_init(&wq->entry); - } -out: - return ret; -} - -/** - * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd - * context. - * @ctx: [in] Pointer to the userfaultfd context. - */ -static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx) -{ - refcount_inc(&ctx->refcount); -} - -/** - * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd - * context. - * @ctx: [in] Pointer to userfaultfd context. 
- * - * The userfaultfd context reference must have been previously acquired either - * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget(). - */ -static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) -{ - if (refcount_dec_and_test(&ctx->refcount)) { - VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_pending_wqh.lock)); - VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_pending_wqh)); - VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_wqh.lock)); - VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_wqh)); - VM_WARN_ON_ONCE(spin_is_locked(&ctx->event_wqh.lock)); - VM_WARN_ON_ONCE(waitqueue_active(&ctx->event_wqh)); - VM_WARN_ON_ONCE(spin_is_locked(&ctx->fd_wqh.lock)); - VM_WARN_ON_ONCE(waitqueue_active(&ctx->fd_wqh)); - mmdrop(ctx->mm); - kmem_cache_free(userfaultfd_ctx_cachep, ctx); - } -} - -static inline void msg_init(struct uffd_msg *msg) -{ - BUILD_BUG_ON(sizeof(struct uffd_msg) != 32); - /* - * Must use memset to zero out the paddings or kernel data is - * leaked to userland. - */ - memset(msg, 0, sizeof(struct uffd_msg)); -} - -static inline struct uffd_msg userfault_msg(unsigned long address, - unsigned long real_address, - unsigned int flags, - unsigned long reason, - unsigned int features) -{ - struct uffd_msg msg; - - msg_init(&msg); - msg.event = UFFD_EVENT_PAGEFAULT; - - msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ? - real_address : address; - - /* - * These flags indicate why the userfault occurred: - * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault. - * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault. - * - Neither of these flags being set indicates a MISSING fault. - * - * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write - * fault. Otherwise, it was a read fault. 
- */ - if (flags & FAULT_FLAG_WRITE) - msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE; - if (reason & VM_UFFD_WP) - msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; - if (reason & VM_UFFD_MINOR) - msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR; - if (features & UFFD_FEATURE_THREAD_ID) - msg.arg.pagefault.feat.ptid = task_pid_vnr(current); - return msg; -} - -#ifdef CONFIG_HUGETLB_PAGE -/* - * Same functionality as userfaultfd_must_wait below with modifications for - * hugepmd ranges. - */ -static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, - struct vm_fault *vmf, - unsigned long reason) -{ - struct vm_area_struct *vma = vmf->vma; - pte_t *ptep, pte; - - assert_fault_locked(vmf); - - ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma)); - if (!ptep) - return true; - - pte = huge_ptep_get(vma->vm_mm, vmf->address, ptep); - - /* - * Lockless access: we're in a wait_event so it's ok if it - * changes under us. - */ - - /* Entry is still missing, wait for userspace to resolve the fault. */ - if (huge_pte_none(pte)) - return true; - /* UFFD PTE markers require userspace to resolve the fault. */ - if (pte_is_uffd_marker(pte)) - return true; - /* - * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to - * resolve the fault. - */ - if (!huge_pte_write(pte) && (reason & VM_UFFD_WP)) - return true; - - return false; -} -#else -static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, - struct vm_fault *vmf, - unsigned long reason) -{ - /* Should never get here. */ - VM_WARN_ON_ONCE(1); - return false; -} -#endif /* CONFIG_HUGETLB_PAGE */ - -/* - * Verify the pagetables are still not ok after having registered into - * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any - * userfault that has already been resolved, if userfaultfd_read_iter and - * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different - * threads. 
- */ -static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, - struct vm_fault *vmf, - unsigned long reason) -{ - struct mm_struct *mm = ctx->mm; - unsigned long address = vmf->address; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd, _pmd; - pte_t *pte; - pte_t ptent; - bool ret; - - assert_fault_locked(vmf); - - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - return true; - p4d = p4d_offset(pgd, address); - if (!p4d_present(*p4d)) - return true; - pud = pud_offset(p4d, address); - if (!pud_present(*pud)) - return true; - pmd = pmd_offset(pud, address); -again: - _pmd = pmdp_get_lockless(pmd); - if (pmd_none(_pmd)) - return true; - - /* - * A race could arise which would result in a softleaf entry such as - * migration entry unexpectedly being present in the PMD, so explicitly - * check for this and bail out if so. - */ - if (!pmd_present(_pmd)) - return false; - - if (pmd_trans_huge(_pmd)) - return !pmd_write(_pmd) && (reason & VM_UFFD_WP); - - pte = pte_offset_map(pmd, address); - if (!pte) - goto again; - - /* - * Lockless access: we're in a wait_event so it's ok if it - * changes under us. - */ - ptent = ptep_get(pte); - - ret = true; - /* Entry is still missing, wait for userspace to resolve the fault. */ - if (pte_none(ptent)) - goto out; - /* UFFD PTE markers require userspace to resolve the fault. */ - if (pte_is_uffd_marker(ptent)) - goto out; - /* - * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to - * resolve the fault. 
- */ - if (!pte_write(ptent) && (reason & VM_UFFD_WP)) - goto out; - - ret = false; -out: - pte_unmap(pte); - return ret; -} - -static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags) -{ - if (flags & FAULT_FLAG_INTERRUPTIBLE) - return TASK_INTERRUPTIBLE; - - if (flags & FAULT_FLAG_KILLABLE) - return TASK_KILLABLE; - - return TASK_UNINTERRUPTIBLE; -} - -/* - * The locking rules involved in returning VM_FAULT_RETRY depending on - * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and - * FAULT_FLAG_KILLABLE are not straightforward. The "Caution" - * recommendation in __lock_page_or_retry is not an understatement. - * - * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released - * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is - * not set. - * - * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not - * set, VM_FAULT_RETRY can still be returned if and only if there are - * fatal_signal_pending()s, and the mmap_lock must be released before - * returning it. - */ -vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) -{ - struct vm_area_struct *vma = vmf->vma; - struct mm_struct *mm = vma->vm_mm; - struct userfaultfd_ctx *ctx; - struct userfaultfd_wait_queue uwq; - vm_fault_t ret = VM_FAULT_SIGBUS; - bool must_wait; - unsigned int blocking_state; - - /* - * We don't do userfault handling for the final child pid update - * and when coredumping (faults triggered by get_dump_page()). - */ - if (current->flags & (PF_EXITING|PF_DUMPCORE)) - goto out; - - assert_fault_locked(vmf); - - ctx = vma->vm_userfaultfd_ctx.ctx; - if (!ctx) - goto out; - - VM_WARN_ON_ONCE(ctx->mm != mm); - - /* Any unrecognized flag is a bug. */ - VM_WARN_ON_ONCE(reason & ~__VM_UFFD_FLAGS); - /* 0 or > 1 flags set is a bug; we expect exactly 1. 
*/ - VM_WARN_ON_ONCE(!reason || (reason & (reason - 1))); - - if (ctx->features & UFFD_FEATURE_SIGBUS) - goto out; - if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY)) - goto out; - - /* - * Check that we can return VM_FAULT_RETRY. - * - * NOTE: it should become possible to return VM_FAULT_RETRY - * even if FAULT_FLAG_TRIED is set without leading to gup() - * -EBUSY failures, if the userfaultfd is to be extended for - * VM_UFFD_WP tracking and we intend to arm the userfault - * without first stopping userland access to the memory. For - * VM_UFFD_MISSING userfaults this is enough for now. - */ - if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) { - /* - * Validate the invariant that nowait must allow retry - * to be sure not to return SIGBUS erroneously on - * nowait invocations. - */ - VM_WARN_ON_ONCE(vmf->flags & FAULT_FLAG_RETRY_NOWAIT); -#ifdef CONFIG_DEBUG_VM - if (printk_ratelimit()) { - pr_warn("FAULT_FLAG_ALLOW_RETRY missing %x\n", - vmf->flags); - dump_stack(); - } -#endif - goto out; - } - - /* - * Handle nowait, not much to do other than tell it to retry - * and wait. - */ - ret = VM_FAULT_RETRY; - if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) - goto out; - - if (unlikely(READ_ONCE(ctx->released))) { - /* - * If a concurrent release is detected, do not return - * VM_FAULT_SIGBUS or VM_FAULT_NOPAGE, but instead always - * return VM_FAULT_RETRY with lock released proactively. - * - * If we were to return VM_FAULT_SIGBUS here, the non - * cooperative manager would be instead forced to - * always call UFFDIO_UNREGISTER before it can safely - * close the uffd, to avoid involuntary SIGBUS triggered. - * - * If we were to return VM_FAULT_NOPAGE, it would work for - * the fault path, in which the lock will be released - * later. However for GUP, faultin_page() does nothing - * special on NOPAGE, so GUP would spin retrying without - * releasing the mmap read lock, causing possible livelock. 
- * - * Here only VM_FAULT_RETRY would make sure the mmap lock - * be released immediately, so that the thread concurrently - * releasing the userfault would always make progress. - */ - release_fault_lock(vmf); - goto out; - } - - /* take the reference before dropping the mmap_lock */ - userfaultfd_ctx_get(ctx); - - init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); - uwq.wq.private = current; - uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags, - reason, ctx->features); - uwq.ctx = ctx; - uwq.waken = false; - - blocking_state = userfaultfd_get_blocking_state(vmf->flags); - - /* - * Take the vma lock now, in order to safely call - * userfaultfd_huge_must_wait() later. Since acquiring the - * (sleepable) vma lock can modify the current task state, that - * must be before explicitly calling set_current_state(). - */ - if (is_vm_hugetlb_page(vma)) - hugetlb_vma_lock_read(vma); - - spin_lock_irq(&ctx->fault_pending_wqh.lock); - /* - * After the __add_wait_queue the uwq is visible to userland - * through poll/read(). - */ - __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq); - /* - * The smp_mb() after __set_current_state prevents the reads - * following the spin_unlock to happen before the list_add in - * __add_wait_queue. - */ - set_current_state(blocking_state); - spin_unlock_irq(&ctx->fault_pending_wqh.lock); - - if (is_vm_hugetlb_page(vma)) { - must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason); - hugetlb_vma_unlock_read(vma); - } else { - must_wait = userfaultfd_must_wait(ctx, vmf, reason); - } - - release_fault_lock(vmf); - - if (likely(must_wait && !READ_ONCE(ctx->released))) { - wake_up_poll(&ctx->fd_wqh, EPOLLIN); - schedule(); - } - - __set_current_state(TASK_RUNNING); - - /* - * Here we race with the list_del; list_add in - * userfaultfd_ctx_read(), however because we don't ever run - * list_del_init() to refile across the two lists, the prev - * and next pointers will never point to self. 
list_add also - * would never let any of the two pointers to point to - * self. So list_empty_careful won't risk to see both pointers - * pointing to self at any time during the list refile. The - * only case where list_del_init() is called is the full - * removal in the wake function and there we don't re-list_add - * and it's fine not to block on the spinlock. The uwq on this - * kernel stack can be released after the list_del_init. - */ - if (!list_empty_careful(&uwq.wq.entry)) { - spin_lock_irq(&ctx->fault_pending_wqh.lock); - /* - * No need of list_del_init(), the uwq on the stack - * will be freed shortly anyway. - */ - list_del(&uwq.wq.entry); - spin_unlock_irq(&ctx->fault_pending_wqh.lock); - } - - /* - * ctx may go away after this if the userfault pseudo fd is - * already released. - */ - userfaultfd_ctx_put(ctx); - -out: - return ret; -} - -static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, - struct userfaultfd_wait_queue *ewq) -{ - struct userfaultfd_ctx *release_new_ctx; - - if (WARN_ON_ONCE(current->flags & PF_EXITING)) - goto out; - - ewq->ctx = ctx; - init_waitqueue_entry(&ewq->wq, current); - release_new_ctx = NULL; - - spin_lock_irq(&ctx->event_wqh.lock); - /* - * After the __add_wait_queue the uwq is visible to userland - * through poll/read(). - */ - __add_wait_queue(&ctx->event_wqh, &ewq->wq); - for (;;) { - set_current_state(TASK_KILLABLE); - if (ewq->msg.event == 0) - break; - if (READ_ONCE(ctx->released) || - fatal_signal_pending(current)) { - /* - * &ewq->wq may be queued in fork_event, but - * __remove_wait_queue ignores the head - * parameter. It would be a problem if it - * didn't. 
- */ - __remove_wait_queue(&ctx->event_wqh, &ewq->wq); - if (ewq->msg.event == UFFD_EVENT_FORK) { - struct userfaultfd_ctx *new; - - new = (struct userfaultfd_ctx *) - (unsigned long) - ewq->msg.arg.reserved.reserved1; - release_new_ctx = new; - } - break; - } - - spin_unlock_irq(&ctx->event_wqh.lock); - - wake_up_poll(&ctx->fd_wqh, EPOLLIN); - schedule(); - - spin_lock_irq(&ctx->event_wqh.lock); - } - __set_current_state(TASK_RUNNING); - spin_unlock_irq(&ctx->event_wqh.lock); - - if (release_new_ctx) { - userfaultfd_release_new(release_new_ctx); - userfaultfd_ctx_put(release_new_ctx); - } - - /* - * ctx may go away after this if the userfault pseudo fd is - * already released. - */ -out: - atomic_dec(&ctx->mmap_changing); - VM_WARN_ON_ONCE(atomic_read(&ctx->mmap_changing) < 0); - userfaultfd_ctx_put(ctx); -} - -static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx, - struct userfaultfd_wait_queue *ewq) -{ - ewq->msg.event = 0; - wake_up_locked(&ctx->event_wqh); - __remove_wait_queue(&ctx->event_wqh, &ewq->wq); -} - -int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) -{ - struct userfaultfd_ctx *ctx = NULL, *octx; - struct userfaultfd_fork_ctx *fctx; - - octx = vma->vm_userfaultfd_ctx.ctx; - if (!octx) - return 0; - - if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) { - userfaultfd_reset_ctx(vma); - return 0; - } - - list_for_each_entry(fctx, fcs, list) - if (fctx->orig == octx) { - ctx = fctx->new; - break; - } - - if (!ctx) { - fctx = kmalloc_obj(*fctx); - if (!fctx) - return -ENOMEM; - - ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); - if (!ctx) { - kfree(fctx); - return -ENOMEM; - } - - refcount_set(&ctx->refcount, 1); - ctx->flags = octx->flags; - ctx->features = octx->features; - ctx->released = false; - init_rwsem(&ctx->map_changing_lock); - atomic_set(&ctx->mmap_changing, 0); - ctx->mm = vma->vm_mm; - mmgrab(ctx->mm); - - userfaultfd_ctx_get(octx); - down_write(&octx->map_changing_lock); - 
atomic_inc(&octx->mmap_changing); - up_write(&octx->map_changing_lock); - fctx->orig = octx; - fctx->new = ctx; - list_add_tail(&fctx->list, fcs); - } - - vma->vm_userfaultfd_ctx.ctx = ctx; - return 0; -} - -static void dup_fctx(struct userfaultfd_fork_ctx *fctx) -{ - struct userfaultfd_ctx *ctx = fctx->orig; - struct userfaultfd_wait_queue ewq; - - msg_init(&ewq.msg); - - ewq.msg.event = UFFD_EVENT_FORK; - ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new; - - userfaultfd_event_wait_completion(ctx, &ewq); -} - -void dup_userfaultfd_complete(struct list_head *fcs) -{ - struct userfaultfd_fork_ctx *fctx, *n; - - list_for_each_entry_safe(fctx, n, fcs, list) { - dup_fctx(fctx); - list_del(&fctx->list); - kfree(fctx); - } -} - -void dup_userfaultfd_fail(struct list_head *fcs) -{ - struct userfaultfd_fork_ctx *fctx, *n; - - /* - * An error has occurred on fork, we will tear memory down, but have - * allocated memory for fctx's and raised reference counts for both the - * original and child contexts (and on the mm for each as a result). - * - * These would ordinarily be taken care of by a user handling the event, - * but we are no longer doing so, so manually clean up here. - * - * mm tear down will take care of cleaning up VMA contexts. 
- */ - list_for_each_entry_safe(fctx, n, fcs, list) { - struct userfaultfd_ctx *octx = fctx->orig; - struct userfaultfd_ctx *ctx = fctx->new; - - atomic_dec(&octx->mmap_changing); - VM_WARN_ON_ONCE(atomic_read(&octx->mmap_changing) < 0); - userfaultfd_ctx_put(octx); - userfaultfd_ctx_put(ctx); - - list_del(&fctx->list); - kfree(fctx); - } -} - -void mremap_userfaultfd_prep(struct vm_area_struct *vma, - struct vm_userfaultfd_ctx *vm_ctx) -{ - struct userfaultfd_ctx *ctx; - - ctx = vma->vm_userfaultfd_ctx.ctx; - - if (!ctx) - return; - - if (ctx->features & UFFD_FEATURE_EVENT_REMAP) { - vm_ctx->ctx = ctx; - userfaultfd_ctx_get(ctx); - down_write(&ctx->map_changing_lock); - atomic_inc(&ctx->mmap_changing); - up_write(&ctx->map_changing_lock); - } else { - /* Drop uffd context if remap feature not enabled */ - userfaultfd_reset_ctx(vma); - } -} - -void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, - unsigned long from, unsigned long to, - unsigned long len) -{ - struct userfaultfd_ctx *ctx = vm_ctx->ctx; - struct userfaultfd_wait_queue ewq; - - if (!ctx) - return; - - msg_init(&ewq.msg); - - ewq.msg.event = UFFD_EVENT_REMAP; - ewq.msg.arg.remap.from = from; - ewq.msg.arg.remap.to = to; - ewq.msg.arg.remap.len = len; - - userfaultfd_event_wait_completion(ctx, &ewq); -} - -void mremap_userfaultfd_fail(struct vm_userfaultfd_ctx *vm_ctx) -{ - struct userfaultfd_ctx *ctx = vm_ctx->ctx; - - if (!ctx) - return; - - atomic_dec(&ctx->mmap_changing); - VM_WARN_ON_ONCE(atomic_read(&ctx->mmap_changing) < 0); - userfaultfd_ctx_put(ctx); -} - -bool userfaultfd_remove(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - struct mm_struct *mm = vma->vm_mm; - struct userfaultfd_ctx *ctx; - struct userfaultfd_wait_queue ewq; - - ctx = vma->vm_userfaultfd_ctx.ctx; - if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE)) - return true; - - userfaultfd_ctx_get(ctx); - down_write(&ctx->map_changing_lock); - atomic_inc(&ctx->mmap_changing); - 
up_write(&ctx->map_changing_lock); - mmap_read_unlock(mm); - - msg_init(&ewq.msg); - - ewq.msg.event = UFFD_EVENT_REMOVE; - ewq.msg.arg.remove.start = start; - ewq.msg.arg.remove.end = end; - - userfaultfd_event_wait_completion(ctx, &ewq); - - return false; -} - -static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps, - unsigned long start, unsigned long end) -{ - struct userfaultfd_unmap_ctx *unmap_ctx; - - list_for_each_entry(unmap_ctx, unmaps, list) - if (unmap_ctx->ctx == ctx && unmap_ctx->start == start && - unmap_ctx->end == end) - return true; - - return false; -} - -int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, - unsigned long end, struct list_head *unmaps) -{ - struct userfaultfd_unmap_ctx *unmap_ctx; - struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; - - if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) || - has_unmap_ctx(ctx, unmaps, start, end)) - return 0; - - unmap_ctx = kzalloc_obj(*unmap_ctx); - if (!unmap_ctx) - return -ENOMEM; - - userfaultfd_ctx_get(ctx); - down_write(&ctx->map_changing_lock); - atomic_inc(&ctx->mmap_changing); - up_write(&ctx->map_changing_lock); - unmap_ctx->ctx = ctx; - unmap_ctx->start = start; - unmap_ctx->end = end; - list_add_tail(&unmap_ctx->list, unmaps); - - return 0; -} - -void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf) -{ - struct userfaultfd_unmap_ctx *ctx, *n; - struct userfaultfd_wait_queue ewq; - - list_for_each_entry_safe(ctx, n, uf, list) { - msg_init(&ewq.msg); - - ewq.msg.event = UFFD_EVENT_UNMAP; - ewq.msg.arg.remove.start = ctx->start; - ewq.msg.arg.remove.end = ctx->end; - - userfaultfd_event_wait_completion(ctx->ctx, &ewq); - - list_del(&ctx->list); - kfree(ctx); - } -} - -static int userfaultfd_release(struct inode *inode, struct file *file) -{ - struct userfaultfd_ctx *ctx = file->private_data; - struct mm_struct *mm = ctx->mm; - /* len == 0 means wake all */ - struct userfaultfd_wake_range range = { .len = 
0, }; - - WRITE_ONCE(ctx->released, true); - - userfaultfd_release_all(mm, ctx); - - /* - * After no new page faults can wait on this fault_*wqh, flush - * the last page faults that may have been already waiting on - * the fault_*wqh. - */ - spin_lock_irq(&ctx->fault_pending_wqh.lock); - __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range); - __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range); - spin_unlock_irq(&ctx->fault_pending_wqh.lock); - - /* Flush pending events that may still wait on event_wqh */ - wake_up_all(&ctx->event_wqh); - - wake_up_poll(&ctx->fd_wqh, EPOLLHUP); - userfaultfd_ctx_put(ctx); - return 0; -} - -/* fault_pending_wqh.lock must be hold by the caller */ -static inline struct userfaultfd_wait_queue *find_userfault_in( - wait_queue_head_t *wqh) -{ - wait_queue_entry_t *wq; - struct userfaultfd_wait_queue *uwq; - - lockdep_assert_held(&wqh->lock); - - uwq = NULL; - if (!waitqueue_active(wqh)) - goto out; - /* walk in reverse to provide FIFO behavior to read userfaults */ - wq = list_last_entry(&wqh->head, typeof(*wq), entry); - uwq = container_of(wq, struct userfaultfd_wait_queue, wq); -out: - return uwq; -} - -static inline struct userfaultfd_wait_queue *find_userfault( - struct userfaultfd_ctx *ctx) -{ - return find_userfault_in(&ctx->fault_pending_wqh); -} - -static inline struct userfaultfd_wait_queue *find_userfault_evt( - struct userfaultfd_ctx *ctx) -{ - return find_userfault_in(&ctx->event_wqh); -} - -static __poll_t userfaultfd_poll(struct file *file, poll_table *wait) -{ - struct userfaultfd_ctx *ctx = file->private_data; - __poll_t ret; - - poll_wait(file, &ctx->fd_wqh, wait); - - if (!userfaultfd_is_initialized(ctx)) - return EPOLLERR; - - /* - * poll() never guarantees that read won't block. - * userfaults can be waken before they're read(). 
- */ - if (unlikely(!(file->f_flags & O_NONBLOCK))) - return EPOLLERR; - /* - * lockless access to see if there are pending faults - * __pollwait last action is the add_wait_queue but - * the spin_unlock would allow the waitqueue_active to - * pass above the actual list_add inside - * add_wait_queue critical section. So use a full - * memory barrier to serialize the list_add write of - * add_wait_queue() with the waitqueue_active read - * below. - */ - ret = 0; - smp_mb(); - if (waitqueue_active(&ctx->fault_pending_wqh)) - ret = EPOLLIN; - else if (waitqueue_active(&ctx->event_wqh)) - ret = EPOLLIN; - - return ret; -} - -static const struct file_operations userfaultfd_fops; - -static int resolve_userfault_fork(struct userfaultfd_ctx *new, - struct inode *inode, - struct uffd_msg *msg) -{ - int fd; - - fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, new, - O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode); - if (fd < 0) - return fd; - - msg->arg.reserved.reserved1 = 0; - msg->arg.fork.ufd = fd; - return 0; -} - -static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, - struct uffd_msg *msg, struct inode *inode) -{ - ssize_t ret; - DECLARE_WAITQUEUE(wait, current); - struct userfaultfd_wait_queue *uwq; - /* - * Handling fork event requires sleeping operations, so - * we drop the event_wqh lock, then do these ops, then - * lock it back and wake up the waiter. While the lock is - * dropped the ewq may go away so we keep track of it - * carefully. 
- */ - LIST_HEAD(fork_event); - struct userfaultfd_ctx *fork_nctx = NULL; - - /* always take the fd_wqh lock before the fault_pending_wqh lock */ - spin_lock_irq(&ctx->fd_wqh.lock); - __add_wait_queue(&ctx->fd_wqh, &wait); - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - spin_lock(&ctx->fault_pending_wqh.lock); - uwq = find_userfault(ctx); - if (uwq) { - /* - * Use a seqcount to repeat the lockless check - * in wake_userfault() to avoid missing - * wakeups because during the refile both - * waitqueue could become empty if this is the - * only userfault. - */ - write_seqcount_begin(&ctx->refile_seq); - - /* - * The fault_pending_wqh.lock prevents the uwq - * to disappear from under us. - * - * Refile this userfault from - * fault_pending_wqh to fault_wqh, it's not - * pending anymore after we read it. - * - * Use list_del() by hand (as - * userfaultfd_wake_function also uses - * list_del_init() by hand) to be sure nobody - * changes __remove_wait_queue() to use - * list_del_init() in turn breaking the - * !list_empty_careful() check in - * handle_userfault(). The uwq->wq.head list - * must never be empty at any time during the - * refile, or the waitqueue could disappear - * from under us. The "wait_queue_head_t" - * parameter of __remove_wait_queue() is unused - * anyway. - */ - list_del(&uwq->wq.entry); - add_wait_queue(&ctx->fault_wqh, &uwq->wq); - - write_seqcount_end(&ctx->refile_seq); - - /* careful to always initialize msg if ret == 0 */ - *msg = uwq->msg; - spin_unlock(&ctx->fault_pending_wqh.lock); - ret = 0; - break; - } - spin_unlock(&ctx->fault_pending_wqh.lock); - - spin_lock(&ctx->event_wqh.lock); - uwq = find_userfault_evt(ctx); - if (uwq) { - *msg = uwq->msg; - - if (uwq->msg.event == UFFD_EVENT_FORK) { - fork_nctx = (struct userfaultfd_ctx *) - (unsigned long) - uwq->msg.arg.reserved.reserved1; - list_move(&uwq->wq.entry, &fork_event); - /* - * fork_nctx can be freed as soon as - * we drop the lock, unless we take a - * reference on it. 
- */ - userfaultfd_ctx_get(fork_nctx); - spin_unlock(&ctx->event_wqh.lock); - ret = 0; - break; - } - - userfaultfd_event_complete(ctx, uwq); - spin_unlock(&ctx->event_wqh.lock); - ret = 0; - break; - } - spin_unlock(&ctx->event_wqh.lock); - - if (signal_pending(current)) { - ret = -ERESTARTSYS; - break; - } - if (no_wait) { - ret = -EAGAIN; - break; - } - spin_unlock_irq(&ctx->fd_wqh.lock); - schedule(); - spin_lock_irq(&ctx->fd_wqh.lock); - } - __remove_wait_queue(&ctx->fd_wqh, &wait); - __set_current_state(TASK_RUNNING); - spin_unlock_irq(&ctx->fd_wqh.lock); - - if (!ret && msg->event == UFFD_EVENT_FORK) { - ret = resolve_userfault_fork(fork_nctx, inode, msg); - spin_lock_irq(&ctx->event_wqh.lock); - if (!list_empty(&fork_event)) { - /* - * The fork thread didn't abort, so we can - * drop the temporary refcount. - */ - userfaultfd_ctx_put(fork_nctx); - - uwq = list_first_entry(&fork_event, - typeof(*uwq), - wq.entry); - /* - * If fork_event list wasn't empty and in turn - * the event wasn't already released by fork - * (the event is allocated on fork kernel - * stack), put the event back to its place in - * the event_wq. fork_event head will be freed - * as soon as we return so the event cannot - * stay queued there no matter the current - * "ret" value. - */ - list_del(&uwq->wq.entry); - __add_wait_queue(&ctx->event_wqh, &uwq->wq); - - /* - * Leave the event in the waitqueue and report - * error to userland if we failed to resolve - * the userfault fork. - */ - if (likely(!ret)) - userfaultfd_event_complete(ctx, uwq); - } else { - /* - * Here the fork thread aborted and the - * refcount from the fork thread on fork_nctx - * has already been released. We still hold - * the reference we took before releasing the - * lock above. If resolve_userfault_fork - * failed we've to drop it because the - * fork_nctx has to be freed in such case. If - * it succeeded we'll hold it because the new - * uffd references it. 
- */ - if (ret) - userfaultfd_ctx_put(fork_nctx); - } - spin_unlock_irq(&ctx->event_wqh.lock); - } - - return ret; -} - -static ssize_t userfaultfd_read_iter(struct kiocb *iocb, struct iov_iter *to) -{ - struct file *file = iocb->ki_filp; - struct userfaultfd_ctx *ctx = file->private_data; - ssize_t _ret, ret = 0; - struct uffd_msg msg; - struct inode *inode = file_inode(file); - bool no_wait; - - if (!userfaultfd_is_initialized(ctx)) - return -EINVAL; - - no_wait = file->f_flags & O_NONBLOCK || iocb->ki_flags & IOCB_NOWAIT; - for (;;) { - if (iov_iter_count(to) < sizeof(msg)) - return ret ? ret : -EINVAL; - _ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode); - if (_ret < 0) - return ret ? ret : _ret; - _ret = !copy_to_iter_full(&msg, sizeof(msg), to); - if (_ret) - return ret ? ret : -EFAULT; - ret += sizeof(msg); - /* - * Allow to read more than one fault at time but only - * block if waiting for the very first one. - */ - no_wait = true; - } -} - -static void __wake_userfault(struct userfaultfd_ctx *ctx, - struct userfaultfd_wake_range *range) -{ - spin_lock_irq(&ctx->fault_pending_wqh.lock); - /* wake all in the range and autoremove */ - if (waitqueue_active(&ctx->fault_pending_wqh)) - __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, - range); - if (waitqueue_active(&ctx->fault_wqh)) - __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range); - spin_unlock_irq(&ctx->fault_pending_wqh.lock); -} - -static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, - struct userfaultfd_wake_range *range) -{ - unsigned seq; - bool need_wakeup; - - /* - * To be sure waitqueue_active() is not reordered by the CPU - * before the pagetable update, use an explicit SMP memory - * barrier here. PT lock release or mmap_read_unlock(mm) still - * have release semantics that can allow the - * waitqueue_active() to be reordered before the pte update. 
- */ - smp_mb(); - - /* - * Use waitqueue_active because it's very frequent to - * change the address space atomically even if there are no - * userfaults yet. So we take the spinlock only when we're - * sure we've userfaults to wake. - */ - do { - seq = read_seqcount_begin(&ctx->refile_seq); - need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) || - waitqueue_active(&ctx->fault_wqh); - cond_resched(); - } while (read_seqcount_retry(&ctx->refile_seq, seq)); - if (need_wakeup) - __wake_userfault(ctx, range); -} - -static __always_inline int validate_unaligned_range( - struct mm_struct *mm, __u64 start, __u64 len) -{ - __u64 task_size = mm->task_size; - - if (len & ~PAGE_MASK) - return -EINVAL; - if (!len) - return -EINVAL; - if (start >= task_size) - return -EINVAL; - if (len > task_size - start) - return -EINVAL; - if (start + len <= start) - return -EINVAL; - return 0; -} - -static __always_inline int validate_range(struct mm_struct *mm, - __u64 start, __u64 len) -{ - if (start & ~PAGE_MASK) - return -EINVAL; - - return validate_unaligned_range(mm, start, len); -} - -static int userfaultfd_register(struct userfaultfd_ctx *ctx, - unsigned long arg) -{ - struct mm_struct *mm = ctx->mm; - struct vm_area_struct *vma, *cur; - int ret; - struct uffdio_register uffdio_register; - struct uffdio_register __user *user_uffdio_register; - vm_flags_t vm_flags; - bool found; - bool basic_ioctls; - unsigned long start, end; - struct vma_iterator vmi; - bool wp_async = userfaultfd_wp_async_ctx(ctx); - - user_uffdio_register = (struct uffdio_register __user *) arg; - - ret = -EFAULT; - if (copy_from_user(&uffdio_register, user_uffdio_register, - sizeof(uffdio_register)-sizeof(__u64))) - goto out; - - ret = -EINVAL; - if (!uffdio_register.mode) - goto out; - if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES) - goto out; - vm_flags = 0; - if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) - vm_flags |= VM_UFFD_MISSING; - if (uffdio_register.mode & 
UFFDIO_REGISTER_MODE_WP) { - if (!pgtable_supports_uffd_wp()) - goto out; - - vm_flags |= VM_UFFD_WP; - } - if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) { -#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR - goto out; -#endif - vm_flags |= VM_UFFD_MINOR; - } - - ret = validate_range(mm, uffdio_register.range.start, - uffdio_register.range.len); - if (ret) - goto out; - - start = uffdio_register.range.start; - end = start + uffdio_register.range.len; - - ret = -ENOMEM; - if (!mmget_not_zero(mm)) - goto out; - - ret = -EINVAL; - mmap_write_lock(mm); - vma_iter_init(&vmi, mm, start); - vma = vma_find(&vmi, end); - if (!vma) - goto out_unlock; - - /* - * If the first vma contains huge pages, make sure start address - * is aligned to huge page size. - */ - if (is_vm_hugetlb_page(vma)) { - unsigned long vma_hpagesize = vma_kernel_pagesize(vma); - - if (start & (vma_hpagesize - 1)) - goto out_unlock; - } - - /* - * Search for not compatible vmas. - */ - found = false; - basic_ioctls = false; - cur = vma; - do { - cond_resched(); - - VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^ - !!(cur->vm_flags & __VM_UFFD_FLAGS)); - - /* check not compatible vmas */ - ret = -EINVAL; - if (!vma_can_userfault(cur, vm_flags, wp_async)) - goto out_unlock; - - /* - * UFFDIO_COPY will fill file holes even without - * PROT_WRITE. This check enforces that if this is a - * MAP_SHARED, the process has write permission to the backing - * file. If VM_MAYWRITE is set it also enforces that on a - * MAP_SHARED vma: there is no F_WRITE_SEAL and no further - * F_WRITE_SEAL can be taken until the vma is destroyed. - */ - ret = -EPERM; - if (unlikely(!(cur->vm_flags & VM_MAYWRITE))) - goto out_unlock; - - /* - * If this vma contains ending address, and huge pages - * check alignment. 
- */ - if (is_vm_hugetlb_page(cur) && end <= cur->vm_end && - end > cur->vm_start) { - unsigned long vma_hpagesize = vma_kernel_pagesize(cur); - - ret = -EINVAL; - - if (end & (vma_hpagesize - 1)) - goto out_unlock; - } - if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE)) - goto out_unlock; - - /* - * Check that this vma isn't already owned by a - * different userfaultfd. We can't allow more than one - * userfaultfd to own a single vma simultaneously or we - * wouldn't know which one to deliver the userfaults to. - */ - ret = -EBUSY; - if (cur->vm_userfaultfd_ctx.ctx && - cur->vm_userfaultfd_ctx.ctx != ctx) - goto out_unlock; - - /* - * Note vmas containing huge pages - */ - if (is_vm_hugetlb_page(cur)) - basic_ioctls = true; - - found = true; - } for_each_vma_range(vmi, cur, end); - VM_WARN_ON_ONCE(!found); - - ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end, - wp_async); - -out_unlock: - mmap_write_unlock(mm); - mmput(mm); - if (!ret) { - __u64 ioctls_out; - - ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC : - UFFD_API_RANGE_IOCTLS; - - /* - * Declare the WP ioctl only if the WP mode is - * specified and all checks passed with the range - */ - if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)) - ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT); - - /* CONTINUE ioctl is only supported for MINOR ranges. */ - if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR)) - ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE); - - /* - * Now that we scanned all vmas we can already tell - * userland which ioctls methods are guaranteed to - * succeed on this range. 
- */ - if (put_user(ioctls_out, &user_uffdio_register->ioctls)) - ret = -EFAULT; - } -out: - return ret; -} - -static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, - unsigned long arg) -{ - struct mm_struct *mm = ctx->mm; - struct vm_area_struct *vma, *prev, *cur; - int ret; - struct uffdio_range uffdio_unregister; - bool found; - unsigned long start, end, vma_end; - const void __user *buf = (void __user *)arg; - struct vma_iterator vmi; - bool wp_async = userfaultfd_wp_async_ctx(ctx); - - ret = -EFAULT; - if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) - goto out; - - ret = validate_range(mm, uffdio_unregister.start, - uffdio_unregister.len); - if (ret) - goto out; - - start = uffdio_unregister.start; - end = start + uffdio_unregister.len; - - ret = -ENOMEM; - if (!mmget_not_zero(mm)) - goto out; - - mmap_write_lock(mm); - ret = -EINVAL; - vma_iter_init(&vmi, mm, start); - vma = vma_find(&vmi, end); - if (!vma) - goto out_unlock; - - /* - * If the first vma contains huge pages, make sure start address - * is aligned to huge page size. - */ - if (is_vm_hugetlb_page(vma)) { - unsigned long vma_hpagesize = vma_kernel_pagesize(vma); - - if (start & (vma_hpagesize - 1)) - goto out_unlock; - } - - /* - * Search for not compatible vmas. - */ - found = false; - cur = vma; - do { - cond_resched(); - - VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^ - !!(cur->vm_flags & __VM_UFFD_FLAGS)); - - /* - * Prevent unregistering through a different userfaultfd than - * the one used for registration. - */ - if (cur->vm_userfaultfd_ctx.ctx && - cur->vm_userfaultfd_ctx.ctx != ctx) - goto out_unlock; - - /* - * Check not compatible vmas, not strictly required - * here as not compatible vmas cannot have an - * userfaultfd_ctx registered on them, but this - * provides for more strict behavior to notice - * unregistration errors. 
- */ - if (!vma_can_userfault(cur, cur->vm_flags, wp_async)) - goto out_unlock; - - found = true; - } for_each_vma_range(vmi, cur, end); - VM_WARN_ON_ONCE(!found); - - vma_iter_set(&vmi, start); - prev = vma_prev(&vmi); - if (vma->vm_start < start) - prev = vma; - - ret = 0; - for_each_vma_range(vmi, vma, end) { - cond_resched(); - - /* VMA not registered with userfaultfd. */ - if (!vma->vm_userfaultfd_ctx.ctx) - goto skip; - - VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx != ctx); - VM_WARN_ON_ONCE(!vma_can_userfault(vma, vma->vm_flags, wp_async)); - VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)); - - if (vma->vm_start > start) - start = vma->vm_start; - vma_end = min(end, vma->vm_end); - - if (userfaultfd_missing(vma)) { - /* - * Wake any concurrent pending userfault while - * we unregister, so they will not hang - * permanently and it avoids userland to call - * UFFDIO_WAKE explicitly. - */ - struct userfaultfd_wake_range range; - range.start = start; - range.len = vma_end - start; - wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range); - } - - vma = userfaultfd_clear_vma(&vmi, prev, vma, - start, vma_end); - if (IS_ERR(vma)) { - ret = PTR_ERR(vma); - break; - } - - skip: - prev = vma; - start = vma->vm_end; - } - -out_unlock: - mmap_write_unlock(mm); - mmput(mm); -out: - return ret; -} - -/* - * userfaultfd_wake may be used in combination with the - * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches. 
- */ -static int userfaultfd_wake(struct userfaultfd_ctx *ctx, - unsigned long arg) -{ - int ret; - struct uffdio_range uffdio_wake; - struct userfaultfd_wake_range range; - const void __user *buf = (void __user *)arg; - - ret = -EFAULT; - if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake))) - goto out; - - ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len); - if (ret) - goto out; - - range.start = uffdio_wake.start; - range.len = uffdio_wake.len; - - /* - * len == 0 means wake all and we don't want to wake all here, - * so check it again to be sure. - */ - VM_WARN_ON_ONCE(!range.len); - - wake_userfault(ctx, &range); - ret = 0; - -out: - return ret; -} - -static int userfaultfd_copy(struct userfaultfd_ctx *ctx, - unsigned long arg) -{ - __s64 ret; - struct uffdio_copy uffdio_copy; - struct uffdio_copy __user *user_uffdio_copy; - struct userfaultfd_wake_range range; - uffd_flags_t flags = 0; - - user_uffdio_copy = (struct uffdio_copy __user *) arg; - - ret = -EAGAIN; - if (unlikely(atomic_read(&ctx->mmap_changing))) { - if (unlikely(put_user(ret, &user_uffdio_copy->copy))) - return -EFAULT; - goto out; - } - - ret = -EFAULT; - if (copy_from_user(&uffdio_copy, user_uffdio_copy, - /* don't copy "copy" last field */ - sizeof(uffdio_copy)-sizeof(__s64))) - goto out; - - ret = validate_unaligned_range(ctx->mm, uffdio_copy.src, - uffdio_copy.len); - if (ret) - goto out; - ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len); - if (ret) - goto out; - - ret = -EINVAL; - if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) - goto out; - if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP) - flags |= MFILL_ATOMIC_WP; - if (mmget_not_zero(ctx->mm)) { - ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len, flags); - mmput(ctx->mm); - } else { - return -ESRCH; - } - if (unlikely(put_user(ret, &user_uffdio_copy->copy))) - return -EFAULT; - if (ret < 0) - goto out; - VM_WARN_ON_ONCE(!ret); - /* len == 0 would 
wake all */ - range.len = ret; - if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) { - range.start = uffdio_copy.dst; - wake_userfault(ctx, &range); - } - ret = range.len == uffdio_copy.len ? 0 : -EAGAIN; -out: - return ret; -} - -static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, - unsigned long arg) -{ - __s64 ret; - struct uffdio_zeropage uffdio_zeropage; - struct uffdio_zeropage __user *user_uffdio_zeropage; - struct userfaultfd_wake_range range; - - user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; - - ret = -EAGAIN; - if (unlikely(atomic_read(&ctx->mmap_changing))) { - if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) - return -EFAULT; - goto out; - } - - ret = -EFAULT; - if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage, - /* don't copy "zeropage" last field */ - sizeof(uffdio_zeropage)-sizeof(__s64))) - goto out; - - ret = validate_range(ctx->mm, uffdio_zeropage.range.start, - uffdio_zeropage.range.len); - if (ret) - goto out; - ret = -EINVAL; - if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE) - goto out; - - if (mmget_not_zero(ctx->mm)) { - ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start, - uffdio_zeropage.range.len); - mmput(ctx->mm); - } else { - return -ESRCH; - } - if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) - return -EFAULT; - if (ret < 0) - goto out; - /* len == 0 would wake all */ - VM_WARN_ON_ONCE(!ret); - range.len = ret; - if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) { - range.start = uffdio_zeropage.range.start; - wake_userfault(ctx, &range); - } - ret = range.len == uffdio_zeropage.range.len ? 
0 : -EAGAIN; -out: - return ret; -} - -static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, - unsigned long arg) -{ - int ret; - struct uffdio_writeprotect uffdio_wp; - struct uffdio_writeprotect __user *user_uffdio_wp; - struct userfaultfd_wake_range range; - bool mode_wp, mode_dontwake; - - if (atomic_read(&ctx->mmap_changing)) - return -EAGAIN; - - user_uffdio_wp = (struct uffdio_writeprotect __user *) arg; - - if (copy_from_user(&uffdio_wp, user_uffdio_wp, - sizeof(struct uffdio_writeprotect))) - return -EFAULT; - - ret = validate_range(ctx->mm, uffdio_wp.range.start, - uffdio_wp.range.len); - if (ret) - return ret; - - if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE | - UFFDIO_WRITEPROTECT_MODE_WP)) - return -EINVAL; - - mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP; - mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE; - - if (mode_wp && mode_dontwake) - return -EINVAL; - - if (mmget_not_zero(ctx->mm)) { - ret = mwriteprotect_range(ctx, uffdio_wp.range.start, - uffdio_wp.range.len, mode_wp); - mmput(ctx->mm); - } else { - return -ESRCH; - } - - if (ret) - return ret; - - if (!mode_wp && !mode_dontwake) { - range.start = uffdio_wp.range.start; - range.len = uffdio_wp.range.len; - wake_userfault(ctx, &range); - } - return ret; -} - -static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) -{ - __s64 ret; - struct uffdio_continue uffdio_continue; - struct uffdio_continue __user *user_uffdio_continue; - struct userfaultfd_wake_range range; - uffd_flags_t flags = 0; - - user_uffdio_continue = (struct uffdio_continue __user *)arg; - - ret = -EAGAIN; - if (unlikely(atomic_read(&ctx->mmap_changing))) { - if (unlikely(put_user(ret, &user_uffdio_continue->mapped))) - return -EFAULT; - goto out; - } - - ret = -EFAULT; - if (copy_from_user(&uffdio_continue, user_uffdio_continue, - /* don't copy the output fields */ - sizeof(uffdio_continue) - (sizeof(__s64)))) - goto out; - - ret = 
validate_range(ctx->mm, uffdio_continue.range.start, - uffdio_continue.range.len); - if (ret) - goto out; - - ret = -EINVAL; - if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE | - UFFDIO_CONTINUE_MODE_WP)) - goto out; - if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP) - flags |= MFILL_ATOMIC_WP; - - if (mmget_not_zero(ctx->mm)) { - ret = mfill_atomic_continue(ctx, uffdio_continue.range.start, - uffdio_continue.range.len, flags); - mmput(ctx->mm); - } else { - return -ESRCH; - } - - if (unlikely(put_user(ret, &user_uffdio_continue->mapped))) - return -EFAULT; - if (ret < 0) - goto out; - - /* len == 0 would wake all */ - VM_WARN_ON_ONCE(!ret); - range.len = ret; - if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) { - range.start = uffdio_continue.range.start; - wake_userfault(ctx, &range); - } - ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN; - -out: - return ret; -} - -static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg) -{ - __s64 ret; - struct uffdio_poison uffdio_poison; - struct uffdio_poison __user *user_uffdio_poison; - struct userfaultfd_wake_range range; - - user_uffdio_poison = (struct uffdio_poison __user *)arg; - - ret = -EAGAIN; - if (unlikely(atomic_read(&ctx->mmap_changing))) { - if (unlikely(put_user(ret, &user_uffdio_poison->updated))) - return -EFAULT; - goto out; - } - - ret = -EFAULT; - if (copy_from_user(&uffdio_poison, user_uffdio_poison, - /* don't copy the output fields */ - sizeof(uffdio_poison) - (sizeof(__s64)))) - goto out; - - ret = validate_range(ctx->mm, uffdio_poison.range.start, - uffdio_poison.range.len); - if (ret) - goto out; - - ret = -EINVAL; - if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE) - goto out; - - if (mmget_not_zero(ctx->mm)) { - ret = mfill_atomic_poison(ctx, uffdio_poison.range.start, - uffdio_poison.range.len, 0); - mmput(ctx->mm); - } else { - return -ESRCH; - } - - if (unlikely(put_user(ret, &user_uffdio_poison->updated))) - return -EFAULT; 
- if (ret < 0) - goto out; - - /* len == 0 would wake all */ - VM_WARN_ON_ONCE(!ret); - range.len = ret; - if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) { - range.start = uffdio_poison.range.start; - wake_userfault(ctx, &range); - } - ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN; - -out: - return ret; -} - -bool userfaultfd_wp_async(struct vm_area_struct *vma) -{ - return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx); -} - -static inline unsigned int uffd_ctx_features(__u64 user_features) -{ - /* - * For the current set of features the bits just coincide. Set - * UFFD_FEATURE_INITIALIZED to mark the features as enabled. - */ - return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED; -} - -static int userfaultfd_move(struct userfaultfd_ctx *ctx, - unsigned long arg) -{ - __s64 ret; - struct uffdio_move uffdio_move; - struct uffdio_move __user *user_uffdio_move; - struct userfaultfd_wake_range range; - struct mm_struct *mm = ctx->mm; - - user_uffdio_move = (struct uffdio_move __user *) arg; - - ret = -EAGAIN; - if (unlikely(atomic_read(&ctx->mmap_changing))) { - if (unlikely(put_user(ret, &user_uffdio_move->move))) - return -EFAULT; - goto out; - } - - if (copy_from_user(&uffdio_move, user_uffdio_move, - /* don't copy "move" last field */ - sizeof(uffdio_move)-sizeof(__s64))) - return -EFAULT; - - /* Do not allow cross-mm moves. 
*/ - if (mm != current->mm) - return -EINVAL; - - ret = validate_range(mm, uffdio_move.dst, uffdio_move.len); - if (ret) - return ret; - - ret = validate_range(mm, uffdio_move.src, uffdio_move.len); - if (ret) - return ret; - - if (uffdio_move.mode & ~(UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES| - UFFDIO_MOVE_MODE_DONTWAKE)) - return -EINVAL; - - if (mmget_not_zero(mm)) { - ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src, - uffdio_move.len, uffdio_move.mode); - mmput(mm); - } else { - return -ESRCH; - } - - if (unlikely(put_user(ret, &user_uffdio_move->move))) - return -EFAULT; - if (ret < 0) - goto out; - - /* len == 0 would wake all */ - VM_WARN_ON(!ret); - range.len = ret; - if (!(uffdio_move.mode & UFFDIO_MOVE_MODE_DONTWAKE)) { - range.start = uffdio_move.dst; - wake_userfault(ctx, &range); - } - ret = range.len == uffdio_move.len ? 0 : -EAGAIN; - -out: - return ret; -} - -/* - * userland asks for a certain API version and we return which bits - * and ioctl commands are implemented in this kernel for such API - * version or -EINVAL if unknown. 
- */ -static int userfaultfd_api(struct userfaultfd_ctx *ctx, - unsigned long arg) -{ - struct uffdio_api uffdio_api; - void __user *buf = (void __user *)arg; - unsigned int ctx_features; - int ret; - __u64 features; - - ret = -EFAULT; - if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) - goto out; - features = uffdio_api.features; - ret = -EINVAL; - if (uffdio_api.api != UFFD_API) - goto err_out; - ret = -EPERM; - if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE)) - goto err_out; - - /* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */ - if (features & UFFD_FEATURE_WP_ASYNC) - features |= UFFD_FEATURE_WP_UNPOPULATED; - - /* report all available features and ioctls to userland */ - uffdio_api.features = UFFD_API_FEATURES; -#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR - uffdio_api.features &= - ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM); -#endif - if (!pgtable_supports_uffd_wp()) - uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP; - - if (!uffd_supports_wp_marker()) { - uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; - uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; - uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC; - } - - ret = -EINVAL; - if (features & ~uffdio_api.features) - goto err_out; - - uffdio_api.ioctls = UFFD_API_IOCTLS; - ret = -EFAULT; - if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) - goto out; - - /* only enable the requested features for this uffd context */ - ctx_features = uffd_ctx_features(features); - ret = -EINVAL; - if (cmpxchg(&ctx->features, 0, ctx_features) != 0) - goto err_out; - - ret = 0; -out: - return ret; -err_out: - memset(&uffdio_api, 0, sizeof(uffdio_api)); - if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) - ret = -EFAULT; - goto out; -} - -static long userfaultfd_ioctl(struct file *file, unsigned cmd, - unsigned long arg) -{ - int ret = -EINVAL; - struct userfaultfd_ctx *ctx = file->private_data; - - if (cmd != UFFDIO_API && 
!userfaultfd_is_initialized(ctx)) - return -EINVAL; - - switch(cmd) { - case UFFDIO_API: - ret = userfaultfd_api(ctx, arg); - break; - case UFFDIO_REGISTER: - ret = userfaultfd_register(ctx, arg); - break; - case UFFDIO_UNREGISTER: - ret = userfaultfd_unregister(ctx, arg); - break; - case UFFDIO_WAKE: - ret = userfaultfd_wake(ctx, arg); - break; - case UFFDIO_COPY: - ret = userfaultfd_copy(ctx, arg); - break; - case UFFDIO_ZEROPAGE: - ret = userfaultfd_zeropage(ctx, arg); - break; - case UFFDIO_MOVE: - ret = userfaultfd_move(ctx, arg); - break; - case UFFDIO_WRITEPROTECT: - ret = userfaultfd_writeprotect(ctx, arg); - break; - case UFFDIO_CONTINUE: - ret = userfaultfd_continue(ctx, arg); - break; - case UFFDIO_POISON: - ret = userfaultfd_poison(ctx, arg); - break; - } - return ret; -} - -#ifdef CONFIG_PROC_FS -static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) -{ - struct userfaultfd_ctx *ctx = f->private_data; - wait_queue_entry_t *wq; - unsigned long pending = 0, total = 0; - - spin_lock_irq(&ctx->fault_pending_wqh.lock); - list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) { - pending++; - total++; - } - list_for_each_entry(wq, &ctx->fault_wqh.head, entry) { - total++; - } - spin_unlock_irq(&ctx->fault_pending_wqh.lock); - - /* - * If more protocols will be added, there will be all shown - * separated by a space. Like this: - * protocols: aa:... bb:... 
- */ - seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n", - pending, total, UFFD_API, ctx->features, - UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS); -} -#endif - -static const struct file_operations userfaultfd_fops = { -#ifdef CONFIG_PROC_FS - .show_fdinfo = userfaultfd_show_fdinfo, -#endif - .release = userfaultfd_release, - .poll = userfaultfd_poll, - .read_iter = userfaultfd_read_iter, - .unlocked_ioctl = userfaultfd_ioctl, - .compat_ioctl = compat_ptr_ioctl, - .llseek = noop_llseek, -}; - -static void init_once_userfaultfd_ctx(void *mem) -{ - struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem; - - init_waitqueue_head(&ctx->fault_pending_wqh); - init_waitqueue_head(&ctx->fault_wqh); - init_waitqueue_head(&ctx->event_wqh); - init_waitqueue_head(&ctx->fd_wqh); - seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock); -} - -static int new_userfaultfd(int flags) -{ - struct userfaultfd_ctx *ctx __free(kfree) = NULL; - - VM_WARN_ON_ONCE(!current->mm); - - /* Check the UFFD_* constants for consistency. 
*/ - BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS); - - if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY)) - return -EINVAL; - - ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); - if (!ctx) - return -ENOMEM; - - refcount_set(&ctx->refcount, 1); - ctx->flags = flags; - ctx->features = 0; - ctx->released = false; - init_rwsem(&ctx->map_changing_lock); - atomic_set(&ctx->mmap_changing, 0); - ctx->mm = current->mm; - - FD_PREPARE(fdf, flags & UFFD_SHARED_FCNTL_FLAGS, - anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx, - O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), - NULL)); - if (fdf.err) - return fdf.err; - - /* prevent the mm struct to be freed */ - mmgrab(ctx->mm); - fd_prepare_file(fdf)->f_mode |= FMODE_NOWAIT; - retain_and_null_ptr(ctx); - return fd_publish(fdf); -} - -static inline bool userfaultfd_syscall_allowed(int flags) -{ - /* Userspace-only page faults are always allowed */ - if (flags & UFFD_USER_MODE_ONLY) - return true; - - /* - * The user is requesting a userfaultfd which can handle kernel faults. - * Privileged users are always allowed to do this. - */ - if (capable(CAP_SYS_PTRACE)) - return true; - - /* Otherwise, access to kernel fault handling is sysctl controlled. 
*/ - return sysctl_unprivileged_userfaultfd; -} - -SYSCALL_DEFINE1(userfaultfd, int, flags) -{ - if (!userfaultfd_syscall_allowed(flags)) - return -EPERM; - - return new_userfaultfd(flags); -} - -static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags) -{ - if (cmd != USERFAULTFD_IOC_NEW) - return -EINVAL; - - return new_userfaultfd(flags); -} - -static const struct file_operations userfaultfd_dev_fops = { - .unlocked_ioctl = userfaultfd_dev_ioctl, - .compat_ioctl = userfaultfd_dev_ioctl, - .owner = THIS_MODULE, - .llseek = noop_llseek, -}; - -static struct miscdevice userfaultfd_misc = { - .minor = MISC_DYNAMIC_MINOR, - .name = "userfaultfd", - .fops = &userfaultfd_dev_fops -}; - -static int __init userfaultfd_init(void) -{ - int ret; - - ret = misc_register(&userfaultfd_misc); - if (ret) - return ret; - - userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache", - sizeof(struct userfaultfd_ctx), - 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, - init_once_userfaultfd_ctx); -#ifdef CONFIG_SYSCTL - register_sysctl_init("vm", vm_userfaultfd_table); -#endif - return 0; -} -__initcall(userfaultfd_init); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 80cc8be5725f..74af5682f3fb 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -2,7 +2,12 @@ /* * mm/userfaultfd.c * + * Copyright (C) 2007 Davide Libenzi + * Copyright (C) 2008-2009 Red Hat, Inc. * Copyright (C) 2015 Red Hat, Inc. + * + * Some part derived from fs/eventfd.c (anon inode setup) and + * mm/ksm.c (mm hashing). 
*/ #include @@ -14,6 +19,17 @@ #include #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include #include @@ -2305,3 +2321,2202 @@ void userfaultfd_release_all(struct mm_struct *mm, mmap_write_unlock(mm); mmput(mm); } + +static int sysctl_unprivileged_userfaultfd __read_mostly; + +#ifdef CONFIG_SYSCTL +static const struct ctl_table vm_userfaultfd_table[] = { + { + .procname = "unprivileged_userfaultfd", + .data = &sysctl_unprivileged_userfaultfd, + .maxlen = sizeof(sysctl_unprivileged_userfaultfd), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +}; +#endif + +static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init; + +struct userfaultfd_fork_ctx { + struct userfaultfd_ctx *orig; + struct userfaultfd_ctx *new; + struct list_head list; +}; + +struct userfaultfd_unmap_ctx { + struct userfaultfd_ctx *ctx; + unsigned long start; + unsigned long end; + struct list_head list; +}; + +struct userfaultfd_wait_queue { + struct uffd_msg msg; + wait_queue_entry_t wq; + struct userfaultfd_ctx *ctx; + bool waken; +}; + +struct userfaultfd_wake_range { + unsigned long start; + unsigned long len; +}; + +/* internal indication that UFFD_API ioctl was successfully executed */ +#define UFFD_FEATURE_INITIALIZED (1u << 31) + +static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) +{ + return ctx->features & UFFD_FEATURE_INITIALIZED; +} + +static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx) +{ + return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC); +} + +/* + * Whether WP_UNPOPULATED is enabled on the uffd context. It is only + * meaningful when userfaultfd_wp()==true on the vma and when it's + * anonymous. 
+ */ +bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) +{ + struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; + + if (!ctx) + return false; + + return ctx->features & UFFD_FEATURE_WP_UNPOPULATED; +} + +static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, + int wake_flags, void *key) +{ + struct userfaultfd_wake_range *range = key; + int ret; + struct userfaultfd_wait_queue *uwq; + unsigned long start, len; + + uwq = container_of(wq, struct userfaultfd_wait_queue, wq); + ret = 0; + /* len == 0 means wake all */ + start = range->start; + len = range->len; + if (len && (start > uwq->msg.arg.pagefault.address || + start + len <= uwq->msg.arg.pagefault.address)) + goto out; + WRITE_ONCE(uwq->waken, true); + /* + * The Program-Order guarantees provided by the scheduler + * ensure uwq->waken is visible before the task is woken. + */ + ret = wake_up_state(wq->private, mode); + if (ret) { + /* + * Wake only once, autoremove behavior. + * + * After the effect of list_del_init is visible to the other + * CPUs, the waitqueue may disappear from under us, see the + * !list_empty_careful() in handle_userfault(). + * + * try_to_wake_up() has an implicit smp_mb(), and the + * wq->private is read before calling the extern function + * "wake_up_state" (which in turns calls try_to_wake_up). + */ + list_del_init(&wq->entry); + } +out: + return ret; +} + +/** + * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd + * context. + * @ctx: [in] Pointer to the userfaultfd context. + */ +static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx) +{ + refcount_inc(&ctx->refcount); +} + +/** + * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd + * context. + * @ctx: [in] Pointer to userfaultfd context. + * + * The userfaultfd context reference must have been previously acquired either + * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget(). 
+ */ +static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) +{ + if (refcount_dec_and_test(&ctx->refcount)) { + VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_pending_wqh.lock)); + VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_pending_wqh)); + VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_wqh.lock)); + VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_wqh)); + VM_WARN_ON_ONCE(spin_is_locked(&ctx->event_wqh.lock)); + VM_WARN_ON_ONCE(waitqueue_active(&ctx->event_wqh)); + VM_WARN_ON_ONCE(spin_is_locked(&ctx->fd_wqh.lock)); + VM_WARN_ON_ONCE(waitqueue_active(&ctx->fd_wqh)); + mmdrop(ctx->mm); + kmem_cache_free(userfaultfd_ctx_cachep, ctx); + } +} + +static inline void msg_init(struct uffd_msg *msg) +{ + BUILD_BUG_ON(sizeof(struct uffd_msg) != 32); + /* + * Must use memset to zero out the paddings or kernel data is + * leaked to userland. + */ + memset(msg, 0, sizeof(struct uffd_msg)); +} + +static inline struct uffd_msg userfault_msg(unsigned long address, + unsigned long real_address, + unsigned int flags, + unsigned long reason, + unsigned int features) +{ + struct uffd_msg msg; + + msg_init(&msg); + msg.event = UFFD_EVENT_PAGEFAULT; + + msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ? + real_address : address; + + /* + * These flags indicate why the userfault occurred: + * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault. + * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault. + * - Neither of these flags being set indicates a MISSING fault. + * + * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write + * fault. Otherwise, it was a read fault. 
+ */ + if (flags & FAULT_FLAG_WRITE) + msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE; + if (reason & VM_UFFD_WP) + msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; + if (reason & VM_UFFD_MINOR) + msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR; + if (features & UFFD_FEATURE_THREAD_ID) + msg.arg.pagefault.feat.ptid = task_pid_vnr(current); + return msg; +} + +#ifdef CONFIG_HUGETLB_PAGE +/* + * Same functionality as userfaultfd_must_wait below with modifications for + * hugepmd ranges. + */ +static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, + struct vm_fault *vmf, + unsigned long reason) +{ + struct vm_area_struct *vma = vmf->vma; + pte_t *ptep, pte; + + assert_fault_locked(vmf); + + ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma)); + if (!ptep) + return true; + + pte = huge_ptep_get(vma->vm_mm, vmf->address, ptep); + + /* + * Lockless access: we're in a wait_event so it's ok if it + * changes under us. + */ + + /* Entry is still missing, wait for userspace to resolve the fault. */ + if (huge_pte_none(pte)) + return true; + /* UFFD PTE markers require userspace to resolve the fault. */ + if (pte_is_uffd_marker(pte)) + return true; + /* + * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to + * resolve the fault. + */ + if (!huge_pte_write(pte) && (reason & VM_UFFD_WP)) + return true; + + return false; +} +#else +static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, + struct vm_fault *vmf, + unsigned long reason) +{ + /* Should never get here. */ + VM_WARN_ON_ONCE(1); + return false; +} +#endif /* CONFIG_HUGETLB_PAGE */ + +/* + * Verify the pagetables are still not ok after having registered into + * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any + * userfault that has already been resolved, if userfaultfd_read_iter and + * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different + * threads. 
+ */ +static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, + struct vm_fault *vmf, + unsigned long reason) +{ + struct mm_struct *mm = ctx->mm; + unsigned long address = vmf->address; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd, _pmd; + pte_t *pte; + pte_t ptent; + bool ret; + + assert_fault_locked(vmf); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return true; + p4d = p4d_offset(pgd, address); + if (!p4d_present(*p4d)) + return true; + pud = pud_offset(p4d, address); + if (!pud_present(*pud)) + return true; + pmd = pmd_offset(pud, address); +again: + _pmd = pmdp_get_lockless(pmd); + if (pmd_none(_pmd)) + return true; + + /* + * A race could arise which would result in a softleaf entry such as + * migration entry unexpectedly being present in the PMD, so explicitly + * check for this and bail out if so. + */ + if (!pmd_present(_pmd)) + return false; + + if (pmd_trans_huge(_pmd)) + return !pmd_write(_pmd) && (reason & VM_UFFD_WP); + + pte = pte_offset_map(pmd, address); + if (!pte) + goto again; + + /* + * Lockless access: we're in a wait_event so it's ok if it + * changes under us. + */ + ptent = ptep_get(pte); + + ret = true; + /* Entry is still missing, wait for userspace to resolve the fault. */ + if (pte_none(ptent)) + goto out; + /* UFFD PTE markers require userspace to resolve the fault. */ + if (pte_is_uffd_marker(ptent)) + goto out; + /* + * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to + * resolve the fault. 
+ */ + if (!pte_write(ptent) && (reason & VM_UFFD_WP)) + goto out; + + ret = false; +out: + pte_unmap(pte); + return ret; +} + +static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags) +{ + if (flags & FAULT_FLAG_INTERRUPTIBLE) + return TASK_INTERRUPTIBLE; + + if (flags & FAULT_FLAG_KILLABLE) + return TASK_KILLABLE; + + return TASK_UNINTERRUPTIBLE; +} + +/* + * The locking rules involved in returning VM_FAULT_RETRY depending on + * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and + * FAULT_FLAG_KILLABLE are not straightforward. The "Caution" + * recommendation in __lock_page_or_retry is not an understatement. + * + * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released + * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is + * not set. + * + * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not + * set, VM_FAULT_RETRY can still be returned if and only if there are + * fatal_signal_pending()s, and the mmap_lock must be released before + * returning it. + */ +vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) +{ + struct vm_area_struct *vma = vmf->vma; + struct mm_struct *mm = vma->vm_mm; + struct userfaultfd_ctx *ctx; + struct userfaultfd_wait_queue uwq; + vm_fault_t ret = VM_FAULT_SIGBUS; + bool must_wait; + unsigned int blocking_state; + + /* + * We don't do userfault handling for the final child pid update + * and when coredumping (faults triggered by get_dump_page()). + */ + if (current->flags & (PF_EXITING|PF_DUMPCORE)) + goto out; + + assert_fault_locked(vmf); + + ctx = vma->vm_userfaultfd_ctx.ctx; + if (!ctx) + goto out; + + VM_WARN_ON_ONCE(ctx->mm != mm); + + /* Any unrecognized flag is a bug. */ + VM_WARN_ON_ONCE(reason & ~__VM_UFFD_FLAGS); + /* 0 or > 1 flags set is a bug; we expect exactly 1. 
*/ + VM_WARN_ON_ONCE(!reason || (reason & (reason - 1))); + + if (ctx->features & UFFD_FEATURE_SIGBUS) + goto out; + if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY)) + goto out; + + /* + * Check that we can return VM_FAULT_RETRY. + * + * NOTE: it should become possible to return VM_FAULT_RETRY + * even if FAULT_FLAG_TRIED is set without leading to gup() + * -EBUSY failures, if the userfaultfd is to be extended for + * VM_UFFD_WP tracking and we intend to arm the userfault + * without first stopping userland access to the memory. For + * VM_UFFD_MISSING userfaults this is enough for now. + */ + if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) { + /* + * Validate the invariant that nowait must allow retry + * to be sure not to return SIGBUS erroneously on + * nowait invocations. + */ + VM_WARN_ON_ONCE(vmf->flags & FAULT_FLAG_RETRY_NOWAIT); +#ifdef CONFIG_DEBUG_VM + if (printk_ratelimit()) { + pr_warn("FAULT_FLAG_ALLOW_RETRY missing %x\n", + vmf->flags); + dump_stack(); + } +#endif + goto out; + } + + /* + * Handle nowait, not much to do other than tell it to retry + * and wait. + */ + ret = VM_FAULT_RETRY; + if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) + goto out; + + if (unlikely(READ_ONCE(ctx->released))) { + /* + * If a concurrent release is detected, do not return + * VM_FAULT_SIGBUS or VM_FAULT_NOPAGE, but instead always + * return VM_FAULT_RETRY with lock released proactively. + * + * If we were to return VM_FAULT_SIGBUS here, the non + * cooperative manager would be instead forced to + * always call UFFDIO_UNREGISTER before it can safely + * close the uffd, to avoid involuntary SIGBUS triggered. + * + * If we were to return VM_FAULT_NOPAGE, it would work for + * the fault path, in which the lock will be released + * later. However for GUP, faultin_page() does nothing + * special on NOPAGE, so GUP would spin retrying without + * releasing the mmap read lock, causing possible livelock. 
+ * + * Here only VM_FAULT_RETRY would make sure the mmap lock + * be released immediately, so that the thread concurrently + * releasing the userfault would always make progress. + */ + release_fault_lock(vmf); + goto out; + } + + /* take the reference before dropping the mmap_lock */ + userfaultfd_ctx_get(ctx); + + init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); + uwq.wq.private = current; + uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags, + reason, ctx->features); + uwq.ctx = ctx; + uwq.waken = false; + + blocking_state = userfaultfd_get_blocking_state(vmf->flags); + + /* + * Take the vma lock now, in order to safely call + * userfaultfd_huge_must_wait() later. Since acquiring the + * (sleepable) vma lock can modify the current task state, that + * must be before explicitly calling set_current_state(). + */ + if (is_vm_hugetlb_page(vma)) + hugetlb_vma_lock_read(vma); + + spin_lock_irq(&ctx->fault_pending_wqh.lock); + /* + * After the __add_wait_queue the uwq is visible to userland + * through poll/read(). + */ + __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq); + /* + * The smp_mb() after __set_current_state prevents the reads + * following the spin_unlock to happen before the list_add in + * __add_wait_queue. + */ + set_current_state(blocking_state); + spin_unlock_irq(&ctx->fault_pending_wqh.lock); + + if (is_vm_hugetlb_page(vma)) { + must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason); + hugetlb_vma_unlock_read(vma); + } else { + must_wait = userfaultfd_must_wait(ctx, vmf, reason); + } + + release_fault_lock(vmf); + + if (likely(must_wait && !READ_ONCE(ctx->released))) { + wake_up_poll(&ctx->fd_wqh, EPOLLIN); + schedule(); + } + + __set_current_state(TASK_RUNNING); + + /* + * Here we race with the list_del; list_add in + * userfaultfd_ctx_read(), however because we don't ever run + * list_del_init() to refile across the two lists, the prev + * and next pointers will never point to self. 
list_add also + * would never let any of the two pointers to point to + * self. So list_empty_careful won't risk to see both pointers + * pointing to self at any time during the list refile. The + * only case where list_del_init() is called is the full + * removal in the wake function and there we don't re-list_add + * and it's fine not to block on the spinlock. The uwq on this + * kernel stack can be released after the list_del_init. + */ + if (!list_empty_careful(&uwq.wq.entry)) { + spin_lock_irq(&ctx->fault_pending_wqh.lock); + /* + * No need of list_del_init(), the uwq on the stack + * will be freed shortly anyway. + */ + list_del(&uwq.wq.entry); + spin_unlock_irq(&ctx->fault_pending_wqh.lock); + } + + /* + * ctx may go away after this if the userfault pseudo fd is + * already released. + */ + userfaultfd_ctx_put(ctx); + +out: + return ret; +} + +static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, + struct userfaultfd_wait_queue *ewq) +{ + struct userfaultfd_ctx *release_new_ctx; + + if (WARN_ON_ONCE(current->flags & PF_EXITING)) + goto out; + + ewq->ctx = ctx; + init_waitqueue_entry(&ewq->wq, current); + release_new_ctx = NULL; + + spin_lock_irq(&ctx->event_wqh.lock); + /* + * After the __add_wait_queue the uwq is visible to userland + * through poll/read(). + */ + __add_wait_queue(&ctx->event_wqh, &ewq->wq); + for (;;) { + set_current_state(TASK_KILLABLE); + if (ewq->msg.event == 0) + break; + if (READ_ONCE(ctx->released) || + fatal_signal_pending(current)) { + /* + * &ewq->wq may be queued in fork_event, but + * __remove_wait_queue ignores the head + * parameter. It would be a problem if it + * didn't. 
+ */ + __remove_wait_queue(&ctx->event_wqh, &ewq->wq); + if (ewq->msg.event == UFFD_EVENT_FORK) { + struct userfaultfd_ctx *new; + + new = (struct userfaultfd_ctx *) + (unsigned long) + ewq->msg.arg.reserved.reserved1; + release_new_ctx = new; + } + break; + } + + spin_unlock_irq(&ctx->event_wqh.lock); + + wake_up_poll(&ctx->fd_wqh, EPOLLIN); + schedule(); + + spin_lock_irq(&ctx->event_wqh.lock); + } + __set_current_state(TASK_RUNNING); + spin_unlock_irq(&ctx->event_wqh.lock); + + if (release_new_ctx) { + userfaultfd_release_new(release_new_ctx); + userfaultfd_ctx_put(release_new_ctx); + } + + /* + * ctx may go away after this if the userfault pseudo fd is + * already released. + */ +out: + atomic_dec(&ctx->mmap_changing); + VM_WARN_ON_ONCE(atomic_read(&ctx->mmap_changing) < 0); + userfaultfd_ctx_put(ctx); +} + +static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx, + struct userfaultfd_wait_queue *ewq) +{ + ewq->msg.event = 0; + wake_up_locked(&ctx->event_wqh); + __remove_wait_queue(&ctx->event_wqh, &ewq->wq); +} + +int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) +{ + struct userfaultfd_ctx *ctx = NULL, *octx; + struct userfaultfd_fork_ctx *fctx; + + octx = vma->vm_userfaultfd_ctx.ctx; + if (!octx) + return 0; + + if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) { + userfaultfd_reset_ctx(vma); + return 0; + } + + list_for_each_entry(fctx, fcs, list) + if (fctx->orig == octx) { + ctx = fctx->new; + break; + } + + if (!ctx) { + fctx = kmalloc_obj(*fctx); + if (!fctx) + return -ENOMEM; + + ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); + if (!ctx) { + kfree(fctx); + return -ENOMEM; + } + + refcount_set(&ctx->refcount, 1); + ctx->flags = octx->flags; + ctx->features = octx->features; + ctx->released = false; + init_rwsem(&ctx->map_changing_lock); + atomic_set(&ctx->mmap_changing, 0); + ctx->mm = vma->vm_mm; + mmgrab(ctx->mm); + + userfaultfd_ctx_get(octx); + down_write(&octx->map_changing_lock); + 
atomic_inc(&octx->mmap_changing); + up_write(&octx->map_changing_lock); + fctx->orig = octx; + fctx->new = ctx; + list_add_tail(&fctx->list, fcs); + } + + vma->vm_userfaultfd_ctx.ctx = ctx; + return 0; +} + +static void dup_fctx(struct userfaultfd_fork_ctx *fctx) +{ + struct userfaultfd_ctx *ctx = fctx->orig; + struct userfaultfd_wait_queue ewq; + + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_FORK; + ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new; + + userfaultfd_event_wait_completion(ctx, &ewq); +} + +void dup_userfaultfd_complete(struct list_head *fcs) +{ + struct userfaultfd_fork_ctx *fctx, *n; + + list_for_each_entry_safe(fctx, n, fcs, list) { + dup_fctx(fctx); + list_del(&fctx->list); + kfree(fctx); + } +} + +void dup_userfaultfd_fail(struct list_head *fcs) +{ + struct userfaultfd_fork_ctx *fctx, *n; + + /* + * An error has occurred on fork, we will tear memory down, but have + * allocated memory for fctx's and raised reference counts for both the + * original and child contexts (and on the mm for each as a result). + * + * These would ordinarily be taken care of by a user handling the event, + * but we are no longer doing so, so manually clean up here. + * + * mm tear down will take care of cleaning up VMA contexts. 
+ */ + list_for_each_entry_safe(fctx, n, fcs, list) { + struct userfaultfd_ctx *octx = fctx->orig; + struct userfaultfd_ctx *ctx = fctx->new; + + atomic_dec(&octx->mmap_changing); + VM_WARN_ON_ONCE(atomic_read(&octx->mmap_changing) < 0); + userfaultfd_ctx_put(octx); + userfaultfd_ctx_put(ctx); + + list_del(&fctx->list); + kfree(fctx); + } +} + +void mremap_userfaultfd_prep(struct vm_area_struct *vma, + struct vm_userfaultfd_ctx *vm_ctx) +{ + struct userfaultfd_ctx *ctx; + + ctx = vma->vm_userfaultfd_ctx.ctx; + + if (!ctx) + return; + + if (ctx->features & UFFD_FEATURE_EVENT_REMAP) { + vm_ctx->ctx = ctx; + userfaultfd_ctx_get(ctx); + down_write(&ctx->map_changing_lock); + atomic_inc(&ctx->mmap_changing); + up_write(&ctx->map_changing_lock); + } else { + /* Drop uffd context if remap feature not enabled */ + userfaultfd_reset_ctx(vma); + } +} + +void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, + unsigned long from, unsigned long to, + unsigned long len) +{ + struct userfaultfd_ctx *ctx = vm_ctx->ctx; + struct userfaultfd_wait_queue ewq; + + if (!ctx) + return; + + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_REMAP; + ewq.msg.arg.remap.from = from; + ewq.msg.arg.remap.to = to; + ewq.msg.arg.remap.len = len; + + userfaultfd_event_wait_completion(ctx, &ewq); +} + +void mremap_userfaultfd_fail(struct vm_userfaultfd_ctx *vm_ctx) +{ + struct userfaultfd_ctx *ctx = vm_ctx->ctx; + + if (!ctx) + return; + + atomic_dec(&ctx->mmap_changing); + VM_WARN_ON_ONCE(atomic_read(&ctx->mmap_changing) < 0); + userfaultfd_ctx_put(ctx); +} + +bool userfaultfd_remove(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + struct userfaultfd_ctx *ctx; + struct userfaultfd_wait_queue ewq; + + ctx = vma->vm_userfaultfd_ctx.ctx; + if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE)) + return true; + + userfaultfd_ctx_get(ctx); + down_write(&ctx->map_changing_lock); + atomic_inc(&ctx->mmap_changing); + 
up_write(&ctx->map_changing_lock); + mmap_read_unlock(mm); + + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_REMOVE; + ewq.msg.arg.remove.start = start; + ewq.msg.arg.remove.end = end; + + userfaultfd_event_wait_completion(ctx, &ewq); + + return false; +} + +static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps, + unsigned long start, unsigned long end) +{ + struct userfaultfd_unmap_ctx *unmap_ctx; + + list_for_each_entry(unmap_ctx, unmaps, list) + if (unmap_ctx->ctx == ctx && unmap_ctx->start == start && + unmap_ctx->end == end) + return true; + + return false; +} + +int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, + unsigned long end, struct list_head *unmaps) +{ + struct userfaultfd_unmap_ctx *unmap_ctx; + struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; + + if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) || + has_unmap_ctx(ctx, unmaps, start, end)) + return 0; + + unmap_ctx = kzalloc_obj(*unmap_ctx); + if (!unmap_ctx) + return -ENOMEM; + + userfaultfd_ctx_get(ctx); + down_write(&ctx->map_changing_lock); + atomic_inc(&ctx->mmap_changing); + up_write(&ctx->map_changing_lock); + unmap_ctx->ctx = ctx; + unmap_ctx->start = start; + unmap_ctx->end = end; + list_add_tail(&unmap_ctx->list, unmaps); + + return 0; +} + +void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf) +{ + struct userfaultfd_unmap_ctx *ctx, *n; + struct userfaultfd_wait_queue ewq; + + list_for_each_entry_safe(ctx, n, uf, list) { + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_UNMAP; + ewq.msg.arg.remove.start = ctx->start; + ewq.msg.arg.remove.end = ctx->end; + + userfaultfd_event_wait_completion(ctx->ctx, &ewq); + + list_del(&ctx->list); + kfree(ctx); + } +} + +static int userfaultfd_release(struct inode *inode, struct file *file) +{ + struct userfaultfd_ctx *ctx = file->private_data; + struct mm_struct *mm = ctx->mm; + /* len == 0 means wake all */ + struct userfaultfd_wake_range range = { .len = 
0, }; + + WRITE_ONCE(ctx->released, true); + + userfaultfd_release_all(mm, ctx); + + /* + * After no new page faults can wait on this fault_*wqh, flush + * the last page faults that may have been already waiting on + * the fault_*wqh. + */ + spin_lock_irq(&ctx->fault_pending_wqh.lock); + __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range); + __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range); + spin_unlock_irq(&ctx->fault_pending_wqh.lock); + + /* Flush pending events that may still wait on event_wqh */ + wake_up_all(&ctx->event_wqh); + + wake_up_poll(&ctx->fd_wqh, EPOLLHUP); + userfaultfd_ctx_put(ctx); + return 0; +} + +/* fault_pending_wqh.lock must be hold by the caller */ +static inline struct userfaultfd_wait_queue *find_userfault_in( + wait_queue_head_t *wqh) +{ + wait_queue_entry_t *wq; + struct userfaultfd_wait_queue *uwq; + + lockdep_assert_held(&wqh->lock); + + uwq = NULL; + if (!waitqueue_active(wqh)) + goto out; + /* walk in reverse to provide FIFO behavior to read userfaults */ + wq = list_last_entry(&wqh->head, typeof(*wq), entry); + uwq = container_of(wq, struct userfaultfd_wait_queue, wq); +out: + return uwq; +} + +static inline struct userfaultfd_wait_queue *find_userfault( + struct userfaultfd_ctx *ctx) +{ + return find_userfault_in(&ctx->fault_pending_wqh); +} + +static inline struct userfaultfd_wait_queue *find_userfault_evt( + struct userfaultfd_ctx *ctx) +{ + return find_userfault_in(&ctx->event_wqh); +} + +static __poll_t userfaultfd_poll(struct file *file, poll_table *wait) +{ + struct userfaultfd_ctx *ctx = file->private_data; + __poll_t ret; + + poll_wait(file, &ctx->fd_wqh, wait); + + if (!userfaultfd_is_initialized(ctx)) + return EPOLLERR; + + /* + * poll() never guarantees that read won't block. + * userfaults can be waken before they're read(). 
+ */ + if (unlikely(!(file->f_flags & O_NONBLOCK))) + return EPOLLERR; + /* + * lockless access to see if there are pending faults + * __pollwait last action is the add_wait_queue but + * the spin_unlock would allow the waitqueue_active to + * pass above the actual list_add inside + * add_wait_queue critical section. So use a full + * memory barrier to serialize the list_add write of + * add_wait_queue() with the waitqueue_active read + * below. + */ + ret = 0; + smp_mb(); + if (waitqueue_active(&ctx->fault_pending_wqh)) + ret = EPOLLIN; + else if (waitqueue_active(&ctx->event_wqh)) + ret = EPOLLIN; + + return ret; +} + +static const struct file_operations userfaultfd_fops; + +static int resolve_userfault_fork(struct userfaultfd_ctx *new, + struct inode *inode, + struct uffd_msg *msg) +{ + int fd; + + fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, new, + O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode); + if (fd < 0) + return fd; + + msg->arg.reserved.reserved1 = 0; + msg->arg.fork.ufd = fd; + return 0; +} + +static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, + struct uffd_msg *msg, struct inode *inode) +{ + ssize_t ret; + DECLARE_WAITQUEUE(wait, current); + struct userfaultfd_wait_queue *uwq; + /* + * Handling fork event requires sleeping operations, so + * we drop the event_wqh lock, then do these ops, then + * lock it back and wake up the waiter. While the lock is + * dropped the ewq may go away so we keep track of it + * carefully. 
+ */ + LIST_HEAD(fork_event); + struct userfaultfd_ctx *fork_nctx = NULL; + + /* always take the fd_wqh lock before the fault_pending_wqh lock */ + spin_lock_irq(&ctx->fd_wqh.lock); + __add_wait_queue(&ctx->fd_wqh, &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + spin_lock(&ctx->fault_pending_wqh.lock); + uwq = find_userfault(ctx); + if (uwq) { + /* + * Use a seqcount to repeat the lockless check + * in wake_userfault() to avoid missing + * wakeups because during the refile both + * waitqueue could become empty if this is the + * only userfault. + */ + write_seqcount_begin(&ctx->refile_seq); + + /* + * The fault_pending_wqh.lock prevents the uwq + * to disappear from under us. + * + * Refile this userfault from + * fault_pending_wqh to fault_wqh, it's not + * pending anymore after we read it. + * + * Use list_del() by hand (as + * userfaultfd_wake_function also uses + * list_del_init() by hand) to be sure nobody + * changes __remove_wait_queue() to use + * list_del_init() in turn breaking the + * !list_empty_careful() check in + * handle_userfault(). The uwq->wq.head list + * must never be empty at any time during the + * refile, or the waitqueue could disappear + * from under us. The "wait_queue_head_t" + * parameter of __remove_wait_queue() is unused + * anyway. + */ + list_del(&uwq->wq.entry); + add_wait_queue(&ctx->fault_wqh, &uwq->wq); + + write_seqcount_end(&ctx->refile_seq); + + /* careful to always initialize msg if ret == 0 */ + *msg = uwq->msg; + spin_unlock(&ctx->fault_pending_wqh.lock); + ret = 0; + break; + } + spin_unlock(&ctx->fault_pending_wqh.lock); + + spin_lock(&ctx->event_wqh.lock); + uwq = find_userfault_evt(ctx); + if (uwq) { + *msg = uwq->msg; + + if (uwq->msg.event == UFFD_EVENT_FORK) { + fork_nctx = (struct userfaultfd_ctx *) + (unsigned long) + uwq->msg.arg.reserved.reserved1; + list_move(&uwq->wq.entry, &fork_event); + /* + * fork_nctx can be freed as soon as + * we drop the lock, unless we take a + * reference on it. 
+ */ + userfaultfd_ctx_get(fork_nctx); + spin_unlock(&ctx->event_wqh.lock); + ret = 0; + break; + } + + userfaultfd_event_complete(ctx, uwq); + spin_unlock(&ctx->event_wqh.lock); + ret = 0; + break; + } + spin_unlock(&ctx->event_wqh.lock); + + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + if (no_wait) { + ret = -EAGAIN; + break; + } + spin_unlock_irq(&ctx->fd_wqh.lock); + schedule(); + spin_lock_irq(&ctx->fd_wqh.lock); + } + __remove_wait_queue(&ctx->fd_wqh, &wait); + __set_current_state(TASK_RUNNING); + spin_unlock_irq(&ctx->fd_wqh.lock); + + if (!ret && msg->event == UFFD_EVENT_FORK) { + ret = resolve_userfault_fork(fork_nctx, inode, msg); + spin_lock_irq(&ctx->event_wqh.lock); + if (!list_empty(&fork_event)) { + /* + * The fork thread didn't abort, so we can + * drop the temporary refcount. + */ + userfaultfd_ctx_put(fork_nctx); + + uwq = list_first_entry(&fork_event, + typeof(*uwq), + wq.entry); + /* + * If fork_event list wasn't empty and in turn + * the event wasn't already released by fork + * (the event is allocated on fork kernel + * stack), put the event back to its place in + * the event_wq. fork_event head will be freed + * as soon as we return so the event cannot + * stay queued there no matter the current + * "ret" value. + */ + list_del(&uwq->wq.entry); + __add_wait_queue(&ctx->event_wqh, &uwq->wq); + + /* + * Leave the event in the waitqueue and report + * error to userland if we failed to resolve + * the userfault fork. + */ + if (likely(!ret)) + userfaultfd_event_complete(ctx, uwq); + } else { + /* + * Here the fork thread aborted and the + * refcount from the fork thread on fork_nctx + * has already been released. We still hold + * the reference we took before releasing the + * lock above. If resolve_userfault_fork + * failed we've to drop it because the + * fork_nctx has to be freed in such case. If + * it succeeded we'll hold it because the new + * uffd references it. 
+ */ + if (ret) + userfaultfd_ctx_put(fork_nctx); + } + spin_unlock_irq(&ctx->event_wqh.lock); + } + + return ret; +} + +static ssize_t userfaultfd_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct userfaultfd_ctx *ctx = file->private_data; + ssize_t _ret, ret = 0; + struct uffd_msg msg; + struct inode *inode = file_inode(file); + bool no_wait; + + if (!userfaultfd_is_initialized(ctx)) + return -EINVAL; + + no_wait = file->f_flags & O_NONBLOCK || iocb->ki_flags & IOCB_NOWAIT; + for (;;) { + if (iov_iter_count(to) < sizeof(msg)) + return ret ? ret : -EINVAL; + _ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode); + if (_ret < 0) + return ret ? ret : _ret; + _ret = !copy_to_iter_full(&msg, sizeof(msg), to); + if (_ret) + return ret ? ret : -EFAULT; + ret += sizeof(msg); + /* + * Allow to read more than one fault at time but only + * block if waiting for the very first one. + */ + no_wait = true; + } +} + +static void __wake_userfault(struct userfaultfd_ctx *ctx, + struct userfaultfd_wake_range *range) +{ + spin_lock_irq(&ctx->fault_pending_wqh.lock); + /* wake all in the range and autoremove */ + if (waitqueue_active(&ctx->fault_pending_wqh)) + __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, + range); + if (waitqueue_active(&ctx->fault_wqh)) + __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range); + spin_unlock_irq(&ctx->fault_pending_wqh.lock); +} + +static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, + struct userfaultfd_wake_range *range) +{ + unsigned seq; + bool need_wakeup; + + /* + * To be sure waitqueue_active() is not reordered by the CPU + * before the pagetable update, use an explicit SMP memory + * barrier here. PT lock release or mmap_read_unlock(mm) still + * have release semantics that can allow the + * waitqueue_active() to be reordered before the pte update. 
+ */ + smp_mb(); + + /* + * Use waitqueue_active because it's very frequent to + * change the address space atomically even if there are no + * userfaults yet. So we take the spinlock only when we're + * sure we've userfaults to wake. + */ + do { + seq = read_seqcount_begin(&ctx->refile_seq); + need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) || + waitqueue_active(&ctx->fault_wqh); + cond_resched(); + } while (read_seqcount_retry(&ctx->refile_seq, seq)); + if (need_wakeup) + __wake_userfault(ctx, range); +} + +static __always_inline int validate_unaligned_range( + struct mm_struct *mm, __u64 start, __u64 len) +{ + __u64 task_size = mm->task_size; + + if (len & ~PAGE_MASK) + return -EINVAL; + if (!len) + return -EINVAL; + if (start >= task_size) + return -EINVAL; + if (len > task_size - start) + return -EINVAL; + if (start + len <= start) + return -EINVAL; + return 0; +} + +static __always_inline int validate_range(struct mm_struct *mm, + __u64 start, __u64 len) +{ + if (start & ~PAGE_MASK) + return -EINVAL; + + return validate_unaligned_range(mm, start, len); +} + +static int userfaultfd_register(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + struct mm_struct *mm = ctx->mm; + struct vm_area_struct *vma, *cur; + int ret; + struct uffdio_register uffdio_register; + struct uffdio_register __user *user_uffdio_register; + vm_flags_t vm_flags; + bool found; + bool basic_ioctls; + unsigned long start, end; + struct vma_iterator vmi; + bool wp_async = userfaultfd_wp_async_ctx(ctx); + + user_uffdio_register = (struct uffdio_register __user *) arg; + + ret = -EFAULT; + if (copy_from_user(&uffdio_register, user_uffdio_register, + sizeof(uffdio_register)-sizeof(__u64))) + goto out; + + ret = -EINVAL; + if (!uffdio_register.mode) + goto out; + if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES) + goto out; + vm_flags = 0; + if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) + vm_flags |= VM_UFFD_MISSING; + if (uffdio_register.mode & 
UFFDIO_REGISTER_MODE_WP) { + if (!pgtable_supports_uffd_wp()) + goto out; + + vm_flags |= VM_UFFD_WP; + } + if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) { +#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR + goto out; +#endif + vm_flags |= VM_UFFD_MINOR; + } + + ret = validate_range(mm, uffdio_register.range.start, + uffdio_register.range.len); + if (ret) + goto out; + + start = uffdio_register.range.start; + end = start + uffdio_register.range.len; + + ret = -ENOMEM; + if (!mmget_not_zero(mm)) + goto out; + + ret = -EINVAL; + mmap_write_lock(mm); + vma_iter_init(&vmi, mm, start); + vma = vma_find(&vmi, end); + if (!vma) + goto out_unlock; + + /* + * If the first vma contains huge pages, make sure start address + * is aligned to huge page size. + */ + if (is_vm_hugetlb_page(vma)) { + unsigned long vma_hpagesize = vma_kernel_pagesize(vma); + + if (start & (vma_hpagesize - 1)) + goto out_unlock; + } + + /* + * Search for not compatible vmas. + */ + found = false; + basic_ioctls = false; + cur = vma; + do { + cond_resched(); + + VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^ + !!(cur->vm_flags & __VM_UFFD_FLAGS)); + + /* check not compatible vmas */ + ret = -EINVAL; + if (!vma_can_userfault(cur, vm_flags, wp_async)) + goto out_unlock; + + /* + * UFFDIO_COPY will fill file holes even without + * PROT_WRITE. This check enforces that if this is a + * MAP_SHARED, the process has write permission to the backing + * file. If VM_MAYWRITE is set it also enforces that on a + * MAP_SHARED vma: there is no F_WRITE_SEAL and no further + * F_WRITE_SEAL can be taken until the vma is destroyed. + */ + ret = -EPERM; + if (unlikely(!(cur->vm_flags & VM_MAYWRITE))) + goto out_unlock; + + /* + * If this vma contains ending address, and huge pages + * check alignment. 
+ */ + if (is_vm_hugetlb_page(cur) && end <= cur->vm_end && + end > cur->vm_start) { + unsigned long vma_hpagesize = vma_kernel_pagesize(cur); + + ret = -EINVAL; + + if (end & (vma_hpagesize - 1)) + goto out_unlock; + } + if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE)) + goto out_unlock; + + /* + * Check that this vma isn't already owned by a + * different userfaultfd. We can't allow more than one + * userfaultfd to own a single vma simultaneously or we + * wouldn't know which one to deliver the userfaults to. + */ + ret = -EBUSY; + if (cur->vm_userfaultfd_ctx.ctx && + cur->vm_userfaultfd_ctx.ctx != ctx) + goto out_unlock; + + /* + * Note vmas containing huge pages + */ + if (is_vm_hugetlb_page(cur)) + basic_ioctls = true; + + found = true; + } for_each_vma_range(vmi, cur, end); + VM_WARN_ON_ONCE(!found); + + ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end, + wp_async); + +out_unlock: + mmap_write_unlock(mm); + mmput(mm); + if (!ret) { + __u64 ioctls_out; + + ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC : + UFFD_API_RANGE_IOCTLS; + + /* + * Declare the WP ioctl only if the WP mode is + * specified and all checks passed with the range + */ + if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)) + ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT); + + /* CONTINUE ioctl is only supported for MINOR ranges. */ + if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR)) + ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE); + + /* + * Now that we scanned all vmas we can already tell + * userland which ioctls methods are guaranteed to + * succeed on this range. 
+ */ + if (put_user(ioctls_out, &user_uffdio_register->ioctls)) + ret = -EFAULT; + } +out: + return ret; +} + +static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + struct mm_struct *mm = ctx->mm; + struct vm_area_struct *vma, *prev, *cur; + int ret; + struct uffdio_range uffdio_unregister; + bool found; + unsigned long start, end, vma_end; + const void __user *buf = (void __user *)arg; + struct vma_iterator vmi; + bool wp_async = userfaultfd_wp_async_ctx(ctx); + + ret = -EFAULT; + if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) + goto out; + + ret = validate_range(mm, uffdio_unregister.start, + uffdio_unregister.len); + if (ret) + goto out; + + start = uffdio_unregister.start; + end = start + uffdio_unregister.len; + + ret = -ENOMEM; + if (!mmget_not_zero(mm)) + goto out; + + mmap_write_lock(mm); + ret = -EINVAL; + vma_iter_init(&vmi, mm, start); + vma = vma_find(&vmi, end); + if (!vma) + goto out_unlock; + + /* + * If the first vma contains huge pages, make sure start address + * is aligned to huge page size. + */ + if (is_vm_hugetlb_page(vma)) { + unsigned long vma_hpagesize = vma_kernel_pagesize(vma); + + if (start & (vma_hpagesize - 1)) + goto out_unlock; + } + + /* + * Search for not compatible vmas. + */ + found = false; + cur = vma; + do { + cond_resched(); + + VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^ + !!(cur->vm_flags & __VM_UFFD_FLAGS)); + + /* + * Prevent unregistering through a different userfaultfd than + * the one used for registration. + */ + if (cur->vm_userfaultfd_ctx.ctx && + cur->vm_userfaultfd_ctx.ctx != ctx) + goto out_unlock; + + /* + * Check not compatible vmas, not strictly required + * here as not compatible vmas cannot have an + * userfaultfd_ctx registered on them, but this + * provides for more strict behavior to notice + * unregistration errors. 
+ */ + if (!vma_can_userfault(cur, cur->vm_flags, wp_async)) + goto out_unlock; + + found = true; + } for_each_vma_range(vmi, cur, end); + VM_WARN_ON_ONCE(!found); + + vma_iter_set(&vmi, start); + prev = vma_prev(&vmi); + if (vma->vm_start < start) + prev = vma; + + ret = 0; + for_each_vma_range(vmi, vma, end) { + cond_resched(); + + /* VMA not registered with userfaultfd. */ + if (!vma->vm_userfaultfd_ctx.ctx) + goto skip; + + VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx != ctx); + VM_WARN_ON_ONCE(!vma_can_userfault(vma, vma->vm_flags, wp_async)); + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)); + + if (vma->vm_start > start) + start = vma->vm_start; + vma_end = min(end, vma->vm_end); + + if (userfaultfd_missing(vma)) { + /* + * Wake any concurrent pending userfault while + * we unregister, so they will not hang + * permanently and it avoids userland to call + * UFFDIO_WAKE explicitly. + */ + struct userfaultfd_wake_range range; + range.start = start; + range.len = vma_end - start; + wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range); + } + + vma = userfaultfd_clear_vma(&vmi, prev, vma, + start, vma_end); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + break; + } + +skip: + prev = vma; + start = vma->vm_end; + } + +out_unlock: + mmap_write_unlock(mm); + mmput(mm); +out: + return ret; +} + +/* + * userfaultfd_wake may be used in combination with the + * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches. 
+ */ +static int userfaultfd_wake(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + int ret; + struct uffdio_range uffdio_wake; + struct userfaultfd_wake_range range; + const void __user *buf = (void __user *)arg; + + ret = -EFAULT; + if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake))) + goto out; + + ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len); + if (ret) + goto out; + + range.start = uffdio_wake.start; + range.len = uffdio_wake.len; + + /* + * len == 0 means wake all and we don't want to wake all here, + * so check it again to be sure. + */ + VM_WARN_ON_ONCE(!range.len); + + wake_userfault(ctx, &range); + ret = 0; + +out: + return ret; +} + +static int userfaultfd_copy(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + __s64 ret; + struct uffdio_copy uffdio_copy; + struct uffdio_copy __user *user_uffdio_copy; + struct userfaultfd_wake_range range; + uffd_flags_t flags = 0; + + user_uffdio_copy = (struct uffdio_copy __user *) arg; + + ret = -EAGAIN; + if (unlikely(atomic_read(&ctx->mmap_changing))) { + if (unlikely(put_user(ret, &user_uffdio_copy->copy))) + return -EFAULT; + goto out; + } + + ret = -EFAULT; + if (copy_from_user(&uffdio_copy, user_uffdio_copy, + /* don't copy "copy" last field */ + sizeof(uffdio_copy)-sizeof(__s64))) + goto out; + + ret = validate_unaligned_range(ctx->mm, uffdio_copy.src, + uffdio_copy.len); + if (ret) + goto out; + ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len); + if (ret) + goto out; + + ret = -EINVAL; + if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) + goto out; + if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP) + flags |= MFILL_ATOMIC_WP; + if (mmget_not_zero(ctx->mm)) { + ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src, + uffdio_copy.len, flags); + mmput(ctx->mm); + } else { + return -ESRCH; + } + if (unlikely(put_user(ret, &user_uffdio_copy->copy))) + return -EFAULT; + if (ret < 0) + goto out; + VM_WARN_ON_ONCE(!ret); + /* len == 0 would 
wake all */ + range.len = ret; + if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) { + range.start = uffdio_copy.dst; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_copy.len ? 0 : -EAGAIN; +out: + return ret; +} + +static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + __s64 ret; + struct uffdio_zeropage uffdio_zeropage; + struct uffdio_zeropage __user *user_uffdio_zeropage; + struct userfaultfd_wake_range range; + + user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; + + ret = -EAGAIN; + if (unlikely(atomic_read(&ctx->mmap_changing))) { + if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) + return -EFAULT; + goto out; + } + + ret = -EFAULT; + if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage, + /* don't copy "zeropage" last field */ + sizeof(uffdio_zeropage)-sizeof(__s64))) + goto out; + + ret = validate_range(ctx->mm, uffdio_zeropage.range.start, + uffdio_zeropage.range.len); + if (ret) + goto out; + ret = -EINVAL; + if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE) + goto out; + + if (mmget_not_zero(ctx->mm)) { + ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start, + uffdio_zeropage.range.len); + mmput(ctx->mm); + } else { + return -ESRCH; + } + if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) + return -EFAULT; + if (ret < 0) + goto out; + /* len == 0 would wake all */ + VM_WARN_ON_ONCE(!ret); + range.len = ret; + if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) { + range.start = uffdio_zeropage.range.start; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_zeropage.range.len ? 
0 : -EAGAIN; +out: + return ret; +} + +static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + int ret; + struct uffdio_writeprotect uffdio_wp; + struct uffdio_writeprotect __user *user_uffdio_wp; + struct userfaultfd_wake_range range; + bool mode_wp, mode_dontwake; + + if (atomic_read(&ctx->mmap_changing)) + return -EAGAIN; + + user_uffdio_wp = (struct uffdio_writeprotect __user *) arg; + + if (copy_from_user(&uffdio_wp, user_uffdio_wp, + sizeof(struct uffdio_writeprotect))) + return -EFAULT; + + ret = validate_range(ctx->mm, uffdio_wp.range.start, + uffdio_wp.range.len); + if (ret) + return ret; + + if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE | + UFFDIO_WRITEPROTECT_MODE_WP)) + return -EINVAL; + + mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP; + mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE; + + if (mode_wp && mode_dontwake) + return -EINVAL; + + if (mmget_not_zero(ctx->mm)) { + ret = mwriteprotect_range(ctx, uffdio_wp.range.start, + uffdio_wp.range.len, mode_wp); + mmput(ctx->mm); + } else { + return -ESRCH; + } + + if (ret) + return ret; + + if (!mode_wp && !mode_dontwake) { + range.start = uffdio_wp.range.start; + range.len = uffdio_wp.range.len; + wake_userfault(ctx, &range); + } + return ret; +} + +static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) +{ + __s64 ret; + struct uffdio_continue uffdio_continue; + struct uffdio_continue __user *user_uffdio_continue; + struct userfaultfd_wake_range range; + uffd_flags_t flags = 0; + + user_uffdio_continue = (struct uffdio_continue __user *)arg; + + ret = -EAGAIN; + if (unlikely(atomic_read(&ctx->mmap_changing))) { + if (unlikely(put_user(ret, &user_uffdio_continue->mapped))) + return -EFAULT; + goto out; + } + + ret = -EFAULT; + if (copy_from_user(&uffdio_continue, user_uffdio_continue, + /* don't copy the output fields */ + sizeof(uffdio_continue) - (sizeof(__s64)))) + goto out; + + ret = 
validate_range(ctx->mm, uffdio_continue.range.start, + uffdio_continue.range.len); + if (ret) + goto out; + + ret = -EINVAL; + if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE | + UFFDIO_CONTINUE_MODE_WP)) + goto out; + if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP) + flags |= MFILL_ATOMIC_WP; + + if (mmget_not_zero(ctx->mm)) { + ret = mfill_atomic_continue(ctx, uffdio_continue.range.start, + uffdio_continue.range.len, flags); + mmput(ctx->mm); + } else { + return -ESRCH; + } + + if (unlikely(put_user(ret, &user_uffdio_continue->mapped))) + return -EFAULT; + if (ret < 0) + goto out; + + /* len == 0 would wake all */ + VM_WARN_ON_ONCE(!ret); + range.len = ret; + if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) { + range.start = uffdio_continue.range.start; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN; + +out: + return ret; +} + +static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg) +{ + __s64 ret; + struct uffdio_poison uffdio_poison; + struct uffdio_poison __user *user_uffdio_poison; + struct userfaultfd_wake_range range; + + user_uffdio_poison = (struct uffdio_poison __user *)arg; + + ret = -EAGAIN; + if (unlikely(atomic_read(&ctx->mmap_changing))) { + if (unlikely(put_user(ret, &user_uffdio_poison->updated))) + return -EFAULT; + goto out; + } + + ret = -EFAULT; + if (copy_from_user(&uffdio_poison, user_uffdio_poison, + /* don't copy the output fields */ + sizeof(uffdio_poison) - (sizeof(__s64)))) + goto out; + + ret = validate_range(ctx->mm, uffdio_poison.range.start, + uffdio_poison.range.len); + if (ret) + goto out; + + ret = -EINVAL; + if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE) + goto out; + + if (mmget_not_zero(ctx->mm)) { + ret = mfill_atomic_poison(ctx, uffdio_poison.range.start, + uffdio_poison.range.len, 0); + mmput(ctx->mm); + } else { + return -ESRCH; + } + + if (unlikely(put_user(ret, &user_uffdio_poison->updated))) + return -EFAULT; 
+ if (ret < 0) + goto out; + + /* len == 0 would wake all */ + VM_WARN_ON_ONCE(!ret); + range.len = ret; + if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) { + range.start = uffdio_poison.range.start; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN; + +out: + return ret; +} + +bool userfaultfd_wp_async(struct vm_area_struct *vma) +{ + return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx); +} + +static inline unsigned int uffd_ctx_features(__u64 user_features) +{ + /* + * For the current set of features the bits just coincide. Set + * UFFD_FEATURE_INITIALIZED to mark the features as enabled. + */ + return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED; +} + +static int userfaultfd_move(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + __s64 ret; + struct uffdio_move uffdio_move; + struct uffdio_move __user *user_uffdio_move; + struct userfaultfd_wake_range range; + struct mm_struct *mm = ctx->mm; + + user_uffdio_move = (struct uffdio_move __user *) arg; + + ret = -EAGAIN; + if (unlikely(atomic_read(&ctx->mmap_changing))) { + if (unlikely(put_user(ret, &user_uffdio_move->move))) + return -EFAULT; + goto out; + } + + if (copy_from_user(&uffdio_move, user_uffdio_move, + /* don't copy "move" last field */ + sizeof(uffdio_move)-sizeof(__s64))) + return -EFAULT; + + /* Do not allow cross-mm moves. 
*/ + if (mm != current->mm) + return -EINVAL; + + ret = validate_range(mm, uffdio_move.dst, uffdio_move.len); + if (ret) + return ret; + + ret = validate_range(mm, uffdio_move.src, uffdio_move.len); + if (ret) + return ret; + + if (uffdio_move.mode & ~(UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES| + UFFDIO_MOVE_MODE_DONTWAKE)) + return -EINVAL; + + if (mmget_not_zero(mm)) { + ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src, + uffdio_move.len, uffdio_move.mode); + mmput(mm); + } else { + return -ESRCH; + } + + if (unlikely(put_user(ret, &user_uffdio_move->move))) + return -EFAULT; + if (ret < 0) + goto out; + + /* len == 0 would wake all */ + VM_WARN_ON(!ret); + range.len = ret; + if (!(uffdio_move.mode & UFFDIO_MOVE_MODE_DONTWAKE)) { + range.start = uffdio_move.dst; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_move.len ? 0 : -EAGAIN; + +out: + return ret; +} + +/* + * userland asks for a certain API version and we return which bits + * and ioctl commands are implemented in this kernel for such API + * version or -EINVAL if unknown. 
+ */ +static int userfaultfd_api(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + struct uffdio_api uffdio_api; + void __user *buf = (void __user *)arg; + unsigned int ctx_features; + int ret; + __u64 features; + + ret = -EFAULT; + if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) + goto out; + features = uffdio_api.features; + ret = -EINVAL; + if (uffdio_api.api != UFFD_API) + goto err_out; + ret = -EPERM; + if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE)) + goto err_out; + + /* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */ + if (features & UFFD_FEATURE_WP_ASYNC) + features |= UFFD_FEATURE_WP_UNPOPULATED; + + /* report all available features and ioctls to userland */ + uffdio_api.features = UFFD_API_FEATURES; +#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR + uffdio_api.features &= + ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM); +#endif + if (!pgtable_supports_uffd_wp()) + uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP; + + if (!uffd_supports_wp_marker()) { + uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; + uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; + uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC; + } + + ret = -EINVAL; + if (features & ~uffdio_api.features) + goto err_out; + + uffdio_api.ioctls = UFFD_API_IOCTLS; + ret = -EFAULT; + if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) + goto out; + + /* only enable the requested features for this uffd context */ + ctx_features = uffd_ctx_features(features); + ret = -EINVAL; + if (cmpxchg(&ctx->features, 0, ctx_features) != 0) + goto err_out; + + ret = 0; +out: + return ret; +err_out: + memset(&uffdio_api, 0, sizeof(uffdio_api)); + if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) + ret = -EFAULT; + goto out; +} + +static long userfaultfd_ioctl(struct file *file, unsigned cmd, + unsigned long arg) +{ + int ret = -EINVAL; + struct userfaultfd_ctx *ctx = file->private_data; + + if (cmd != UFFDIO_API && 
!userfaultfd_is_initialized(ctx)) + return -EINVAL; + + switch (cmd) { + case UFFDIO_API: + ret = userfaultfd_api(ctx, arg); + break; + case UFFDIO_REGISTER: + ret = userfaultfd_register(ctx, arg); + break; + case UFFDIO_UNREGISTER: + ret = userfaultfd_unregister(ctx, arg); + break; + case UFFDIO_WAKE: + ret = userfaultfd_wake(ctx, arg); + break; + case UFFDIO_COPY: + ret = userfaultfd_copy(ctx, arg); + break; + case UFFDIO_ZEROPAGE: + ret = userfaultfd_zeropage(ctx, arg); + break; + case UFFDIO_MOVE: + ret = userfaultfd_move(ctx, arg); + break; + case UFFDIO_WRITEPROTECT: + ret = userfaultfd_writeprotect(ctx, arg); + break; + case UFFDIO_CONTINUE: + ret = userfaultfd_continue(ctx, arg); + break; + case UFFDIO_POISON: + ret = userfaultfd_poison(ctx, arg); + break; + } + return ret; +} + +#ifdef CONFIG_PROC_FS +static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct userfaultfd_ctx *ctx = f->private_data; + wait_queue_entry_t *wq; + unsigned long pending = 0, total = 0; + + spin_lock_irq(&ctx->fault_pending_wqh.lock); + list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) { + pending++; + total++; + } + list_for_each_entry(wq, &ctx->fault_wqh.head, entry) { + total++; + } + spin_unlock_irq(&ctx->fault_pending_wqh.lock); + + /* + * If more protocols will be added, there will be all shown + * separated by a space. Like this: + * protocols: aa:... bb:... 
+ */ + seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n", + pending, total, UFFD_API, ctx->features, + UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS); +} +#endif + +static const struct file_operations userfaultfd_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = userfaultfd_show_fdinfo, +#endif + .release = userfaultfd_release, + .poll = userfaultfd_poll, + .read_iter = userfaultfd_read_iter, + .unlocked_ioctl = userfaultfd_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .llseek = noop_llseek, +}; + +static void init_once_userfaultfd_ctx(void *mem) +{ + struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem; + + init_waitqueue_head(&ctx->fault_pending_wqh); + init_waitqueue_head(&ctx->fault_wqh); + init_waitqueue_head(&ctx->event_wqh); + init_waitqueue_head(&ctx->fd_wqh); + seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock); +} + +static int new_userfaultfd(int flags) +{ + struct userfaultfd_ctx *ctx __free(kfree) = NULL; + + VM_WARN_ON_ONCE(!current->mm); + + /* Check the UFFD_* constants for consistency. 
*/ + BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS); + + if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY)) + return -EINVAL; + + ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + refcount_set(&ctx->refcount, 1); + ctx->flags = flags; + ctx->features = 0; + ctx->released = false; + init_rwsem(&ctx->map_changing_lock); + atomic_set(&ctx->mmap_changing, 0); + ctx->mm = current->mm; + + FD_PREPARE(fdf, flags & UFFD_SHARED_FCNTL_FLAGS, + anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx, + O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), + NULL)); + if (fdf.err) + return fdf.err; + + /* prevent the mm struct from being freed */ + mmgrab(ctx->mm); + fd_prepare_file(fdf)->f_mode |= FMODE_NOWAIT; + retain_and_null_ptr(ctx); + return fd_publish(fdf); +} + +static inline bool userfaultfd_syscall_allowed(int flags) +{ + /* Userspace-only page faults are always allowed */ + if (flags & UFFD_USER_MODE_ONLY) + return true; + + /* + * The user is requesting a userfaultfd which can handle kernel faults. + * Privileged users are always allowed to do this. + */ + if (capable(CAP_SYS_PTRACE)) + return true; + + /* Otherwise, access to kernel fault handling is sysctl controlled.
*/ + return sysctl_unprivileged_userfaultfd; +} + +SYSCALL_DEFINE1(userfaultfd, int, flags) +{ + if (!userfaultfd_syscall_allowed(flags)) + return -EPERM; + + return new_userfaultfd(flags); +} + +static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags) +{ + if (cmd != USERFAULTFD_IOC_NEW) + return -EINVAL; + + return new_userfaultfd(flags); +} + +static const struct file_operations userfaultfd_dev_fops = { + .unlocked_ioctl = userfaultfd_dev_ioctl, + .compat_ioctl = userfaultfd_dev_ioctl, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice userfaultfd_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "userfaultfd", + .fops = &userfaultfd_dev_fops +}; + +static int __init userfaultfd_init(void) +{ + int ret; + + ret = misc_register(&userfaultfd_misc); + if (ret) + return ret; + + userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache", + sizeof(struct userfaultfd_ctx), + 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, + init_once_userfaultfd_ctx); +#ifdef CONFIG_SYSCTL + register_sysctl_init("vm", vm_userfaultfd_table); +#endif + return 0; +} +__initcall(userfaultfd_init); From b182633f8ce52172a40097ccc0c60047e58c2320 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sat, 23 May 2026 20:37:59 +0300 Subject: [PATCH 247/321] userfaultfd: make functions that are not used outside uffd static After merging fs/userfaultfd.c into mm/userfaultfd.c, several functions that were previously shared between the two files are now only used within mm/userfaultfd.c. Make them static and remove their declarations from include/linux/userfaultfd_k.h. Link: https://lore.kernel.org/20260523173759.3964908-3-rppt@kernel.org Assisted-by: Copilot:claude-opus-4-6 Signed-off-by: Mike Rapoport (Microsoft) Cc: Al Viro Cc: Christian Brauner Cc: David Hildenbrand Cc: Jan Kara Cc: "Kirill A. 
Shutemov" Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 36 ----------------------------------- mm/userfaultfd.c | 24 +++++++++++------------ 2 files changed, 12 insertions(+), 48 deletions(-) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index d2920f98ab86..3ec8e1071673 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -147,26 +147,12 @@ static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_at /* Flags controlling behavior. These behavior changes are mode-independent. */ #define MFILL_ATOMIC_WP MFILL_ATOMIC_FLAG(0) -extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, - unsigned long src_start, unsigned long len, - uffd_flags_t flags); -extern ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, - unsigned long dst_start, - unsigned long len); -extern ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start, - unsigned long len, uffd_flags_t flags); -extern ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start, - unsigned long len, uffd_flags_t flags); -extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, - unsigned long len, bool enable_wp); extern long uffd_wp_range(struct vm_area_struct *vma, unsigned long start, unsigned long len, bool enable_wp); /* move_pages */ void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2); void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2); -ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, - unsigned long src_start, unsigned long len, __u64 flags); int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, @@ -239,9 +225,6 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) return vma->vm_flags & __VM_UFFD_FLAGS; } -bool vma_can_userfault(struct 
vm_area_struct *vma, vm_flags_t vm_flags, - bool wp_async); - static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma) { struct userfaultfd_ctx *uffd_ctx = vma->vm_userfaultfd_ctx.ctx; @@ -271,25 +254,6 @@ extern void userfaultfd_unmap_complete(struct mm_struct *mm, extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma); extern bool userfaultfd_wp_async(struct vm_area_struct *vma); -void userfaultfd_reset_ctx(struct vm_area_struct *vma); - -struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, - unsigned long end); - -int userfaultfd_register_range(struct userfaultfd_ctx *ctx, - struct vm_area_struct *vma, - vm_flags_t vm_flags, - unsigned long start, unsigned long end, - bool wp_async); - -void userfaultfd_release_new(struct userfaultfd_ctx *ctx); - -void userfaultfd_release_all(struct mm_struct *mm, - struct userfaultfd_ctx *ctx); - static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) { /* Only wr-protect mode uses pte markers */ diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 74af5682f3fb..c86daf38d154 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1033,7 +1033,7 @@ out: return copied ? 
copied : err; } -ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, +static ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, uffd_flags_t flags) { @@ -1041,7 +1041,7 @@ ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY)); } -ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, +static ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long len) { @@ -1049,7 +1049,7 @@ ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE)); } -ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start, +static ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long len, uffd_flags_t flags) { @@ -1065,7 +1065,7 @@ ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start, uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE)); } -ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start, +static ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long len, uffd_flags_t flags) { return mfill_atomic(ctx, start, 0, len, @@ -1101,7 +1101,7 @@ long uffd_wp_range(struct vm_area_struct *dst_vma, return ret; } -int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, +static int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long len, bool enable_wp) { struct mm_struct *dst_mm = ctx->mm; @@ -1931,7 +1931,7 @@ static void uffd_move_unlock(struct vm_area_struct *dst_vma, * in the regions or not, but preventing the risk of having to split * the hugepmd during the remap. 
*/ -ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, +static ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, __u64 mode) { struct mm_struct *mm = ctx->mm; @@ -2106,7 +2106,7 @@ out: return moved ? moved : err; } -bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags, +static bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags, bool wp_async) { const struct vm_uffd_ops *ops = vma_uffd_ops(vma); @@ -2163,12 +2163,12 @@ static void userfaultfd_set_ctx(struct vm_area_struct *vma, (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags); } -void userfaultfd_reset_ctx(struct vm_area_struct *vma) +static void userfaultfd_reset_ctx(struct vm_area_struct *vma) { userfaultfd_set_ctx(vma, NULL, 0); } -struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, +static struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, @@ -2207,7 +2207,7 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, } /* Assumes mmap write lock taken, and mm_struct pinned. 
*/ -int userfaultfd_register_range(struct userfaultfd_ctx *ctx, +static int userfaultfd_register_range(struct userfaultfd_ctx *ctx, struct vm_area_struct *vma, vm_flags_t vm_flags, unsigned long start, unsigned long end, @@ -2271,7 +2271,7 @@ skip: return 0; } -void userfaultfd_release_new(struct userfaultfd_ctx *ctx) +static void userfaultfd_release_new(struct userfaultfd_ctx *ctx) { struct mm_struct *mm = ctx->mm; struct vm_area_struct *vma; @@ -2286,7 +2286,7 @@ void userfaultfd_release_new(struct userfaultfd_ctx *ctx) mmap_write_unlock(mm); } -void userfaultfd_release_all(struct mm_struct *mm, +static void userfaultfd_release_all(struct mm_struct *mm, struct userfaultfd_ctx *ctx) { struct vm_area_struct *vma, *prev; From 0491e9f75c1515ecff3dfb7d7bd4243e6f47027d Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 28 Apr 2026 02:06:52 +0800 Subject: [PATCH 248/321] mm/mglru: consolidate common code for retrieving evictable size Patch series "mm/mglru: improve reclaim loop and dirty folio", v7. This series cleans up and slightly improves MGLRU's reclaim loop and dirty writeback handling. As a result, we can see an up to ~30% increase in some workloads like MongoDB with YCSB and a huge decrease in file refault, no swap involved. Other common benchmarks have no regression, and LOC is reduced, with less unexpected OOM, too. Some of the problems were found in our production environment, and others were mostly exposed while stress testing during the development of the LSM/MM/BPF topic on improving MGLRU [1]. This series cleans up the code base and fixes several performance issues, preparing for further work. MGLRU's reclaim loop is a bit complex, and hence these problems are somehow related to each other. The aging, scan number calculation, and reclaim loop are coupled together, and the dirty folio handling logic is quite different, making the reclaim loop hard to follow and the dirty flush ineffective. 
This series slightly cleans up and improves these issues using a scan budget by calculating the number of folios to scan at the beginning of the loop, and decouples aging from the reclaim calculation helpers. Then, move the dirty flush logic inside the reclaim loop so it can kick in more effectively. These issues are somehow related, and this series handles them and improves MGLRU reclaim in many ways. Test results: All tests are done on a 48c96t NUMA machine with 2 nodes and a 128G memory machine using NVME as storage. Classical (non-MGLRU) LRU numbers are included as "MGLRU disabled" for each benchmark below; see [8] and [9] for the longer write-up. MongoDB ======= Running YCSB workloadb [2] (recordcount:20000000 operationcount:6000000, threads:32), which does 95% read and 5% update to generate mixed read and dirty writeback. MongoDB is set up in a 10G cgroup using Docker, and the WiredTiger cache size is set to 4.5G, using NVME as storage. This is close to the case we observed regressing in our production environment: mixed read and writeback pressure, so it is a practical case for evaluation. Not using SWAP. The intent is to isolate the file LRU writeback path. Enabling SWAP would just add noise from anonymous reclaim. MGLRU Before: Throughput(ops/sec): 60653.502655 workingset_refault_file 12904916 pgpgin 165366622 pgpgout 5219588 MGLRU After: Throughput(ops/sec): 82384.354760 (+35.8%, higher is better) workingset_refault_file 7128285 (-44.7%, lower is better) pgpgin 113170693 (-31.5%, lower is better) pgpgout 5639724 MGLRU Disabled: Throughput(ops/sec): 93713.640901 workingset_refault_file 15013443 pgpgin 85365614 pgpgout 5866508 We can see a significant performance improvement after this series. The test is done on NVME and the performance gap would be even larger for slow devices, such as HDD or network storage. We observed over 100% gain for some workloads with slow IO. 
Note, classical LRU is still faster for this benchmark, MGLRU may catch up later with further work [7]. Chrome & Node.js [3] ==================== Using Yu Zhao's test script [3], testing on a x86_64 NUMA machine with 2 nodes and 128G memory, using 256G ZRAM as swap and spawn 32 memcg 64 workers. Many memcgs each applying roughly equal pressure exercises the LRU's ability to detect/protect each tenant's working set and to balance reclamation fairly between tenants, which makes this a meaningful test for the reclaim mechanism. Fairness is reported via Jain's fairness index (1.0 means all tenants get exactly equal allocation, lower is worse). Under equal pressure, all memcgs should make roughly equal forward progress. See [8] for the longer rationale and per-memcg breakdown. MGLRU before: Total requests: 81898 Per-worker mean: 1279.7 Per-worker 95% CI (mean): [ 1259.0, 1300.4] Jain's fairness index: 0.995893 (1.0 = perfectly fair) Latency: Bucket Count Pct Cumul [0,1)s 28392 34.67% 34.67% [1,2)s 8022 9.80% 44.46% [2,4)s 6130 7.48% 51.95% [4,8)s 39354 48.05% 100.00% MGLRU after: Total requests: 82901 Per-worker mean: 1295.3 Per-worker 95% CI (mean): [ 1265.3, 1325.4] Jain's fairness index: 0.991607 (1.0 = perfectly fair) Latency: Bucket Count Pct Cumul [0,1)s 28128 33.93% 33.93% [1,2)s 8756 10.56% 44.49% [2,4)s 7028 8.48% 52.97% [4,8)s 38989 47.03% 100.00% MGLRU disabled: Total requests: 62399 Per-worker mean: 975.0 Per-worker 95% CI (mean): [ 941.9, 1008.1] Jain's fairness index: 0.982156 (1.0 = perfectly fair) Latency: Bucket Count Pct Cumul [0,1)s 20051 32.13% 32.13% [1,2)s 2255 3.61% 35.75% [2,4)s 6149 9.85% 45.60% [4,8)s 33927 54.37% 99.97% [8,16)s 17 0.03% 100.00% Reclaim is still fair and effective, total requests number seems slightly better. OOM issue with aging and throttling =================================== For the throttling OOM issue, it can be easily reproduced using dd and cgroup limit as demonstrated and fixed by a later patch in this series. 
The aging OOM is a bit tricky, a specific reproducer can be used to simulate what we encountered in production environment [4]: Spawns multiple workers that keep reading the given file using mmap, and pauses for 120ms after one file read batch. It also spawns another set of workers that keep allocating and freeing a given size of anonymous memory. The total memory size exceeds the memory limit (eg. 14G anon + 8G file, which is 22G vs a 16G memcg limit). - MGLRU disabled: Finished 128 iterations. - MGLRU enabled: OOM with following info after about ~10-20 iterations: [ 62.624130] file_anon_mix_p invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0 [ 62.624999] memory: usage 16777216kB, limit 16777216kB, failcnt 24460 [ 62.640200] swap: usage 0kB, limit 9007199254740988kB, failcnt 0 [ 62.640823] Memory cgroup stats for /demo: [ 62.641017] anon 10604879872 [ 62.641941] file 6574858240 OOM occurs despite there being still evictable file folios. - MGLRU enabled after this series: Finished 128 iterations. Worth noting there is another OOM related issue reported in V1 of this series, which is tested and looking OK now [5]. MySQL: ====== Testing with innodb_buffer_pool_size=26106127360, in a 2G memcg, using ZRAM as swap and test command: sysbench /usr/share/sysbench/oltp_read_only.lua --mysql-db=sb \ --tables=48 --table-size=2000000 --threads=48 --time=600 run A 24G InnoDB buffer pool inside a 2G memcg with ZRAM as swap forces aggressive eviction of cached database anon pages, which exercises the LRU's hot page detection and the eviction path under swap pressure. The workload is practical, and the pressure is higher than what we usually see in production but it is intended to expose the extreme case. MGLRU before: 17313.688333 tps MGLRU after: 17286.195000 tps MGLRU disabled: 16245.330000 tps Seems only noise level changes, no regression. 
FIO: ==== Testing with the following command, where /mnt/ramdisk is a 64G EXT4 ramdisk, each test file is 3G, in a 10G memcg, 6 test run each: fio --directory=/mnt/ramdisk --filename_format='test.$jobnum.img' \ --name=cached --numjobs=16 --size=3072M --buffered=1 --ioengine=mmap \ --rw=randread --norandommap --time_based \ --ramp_time=1m --runtime=5m --group_reporting Random buffered mmap read on a ramdisk strips out storage variance and stresses purely the LRU's ability to evict and recycle the page cache under heavy random read pressure. MGLRU before: 9033.91 MB/s MGLRU after: 9065.72 MB/s MGLRU disabled: 8254.54 MB/s Also seem only noise level changes and no regression or slightly better. Build kernel: ============= Build kernel test using ZRAM as swap, kernel source on tmpfs, in a memcg with memory.max=3G, using make -j96 and defconfig, measuring system time, 6 test run each. Building the kernel is a classical mixed anon + file workload (lots of small file reads/writes plus parallel anon allocations from cc/ld) and is representative of many real compilation jobs. MGLRU before: 2823.13s MGLRU after: 2801.26s MGLRU disabled: 5023.50s Also seem only noise level changes, no regression or very slightly better. Android: ======== Xinyu reported a performance gain on Android, too, with this series. The test consisted of cold-starting multiple applications sequentially under moderate system load [6]; this is a real Android user-visible scenario, dominated by the LRU's ability to keep the right working set resident and re-fault launch-critical pages quickly. Before: Launch Time Summary (all apps, all runs) Mean 868.0ms P50 888.0ms P90 1274.2ms P95 1399.0ms After: Launch Time Summary (all apps, all runs) Mean 850.5ms (-2.07%) P50 861.5ms (-3.04%) P90 1179.0ms (-8.05%) P95 1228.0ms (-12.2%) This patch (of 15): Merge commonly used code for counting evictable folios in a lruvec. No behavior change. 
Link: https://lore.kernel.org/20260428-mglru-reclaim-v7-0-02fabb92dc43@tencent.com Link: https://lore.kernel.org/20260428-mglru-reclaim-v7-1-02fabb92dc43@tencent.com Link: https://lore.kernel.org/linux-mm/CAMgjq7BoekNjg-Ra3C8M7=8=75su38w=HD782T5E_cxyeCeH_g@mail.gmail.com/ [1] Link: https://github.com/brianfrankcooper/YCSB/blob/master/workloads/workloadb [2] Link: https://lore.kernel.org/all/20221220214923.1229538-1-yuzhao@google.com/ [3] Link: https://github.com/ryncsn/emm-test-project/tree/master/file-anon-mix-pressure [4] Link: https://lore.kernel.org/linux-mm/acgNCzRDVmSbXrOE@KASONG-MC4/ [5] Link: https://lore.kernel.org/linux-mm/20260417025123.2971253-1-wxy2009nrrr@163.com/ [6] Link: https://lore.kernel.org/linux-mm/20260502-mglru-fg-v1-0-913619b014d9@tencent.com/ [7] Link: https://lore.kernel.org/linux-mm/CAMgjq7BzQAPp8u_3-9e3ueXmRCoW=2sydok0hFM=MYL7VC1YYg@mail.gmail.com/ [8] Link: https://lore.kernel.org/linux-mm/CAMgjq7D+4QmiWe73OPFuH0s+ZKCUJoo+MfcWOdJcV+VO-T2Wmg@mail.gmail.com/ [9] Signed-off-by: Kairui Song Acked-by: Yuanchu Xie Reviewed-by: Barry Song Reviewed-by: Chen Ridong Reviewed-by: Axel Rasmussen Reviewed-by: Baolin Wang Acked-by: Shakeel Butt Cc: Chris Li Cc: David Hildenbrand Cc: David Stevens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vernon Yang Cc: Wei Xu Cc: Yafang Cc: Yu Zhao Cc: Leno Hou Signed-off-by: Andrew Morton --- mm/vmscan.c | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 76193a84a2af..5901219dd7fc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4088,27 +4088,33 @@ static void set_initial_priority(struct pglist_data *pgdat, struct scan_control sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY); } -static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) +static unsigned long lruvec_evictable_size(struct lruvec *lruvec, int swappiness) { int gen, type, zone; 
- unsigned long total = 0; - int swappiness = get_swappiness(lruvec, sc); + unsigned long seq, total = 0; struct lru_gen_folio *lrugen = &lruvec->lrugen; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); DEFINE_MIN_SEQ(lruvec); for_each_evictable_type(type, swappiness) { - unsigned long seq; - for (seq = min_seq[type]; seq <= max_seq; seq++) { gen = lru_gen_from_seq(seq); - for (zone = 0; zone < MAX_NR_ZONES; zone++) total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); } } + return total; +} + +static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) +{ + unsigned long total; + int swappiness = get_swappiness(lruvec, sc); + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + + total = lruvec_evictable_size(lruvec, swappiness); + /* whether the size is big enough to be helpful */ return mem_cgroup_online(memcg) ? (total >> sc->priority) : total; } @@ -4913,9 +4919,6 @@ retry: static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, int swappiness, unsigned long *nr_to_scan) { - int gen, type, zone; - unsigned long size = 0; - struct lru_gen_folio *lrugen = &lruvec->lrugen; DEFINE_MIN_SEQ(lruvec); *nr_to_scan = 0; @@ -4923,18 +4926,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, if (evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS > max_seq) return true; - for_each_evictable_type(type, swappiness) { - unsigned long seq; - - for (seq = min_seq[type]; seq <= max_seq; seq++) { - gen = lru_gen_from_seq(seq); - - for (zone = 0; zone < MAX_NR_ZONES; zone++) - size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); - } - } - - *nr_to_scan = size; + *nr_to_scan = lruvec_evictable_size(lruvec, swappiness); /* better to run aging even though eviction is still possible */ return evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS == max_seq; } From 790d3abeca092523621bd358f2d3362a95c571bf Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 28 Apr 2026 02:06:53 
+0800 Subject: [PATCH 249/321] mm/mglru: rename variables related to aging and rotation The current variable name isn't helpful. Make the variable names more meaningful. Only naming change, no behavior change. Link: https://lore.kernel.org/20260428-mglru-reclaim-v7-2-02fabb92dc43@tencent.com Signed-off-by: Kairui Song Suggested-by: Barry Song Reviewed-by: Baolin Wang Reviewed-by: Chen Ridong Reviewed-by: Barry Song Reviewed-by: Axel Rasmussen Acked-by: Shakeel Butt Cc: Chris Li Cc: David Hildenbrand Cc: David Stevens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Leno Hou Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vernon Yang Cc: Wei Xu Cc: Yafang Cc: Yuanchu Xie Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 5901219dd7fc..9c47a4aa825a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4938,7 +4938,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, */ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness) { - bool success; + bool need_aging; unsigned long nr_to_scan; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); @@ -4946,7 +4946,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int s if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) return -1; - success = should_run_aging(lruvec, max_seq, swappiness, &nr_to_scan); + need_aging = should_run_aging(lruvec, max_seq, swappiness, &nr_to_scan); /* try to scrape all its memory if this memcg was deleted */ if (nr_to_scan && !mem_cgroup_online(memcg)) @@ -4955,7 +4955,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int s nr_to_scan = apply_proportional_protection(memcg, sc, nr_to_scan); /* try to get away with not aging at the default priority */ - if (!success || sc->priority == DEF_PRIORITY) + if (!need_aging || sc->priority == 
DEF_PRIORITY) return nr_to_scan >> sc->priority; /* stop scanning this lruvec as it's low on cold folios */ @@ -5044,7 +5044,7 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) { - bool success; + bool need_rotate; unsigned long scanned = sc->nr_scanned; unsigned long reclaimed = sc->nr_reclaimed; struct mem_cgroup *memcg = lruvec_memcg(lruvec); @@ -5062,7 +5062,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) memcg_memory_event(memcg, MEMCG_LOW); } - success = try_to_shrink_lruvec(lruvec, sc); + need_rotate = try_to_shrink_lruvec(lruvec, sc); shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); @@ -5072,10 +5072,10 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) flush_reclaim_state(sc); - if (success && mem_cgroup_online(memcg)) + if (need_rotate && mem_cgroup_online(memcg)) return MEMCG_LRU_YOUNG; - if (!success && lruvec_is_sizable(lruvec, sc)) + if (!need_rotate && lruvec_is_sizable(lruvec, sc)) return 0; /* one retry if offlined or too small */ From aa6ef5b159dcc646d3add48c1580ba8e70df6c64 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 28 Apr 2026 02:06:54 +0800 Subject: [PATCH 250/321] mm/mglru: relocate the LRU scan batch limit to callers Same as active / inactive LRU, MGLRU isolates and scans folios in batches. The batch split is done hidden deep in the helper, which makes the code harder to follow. The helper's arguments are also confusing since callers usually request more folios than the batch size, so the helper almost never processes the full requested amount. Move the batch splitting into the top loop to make it cleaner, there should be no behavior change. 
Link: https://lore.kernel.org/20260428-mglru-reclaim-v7-3-02fabb92dc43@tencent.com Signed-off-by: Kairui Song Reviewed-by: Axel Rasmussen Reviewed-by: Baolin Wang Reviewed-by: Barry Song Reviewed-by: Chen Ridong Acked-by: Shakeel Butt Cc: Chris Li Cc: David Hildenbrand Cc: David Stevens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Leno Hou Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vernon Yang Cc: Wei Xu Cc: Yafang Cc: Yuanchu Xie Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 9c47a4aa825a..abe2ace8e326 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4699,10 +4699,10 @@ static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, int scanned = 0; int isolated = 0; int skipped = 0; - int scan_batch = min(nr_to_scan, MAX_LRU_BATCH); - int remaining = scan_batch; + unsigned long remaining = nr_to_scan; struct lru_gen_folio *lrugen = &lruvec->lrugen; + VM_WARN_ON_ONCE(nr_to_scan > MAX_LRU_BATCH); VM_WARN_ON_ONCE(!list_empty(list)); if (get_nr_gens(lruvec, type) == MIN_NR_GENS) @@ -4755,7 +4755,7 @@ static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, mod_lruvec_state(lruvec, item, isolated); mod_lruvec_state(lruvec, PGREFILL, sorted); mod_lruvec_state(lruvec, PGSCAN_ANON + type, isolated); - trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, scan_batch, + trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scanned, skipped, isolated, type ? 
LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); if (type == LRU_GEN_FILE) @@ -4991,7 +4991,7 @@ static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { - long nr_to_scan; + long nr_batch, nr_to_scan; unsigned long scanned = 0; int swappiness = get_swappiness(lruvec, sc); @@ -5002,7 +5002,8 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) if (nr_to_scan <= 0) break; - delta = evict_folios(nr_to_scan, lruvec, sc, swappiness); + nr_batch = min(nr_to_scan, MAX_LRU_BATCH); + delta = evict_folios(nr_batch, lruvec, sc, swappiness); if (!delta) break; @@ -5627,6 +5628,7 @@ static int run_aging(struct lruvec *lruvec, unsigned long seq, static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, int swappiness, unsigned long nr_to_reclaim) { + int nr_batch; DEFINE_MAX_SEQ(lruvec); if (seq + MIN_NR_GENS > max_seq) @@ -5643,8 +5645,8 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co if (sc->nr_reclaimed >= nr_to_reclaim) return 0; - if (!evict_folios(nr_to_reclaim - sc->nr_reclaimed, lruvec, sc, - swappiness)) + nr_batch = min(nr_to_reclaim - sc->nr_reclaimed, MAX_LRU_BATCH); + if (!evict_folios(nr_batch, lruvec, sc, swappiness)) return 0; cond_resched(); From 163bc3d68c9f373a96827ecefe370ff989f26747 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 28 Apr 2026 02:06:55 +0800 Subject: [PATCH 251/321] mm/mglru: restructure the reclaim loop The current loop will calculate the scan number on each iteration. The number of folios to scan is based on the LRU length, with some unclear behaviors, e.g, the scan number is only shifted by reclaim priority when aging is not needed or when at the default priority, and it couples the number calculation with aging and rotation. Adjust, simplify it, and decouple aging and rotation. 
Just calculate the scan number once at the beginning of the reclaim, always respect the reclaim priority, and make the aging and rotation more explicit. This slightly changes how aging and offline memcg reclaim work: Previously, aging was skipped at DEF_PRIORITY even when eviction was no longer possible, so the reclaimer wasted an iteration until the priority escalated. Now aging runs immediately whenever it is needed to make progress; the DEF_PRIORITY skip only applies when eviction is still viable. This may avoid wasted iterations that over-reclaim slab and break reclaim balance in multi-cgroup setups. Similar for offline memcg. Previously, offline memcg wouldn't be aged unless it didn't have any evictable folios. Now, we might age it if it has only 3 generations, which should be fine. On one hand, offline memcg might still hold long-term folios, and in fact, a long-existing offline memcg must be pinned by some long-term folios like shmem. These folios might be used by other memcg, so aging them as ordinary memcg seems correct. Besides, aging enables further reclaim of an offlined memcg, which will certainly happen if we keep shrinking it. And offline memcg might soon be no longer an issue with reparenting. Overall, the memcg LRU rotation, as described in mmzone.h, remains the same. Note that because the scan budget is now pinned at loop entry, tiny lruvec might skip this reclaim pass, also skipping aging, which could be beneficial as aging is not helpful since it will still be un-reclaimable after aging. Reclaim will go on as usual once priority escalates.
Link: https://lore.kernel.org/20260428-mglru-reclaim-v7-4-02fabb92dc43@tencent.com Signed-off-by: Kairui Song Reviewed-by: Axel Rasmussen Acked-by: Shakeel Butt Cc: Baolin Wang Cc: Barry Song Cc: Chen Ridong Cc: Chris Li Cc: David Hildenbrand Cc: David Stevens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Leno Hou Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vernon Yang Cc: Wei Xu Cc: Yafang Cc: Yuanchu Xie Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 72 ++++++++++++++++++++++++++--------------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index abe2ace8e326..66ddf211e3ca 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4917,49 +4917,37 @@ retry: } static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, - int swappiness, unsigned long *nr_to_scan) + struct scan_control *sc, int swappiness) { DEFINE_MIN_SEQ(lruvec); - *nr_to_scan = 0; /* have to run aging, since eviction is not possible anymore */ if (evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS > max_seq) return true; - *nr_to_scan = lruvec_evictable_size(lruvec, swappiness); + /* try to avoid aging, do gentle reclaim at the default priority */ + if (sc->priority == DEF_PRIORITY) + return false; + /* better to run aging even though eviction is still possible */ return evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS == max_seq; } -/* - * For future optimizations: - * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg - * reclaim. 
- */ -static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness) +static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, + struct mem_cgroup *memcg, int swappiness) { - bool need_aging; - unsigned long nr_to_scan; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); - DEFINE_MAX_SEQ(lruvec); + unsigned long nr_to_scan, evictable; - if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) - return -1; - - need_aging = should_run_aging(lruvec, max_seq, swappiness, &nr_to_scan); + evictable = lruvec_evictable_size(lruvec, swappiness); /* try to scrape all its memory if this memcg was deleted */ - if (nr_to_scan && !mem_cgroup_online(memcg)) - return nr_to_scan; + if (!mem_cgroup_online(memcg)) + return evictable; - nr_to_scan = apply_proportional_protection(memcg, sc, nr_to_scan); + nr_to_scan = apply_proportional_protection(memcg, sc, evictable); + nr_to_scan >>= sc->priority; - /* try to get away with not aging at the default priority */ - if (!need_aging || sc->priority == DEF_PRIORITY) - return nr_to_scan >> sc->priority; - - /* stop scanning this lruvec as it's low on cold folios */ - return try_to_inc_max_seq(lruvec, max_seq, swappiness, false) ? -1 : 0; + return nr_to_scan; } static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) @@ -4989,31 +4977,44 @@ static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) return true; } +/* + * For future optimizations: + * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg + * reclaim. 
+ */ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { + bool need_rotate = false; long nr_batch, nr_to_scan; - unsigned long scanned = 0; int swappiness = get_swappiness(lruvec, sc); + struct mem_cgroup *memcg = lruvec_memcg(lruvec); - while (true) { + nr_to_scan = get_nr_to_scan(lruvec, sc, memcg, swappiness); + while (nr_to_scan > 0) { int delta; + DEFINE_MAX_SEQ(lruvec); - nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); - if (nr_to_scan <= 0) + if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) { + need_rotate = true; break; + } + + if (should_run_aging(lruvec, max_seq, sc, swappiness)) { + if (try_to_inc_max_seq(lruvec, max_seq, swappiness, false)) + need_rotate = true; + /* stop scanning as it's low on cold folios */ + break; + } nr_batch = min(nr_to_scan, MAX_LRU_BATCH); delta = evict_folios(nr_batch, lruvec, sc, swappiness); if (!delta) break; - scanned += delta; - if (scanned >= nr_to_scan) - break; - if (should_abort_scan(lruvec, sc)) break; + nr_to_scan -= delta; cond_resched(); } @@ -5039,8 +5040,7 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); } - /* whether this lruvec should be rotated */ - return nr_to_scan < 0; + return need_rotate; } static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) From 3a72e078b4a32ffd88f416d278882034b3321481 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 28 Apr 2026 02:06:56 +0800 Subject: [PATCH 252/321] mm/mglru: scan and count the exact number of folios Make the scan helpers return the exact number of folios being scanned or isolated. Since the reclaim loop now has a natural scan budget that controls the scan progress, returning the scan number and consuming the budget makes the scan more accurate and easier to follow. 
The number of scanned folios for each iteration is always larger than 0, unless the reclaim must stop for a forced aging, so there is no more need for any special handling when there is no progress made: - `return isolated || !remaining ? scanned : 0` in scan_folios: both the function and the call now just return the exact scan count, combined with the scan budget introduced in the previous commit to avoid livelock or under scan. - `scanned += try_to_inc_min_seq` in evict_folios: adding a bool as a scan count was kind of confusing and no longer needed, as scan number should never be zero as long as there are still evictable gens. We may encounter an empty old gen that returns 0 scan count; to avoid that, do a try_to_inc_min_seq before isolation, which has little to no overhead in most cases. - `evictable_min_seq + MIN_NR_GENS > max_seq` guard in evict_folios: the per-type get_nr_gens == MIN_NR_GENS check in scan_folios naturally returns 0 when only two gens remain and breaks the loop. Also change try_to_inc_min_seq to return void, as its return value is no longer used by any caller. Call it before isolate_folios to flush any empty gens left by external folio freeing, and again after isolate_folios when scanning moved or protected folios may have emptied the oldest gen. The scan still stops if only two gens are left, as the scan number will be zero. This matches the previous behavior. This forced gen protection may be removed or softened later to improve reclaim further. 
Link: https://lore.kernel.org/20260428-mglru-reclaim-v7-5-02fabb92dc43@tencent.com Signed-off-by: Kairui Song Reviewed-by: Axel Rasmussen Reviewed-by: Chen Ridong Reviewed-by: Baolin Wang Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: David Stevens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Leno Hou Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vernon Yang Cc: Wei Xu Cc: Yafang Cc: Yuanchu Xie Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 58 ++++++++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 66ddf211e3ca..adfe3e6645d6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3882,10 +3882,9 @@ done: return true; } -static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) +static void try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) { int gen, type, zone; - bool success = false; bool seq_inc_flag = false; struct lru_gen_folio *lrugen = &lruvec->lrugen; DEFINE_MIN_SEQ(lruvec); @@ -3911,11 +3910,10 @@ next: /* * If min_seq[type] of both anonymous and file is not increased, - * we can directly return false to avoid unnecessary checking - * overhead later. + * return here to avoid unnecessary checking overhead later. 
*/ if (!seq_inc_flag) - return success; + return; /* see the comment on lru_gen_folio */ if (swappiness && swappiness <= MAX_SWAPPINESS) { @@ -3933,10 +3931,7 @@ next: reset_ctrl_pos(lruvec, type, true); WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); - success = true; } - - return success; } static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness) @@ -4690,7 +4685,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, int type, int tier, - struct list_head *list) + struct list_head *list, int *isolatedp) { int i; int gen; @@ -4760,11 +4755,9 @@ static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); if (type == LRU_GEN_FILE) sc->nr.file_taken += isolated; - /* - * There might not be eligible folios due to reclaim_idx. Check the - * remaining to prevent livelock if it's not making progress. - */ - return isolated || !remaining ? 
scanned : 0; + + *isolatedp = isolated; + return scanned; } static int get_tier_idx(struct lruvec *lruvec, int type) @@ -4808,33 +4801,36 @@ static int get_type_to_scan(struct lruvec *lruvec, int swappiness) static int isolate_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, int swappiness, - int *type_scanned, struct list_head *list) + struct list_head *list, int *isolated, + int *isolate_type, int *isolate_scanned) { int i; + int total_scanned = 0; int type = get_type_to_scan(lruvec, swappiness); for_each_evictable_type(i, swappiness) { int scanned; int tier = get_tier_idx(lruvec, type); - *type_scanned = type; + scanned = scan_folios(nr_to_scan, lruvec, sc, + type, tier, list, isolated); - scanned = scan_folios(nr_to_scan, lruvec, sc, type, tier, list); - if (scanned) - return scanned; + total_scanned += scanned; + if (*isolated) { + *isolate_type = type; + *isolate_scanned = scanned; + break; + } type = !type; } - return 0; + return total_scanned; } static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, int swappiness) { - int type; - int scanned; - int reclaimed; LIST_HEAD(list); LIST_HEAD(clean); struct folio *folio; @@ -4842,19 +4838,23 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, enum node_stat_item item; struct reclaim_stat stat; struct lru_gen_mm_walk *walk; + int scanned, reclaimed; + int isolated = 0, type, type_scanned; bool skip_retry = false; - struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); lruvec_lock_irq(lruvec); - scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness, &type, &list); + /* In case folio deletion left empty old gens, flush them */ + try_to_inc_min_seq(lruvec, swappiness); - scanned += try_to_inc_min_seq(lruvec, swappiness); + scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness, + &list, &isolated, &type, 
&type_scanned); - if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq) - scanned = 0; + /* Scanning may have emptied the oldest gen, flush it */ + if (scanned) + try_to_inc_min_seq(lruvec, swappiness); lruvec_unlock_irq(lruvec); @@ -4865,7 +4865,7 @@ retry: sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; sc->nr_reclaimed += reclaimed; trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, - scanned, reclaimed, &stat, sc->priority, + type_scanned, reclaimed, &stat, sc->priority, type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); list_for_each_entry_safe_reverse(folio, next, &list, lru) { From 16b475d2ac3c7994370254644015ae2e5dd5210b Mon Sep 17 00:00:00 2001 From: "Barry Song (Xiaomi)" Date: Tue, 28 Apr 2026 02:06:57 +0800 Subject: [PATCH 253/321] mm/mglru: avoid reclaim type fall back when isolation makes no progress While isolation makes no progress in scan_folios(), we quickly fall back to the other type in isolate_folios(). This is incorrect, as the current type may still have sufficient folios. Falling back can undermine the positive_ctrl_err() result from get_type_to_scan(), which is derived from swappiness. So just continue scanning this type for another round. Worth noting if the cold generations are all reclaimed, scan will no longer make any progress either, which may undermine the swappiness again. This is not a new issue and hence better be fixed later [1]. 
Link: https://lore.kernel.org/linux-mm/CAGsJ_4zjdOYEtuO6gNjABm7NDxW0skzBFNRNee-k2D6VwsYEQA@mail.gmail.com/ [1] Link: https://lore.kernel.org/20260428-mglru-reclaim-v7-6-02fabb92dc43@tencent.com Signed-off-by: Barry Song (Xiaomi) Signed-off-by: Kairui Song Reviewed-by: Kairui Song Cc: Axel Rasmussen Cc: Baolin Wang Cc: Chen Ridong Cc: Chris Li Cc: David Hildenbrand Cc: David Stevens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Leno Hou Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vernon Yang Cc: Wei Xu Cc: Yafang Cc: Yuanchu Xie Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index adfe3e6645d6..32ffbb557e15 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4821,8 +4821,13 @@ static int isolate_folios(unsigned long nr_to_scan, struct lruvec *lruvec, *isolate_scanned = scanned; break; } - - type = !type; + /* + * If scanned > 0 and isolated == 0, avoid falling back to the + * other type, as this type remains sufficient. Falling back + * too readily can disrupt the positive_ctrl_err() bias. + */ + if (!scanned) + type = !type; } return total_scanned; From 6e9be217a3cecbd8e8a5beec2aeba4ae9ebd2af9 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 28 Apr 2026 02:06:58 +0800 Subject: [PATCH 254/321] mm/mglru: use a smaller batch for reclaim With a fixed number to reclaim calculated at the beginning, making each following step smaller should reduce the lock contention and avoid over-aggressive reclaim of folios, as it will abort earlier when the number of folios to be reclaimed is reached. 
Link: https://lore.kernel.org/20260428-mglru-reclaim-v7-7-02fabb92dc43@tencent.com Signed-off-by: Kairui Song Reviewed-by: Axel Rasmussen Reviewed-by: Chen Ridong Reviewed-by: Baolin Wang Reviewed-by: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: David Stevens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Leno Hou Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vernon Yang Cc: Wei Xu Cc: Yafang Cc: Yuanchu Xie Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 32ffbb557e15..6128b191b81d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -5011,7 +5011,7 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) break; } - nr_batch = min(nr_to_scan, MAX_LRU_BATCH); + nr_batch = min(nr_to_scan, MIN_LRU_BATCH); delta = evict_folios(nr_batch, lruvec, sc, swappiness); if (!delta) break; From 12316f7902f850e2770d26a91fb13728b5ade065 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 28 Apr 2026 02:06:59 +0800 Subject: [PATCH 255/321] mm/mglru: don't abort scan immediately right after aging Right now, if eviction triggers aging, the reclaimer will abort. This is not the optimal strategy for several reasons. Aborting the reclaim early wastes a reclaim cycle when under pressure, and for concurrent reclaim, if the LRU is under aging, all concurrent reclaimers might fail. And if the age has just finished, new cold folios exposed by the aging are not reclaimed until the next reclaim iteration. What's more, the current aging trigger is quite lenient, having 3 gens with a reclaim priority lower than default will trigger aging, and blocks reclaiming from one memcg. This wastes reclaim retry cycles easily. And in the worst case, if the reclaim is making slower progress and all following attempts fail due to being blocked by aging, it triggers unexpected early OOM. And if a lruvec requires aging, it doesn't mean it's hot. 
Instead, the lruvec could be idle for quite a while, and hence it might contain lots of cold folios to be reclaimed. While it's helpful to rotate memcg LRU after aging for global reclaim, as global reclaim fairness is coupled with the rotation in shrink_many, memcg fairness is instead handled by cgroup iteration in shrink_node_memcgs. So, for memcg level pressure, this abort is not the key part for keeping the fairness. And in most cases, there is no need to age, and fairness must be achieved by upper-level reclaim control. So instead, just keep the scanning going unless one whole batch of folios failed to be isolated or enough folios have been scanned, which is triggered by evict_folios returning 0. And only abort for global reclaim after one batch, so when there are fewer memcgs, progress is still made, and the fairness mechanism described above still works fine. And in most cases, the one more batch attempt for global reclaim might just be enough to satisfy what the reclaimer needs, hence improving global reclaim performance by reducing reclaim retry cycles. Rotation is still there after the reclaim is done, which still follows the comment in mmzone.h. And fairness still looks good. 
Link: https://lore.kernel.org/20260428-mglru-reclaim-v7-8-02fabb92dc43@tencent.com Signed-off-by: Kairui Song Reviewed-by: Axel Rasmussen Reviewed-by: Chen Ridong Reviewed-by: Barry Song Cc: Baolin Wang Cc: Chris Li Cc: David Hildenbrand Cc: David Stevens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Leno Hou Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vernon Yang Cc: Wei Xu Cc: Yafang Cc: Yuanchu Xie Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 6128b191b81d..daad01a07e33 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4989,7 +4989,7 @@ static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) */ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { - bool need_rotate = false; + bool need_rotate = false, should_age = false; long nr_batch, nr_to_scan; int swappiness = get_swappiness(lruvec, sc); struct mem_cgroup *memcg = lruvec_memcg(lruvec); @@ -5007,8 +5007,7 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) if (should_run_aging(lruvec, max_seq, sc, swappiness)) { if (try_to_inc_max_seq(lruvec, max_seq, swappiness, false)) need_rotate = true; - /* stop scanning as it's low on cold folios */ - break; + should_age = true; } nr_batch = min(nr_to_scan, MIN_LRU_BATCH); @@ -5019,6 +5018,13 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) if (should_abort_scan(lruvec, sc)) break; + /* + * Root reclaim needs rotation when low on cold folio for better + * fairness. Cgroup reclaim gets fairness from the iterator. 
+ */ + if (root_reclaim(sc) && should_age) + break; + nr_to_scan -= delta; cond_resched(); } From acd22fbb9f4714d9beb1796aa27ac7e92d6ab9b3 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 28 Apr 2026 02:07:00 +0800 Subject: [PATCH 256/321] mm/mglru: remove redundant swap constrained check upon isolation Remove the swap-constrained early reject check upon isolation. This check is a micro optimization when swap IO is not allowed, so folios are rejected early. But it is redundant and overly broad since shrink_folio_list() already handles all these cases with proper granularity. Notably, this check wrongly rejected lazyfree folios, and it doesn't cover all rejection cases. shrink_folio_list() uses may_enter_fs(), which distinguishes non-SWP_FS_OPS devices from filesystem-backed swap and does all the checks after folio is locked, so flags like swap cache are stable. This check also covers dirty file folios, which are not a problem now since sort_folio() already bumps dirty file folios to the next generation, but causes trouble for unifying dirty folio writeback handling. And there should be no performance impact from removing it. We may have lost a micro optimization, but unblocked lazyfree reclaim for NOIO contexts, which is not a common case in the first place. 
Link: https://lore.kernel.org/20260428-mglru-reclaim-v7-9-02fabb92dc43@tencent.com Signed-off-by: Kairui Song Reviewed-by: Axel Rasmussen Reviewed-by: Baolin Wang Reviewed-by: Chen Ridong Reviewed-by: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: David Stevens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Leno Hou Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vernon Yang Cc: Wei Xu Cc: Yafang Cc: Yuanchu Xie Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index daad01a07e33..c5de863aeceb 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4654,12 +4654,6 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca { bool success; - /* swap constrained */ - if (!(sc->gfp_mask & __GFP_IO) && - (folio_test_dirty(folio) || - (folio_test_anon(folio) && !folio_test_swapcache(folio)))) - return false; - /* raced with release_pages() */ if (!folio_try_get(folio)) return false; From 75d4c3f5fb980de1b620adede47e43dff4d6a5f3 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 28 Apr 2026 02:07:01 +0800 Subject: [PATCH 257/321] mm/mglru: use the common routine for dirty/writeback reactivation Currently MGLRU will move the dirty writeback folios to the second oldest gen instead of reactivate them like the classical LRU. This might help to reduce the LRU contention as it skipped the isolation. But as a result we will see these folios at the LRU tail more frequently leading to inefficient reclaim. Besides, the dirty / writeback check after isolation in shrink_folio_list is more accurate and covers more cases. So instead, just drop the special handling for dirty writeback, use the common routine and re-activate it like the classical LRU. This should in theory improve the scan efficiency. These folios will be rotated back to LRU tail once writeback is done so there is no risk of hotness inversion. 
And now each reclaim loop will have a higher success rate. This also prepares for unifying the writeback and throttling mechanism with classical LRU, we keep these folios far from tail so detecting the tail batch will have a similar pattern with classical LRU. The micro optimization that avoids LRU contention by skipping the isolation is gone, which should be fine. Compared to IO and writeback cost, the isolation overhead is trivial. And using the common routine also keeps the folio's referenced bits (tier bits), which could improve metrics in the long term. Also no more need to clean reclaim bit as the common routine will make use of it. Note the common routine updates a few throttling and writeback counters, which are not used, and never have been for the MGLRU case. We will start making use of these in later commits. Link: https://lore.kernel.org/20260428-mglru-reclaim-v7-10-02fabb92dc43@tencent.com Signed-off-by: Kairui Song Reviewed-by: Axel Rasmussen Reviewed-by: Barry Song Reviewed-by: Baolin Wang Cc: Chen Ridong Cc: Chris Li Cc: David Hildenbrand Cc: David Stevens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Leno Hou Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vernon Yang Cc: Wei Xu Cc: Yafang Cc: Yuanchu Xie Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index c5de863aeceb..e699425c5b06 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4582,7 +4582,6 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c int tier_idx) { bool success; - bool dirty, writeback; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); @@ -4632,21 +4631,6 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c return true; } - dirty = folio_test_dirty(folio); - writeback = folio_test_writeback(folio); - if (type == LRU_GEN_FILE && dirty) { - 
sc->nr.file_taken += delta; - if (!writeback) - sc->nr.unqueued_dirty += delta; - } - - /* waiting for writeback */ - if (writeback || (type == LRU_GEN_FILE && dirty)) { - gen = folio_inc_gen(lruvec, folio, true); - list_move(&folio->lru, &lrugen->folios[gen][type][zone]); - return true; - } - return false; } @@ -4668,9 +4652,6 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca if (!folio_test_referenced(folio)) set_mask_bits(&folio->flags.f, LRU_REFS_MASK, 0); - /* for shrink_folio_list() */ - folio_clear_reclaim(folio); - success = lru_gen_del_folio(lruvec, folio, true); VM_WARN_ON_ONCE_FOLIO(!success, folio); From f37d3708b676574379a00f8dafe6c89d92f166e9 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 28 Apr 2026 02:07:02 +0800 Subject: [PATCH 258/321] mm/mglru: simplify and improve dirty writeback handling Right now the flusher wakeup mechanism for MGLRU is less responsive and unlikely to trigger compared to classical LRU. The classical LRU wakes the flusher if one batch of folios passed to shrink_folio_list is unevictable due to under writeback. MGLRU instead check and handle this after the whole reclaim loop is done. We previously even saw OOM problems due to passive flusher, which were fixed but still not perfect [1]. We have just unified the dirty folio counting and activation routine, now just move the dirty flush into the loop right after shrink_folio_list. This improves the performance a lot for workloads involving heavy writeback and prepares for throttling too. Test with YCSB workloadb showed a major performance improvement: Before this series: Throughput(ops/sec): 62485.02962831822 AverageLatency(us): 500.9746963330107 pgpgin 159347462 workingset_refault_file 34522071 After this commit: Throughput(ops/sec): 80857.08510208207 AverageLatency(us): 386.653262968934 pgpgin 112233121 workingset_refault_file 19516246 The performance is a lot better with significantly lower refault. 
We also observed similar or higher performance gain for other real-world workloads. We were concerned that the dirty flush could cause more wear for SSD: that should not be the problem here, since the wakeup condition is when the dirty folios have been pushed to the tail of LRU, which indicates that memory pressure is so high that writeback is blocking the workload already. Link: https://lore.kernel.org/20260428-mglru-reclaim-v7-11-02fabb92dc43@tencent.com Signed-off-by: Kairui Song Reviewed-by: Axel Rasmussen Link: https://lore.kernel.org/linux-mm/20241026115714.1437435-1-jingxiangzeng.cas@gmail.com/ [1] Reviewed-by: Baolin Wang Cc: Barry Song Cc: Chen Ridong Cc: Chris Li Cc: David Hildenbrand Cc: David Stevens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Leno Hou Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vernon Yang Cc: Wei Xu Cc: Yafang Cc: Yuanchu Xie Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 41 ++++++++++++++++------------------------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index e699425c5b06..d26c89546542 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4728,8 +4728,6 @@ static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scanned, skipped, isolated, type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); - if (type == LRU_GEN_FILE) - sc->nr.file_taken += isolated; *isolatedp = isolated; return scanned; @@ -4842,12 +4840,27 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, return scanned; retry: reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, memcg); - sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; sc->nr_reclaimed += reclaimed; trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, type_scanned, reclaimed, &stat, sc->priority, type ? 
LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); + /* + * If too many file cache in the coldest generation can't be evicted + * due to being dirty, wake up the flusher. + */ + if (stat.nr_unqueued_dirty == isolated) { + wakeup_flusher_threads(WB_REASON_VMSCAN); + + /* + * For cgroupv1 dirty throttling is achieved by waking up + * the kernel flusher here and later waiting on folios + * which are in writeback to finish (see shrink_folio_list()). + */ + if (!writeback_throttling_sane(sc)) + reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); + } + list_for_each_entry_safe_reverse(folio, next, &list, lru) { DEFINE_MIN_SEQ(lruvec); @@ -5004,28 +5017,6 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) cond_resched(); } - /* - * If too many file cache in the coldest generation can't be evicted - * due to being dirty, wake up the flusher. - */ - if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken) { - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - - wakeup_flusher_threads(WB_REASON_VMSCAN); - - /* - * For cgroupv1 dirty throttling is achieved by waking up - * the kernel flusher here and later waiting on folios - * which are in writeback to finish (see shrink_folio_list()). - * - * Flusher may not be able to issue writeback quickly - * enough for cgroupv1 writeback throttling to work - * on a large system. - */ - if (!writeback_throttling_sane(sc)) - reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); - } - return need_rotate; } From 32d87083ee973adf44c11f61c3eb1440d275f314 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 28 Apr 2026 02:07:03 +0800 Subject: [PATCH 259/321] mm/mglru: remove no longer used reclaim argument for folio protection Now dirty reclaim folios are handled after isolation, not before, since dirty reactivation must take the folio off LRU first, and that helps to unify the dirty handling logic. So this argument is no longer needed. Just remove it. 
Link: https://lore.kernel.org/20260428-mglru-reclaim-v7-12-02fabb92dc43@tencent.com Signed-off-by: Kairui Song Reviewed-by: Axel Rasmussen Cc: Baolin Wang Cc: Barry Song Cc: Chen Ridong Cc: Chris Li Cc: David Hildenbrand Cc: David Stevens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Leno Hou Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vernon Yang Cc: Wei Xu Cc: Yafang Cc: Yuanchu Xie Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index d26c89546542..22c78509c2c8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3224,7 +3224,7 @@ static int folio_update_gen(struct folio *folio, int gen) } /* protect pages accessed multiple times through file descriptors */ -static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) +static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio) { int type = folio_is_file_lru(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; @@ -3243,9 +3243,6 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; - /* for folio_end_writeback() */ - if (reclaiming) - new_flags |= BIT(PG_reclaim); } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); lru_gen_update_size(lruvec, folio, old_gen, new_gen); @@ -3859,7 +3856,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness) VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); - new_gen = folio_inc_gen(lruvec, folio, false); + new_gen = folio_inc_gen(lruvec, folio); list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); /* don't count the workingset being lazily promoted */ @@ -4611,7 +4608,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c /* 
protected */ if (tier > tier_idx || refs + workingset == BIT(LRU_REFS_WIDTH) + 1) { - gen = folio_inc_gen(lruvec, folio, false); + gen = folio_inc_gen(lruvec, folio); list_move(&folio->lru, &lrugen->folios[gen][type][zone]); /* don't count the workingset being lazily promoted */ @@ -4626,7 +4623,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c /* ineligible */ if (zone > sc->reclaim_idx) { - gen = folio_inc_gen(lruvec, folio, false); + gen = folio_inc_gen(lruvec, folio); list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } From e621a24e10bb07998dab930009eadbd220253d4e Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 28 Apr 2026 02:07:04 +0800 Subject: [PATCH 260/321] mm/vmscan: remove sc->file_taken No one is using it now, just remove it. Link: https://lore.kernel.org/20260428-mglru-reclaim-v7-13-02fabb92dc43@tencent.com Signed-off-by: Kairui Song Reviewed-by: Axel Rasmussen Reviewed-by: Baolin Wang Reviewed-by: Chen Ridong Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: David Stevens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Leno Hou Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vernon Yang Cc: Wei Xu Cc: Yafang Cc: Yuanchu Xie Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 22c78509c2c8..e464928252fa 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -173,7 +173,6 @@ struct scan_control { unsigned int congested; unsigned int writeback; unsigned int immediate; - unsigned int file_taken; unsigned int taken; } nr; @@ -2044,8 +2043,6 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, sc->nr.writeback += stat.nr_writeback; sc->nr.immediate += stat.nr_immediate; sc->nr.taken += nr_taken; - if (file) - sc->nr.file_taken += nr_taken; trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, nr_scanned, nr_reclaimed, &stat, sc->priority, file); From 
183ff2f9ec4875e010c00984374e51a545f6b169 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 28 Apr 2026 02:07:05 +0800 Subject: [PATCH 261/321] mm/vmscan: remove sc->unqueued_dirty No one is using it now, just remove it. Link: https://lore.kernel.org/20260428-mglru-reclaim-v7-14-02fabb92dc43@tencent.com Signed-off-by: Kairui Song Suggested-by: Axel Rasmussen Reviewed-by: Baolin Wang Reviewed-by: Axel Rasmussen Reviewed-by: Barry Song Reviewed-by: Chen Ridong Cc: Chris Li Cc: David Hildenbrand Cc: David Stevens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Leno Hou Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vernon Yang Cc: Wei Xu Cc: Yafang Cc: Yuanchu Xie Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index e464928252fa..7494ac73e3f1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -169,7 +169,6 @@ struct scan_control { struct { unsigned int dirty; - unsigned int unqueued_dirty; unsigned int congested; unsigned int writeback; unsigned int immediate; @@ -2039,7 +2038,6 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, sc->nr.dirty += stat.nr_dirty; sc->nr.congested += stat.nr_congested; - sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; sc->nr.writeback += stat.nr_writeback; sc->nr.immediate += stat.nr_immediate; sc->nr.taken += nr_taken; From 39376b9cac1cef505e1fa9a0a6105cf0de7c6734 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 28 Apr 2026 02:07:06 +0800 Subject: [PATCH 262/321] mm/vmscan: unify writeback reclaim statistic and throttling Currently MGLRU and non-MGLRU handle the reclaim statistic and writeback handling very differently, especially throttling. Basically MGLRU just ignored the throttling part. Let's just unify this part, use a helper to deduplicate the code so both setups will share the same behavior. 
Test using following reproducer using bash: echo "Setup a slow device using dm delay" dd if=/dev/zero of=/var/tmp/backing bs=1M count=2048 LOOP=$(losetup --show -f /var/tmp/backing) mkfs.ext4 -q $LOOP echo "0 $(blockdev --getsz $LOOP) delay $LOOP 0 0 $LOOP 0 1000" | \ dmsetup create slow_dev mkdir -p /mnt/slow && mount /dev/mapper/slow_dev /mnt/slow echo "Start writeback pressure" sync && echo 3 > /proc/sys/vm/drop_caches mkdir /sys/fs/cgroup/test_wb echo 128M > /sys/fs/cgroup/test_wb/memory.max (echo $BASHPID > /sys/fs/cgroup/test_wb/cgroup.procs && \ dd if=/dev/zero of=/mnt/slow/testfile bs=1M count=192) echo "Clean up" echo "0 $(blockdev --getsz $LOOP) error" | dmsetup load slow_dev dmsetup resume slow_dev umount -l /mnt/slow && sync dmsetup remove slow_dev Before this commit, `dd` will get OOM killed immediately if MGLRU is enabled. Classic LRU is fine. After this commit, throttling is now effective and no more spin on LRU or premature OOM. Stress test on other workloads also looks good. Global throttling is not here yet, we will fix that separately later. 
Link: https://lore.kernel.org/20260428-mglru-reclaim-v7-15-02fabb92dc43@tencent.com Signed-off-by: Kairui Song Suggested-by: Chen Ridong Tested-by: Leno Hou Reviewed-by: Axel Rasmussen Reviewed-by: Baolin Wang Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: David Stevens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vernon Yang Cc: Wei Xu Cc: Yafang Cc: Yuanchu Xie Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 92 +++++++++++++++++++++++++---------------------------- 1 file changed, 43 insertions(+), 49 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 7494ac73e3f1..e8a90911bf88 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1946,6 +1946,44 @@ static int current_may_throttle(void) return !(current->flags & PF_LOCAL_THROTTLE); } +static void handle_reclaim_writeback(unsigned long nr_taken, + struct pglist_data *pgdat, + struct scan_control *sc, + struct reclaim_stat *stat) +{ + /* + * If dirty folios are scanned that are not queued for IO, it + * implies that flushers are not doing their job. This can + * happen when memory pressure pushes dirty folios to the end of + * the LRU before the dirty limits are breached and the dirty + * data has expired. It can also happen when the proportion of + * dirty folios grows not through writes but through memory + * pressure reclaiming all the clean cache. And in some cases, + * the flushers simply cannot keep up with the allocation + * rate. Nudge the flusher threads in case they are asleep. + */ + if (stat->nr_unqueued_dirty == nr_taken) { + wakeup_flusher_threads(WB_REASON_VMSCAN); + /* + * For cgroupv1 dirty throttling is achieved by waking up + * the kernel flusher here and later waiting on folios + * which are in writeback to finish (see shrink_folio_list()). + * + * Flusher may not be able to issue writeback quickly + * enough for cgroupv1 writeback throttling to work + * on a large system. 
+ */ + if (!writeback_throttling_sane(sc)) + reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); + } + + sc->nr.dirty += stat->nr_dirty; + sc->nr.congested += stat->nr_congested; + sc->nr.writeback += stat->nr_writeback; + sc->nr.immediate += stat->nr_immediate; + sc->nr.taken += nr_taken; +} + /* * shrink_inactive_list() is a helper for shrink_node(). It returns the number * of reclaimed pages @@ -2009,39 +2047,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, lruvec_lock_irq(lruvec); lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); - - /* - * If dirty folios are scanned that are not queued for IO, it - * implies that flushers are not doing their job. This can - * happen when memory pressure pushes dirty folios to the end of - * the LRU before the dirty limits are breached and the dirty - * data has expired. It can also happen when the proportion of - * dirty folios grows not through writes but through memory - * pressure reclaiming all the clean cache. And in some cases, - * the flushers simply cannot keep up with the allocation - * rate. Nudge the flusher threads in case they are asleep. - */ - if (stat.nr_unqueued_dirty == nr_taken) { - wakeup_flusher_threads(WB_REASON_VMSCAN); - /* - * For cgroupv1 dirty throttling is achieved by waking up - * the kernel flusher here and later waiting on folios - * which are in writeback to finish (see shrink_folio_list()). - * - * Flusher may not be able to issue writeback quickly - * enough for cgroupv1 writeback throttling to work - * on a large system. 
- */ - if (!writeback_throttling_sane(sc)) - reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); - } - - sc->nr.dirty += stat.nr_dirty; - sc->nr.congested += stat.nr_congested; - sc->nr.writeback += stat.nr_writeback; - sc->nr.immediate += stat.nr_immediate; - sc->nr.taken += nr_taken; - + handle_reclaim_writeback(nr_taken, pgdat, sc, &stat); trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, nr_scanned, nr_reclaimed, &stat, sc->priority, file); return nr_reclaimed; @@ -4833,26 +4839,13 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, retry: reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, memcg); sc->nr_reclaimed += reclaimed; + /* Retry pass is only meant for clean folios without new isolation */ + if (isolated) + handle_reclaim_writeback(isolated, pgdat, sc, &stat); trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, type_scanned, reclaimed, &stat, sc->priority, type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); - /* - * If too many file cache in the coldest generation can't be evicted - * due to being dirty, wake up the flusher. - */ - if (stat.nr_unqueued_dirty == isolated) { - wakeup_flusher_threads(WB_REASON_VMSCAN); - - /* - * For cgroupv1 dirty throttling is achieved by waking up - * the kernel flusher here and later waiting on folios - * which are in writeback to finish (see shrink_folio_list()). - */ - if (!writeback_throttling_sane(sc)) - reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); - } - list_for_each_entry_safe_reverse(folio, next, &list, lru) { DEFINE_MIN_SEQ(lruvec); @@ -4895,6 +4888,7 @@ retry: if (!list_empty(&list)) { skip_retry = true; + isolated = 0; goto retry; } From ea3085dd7a4ec212d6c4b50efca584e0928caa72 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sat, 25 Apr 2026 23:27:16 -0700 Subject: [PATCH 263/321] fs/proc/task_mmu: read proc/pid/{smaps|numa_maps} under per-vma lock Patch series "use vma locks for proc/pid/{smaps|numa_maps} reads", v2. 
Use per-vma locks when reading /proc/pid/smaps and /proc/pid/numa_maps, similar to /proc/pid/maps, to reduce contention on the central mmap_lock. One major difference between reading maps and reading smaps/numa_maps is that the latter performs a page table walk, which can't be done under RCU because it may sleep. Therefore we drop the RCU read lock before this walk while keeping the VMA locked. After the walk we retake the RCU read lock, reset the VMA iterator and proceed with the next VMA. The last two patches extend the /proc/pid/maps test to cover /proc/pid/smaps reading during concurrent address space modification. This patch (of 3): /proc/pid/{smaps|numa_maps} can be read using a combination of RCU and VMA read locks, similar to /proc/pid/maps. RCU is required to safely traverse the VMA tree, and the VMA lock stabilizes the VMA being processed and the page table walk. Link: https://lore.kernel.org/20260426062718.1238437-1-surenb@google.com Link: https://lore.kernel.org/20260426062718.1238437-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Reviewed-by: Lorenzo Stoakes Cc: Jann Horn Cc: Matthew Wilcox (Oracle) Cc: "Paul E .
McKenney" Cc: Pedro Falcato Cc: Shuah Khan Cc: Wei Yang Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 195 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 156 insertions(+), 39 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 751b9ba160fb..1e3a15bf46f4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -132,6 +132,22 @@ static void release_task_mempolicy(struct proc_maps_private *priv) #ifdef CONFIG_PER_VMA_LOCK +static inline int lock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx) +{ + int ret = mmap_read_lock_killable(lock_ctx->mm); + + if (!ret) + lock_ctx->mmap_locked = true; + + return ret; +} + +static inline void unlock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx) +{ + mmap_read_unlock(lock_ctx->mm); + lock_ctx->mmap_locked = false; +} + static void reset_lock_ctx(struct proc_maps_locking_ctx *lock_ctx) { lock_ctx->locked_vma = NULL; @@ -146,25 +162,11 @@ static void unlock_ctx_vma(struct proc_maps_locking_ctx *lock_ctx) } } -static const struct seq_operations proc_pid_maps_op; - static inline bool lock_vma_range(struct seq_file *m, struct proc_maps_locking_ctx *lock_ctx) { - /* - * smaps and numa_maps perform page table walk, therefore require - * mmap_lock but maps can be read with locking just the vma and - * walking the vma tree under rcu read protection. 
- */ - if (m->op != &proc_pid_maps_op) { - if (mmap_read_lock_killable(lock_ctx->mm)) - return false; - - lock_ctx->mmap_locked = true; - } else { - rcu_read_lock(); - reset_lock_ctx(lock_ctx); - } + rcu_read_lock(); + reset_lock_ctx(lock_ctx); return true; } @@ -172,7 +174,7 @@ static inline bool lock_vma_range(struct seq_file *m, static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx) { if (lock_ctx->mmap_locked) { - mmap_read_unlock(lock_ctx->mm); + unlock_ctx_mm(lock_ctx); } else { unlock_ctx_vma(lock_ctx); rcu_read_unlock(); @@ -213,17 +215,45 @@ static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv, return true; } +static inline void drop_rcu(struct proc_maps_private *priv) +{ + if (priv->lock_ctx.mmap_locked) + return; + + rcu_read_unlock(); +} + +static inline void reacquire_rcu(struct proc_maps_private *priv) +{ + if (priv->lock_ctx.mmap_locked) + return; + + rcu_read_lock(); + /* Reinitialize the iterator. */ + vma_iter_set(&priv->iter, priv->lock_ctx.locked_vma->vm_end); +} + #else /* CONFIG_PER_VMA_LOCK */ +static inline int lock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx) +{ + return mmap_read_lock_killable(lock_ctx->mm); +} + +static inline void unlock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx) +{ + mmap_read_unlock(lock_ctx->mm); +} + static inline bool lock_vma_range(struct seq_file *m, struct proc_maps_locking_ctx *lock_ctx) { - return mmap_read_lock_killable(lock_ctx->mm) == 0; + return lock_ctx_mm(lock_ctx) == 0; } static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx) { - mmap_read_unlock(lock_ctx->mm); + unlock_ctx_mm(lock_ctx); } static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, @@ -238,6 +268,9 @@ static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv, return false; } +static inline void drop_rcu(struct proc_maps_private *priv) {} +static inline void reacquire_rcu(struct proc_maps_private *priv) {} + #endif /* CONFIG_PER_VMA_LOCK */ 
static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos) @@ -538,12 +571,10 @@ static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx) static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx) { - if (lock_ctx->mmap_locked) { - mmap_read_unlock(lock_ctx->mm); - lock_ctx->mmap_locked = false; - } else { + if (lock_ctx->mmap_locked) + unlock_ctx_mm(lock_ctx); + else unlock_ctx_vma(lock_ctx); - } } static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx, @@ -1280,21 +1311,75 @@ static const struct mm_walk_ops smaps_shmem_walk_ops = { .walk_lock = PGWALK_RDLOCK, }; +#ifdef CONFIG_PER_VMA_LOCK + +static const struct mm_walk_ops smaps_walk_vma_lock_ops = { + .pmd_entry = smaps_pte_range, + .hugetlb_entry = smaps_hugetlb_range, + .walk_lock = PGWALK_VMA_RDLOCK_VERIFY, +}; + +static const struct mm_walk_ops smaps_shmem_walk_vma_lock_ops = { + .pmd_entry = smaps_pte_range, + .hugetlb_entry = smaps_hugetlb_range, + .pte_hole = smaps_pte_hole, + .walk_lock = PGWALK_VMA_RDLOCK_VERIFY, +}; + +static inline const struct mm_walk_ops * +get_smaps_walk_ops(struct proc_maps_private *priv) +{ + if (priv->lock_ctx.mmap_locked) + return &smaps_walk_ops; + return &smaps_walk_vma_lock_ops; +} + +static inline const struct mm_walk_ops * +get_smaps_shmem_walk_ops(struct proc_maps_private *priv) +{ + if (priv->lock_ctx.mmap_locked) + return &smaps_shmem_walk_ops; + return &smaps_shmem_walk_vma_lock_ops; +} + +#else /* CONFIG_PER_VMA_LOCK */ + +static inline const struct mm_walk_ops * +get_smaps_walk_ops(struct proc_maps_private *priv) +{ + return &smaps_walk_ops; +} + +static inline const struct mm_walk_ops * +get_smaps_shmem_walk_ops(struct proc_maps_private *priv) +{ + return &smaps_shmem_walk_ops; +} + +#endif /* CONFIG_PER_VMA_LOCK */ + /* * Gather mem stats from @vma with the indicated beginning * address @start, and keep them in @mss. * * Use vm_start of @vma as the beginning address if @start is 0. 
*/ -static void smap_gather_stats(struct vm_area_struct *vma, - struct mem_size_stats *mss, unsigned long start) +static void smap_gather_stats(struct proc_maps_private *priv, + struct vm_area_struct *vma, + struct mem_size_stats *mss, unsigned long start) { - const struct mm_walk_ops *ops = &smaps_walk_ops; + const struct mm_walk_ops *ops = get_smaps_walk_ops(priv); /* Invalid start */ if (start >= vma->vm_end) return; + if (vma == get_gate_vma(priv->lock_ctx.mm)) + return; + + /* Might sleep. Drop RCU read lock but keep the VMA locked. */ + drop_rcu(priv); + if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { /* * For shared or readonly shmem mappings we know that all @@ -1312,15 +1397,16 @@ static void smap_gather_stats(struct vm_area_struct *vma, !(vma->vm_flags & VM_WRITE))) { mss->swap += shmem_swapped; } else { - ops = &smaps_shmem_walk_ops; + ops = get_smaps_shmem_walk_ops(priv); } } - /* mmap_lock is held in m_start */ if (!start) walk_page_vma(vma, ops, mss); else walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss); + + reacquire_rcu(priv); } #define SEQ_PUT_DEC(str, val) \ @@ -1369,10 +1455,11 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, static int show_smap(struct seq_file *m, void *v) { + struct proc_maps_private *priv = m->private; struct vm_area_struct *vma = v; struct mem_size_stats mss = {}; - smap_gather_stats(vma, &mss, 0); + smap_gather_stats(priv, vma, &mss, 0); show_map_vma(m, vma); @@ -1413,7 +1500,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) goto out_put_task; } - ret = mmap_read_lock_killable(mm); + ret = lock_ctx_mm(&priv->lock_ctx); if (ret) goto out_put_mm; @@ -1425,7 +1512,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) vma_start = vma->vm_start; do { - smap_gather_stats(vma, &mss, 0); + smap_gather_stats(priv, vma, &mss, 0); last_vma_end = vma->vm_end; /* @@ -1434,8 +1521,8 @@ static int show_smaps_rollup(struct seq_file *m, void *v) */ if 
(mmap_lock_is_contended(mm)) { vma_iter_invalidate(&vmi); - mmap_read_unlock(mm); - ret = mmap_read_lock_killable(mm); + unlock_ctx_mm(&priv->lock_ctx); + ret = lock_ctx_mm(&priv->lock_ctx); if (ret) { release_task_mempolicy(priv); goto out_put_mm; @@ -1484,14 +1571,14 @@ static int show_smaps_rollup(struct seq_file *m, void *v) /* Case 1 and 2 above */ if (vma->vm_start >= last_vma_end) { - smap_gather_stats(vma, &mss, 0); + smap_gather_stats(priv, vma, &mss, 0); last_vma_end = vma->vm_end; continue; } /* Case 4 above */ if (vma->vm_end > last_vma_end) { - smap_gather_stats(vma, &mss, last_vma_end); + smap_gather_stats(priv, vma, &mss, last_vma_end); last_vma_end = vma->vm_end; } } @@ -1505,7 +1592,7 @@ empty_set: __show_smap(m, &mss, true); release_task_mempolicy(priv); - mmap_read_unlock(mm); + unlock_ctx_mm(&priv->lock_ctx); out_put_mm: mmput(mm); @@ -3291,6 +3378,31 @@ static const struct mm_walk_ops show_numa_ops = { .walk_lock = PGWALK_RDLOCK, }; +#ifdef CONFIG_PER_VMA_LOCK +static const struct mm_walk_ops show_numa_vma_lock_ops = { + .hugetlb_entry = gather_hugetlb_stats, + .pmd_entry = gather_pte_stats, + .walk_lock = PGWALK_VMA_RDLOCK_VERIFY, +}; + +static inline const struct mm_walk_ops * +get_show_numa_ops(struct proc_maps_private *priv) +{ + if (priv->lock_ctx.mmap_locked) + return &show_numa_ops; + return &show_numa_vma_lock_ops; +} + +#else /* CONFIG_PER_VMA_LOCK */ + +static inline const struct mm_walk_ops * +get_show_numa_ops(struct proc_maps_private *priv) +{ + return &show_numa_ops; +} + +#endif /* CONFIG_PER_VMA_LOCK */ + /* * Display pages allocated per node and memory policy via /proc. */ @@ -3335,8 +3447,13 @@ static int show_numa_map(struct seq_file *m, void *v) if (is_vm_hugetlb_page(vma)) seq_puts(m, " huge"); - /* mmap_lock is held by m_start */ - walk_page_vma(vma, &show_numa_ops, md); + /* Skip walking pages if gate VMA */ + if (vma != get_gate_vma(proc_priv->lock_ctx.mm)) { + /* Might sleep. Drop RCU read lock but keep the VMA locked. 
*/ + drop_rcu(proc_priv); + walk_page_vma(vma, get_show_numa_ops(proc_priv), md); + reacquire_rcu(proc_priv); + } if (!md->pages) goto out; From ba98fca6a345805f7a4bdc5635ce6e8403770db5 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sat, 25 Apr 2026 23:27:17 -0700 Subject: [PATCH 264/321] selftests/proc: ensure the test is performed at the right page boundary When running tearing tests we need to ensure the pages we use include VMAs that were mapped by the child process for this test. Currently we always use the first two pages, checking VMAs at their boundaries and this works, however once we add tests for /proc/pid/smaps, the first two pages might not contain the VMAs that child modifies. Locate the page that contains the first VMA mapped by the child and use that and the next page for the test. Link: https://lore.kernel.org/20260426062718.1238437-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: "Paul E . McKenney" Cc: Pedro Falcato Cc: Shuah Khan Cc: Wei Yang Signed-off-by: Andrew Morton --- tools/testing/selftests/proc/proc-maps-race.c | 121 +++++++++++++++--- 1 file changed, 101 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/proc/proc-maps-race.c b/tools/testing/selftests/proc/proc-maps-race.c index a734553718da..5eb350c23da4 100644 --- a/tools/testing/selftests/proc/proc-maps-race.c +++ b/tools/testing/selftests/proc/proc-maps-race.c @@ -39,6 +39,13 @@ #include #include +#define min(a, b) \ + ({ \ + typeof(a) _a = (a); \ + typeof(b) _b = (b); \ + _a < _b ? 
_a : _b; \ + }) + /* /proc/pid/maps parsing routines */ struct page_content { char *data; @@ -77,6 +84,7 @@ FIXTURE(proc_maps_race) struct line_content first_line; unsigned long duration_sec; int shared_mem_size; + int skip_pages; int page_size; int vma_count; bool verbose; @@ -105,38 +113,102 @@ struct vma_modifier_info { void *child_mapped_addr[]; }; - -static bool read_two_pages(FIXTURE_DATA(proc_maps_race) *self) +static bool read_page(FIXTURE_DATA(proc_maps_race) *self, + struct page_content *page) { ssize_t bytes_read; - if (lseek(self->maps_fd, 0, SEEK_SET) < 0) - return false; - - bytes_read = read(self->maps_fd, self->page1.data, self->page_size); + bytes_read = read(self->maps_fd, page->data, self->page_size); if (bytes_read <= 0) return false; - self->page1.size = bytes_read; - - bytes_read = read(self->maps_fd, self->page2.data, self->page_size); - if (bytes_read <= 0) + /* Make sure data always ends with a newline character. */ + if (page->data[bytes_read - 1] != '\n') return false; - self->page2.size = bytes_read; + page->size = bytes_read; return true; } -static void copy_first_line(struct page_content *page, char *first_line) +static bool parse_vma_line(char *line_start, char *line_end, + unsigned long *start, unsigned long *end) { - char *pos = strchr(page->data, '\n'); + bool found; - strncpy(first_line, page->data, pos - page->data); - first_line[pos - page->data] = '\0'; + *line_end = '\0'; /* stop sscanf at the EOL */ + found = (sscanf(line_start, "%lx-%lx", start, end) == 2); + *line_end = '\n'; + + return found; } -static void copy_last_line(struct page_content *page, char *last_line) +static int locate_containing_page(FIXTURE_DATA(proc_maps_race) *self, + unsigned long addr, unsigned long size) +{ + unsigned long start, end; + int page = 0; + + if (lseek(self->maps_fd, 0, SEEK_SET) < 0) + return -1; + + while (true) { + char *curr_pos; + char *end_pos; + + if (!read_page(self, &self->page1)) + return -1; + + curr_pos = self->page1.data; + 
end_pos = self->page1.data + self->page1.size; + while (curr_pos < end_pos) { + char *line_end; + + line_end = strchr(curr_pos, '\n'); + if (!line_end) + break; + + if (parse_vma_line(curr_pos, line_end, &start, &end) && + start == addr && end == addr + size) + return page; + + curr_pos = line_end + 1; + } + page++; + } + + return 0; +} + +static bool read_two_pages(FIXTURE_DATA(proc_maps_race) *self) +{ + if (lseek(self->maps_fd, 0, SEEK_SET) < 0) + return false; + + for (int i = 0; i < self->skip_pages; i++) + if (!read_page(self, &self->page1)) + return false; + + return read_page(self, &self->page1) && read_page(self, &self->page2); +} + +static void copy_line(const char *line_start, const char *line_end, + char *buf, size_t buf_size) +{ + size_t len = min(line_end - line_start, buf_size - 1); + + strncpy(buf, line_start, len); + buf[len] = '\0'; +} + +static void copy_first_line(struct page_content *page, char *first_line, + size_t line_size) +{ + copy_line(page->data, strchr(page->data, '\n'), first_line, line_size); +} + +static void copy_last_line(struct page_content *page, char *last_line, + size_t line_size) { /* Get the last line in the first page */ const char *end = page->data + page->size - 1; @@ -146,8 +218,8 @@ static void copy_last_line(struct page_content *page, char *last_line) /* search previous newline */ while (pos[-1] != '\n') pos--; - strncpy(last_line, pos, end - pos); - last_line[end - pos] = '\0'; + + copy_line(pos, end, last_line, line_size); } /* Read the last line of the first page and the first line of the second page */ @@ -158,8 +230,8 @@ static bool read_boundary_lines(FIXTURE_DATA(proc_maps_race) *self, if (!read_two_pages(self)) return false; - copy_last_line(&self->page1, last_line->text); - copy_first_line(&self->page2, first_line->text); + copy_last_line(&self->page1, last_line->text, LINE_MAX_SIZE); + copy_first_line(&self->page2, first_line->text, LINE_MAX_SIZE); return sscanf(last_line->text, "%lx-%lx", 
&last_line->start_addr, &last_line->end_addr) == 2 && @@ -418,6 +490,8 @@ FIXTURE_SETUP(proc_maps_race) struct vma_modifier_info *mod_info; pthread_mutexattr_t mutex_attr; pthread_condattr_t cond_attr; + unsigned long first_map_addr; + unsigned long last_map_addr; unsigned long duration_sec; char fname[32]; @@ -502,6 +576,13 @@ FIXTURE_SETUP(proc_maps_race) self->page2.data = malloc(self->page_size); ASSERT_NE(self->page2.data, NULL); + first_map_addr = (unsigned long)mod_info->child_mapped_addr[0]; + last_map_addr = (unsigned long)mod_info->child_mapped_addr[mod_info->vma_count - 1]; + + self->skip_pages = locate_containing_page(self, + min(first_map_addr, last_map_addr), + self->page_size * 3); + ASSERT_NE(self->skip_pages, -1); ASSERT_TRUE(read_boundary_lines(self, &self->last_line, &self->first_line)); /* From 6d536ed691485fa5aa6417252d357c65eb474b75 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sat, 25 Apr 2026 23:27:18 -0700 Subject: [PATCH 265/321] selftests/proc: add /proc/pid/smaps tearing tests Add tearing tests for /proc/pid/smaps file. New tests reuse the same logic as with maps file but skipping all the data except for the VMA addresses, which are the only part relevant for the tearing tests. Skip PROCMAP_QUERY parts of the tests because smaps does not implement that ioctl. Link: https://lore.kernel.org/20260426062718.1238437-4-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: "Paul E . 
McKenney" Cc: Pedro Falcato Cc: Shuah Khan Cc: Wei Yang Signed-off-by: Andrew Morton --- tools/testing/selftests/proc/proc-maps-race.c | 178 +++++++++++++----- 1 file changed, 133 insertions(+), 45 deletions(-) diff --git a/tools/testing/selftests/proc/proc-maps-race.c b/tools/testing/selftests/proc/proc-maps-race.c index 5eb350c23da4..1026d8c400e1 100644 --- a/tools/testing/selftests/proc/proc-maps-race.c +++ b/tools/testing/selftests/proc/proc-maps-race.c @@ -17,8 +17,8 @@ */ /* * Fork a child that concurrently modifies address space while the main - * process is reading /proc/$PID/maps and verifying the results. Address - * space modifications include: + * process is reading /proc/$PID/maps and /proc/$PID/smaps, verifying the + * results. Address space modifications include: * VMA splitting and merging * */ @@ -73,6 +73,11 @@ enum test_state { TEST_DONE, }; +enum maps_file { + MAPS, + SMAPS, +}; + struct vma_modifier_info; FIXTURE(proc_maps_race) @@ -83,6 +88,7 @@ FIXTURE(proc_maps_race) struct line_content last_line; struct line_content first_line; unsigned long duration_sec; + enum maps_file maps_file; int shared_mem_size; int skip_pages; int page_size; @@ -92,6 +98,19 @@ FIXTURE(proc_maps_race) pid_t pid; }; +FIXTURE_VARIANT(proc_maps_race) +{ + const enum maps_file maps_file; +}; + +FIXTURE_VARIANT_ADD(proc_maps_race, maps) { + .maps_file = MAPS, +}; + +FIXTURE_VARIANT_ADD(proc_maps_race, smaps) { + .maps_file = SMAPS, +}; + typedef bool (*vma_modifier_op)(FIXTURE_DATA(proc_maps_race) *self); typedef bool (*vma_mod_result_check_op)(struct line_content *mod_last_line, struct line_content *mod_first_line, @@ -222,6 +241,57 @@ static void copy_last_line(struct page_content *page, char *last_line, copy_line(pos, end, last_line, line_size); } +static bool copy_first_entry(struct page_content *page, char *first_line, + size_t line_size) +{ + char *start_pos = page->data; + + while (start_pos < page->data + page->size) { + unsigned long start_addr; + unsigned long 
end_addr; + char *end_pos; + + end_pos = strchr(start_pos, '\n'); + if (!end_pos) + break; + + if (parse_vma_line(start_pos, end_pos, &start_addr, &end_addr)) { + copy_line(start_pos, end_pos, first_line, line_size); + return true; + } + + start_pos = end_pos + 1; + } + + return false; +} + +static bool copy_last_entry(struct page_content *page, char *last_line, + size_t line_size) +{ + char *end_pos = page->data + page->size - 1; + char *start_pos; + + while (end_pos > page->data) { + unsigned long start_addr; + unsigned long end_addr; + + /* skip last newline */ + start_pos = end_pos - 1; + /* search previous newline */ + while (start_pos > page->data && start_pos[-1] != '\n') + start_pos--; + if (parse_vma_line(start_pos, end_pos, &start_addr, &end_addr)) { + copy_line(start_pos, end_pos, last_line, line_size); + return true; + } + + end_pos = start_pos - 1; + } + + return false; +} + /* Read the last line of the first page and the first line of the second page */ static bool read_boundary_lines(FIXTURE_DATA(proc_maps_race) *self, struct line_content *last_line, @@ -230,8 +300,16 @@ static bool read_boundary_lines(FIXTURE_DATA(proc_maps_race) *self, if (!read_two_pages(self)) return false; - copy_last_line(&self->page1, last_line->text, LINE_MAX_SIZE); - copy_first_line(&self->page2, first_line->text, LINE_MAX_SIZE); + if (self->maps_file == MAPS) { + copy_last_line(&self->page1, last_line->text, LINE_MAX_SIZE); + copy_first_line(&self->page2, first_line->text, LINE_MAX_SIZE); + } else if (self->maps_file == SMAPS) { + if (!copy_last_entry(&self->page1, last_line->text, LINE_MAX_SIZE) || + !copy_first_entry(&self->page2, first_line->text, LINE_MAX_SIZE)) + return false; + } else { + return false; + } return sscanf(last_line->text, "%lx-%lx", &last_line->start_addr, &last_line->end_addr) == 2 && @@ -497,6 +575,7 @@ FIXTURE_SETUP(proc_maps_race) self->page_size = (unsigned long)sysconf(_SC_PAGESIZE); self->verbose = verbose && !strncmp(verbose, "1", 1); + 
self->maps_file = variant->maps_file; duration_sec = duration ? atol(duration) : 0; self->duration_sec = duration_sec ? duration_sec : 5UL; @@ -563,7 +642,16 @@ FIXTURE_SETUP(proc_maps_race) exit(0); } - sprintf(fname, "/proc/%d/maps", self->pid); + switch (self->maps_file) { + case MAPS: + sprintf(fname, "/proc/%d/maps", self->pid); + break; + case SMAPS: + sprintf(fname, "/proc/%d/smaps", self->pid); + break; + default: + ksft_exit_fail(); + } self->maps_fd = open(fname, O_RDONLY); ASSERT_NE(self->maps_fd, -1); @@ -608,7 +696,6 @@ FIXTURE_SETUP(proc_maps_race) ASSERT_TRUE(mod_info->addr && mod_info->next_addr); signal_state(mod_info, PARENT_READY); - } FIXTURE_TEARDOWN(proc_maps_race) @@ -698,20 +785,20 @@ TEST_F(proc_maps_race, test_maps_tearing_from_split) last_line_changed = strcmp(new_last_line.text, self->last_line.text) != 0; first_line_changed = strcmp(new_first_line.text, self->first_line.text) != 0; ASSERT_EQ(last_line_changed, first_line_changed); - - /* Check if PROCMAP_QUERY ioclt() finds the right VMA */ - ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr + self->page_size, - &vma_start, &vma_end)); - /* - * The vma at the split address can be either the same as - * original one (if read before the split) or the same as the - * first line in the second page (if read after the split). - */ - ASSERT_TRUE((vma_start == self->last_line.start_addr && - vma_end == self->last_line.end_addr) || - (vma_start == split_first_line.start_addr && - vma_end == split_first_line.end_addr)); - + if (self->maps_file == MAPS) { + /* Check if PROCMAP_QUERY ioclt() finds the right VMA */ + ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr + self->page_size, + &vma_start, &vma_end)); + /* + * The vma at the split address can be either the same as + * original one (if read before the split) or the same as the + * first line in the second page (if read after the split). 
+ */ + ASSERT_TRUE((vma_start == self->last_line.start_addr && + vma_end == self->last_line.end_addr) || + (vma_start == split_first_line.start_addr && + vma_end == split_first_line.end_addr)); + } clock_gettime(CLOCK_MONOTONIC_COARSE, &end_ts); end_test_iteration(&end_ts, self->verbose); } while (end_ts.tv_sec - start_ts.tv_sec < self->duration_sec); @@ -781,17 +868,18 @@ TEST_F(proc_maps_race, test_maps_tearing_from_resize) strcmp(new_first_line.text, restored_first_line.text), "Expand result invalid", self)); } - - /* Check if PROCMAP_QUERY ioclt() finds the right VMA */ - ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr, &vma_start, &vma_end)); - /* - * The vma should stay at the same address and have either the - * original size of 3 pages or 1 page if read after shrinking. - */ - ASSERT_TRUE(vma_start == self->last_line.start_addr && - (vma_end - vma_start == self->page_size * 3 || - vma_end - vma_start == self->page_size)); - + if (self->maps_file == MAPS) { + /* Check if PROCMAP_QUERY ioclt() finds the right VMA */ + ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr, + &vma_start, &vma_end)); + /* + * The vma should stay at the same address and have either the + * original size of 3 pages or 1 page if read after shrinking. 
+ */ + ASSERT_TRUE(vma_start == self->last_line.start_addr && + (vma_end - vma_start == self->page_size * 3 || + vma_end - vma_start == self->page_size)); + } clock_gettime(CLOCK_MONOTONIC_COARSE, &end_ts); end_test_iteration(&end_ts, self->verbose); } while (end_ts.tv_sec - start_ts.tv_sec < self->duration_sec); @@ -861,20 +949,20 @@ TEST_F(proc_maps_race, test_maps_tearing_from_remap) strcmp(new_first_line.text, restored_first_line.text), "Remap restore result invalid", self)); } - - /* Check if PROCMAP_QUERY ioclt() finds the right VMA */ - ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr + self->page_size, - &vma_start, &vma_end)); - /* - * The vma should either stay at the same address and have the - * original size of 3 pages or we should find the remapped vma - * at the remap destination address with size of 1 page. - */ - ASSERT_TRUE((vma_start == self->last_line.start_addr && - vma_end - vma_start == self->page_size * 3) || - (vma_start == self->last_line.start_addr + self->page_size && - vma_end - vma_start == self->page_size)); - + if (self->maps_file == MAPS) { + /* Check if PROCMAP_QUERY ioclt() finds the right VMA */ + ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr + self->page_size, + &vma_start, &vma_end)); + /* + * The vma should either stay at the same address and have the + * original size of 3 pages or we should find the remapped vma + * at the remap destination address with size of 1 page. 
+ */ + ASSERT_TRUE((vma_start == self->last_line.start_addr && + vma_end - vma_start == self->page_size * 3) || + (vma_start == self->last_line.start_addr + self->page_size && + vma_end - vma_start == self->page_size)); + } clock_gettime(CLOCK_MONOTONIC_COARSE, &end_ts); end_test_iteration(&end_ts, self->verbose); } while (end_ts.tv_sec - start_ts.tv_sec < self->duration_sec); From eb4c458a9803c3c75ee27d567a3a2ff0cc66da98 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Mon, 25 May 2026 07:57:51 -0700 Subject: [PATCH 266/321] mm: make mmap_miss accounting symmetric for VM_SEQ_READ do_sync_mmap_readahead() skips both the mmap_miss increment and the MMAP_LOTSAMISS check for VM_SEQ_READ mappings, since sequential access is non-speculative and should always read ahead. The two decrement sites in do_async_mmap_readahead() and filemap_map_pages() do not mirror this skip, so concurrent faults on a VM_SEQ_READ mapping can still drive ra->mmap_miss down to zero through the decrement paths even though nothing in the sync path ever increments it. The counter itself is per-file (file->f_ra.mmap_miss), so it can be moved by any VMA mapping the file, not just the one currently faulting. Skip the decrement for VM_SEQ_READ in both decrement sites so the counter only moves for mappings that also participate in the increment side. No functional change for VM_SEQ_READ users, since the increment-side gate already prevents the counter from being consulted on their behalf, but it stops a VM_SEQ_READ mapping from biasing the counter for other mappings of the same file. 
Link: https://lore.kernel.org/20260525145751.2671248-1-usama.arif@linux.dev Signed-off-by: Usama Arif Closes: https://lore.kernel.org/all/8edc8cd0-f65c-4456-9b3f-362e744c9a96@linux.dev/ Reviewed-by: William Kucharski Reviewed-by: Jan Kara Cc: David Hildenbrand Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/filemap.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 4263d9775998..6bf0b540ef19 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3434,8 +3434,13 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, * Don't touch the mmap_miss counter to avoid decreasing it multiple * times for a single folio and break the balance with mmap_miss * increase in do_sync_mmap_readahead(). + * + * VM_SEQ_READ mappings skip the mmap_miss increment in + * do_sync_mmap_readahead(), so skip the decrement here as well to + * keep the counter symmetric. */ - if (likely(!folio_test_locked(folio))) { + if (likely(!folio_test_locked(folio)) && + !(vmf->vma->vm_flags & VM_SEQ_READ)) { mmap_miss = READ_ONCE(ra->mmap_miss); if (mmap_miss) WRITE_ONCE(ra->mmap_miss, --mmap_miss); @@ -3936,10 +3941,15 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, * In such situation, read-ahead is only a waste of IO. * Don't decrease mmap_miss in this scenario to make sure * we can stop read-ahead. + * + * VM_SEQ_READ mappings skip the mmap_miss increment in + * do_sync_mmap_readahead(), so skip the decrement here as + * well to keep the counter symmetric. 
*/ if ((map_ret & VM_FAULT_NOPAGE) && !(vmf->flags & FAULT_FLAG_TRIED) && - !folio_test_workingset(folio)) { + !folio_test_workingset(folio) && + !(vma->vm_flags & VM_SEQ_READ)) { unsigned short mmap_miss; mmap_miss = READ_ONCE(file->f_ra.mmap_miss); From ad1cee3940d51c8e0d03a3f45d9803aa8f2154a4 Mon Sep 17 00:00:00 2001 From: Ran Xiaokai Date: Mon, 25 May 2026 10:26:59 +0000 Subject: [PATCH 267/321] mm: shmem: refactor thpsize_shmem_enabled_store() with sysfs_match_string() Patch series "refactors thpsize_shmem_enabled_store() and thpsize_shmem_enabled_show()", v4. This patch (of 2): Inspired by commit 82d9ff648c6c ("mm: huge_memory: refactor anon_enabled_store() with set_anon_enabled_mode()"), refactor thpsize_shmem_enabled_store() using sysfs_match_string(). This eliminates the duplicated spin_lock/unlock(), set/clear_bit(), calls across all branches, reducing code duplication. Behavioral change: Call start_stop_khugepaged() only when the mode actually changes. If unchanged, call set_recommended_min_free_kbytes() to preserve legacy watermark behavior. This avoids unnecessary khugepaged restarts. Tested with selftests ./run_kselftest.sh -t mm:ksft_thp.sh, all test cases passed. Link: https://lore.kernel.org/20260525102700.68707-1-ranxiaokai627@163.com Link: https://lore.kernel.org/20260525102700.68707-2-ranxiaokai627@163.com Signed-off-by: Ran Xiaokai Reviewed-by: Baolin Wang Reviewed-by: Barry Song Tested-by: Baolin Wang Tested-by: Lance Yang Acked-by: David Hildenbrand (arm) Reviewed-by: Lorenzo Stoakes Cc: Breno Leitao Cc: Dev Jain Cc: Hugh Dickins Cc: Liam R. 
Howlett Cc: Nico Pache Cc: Ran Xiaokai Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/shmem.c | 105 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 59 insertions(+), 46 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 77a3e28e5160..748b135d04fb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -5463,6 +5463,29 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled); static DEFINE_SPINLOCK(huge_shmem_orders_lock); +enum huge_mode { + HUGE_SHMEM_ENABLED_ALWAYS = 0, + HUGE_SHMEM_ENABLED_INHERIT, + HUGE_SHMEM_ENABLED_WITHIN_SIZE, + HUGE_SHMEM_ENABLED_ADVISE, + HUGE_SHMEM_ENABLED_NEVER, +}; + +static const char * const huge_mode_strings[] = { + [HUGE_SHMEM_ENABLED_ALWAYS] = "always", + [HUGE_SHMEM_ENABLED_INHERIT] = "inherit", + [HUGE_SHMEM_ENABLED_WITHIN_SIZE] = "within_size", + [HUGE_SHMEM_ENABLED_ADVISE] = "advise", + [HUGE_SHMEM_ENABLED_NEVER] = "never", +}; + +static unsigned long * const huge_mode_orders[] = { + [HUGE_SHMEM_ENABLED_ALWAYS] = &huge_shmem_orders_always, + [HUGE_SHMEM_ENABLED_INHERIT] = &huge_shmem_orders_inherit, + [HUGE_SHMEM_ENABLED_WITHIN_SIZE] = &huge_shmem_orders_within_size, + [HUGE_SHMEM_ENABLED_ADVISE] = &huge_shmem_orders_madvise, +}; + static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -5483,63 +5506,53 @@ static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj, return sysfs_emit(buf, "%s\n", output); } +static bool set_shmem_enabled_mode(int order, enum huge_mode mode) +{ + bool changed = false; + enum huge_mode idx; + + spin_lock(&huge_shmem_orders_lock); + for (idx = 0; idx < ARRAY_SIZE(huge_mode_orders); idx++) { + if (idx == mode) + changed |= !__test_and_set_bit(order, huge_mode_orders[idx]); + else + changed |= __test_and_clear_bit(order, huge_mode_orders[idx]); + } + spin_unlock(&huge_shmem_orders_lock); + + return changed; +} + static ssize_t 
thpsize_shmem_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int order = to_thpsize(kobj)->order; - ssize_t ret = count; + int mode; - if (sysfs_streq(buf, "always")) { - spin_lock(&huge_shmem_orders_lock); - clear_bit(order, &huge_shmem_orders_inherit); - clear_bit(order, &huge_shmem_orders_madvise); - clear_bit(order, &huge_shmem_orders_within_size); - set_bit(order, &huge_shmem_orders_always); - spin_unlock(&huge_shmem_orders_lock); - } else if (sysfs_streq(buf, "inherit")) { - /* Do not override huge allocation policy with non-PMD sized mTHP */ - if (shmem_huge == SHMEM_HUGE_FORCE && !is_pmd_order(order)) - return -EINVAL; + mode = sysfs_match_string(huge_mode_strings, buf); + if (mode < 0) + return mode; - spin_lock(&huge_shmem_orders_lock); - clear_bit(order, &huge_shmem_orders_always); - clear_bit(order, &huge_shmem_orders_madvise); - clear_bit(order, &huge_shmem_orders_within_size); - set_bit(order, &huge_shmem_orders_inherit); - spin_unlock(&huge_shmem_orders_lock); - } else if (sysfs_streq(buf, "within_size")) { - spin_lock(&huge_shmem_orders_lock); - clear_bit(order, &huge_shmem_orders_always); - clear_bit(order, &huge_shmem_orders_inherit); - clear_bit(order, &huge_shmem_orders_madvise); - set_bit(order, &huge_shmem_orders_within_size); - spin_unlock(&huge_shmem_orders_lock); - } else if (sysfs_streq(buf, "advise")) { - spin_lock(&huge_shmem_orders_lock); - clear_bit(order, &huge_shmem_orders_always); - clear_bit(order, &huge_shmem_orders_inherit); - clear_bit(order, &huge_shmem_orders_within_size); - set_bit(order, &huge_shmem_orders_madvise); - spin_unlock(&huge_shmem_orders_lock); - } else if (sysfs_streq(buf, "never")) { - spin_lock(&huge_shmem_orders_lock); - clear_bit(order, &huge_shmem_orders_always); - clear_bit(order, &huge_shmem_orders_inherit); - clear_bit(order, &huge_shmem_orders_within_size); - clear_bit(order, &huge_shmem_orders_madvise); - spin_unlock(&huge_shmem_orders_lock); - } else { 
- ret = -EINVAL; - } + /* Do not override huge allocation policy with non-PMD sized mTHP */ + if (mode == HUGE_SHMEM_ENABLED_INHERIT && + shmem_huge == SHMEM_HUGE_FORCE && !is_pmd_order(order)) + return -EINVAL; - if (ret > 0) { + if (set_shmem_enabled_mode(order, mode)) { int err = start_stop_khugepaged(); if (err) - ret = err; + return err; + } else { + /* + * Recalculate watermarks even when the mode hasn't changed + * to preserve the legacy behavior, as this is always called + * inside start_stop_khugepaged(). + */ + set_recommended_min_free_kbytes(); } - return ret; + + return count; } struct kobj_attribute thpsize_shmem_enabled_attr = From bf7033eb7c2f892580f060554ae4ea92bd52b9fb Mon Sep 17 00:00:00 2001 From: Ran Xiaokai Date: Mon, 25 May 2026 10:27:00 +0000 Subject: [PATCH 268/321] mm: shmem: refactor thpsize_shmem_enabled_show() with helper arrays Replace the hardcoded if/else chain of test_bit() calls and string literals in thpsize_shmem_enabled_show() with a loop over the huge_mode_orders[] and huge_mode_strings[] arrays. This makes thpsize_shmem_enabled_show() consistent with thpsize_shmem_enabled_store() and eliminates duplicated mode name strings. Link: https://lore.kernel.org/20260525102700.68707-3-ranxiaokai627@163.com Signed-off-by: Ran Xiaokai Reviewed-by: Baolin Wang Reviewed-by: Barry Song Reviewed-by: Breno Leitao Reviewed-by: Lorenzo Stoakes Tested-by: Baolin Wang Tested-by: Lance Yang Acked-by: David Hildenbrand (arm) Cc: Dev Jain Cc: Hugh Dickins Cc: Liam R.
Howlett Cc: Nico Pache Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/shmem.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 748b135d04fb..56c23a7b15c7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -5490,20 +5490,30 @@ static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { int order = to_thpsize(kobj)->order; - const char *output; + int active = HUGE_SHMEM_ENABLED_NEVER; + int len = 0; + int i; - if (test_bit(order, &huge_shmem_orders_always)) - output = "[always] inherit within_size advise never"; - else if (test_bit(order, &huge_shmem_orders_inherit)) - output = "always [inherit] within_size advise never"; - else if (test_bit(order, &huge_shmem_orders_within_size)) - output = "always inherit [within_size] advise never"; - else if (test_bit(order, &huge_shmem_orders_madvise)) - output = "always inherit within_size [advise] never"; - else - output = "always inherit within_size advise [never]"; + for (i = 0; i < ARRAY_SIZE(huge_mode_orders); i++) { + if (test_bit(order, huge_mode_orders[i])) { + active = i; + break; + } + } - return sysfs_emit(buf, "%s\n", output); + for (i = 0; i < ARRAY_SIZE(huge_mode_strings); i++) { + if (i == active) + len += sysfs_emit_at(buf, len, "[%s] ", + huge_mode_strings[i]); + else + len += sysfs_emit_at(buf, len, "%s ", + huge_mode_strings[i]); + } + + /* Replace trailing space with newline */ + buf[len - 1] = '\n'; + + return len; } static bool set_shmem_enabled_mode(int order, enum huge_mode mode) From 528db7d37e08dc7eae70d046cda5a2ee30208448 Mon Sep 17 00:00:00 2001 From: Konstantin Khorenko Date: Sun, 24 May 2026 22:35:56 +0300 Subject: [PATCH 269/321] selftests/memfd: fix -Wmaybe-uninitialized warning in memfd_test Patch series "selftests/memfd: fix compilation warnings". 
This patchset fixes two compilation warnings in the memfd selftests: a possibly-uninitialized dummy buffer passed to the pwrite() syscall, and an unused variable. This patch (of 2): memfd_test.c: In function 'mfd_fail_grow_write.part.0': memfd_test.c:685:13: warning: '' may be used uninitialized [-Wmaybe-uninitialized] 685 | l = pwrite(fd, buf, mfd_def_size * 8, 0); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ pwrite() is declared with attribute 'access (read_only, 2, 3)', so GCC knows it reads from the buffer. malloc() returns uninitialized memory, hence the warning. Use calloc() to zero-initialize the buffer. The actual contents don't matter here since the test verifies that pwrite() fails on a sealed memfd. Link: https://lore.kernel.org/20260524193732.48853-1-eva.kurchatova@virtuozzo.com Link: https://lore.kernel.org/20260524193732.48853-2-eva.kurchatova@virtuozzo.com Signed-off-by: Konstantin Khorenko Signed-off-by: Eva Kurchatova Cc: Aristeu Rozanski Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/memfd/memfd_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 2ca07ea7202a..cdab3a837624 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -688,9 +688,9 @@ static void mfd_assert_grow_write(int fd) if (hugetlbfs_test) return; - buf = malloc(mfd_def_size * 8); + buf = calloc(1, mfd_def_size * 8); if (!buf) { - printf("malloc(%zu) failed: %m\n", mfd_def_size * 8); + printf("calloc(1, %zu) failed: %m\n", mfd_def_size * 8); abort(); } From 952923ff200817506a9a5fd1dd1b811745f25746 Mon Sep 17 00:00:00 2001 From: Konstantin Khorenko Date: Sun, 24 May 2026 22:35:57 +0300 Subject: [PATCH 270/321] selftests/memfd: remove unused variable 'sig' in fuse_test fuse_test.c: In function 'sealing_thread_fn': fuse_test.c:165:13: warning: unused variable 'sig' [-Wunused-variable] 165 | int sig, r; | ^~~ Remove unused
'sig' to fix -Wunused-variable warning. Link: https://lore.kernel.org/20260524193732.48853-3-eva.kurchatova@virtuozzo.com Signed-off-by: Konstantin Khorenko Cc: Aristeu Rozanski Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/memfd/fuse_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/memfd/fuse_test.c b/tools/testing/selftests/memfd/fuse_test.c index dbc171a3806d..510056c1b0d0 100644 --- a/tools/testing/selftests/memfd/fuse_test.c +++ b/tools/testing/selftests/memfd/fuse_test.c @@ -162,7 +162,7 @@ static void *global_p = NULL; static int sealing_thread_fn(void *arg) { - int sig, r; + int r; /* * This thread first waits 200ms so any pending operation in the parent From e0d4e7405f267ae31ffafd5673ce14d0d9e4cbe0 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 25 May 2026 20:39:28 -0700 Subject: [PATCH 271/321] memcg: store node_id instead of pglist_data pointer Patch series "memcg: shrink obj_stock_pcp and cache multiple objcgs", v3. Commit 01b9da291c49 ("mm: memcontrol: convert objcg to be per-memcg per-node type") split a memcg's single obj_cgroup into one per NUMA node so that reparenting LRU folios can take per-node lru locks. As a side effect, the per-CPU obj_stock_pcp -- which caches a single cached_objcg pointer -- thrashes on workloads where threads of the same memcg run on different NUMA nodes. The kernel test robot reported a 67.7% regression on stress-ng.switch.ops_per_sec from this pattern. Commit d0211878ce06 ("memcg: cache obj_stock by memcg, not by objcg pointer") landed as a temporary fix by treating sibling per-node objcgs as equivalent for the cache lookup, intended to be reverted once per-node kmem accounting is introduced. This series takes a more general approach: cache multiple objcgs per CPU using the multi-slot pattern memcg_stock_pcp already uses, so the per-node objcg variants of one memcg can all coexist in the stock without ever forcing a drain. 
The temporary fix can then be reverted. To avoid increasing the per-CPU cache footprint, the first three patches shrink the existing single-slot obj_stock_pcp fields. The final patch converts cached_objcg and nr_bytes into NR_OBJ_STOCK=5 slot arrays and reorders the struct so the entire consume/refill/account hot path fits within a single 64-byte cache line on non-debug 64-bit builds (verified with pahole). This patch (of 4): The struct obj_stock_pcp stores a pointer to pglist_data for the slab stats cached on the cpu. On 64-bit machines, this costs 8 bytes. The pointer is not strictly required: NODE_DATA() can recover it from the node id. Replace cached_pgdat with int16_t node_id and use NUMA_NO_NODE as the "no stats cached" sentinel. At the moment all the archs limit MAX_NUMNODES to 1024 so int16_t is plenty; a BUILD_BUG_ON() makes sure we notice if that ever changes. Link: https://lore.kernel.org/20260526033931.1760588-1-shakeel.butt@linux.dev Link: https://lore.kernel.org/20260526033931.1760588-2-shakeel.butt@linux.dev Fixes: 01b9da291c49 ("mm: memcontrol: convert objcg to be per-memcg per-node type") Signed-off-by: Shakeel Butt Tested-by: kernel test robot Acked-by: Muchun Song Reviewed-by: Harry Yoo (Oracle) Acked-by: Qi Zheng Cc: Alexandre Ghiti Cc: Johannes Weiner Cc: Joshua Hahn Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/memcontrol.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 92269740eef1..e983fa590af8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2022,7 +2022,7 @@ struct obj_stock_pcp { local_trylock_t lock; unsigned int nr_bytes; struct obj_cgroup *cached_objcg; - struct pglist_data *cached_pgdat; + int16_t node_id; int nr_slab_reclaimable_b; int nr_slab_unreclaimable_b; @@ -2032,6 +2032,7 @@ struct obj_stock_pcp { static DEFINE_PER_CPU_ALIGNED(struct obj_stock_pcp, obj_stock) = { .lock = INIT_LOCAL_TRYLOCK(lock), + .node_id = 
NUMA_NO_NODE, }; static DEFINE_MUTEX(percpu_charge_mutex); @@ -3162,6 +3163,13 @@ static void __account_obj_stock(struct obj_cgroup *objcg, { int *bytes; + /* + * Though at the moment MAX_NUMNODES <= 1024 in all archs but let's make + * sure it does not exceed S16_MAX otherwise we need to fix node_id type + * in struct obj_stock_pcp. + */ + BUILD_BUG_ON(MAX_NUMNODES >= S16_MAX); + if (!stock || READ_ONCE(stock->cached_objcg) != objcg) goto direct; @@ -3169,9 +3177,11 @@ static void __account_obj_stock(struct obj_cgroup *objcg, * Save vmstat data in stock and skip vmstat array update unless * accumulating over a page of vmstat data or when pgdat changes. */ - if (stock->cached_pgdat != pgdat) { + if (stock->node_id == NUMA_NO_NODE) { + stock->node_id = pgdat->node_id; + } else if (stock->node_id != pgdat->node_id) { /* Flush the existing cached vmstat data */ - struct pglist_data *oldpg = stock->cached_pgdat; + struct pglist_data *oldpg = NODE_DATA(stock->node_id); if (stock->nr_slab_reclaimable_b) { mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B, @@ -3183,7 +3193,7 @@ static void __account_obj_stock(struct obj_cgroup *objcg, stock->nr_slab_unreclaimable_b); stock->nr_slab_unreclaimable_b = 0; } - stock->cached_pgdat = pgdat; + stock->node_id = pgdat->node_id; } bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? 
&stock->nr_slab_reclaimable_b @@ -3279,19 +3289,21 @@ static void drain_obj_stock(struct obj_stock_pcp *stock) * Flush the vmstat data in current stock */ if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) { + struct pglist_data *oldpg = NODE_DATA(stock->node_id); + if (stock->nr_slab_reclaimable_b) { - mod_objcg_mlstate(old, stock->cached_pgdat, + mod_objcg_mlstate(old, oldpg, NR_SLAB_RECLAIMABLE_B, stock->nr_slab_reclaimable_b); stock->nr_slab_reclaimable_b = 0; } if (stock->nr_slab_unreclaimable_b) { - mod_objcg_mlstate(old, stock->cached_pgdat, + mod_objcg_mlstate(old, oldpg, NR_SLAB_UNRECLAIMABLE_B, stock->nr_slab_unreclaimable_b); stock->nr_slab_unreclaimable_b = 0; } - stock->cached_pgdat = NULL; + stock->node_id = NUMA_NO_NODE; } WRITE_ONCE(stock->cached_objcg, NULL); From 37a7f91e44f41f1b4cd60d1f89a4de7cf871d158 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 25 May 2026 20:39:29 -0700 Subject: [PATCH 272/321] memcg: uint16_t for nr_bytes in obj_stock_pcp Currently struct obj_stock_pcp stores nr_bytes in an 'unsigned int' which is 4 bytes on 64-bit machines. Switch the field to uint16_t to shrink the per-CPU cache. The kernel supports PAGE_SIZE_4KB, _8KB, _16KB, _32KB, _64KB and _256KB (see HAVE_PAGE_SIZE_* in arch/Kconfig). After the PAGE_SIZE-aligned flush in __refill_obj_stock(), the sub-page remainder fits in uint16_t up through 64KiB pages where PAGE_SIZE - 1 == U16_MAX, but on 256KiB pages PAGE_SIZE - 1 == 0x3FFFF exceeds U16_MAX. The accumulator also needs to stay within uint16_t between page-aligned flushes on 64KiB pages where PAGE_SIZE itself is U16_MAX + 1. Accumulate the new total in an 'unsigned int' local, then on PAGE_SHIFT <= 16 flush whenever the accumulator would hit U16_MAX; together with the existing allow_uncharge flush at PAGE_SIZE this keeps the uint16_t safe. On configs with PAGE_SHIFT > 16 (PAGE_SIZE_256KB on hexagon and powerpc 44x, both 32-bit), uint16_t cannot represent the sub-page remainder. 
Define obj_stock_bytes_t as 'unsigned int' on those archs so nr_bytes can hold the full remainder and the normal page-boundary flush in __refill_obj_stock() and the page extraction in drain_obj_stock() both work correctly. The single-cache-line layout target only applies to PAGE_SHIFT <= 16; those archs are 32-bit embedded and not the optimization target. Link: https://lore.kernel.org/20260526033931.1760588-3-shakeel.butt@linux.dev Fixes: 01b9da291c49 ("mm: memcontrol: convert objcg to be per-memcg per-node type") Signed-off-by: Shakeel Butt Tested-by: kernel test robot Reviewed-by: Harry Yoo (Oracle) Acked-by: Qi Zheng Acked-by: Muchun Song Cc: Alexandre Ghiti Cc: Johannes Weiner Cc: Joshua Hahn Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/memcontrol.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e983fa590af8..8bbcc7bc42e3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2020,8 +2020,17 @@ static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = { struct obj_stock_pcp { local_trylock_t lock; - unsigned int nr_bytes; struct obj_cgroup *cached_objcg; +#if PAGE_SHIFT > 16 + /* + * On rare archs with 256KiB base page size (hexagon and powerpc 44x) + * keep nr_bytes to unsigned int as uint16_t cannot represent the full + * sub-page remainder. 
+ */ + unsigned int nr_bytes; +#else + uint16_t nr_bytes; +#endif int16_t node_id; int nr_slab_reclaimable_b; int nr_slab_unreclaimable_b; @@ -3334,6 +3343,7 @@ static void __refill_obj_stock(struct obj_cgroup *objcg, bool allow_uncharge) { unsigned int nr_pages = 0; + unsigned int stock_nr_bytes; if (!stock) { nr_pages = nr_bytes >> PAGE_SHIFT; @@ -3342,21 +3352,24 @@ static void __refill_obj_stock(struct obj_cgroup *objcg, goto out; } + stock_nr_bytes = stock->nr_bytes; if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ drain_obj_stock(stock); obj_cgroup_get(objcg); - stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) + stock_nr_bytes = atomic_read(&objcg->nr_charged_bytes) ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; WRITE_ONCE(stock->cached_objcg, objcg); allow_uncharge = true; /* Allow uncharge when objcg changes */ } - stock->nr_bytes += nr_bytes; + stock_nr_bytes += nr_bytes; - if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) { - nr_pages = stock->nr_bytes >> PAGE_SHIFT; - stock->nr_bytes &= (PAGE_SIZE - 1); + if ((allow_uncharge && (stock_nr_bytes > PAGE_SIZE)) || + stock_nr_bytes > U16_MAX) { + nr_pages = stock_nr_bytes >> PAGE_SHIFT; + stock_nr_bytes &= (PAGE_SIZE - 1); } + stock->nr_bytes = stock_nr_bytes; out: if (nr_pages) From 7a09fb91c285ba1b253f7c72f86cf37b373afb10 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 25 May 2026 20:39:30 -0700 Subject: [PATCH 273/321] memcg: int16_t for cached slab stats Currently struct obj_stock_pcp stores cached slab stats in 'int' which is 4 bytes per counter on 64-bit machines. Switch them to int16_t to shrink the cached metadata. The existing PAGE_SIZE flush in __account_obj_stock() bounds *bytes at PAGE_SIZE on 4KiB and 16KiB page archs, well within int16_t. On 64KiB pages PAGE_SIZE is well above S16_MAX so that flush never fires, and a sufficiently long run of accumulations would overflow the cache. 
Add an explicit S16_MAX guard before each add: when the next add would push abs(*bytes) past S16_MAX, fold the cached value into @nr and flush directly via mod_objcg_mlstate() before the accumulation. Link: https://lore.kernel.org/20260526033931.1760588-4-shakeel.butt@linux.dev Fixes: 01b9da291c49 ("mm: memcontrol: convert objcg to be per-memcg per-node type") Signed-off-by: Shakeel Butt Tested-by: kernel test robot Reviewed-by: Harry Yoo (Oracle) Acked-by: Qi Zheng Acked-by: Muchun Song Cc: Alexandre Ghiti Cc: Johannes Weiner Cc: Joshua Hahn Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/memcontrol.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8bbcc7bc42e3..ac7c99e32f99 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2032,8 +2032,8 @@ struct obj_stock_pcp { uint16_t nr_bytes; #endif int16_t node_id; - int nr_slab_reclaimable_b; - int nr_slab_unreclaimable_b; + int16_t nr_slab_reclaimable_b; + int16_t nr_slab_unreclaimable_b; struct work_struct work; unsigned long flags; @@ -3170,7 +3170,7 @@ static void __account_obj_stock(struct obj_cgroup *objcg, struct obj_stock_pcp *stock, int nr, struct pglist_data *pgdat, enum node_stat_item idx) { - int *bytes; + int16_t *bytes; /* * Though at the moment MAX_NUMNODES <= 1024 in all archs but let's make @@ -3207,21 +3207,20 @@ static void __account_obj_stock(struct obj_cgroup *objcg, bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b : &stock->nr_slab_unreclaimable_b; + /* - * Even for large object >= PAGE_SIZE, the vmstat data will still be - * cached locally at least once before pushing it out. + * Fold @nr into the cached value and decide whether to keep it cached + * or flush it directly. 
Cache the combined value when it fits in the + * int16_t storage and either the cache was empty (so even a value + * above PAGE_SIZE gets a chance to be canceled by a paired delta) or + * the combined value is within the PAGE_SIZE flush threshold. */ - if (!*bytes) { + nr += *bytes; + if (abs(nr) <= S16_MAX && (!*bytes || abs(nr) <= PAGE_SIZE)) { *bytes = nr; nr = 0; } else { - *bytes += nr; - if (abs(*bytes) > PAGE_SIZE) { - nr = *bytes; - *bytes = 0; - } else { - nr = 0; - } + *bytes = 0; } direct: if (nr) From 29a1ea41456b79d657e5f5deced1239477d03af1 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 25 May 2026 20:39:31 -0700 Subject: [PATCH 274/321] memcg: multi objcg charge support Commit 01b9da291c49 ("mm: memcontrol: convert objcg to be per-memcg per-node type") split a memcg's single obj_cgroup into one per NUMA node so that reparenting LRU folios can take per-node lru locks. As a side effect, the per-CPU obj_stock_pcp -- which caches exactly one cached_objcg -- thrashes on workloads where threads of the same memcg run on different NUMA nodes. The kernel test robot reported a 67.7% regression on stress-ng.switch.ops_per_sec from this pattern. Mirror the multi-slot pattern already used by memcg_stock_pcp: turn nr_bytes and cached_objcg into NR_OBJ_STOCK-element arrays, scan all slots on consume/refill/account, prefer empty slots when inserting, and evict a slot round-robin only when full. With multiple slots a CPU can hold the per-node objcg variants of one memcg plus a few siblings without ever forcing a drain. A single int8_t index records which slot the cached slab stats belong to; the stats are flushed on slot or pgdat change. 
With NR_OBJ_STOCK = 5 the layout (verified with pahole) is: offset 0 : lock(1) + index(1) + node_id(2) + slab stats(4) = 8B offset 8 : nr_bytes[5] = 10B offset 18 : padding = 6B offset 24 : cached[5] = 40B offset 64 : (line 2) work_struct + flags (cold) so consume_obj_stock, refill_obj_stock and the slab account path each touch exactly one 64-byte cache line on non-debug 64-bit builds. Link: https://lore.kernel.org/20260526033931.1760588-5-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202605121641.b6a60cb0-lkp@intel.com Fixes: 01b9da291c49 ("mm: memcontrol: convert objcg to be per-memcg per-node type") Tested-by: kernel test robot Reviewed-by: Harry Yoo (Oracle) Cc: Alexandre Ghiti Cc: Johannes Weiner Cc: Joshua Hahn Cc: Michal Hocko Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/memcontrol.c | 200 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 142 insertions(+), 58 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ac7c99e32f99..e24114a4493a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -150,15 +150,15 @@ static void obj_cgroup_release(struct percpu_ref *ref) * However, it can be PAGE_SIZE or (x * PAGE_SIZE). * * The following sequence can lead to it: - * 1) CPU0: objcg == stock->cached_objcg + * 1) CPU0: objcg cached in one of stock->cached[i] * 2) CPU1: we do a small allocation (e.g. 92 bytes), * PAGE_SIZE bytes are charged * 3) CPU1: a process from another memcg is allocating something, * the stock if flushed, * objcg->nr_charged_bytes = PAGE_SIZE - 92 - * 5) CPU0: we do release this object, - * 92 bytes are added to stock->nr_bytes - * 6) CPU0: stock is flushed, + * 4) CPU0: we do release this object, + * 92 bytes are added to stock->nr_bytes[i] + * 5) CPU0: stock is flushed, * 92 bytes are added to objcg->nr_charged_bytes * * In the result, nr_charged_bytes == PAGE_SIZE. 
@@ -2018,34 +2018,49 @@ static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = { .lock = INIT_LOCAL_TRYLOCK(lock), }; +/* + * NR_OBJ_STOCK is sized so the entire hot path of obj_stock_pcp + * (lock, accounting metadata, nr_bytes[] and cached[]) fits within a + * single 64-byte cache line on non-debug 64-bit builds. With 5 slots: + * lock(1) + index(1) + node_id(2) + slab stats(4) + nr_bytes(10) + * + pad(6) + cached(40) == 64 bytes. + * A CPU can thus consume/refill/account against five different objcgs + * (typically per-node variants of the same memcg) while incurring at + * most one cache miss on the stock. + */ +#define NR_OBJ_STOCK 5 struct obj_stock_pcp { local_trylock_t lock; - struct obj_cgroup *cached_objcg; + int8_t index; + int16_t node_id; + int16_t nr_slab_reclaimable_b; + int16_t nr_slab_unreclaimable_b; #if PAGE_SHIFT > 16 /* * On rare archs with 256KiB base page size (hexagon and powerpc 44x) * keep nr_bytes to unsigned int as uint16_t cannot represent the full - * sub-page remainder. + * sub-page remainder. Such archs are not a cacheline optimization target.
*/ - unsigned int nr_bytes; + unsigned int nr_bytes[NR_OBJ_STOCK]; #else - uint16_t nr_bytes; + uint16_t nr_bytes[NR_OBJ_STOCK]; #endif - int16_t node_id; - int16_t nr_slab_reclaimable_b; - int16_t nr_slab_unreclaimable_b; + struct obj_cgroup *cached[NR_OBJ_STOCK]; struct work_struct work; unsigned long flags; + uint8_t drain_idx; }; static DEFINE_PER_CPU_ALIGNED(struct obj_stock_pcp, obj_stock) = { .lock = INIT_LOCAL_TRYLOCK(lock), + .index = -1, .node_id = NUMA_NO_NODE, }; static DEFINE_MUTEX(percpu_charge_mutex); +static void drain_obj_stock_slot(struct obj_stock_pcp *stock, int i); static void drain_obj_stock(struct obj_stock_pcp *stock); static bool obj_stock_flush_required(struct obj_stock_pcp *stock, struct mem_cgroup *root_memcg); @@ -3165,12 +3180,13 @@ static void unlock_stock(struct obj_stock_pcp *stock) local_unlock(&obj_stock.lock); } -/* Call after __refill_obj_stock() to ensure stock->cached_objg == objcg */ +/* Call after __refill_obj_stock() so a slot for objcg exists in the stock */ static void __account_obj_stock(struct obj_cgroup *objcg, struct obj_stock_pcp *stock, int nr, struct pglist_data *pgdat, enum node_stat_item idx) { int16_t *bytes; + int i; /* * Though at the moment MAX_NUMNODES <= 1024 in all archs but let's make @@ -3179,29 +3195,39 @@ static void __account_obj_stock(struct obj_cgroup *objcg, */ BUILD_BUG_ON(MAX_NUMNODES >= S16_MAX); - if (!stock || READ_ONCE(stock->cached_objcg) != objcg) + if (!stock) + goto direct; + + for (i = 0; i < NR_OBJ_STOCK; ++i) { + if (READ_ONCE(stock->cached[i]) == objcg) + break; + } + if (i == NR_OBJ_STOCK) goto direct; /* * Save vmstat data in stock and skip vmstat array update unless - * accumulating over a page of vmstat data or when pgdat changes. + * accumulating over a page of vmstat data or when the objcg slot or + * pgdat the stats belong to changes. 
*/ - if (stock->node_id == NUMA_NO_NODE) { + if (stock->index < 0) { + stock->index = i; stock->node_id = pgdat->node_id; - } else if (stock->node_id != pgdat->node_id) { - /* Flush the existing cached vmstat data */ + } else if (stock->index != i || stock->node_id != pgdat->node_id) { + struct obj_cgroup *old = READ_ONCE(stock->cached[stock->index]); struct pglist_data *oldpg = NODE_DATA(stock->node_id); if (stock->nr_slab_reclaimable_b) { - mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B, + mod_objcg_mlstate(old, oldpg, NR_SLAB_RECLAIMABLE_B, stock->nr_slab_reclaimable_b); stock->nr_slab_reclaimable_b = 0; } if (stock->nr_slab_unreclaimable_b) { - mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B, + mod_objcg_mlstate(old, oldpg, NR_SLAB_UNRECLAIMABLE_B, stock->nr_slab_unreclaimable_b); stock->nr_slab_unreclaimable_b = 0; } + stock->index = i; stock->node_id = pgdat->node_id; } @@ -3231,10 +3257,16 @@ static bool __consume_obj_stock(struct obj_cgroup *objcg, struct obj_stock_pcp *stock, unsigned int nr_bytes) { - if (objcg == READ_ONCE(stock->cached_objcg) && - stock->nr_bytes >= nr_bytes) { - stock->nr_bytes -= nr_bytes; - return true; + int i; + + for (i = 0; i < NR_OBJ_STOCK; ++i) { + if (READ_ONCE(stock->cached[i]) != objcg) + continue; + if (stock->nr_bytes[i] >= nr_bytes) { + stock->nr_bytes[i] -= nr_bytes; + return true; + } + return false; } return false; @@ -3255,16 +3287,42 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) return ret; } -static void drain_obj_stock(struct obj_stock_pcp *stock) +/* Flush the cached slab stats (if any) back to their owning objcg/pgdat. 
*/ +static void drain_obj_stock_stats(struct obj_stock_pcp *stock) { - struct obj_cgroup *old = READ_ONCE(stock->cached_objcg); + struct obj_cgroup *old; + struct pglist_data *oldpg; + + if (stock->index < 0) + return; + + old = READ_ONCE(stock->cached[stock->index]); + oldpg = NODE_DATA(stock->node_id); + + if (stock->nr_slab_reclaimable_b) { + mod_objcg_mlstate(old, oldpg, NR_SLAB_RECLAIMABLE_B, + stock->nr_slab_reclaimable_b); + stock->nr_slab_reclaimable_b = 0; + } + if (stock->nr_slab_unreclaimable_b) { + mod_objcg_mlstate(old, oldpg, NR_SLAB_UNRECLAIMABLE_B, + stock->nr_slab_unreclaimable_b); + stock->nr_slab_unreclaimable_b = 0; + } + stock->index = -1; + stock->node_id = NUMA_NO_NODE; +} + +static void drain_obj_stock_slot(struct obj_stock_pcp *stock, int i) +{ + struct obj_cgroup *old = READ_ONCE(stock->cached[i]); if (!old) return; - if (stock->nr_bytes) { - unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; - unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); + if (stock->nr_bytes[i]) { + unsigned int nr_pages = stock->nr_bytes[i] >> PAGE_SHIFT; + unsigned int nr_bytes = stock->nr_bytes[i] & (PAGE_SIZE - 1); if (nr_pages) { struct mem_cgroup *memcg; @@ -3290,46 +3348,43 @@ static void drain_obj_stock(struct obj_stock_pcp *stock) * so it might be changed in the future. */ atomic_add(nr_bytes, &old->nr_charged_bytes); - stock->nr_bytes = 0; + stock->nr_bytes[i] = 0; } - /* - * Flush the vmstat data in current stock - */ - if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) { - struct pglist_data *oldpg = NODE_DATA(stock->node_id); + /* Flush vmstat data when its owning slot is being drained. 
*/ + if (stock->index == i) + drain_obj_stock_stats(stock); - if (stock->nr_slab_reclaimable_b) { - mod_objcg_mlstate(old, oldpg, - NR_SLAB_RECLAIMABLE_B, - stock->nr_slab_reclaimable_b); - stock->nr_slab_reclaimable_b = 0; - } - if (stock->nr_slab_unreclaimable_b) { - mod_objcg_mlstate(old, oldpg, - NR_SLAB_UNRECLAIMABLE_B, - stock->nr_slab_unreclaimable_b); - stock->nr_slab_unreclaimable_b = 0; - } - stock->node_id = NUMA_NO_NODE; - } - - WRITE_ONCE(stock->cached_objcg, NULL); + WRITE_ONCE(stock->cached[i], NULL); obj_cgroup_put(old); } +static void drain_obj_stock(struct obj_stock_pcp *stock) +{ + int i; + + for (i = 0; i < NR_OBJ_STOCK; ++i) + drain_obj_stock_slot(stock, i); +} + static bool obj_stock_flush_required(struct obj_stock_pcp *stock, struct mem_cgroup *root_memcg) { - struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg); + struct obj_cgroup *objcg; struct mem_cgroup *memcg; bool flush = false; + int i; rcu_read_lock(); - if (objcg) { + for (i = 0; i < NR_OBJ_STOCK; ++i) { + objcg = READ_ONCE(stock->cached[i]); + if (!objcg) + continue; memcg = obj_cgroup_memcg(objcg); - if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) + if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) { flush = true; + break; + } } rcu_read_unlock(); @@ -3343,6 +3398,7 @@ static void __refill_obj_stock(struct obj_cgroup *objcg, { unsigned int nr_pages = 0; unsigned int stock_nr_bytes; + int i, slot = -1, empty_slot = -1; if (!stock) { nr_pages = nr_bytes >> PAGE_SHIFT; @@ -3351,16 +3407,44 @@ static void __refill_obj_stock(struct obj_cgroup *objcg, goto out; } - stock_nr_bytes = stock->nr_bytes; - if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ - drain_obj_stock(stock); + for (i = 0; i < NR_OBJ_STOCK; ++i) { + struct obj_cgroup *cached = READ_ONCE(stock->cached[i]); + + if (!cached) { + if (empty_slot == -1) + empty_slot = i; + continue; + } + if (cached == objcg) { + slot = i; + break; + } + } + + if (slot == -1) { + slot = empty_slot; 
+ if (slot == -1) { + slot = stock->drain_idx++; + if (stock->drain_idx == NR_OBJ_STOCK) + stock->drain_idx = 0; + drain_obj_stock_slot(stock, slot); + } obj_cgroup_get(objcg); + /* + * Keep the xchg result in the unsigned int local; storing + * it directly into stock->nr_bytes[slot] (uint16_t) would + * silently truncate values >= U16_MAX and bypass the flush + * guard below, leaking page-counter charges. + */ stock_nr_bytes = atomic_read(&objcg->nr_charged_bytes) ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; - WRITE_ONCE(stock->cached_objcg, objcg); + WRITE_ONCE(stock->cached[slot], objcg); allow_uncharge = true; /* Allow uncharge when objcg changes */ + } else { + stock_nr_bytes = stock->nr_bytes[slot]; } + stock_nr_bytes += nr_bytes; if ((allow_uncharge && (stock_nr_bytes > PAGE_SIZE)) || @@ -3368,7 +3452,7 @@ static void __refill_obj_stock(struct obj_cgroup *objcg, nr_pages = stock_nr_bytes >> PAGE_SHIFT; stock_nr_bytes &= (PAGE_SIZE - 1); } - stock->nr_bytes = stock_nr_bytes; + stock->nr_bytes[slot] = stock_nr_bytes; out: if (nr_pages) From 3e8d8eb8d7f5b1ec3993ad4dbb8140a55f789f90 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 26 May 2026 11:27:16 +0900 Subject: [PATCH 275/321] zram: do not leak blk idx at the end of writeback zram_writeback_slots() loop can terminate with valid reserved backing device blk_idx. The problem is that cleanup code doesn't release that reserved blk_idx before zram_writeback_slots() returns, which leads to blk_idx leak (it becomes permanently busy and can not be used for actual writeback.) This does not lead to any system instabilities, it only means that we can write back fewer pages. The scenario is hard to hit in practice as it requires writeback to race with modification (slot-free or overwrite) of the final post-processing slot. Release reserved but unused blk_idx before returning from zram_writeback_slots().
Link: https://lore.kernel.org/20260526022754.2377730-2-senozhatsky@chromium.org Fixes: f405066a1f0db ("zram: introduce writeback bio batching") Signed-off-by: Sergey Senozhatsky Suggested-by: Brian Geffon Cc: Jens Axboe Cc: Minchan Kim Cc: Richard Chang Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 07111455eecf..602abfe23797 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1127,6 +1127,9 @@ next: if (req) release_wb_req(req); + if (blk_idx != INVALID_BDEV_BLOCK) + zram_release_bdev_block(zram, blk_idx); + while (atomic_read(&wb_ctl->num_inflight) > 0) { wait_event(wb_ctl->done_wait, !list_empty(&wb_ctl->done_reqs)); err = zram_complete_done_reqs(zram, wb_ctl); From 3bf1c285dc406067eae5b3a7072afad81ad4a4fc Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 26 May 2026 11:27:17 +0900 Subject: [PATCH 276/321] zram: clear trailing bytes of compressed writeback pages Patch series "zram: writeback fixes", v2. Brian (privately) reported a "leak" of writeback bitmap in certain cases, so that the backing device can store fewer pages; and a theoretical data leak in the trailing bytes of compressed writeback pages. Both issues are low risk. This patch (of 2): When compressed writeback is available, written-back pages contain "garbage" in the PAGE_SIZE - obj_size trailing bytes. That "garbage" is, basically, whatever data that page held before we got it for writeback. To take advantage of it, an attacker needs to be able to read from the active backing swap device, which is already catastrophic. Still, just in case, zero out those trailing bytes before writeback to a backing device so that we only store swapped-out data there.
Link: https://lore.kernel.org/20260526022754.2377730-1-senozhatsky@chromium.org Link: https://lore.kernel.org/20260526022754.2377730-3-senozhatsky@chromium.org Fixes: d38fab605c66 ("zram: introduce compressed data writeback") Signed-off-by: Sergey Senozhatsky Suggested-by: Brian Geffon Cc: Jens Axboe Cc: Minchan Kim Cc: Richard Chang Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 602abfe23797..7917fc7a2a29 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2134,6 +2134,8 @@ static int read_from_zspool_raw(struct zram *zram, struct page *page, u32 index) zs_obj_read_end(zram->mem_pool, handle, size, src); zcomp_stream_put(zstrm); + memzero_page(page, size, PAGE_SIZE - size); + return 0; } #endif From 088a2353d714591d2eadb9870767910b9c67b32d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 26 May 2026 20:56:48 +0100 Subject: [PATCH 277/321] mm: remove mentions of PageWriteback Update two comments to refer to writeback in general instead of the specific flag. Convert the large comment in memory.c to be entirely folio-based. Link: https://lore.kernel.org/20260526195650.353196-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/compaction.c | 2 +- mm/memory.c | 20 ++++++++++---------- mm/migrate.c | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 168e63940b78..8f664fb09f24 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1123,7 +1123,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * To minimise LRU disruption, the caller can indicate with * ISOLATE_ASYNC_MIGRATE that it only wants to isolate pages * it will be able to migrate without blocking - clean pages - * for the most part. PageWriteback would require blocking. + * for the most part. 
Writeback would require blocking. */ if ((mode & ISOLATE_ASYNC_MIGRATE) && folio_test_writeback(folio)) goto isolate_fail_put; diff --git a/mm/memory.c b/mm/memory.c index 7c020995eafc..5a365492a9a2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5398,18 +5398,18 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) vm_fault_t ret; /* - * Preallocate pte before we take page_lock because this might lead to - * deadlocks for memcg reclaim which waits for pages under writeback: - * lock_page(A) - * SetPageWriteback(A) - * unlock_page(A) - * lock_page(B) - * lock_page(B) + * Preallocate pte before we take folio lock because this might lead to + * deadlocks for memcg reclaim which waits for folios under writeback: + * folio_lock(A) + * folio_set_writeback(A) + * folio_unlock(A) + * folio_lock(B) + * folio_lock(B) * pte_alloc_one * shrink_folio_list - * wait_on_page_writeback(A) - * SetPageWriteback(B) - * unlock_page(B) + * folio_wait_writeback(A) + * folio_set_writeback(B) + * folio_unlock(B) * # flush A, B to clear the writeback */ if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) { diff --git a/mm/migrate.c b/mm/migrate.c index 0c6a0ab6ecce..d8090cdda4f9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1256,7 +1256,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, if (folio_test_writeback(src)) { /* * Only in the case of a full synchronous migration is it - * necessary to wait for PageWriteback. In the async case, + * necessary to wait for writeback. In the async case, * the retry loop is too short and in the sync-light case, * the overhead of stalling is too much */ From 9c87962f85106a4d330a91b26b054376245f47c0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 26 May 2026 21:00:30 +0100 Subject: [PATCH 278/321] mm: document the folio refcount a little better Expand the documentation of folio_ref_count() to talk about expected, temporary and spurious refcounts as well as the concept of freezing. 
Link: https://lore.kernel.org/20260526200032.353868-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/page_ref.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 94d3f0e71c06..9f5c75d06f76 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -71,6 +71,12 @@ static inline int page_ref_count(const struct page *page) * folio_ref_count - The reference count on this folio. * @folio: The folio. * + * Folios contain a reference count. When that reference count reaches + * zero, the folio is referred to as frozen. At this point, it will + * usually be returned to the memory allocator, but some parts of the + * kernel freeze folios in order to perform unusual operations on them + * such as splitting or migration. + * * The refcount is usually incremented by calls to folio_get() and * decremented by calls to folio_put(). Some typical users of the * folio refcount: @@ -82,6 +88,18 @@ static inline int page_ref_count(const struct page *page) * - Pipes * - Direct IO which references this page in the process address space * + * The reference count has three components: expected, temporary and + * spurious. The expected reference count of a folio is that which + * we would logically expect it to be from just reading the code. + * Temporary refcounts are gained by threads which need a temporary + * reference to make sure the folio isn't reallocated while they use it. + * Spurious refcounts are gained by threads which, thanks to RCU walks + * of the page tables or file cache, find a stale pointer to a folio. + * These threads will drop the refcount after discovering the pointer + * is stale, but it can surprise other users to see the spurious refcount + * on a freshly allocated folio (eg they may see a refcount of 2 instead + * of 1). + * * Return: The number of references to this folio.
*/ static inline int folio_ref_count(const struct folio *folio) From 13f77972b94c51f6e5b94d672025601363440a94 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 26 May 2026 16:42:11 +0200 Subject: [PATCH 279/321] mm/migrate: find_mm_struct: fix race between security checks and suid exec The target task can execute a setuid binary between ptrace_may_access() and get_task_mm(). Protect this critical section with exec_update_lock. I don't think cpuset_mems_allowed(task) should be called under exec_update_lock, but this patch just tries to add the minimal fix. Perhaps we can later add a common helper which can be used by find_mm_struct() and kernel_migrate_pages(). Link: https://lore.kernel.org/ahWxQ3JxdR5ff2qf@redhat.com Signed-off-by: Oleg Nesterov Reviewed-by: Gregory Price Cc: Alistair Popple Cc: Byungchul Park Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Jann Horn Cc: Joshua Hahn Cc: Kees Cook Cc: Matthew Brost Cc: Rakie Kim Cc: Ying Huang Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/migrate.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index d8090cdda4f9..d9b23909d716 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2555,24 +2555,29 @@ static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes) } task = find_get_task_by_vpid(pid); - if (!task) { + if (!task) return ERR_PTR(-ESRCH); - } + if (down_read_killable(&task->signal->exec_update_lock)) { + mm = ERR_PTR(-EINTR); + goto out; + } /* * Check if this process has the right to modify the specified * process. Use the regular "ptrace_may_access()" checks. 
*/ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { mm = ERR_PTR(-EPERM); - goto out; + goto unlock; } mm = ERR_PTR(security_task_movememory(task)); if (IS_ERR(mm)) - goto out; + goto unlock; *mem_nodes = cpuset_mems_allowed(task); mm = get_task_mm(task); +unlock: + up_read(&task->signal->exec_update_lock); out: put_task_struct(task); if (!mm) From 8f42b751e7d7f73dbb2b59281cef891bfb7ae40c Mon Sep 17 00:00:00 2001 From: Brian Masney Date: Thu, 28 May 2026 09:56:14 -0400 Subject: [PATCH 280/321] MAINTAINERS: add vm.rst to memory management core The vm.rst file is currently not listed in the MAINTAINERS file, so let's go ahead and add to the MM core subsystem so that the maintainers are CCed when changes to the documentation are proposed. Link: https://lore.kernel.org/20260528-mm-vm-rst-maintainers-file-v1-1-306631c0a610@redhat.com Signed-off-by: Brian Masney Reviewed-by: Lorenzo Stoakes Reviewed-by: Oscar Salvador (SUSE) Reviewed-by: Liam R. Howlett (Oracle) Reviewed-by: SeongJae Park Acked-by: David Hildenbrand (Arm) Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 48c2265f00a9..ef31a8dd9e5b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16781,6 +16781,7 @@ L: linux-mm@kvack.org S: Maintained W: http://www.linux-mm.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +F: Documentation/admin-guide/sysctl/vm.rst F: include/linux/folio_batch.h F: include/linux/gfp.h F: include/linux/gfp_types.h From a195cf013e98b02d3c129e2cccd7efa98aabf546 Mon Sep 17 00:00:00 2001 From: Brian Masney Date: Thu, 28 May 2026 09:45:10 -0400 Subject: [PATCH 281/321] docs: mm: clarify that user_reserve_kbytes has no effect when overcommit_memory is set to 0 or 1 Looking at __vm_enough_memory() in mm/util.c, user_reserve_kbytes has no effect when overcommit_memory is set to 0 or 1. The documentation for overcommit_memory already references user_reserve_kbytes when the flag is set to 2. 
Let's go ahead and add a clarification to user_reserve_kbytes in vm.rst that it has no effect when overcommit_memory is set to 0 or 1. Link: https://lore.kernel.org/20260528-mm-clarify-docs-v1-1-aa88e83b4bfd@redhat.com Signed-off-by: Brian Masney Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/sysctl/vm.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 97e12359775c..b9b0c218bfb4 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -1034,6 +1034,8 @@ min(3% of current process size, user_reserve_kbytes) of free memory. This is intended to prevent a user from starting a single memory hogging process, such that they cannot recover (kill the hog). +This setting has no effect when overcommit_memory is set to 0 or 1. + user_reserve_kbytes defaults to min(3% of the current process size, 128MB). If this is reduced to zero, then the user will be allowed to allocate From f5cf8c92a2b9fd176d90b6e217ed50fbb5d1f48d Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Fri, 29 May 2026 13:27:54 -0700 Subject: [PATCH 282/321] mm/nodemask: correctly describe nodemask operation return types Commit 0dfe54071d7c8 ("nodemask: Fix return values to be unsigned") changed a number of nodemask operations that used to return int to returning a bool instead. However, it did not update the comment block that described these functions, leaving the documentation incorrect. Fix the comment block to accurately describe the functions. Also fix a typo (unsigend --> unsigned), and fix a callsite in mempolicy.c that did not get updated during the conversion. No functional changes intended; changes are purely cosmetic. 
Link: https://lore.kernel.org/20260529202755.1846800-1-joshua.hahnjy@gmail.com Signed-off-by: Joshua Hahn Reviewed-by: SeongJae Park Cc: Alistair Popple Cc: Byungchul Park Cc: David Hildenbrand Cc: Gregory Price Cc: Matthew Brost Cc: Rakie Kim Cc: Rasmus Villemoes Cc: Ying Huang Cc: Yury Norov (NVIDIA) Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/nodemask.h | 18 +++++++++--------- mm/mempolicy.c | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 204c92462f3c..b842aa525546 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -24,23 +24,23 @@ * void nodes_setall(mask) set all bits * void nodes_clear(mask) clear all bits * int node_isset(node, mask) true iff bit 'node' set in mask - * int node_test_and_set(node, mask) test and set bit 'node' in mask + * bool node_test_and_set(node, mask) test and set bit 'node' in mask * - * void nodes_and(dst, src1, src2) dst = src1 & src2 [intersection] + * bool nodes_and(dst, src1, src2) dst = src1 & src2 [intersection] * void nodes_or(dst, src1, src2) dst = src1 | src2 [union] * void nodes_xor(dst, src1, src2) dst = src1 ^ src2 - * void nodes_andnot(dst, src1, src2) dst = src1 & ~src2 + * bool nodes_andnot(dst, src1, src2) dst = src1 & ~src2 * void nodes_complement(dst, src) dst = ~src * - * int nodes_equal(mask1, mask2) Does mask1 == mask2? - * int nodes_intersects(mask1, mask2) Do mask1 and mask2 intersect? - * int nodes_subset(mask1, mask2) Is mask1 a subset of mask2? - * int nodes_empty(mask) Is mask empty (no bits sets)? - * int nodes_full(mask) Is mask full (all bits sets)? + * bool nodes_equal(mask1, mask2) Does mask1 == mask2? + * bool nodes_intersects(mask1, mask2) Do mask1 and mask2 intersect? + * bool nodes_subset(mask1, mask2) Is mask1 a subset of mask2? + * bool nodes_empty(mask) Is mask empty (no bits sets)? + * bool nodes_full(mask) Is mask full (all bits sets)? 
* int nodes_weight(mask) Hamming weight - number of set bits * * unsigned int first_node(mask) Number lowest set bit, or MAX_NUMNODES - * unsigend int next_node(node, mask) Next node past 'node', or MAX_NUMNODES + * unsigned int next_node(node, mask) Next node past 'node', or MAX_NUMNODES * unsigned int next_node_in(node, mask) Next node past 'node', or wrap to first, * or MAX_NUMNODES * unsigned int first_unset_node(mask) First node not set in mask, or diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4e4421b22b59..36699fabd3c2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2865,7 +2865,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_WEIGHTED_INTERLEAVE: - return !!nodes_equal(a->nodes, b->nodes); + return nodes_equal(a->nodes, b->nodes); case MPOL_LOCAL: return true; default: From 79a031583ca5bb3d5484178f579fea97706f1ed6 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 27 May 2026 16:45:08 -0400 Subject: [PATCH 283/321] mm: list_lru: fix set_shrinker_bit() call during race with cgroup deletion Patch series "mm: switch THP shrinker to list_lru", v5. The open-coded deferred split queue has issues. It's not NUMA-aware (when cgroup is enabled), and it's more complicated in the callsites interacting with it. Switching to list_lru fixes the NUMA problem and streamlines things. It also simplifies planned shrinker work. Patch 1 fixes a pre-existing list_lru bug where the shrinker bit is set on the caller's memcg rather than the ancestor whose sublist the item actually lands on after a walk-up. Standalone, backportable; the rest of the series depends on it. Patches 2-5 are cleanups and small refactors in list_lru code. They're basically independent, but make the THP shrinker conversion easier. Patch 6 extends the list_lru API to allow the caller to control the locking scope. The THP shrinker has private state it needs to keep synchronized with the LRU state. 
Patch 7 extends the list_lru API with a convenience helper to do list_lru head allocation (memcg_list_lru_alloc) when coming from a folio. Anon THPs are instantiated in several places, and with the folio reparenting patches pending, folio_memcg() access is now a more delicate dance. This avoids having to replicate that dance everywhere. Patch 8 flattens the alloc_anon_folio() retry loop so the next patch's list_lru hook lands as a clean addition rather than nested deep inside an if (folio) block. Patch 9 finally switches the deferred_split_queue to list_lru. This patch (of 9): When list_lru_add() races with cgroup deletion, the shrinker bit is set on the wrong group and lost. This can cause a shrinker run to miss the cgroup that actually has the object. When the passed in memcg is dead, the function finds the first non-dead parent from the passed in memcg and adds the object there; but the shrinker bit is set on the memcg that was passed in. This bug is as old as the shrinker bitmap itself. Fix it by returning the "effective" memcg from the locking function, and have the caller use that. Link: https://lore.kernel.org/20260527204757.2544958-1-hannes@cmpxchg.org Link: https://lore.kernel.org/20260527204757.2544958-2-hannes@cmpxchg.org Fixes: fae91d6d8be5 ("mm/list_lru.c: set bit in memcg shrinker bitmap on first list_lru item appearance") Signed-off-by: Johannes Weiner Reported-by: Usama Arif Reported-by: Sashiko Acked-by: Usama Arif Reviewed-by: Wei Yang Cc: Baolin Wang Cc: Barry Song Cc: Dave Chinner Cc: David Hildenbrand Cc: Dev Jain Cc: Kairui Song Cc: Lance Yang Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mikhail Zaslonko Cc: Muchun Song Cc: Nico Pache Cc: Roman Gushchin Cc: Ryan Roberts Cc: Shakeel Butt Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/list_lru.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index dd29bcf8eb5f..45d1b97737ea 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -77,14 +77,14 @@ static inline bool lock_list_lru(struct list_lru_one *l, bool irq) } static inline struct list_lru_one * -lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, - bool irq, bool skip_empty) +lock_list_lru_of_memcg(struct list_lru *lru, int nid, + struct mem_cgroup **memcg, bool irq, bool skip_empty) { struct list_lru_one *l; rcu_read_lock(); again: - l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); + l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(*memcg)); if (likely(l) && lock_list_lru(l, irq)) { rcu_read_unlock(); return l; @@ -97,8 +97,8 @@ again: rcu_read_unlock(); return NULL; } - VM_WARN_ON(!css_is_dying(&memcg->css)); - memcg = parent_mem_cgroup(memcg); + VM_WARN_ON(!css_is_dying(&(*memcg)->css)); + *memcg = parent_mem_cgroup(*memcg); goto again; } @@ -135,8 +135,8 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) } static inline struct list_lru_one * -lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, - bool irq, bool skip_empty) +lock_list_lru_of_memcg(struct list_lru *lru, int nid, + struct mem_cgroup **memcg, bool irq, bool skip_empty) { struct list_lru_one *l = &lru->node[nid].lru; @@ -164,12 +164,16 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; - l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); + l = lock_list_lru_of_memcg(lru, nid, &memcg, false, false); if (!l) return false; if 
(list_empty(item)) { list_add_tail(item, &l->list); - /* Set shrinker bit if the first element was added */ + /* + * Set shrinker bit on the memcg that owns the locked + * sublist - lock_list_lru_of_memcg() may have walked up + * past a dying memcg, and the bit must be set there. + */ if (!l->nr_items++) set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); unlock_list_lru(l, false); @@ -204,7 +208,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; - l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); + l = lock_list_lru_of_memcg(lru, nid, &memcg, false, false); if (!l) return false; if (!list_empty(item)) { @@ -288,7 +292,7 @@ __list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, unsigned long isolated = 0; restart: - l = lock_list_lru_of_memcg(lru, nid, memcg, irq_off, true); + l = lock_list_lru_of_memcg(lru, nid, &memcg, irq_off, true); if (!l) return isolated; list_for_each_safe(item, n, &l->list) { From 1923b1d76b964adc055b5a4bd877dda50550298f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 27 May 2026 16:45:09 -0400 Subject: [PATCH 284/321] mm: list_lru: lock_list_lru_of_memcg() cannot return NULL if !skip_empty skip_empty is only for the shrinker to abort and skip a list that's empty or whose cgroup is being deleted. For list additions and deletions, the cgroup hierarchy is walked upwards until a valid list_lru head is found, or it will fall back to the node list. Acquiring the lock won't fail. Remove the NULL checks in those callers. Link: https://lore.kernel.org/20260527204757.2544958-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: David Hildenbrand (Arm) Acked-by: Shakeel Butt Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Liam R. 
Howlett (Oracle) Cc: Baolin Wang Cc: Barry Song Cc: Dave Chinner Cc: Dev Jain Cc: Kairui Song Cc: Lance Yang Cc: Michal Hocko Cc: Mikhail Zaslonko Cc: Muchun Song Cc: Nico Pache Cc: Roman Gushchin Cc: Ryan Roberts Cc: Usama Arif Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/list_lru.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index 45d1b97737ea..77999ed78fa5 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -165,8 +165,6 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, struct list_lru_one *l; l = lock_list_lru_of_memcg(lru, nid, &memcg, false, false); - if (!l) - return false; if (list_empty(item)) { list_add_tail(item, &l->list); /* @@ -208,9 +206,8 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; + l = lock_list_lru_of_memcg(lru, nid, &memcg, false, false); - if (!l) - return false; if (!list_empty(item)) { list_del_init(item); l->nr_items--; From 82d8bca1c715e9ed31eaeb5197a0ba00bf8be597 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 27 May 2026 16:45:10 -0400 Subject: [PATCH 285/321] mm: list_lru: deduplicate unlock_list_lru() The MEMCG and !MEMCG variants are the same. lock_list_lru() has the same pattern when bailing. Consolidate into a common implementation. Link: https://lore.kernel.org/20260527204757.2544958-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: David Hildenbrand (Arm) Acked-by: Shakeel Butt Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Liam R. 
Howlett (Oracle) Cc: Baolin Wang Cc: Barry Song Cc: Dave Chinner Cc: Dev Jain Cc: Kairui Song Cc: Lance Yang Cc: Michal Hocko Cc: Mikhail Zaslonko Cc: Muchun Song Cc: Nico Pache Cc: Roman Gushchin Cc: Ryan Roberts Cc: Usama Arif Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/list_lru.c | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index 77999ed78fa5..5497034e80f3 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -15,6 +15,14 @@ #include "slab.h" #include "internal.h" +static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) +{ + if (irq_off) + spin_unlock_irq(&l->lock); + else + spin_unlock(&l->lock); +} + #ifdef CONFIG_MEMCG static LIST_HEAD(memcg_list_lrus); static DEFINE_MUTEX(list_lrus_mutex); @@ -67,10 +75,7 @@ static inline bool lock_list_lru(struct list_lru_one *l, bool irq) else spin_lock(&l->lock); if (unlikely(READ_ONCE(l->nr_items) == LONG_MIN)) { - if (irq) - spin_unlock_irq(&l->lock); - else - spin_unlock(&l->lock); + unlock_list_lru(l, irq); return false; } return true; @@ -101,14 +106,6 @@ again: *memcg = parent_mem_cgroup(*memcg); goto again; } - -static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) -{ - if (irq_off) - spin_unlock_irq(&l->lock); - else - spin_unlock(&l->lock); -} #else static void list_lru_register(struct list_lru *lru) { @@ -147,14 +144,6 @@ lock_list_lru_of_memcg(struct list_lru *lru, int nid, return l; } - -static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) -{ - if (irq_off) - spin_unlock_irq(&l->lock); - else - spin_unlock(&l->lock); -} #endif /* CONFIG_MEMCG */ /* The caller must ensure the memcg lifetime. 
*/ From 8b98cfe2c52d7492a024b655e0978545845646cb Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 27 May 2026 16:45:11 -0400 Subject: [PATCH 286/321] mm: list_lru: move list dead check to lock_list_lru_of_memcg() Only the MEMCG variant of lock_list_lru() needs to check if there is a race with cgroup deletion and list reparenting. Move the check to the caller, so that the next patch can unify the lock_list_lru() variants. Link: https://lore.kernel.org/20260527204757.2544958-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: David Hildenbrand (Arm) Acked-by: Shakeel Butt Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Liam R. Howlett (Oracle) Cc: Baolin Wang Cc: Barry Song Cc: Dave Chinner Cc: Dev Jain Cc: Kairui Song Cc: Lance Yang Cc: Michal Hocko Cc: Mikhail Zaslonko Cc: Muchun Song Cc: Nico Pache Cc: Roman Gushchin Cc: Ryan Roberts Cc: Usama Arif Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/list_lru.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index 5497034e80f3..7d0523e44010 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -68,17 +68,12 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) return &lru->node[nid].lru; } -static inline bool lock_list_lru(struct list_lru_one *l, bool irq) +static inline void lock_list_lru(struct list_lru_one *l, bool irq) { if (irq) spin_lock_irq(&l->lock); else spin_lock(&l->lock); - if (unlikely(READ_ONCE(l->nr_items) == LONG_MIN)) { - unlock_list_lru(l, irq); - return false; - } - return true; } static inline struct list_lru_one * @@ -90,9 +85,13 @@ lock_list_lru_of_memcg(struct list_lru *lru, int nid, rcu_read_lock(); again: l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(*memcg)); - if (likely(l) && lock_list_lru(l, irq)) { - rcu_read_unlock(); - return l; + if (likely(l)) { + lock_list_lru(l, irq); + if (likely(READ_ONCE(l->nr_items) != LONG_MIN)) { + rcu_read_unlock(); + 
return l; + } + unlock_list_lru(l, irq); } /* * Caller may simply bail out if raced with reparenting or From bc7adb3b3f6ad3f1cf8a030be0034f61e7580fe4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 27 May 2026 16:45:12 -0400 Subject: [PATCH 287/321] mm: list_lru: deduplicate lock_list_lru() The MEMCG and !MEMCG paths have the same pattern. Share the code. Link: https://lore.kernel.org/20260527204757.2544958-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: David Hildenbrand (Arm) Acked-by: Shakeel Butt Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Liam R. Howlett (Oracle) Cc: Baolin Wang Cc: Barry Song Cc: Dave Chinner Cc: Dev Jain Cc: Kairui Song Cc: Lance Yang Cc: Michal Hocko Cc: Mikhail Zaslonko Cc: Muchun Song Cc: Nico Pache Cc: Roman Gushchin Cc: Ryan Roberts Cc: Usama Arif Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/list_lru.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index 7d0523e44010..fdb3fe2ea64f 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -15,6 +15,14 @@ #include "slab.h" #include "internal.h" +static inline void lock_list_lru(struct list_lru_one *l, bool irq) +{ + if (irq) + spin_lock_irq(&l->lock); + else + spin_lock(&l->lock); +} + static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) { if (irq_off) @@ -68,14 +76,6 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) return &lru->node[nid].lru; } -static inline void lock_list_lru(struct list_lru_one *l, bool irq) -{ - if (irq) - spin_lock_irq(&l->lock); - else - spin_lock(&l->lock); -} - static inline struct list_lru_one * lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup **memcg, bool irq, bool skip_empty) @@ -136,10 +136,7 @@ lock_list_lru_of_memcg(struct list_lru *lru, int nid, { struct list_lru_one *l = &lru->node[nid].lru; - if (irq) - spin_lock_irq(&l->lock); - else - spin_lock(&l->lock); 
+ lock_list_lru(l, irq); return l; } From 1479b44c7203b2cad3393533c64aa16d42056310 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 27 May 2026 16:45:13 -0400 Subject: [PATCH 288/321] mm: list_lru: introduce caller locking for additions and deletions Locking is currently internal to the list_lru API. However, a caller might want to keep auxiliary state synchronized with the LRU state. For example, the THP shrinker uses the lock of its custom LRU to keep PG_partially_mapped and vmstats consistent. To allow the THP shrinker to switch to list_lru, provide normal and irqsafe locking primitives as well as caller-locked variants of the addition and deletion functions. Link: https://lore.kernel.org/20260527204757.2544958-7-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: David Hildenbrand (Arm) Acked-by: Shakeel Butt Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Liam R. Howlett (Oracle) Cc: Baolin Wang Cc: Barry Song Cc: Dave Chinner Cc: Dev Jain Cc: Kairui Song Cc: Lance Yang Cc: Michal Hocko Cc: Mikhail Zaslonko Cc: Muchun Song Cc: Nico Pache Cc: Roman Gushchin Cc: Ryan Roberts Cc: Usama Arif Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 43 +++++++++++++ mm/list_lru.c | 133 ++++++++++++++++++++++++++++++--------- 2 files changed, 145 insertions(+), 31 deletions(-) diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index fe739d35a864..134cb3e5652a 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -83,6 +83,46 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, gfp_t gfp); void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent); +/** + * list_lru_lock: lock the sublist for the given node and memcg + * @lru: the lru pointer + * @nid: the node id of the sublist to lock. + * @memcg: pointer to the cgroup of the sublist to lock. 
On return, + * updated to the cgroup whose sublist was actually locked, + * which may be an ancestor if the original memcg was dying. + * + * Returns the locked list_lru_one sublist. The caller must call + * list_lru_unlock() when done. + * + * You must ensure that the memcg is not freed during this call (e.g., with + * rcu or by taking a css refcnt). + * + * Return: the locked list_lru_one, or NULL on failure + */ +struct list_lru_one *list_lru_lock(struct list_lru *lru, int nid, + struct mem_cgroup **memcg); + +/** + * list_lru_unlock: unlock a sublist locked by list_lru_lock() + * @l: the list_lru_one to unlock + */ +void list_lru_unlock(struct list_lru_one *l); + +struct list_lru_one *list_lru_lock_irq(struct list_lru *lru, int nid, + struct mem_cgroup **memcg); +void list_lru_unlock_irq(struct list_lru_one *l); + +struct list_lru_one *list_lru_lock_irqsave(struct list_lru *lru, int nid, + struct mem_cgroup **memcg, unsigned long *irq_flags); +void list_lru_unlock_irqrestore(struct list_lru_one *l, + unsigned long *irq_flags); + +/* Caller-locked variants, see list_lru_add() etc for documentation */ +bool __list_lru_add(struct list_lru *lru, struct list_lru_one *l, + struct list_head *item, int nid, struct mem_cgroup *memcg); +bool __list_lru_del(struct list_lru *lru, struct list_lru_one *l, + struct list_head *item, int nid); + /** * list_lru_add: add an element to the lru list's tail * @lru: the lru pointer @@ -115,6 +155,9 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg); +bool list_lru_add_irq(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg); + /** * list_lru_add_obj: add an element to the lru list's tail * @lru: the lru pointer diff --git a/mm/list_lru.c b/mm/list_lru.c index fdb3fe2ea64f..402bb028114d 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -15,17 +15,23 @@ #include "slab.h" 
#include "internal.h" -static inline void lock_list_lru(struct list_lru_one *l, bool irq) +static inline void lock_list_lru(struct list_lru_one *l, bool irq, + unsigned long *irq_flags) { - if (irq) + if (irq_flags) + spin_lock_irqsave(&l->lock, *irq_flags); + else if (irq) spin_lock_irq(&l->lock); else spin_lock(&l->lock); } -static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) +static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off, + unsigned long *irq_flags) { - if (irq_off) + if (irq_flags) + spin_unlock_irqrestore(&l->lock, *irq_flags); + else if (irq_off) spin_unlock_irq(&l->lock); else spin_unlock(&l->lock); @@ -78,7 +84,8 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) static inline struct list_lru_one * lock_list_lru_of_memcg(struct list_lru *lru, int nid, - struct mem_cgroup **memcg, bool irq, bool skip_empty) + struct mem_cgroup **memcg, bool irq, + unsigned long *irq_flags, bool skip_empty) { struct list_lru_one *l; @@ -86,12 +93,12 @@ lock_list_lru_of_memcg(struct list_lru *lru, int nid, again: l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(*memcg)); if (likely(l)) { - lock_list_lru(l, irq); + lock_list_lru(l, irq, irq_flags); if (likely(READ_ONCE(l->nr_items) != LONG_MIN)) { rcu_read_unlock(); return l; } - unlock_list_lru(l, irq); + unlock_list_lru(l, irq, irq_flags); } /* * Caller may simply bail out if raced with reparenting or @@ -132,24 +139,58 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) static inline struct list_lru_one * lock_list_lru_of_memcg(struct list_lru *lru, int nid, - struct mem_cgroup **memcg, bool irq, bool skip_empty) + struct mem_cgroup **memcg, bool irq, + unsigned long *irq_flags, bool skip_empty) { struct list_lru_one *l = &lru->node[nid].lru; - lock_list_lru(l, irq); + lock_list_lru(l, irq, irq_flags); return l; } #endif /* CONFIG_MEMCG */ -/* The caller must ensure the memcg lifetime. 
*/ -bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, - struct mem_cgroup *memcg) +struct list_lru_one *list_lru_lock(struct list_lru *lru, int nid, + struct mem_cgroup **memcg) { - struct list_lru_node *nlru = &lru->node[nid]; - struct list_lru_one *l; + return lock_list_lru_of_memcg(lru, nid, memcg, /*irq=*/false, + /*irq_flags=*/NULL, /*skip_empty=*/false); +} - l = lock_list_lru_of_memcg(lru, nid, &memcg, false, false); +void list_lru_unlock(struct list_lru_one *l) +{ + unlock_list_lru(l, /*irq_off=*/false, /*irq_flags=*/NULL); +} + +struct list_lru_one *list_lru_lock_irq(struct list_lru *lru, int nid, + struct mem_cgroup **memcg) +{ + return lock_list_lru_of_memcg(lru, nid, memcg, /*irq=*/true, + /*irq_flags=*/NULL, /*skip_empty=*/false); +} + +void list_lru_unlock_irq(struct list_lru_one *l) +{ + unlock_list_lru(l, /*irq_off=*/true, /*irq_flags=*/NULL); +} + +struct list_lru_one *list_lru_lock_irqsave(struct list_lru *lru, int nid, + struct mem_cgroup **memcg, + unsigned long *flags) +{ + return lock_list_lru_of_memcg(lru, nid, memcg, /*irq=*/true, + /*irq_flags=*/flags, /*skip_empty=*/false); +} + +void list_lru_unlock_irqrestore(struct list_lru_one *l, unsigned long *flags) +{ + unlock_list_lru(l, /*irq_off=*/true, /*irq_flags=*/flags); +} + +bool __list_lru_add(struct list_lru *lru, struct list_lru_one *l, + struct list_head *item, int nid, + struct mem_cgroup *memcg) +{ if (list_empty(item)) { list_add_tail(item, &l->list); /* @@ -159,15 +200,50 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, */ if (!l->nr_items++) set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); - unlock_list_lru(l, false); - atomic_long_inc(&nlru->nr_items); + atomic_long_inc(&lru->node[nid].nr_items); return true; } - unlock_list_lru(l, false); return false; } EXPORT_SYMBOL_GPL(list_lru_add); +bool __list_lru_del(struct list_lru *lru, struct list_lru_one *l, + struct list_head *item, int nid) +{ + if (!list_empty(item)) { + 
list_del_init(item); + l->nr_items--; + atomic_long_dec(&lru->node[nid].nr_items); + return true; + } + return false; +} + +/* The caller must ensure the memcg lifetime. */ +bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg) +{ + struct list_lru_one *l; + bool ret; + + l = list_lru_lock(lru, nid, &memcg); + ret = __list_lru_add(lru, l, item, nid, memcg); + list_lru_unlock(l); + return ret; +} + +bool list_lru_add_irq(struct list_lru *lru, struct list_head *item, + int nid, struct mem_cgroup *memcg) +{ + struct list_lru_one *l; + bool ret; + + l = list_lru_lock_irq(lru, nid, &memcg); + ret = __list_lru_add(lru, l, item, nid, memcg); + list_lru_unlock_irq(l); + return ret; +} + bool list_lru_add_obj(struct list_lru *lru, struct list_head *item) { bool ret; @@ -189,19 +265,13 @@ EXPORT_SYMBOL_GPL(list_lru_add_obj); bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) { - struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; + bool ret; - l = lock_list_lru_of_memcg(lru, nid, &memcg, false, false); - if (!list_empty(item)) { - list_del_init(item); - l->nr_items--; - unlock_list_lru(l, false); - atomic_long_dec(&nlru->nr_items); - return true; - } - unlock_list_lru(l, false); - return false; + l = list_lru_lock(lru, nid, &memcg); + ret = __list_lru_del(lru, l, item, nid); + list_lru_unlock(l); + return ret; } bool list_lru_del_obj(struct list_lru *lru, struct list_head *item) @@ -274,7 +344,8 @@ __list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, unsigned long isolated = 0; restart: - l = lock_list_lru_of_memcg(lru, nid, &memcg, irq_off, true); + l = lock_list_lru_of_memcg(lru, nid, &memcg, /*irq=*/irq_off, + /*irq_flags=*/NULL, /*skip_empty=*/true); if (!l) return isolated; list_for_each_safe(item, n, &l->list) { @@ -315,7 +386,7 @@ restart: BUG(); } } - unlock_list_lru(l, irq_off); + unlock_list_lru(l, irq_off, NULL); out: return 
isolated; } From ae64f07a6a4018c73111b3cd4e1c5598ce5cfa84 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 27 May 2026 16:45:14 -0400 Subject: [PATCH 289/321] mm: list_lru: introduce folio_memcg_list_lru_alloc() memcg_list_lru_alloc() is called every time an object that may end up on the list_lru is created. It needs to quickly check if the list_lru heads for the memcg already exist, and allocate them when they don't. Doing this with folio objects is tricky: folio_memcg() is not stable and requires either RCU protection or pinning the cgroup. But it's desirable to make the existence check lightweight under RCU, and only pin the memcg when we need to allocate list_lru heads and may block. In preparation for switching the THP shrinker to list_lru, add a helper function for allocating list_lru heads coming from a folio. Link: https://lore.kernel.org/20260527204757.2544958-8-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: David Hildenbrand (Arm) Acked-by: Shakeel Butt Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Baolin Wang Cc: Barry Song Cc: Dave Chinner Cc: Dev Jain Cc: Kairui Song Cc: Lance Yang Cc: Liam R. 
Howlett Cc: Michal Hocko Cc: Mikhail Zaslonko Cc: Muchun Song Cc: Nico Pache Cc: Roman Gushchin Cc: Ryan Roberts Cc: Usama Arif Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 27 +++++++++++++++++++++++++++ mm/list_lru.c | 39 ++++++++++++++++++++++++++++++++++----- 2 files changed, 61 insertions(+), 5 deletions(-) diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 134cb3e5652a..a450fffe1550 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -81,6 +81,33 @@ static inline int list_lru_init_memcg_key(struct list_lru *lru, struct shrinker int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, gfp_t gfp); + +#ifdef CONFIG_MEMCG +/** + * folio_memcg_list_lru_alloc - allocate list_lru heads for shrinkable folio + * @folio: the newly allocated & charged folio + * @lru: the list_lru this might be queued on + * @gfp: gfp mask + * + * Allocate list_lru heads (per-memcg, per-node) needed to queue this + * particular folio down the line. + * + * This does memcg_list_lru_alloc(), but on the memcg that @folio is + * associated with. Handles folio_memcg() access rules in the fast + * path (list_lru heads allocated) and the allocation slowpath. + * + * Returns 0 on success, a negative error value otherwise. 
+ */ +int folio_memcg_list_lru_alloc(struct folio *folio, struct list_lru *lru, + gfp_t gfp); +#else +static inline int folio_memcg_list_lru_alloc(struct folio *folio, + struct list_lru *lru, gfp_t gfp) +{ + return 0; +} +#endif + void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent); /** diff --git a/mm/list_lru.c b/mm/list_lru.c index 402bb028114d..41a811966063 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -568,17 +568,14 @@ static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg, return idx < 0 || xa_load(&lru->xa, idx); } -int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, - gfp_t gfp) +static int __memcg_list_lru_alloc(struct mem_cgroup *memcg, + struct list_lru *lru, gfp_t gfp) { unsigned long flags; struct list_lru_memcg *mlru = NULL; struct mem_cgroup *pos, *parent; XA_STATE(xas, &lru->xa, 0); - if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru)) - return 0; - gfp &= GFP_RECLAIM_MASK; /* * Because the list_lru can be reparented to the parent cgroup's @@ -619,6 +616,38 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, return xas_error(&xas); } + +int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, + gfp_t gfp) +{ + if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru)) + return 0; + return __memcg_list_lru_alloc(memcg, lru, gfp); +} + +int folio_memcg_list_lru_alloc(struct folio *folio, struct list_lru *lru, + gfp_t gfp) +{ + struct mem_cgroup *memcg; + int res; + + if (!list_lru_memcg_aware(lru)) + return 0; + + /* Fast path when list_lru heads already exist */ + rcu_read_lock(); + memcg = folio_memcg(folio); + res = memcg_list_lru_allocated(memcg, lru); + rcu_read_unlock(); + if (likely(res)) + return 0; + + /* Allocation may block, pin the memcg */ + memcg = get_mem_cgroup_from_folio(folio); + res = __memcg_list_lru_alloc(memcg, lru, gfp); + mem_cgroup_put(memcg); + return res; +} #else static inline void 
memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { From 65180e9663c782e45ed1c76276dc64d96615da9d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 27 May 2026 16:45:15 -0400 Subject: [PATCH 290/321] mm: memory: flatten alloc_anon_folio() retry loop alloc_anon_folio() uses a top-level if (folio) that buries the success path four levels deep. This makes for awkward long lines and wrapping. The next patch will add more code here, so flatten this now to keep things clean and simple. The next label is already there, use it for !folio. No functional change intended. Link: https://lore.kernel.org/20260527204757.2544958-9-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Lorenzo Stoakes (Oracle) Acked-by: Usama Arif Acked-by: Shakeel Butt Reviewed-by: Dev Jain Cc: Baolin Wang Cc: Barry Song Cc: Dave Chinner Cc: David Hildenbrand (Arm) Cc: Kairui Song Cc: Lance Yang Cc: Liam R. Howlett Cc: Michal Hocko Cc: Mikhail Zaslonko Cc: Muchun Song Cc: Nico Pache Cc: Roman Gushchin Cc: Ryan Roberts Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 5a365492a9a2..1d8e09d9b3c9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5215,24 +5215,24 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); folio = vma_alloc_folio(gfp, order, vma, addr); - if (folio) { - if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { - count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); - folio_put(folio); - goto next; - } - folio_throttle_swaprate(folio, gfp); - /* - * When a folio is not zeroed during allocation - * (__GFP_ZERO not used) or user folios require special - * handling, folio_zero_user() is used to make sure - * that the page corresponding to the faulting address - * will be hot in the cache after zeroing. 
- */ - if (user_alloc_needs_zeroing()) - folio_zero_user(folio, vmf->address); - return folio; + if (!folio) + goto next; + if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); + folio_put(folio); + goto next; } + folio_throttle_swaprate(folio, gfp); + /* + * When a folio is not zeroed during allocation + * (__GFP_ZERO not used) or user folios require special + * handling, folio_zero_user() is used to make sure + * that the page corresponding to the faulting address + * will be hot in the cache after zeroing. + */ + if (user_alloc_needs_zeroing()) + folio_zero_user(folio, vmf->address); + return folio; next: count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); order = next_order(&orders, order); From fafaeceb89a5e2e856ff04c2cacb6cae4a2ecb67 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 27 May 2026 16:45:16 -0400 Subject: [PATCH 291/321] mm: switch deferred split shrinker to list_lru The deferred split queue handles cgroups in a suboptimal fashion. The queue is per-NUMA node or per-cgroup, not the intersection. That means on a cgrouped system, a node-restricted allocation entering reclaim can end up splitting large pages on other nodes: alloc/unmap deferred_split_folio() list_add_tail(memcg->split_queue) set_shrinker_bit(memcg, node, deferred_shrinker_id) for_each_zone_zonelist_nodemask(restricted_nodes) mem_cgroup_iter() shrink_slab(node, memcg) shrink_slab_memcg(node, memcg) if test_shrinker_bit(memcg, node, deferred_shrinker_id) deferred_split_scan() walks memcg->split_queue The shrinker bit adds an imperfect guard rail. As soon as the cgroup has a single large page on the node of interest, all large pages owned by that memcg, including those on other nodes, will be split. list_lru properly sets up per-node, per-cgroup lists. As a bonus, it streamlines a lot of the list operations and reclaim walks. It's used widely by other major shrinkers already. Convert the deferred split queue as well. 
The list_lru per-memcg heads are instantiated on demand when the first object of interest is allocated for a cgroup, by calling folio_memcg_alloc_deferred(). Add calls to where splittable pages are created: anon faults, swapin faults, khugepaged collapse. These calls create all possible node heads for the cgroup at once, so the migration code (between nodes) doesn't need any special care. [akpm@linux-foundation.org: fix build with CONFIG_TRANSPARENT_HUGEPAGE=n] Link: https://lore.kernel.org/202605281620.lc3rtkBm-lkp@intel.com [hannes@cmpxchg.org: fix cgroup.memory=nokmem handling] Link: https://lore.kernel.org/ah9PGv12mqai84ES@cmpxchg.org Link: https://lore.kernel.org/20260527204757.2544958-10-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Mikhail Zaslonko Tested-by: Mikhail Zaslonko Acked-by: Shakeel Butt Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: Usama Arif Reviewed-by: Kairui Song Cc: Baolin Wang Cc: Barry Song Cc: Dave Chinner Cc: David Hildenbrand (Arm) Cc: Dev Jain Cc: Lance Yang Cc: Liam R. 
Howlett Cc: Michal Hocko Cc: Muchun Song Cc: Nico Pache Cc: Roman Gushchin Cc: Ryan Roberts Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Zi Yan Cc: kernel test robot Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 17 +- include/linux/memcontrol.h | 4 - include/linux/mmzone.h | 12 -- mm/huge_memory.c | 367 +++++++++++++------------------------ mm/internal.h | 2 +- mm/khugepaged.c | 5 + mm/memcontrol.c | 12 +- mm/memory.c | 4 + mm/mm_init.c | 15 -- mm/swap_state.c | 10 + 10 files changed, 160 insertions(+), 288 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 58382e97a66d..c0d223d0c556 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -439,10 +439,10 @@ static inline int split_huge_page(struct page *page) { return split_huge_page_to_list_to_order(page, NULL, 0); } + +int folio_memcg_alloc_deferred(struct folio *folio); + void deferred_split_folio(struct folio *folio, bool partially_mapped); -#ifdef CONFIG_MEMCG -void reparent_deferred_split_queue(struct mem_cgroup *memcg); -#endif void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze); @@ -679,8 +679,15 @@ static inline int try_folio_split_to_order(struct folio *folio, return -EINVAL; } -static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {} -static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {} +static inline int folio_memcg_alloc_deferred(struct folio *folio) +{ + return 0; +} + +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) +{ +} + #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8f2662db166b..e1f46a0016fc 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -278,10 +278,6 @@ struct mem_cgroup { struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT]; #endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - struct 
deferred_split deferred_split_queue; -#endif - #ifdef CONFIG_LRU_GEN_WALKS_MMU /* per-memcg mm_struct list */ struct lru_gen_mm_list mm_list; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1331a7b93f33..8e449f524f26 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1431,14 +1431,6 @@ struct zonelist { */ extern struct page *mem_map; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -struct deferred_split { - spinlock_t split_queue_lock; - struct list_head split_queue; - unsigned long split_queue_len; -}; -#endif - #ifdef CONFIG_MEMORY_FAILURE /* * Per NUMA node memory failure handling statistics. @@ -1564,10 +1556,6 @@ typedef struct pglist_data { unsigned long first_deferred_pfn; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - struct deferred_split deferred_split_queue; -#endif - #ifdef CONFIG_NUMA_BALANCING /* start time in ms of current promote rate limit period */ unsigned int nbp_rl_start; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1f14c5c48b4a..6927f66b2eb2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -67,6 +68,8 @@ unsigned long transparent_hugepage_flags __read_mostly = (1<count_objects = deferred_split_count; deferred_split_shrinker->scan_objects = deferred_split_scan; shrinker_register(deferred_split_shrinker); @@ -962,6 +978,7 @@ static int __init thp_shrinker_init(void) huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero"); if (!huge_zero_folio_shrinker) { shrinker_free(deferred_split_shrinker); + list_lru_destroy(&deferred_split_lru); return -ENOMEM; } @@ -976,6 +993,7 @@ static void __init thp_shrinker_exit(void) { shrinker_free(huge_zero_folio_shrinker); shrinker_free(deferred_split_shrinker); + list_lru_destroy(&deferred_split_lru); } static int __init hugepage_init(void) @@ -1155,119 +1173,6 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) return pmd; } -static 
struct deferred_split *split_queue_node(int nid) -{ - struct pglist_data *pgdata = NODE_DATA(nid); - - return &pgdata->deferred_split_queue; -} - -#ifdef CONFIG_MEMCG -static inline -struct mem_cgroup *folio_split_queue_memcg(struct folio *folio, - struct deferred_split *queue) -{ - if (mem_cgroup_disabled()) - return NULL; - if (split_queue_node(folio_nid(folio)) == queue) - return NULL; - return container_of(queue, struct mem_cgroup, deferred_split_queue); -} - -static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg) -{ - return memcg ? &memcg->deferred_split_queue : split_queue_node(nid); -} -#else -static inline -struct mem_cgroup *folio_split_queue_memcg(struct folio *folio, - struct deferred_split *queue) -{ - return NULL; -} - -static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg) -{ - return split_queue_node(nid); -} -#endif - -static struct deferred_split *split_queue_lock(int nid, struct mem_cgroup *memcg) -{ - struct deferred_split *queue; - -retry: - queue = memcg_split_queue(nid, memcg); - spin_lock(&queue->split_queue_lock); - /* - * There is a period between setting memcg to dying and reparenting - * deferred split queue, and during this period the THPs in the deferred - * split queue will be hidden from the shrinker side. 
- */ - if (unlikely(memcg_is_dying(memcg))) { - spin_unlock(&queue->split_queue_lock); - memcg = parent_mem_cgroup(memcg); - goto retry; - } - - return queue; -} - -static struct deferred_split * -split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags) -{ - struct deferred_split *queue; - -retry: - queue = memcg_split_queue(nid, memcg); - spin_lock_irqsave(&queue->split_queue_lock, *flags); - if (unlikely(memcg_is_dying(memcg))) { - spin_unlock_irqrestore(&queue->split_queue_lock, *flags); - memcg = parent_mem_cgroup(memcg); - goto retry; - } - - return queue; -} - -static struct deferred_split *folio_split_queue_lock(struct folio *folio) -{ - struct deferred_split *queue; - - rcu_read_lock(); - queue = split_queue_lock(folio_nid(folio), folio_memcg(folio)); - /* - * The memcg destruction path is acquiring the split queue lock for - * reparenting. Once you have it locked, it's safe to drop the rcu lock. - */ - rcu_read_unlock(); - - return queue; -} - -static struct deferred_split * -folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags) -{ - struct deferred_split *queue; - - rcu_read_lock(); - queue = split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags); - rcu_read_unlock(); - - return queue; -} - -static inline void split_queue_unlock(struct deferred_split *queue) -{ - spin_unlock(&queue->split_queue_lock); -} - -static inline void split_queue_unlock_irqrestore(struct deferred_split *queue, - unsigned long flags) -{ - spin_unlock_irqrestore(&queue->split_queue_lock, flags); -} - static inline bool is_transparent_hugepage(const struct folio *folio) { if (!folio_test_large(folio)) @@ -1368,6 +1273,14 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); return NULL; } + + if (folio_memcg_alloc_deferred(folio)) { + folio_put(folio); + count_vm_event(THP_FAULT_FALLBACK); + count_mthp_stat(order, 
MTHP_STAT_ANON_FAULT_FALLBACK); + return NULL; + } + folio_throttle_swaprate(folio, gfp); /* @@ -3903,34 +3816,43 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n struct folio *end_folio = folio_next(folio); struct folio *new_folio, *next; int old_order = folio_order(folio); + struct list_lru_one *lru; + bool dequeue_deferred; int ret = 0; - struct deferred_split *ds_queue; VM_WARN_ON_ONCE(!mapping && end); - /* Prevent deferred_split_scan() touching ->_refcount */ - ds_queue = folio_split_queue_lock(folio); + /* + * If this folio can be on the deferred split queue, lock out + * the shrinker before freezing the ref. If the shrinker sees + * a 0-ref folio, it assumes it beat folio_put() to the list + * lock and must clean up the LRU state - the same dequeue we + * will do below as part of the split. + */ + dequeue_deferred = folio_test_anon(folio) && old_order > 1; + if (dequeue_deferred) { + struct mem_cgroup *memcg; + + rcu_read_lock(); + memcg = folio_memcg(folio); + lru = list_lru_lock(&deferred_split_lru, + folio_nid(folio), &memcg); + } if (folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1)) { struct swap_cluster_info *ci = NULL; struct lruvec *lruvec; - if (old_order > 1) { - if (!list_empty(&folio->_deferred_list)) { - ds_queue->split_queue_len--; - /* - * Reinitialize page_deferred_list after removing the - * page from the split_queue, otherwise a subsequent - * split will see list corruption when checking the - * page_deferred_list. 
- */ - list_del_init(&folio->_deferred_list); - } + if (dequeue_deferred) { + __list_lru_del(&deferred_split_lru, lru, + &folio->_deferred_list, folio_nid(folio)); if (folio_test_partially_mapped(folio)) { folio_clear_partially_mapped(folio); mod_mthp_stat(old_order, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } + list_lru_unlock(lru); + rcu_read_unlock(); } - split_queue_unlock(ds_queue); + if (mapping) { int nr = folio_nr_pages(folio); @@ -4031,7 +3953,10 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n if (ci) swap_cluster_unlock(ci); } else { - split_queue_unlock(ds_queue); + if (dequeue_deferred) { + list_lru_unlock(lru); + rcu_read_unlock(); + } return -EAGAIN; } @@ -4397,33 +4322,37 @@ int split_folio_to_list(struct folio *folio, struct list_head *list) * queueing THP splits, and that list is (racily observed to be) non-empty. * * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is - * zero: because even when split_queue_lock is held, a non-empty _deferred_list - * might be in use on deferred_split_scan()'s unlocked on-stack list. + * zero: because even when the list_lru lock is held, a non-empty + * _deferred_list might be in use on deferred_split_scan()'s unlocked + * on-stack list. * - * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is - * therefore important to unqueue deferred split before changing folio memcg. + * The list_lru sublist is determined by folio's memcg: it is therefore + * important to unqueue deferred split before changing folio memcg. 
*/ bool __folio_unqueue_deferred_split(struct folio *folio) { - struct deferred_split *ds_queue; + struct mem_cgroup *memcg; + struct list_lru_one *lru; + int nid = folio_nid(folio); unsigned long flags; bool unqueued = false; WARN_ON_ONCE(folio_ref_count(folio)); WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio)); - ds_queue = folio_split_queue_lock_irqsave(folio, &flags); - if (!list_empty(&folio->_deferred_list)) { - ds_queue->split_queue_len--; + rcu_read_lock(); + memcg = folio_memcg(folio); + lru = list_lru_lock_irqsave(&deferred_split_lru, nid, &memcg, &flags); + if (__list_lru_del(&deferred_split_lru, lru, &folio->_deferred_list, nid)) { if (folio_test_partially_mapped(folio)) { folio_clear_partially_mapped(folio); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } - list_del_init(&folio->_deferred_list); unqueued = true; } - split_queue_unlock_irqrestore(ds_queue, flags); + list_lru_unlock_irqrestore(lru, &flags); + rcu_read_unlock(); return unqueued; /* useful for debug warnings */ } @@ -4431,7 +4360,9 @@ bool __folio_unqueue_deferred_split(struct folio *folio) /* partially_mapped=false won't clear PG_partially_mapped folio flag */ void deferred_split_folio(struct folio *folio, bool partially_mapped) { - struct deferred_split *ds_queue; + struct list_lru_one *lru; + int nid; + struct mem_cgroup *memcg; unsigned long flags; /* @@ -4454,7 +4385,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped) if (folio_test_swapcache(folio)) return; - ds_queue = folio_split_queue_lock_irqsave(folio, &flags); + nid = folio_nid(folio); + + rcu_read_lock(); + memcg = folio_memcg(folio); + lru = list_lru_lock_irqsave(&deferred_split_lru, nid, &memcg, &flags); if (partially_mapped) { if (!folio_test_partially_mapped(folio)) { folio_set_partially_mapped(folio); @@ -4462,36 +4397,20 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped) count_vm_event(THP_DEFERRED_SPLIT_PAGE); 
count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1); - } } else { /* partially mapped folios cannot become non-partially mapped */ VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio); } - if (list_empty(&folio->_deferred_list)) { - struct mem_cgroup *memcg; - - memcg = folio_split_queue_memcg(folio, ds_queue); - list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); - ds_queue->split_queue_len++; - if (memcg) - set_shrinker_bit(memcg, folio_nid(folio), - shrinker_id(deferred_split_shrinker)); - } - split_queue_unlock_irqrestore(ds_queue, flags); + __list_lru_add(&deferred_split_lru, lru, &folio->_deferred_list, nid, memcg); + list_lru_unlock_irqrestore(lru, &flags); + rcu_read_unlock(); } static unsigned long deferred_split_count(struct shrinker *shrink, struct shrink_control *sc) { - struct pglist_data *pgdata = NODE_DATA(sc->nid); - struct deferred_split *ds_queue = &pgdata->deferred_split_queue; - -#ifdef CONFIG_MEMCG - if (sc->memcg) - ds_queue = &sc->memcg->deferred_split_queue; -#endif - return READ_ONCE(ds_queue->split_queue_len); + return list_lru_shrink_count(&deferred_split_lru, sc); } static bool thp_underused(struct folio *folio) @@ -4521,45 +4440,49 @@ static bool thp_underused(struct folio *folio) return false; } +static enum lru_status deferred_split_isolate(struct list_head *item, + struct list_lru_one *lru, + void *cb_arg) +{ + struct folio *folio = container_of(item, struct folio, _deferred_list); + struct list_head *freeable = cb_arg; + + if (folio_try_get(folio)) { + list_lru_isolate_move(lru, item, freeable); + return LRU_REMOVED; + } + + /* + * We lost race with folio_put(). Read folio state before the + * isolate: folio_unqueue_deferred_split() checks list_empty() + * locklessly, so once removed the folio can be freed any time. 
+ */ + if (folio_test_partially_mapped(folio)) { + folio_clear_partially_mapped(folio); + mod_mthp_stat(folio_order(folio), + MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); + } + list_lru_isolate(lru, item); + return LRU_REMOVED; +} + static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct deferred_split *ds_queue; - unsigned long flags; + LIST_HEAD(dispose); struct folio *folio, *next; - int split = 0, i; - struct folio_batch fbatch; + int split = 0; + unsigned long isolated; - folio_batch_init(&fbatch); + isolated = list_lru_shrink_walk_irq(&deferred_split_lru, sc, + deferred_split_isolate, &dispose); -retry: - ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags); - /* Take pin on all head pages to avoid freeing them under us */ - list_for_each_entry_safe(folio, next, &ds_queue->split_queue, - _deferred_list) { - if (folio_try_get(folio)) { - folio_batch_add(&fbatch, folio); - } else if (folio_test_partially_mapped(folio)) { - /* We lost race with folio_put() */ - folio_clear_partially_mapped(folio); - mod_mthp_stat(folio_order(folio), - MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); - } - list_del_init(&folio->_deferred_list); - ds_queue->split_queue_len--; - if (!--sc->nr_to_scan) - break; - if (!folio_batch_space(&fbatch)) - break; - } - split_queue_unlock_irqrestore(ds_queue, flags); - - for (i = 0; i < folio_batch_count(&fbatch); i++) { + list_for_each_entry_safe(folio, next, &dispose, _deferred_list) { bool did_split = false; bool underused = false; - struct deferred_split *fqueue; - folio = fbatch.folios[i]; + list_del_init(&folio->_deferred_list); + if (!folio_test_partially_mapped(folio)) { /* * See try_to_map_unused_to_zeropage(): we cannot @@ -4588,63 +4511,23 @@ next: * underused, then consider it used and don't add it back to * split_queue. 
*/ - if (did_split || !folio_test_partially_mapped(folio)) - continue; + if (!did_split && folio_test_partially_mapped(folio)) { requeue: - /* - * Add back partially mapped folios, or underused folios that - * we could not lock this round. - */ - fqueue = folio_split_queue_lock_irqsave(folio, &flags); - if (list_empty(&folio->_deferred_list)) { - list_add_tail(&folio->_deferred_list, &fqueue->split_queue); - fqueue->split_queue_len++; + rcu_read_lock(); + list_lru_add_irq(&deferred_split_lru, + &folio->_deferred_list, + folio_nid(folio), + folio_memcg(folio)); + rcu_read_unlock(); } - split_queue_unlock_irqrestore(fqueue, flags); - } - folios_put(&fbatch); - - if (sc->nr_to_scan && !list_empty(&ds_queue->split_queue)) { - cond_resched(); - goto retry; + folio_put(folio); } - /* - * Stop shrinker if we didn't split any page, but the queue is empty. - * This can happen if pages were freed under us. - */ - if (!split && list_empty(&ds_queue->split_queue)) + if (!split && !isolated) return SHRINK_STOP; return split; } -#ifdef CONFIG_MEMCG -void reparent_deferred_split_queue(struct mem_cgroup *memcg) -{ - struct mem_cgroup *parent = parent_mem_cgroup(memcg); - struct deferred_split *ds_queue = &memcg->deferred_split_queue; - struct deferred_split *parent_ds_queue = &parent->deferred_split_queue; - int nid; - - spin_lock_irq(&ds_queue->split_queue_lock); - spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING); - - if (!ds_queue->split_queue_len) - goto unlock; - - list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue); - parent_ds_queue->split_queue_len += ds_queue->split_queue_len; - ds_queue->split_queue_len = 0; - - for_each_node(nid) - set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker)); - -unlock: - spin_unlock(&parent_ds_queue->split_queue_lock); - spin_unlock_irq(&ds_queue->split_queue_lock); -} -#endif - #ifdef CONFIG_DEBUG_FS static void split_huge_pages_all(void) { diff --git a/mm/internal.h 
b/mm/internal.h index 5602393054f3..181e79f1d6a2 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -852,7 +852,7 @@ static inline bool folio_unqueue_deferred_split(struct folio *folio) /* * At this point, there is no one trying to add the folio to * deferred_list. If folio is not in deferred_list, it's safe - * to check without acquiring the split_queue_lock. + * to check without acquiring the list_lru lock. */ if (data_race(list_empty(&folio->_deferred_list))) return false; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a4b97ec8ce56..73e262cb30dd 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1123,6 +1123,11 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a if (result != SCAN_SUCCEED) goto out_nolock; + if (folio_memcg_alloc_deferred(folio)) { + result = SCAN_ALLOC_HUGE_PAGE_FAIL; + goto out_nolock; + } + mmap_read_lock(mm); result = hugepage_vma_revalidate(mm, address, true, &vma, cc); if (result != SCAN_SUCCEED) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e24114a4493a..56cd4af08232 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4143,11 +4143,6 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) memcg->cgwb_frn[i].done = __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); -#endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - spin_lock_init(&memcg->deferred_split_queue.split_queue_lock); - INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); - memcg->deferred_split_queue.split_queue_len = 0; #endif lru_gen_init_memcg(memcg); return memcg; @@ -4299,11 +4294,10 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) zswap_memcg_offline_cleanup(memcg); memcg_offline_kmem(memcg); - reparent_deferred_split_queue(memcg); /* - * The reparenting of objcg must be after the reparenting of the - * list_lru and deferred_split_queue above, which ensures that they will - * not mistakenly get the parent list_lru and deferred_split_queue. 
+ * The reparenting of objcg must be after the reparenting of + * the list_lru in memcg_offline_kmem(), which ensures that + * they will not mistakenly get the parent list_lru. */ memcg_reparent_objcgs(memcg); reparent_shrinker_deferred(memcg); diff --git a/mm/memory.c b/mm/memory.c index 1d8e09d9b3c9..56be920c56d7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5222,6 +5222,10 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) folio_put(folio); goto next; } + if (order > 1 && folio_memcg_alloc_deferred(folio)) { + folio_put(folio); + goto fallback; + } folio_throttle_swaprate(folio, gfp); /* * When a folio is not zeroed during allocation diff --git a/mm/mm_init.c b/mm/mm_init.c index db5568cf36e1..c0a7f1cf6fef 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1373,19 +1373,6 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void pgdat_init_split_queue(struct pglist_data *pgdat) -{ - struct deferred_split *ds_queue = &pgdat->deferred_split_queue; - - spin_lock_init(&ds_queue->split_queue_lock); - INIT_LIST_HEAD(&ds_queue->split_queue); - ds_queue->split_queue_len = 0; -} -#else -static void pgdat_init_split_queue(struct pglist_data *pgdat) {} -#endif - #ifdef CONFIG_COMPACTION static void pgdat_init_kcompactd(struct pglist_data *pgdat) { @@ -1401,8 +1388,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) pgdat_resize_init(pgdat); pgdat_kswapd_lock_init(pgdat); - - pgdat_init_split_queue(pgdat); pgdat_init_kcompactd(pgdat); init_waitqueue_head(&pgdat->kswapd_wait); diff --git a/mm/swap_state.c b/mm/swap_state.c index 04f5ce992401..9c3a5cf99778 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -465,6 +465,16 @@ static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci, return ERR_PTR(-ENOMEM); } + if (order > 1 && folio_memcg_alloc_deferred(folio)) { + spin_lock(&ci->lock); + 
__swap_cache_do_del_folio(ci, folio, entry, shadow); + spin_unlock(&ci->lock); + folio_unlock(folio); + /* nr_pages refs from swap cache, 1 from allocation */ + folio_put_refs(folio, nr_pages + 1); + return ERR_PTR(-ENOMEM); + } + /* memsw uncharges swap when folio is added to swap cache */ memcg1_swapin(folio); if (shadow) From 25fcea21302237641ddd5816b5b2a20f368d1027 Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Tue, 2 Jun 2026 12:34:53 +0800 Subject: [PATCH 292/321] mm/thp: clear deferred split shrinker bits when queues drain deferred_split_count() returns the raw list_lru count. When the per-memcg, per-node list is empty, that count is 0. That skips scanning, but it does not tell memcg reclaim that the shrinker is empty. shrink_slab_memcg() only clears the memcg shrinker bit when the count callback reports SHRINK_EMPTY. Return SHRINK_EMPTY for an empty deferred split list, so the bit can be cleared once the queue has drained. Link: https://lore.kernel.org/20260602043453.67597-1-lance.yang@linux.dev Signed-off-by: Lance Yang Reviewed-by: David Hildenbrand (Arm) Acked-by: Usama Arif Cc: Baolin Wang Cc: Barry Song Cc: Dave Chinner Cc: Dev Jain Cc: Johannes Weiner Cc: Kairui Song Cc: Kefeng Wang Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mikhail Zaslonko Cc: Muchun Song Cc: Nico Pache Cc: Roman Gushchin Cc: Ryan Roberts Cc: Shakeel Butt Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6927f66b2eb2..da851a5696d5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -4410,7 +4410,10 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped) static unsigned long deferred_split_count(struct shrinker *shrink, struct shrink_control *sc) { - return list_lru_shrink_count(&deferred_split_lru, sc); + unsigned long count; + + count = list_lru_shrink_count(&deferred_split_lru, sc); + return count ?: SHRINK_EMPTY; } static bool thp_underused(struct folio *folio) From ed384eb3a3e121c1d6d5c5d36950fbd286b92026 Mon Sep 17 00:00:00 2001 From: fujunjie Date: Tue, 26 May 2026 12:22:41 +0000 Subject: [PATCH 293/321] mm/compaction: respect cpusets when checking retry suitability should_compact_retry() handles COMPACT_SKIPPED by asking compaction_zonelist_suitable() whether reclaim can make a later compaction attempt worthwhile. That answer is used for the current allocation, so it should follow the same zone eligibility rules as the allocation itself. When cpusets are enabled, allocator slowpath decisions are marked with ALLOC_CPUSET. The allocation path, direct compaction and reclaim retry all skip zones rejected by __cpuset_zone_allowed(). compaction_zonelist_suitable() does not apply that filter. It only walks ac->zonelist/ac->nodemask, so it can return true because a zone that is not usable for the current allocation would pass __compaction_suitable(). That does not let the allocation use the disallowed zone. Later allocation and direct compaction paths still apply cpuset filtering. However, it can make should_compact_retry() retry based on memory that this allocation cannot use. 
Pass gfp_mask down and apply the same ALLOC_CPUSET check in compaction_zonelist_suitable(). This keeps the retry decision aligned with the zones that the allocation is allowed to use. A temporary debugfs probe was also used to call the old and new compaction_zonelist_suitable() predicates in the same two-node NUMA guest. The task was restricted to mems=0 while ac->nodemask covered nodes 0-1. After putting pressure on node0, node0 failed __compaction_suitable() for order-10 and node1 passed it, but node1 was rejected by __cpuset_zone_allowed(). In that state the old predicate returned true and the patched predicate returned false. Link: https://lore.kernel.org/tencent_F59F2BA2CC5779308E10DF54593C736D3E0A@qq.com Fixes: 435b3894e742 ("mm:page_alloc: fix the NULL ac->nodemask in __alloc_pages_slowpath()") Signed-off-by: fujunjie Reviewed-by: Vlastimil Babka (SUSE) Cc: Brendan Jackman Cc: Johannes Weiner Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/compaction.h | 2 +- mm/compaction.c | 6 +++++- mm/page_alloc.c | 15 +++++++++------ 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 173d9c07a895..c829c48d1c71 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -101,7 +101,7 @@ extern void compaction_defer_reset(struct zone *zone, int order, bool alloc_success); bool compaction_zonelist_suitable(struct alloc_context *ac, int order, - int alloc_flags); + int alloc_flags, gfp_t gfp_mask); extern void __meminit kcompactd_run(int nid); extern void __meminit kcompactd_stop(int nid); diff --git a/mm/compaction.c b/mm/compaction.c index 8f664fb09f24..b776f35ad020 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2448,7 +2448,7 @@ bool compaction_suitable(struct zone *zone, int order, unsigned long watermark, /* Used by direct reclaimers */ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, - int 
alloc_flags) + int alloc_flags, gfp_t gfp_mask) { struct zone *zone; struct zoneref *z; @@ -2461,6 +2461,10 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, ac->highest_zoneidx, ac->nodemask) { unsigned long available; + if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && + !__cpuset_zone_allowed(zone, gfp_mask)) + continue; + /* * Do not consider all the reclaimable memory because we do not * want to trash just for a single high order allocation which diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 97cb95820592..dd2d3d5ac1b1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4198,7 +4198,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, } static inline bool -should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, +should_compact_retry(gfp_t gfp_mask, struct alloc_context *ac, int order, + int alloc_flags, enum compact_result compact_result, enum compact_priority *compact_priority, int *compaction_retries) @@ -4220,7 +4221,8 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, * migration targets. Continue if reclaim can help. 
*/ if (compact_result == COMPACT_SKIPPED) { - ret = compaction_zonelist_suitable(ac, order, alloc_flags); + ret = compaction_zonelist_suitable(ac, order, alloc_flags, + gfp_mask); goto out; } @@ -4273,7 +4275,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, } static inline bool -should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, +should_compact_retry(gfp_t gfp_mask, struct alloc_context *ac, int order, + int alloc_flags, enum compact_result compact_result, enum compact_priority *compact_priority, int *compaction_retries) @@ -4940,9 +4943,9 @@ retry: * of free memory (see __compaction_suitable) */ if (did_some_progress > 0 && can_compact && - should_compact_retry(ac, order, alloc_flags, - compact_result, &compact_priority, - &compaction_retries)) + should_compact_retry(gfp_mask, ac, order, alloc_flags, + compact_result, &compact_priority, + &compaction_retries)) goto retry; /* Reclaim/compaction failed to prevent the fallback */ From 2f5e0477276bb87a407edc75f3d65012e6f63c68 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Mon, 1 Jun 2026 03:21:17 -0700 Subject: [PATCH 294/321] mm: bypass mmap_miss heuristic for VM_EXEC readahead Patch series "mm: improve large folio readahead for exec memory", v7. Two checks in do_sync_mmap_readahead() limit large-folio readahead: 1. The mmap_miss heuristic is meant to throttle wasteful speculative readahead. It is currently also applied to the VM_EXEC readahead path, which is targeted rather than speculative. Once mmap_miss exceeds MMAP_LOTSAMISS, exec readahead - including the large-folio order requested by exec_folio_order() - is disabled. On configurations where the mmap_miss decrement paths are not active (see patch 1) the counter only grows, so exec readahead is permanently disabled after the first 100 faults. 2. The force_thp_readahead path is gated only on HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER and always drives the readahead at HPAGE_PMD_ORDER. 
Configurations where HPAGE_PMD_ORDER exceeds MAX_PAGECACHE_ORDER never reach this path, even when the mapping itself supports usefully large folios well below the cap. Both issues are most visible on arm64 with a 64K base page size, where HPAGE_PMD_ORDER is 13 (512MB) -- above MAX_PAGECACHE_ORDER (11) -- and where fault_around_pages collapses to 1 disabling should_fault_around() (one of the two mmap_miss decrement sites). However the fixes are architecture-agnostic: patch 1 reflects the nature of VM_EXEC readahead regardless of base page size, and patch 2 generalises the gate so any mapping advertising a usefully large maximum folio order can benefit. I created a benchmark that mmaps a large executable file madvises it as huge and calls RET-stub functions at PAGE_SIZE offsets across it. "Cold" measures fault + readahead cost. "Random" first faults in all pages with a sequential sweep (not measured), then measures time for calling random offsets, isolating iTLB miss cost for scattered execution. The benchmark results on Neoverse V2 (Grace), arm64 with 64K base pages, 512MB executable file on ext4, averaged over 3 runs: Phase | Baseline | Patched | Improvement -----------|--------------|--------------|------------------ Cold fault | 83.4 ms | 41.3 ms | 50% faster Random | 76.0 ms | 58.3 ms | 23% faster This patch (of 2): The mmap_miss heuristic is intended to stop speculative mmap readahead when a file looks like a random-access workload. That does not fit the VM_EXEC path very well. VM_EXEC readahead is already constrained differently from ordinary mmap read-around: it is bounded by the VMA, uses exec_folio_order() to choose an order useful for executable mappings, and sets async_size to 0 so it does not create follow-on readahead. When VM_HUGEPAGE is also present, the larger readahead is an explicit userspace opt-in. The mmap_miss counter is decremented from cache-hit paths in do_async_mmap_readahead() and filemap_map_pages(). 
Those paths are not always enough to balance the synchronous miss increments for executable mappings. In particular, when fault-around is effectively disabled, such as configurations where fault_around_pages is 1, filemap_map_pages() is not reached from the fault path. The counter can then become a stale throttle for VM_EXEC mappings and suppress the readahead behavior that the executable-specific path is trying to provide. Skip both mmap_miss increments and decrements for VM_EXEC mappings, matching the existing VM_SEQ_READ treatment and keeping the counter accounting symmetric. Link: https://lore.kernel.org/20260601102205.3985788-1-usama.arif@linux.dev Link: https://lore.kernel.org/20260601102205.3985788-2-usama.arif@linux.dev Signed-off-by: Usama Arif Reviewed-by: Jan Kara Reviewed-by: Kiryl Shutsemau (Meta) Reviewed-by: Oscar Salvador (SUSE) Reviewed-by: Pedro Falcato Cc: Alistair Popple Cc: Al Viro Cc: Baolin Wang Cc: Barry Song Cc: Catalin Marinas Cc: Christian Brauner Cc: David Hildenbrand Cc: Dev Jain Cc: Heiher Cc: Johannes Weiner Cc: Kees Cook Cc: Kevin Brodsky Cc: Lance Yang Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Pasha Tatashin Cc: Rohan McLure Cc: Ryan Roberts Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/filemap.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 6bf0b540ef19..58d8ba867b52 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3340,7 +3340,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) } } - if (!(vm_flags & VM_SEQ_READ)) { + if (!(vm_flags & (VM_SEQ_READ | VM_EXEC))) { /* Avoid banging the cache line if not needed */ mmap_miss = READ_ONCE(ra->mmap_miss); if (mmap_miss < MMAP_LOTSAMISS * 10) @@ -3435,12 +3435,12 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, * times for a single folio and break the balance with mmap_miss * increase in do_sync_mmap_readahead(). * - * VM_SEQ_READ mappings skip the mmap_miss increment in + * VM_SEQ_READ and VM_EXEC mappings skip the mmap_miss increment in * do_sync_mmap_readahead(), so skip the decrement here as well to * keep the counter symmetric. */ if (likely(!folio_test_locked(folio)) && - !(vmf->vma->vm_flags & VM_SEQ_READ)) { + !(vmf->vma->vm_flags & (VM_SEQ_READ | VM_EXEC))) { mmap_miss = READ_ONCE(ra->mmap_miss); if (mmap_miss) WRITE_ONCE(ra->mmap_miss, --mmap_miss); @@ -3942,14 +3942,14 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, * Don't decrease mmap_miss in this scenario to make sure * we can stop read-ahead. * - * VM_SEQ_READ mappings skip the mmap_miss increment in - * do_sync_mmap_readahead(), so skip the decrement here as - * well to keep the counter symmetric. + * VM_SEQ_READ and VM_EXEC mappings skip the mmap_miss + * increment in do_sync_mmap_readahead(), so skip the + * decrement here as well to keep the counter symmetric. 
*/ if ((map_ret & VM_FAULT_NOPAGE) && !(vmf->flags & FAULT_FLAG_TRIED) && !folio_test_workingset(folio) && - !(vma->vm_flags & VM_SEQ_READ)) { + !(vma->vm_flags & (VM_SEQ_READ | VM_EXEC))) { unsigned short mmap_miss; mmap_miss = READ_ONCE(file->f_ra.mmap_miss); From 8732e14b719129b77e24d9003a506ec949d9427c Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Mon, 1 Jun 2026 03:21:18 -0700 Subject: [PATCH 295/321] mm: use mapping_max_folio_order() for force_thp_readahead order The force_thp_readahead path in do_sync_mmap_readahead() is gated on HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER and always requests HPAGE_PMD_ORDER / HPAGE_PMD_NR. On configurations where HPAGE_PMD_ORDER exceeds MAX_PAGECACHE_ORDER, notably arm64 with a 64K base page size, VM_HUGEPAGE mappings cannot use this path and fall back to the non-forced mmap readahead path even when the mapping supports useful large folios. Enable forced readahead for mappings that support large folios and request the max folio order supported by the mapping, capped at 2M. 2MB is chosen as the cap because it matches the PMD size on x86_64 and on arm64 with 4K base pages, so the size/memory-pressure tradeoff for folios of that size is already well understood. On arm64 with 16K and 64K base page sizes, 2MB is also the contiguous-PTE (contpte) block size, so the resulting folios coalesce into a single TLB entry and reduce TLB pressure on the readahead path. This will result in 32M folios not being faulted in with 16K base page size for arm64, but with contpte, the performance difference should be negligible. The final allocation order may still be clamped by page_cache_ra_order() to the mapping and request geometry, but this gives VM_HUGEPAGE mappings on such configurations a large-folio readahead request instead of dropping back to base-page readahead. 
Link: https://lore.kernel.org/20260601102205.3985788-3-usama.arif@linux.dev Signed-off-by: Usama Arif Reviewed-by: Jan Kara Reviewed-by: Pedro Falcato Cc: Alistair Popple Cc: Al Viro Cc: Baolin Wang Cc: Barry Song Cc: Catalin Marinas Cc: Christian Brauner Cc: David Hildenbrand Cc: Dev Jain Cc: Heiher Cc: Johannes Weiner Cc: Kees Cook Cc: Kevin Brodsky Cc: Lance Yang Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Pasha Tatashin Cc: Rohan McLure Cc: Ryan Roberts Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Cc: Kiryl Shutsemau (Meta) Cc: Oscar Salvador (SUSE) Signed-off-by: Andrew Morton --- mm/filemap.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 58d8ba867b52..98434acc69c1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3313,14 +3313,26 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) struct file *fpin = NULL; vm_flags_t vm_flags = vmf->vma->vm_flags; bool force_thp_readahead = false; + unsigned int thp_order = 0; unsigned short mmap_miss; ractl._max_index = vmf->vma->vm_pgoff + vma_pages(vmf->vma) - 1; /* Use the readahead code, even if readahead is disabled */ - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && - (vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) - force_thp_readahead = true; + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && (vm_flags & VM_HUGEPAGE)) { + /* + * Cap max THP order at 2MB: this is the common PMD-sized + * hugepage size, and it avoids memory pressure from very + * large forced readahead when mapping_max_folio_order() is + * high (for example, 128MB with 64K base pages on arm64). 
+ */ + if (mapping_large_folio_support(mapping)) { + force_thp_readahead = true; + thp_order = min_t(unsigned int, + mapping_max_folio_order(mapping), + get_order(SZ_2M)); + } + } if (!force_thp_readahead) { /* @@ -3355,17 +3367,19 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) } if (force_thp_readahead) { + unsigned long folio_nr_pages = 1UL << thp_order; + fpin = maybe_unlock_mmap_for_io(vmf, fpin); - ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1); - ra->size = HPAGE_PMD_NR; + ractl._index &= ~(folio_nr_pages - 1); + ra->size = folio_nr_pages; /* - * Fetch two PMD folios, so we get the chance to actually + * Fetch two folios so we get the chance to actually * readahead, unless we've been told not to. */ if (!(vm_flags & VM_RAND_READ)) ra->size *= 2; - ra->async_size = HPAGE_PMD_NR; - ra->order = HPAGE_PMD_ORDER; + ra->async_size = folio_nr_pages; + ra->order = thp_order; page_cache_ra_order(&ractl, ra); return fpin; } From 3d4f1a54160046d5059ec6c5f2152e054e7b12d7 Mon Sep 17 00:00:00 2001 From: fujunjie Date: Tue, 26 May 2026 09:12:48 +0000 Subject: [PATCH 296/321] mm/page_alloc: fix deferred compaction accounting COMPACT_DEFERRED means compaction did not start because past failures caused the zone to be deferred. try_to_compact_pages() returns the maximum result seen while walking the zonelist, so a final COMPACT_DEFERRED result means no later zone reported that compaction actually ran. __alloc_pages_direct_compact() skips COMPACTSTALL and COMPACTFAIL accounting when try_to_compact_pages() returns COMPACT_SKIPPED, but not when it returns COMPACT_DEFERRED. A deferred-only direct compaction attempt can therefore look like a stall, and then a failure if the allocation still cannot be satisfied. Treat COMPACT_DEFERRED like COMPACT_SKIPPED in this accounting path. 
If a later zone runs compaction and returns a result above COMPACT_DEFERRED, or compact_zone_order() reports COMPACT_SUCCESS for a captured page, the final result is not COMPACT_DEFERRED and the existing accounting still runs. Link: https://lore.kernel.org/tencent_368AF1F3821E46232637BE16D65C45CF3308@qq.com Fixes: 06dac2f467fe ("mm: compaction: update the COMPACT[STALL|FAIL] events properly") Signed-off-by: fujunjie Reviewed-by: Vlastimil Babka (SUSE) Cc: Brendan Jackman Cc: Johannes Weiner Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dd2d3d5ac1b1..f7db8f049bd2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4161,7 +4161,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, psi_memstall_leave(&pflags); delayacct_compact_end(); - if (*compact_result == COMPACT_SKIPPED) + if (*compact_result == COMPACT_SKIPPED || + *compact_result == COMPACT_DEFERRED) return NULL; /* * At least in one zone compaction wasn't deferred or skipped, so let's From 3f08b20eb12f7a05824295d083561cfddfdf76c2 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 28 May 2026 09:55:07 +0200 Subject: [PATCH 297/321] mm/page_vma_mapped_walk: use ptep_get_lockless() for lockless access When not holding the lock, there is a chance that the pte gets modified under our feet, so we need to use the lockless API to make sure that the entries remain consistent during the read. Switch from ptep_get() to ptep_get_lockless() accessor for PTE reads when no lock is taken.
[osalvador@suse.de: changelog addition] Link: https://lore.kernel.org/ahhNq0pFKvSKZQbR@localhost.localdomain Link: https://lore.kernel.org/20260528075507.1821939-1-agordeev@linux.ibm.com Signed-off-by: Alexander Gordeev Reviewed-by: Oscar Salvador (SUSE) Acked-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes Cc: Anshuman Khandual Cc: Harry Yoo Cc: Jann Horn Cc: Liam Howlett Cc: Rik van Riel Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton --- mm/page_vma_mapped.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index a4d52fdb3056..2ccbabfb2cc1 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -41,7 +41,7 @@ again: if (!pvmw->pte) return false; - ptent = ptep_get(pvmw->pte); + ptent = ptep_get_lockless(pvmw->pte); if (pte_none(ptent)) { return false; @@ -183,6 +183,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) struct mm_struct *mm = vma->vm_mm; unsigned long end; spinlock_t *ptl; + pte_t pteval; pgd_t *pgd; p4d_t *p4d; pud_t *pud; @@ -310,7 +311,11 @@ next_pte: goto restart; } pvmw->pte++; - } while (pte_none(ptep_get(pvmw->pte))); + if (!pvmw->ptl) + pteval = ptep_get_lockless(pvmw->pte); + else + pteval = ptep_get(pvmw->pte); + } while (pte_none(pteval)); if (!pvmw->ptl) { spin_lock(ptl); From d1aba985984781947ad67c1b44ac64bd498c8f27 Mon Sep 17 00:00:00 2001 From: Cunlong Li Date: Thu, 28 May 2026 10:48:45 +0800 Subject: [PATCH 298/321] zram: drop unused bio parameter from write helpers After "zram: fix use-after-free in zram_bvec_write_partial()", zram_bvec_write_partial() always passes NULL to zram_read_page() and no longer needs the parent bio. Mirror the read side (zram_bvec_read_partial() has not taken a bio since commit 4e3c87b9421d ("zram: fix synchronous reads")) and drop the parameter from zram_bvec_write_partial() and zram_bvec_write(). No functional change. 
Link: https://lore.kernel.org/20260528-zram-v3-2-cab86eef8764@gmail.com Signed-off-by: Cunlong Li Reviewed-by: Christoph Hellwig Reviewed-by: Sergey Senozhatsky Cc: Jens Axboe Cc: Minchan Kim Cc: Yisheng Xie Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 7917fc7a2a29..fd12604ff8d7 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2334,7 +2334,7 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index) * This is a partial IO. Read the full page before writing the changes. */ static int zram_bvec_write_partial(struct zram *zram, struct bio_vec *bvec, - u32 index, int offset, struct bio *bio) + u32 index, int offset) { struct page *page = alloc_page(GFP_NOIO); int ret; @@ -2352,10 +2352,10 @@ static int zram_bvec_write_partial(struct zram *zram, struct bio_vec *bvec, } static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, - u32 index, int offset, struct bio *bio) + u32 index, int offset) { if (is_partial_io(bvec)) - return zram_bvec_write_partial(zram, bvec, index, offset, bio); + return zram_bvec_write_partial(zram, bvec, index, offset); return zram_write_page(zram, bvec->bv_page, index); } @@ -2752,7 +2752,7 @@ static void zram_bio_write(struct zram *zram, struct bio *bio) bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset); - if (zram_bvec_write(zram, &bv, index, offset, bio) < 0) { + if (zram_bvec_write(zram, &bv, index, offset) < 0) { atomic64_inc(&zram->stats.failed_writes); bio->bi_status = BLK_STS_IOERR; break; From 8f7275c174bc5bcc8fc1bec8024e2b3e6fe17f46 Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Thu, 28 May 2026 09:13:36 +0800 Subject: [PATCH 299/321] lib/test_hmm: fix memory leak in dmirror_migrate_to_system() Move the kvcalloc() calls after the early return checks to avoid leaking src_pfns and dst_pfns when end < start or 
mmget_not_zero() fails. Link: https://lore.kernel.org/20260528011336.20797-1-hao.ge@linux.dev Fixes: 775465fd26a3 ("lib/test_hmm: add zone device private THP test infrastructure") Signed-off-by: Hao Ge Reviewed-by: Alistair Popple Reported-by: Sashiko Reviewed-by: Balbir Singh Cc: Jason Gunthorpe Cc: Leon Romanovsky Signed-off-by: Andrew Morton --- lib/test_hmm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 63bf77dee987..35f774ed2d99 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -1111,9 +1111,6 @@ static int dmirror_migrate_to_system(struct dmirror *dmirror, unsigned long *src_pfns; unsigned long *dst_pfns; - src_pfns = kvcalloc(PTRS_PER_PTE, sizeof(*src_pfns), GFP_KERNEL | __GFP_NOFAIL); - dst_pfns = kvcalloc(PTRS_PER_PTE, sizeof(*dst_pfns), GFP_KERNEL | __GFP_NOFAIL); - start = cmd->addr; end = start + size; if (end < start) @@ -1123,6 +1120,9 @@ static int dmirror_migrate_to_system(struct dmirror *dmirror, if (!mmget_not_zero(mm)) return -EINVAL; + src_pfns = kvcalloc(PTRS_PER_PTE, sizeof(*src_pfns), GFP_KERNEL | __GFP_NOFAIL); + dst_pfns = kvcalloc(PTRS_PER_PTE, sizeof(*dst_pfns), GFP_KERNEL | __GFP_NOFAIL); + cmd->cpages = 0; mmap_read_lock(mm); for (addr = start; addr < end; addr = next) { From cdea4acce026f4dc6a1689cb991a2bf3a4333ecd Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Mon, 1 Jun 2026 11:40:09 +0000 Subject: [PATCH 300/321] mm: delete stale comment about cachelines These comments have been wrong since commit a211c6550efc ("mm: page_alloc: defrag_mode kswapd/kcompactd watermarks") added NR_FREE_PAGES_BLOCKS. Since nobody has complained about it in the last year, it seems unlikely these comments were particularly useful anyway, so delete them. 
Link: https://lore.kernel.org/20260601-zone_stat_item-comment-v1-1-f452dd91d5eb@google.com Signed-off-by: Brendan Jackman Acked-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes Acked-by: Vlastimil Babka (SUSE) Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8e449f524f26..ca2712187147 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -214,7 +214,6 @@ enum numa_stat_item { #endif enum zone_stat_item { - /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, NR_FREE_PAGES_BLOCKS, NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */ @@ -225,7 +224,6 @@ enum zone_stat_item { NR_ZONE_UNEVICTABLE, NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ - /* Second 128 byte cacheline */ #if IS_ENABLED(CONFIG_ZSMALLOC) NR_ZSPAGES, /* allocated in zsmalloc */ #endif From 3862816c98152553106dd762c66c0f390337fa38 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 1 Jun 2026 16:55:04 -0700 Subject: [PATCH 301/321] MAINTAINERS: add testing ABI documents for mm A few mm subsystem entries in MAINTAINERS are missing their testing ABI documents. Add those. Link: https://lore.kernel.org/20260601235506.85123-1-sj@kernel.org Signed-off-by: SeongJae Park Acked-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes Cc: Liam R. 
Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- MAINTAINERS | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index ef31a8dd9e5b..63fa4f9fa4c8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16854,6 +16854,7 @@ L: linux-mm@kvack.org S: Maintained W: http://www.linux-mm.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +F: Documentation/ABI/testing/sysfs-kernel-mm-ksm F: Documentation/admin-guide/mm/ksm.rst F: Documentation/mm/ksm.rst F: include/linux/ksm.h @@ -16876,6 +16877,8 @@ L: linux-mm@kvack.org S: Maintained W: http://www.linux-mm.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +F: Documentation/ABI/testing/sysfs-kernel-mm-mempolicy +F: Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave F: include/linux/mempolicy.h F: include/uapi/linux/mempolicy.h F: include/linux/migrate.h @@ -16918,6 +16921,10 @@ L: linux-mm@kvack.org S: Maintained W: http://www.linux-mm.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +F: Documentation/ABI/testing/sysfs-kernel-mm +F: Documentation/ABI/testing/sysfs-kernel-mm-cma +F: Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers +F: Documentation/ABI/testing/sysfs-kernel-mm-numa F: Documentation/admin-guide/mm/ F: Documentation/mm/ F: include/linux/cma.h @@ -17041,6 +17048,7 @@ R: Barry Song R: Youngjun Park L: linux-mm@kvack.org S: Maintained +F: Documentation/ABI/testing/sysfs-kernel-mm-swap F: Documentation/mm/swap-table.rst F: include/linux/swap.h F: include/linux/swapfile.h @@ -17068,6 +17076,7 @@ L: linux-mm@kvack.org S: Maintained W: http://www.linux-mm.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +F: Documentation/ABI/testing/sysfs-kernel-mm-transparent-hugepage F: Documentation/admin-guide/mm/transhuge.rst F: include/linux/huge_mm.h F: include/linux/khugepaged.h From 04718f7c9290f95385f0dd328758753dc1c36dec Mon 
Sep 17 00:00:00 2001 From: "Kiryl Shutsemau (Meta)" Date: Fri, 29 May 2026 18:23:25 +0100 Subject: [PATCH 302/321] fs/proc/task_mmu: fix make_uffd_wp_huge_pte() prot-update race Patch series "userfaultfd/pagemap: pre-existing fixes". These are pre-existing bug fixes that were carried at the front of the userfaultfd RWP working-set-tracking series up to v5 [1]. Per review feedback that fixes should not sit in the middle of a feature series, they are split out and sent on their own; the RWP series is reposted rebased on top of this. All six were flagged by the Sashiko AI review of the RWP series and carry Reported-by: Sashiko AI review . They are independent of RWP, apply to mm-new directly, and carry Cc: stable@. 1: fs/proc/task_mmu: a missing huge_ptep_modify_prot_start() in make_uffd_wp_huge_pte() can lose hardware Dirty/Accessed updates when PAGEMAP_SCAN write-protects a hugetlb PTE. 2: fs/proc/task_mmu: pagemap_scan_hugetlb_entry() compares the range against HPAGE_SIZE rather than the hstate page size, so it never write-protects gigantic hugetlb pages. 3: fs/proc/task_mmu: PAGEMAP_SCAN with PM_SCAN_WP_MATCHING over an unpopulated hugetlb range self-deadlocks -- pagemap_scan_pte_hole() calls uffd_wp_range() while walk_hugetlb_range() holds the hugetlb vma lock for read, and hugetlb_change_protection() then takes it for write. Install the marker inline instead. 4: mm/huge_memory: change_non_present_huge_pmd() drops pmd_swp_uffd_wp on a device-private PMD permission downgrade, silently losing the uffd-wp marker. 5: userfaultfd: must_wait() applies pte_write() to a locklessly read PTE without checking pte_present(), so swap/migration entries decode random offset bits and a thread can stay parked on a stale fault. 6: userfaultfd: __VMA_UFFD_FLAGS feeds VMA_UFFD_MINOR_BIT (41) to mk_vma_flags() unconditionally, an out-of-bounds write into the single-word vma_flags_t on 32-bit. Build the mask from config-gated per-mode masks so an unavailable bit is never materialised. 
This patch (of 6): make_uffd_wp_huge_pte() arms the UFFD_WP bit on a present HugeTLB PTE by calling huge_ptep_modify_prot_commit() with a ptent snapshot that was fetched without the corresponding huge_ptep_modify_prot_start(). The start helper is what atomically clears the entry so the kernel-owned snapshot stays consistent until the commit; without it, the hardware may set Dirty or Accessed in the live PTE between the original read and the commit, and huge_ptep_modify_prot_commit() (whose generic implementation just calls set_huge_pte_at()) then writes the stale snapshot back over the live hardware bits, losing the update. The non-hugetlb sibling make_uffd_wp_pte() does this correctly via ptep_modify_prot_start() / ptep_modify_prot_commit(). Mirror that pattern for the present-PTE branch. The migration case stays as-is -- migration entries are non-present, so there's no hardware update to race against. Link: https://lore.kernel.org/20260529172331.356655-1-kas@kernel.org Link: https://lore.kernel.org/20260529172331.356655-2-kas@kernel.org Link: https://lore.kernel.org/all/20260526130509.2748441-1-kirill@shutemov.name/ [1] Fixes: 52526ca7fdb9 ("fs/proc/task_mmu: implement IOCTL to get and optionally clear info about PTEs") Signed-off-by: Kiryl Shutsemau Reported-by: Sashiko AI review Reviewed-by: Lorenzo Stoakes Reviewed-by: Dev Jain Cc: David Hildenbrand Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Balbir Singh Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 1e3a15bf46f4..e21a38ac745b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2610,12 +2610,16 @@ static void make_uffd_wp_huge_pte(struct vm_area_struct *vma, if (softleaf_is_hwpoison(entry) || softleaf_is_marker(entry)) return; - if (softleaf_is_migration(entry)) + if (softleaf_is_migration(entry)) {
set_huge_pte_at(vma->vm_mm, addr, ptep, pte_swp_mkuffd_wp(ptent), psize); - else - huge_ptep_modify_prot_commit(vma, addr, ptep, ptent, - huge_pte_mkuffd_wp(ptent)); + } else { + pte_t old_pte, new_pte; + + old_pte = huge_ptep_modify_prot_start(vma, addr, ptep); + new_pte = huge_pte_mkuffd_wp(old_pte); + huge_ptep_modify_prot_commit(vma, addr, ptep, old_pte, new_pte); + } } #endif /* CONFIG_HUGETLB_PAGE */ From 1b074e3270e1c061c829150c742eb83bad4dddd1 Mon Sep 17 00:00:00 2001 From: "Kiryl Shutsemau (Meta)" Date: Fri, 29 May 2026 18:23:26 +0100 Subject: [PATCH 303/321] fs/proc/task_mmu: use huge_page_size() in pagemap_scan_hugetlb_entry() The partial-page check compares against HPAGE_SIZE (PMD_SIZE), which is wrong for gigantic hugetlb hstates (e.g. 1G). The walker hands the callback a huge_page_size()-sized range, never start + HPAGE_SIZE, so the comparison always declares it partial and aborts the WP. Compare against the actual hstate's page size. Link: https://lore.kernel.org/20260529172331.356655-3-kas@kernel.org Fixes: 52526ca7fdb9 ("fs/proc/task_mmu: implement IOCTL to get and optionally clear info about PTEs") Signed-off-by: Kiryl Shutsemau Reported-by: Sashiko AI review Reviewed-by: Lorenzo Stoakes Reviewed-by: Dev Jain Cc: David Hildenbrand Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Balbir Singh Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e21a38ac745b..1489c67e88f7 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2960,7 +2960,7 @@ static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask, if (~categories & PAGE_IS_WRITTEN) goto out_unlock; - if (end != start + HPAGE_SIZE) { + if (end != start + huge_page_size(hstate_vma(vma))) { /* Partial HugeTLB page WP isn't possible.
*/ pagemap_scan_backout_range(p, start, end); p->arg.walk_end = start; From e92d92bbafb264dc0518d52b846a3c07ed8d523f Mon Sep 17 00:00:00 2001 From: "Kiryl Shutsemau (Meta)" Date: Fri, 29 May 2026 18:23:27 +0100 Subject: [PATCH 304/321] fs/proc/task_mmu: fix hugetlb self-deadlock in pagemap_scan_pte_hole() A PAGEMAP_SCAN ioctl requesting PM_SCAN_WP_MATCHING on a hugetlb VMA hangs the calling thread, unkillably, as soon as the scan reaches an unpopulated part of the range: do_pagemap_scan() walk_page_range() walk_hugetlb_range() hugetlb_vma_lock_read() # take the vma lock for read ... pagemap_scan_pte_hole() # ... ->pte_hole() for a hole uffd_wp_range() change_protection() hugetlb_change_protection() hugetlb_vma_lock_write() # ... and block taking it for write walk_hugetlb_range() holds the hugetlb vma lock for read across the whole walk. A present entry goes to ->hugetlb_entry(); an unpopulated one goes to ->pte_hole(), i.e. pagemap_scan_pte_hole(). To write-protect the hole that handler calls uffd_wp_range(), which on a hugetlb VMA reaches hugetlb_change_protection() and takes the same vma lock for write. The thread then blocks in down_write() waiting for the read lock it is itself holding. The populated path avoids this: pagemap_scan_hugetlb_entry() write-protects the entry inline under the page-table lock and never enters hugetlb_change_protection(). Do the same for holes. Fault in the page table and install the uffd-wp marker directly with make_uffd_wp_huge_pte() under the page-table lock, rather than routing through uffd_wp_range(). That is the same sequence hugetlb_change_protection() runs for an unpopulated entry, minus the vma write lock -- which is safe to skip because PMD sharing is disabled on uffd-wp VMAs (hugetlb_unshare_all_pmds() runs at registration), leaving nothing for that lock to serialise against. 
Link: https://lore.kernel.org/20260529172331.356655-4-kas@kernel.org Fixes: 52526ca7fdb9 ("fs/proc/task_mmu: implement IOCTL to get and optionally clear info about PTEs") Signed-off-by: Kiryl Shutsemau Reported-by: Sashiko AI review Assisted-by: Claude:claude-opus-4-8 Cc: David Hildenbrand Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Balbir Singh Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 59 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 1489c67e88f7..06fb94a965ff 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2977,8 +2977,62 @@ out_unlock: return ret; } + +/* + * Write-protect the unpopulated hugetlb entries covering [addr, end) by + * installing uffd-wp markers inline, exactly as pagemap_scan_hugetlb_entry() + * does for populated entries. + * + * walk_hugetlb_range() currently calls ->pte_hole() once per huge page, so the + * loop normally runs a single iteration; it is written to cover the full range + * in case the walker ever coalesces adjacent holes. + * + * The obvious route -- uffd_wp_range() -> hugetlb_change_protection() -- + * cannot be used here: it takes hugetlb_vma_lock_write(), but the page-table + * walker (walk_hugetlb_range()) already holds hugetlb_vma_lock_read() on the + * same VMA, so the scanning thread would deadlock against itself. PMD sharing + * is disabled on uffd-wp VMAs (hugetlb_unshare_all_pmds() at registration), so + * the vma lock guards nothing that matters for these entries anyway.
+ */ +static int pagemap_scan_hugetlb_hole_wp(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + struct hstate *h = hstate_vma(vma); + unsigned long psize = huge_page_size(h); + struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; + pte_t *ptep; + pte_t pte; + + for (addr = ALIGN_DOWN(addr, psize); addr < end; addr += psize) { + ptep = huge_pte_alloc(mm, vma, addr, psize); + if (!ptep) + return -ENOMEM; + + i_mmap_lock_write(vma->vm_file->f_mapping); + ptl = huge_pte_lock(h, mm, ptep); + pte = huge_ptep_get(mm, addr, ptep); + make_uffd_wp_huge_pte(vma, addr, ptep, pte); + /* + * A none entry has no cached translation, so installing the + * marker needs no TLB flush. Flush only if a fault populated + * the entry between huge_pte_alloc() and the page table lock. + */ + if (!huge_pte_none(pte)) + flush_hugetlb_tlb_range(vma, addr, addr + psize); + spin_unlock(ptl); + i_mmap_unlock_write(vma->vm_file->f_mapping); + } + + return 0; +} #else #define pagemap_scan_hugetlb_entry NULL +static int pagemap_scan_hugetlb_hole_wp(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + return 0; +} #endif static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, @@ -2998,7 +3052,10 @@ static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, if (~p->arg.flags & PM_SCAN_WP_MATCHING) return ret; - err = uffd_wp_range(vma, addr, end - addr, true); + if (is_vm_hugetlb_page(vma)) + err = pagemap_scan_hugetlb_hole_wp(vma, addr, end); + else + err = uffd_wp_range(vma, addr, end - addr, true); if (err < 0) ret = err; From f7e2c21bd1f57cd5350eecdfdb5d6025ca6afbab Mon Sep 17 00:00:00 2001 From: "Kiryl Shutsemau (Meta)" Date: Fri, 29 May 2026 18:23:28 +0100 Subject: [PATCH 305/321] mm/huge_memory: preserve pmd_swp_uffd_wp on device-private PMD downgrade change_non_present_huge_pmd() rewrites a writable device-private PMD swap entry into a readable one without carrying pmd_swp_uffd_wp() across. 
The PTE-level change_softleaf_pte() does this correctly; mirror that here, matching what copy_huge_pmd() does for the fork path. Without the carry, a plain mprotect() over a UFFD_WP-marked device-private THP strips the bit and the trap is bypassed on swap-in. Link: https://lore.kernel.org/20260529172331.356655-5-kas@kernel.org Fixes: 368076f52ebe ("mm/huge_memory: add device-private THP support to PMD operations") Signed-off-by: Kiryl Shutsemau Reported-by: Sashiko AI review Reviewed-by: Balbir Singh Cc: David Hildenbrand Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index da851a5696d5..a5176653ba1f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2565,6 +2565,8 @@ static void change_non_present_huge_pmd(struct mm_struct *mm, } else if (softleaf_is_device_private_write(entry)) { entry = make_readable_device_private_entry(swp_offset(entry)); newpmd = swp_entry_to_pmd(entry); + if (pmd_swp_uffd_wp(*pmd)) + newpmd = pmd_swp_mkuffd_wp(newpmd); } else { newpmd = *pmd; } From 8e80af52db652fbc41320eee45a4f73bc029faf2 Mon Sep 17 00:00:00 2001 From: "Kiryl Shutsemau (Meta)" Date: Fri, 29 May 2026 18:23:29 +0100 Subject: [PATCH 306/321] userfaultfd: gate must_wait writability check on pte_present() userfaultfd_must_wait() and userfaultfd_huge_must_wait() read the PTE without taking the page table lock and then apply pte_write() / huge_pte_write() to it. Those accessors decode bits from the present encoding only; on a swap or migration entry they read the offset bits that happen to share the same position and return an undefined result. The intent of the check is "is this fault still WP-blocked?".
A non-marker swap entry means the page is in transit -- the userfault context the original fault was delivered against is no longer the same, and the swap-in or migration completion path will re-deliver a fresh fault if userspace still needs to handle it. Worst case under the current code, the garbage write bit says "wait", and the thread stays asleep until a UFFDIO_WAKE that may never arrive. Gate the writability check on pte_present() so the lockless re-check only inspects present-PTE bits when the entry is actually present. The non-present, non-marker case returns "don't wait" and lets the fault path retry. Link: https://lore.kernel.org/20260529172331.356655-6-kas@kernel.org Fixes: 369cd2121be4 ("userfaultfd: hugetlbfs: userfaultfd_huge_must_wait for hugepmd ranges") Fixes: 63b2d4174c4a ("userfaultfd: wp: add the writeprotect API to userfaultfd ioctl") Signed-off-by: Kiryl Shutsemau Reported-by: Sashiko AI review Reviewed-by: Lorenzo Stoakes Cc: David Hildenbrand Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Balbir Singh Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index c86daf38d154..246af12bf801 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -2542,6 +2542,15 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, /* UFFD PTE markers require userspace to resolve the fault. */ if (pte_is_uffd_marker(pte)) return true; + /* + * Concurrent migration may have replaced the present PTE with a + * non-marker swap entry between fault delivery and this lockless + * re-check. huge_pte_write() on a swap entry decodes random offset + * bits, so gate it on pte_present(). The migration completion path + * will re-deliver the fault if it still needs userspace.
+ */ + if (!pte_present(pte)) + return false; /* * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to * resolve the fault. @@ -2628,6 +2637,17 @@ again: /* UFFD PTE markers require userspace to resolve the fault. */ if (pte_is_uffd_marker(ptent)) goto out; + /* + * Concurrent swap-out / migration may have replaced the present PTE + * with a non-marker swap entry between fault delivery and this + * lockless re-check. pte_write() on a swap entry decodes random + * offset bits, so gate it on pte_present(). The page-in path will + * re-deliver the fault if it still needs userspace. + */ + if (!pte_present(ptent)) { + ret = false; + goto out; + } /* * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to * resolve the fault. From cc7a9f6e57c4f71e8e1fee3274b1ae8770f2a743 Mon Sep 17 00:00:00 2001 From: "Kiryl Shutsemau (Meta)" Date: Fri, 29 May 2026 18:23:30 +0100 Subject: [PATCH 307/321] userfaultfd: build __VMA_UFFD_FLAGS from config-gated masks The VMA flags bitmap is a single word today: NUM_VMA_FLAG_BITS is BITS_PER_LONG, so on 32-bit vma_flags_t holds only 32 bits. (The bitmap type exists so this can grow past BITS_PER_LONG later; until it does, anything declared above the first word is out of range on 32-bit.) The bit enum nevertheless declares some bits unconditionally above BITS_PER_LONG -- VMA_UFFD_MINOR_BIT is 41, with VM_UFFD_MINOR == VM_NONE on 32-bit so no VMA actually carries the bit. __VMA_UFFD_FLAGS feeds VMA_UFFD_MINOR_BIT to mk_vma_flags() unconditionally. On 32-bit that becomes __set_bit(41, &one_long), a write one word past the end of the single-word bitmap. The compiler folds the out-of-bounds store with wraparound (1UL << (41 % 32) == bit 9) into the first word; bit 9 is already in __VMA_UFFD_FLAGS so the mask happens to come out right today, but it is an out-of-bounds write all the same, and any high-numbered bit whose mod-BITS_PER_LONG position is otherwise unused would silently OR an extra bit into the mask. 
Rather than feed bit numbers that may not exist on the current build to mk_vma_flags(), build the mask from whole per-mode masks that collapse to EMPTY_VMA_FLAGS when their feature is unavailable. Add mk_vma_flags_from_masks() for that, and define VMA_UFFD_MISSING / _WP / _MINOR alongside the VM_UFFD_* flags, gating VMA_UFFD_MINOR on the same config as VM_UFFD_MINOR (which implies 64BIT, where bit 41 fits). An out-of-range bit is then never materialised, on any arch, and the in-range fast path stays a compile-time constant. Link: https://lore.kernel.org/20260529172331.356655-7-kas@kernel.org Fixes: 9ea35a25d51b ("mm: introduce VMA flags bitmap type") Signed-off-by: Kiryl Shutsemau Reported-by: Sashiko AI review Suggested-by: Lorenzo Stoakes Reviewed-by: Lorenzo Stoakes Assisted-by: Claude:claude-opus-4-8 Cc: David Hildenbrand Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Balbir Singh Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton --- include/linux/mm.h | 39 +++++++++++++++++++++++++++++++++++ include/linux/userfaultfd_k.h | 4 ++-- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 0f2612a70fb1..485df9c2dbdd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -496,6 +496,21 @@ enum { #else #define VM_UFFD_MINOR VM_NONE #endif + +/* + * vma_flags_t masks for the userfaultfd VMA flags. VMA_UFFD_MINOR is gated on + * the same config as VM_UFFD_MINOR -- which implies 64BIT, where the bit fits + * -- so an out-of-range bit is never fed to mk_vma_flags() on a build whose + * bitmap cannot hold it.
+ */ +#define VMA_UFFD_MISSING mk_vma_flags(VMA_UFFD_MISSING_BIT) +#define VMA_UFFD_WP mk_vma_flags(VMA_UFFD_WP_BIT) +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR +#define VMA_UFFD_MINOR mk_vma_flags(VMA_UFFD_MINOR_BIT) +#else +#define VMA_UFFD_MINOR EMPTY_VMA_FLAGS +#endif + #ifdef CONFIG_64BIT #define VM_ALLOW_ANY_UNCACHED INIT_VM_FLAG(ALLOW_ANY_UNCACHED) #define VM_SEALED INIT_VM_FLAG(SEALED) @@ -1238,6 +1253,30 @@ static __always_inline void vma_flags_set_mask(vma_flags_t *flags, #define vma_flags_set(flags, ...) \ vma_flags_set_mask(flags, mk_vma_flags(__VA_ARGS__)) +static __always_inline vma_flags_t __mk_vma_flags_from_masks(size_t count, + const vma_flags_t *masks) +{ + vma_flags_t flags = EMPTY_VMA_FLAGS; + size_t i; + + for (i = 0; i < count; i++) + vma_flags_set_mask(&flags, masks[i]); + return flags; +} + +/* + * Combine pre-computed vma_flags_t masks into one value, e.g.: + * + * vma_flags_t flags = mk_vma_flags_from_masks(VMA_UFFD_WP, VMA_UFFD_MINOR); + * + * Unlike mk_vma_flags(), which takes bit numbers, this takes whole masks -- + * each of which may be EMPTY_VMA_FLAGS when its feature is unavailable -- so a + * bit that does not exist on the current build is never materialised. + */ +#define mk_vma_flags_from_masks(...) \ + __mk_vma_flags_from_masks(COUNT_ARGS(__VA_ARGS__), \ + (const vma_flags_t []){__VA_ARGS__}) + /* Clear all of the to-clear flags in flags, non-atomically. */ static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t to_clear) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 3ec8e1071673..68edac4dcd78 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -23,8 +23,8 @@ /* The set of all possible UFFD-related VM flags. 
*/ #define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR) -#define __VMA_UFFD_FLAGS mk_vma_flags(VMA_UFFD_MISSING_BIT, VMA_UFFD_WP_BIT, \ - VMA_UFFD_MINOR_BIT) +#define __VMA_UFFD_FLAGS mk_vma_flags_from_masks(VMA_UFFD_MISSING, VMA_UFFD_WP, \ + VMA_UFFD_MINOR) /* * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining From a71204ec911d0c0e9be20e8e7cadda54e4464e8b Mon Sep 17 00:00:00 2001 From: Nakamura Shuta Date: Fri, 29 May 2026 17:53:16 +0900 Subject: [PATCH 308/321] rust: page: mark Page::nid as inline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When building the kernel, the following Rust symbol is generated: $ nm vmlinux | grep ' _R'.*Page | rustfilt <kernel::page::Page>::nid `Page::nid` is a trivial wrapper around the C function `page_to_nid`. It does not make sense to go through a trivial wrapper for this function, so mark it inline. This follows commit 878620c5a93a ("rust: page: optimize rust symbol generation for Page"), which did the same for `alloc_page` and `drop`. Link: https://github.com/Rust-for-Linux/linux/issues/1145 Link: https://lore.kernel.org/20260529085316.27432-1-nakamura.shuta@gmail.com Signed-off-by: Nakamura Shuta Reviewed-by: Alice Ryhl Reviewed-by: Gary Guo Cc: Andreas Hindborg Cc: Björn Roy Baron Cc: Danilo Krummrich Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Miguel Ojeda Cc: Trevor Gross Signed-off-by: Andrew Morton --- rust/kernel/page.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/rust/kernel/page.rs b/rust/kernel/page.rs index adecb200c654..764bb5acc90a 100644 --- a/rust/kernel/page.rs +++ b/rust/kernel/page.rs @@ -193,6 +193,7 @@ impl Page { } /// Get the node id containing this page. + #[inline] pub fn nid(&self) -> i32 { // SAFETY: Always safe to call with a valid page.
unsafe { bindings::page_to_nid(self.as_ptr()) } From 0b6073ff1574efcdb291bc3d33342f22283f9817 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 1 Jun 2026 16:48:40 +0800 Subject: [PATCH 309/321] mm/sparse-vmemmap: provide generic vmemmap_set_pmd() and vmemmap_check_pmd() Patch series "mm/sparse-vmemmap: Provide generic vmemmap_set_pmd() and vmemmap_check_pmd()", v3. The weak vmemmap_set_pmd() and vmemmap_check_pmd() hooks are currently no-ops in the generic code, which leaves architectures that need PMD-level handling to open-code the same logic locally. This series provides generic implementations for both helpers in mm/sparse-vmemmap.c. vmemmap_set_pmd() installs a huge PMD with PAGE_KERNEL protection, and vmemmap_check_pmd() verifies a present leaf PMD before reusing the existing vmemmap_verify() helper. With those generic helpers in place, patches 2-5 remove the now redundant arch-specific implementations from arm64, riscv, loongarch, and sparc. This patch (of 5): The two weak functions are currently no-ops on every architecture, forcing each platform that needs them to duplicate the same handful of lines. Provide a generic implementation: - vmemmap_set_pmd() simply sets a huge PMD with PAGE_KERNEL protection. - vmemmap_check_pmd() verifies that the PMD is present and leaf, then calls the existing vmemmap_verify() helper. Architectures that need special handling can continue to override the weak symbols; everyone else gets the standard version for free. Link: https://lore.kernel.org/20260601084845.3792171-1-songmuchun@bytedance.com Link: https://lore.kernel.org/20260601084845.3792171-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: David Hildenbrand (Arm) Acked-by: Oscar Salvador (SUSE) Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Catalin Marinas Cc: David S. Miller Cc: Huacai Chen Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/sparse-vmemmap.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 112ccf9c71ca..99e2be39671b 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -386,12 +386,17 @@ int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end, void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, unsigned long addr, unsigned long next) { + WARN_ON_ONCE(!pmd_set_huge(pmd, virt_to_phys(p), PAGE_KERNEL)); } int __weak __meminit vmemmap_check_pmd(pmd_t *pmd, int node, unsigned long addr, unsigned long next) { - return 0; + if (!pmd_leaf(pmdp_get(pmd))) + return 0; + vmemmap_verify((pte_t *)pmd, node, addr, next); + + return 1; } int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end, From f521f198b50adc71171697bf2b7d49c50101def1 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 1 Jun 2026 16:48:41 +0800 Subject: [PATCH 310/321] arm64/mm: drop vmemmap_pmd helpers and use generic code The generic implementations now suffice; remove the arm64 copies. Link: https://lore.kernel.org/20260601084845.3792171-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Will Deacon Reviewed-by: David Hildenbrand (Arm) Reviewed-by: Oscar Salvador (SUSE) Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Catalin Marinas Cc: David S. Miller Cc: Huacai Chen Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: WANG Xuerui Signed-off-by: Andrew Morton --- arch/arm64/mm/mmu.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index e5a42b7a0160..6bbdd400fd46 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1775,20 +1775,6 @@ static void free_empty_tables(unsigned long addr, unsigned long end, } #endif -void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node, - unsigned long addr, unsigned long next) -{ - pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL)); -} - -int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, - unsigned long addr, unsigned long next) -{ - vmemmap_verify((pte_t *)pmdp, node, addr, next); - - return pmd_leaf(READ_ONCE(*pmdp)); -} - int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) { From abff0ecf7602a0881ac9c7b4644aed829d2d20e9 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 1 Jun 2026 16:48:42 +0800 Subject: [PATCH 311/321] riscv/mm: drop vmemmap_pmd helpers and use generic code The generic implementations now suffice; remove the riscv copies. Link: https://lore.kernel.org/20260601084845.3792171-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: David Hildenbrand (Arm) Reviewed-by: Oscar Salvador (SUSE) Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Catalin Marinas Cc: David S. Miller Cc: Huacai Chen Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/riscv/mm/init.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 885f1db4e9bf..5f680eb83e86 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1359,19 +1359,6 @@ void __init misc_mem_init(void) } #ifdef CONFIG_SPARSEMEM_VMEMMAP -void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, - unsigned long addr, unsigned long next) -{ - pmd_set_huge(pmd, virt_to_phys(p), PAGE_KERNEL); -} - -int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, - unsigned long addr, unsigned long next) -{ - vmemmap_verify((pte_t *)pmdp, node, addr, next); - return 1; -} - int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) { From ecca7da924b11775b9d45a6888ac655a9b33ace0 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 1 Jun 2026 16:48:43 +0800 Subject: [PATCH 312/321] loongarch/mm: drop vmemmap_check_pmd helper and use generic code The generic implementations now suffice; remove the loongarch copy. Link: https://lore.kernel.org/20260601084845.3792171-5-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: David Hildenbrand (Arm) Reviewed-by: Oscar Salvador (SUSE) Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Catalin Marinas Cc: David S. Miller Cc: Huacai Chen Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/loongarch/mm/init.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index 687980b6e91f..3407030f3e7a 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -140,17 +140,6 @@ void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, set_pmd_at(&init_mm, addr, pmd, entry); } -int __meminit vmemmap_check_pmd(pmd_t *pmd, int node, - unsigned long addr, unsigned long next) -{ - int huge = pmd_val(pmdp_get(pmd)) & _PAGE_HUGE; - - if (huge) - vmemmap_verify((pte_t *)pmd, node, addr, next); - - return huge; -} - int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) { From d3d58e9469008dc706863a7681fb9ae1856c8a4b Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 1 Jun 2026 16:48:44 +0800 Subject: [PATCH 313/321] sparc/mm: drop vmemmap_check_pmd helper and use generic code The generic implementations now suffice; remove the sparc copy. Link: https://lore.kernel.org/20260601084845.3792171-6-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: David Hildenbrand (Arm) Reviewed-by: Oscar Salvador (SUSE) Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Catalin Marinas Cc: David S. Miller Cc: Huacai Chen Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/sparc/mm/init_64.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 3b679b1d1d72..103db4683b16 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2559,17 +2559,6 @@ void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, pmd_val(*pmd) = pte_base | __pa(p); } -int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, - unsigned long addr, unsigned long next) -{ - int large = pmd_leaf(*pmdp); - - if (large) - vmemmap_verify((pte_t *)pmdp, node, addr, next); - - return large; -} - int __meminit vmemmap_populate(unsigned long vstart, unsigned long vend, int node, struct vmem_altmap *altmap) { From c55dd3b46c1208d6d2ea737a8aefef4aa4c70cb8 Mon Sep 17 00:00:00 2001 From: Hui Zhu Date: Fri, 29 May 2026 09:41:30 +0800 Subject: [PATCH 314/321] vmalloc: fix NULL pointer dereference in is_vm_area_hugepages() find_vm_area() can return NULL if the given address is not a valid vmalloc area. Check the return value before dereferencing it to avoid a kernel crash. Link: https://lore.kernel.org/20260529014130.671291-1-hui.zhu@linux.dev Fixes: 121e6f3258fe ("mm/vmalloc: hugepage vmalloc mappings") Signed-off-by: Hui Zhu Reviewed-by: Dev Jain Reviewed-by: Uladzislau Rezki (Sony) Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- include/linux/vmalloc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 3b02c0c6b371..d87dc7f77f4e 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -265,7 +265,9 @@ static inline bool is_vm_area_hugepages(const void *addr) * allocated in the vmalloc layer. 
*/ #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC - return find_vm_area(addr)->page_order > 0; + struct vm_struct *area = find_vm_area(addr); + + return area && area->page_order > 0; #else return false; #endif From a51cbdf02aec619f90db7e9f06e295adb8009d4d Mon Sep 17 00:00:00 2001 From: tanze Date: Mon, 1 Jun 2026 19:04:23 +0800 Subject: [PATCH 315/321] mm/filemap: use folio_next_index() for start Use folio_next_index() instead of open-coding folio->index + folio_nr_pages(folio) when updating @start in filemap_get_folios_contig(), filemap_get_folios_tag(), and filemap_get_folios_dirty(). Link: https://lore.kernel.org/20260601110425.44784-1-tanze@kylinos.cn Signed-off-by: tanze Reviewed-by: Jan Kara Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 98434acc69c1..5d9f9b36e9d8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2284,8 +2284,7 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, goto put_folio; if (!folio_batch_add(fbatch, folio)) { - nr = folio_nr_pages(folio); - *start = folio->index + nr; + *start = folio_next_index(folio); goto out; } xas_advance(&xas, folio_next_index(folio) - 1); @@ -2345,8 +2344,7 @@ unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, if (xa_is_value(folio)) continue; if (!folio_batch_add(fbatch, folio)) { - unsigned long nr = folio_nr_pages(folio); - *start = folio->index + nr; + *start = folio_next_index(folio); goto out; } } @@ -2404,8 +2402,7 @@ unsigned filemap_get_folios_dirty(struct address_space *mapping, pgoff_t *start, } } if (!folio_batch_add(fbatch, folio)) { - unsigned long nr = folio_nr_pages(folio); - *start = folio->index + nr; + *start = folio_next_index(folio); goto out; } } From c13a0316aef5f4b73e8b4bf6943737f836d65e1d Mon Sep 17 00:00:00 2001 From: Youngjun Park Date: Tue, 24 Mar 2026 01:08:21 +0900 Subject: [PATCH 316/321] mm/swap, PM: 
hibernate: fix swapoff race in uswsusp by pinning swap device Patch series "mm/swap, PM: hibernate: fix swapoff race in uswsusp by pinning swap device", v8. Currently, in the uswsusp path, only the swap type value is retrieved at lookup time without holding a reference. If swapoff races after the type is acquired, subsequent slot allocations operate on a stale swap device. Additionally, grabbing and releasing the swap device reference on every slot allocation is inefficient across the entire hibernation swap path. This patch series addresses these issues: - Patch 1: Fixes the swapoff race in uswsusp by pinning the swap device from the point it is looked up until the session completes. - Patch 2: Removes the overhead of per-slot reference counting in alloc/free paths and cleans up the redundant SWP_WRITEOK check. This patch (of 2): Hibernation via uswsusp (/dev/snapshot ioctls) has a race window: after selecting the resume swap area but before user space is frozen, swapoff may run and invalidate the selected swap device. Fix this by pinning the swap device with SWP_HIBERNATION while it is in use. The pin is exclusive, which is sufficient since hibernate_acquire() already prevents concurrent hibernation sessions. The kernel swsusp path (sysfs-based hibernate/resume) uses find_hibernation_swap_type() which is not affected by the pin. It freezes user space before touching swap, so swapoff cannot race. Introduce dedicated helpers: - pin_hibernation_swap_type(): Look up and pin the swap device. Used by the uswsusp path. - find_hibernation_swap_type(): Lookup without pinning. Used by the kernel swsusp path. - unpin_hibernation_swap_type(): Clear the hibernation pin. While a swap device is pinned, swapoff is prevented from proceeding. 
Link: https://lore.kernel.org/20260323160822.1409904-1-youngjun.park@lge.com Link: https://lore.kernel.org/20260323160822.1409904-2-youngjun.park@lge.com Signed-off-by: Youngjun Park Reviewed-by: Kairui Song Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Kemeng Shi Cc: Nhat Pham Cc: "Rafael J . Wysocki" Signed-off-by: Andrew Morton --- include/linux/swap.h | 5 +- kernel/power/swap.c | 2 +- kernel/power/user.c | 15 ++++- mm/swapfile.c | 137 +++++++++++++++++++++++++++++++++++++------ 4 files changed, 137 insertions(+), 22 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 8c43bc3055c9..8f0f68e245ba 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -213,6 +213,7 @@ enum { SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ + SWP_HIBERNATION = (1 << 13), /* pinned for hibernation */ /* add others here before... */ }; @@ -432,7 +433,9 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -int swap_type_of(dev_t device, sector_t offset); +extern int pin_hibernation_swap_type(dev_t device, sector_t offset); +extern void unpin_hibernation_swap_type(int type); +extern int find_hibernation_swap_type(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); extern sector_t swapdev_block(int, pgoff_t); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 2e64869bb5a0..cc4764149e8f 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -341,7 +341,7 @@ static int swsusp_swap_check(void) * This is called before saving the image. 
*/ if (swsusp_resume_device) - res = swap_type_of(swsusp_resume_device, swsusp_resume_block); + res = find_hibernation_swap_type(swsusp_resume_device, swsusp_resume_block); else res = find_first_swap(&swsusp_resume_device); if (res < 0) diff --git a/kernel/power/user.c b/kernel/power/user.c index be77f3556bd7..d0fcfba7ac23 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -71,7 +71,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) memset(&data->handle, 0, sizeof(struct snapshot_handle)); if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { /* Hibernating. The image device should be accessible. */ - data->swap = swap_type_of(swsusp_resume_device, 0); + data->swap = pin_hibernation_swap_type(swsusp_resume_device, 0); data->mode = O_RDONLY; data->free_bitmaps = false; error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); @@ -90,8 +90,10 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->free_bitmaps = !error; } } - if (error) + if (error) { + unpin_hibernation_swap_type(data->swap); hibernate_release(); + } data->frozen = false; data->ready = false; @@ -115,6 +117,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) data = filp->private_data; data->dev = 0; free_all_swap_pages(data->swap); + unpin_hibernation_swap_type(data->swap); if (data->frozen) { pm_restore_gfp_mask(); free_basic_memory_bitmaps(); @@ -235,11 +238,17 @@ static int snapshot_set_swap_area(struct snapshot_data *data, offset = swap_area.offset; } + /* + * Unpin the swap device if a swap area was already + * set by SNAPSHOT_SET_SWAP_AREA. + */ + unpin_hibernation_swap_type(data->swap); + /* * User space encodes device types as two-byte values, * so we need to recode them */ - data->swap = swap_type_of(swdev, offset); + data->swap = pin_hibernation_swap_type(swdev, offset); if (data->swap < 0) return swdev ? 
-ENODEV : -EINVAL; data->dev = swdev; diff --git a/mm/swapfile.c b/mm/swapfile.c index 615d90867111..5e1e605ad9a1 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -132,7 +132,7 @@ static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = { /* May return NULL on invalid type, caller must check for NULL return */ static struct swap_info_struct *swap_type_to_info(int type) { - if (type >= MAX_SWAPFILES) + if (type < 0 || type >= MAX_SWAPFILES) return NULL; return READ_ONCE(swap_info[type]); /* rcu_dereference() */ } @@ -2199,22 +2199,15 @@ void swap_free_hibernation_slot(swp_entry_t entry) put_swap_device(si); } -/* - * Find the swap type that corresponds to given device (if any). - * - * @offset - number of the PAGE_SIZE-sized block of the device, starting - * from 0, in which the swap header is expected to be located. - * - * This is needed for the suspend to disk (aka swsusp). - */ -int swap_type_of(dev_t device, sector_t offset) +static int __find_hibernation_swap_type(dev_t device, sector_t offset) { int type; - if (!device) - return -1; + lockdep_assert_held(&swap_lock); + + if (!device) + return -EINVAL; - spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { struct swap_info_struct *sis = swap_info[type]; @@ -2224,16 +2217,118 @@ int swap_type_of(dev_t device, sector_t offset) if (device == sis->bdev->bd_dev) { struct swap_extent *se = first_se(sis); - if (se->start_block == offset) { - spin_unlock(&swap_lock); + if (se->start_block == offset) return type; - } } } - spin_unlock(&swap_lock); return -ENODEV; } +/** + * pin_hibernation_swap_type - Pin the swap device for hibernation + * @device: Block device containing the resume image + * @offset: Offset identifying the swap area + * + * Locate the swap device for @device/@offset and mark it as pinned + * for hibernation. While pinned, swapoff() is prevented. + * + * Only one uswsusp context may pin a swap device at a time. + * If already pinned, this function returns -EBUSY. 
+ * + * Return: + * >= 0 on success (swap type). + * -EINVAL if @device is invalid. + * -ENODEV if the swap device is not found. + * -EBUSY if the device is already pinned for hibernation. + */ +int pin_hibernation_swap_type(dev_t device, sector_t offset) +{ + int type; + struct swap_info_struct *si; + + spin_lock(&swap_lock); + + type = __find_hibernation_swap_type(device, offset); + if (type < 0) { + spin_unlock(&swap_lock); + return type; + } + + si = swap_type_to_info(type); + if (WARN_ON_ONCE(!si)) { + spin_unlock(&swap_lock); + return -ENODEV; + } + + /* + * hibernate_acquire() prevents concurrent hibernation sessions. + * This check additionally guards against double-pinning within + * the same session. + */ + if (WARN_ON_ONCE(si->flags & SWP_HIBERNATION)) { + spin_unlock(&swap_lock); + return -EBUSY; + } + + si->flags |= SWP_HIBERNATION; + + spin_unlock(&swap_lock); + return type; +} + +/** + * unpin_hibernation_swap_type - Unpin the swap device for hibernation + * @type: Swap type previously returned by pin_hibernation_swap_type() + * + * Clear the hibernation pin on the given swap device, allowing + * swapoff() to proceed normally. + * + * If @type does not refer to a valid swap device, this function + * does nothing. + */ +void unpin_hibernation_swap_type(int type) +{ + struct swap_info_struct *si; + + spin_lock(&swap_lock); + si = swap_type_to_info(type); + if (!si) { + spin_unlock(&swap_lock); + return; + } + si->flags &= ~SWP_HIBERNATION; + spin_unlock(&swap_lock); +} + +/** + * find_hibernation_swap_type - Find swap type for hibernation + * @device: Block device containing the resume image + * @offset: Offset within the device identifying the swap area + * + * Locate the swap device corresponding to @device and @offset. + * + * Unlike pin_hibernation_swap_type(), this function only performs a + * lookup and does not mark the swap device as pinned for hibernation. 
+ * + * This is safe in the sysfs-based hibernation path where user space + * is already frozen and swapoff() cannot run concurrently. + * + * Return: + * A non-negative swap type on success. + * -EINVAL if @device is invalid. + * -ENODEV if no matching swap device is found. + */ +int find_hibernation_swap_type(dev_t device, sector_t offset) +{ + int type; + + spin_lock(&swap_lock); + type = __find_hibernation_swap_type(device, offset); + spin_unlock(&swap_lock); + + return type; +} + int find_first_swap(dev_t *device) { int type; @@ -2996,6 +3091,14 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); goto out_dput; } + + /* Refuse swapoff while the device is pinned for hibernation */ + if (p->flags & SWP_HIBERNATION) { + err = -EBUSY; + spin_unlock(&swap_lock); + goto out_dput; + } + if (!security_vm_enough_memory_mm(current->mm, p->pages)) vm_unacct_memory(p->pages); else { From 0d97349679c5fe9941d283715ca109d61bbdc06e Mon Sep 17 00:00:00 2001 From: Youngjun Park Date: Tue, 24 Mar 2026 01:08:22 +0900 Subject: [PATCH 317/321] mm/swap: remove redundant swap device reference in alloc/free In the previous commit, uswsusp was modified to pin the swap device when the swap type is determined, ensuring the device remains valid throughout the hibernation I/O path. Therefore, it is no longer necessary to repeatedly get and put the swap device reference for each swap slot allocation and free operation. For hibernation via the sysfs interface, user-space tasks are frozen before swap allocation begins, so swapoff cannot race with allocation. After resume, tasks remain frozen while swap slots are freed, so additional reference management is not required there either. Remove the redundant swap device get/put operations from the hibernation swap allocation and free paths. Also remove the SWP_WRITEOK check before allocation, as the cluster allocation logic already validates the swap device state. 
Update function comments to document the caller's responsibility for ensuring swap device stability. Link: https://lore.kernel.org/20260323160822.1409904-3-youngjun.park@lge.com Signed-off-by: Youngjun Park Reviewed-by: Kairui Song Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Kemeng Shi Cc: Nhat Pham Cc: "Rafael J . Wysocki" Signed-off-by: Andrew Morton --- mm/swapfile.c | 68 +++++++++++++++++++++++++++------------------------ 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 5e1e605ad9a1..78b49b0658ad 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2138,7 +2138,16 @@ out: } #ifdef CONFIG_HIBERNATION -/* Allocate a slot for hibernation */ +/** + * swap_alloc_hibernation_slot() - Allocate a swap slot for hibernation. + * @type: swap device type index to allocate from. + * + * The caller must ensure the swap device is stable, either by pinning + * it (SWP_HIBERNATION) or by freezing user-space. + * + * Return: a valid swp_entry_t on success, or an empty entry (val == 0) + * on failure. + */ swp_entry_t swap_alloc_hibernation_slot(int type) { struct swap_info_struct *pcp_si, *si = swap_type_to_info(type); @@ -2149,46 +2158,42 @@ swp_entry_t swap_alloc_hibernation_slot(int type) if (!si) goto fail; - /* This is called for allocating swap entry, not cache */ - if (get_swap_device_info(si)) { - if (si->flags & SWP_WRITEOK) { - /* - * Try the local cluster first if it matches the device. If - * not, try grab a new cluster and override local cluster. 
- */ - local_lock(&percpu_swap_cluster.lock); - pcp_si = this_cpu_read(percpu_swap_cluster.si[0]); - pcp_offset = this_cpu_read(percpu_swap_cluster.offset[0]); - if (pcp_si == si && pcp_offset) { - ci = swap_cluster_lock(si, pcp_offset); - if (cluster_is_usable(ci, 0)) - offset = alloc_swap_scan_cluster(si, ci, NULL, pcp_offset); - else - swap_cluster_unlock(ci); - } - if (!offset) - offset = cluster_alloc_swap_entry(si, NULL); - local_unlock(&percpu_swap_cluster.lock); - if (offset) - entry = swp_entry(si->type, offset); - } - put_swap_device(si); + /* + * Try the local cluster first if it matches the device. If + * not, try grab a new cluster and override local cluster. + */ + local_lock(&percpu_swap_cluster.lock); + pcp_si = this_cpu_read(percpu_swap_cluster.si[0]); + pcp_offset = this_cpu_read(percpu_swap_cluster.offset[0]); + if (pcp_si == si && pcp_offset) { + ci = swap_cluster_lock(si, pcp_offset); + if (cluster_is_usable(ci, 0)) + offset = alloc_swap_scan_cluster(si, ci, NULL, pcp_offset); + else + swap_cluster_unlock(ci); } + if (!offset) + offset = cluster_alloc_swap_entry(si, NULL); + local_unlock(&percpu_swap_cluster.lock); + if (offset) + entry = swp_entry(si->type, offset); + fail: return entry; } -/* Free a slot allocated by swap_alloc_hibernation_slot */ +/** + * swap_free_hibernation_slot() - Free a swap slot allocated for hibernation. + * @entry: swap entry to free. + * + * The caller must ensure the swap device is stable. 
+ */ void swap_free_hibernation_slot(swp_entry_t entry) { - struct swap_info_struct *si; + struct swap_info_struct *si = __swap_entry_to_info(entry); struct swap_cluster_info *ci; pgoff_t offset = swp_offset(entry); - si = get_swap_device(entry); - if (WARN_ON(!si)) - return; - ci = swap_cluster_lock(si, offset); __swap_cluster_put_entry(ci, offset % SWAPFILE_CLUSTER); __swap_cluster_free_entries(si, ci, offset % SWAPFILE_CLUSTER, 1); @@ -2196,7 +2201,6 @@ void swap_free_hibernation_slot(swp_entry_t entry) /* In theory readahead might add it to the swap cache by accident */ __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); - put_swap_device(si); } static int __find_hibernation_swap_type(dev_t device, sector_t offset) From 32a2b73ec232b284b029d34bcfaa9a7f424151d2 Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Wed, 3 Jun 2026 23:17:25 -0700 Subject: [PATCH 318/321] mm/compaction: cap compact_gap() at COMPACT_CLUSTER_MAX compact_gap() returns 2 << order, which is used as watermark headroom in __compaction_suitable() and as a threshold in kswapd reclaim decisions. The computed value scales exponentially by order. For order-9 THP allocations this evaluates to 1024 pages, but the compaction free scanner's working set is bounded by COMPACT_CLUSTER_MAX (32 pages). The scanner stops isolating free pages once it matches the migration batch. The current gap over-reserves by 32x. On fragmented production hosts, kswapd will try to reclaim up to the gap, but it only reaches that threshold in 18% of attempts. As a result, reclaim continues in the majority of cases despite many lower-order free pages being available. The over-sized gap also causes 46% of order-9 compaction suitability checks to fail unnecessarily: the zone has sufficient free pages for the scanner to operate, but not enough to clear the inflated threshold. Cap compact_gap() at COMPACT_CLUSTER_MAX so the watermark headroom reflects the scanner's actual capacity. This function is used by two key heuristics. 
The first is when kswapd can stop high-order reclaim and downgrade to order-0 balancing, allowing kcompactd to be woken for the original higher allocation order. The second is zone suitability checking, where the smaller gap allows compaction to start sooner. Note that orders 0-4 are unaffected since their gap is already less than or equal to COMPACT_CLUSTER_MAX. A/B test on v6.13-based Instagram production hosts (64GB, 60s measurement): Unpatched (43 hosts) pgscan_kswapd (mean/host): ~1.6M reclaim efficiency (steal/scan): 83.8% per-compaction success (success/stall): 2.1% THP success (alloc/alloc+fallback): 4.9% forced lru_add_drain (mean/host): ~107K Patched (59 hosts) pgscan_kswapd (mean/host): ~449K reclaim efficiency (steal/scan): 91.0% per-compaction success (success/stall): 28.3% THP success (alloc/alloc+fallback): 17.2% forced lru_add_drain (mean/host): ~64K Additional tests were also performed using a workload of similar shape and based on mm-new at the time of testing. Across three 60s runs, the patch showed improvements consistent with the previous test: reduced kswapd reclaim and fewer THP fault fallbacks. 
Unpatched kswapd_shrink_node downgrade to order-0 (mean): 0 thp_fault_fallback (mean): 1217 pgscan_kswapd (mean): 6328 pgsteal_kswapd (mean): 5657 Patched kswapd_shrink_node downgrade to order-0 (mean): 28 thp_fault_fallback (mean): 738 pgscan_kswapd (mean): 3773 pgsteal_kswapd (mean): 3243 Link: https://lore.kernel.org/20260604061725.13800-1-jp.kobryn@linux.dev Signed-off-by: JP Kobryn (Meta) Reviewed-by: Vlastimil Babka (SUSE) Cc: Brendan Jackman Cc: Johannes Weiner Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/compaction.h | 8 ++++---- mm/vmscan.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index c829c48d1c71..f29ef0653546 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -2,6 +2,8 @@ #ifndef _LINUX_COMPACTION_H #define _LINUX_COMPACTION_H +#include + /* * Determines how hard direct compaction should try to succeed. * Lower value means higher priority, analogically to reclaim priority. @@ -73,11 +75,9 @@ static inline unsigned long compact_gap(unsigned int order) * effectively limited by COMPACT_CLUSTER_MAX, as that's the maximum * that the migrate scanner can have isolated on migrate list, and free * scanner is only invoked when the number of isolated free pages is - * lower than that. But it's not worth to complicate the formula here - * as a bigger gap for higher orders than strictly necessary can also - * improve chances of compaction success. + * lower than that. */ - return 2UL << order; + return min(2UL << order, COMPACT_CLUSTER_MAX); } static inline int current_is_kcompactd(void) diff --git a/mm/vmscan.c b/mm/vmscan.c index e8a90911bf88..3f3ff25e561a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -7014,7 +7014,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, /* * Fragmentation may mean that the system cannot be rebalanced for - * high-order allocations. 
If twice the allocation size has been + * high-order allocations. If at least the compaction gap has been * reclaimed then recheck watermarks only at order-0 to prevent * excessive reclaim. Assume that a process requested a high-order * can direct reclaim/compact. From 8198657c74170ea78808d1f1d886c7d35fd3694e Mon Sep 17 00:00:00 2001 From: Qiang Liu Date: Thu, 21 May 2026 10:18:58 +0800 Subject: [PATCH 319/321] lib/test_hmm: check alloc_page_vma() return value and handle OOM Check alloc_page_vma() return status for page allocation failures, free allocated pages and return VM_FAULT_OOM on error. Handle return codes of dmirror_devmem_fault_alloc_and_copy(), call migrate_vma_finalize() to remove migration entries from migrate_vma_setup(). Link: https://lore.kernel.org/20260521021858.21511-1-liuqiangneo@163.com Signed-off-by: Qiang Liu Cc: Alistair Popple Cc: Jason Gunthorpe Cc: Leon Romanovsky [akpm@linux-foundation.org: fix dmirror_devmem_fault_alloc_and_copy() retval handling] Link: https://lore.kernel.org/oe-kbuild-all/202606011329.zWs2BKy4-lkp@intel.com/ Signed-off-by: Andrew Morton --- lib/test_hmm.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 35f774ed2d99..9c59d1ceb5b5 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -1063,6 +1063,25 @@ static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, /* Try with smaller pages if large allocation fails */ if (!dpage && order) { dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); + if (!dpage) { + /* Unlock and free pages already allocated. */ + while (i > 0) { + struct page *fpage; + + fpage = migrate_pfn_to_page(dst[--i]); + unlock_page(fpage); + __free_page(fpage); + } + /* Clear remaining dst entries to avoid + * migrate_vma_pages/finalize() using + * uninitialized values. 
+ */ + while (i < (1 << order)) { + dst[i] = 0; + i++; + } + return VM_FAULT_OOM; + } lock_page(dpage); dst[i] = migrate_pfn(page_to_pfn(dpage)); dst_page = pfn_to_page(page_to_pfn(dpage)); @@ -1148,7 +1167,11 @@ static int dmirror_migrate_to_system(struct dmirror *dmirror, goto out; pr_debug("Migrating from device mem to sys mem\n"); - dmirror_devmem_fault_alloc_and_copy(&args, dmirror); + if (dmirror_devmem_fault_alloc_and_copy(&args, dmirror)) { + migrate_vma_finalize(&args); + ret = -ENOMEM; + goto out; + } migrate_vma_pages(&args); cmd->cpages += dmirror_successful_migrated_pages(&args); @@ -1689,8 +1712,10 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) } ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror); - if (ret) + if (ret) { + migrate_vma_finalize(&args); goto err; + } migrate_vma_pages(&args); /* * No device finalize step is needed since From cd1fc0e3c1f67c0c31dfc215e5d9b771133dedc0 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Thu, 4 Jun 2026 05:53:05 +0000 Subject: [PATCH 320/321] fs/proc/task_mmu: do not warn on seeing non-migration pmd entry Patch series "mm/hmm: A fix and a selftest", v3. Patch 1 fixes a stale warning present from the time when only migration softleaf entries were supported at the PMD level. Patch 2 adds some code into hmm-tests.c which exercises the pagemap path for PMD device-private entries. This patch (of 2): pagemap_pmd_range_thp() warns if a non-present PMD is not a migration entry. This became false once device-private entries at the PMD level were added. Therefore, remove the stale migration-only assertion. 
Link: https://lore.kernel.org/20260604055308.1947679-1-dev.jain@arm.com Link: https://lore.kernel.org/20260604055308.1947679-2-dev.jain@arm.com Fixes: a30b48bf1b24 ("mm/migrate_device: implement THP migration of zone device pages") Signed-off-by: Dev Jain Reviewed-by: Balbir Singh Reviewed-by: Lorenzo Stoakes Tested-by: Lorenzo Stoakes Acked-by: David Hildenbrand (Arm) Reviewed-by: Oscar Salvador (SUSE) Cc: Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 06fb94a965ff..d32408f7cd5e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2129,7 +2129,6 @@ static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr, flags |= PM_SOFT_DIRTY; if (pmd_swp_uffd_wp(pmd)) flags |= PM_UFFD_WP; - VM_WARN_ON_ONCE(!pmd_is_migration_entry(pmd)); page = softleaf_to_page(entry); } From e3d8707358ea76b78bdec9928937bb9a797f2c8f Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Thu, 4 Jun 2026 05:53:06 +0000 Subject: [PATCH 321/321] selftests/mm/hmm-tests: test pagemap reads of PMD device-private entries To cover pagemap paths scanning PMD entries, add assertions to check whether a device-private PMD entry has the correct pagemap information - the PM_SWAP bit must be on in the pagemap entry. Before that, we must assert through HMM_DMIRROR_SNAPSHOT snapshot that the leaf entry is at PMD level and not PTE level. 
Link: https://lore.kernel.org/20260604055308.1947679-3-dev.jain@arm.com Signed-off-by: Dev Jain Reviewed-by: Lorenzo Stoakes Cc: Balbir Singh Cc: David Hildenbrand (Arm) Cc: Oscar Salvador (SUSE) Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hmm-tests.c | 34 ++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c index 7a4daadfb0c8..6a23c09ac2da 100644 --- a/tools/testing/selftests/mm/hmm-tests.c +++ b/tools/testing/selftests/mm/hmm-tests.c @@ -2274,8 +2274,11 @@ TEST_F(hmm, migrate_anon_huge_fault) unsigned long npages; unsigned long size; unsigned long i; + unsigned char *m; + uint64_t entry; void *old_ptr; void *map; + int pagemap_fd; int *ptr; int ret; @@ -2298,8 +2301,6 @@ TEST_F(hmm, migrate_anon_huge_fault) npages = size >> self->page_shift; map = (void *)ALIGN((uintptr_t)buffer->ptr, size); - ret = madvise(map, size, MADV_HUGEPAGE); - ASSERT_EQ(ret, 0); old_ptr = buffer->ptr; buffer->ptr = map; @@ -2307,6 +2308,9 @@ TEST_F(hmm, migrate_anon_huge_fault) for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) ptr[i] = i; + ret = madvise(map, size, MADV_COLLAPSE); + ASSERT_EQ(ret, 0); + /* Migrate memory to device. 
*/ ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); ASSERT_EQ(ret, 0); @@ -2316,6 +2320,32 @@ TEST_F(hmm, migrate_anon_huge_fault) for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) ASSERT_EQ(ptr[i], i); + if (!hmm_is_coherent_type(variant->device_number)) { + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, + buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + m = buffer->mirror; + for (i = 0; i < npages; ++i) + ASSERT_EQ(m[i], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL | + HMM_DMIRROR_PROT_WRITE | + HMM_DMIRROR_PROT_PMD); + + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + ASSERT_GE(pagemap_fd, 0); + + for (i = 0; i < npages; ++i) { + entry = pagemap_get_entry(pagemap_fd, + (char *)buffer->ptr + i * self->page_size); + + ASSERT_NE(entry & PM_SWAP, 0); + ASSERT_FALSE(PAGEMAP_PRESENT(entry)); + } + + close(pagemap_fd); + } + /* Fault pages back to system memory and check them. */ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) ASSERT_EQ(ptr[i], i);