mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2026-04-08 12:02:33 +02:00
Patch series "mm/khugepaged: fix dirty page handling for MADV_COLLAPSE",
v5.
MADV_COLLAPSE on file-backed mappings fails with -EINVAL when TEXT pages
are dirty. This affects scenarios like package/container updates or
executing binaries immediately after writing them, etc.
The issue is that collapse_file() triggers async writeback and returns
SCAN_FAIL (maps to -EINVAL), expecting khugepaged to revisit later. But
MADV_COLLAPSE is synchronous and userspace expects immediate success or
a clear retry signal.
Reproduction:
- Compile or copy 2MB-aligned executable to XFS/ext4 FS
- Call MADV_COLLAPSE on .text section
- First call fails with -EINVAL (text pages dirty from copy)
- Second call succeeds (async writeback completed)
Issue Report:
https://lore.kernel.org/all/4e26fe5e-7374-467c-a333-9dd48f85d7cc@amd.com
This patch (of 2):
When collapse_file encounters dirty or writeback pages in file-backed
mappings, it currently returns SCAN_FAIL which maps to -EINVAL. This is
misleading as EINVAL suggests invalid arguments, whereas dirty/writeback
pages represent transient conditions that may resolve on retry.
Introduce SCAN_PAGE_DIRTY_OR_WRITEBACK to cover both dirty and writeback
states, mapping it to -EAGAIN. For MADV_COLLAPSE, this provides userspace
with a clear signal that retry may succeed after writeback completes. For
khugepaged, this is harmless as it will naturally revisit the range during
periodic scans after async writeback completes.
Link: https://lkml.kernel.org/r/20260118190939.8986-2-shivankg@amd.com
Link: https://lkml.kernel.org/r/20260118190939.8986-4-shivankg@amd.com
Fixes: 34488399fa ("mm/madvise: add file and shmem support to MADV_COLLAPSE")
Signed-off-by: Shivank Garg <shivankg@amd.com>
Reported-by: Branden Moore <Branden.Moore@amd.com>
Closes: https://lore.kernel.org/all/4e26fe5e-7374-467c-a333-9dd48f85d7cc@amd.com
Reviewed-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: wang lian <lianux.mm@gmail.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Barry Song <baohua@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Zach O'Keefe <zokeefe@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
242 lines
6.3 KiB
C
242 lines
6.3 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#undef TRACE_SYSTEM
|
|
#define TRACE_SYSTEM huge_memory
|
|
|
|
#if !defined(__HUGE_MEMORY_H) || defined(TRACE_HEADER_MULTI_READ)
|
|
#define __HUGE_MEMORY_H
|
|
|
|
#include <linux/tracepoint.h>
|
|
|
|
#define SCAN_STATUS \
|
|
EM( SCAN_FAIL, "failed") \
|
|
EM( SCAN_SUCCEED, "succeeded") \
|
|
EM( SCAN_NO_PTE_TABLE, "no_pte_table") \
|
|
EM( SCAN_PMD_MAPPED, "page_pmd_mapped") \
|
|
EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \
|
|
EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \
|
|
EM( SCAN_EXCEED_SHARED_PTE, "exceed_shared_pte") \
|
|
EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \
|
|
EM( SCAN_PTE_UFFD_WP, "pte_uffd_wp") \
|
|
EM( SCAN_PTE_MAPPED_HUGEPAGE, "pte_mapped_hugepage") \
|
|
EM( SCAN_LACK_REFERENCED_PAGE, "lack_referenced_page") \
|
|
EM( SCAN_PAGE_NULL, "page_null") \
|
|
EM( SCAN_SCAN_ABORT, "scan_aborted") \
|
|
EM( SCAN_PAGE_COUNT, "not_suitable_page_count") \
|
|
EM( SCAN_PAGE_LRU, "page_not_in_lru") \
|
|
EM( SCAN_PAGE_LOCK, "page_locked") \
|
|
EM( SCAN_PAGE_ANON, "page_not_anon") \
|
|
EM( SCAN_PAGE_COMPOUND, "page_compound") \
|
|
EM( SCAN_ANY_PROCESS, "no_process_for_page") \
|
|
EM( SCAN_VMA_NULL, "vma_null") \
|
|
EM( SCAN_VMA_CHECK, "vma_check_failed") \
|
|
EM( SCAN_ADDRESS_RANGE, "not_suitable_address_range") \
|
|
EM( SCAN_DEL_PAGE_LRU, "could_not_delete_page_from_lru")\
|
|
EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \
|
|
EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \
|
|
EM( SCAN_TRUNCATED, "truncated") \
|
|
EM( SCAN_PAGE_HAS_PRIVATE, "page_has_private") \
|
|
EM( SCAN_STORE_FAILED, "store_failed") \
|
|
EM( SCAN_COPY_MC, "copy_poisoned_page") \
|
|
EM( SCAN_PAGE_FILLED, "page_filled") \
|
|
EMe(SCAN_PAGE_DIRTY_OR_WRITEBACK, "page_dirty_or_writeback")
|
|
|
|
#undef EM
|
|
#undef EMe
|
|
#define EM(a, b) TRACE_DEFINE_ENUM(a);
|
|
#define EMe(a, b) TRACE_DEFINE_ENUM(a);
|
|
|
|
SCAN_STATUS
|
|
|
|
#undef EM
|
|
#undef EMe
|
|
#define EM(a, b) {a, b},
|
|
#define EMe(a, b) {a, b}
|
|
|
|
TRACE_EVENT(mm_khugepaged_scan_pmd,
|
|
|
|
TP_PROTO(struct mm_struct *mm, struct folio *folio,
|
|
int referenced, int none_or_zero, int status, int unmapped),
|
|
|
|
TP_ARGS(mm, folio, referenced, none_or_zero, status, unmapped),
|
|
|
|
TP_STRUCT__entry(
|
|
__field(struct mm_struct *, mm)
|
|
__field(unsigned long, pfn)
|
|
__field(int, referenced)
|
|
__field(int, none_or_zero)
|
|
__field(int, status)
|
|
__field(int, unmapped)
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->mm = mm;
|
|
__entry->pfn = folio ? folio_pfn(folio) : -1;
|
|
__entry->referenced = referenced;
|
|
__entry->none_or_zero = none_or_zero;
|
|
__entry->status = status;
|
|
__entry->unmapped = unmapped;
|
|
),
|
|
|
|
TP_printk("mm=%p, scan_pfn=0x%lx, referenced=%d, none_or_zero=%d, status=%s, unmapped=%d",
|
|
__entry->mm,
|
|
__entry->pfn,
|
|
__entry->referenced,
|
|
__entry->none_or_zero,
|
|
__print_symbolic(__entry->status, SCAN_STATUS),
|
|
__entry->unmapped)
|
|
);
|
|
|
|
TRACE_EVENT(mm_collapse_huge_page,
|
|
|
|
TP_PROTO(struct mm_struct *mm, int isolated, int status),
|
|
|
|
TP_ARGS(mm, isolated, status),
|
|
|
|
TP_STRUCT__entry(
|
|
__field(struct mm_struct *, mm)
|
|
__field(int, isolated)
|
|
__field(int, status)
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->mm = mm;
|
|
__entry->isolated = isolated;
|
|
__entry->status = status;
|
|
),
|
|
|
|
TP_printk("mm=%p, isolated=%d, status=%s",
|
|
__entry->mm,
|
|
__entry->isolated,
|
|
__print_symbolic(__entry->status, SCAN_STATUS))
|
|
);
|
|
|
|
TRACE_EVENT(mm_collapse_huge_page_isolate,
|
|
|
|
TP_PROTO(struct folio *folio, int none_or_zero,
|
|
int referenced, int status),
|
|
|
|
TP_ARGS(folio, none_or_zero, referenced, status),
|
|
|
|
TP_STRUCT__entry(
|
|
__field(unsigned long, pfn)
|
|
__field(int, none_or_zero)
|
|
__field(int, referenced)
|
|
__field(int, status)
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->pfn = folio ? folio_pfn(folio) : -1;
|
|
__entry->none_or_zero = none_or_zero;
|
|
__entry->referenced = referenced;
|
|
__entry->status = status;
|
|
),
|
|
|
|
TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, status=%s",
|
|
__entry->pfn,
|
|
__entry->none_or_zero,
|
|
__entry->referenced,
|
|
__print_symbolic(__entry->status, SCAN_STATUS))
|
|
);
|
|
|
|
TRACE_EVENT(mm_collapse_huge_page_swapin,
|
|
|
|
TP_PROTO(struct mm_struct *mm, int swapped_in, int referenced, int ret),
|
|
|
|
TP_ARGS(mm, swapped_in, referenced, ret),
|
|
|
|
TP_STRUCT__entry(
|
|
__field(struct mm_struct *, mm)
|
|
__field(int, swapped_in)
|
|
__field(int, referenced)
|
|
__field(int, ret)
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->mm = mm;
|
|
__entry->swapped_in = swapped_in;
|
|
__entry->referenced = referenced;
|
|
__entry->ret = ret;
|
|
),
|
|
|
|
TP_printk("mm=%p, swapped_in=%d, referenced=%d, ret=%d",
|
|
__entry->mm,
|
|
__entry->swapped_in,
|
|
__entry->referenced,
|
|
__entry->ret)
|
|
);
|
|
|
|
TRACE_EVENT(mm_khugepaged_scan_file,
|
|
|
|
TP_PROTO(struct mm_struct *mm, struct folio *folio, struct file *file,
|
|
int present, int swap, int result),
|
|
|
|
TP_ARGS(mm, folio, file, present, swap, result),
|
|
|
|
TP_STRUCT__entry(
|
|
__field(struct mm_struct *, mm)
|
|
__field(unsigned long, pfn)
|
|
__string(filename, file->f_path.dentry->d_iname)
|
|
__field(int, present)
|
|
__field(int, swap)
|
|
__field(int, result)
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->mm = mm;
|
|
__entry->pfn = folio ? folio_pfn(folio) : -1;
|
|
__assign_str(filename);
|
|
__entry->present = present;
|
|
__entry->swap = swap;
|
|
__entry->result = result;
|
|
),
|
|
|
|
TP_printk("mm=%p, scan_pfn=0x%lx, filename=%s, present=%d, swap=%d, result=%s",
|
|
__entry->mm,
|
|
__entry->pfn,
|
|
__get_str(filename),
|
|
__entry->present,
|
|
__entry->swap,
|
|
__print_symbolic(__entry->result, SCAN_STATUS))
|
|
);
|
|
|
|
TRACE_EVENT(mm_khugepaged_collapse_file,
|
|
TP_PROTO(struct mm_struct *mm, struct folio *new_folio, pgoff_t index,
|
|
unsigned long addr, bool is_shmem, struct file *file,
|
|
int nr, int result),
|
|
TP_ARGS(mm, new_folio, index, addr, is_shmem, file, nr, result),
|
|
TP_STRUCT__entry(
|
|
__field(struct mm_struct *, mm)
|
|
__field(unsigned long, hpfn)
|
|
__field(pgoff_t, index)
|
|
__field(unsigned long, addr)
|
|
__field(bool, is_shmem)
|
|
__string(filename, file->f_path.dentry->d_iname)
|
|
__field(int, nr)
|
|
__field(int, result)
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->mm = mm;
|
|
__entry->hpfn = new_folio ? folio_pfn(new_folio) : -1;
|
|
__entry->index = index;
|
|
__entry->addr = addr;
|
|
__entry->is_shmem = is_shmem;
|
|
__assign_str(filename);
|
|
__entry->nr = nr;
|
|
__entry->result = result;
|
|
),
|
|
|
|
TP_printk("mm=%p, hpage_pfn=0x%lx, index=%ld, addr=%lx, is_shmem=%d, filename=%s, nr=%d, result=%s",
|
|
__entry->mm,
|
|
__entry->hpfn,
|
|
__entry->index,
|
|
__entry->addr,
|
|
__entry->is_shmem,
|
|
__get_str(filename),
|
|
__entry->nr,
|
|
__print_symbolic(__entry->result, SCAN_STATUS))
|
|
);
|
|
|
|
#endif /* __HUGE_MEMORY_H */
|
|
#include <trace/define_trace.h>
|