mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2026-06-21 15:43:21 +02:00
32a2b73ec2
compact_gap() returns 2 << order, which is used as watermark headroom in __compaction_suitable() and as a threshold in kswapd reclaim decisions. The computed value scales exponentially by order. For order-9 THP allocations this evaluates to 1024 pages, but the compaction free scanner's working set is bounded by COMPACT_CLUSTER_MAX (32 pages). The scanner stops isolating free pages once it matches the migration batch. The current gap over-reserves by 32x. On fragmented production hosts, kswapd will try to reclaim up to the gap, but it only reaches that threshold in 18% of attempts. As a result, reclaim continues in the majority of cases despite many lower-order free pages being available. The over-sized gap also causes 46% of order-9 compaction suitability checks to fail unnecessarily: the zone has sufficient free pages for the scanner to operate, but not enough to clear the inflated threshold. Cap compact_gap() at COMPACT_CLUSTER_MAX so the watermark headroom reflects the scanner's actual capacity. This function is used by two key heuristics. The first is when kswapd can stop high-order reclaim and downgrade to order-0 balancing, allowing kcompactd to be woken for the original higher allocation order. The second is zone suitability checking, where the smaller gap allows compaction to start sooner. Note that orders 0-4 are unaffected since their gap is already less than or equal to COMPACT_CLUSTER_MAX. A/B test on v6.13-based instagram production hosts (64GB, 60s measurement): Unpatched (43 hosts) pgscan_kswapd (mean/host): ~1.6M reclaim efficiency (steal/scan): 83.8% per-compaction success (success/stall): 2.1% THP success (alloc/alloc+fallback): 4.9% forced lru_add_drain (mean/host): ~107K Patched (59 hosts) pgscan_kswapd (mean/host): ~449K reclaim efficiency (steal/scan): 91.0% per-compaction success (success/stall): 28.3% THP success (alloc/alloc+fallback): 17.2% forced lru_add_drain (mean/host): ~64K Additional tests were also performed using a workload of similar shape and based on mm-new at the time of testing. Across three 60s runs, the patch showed improvements consistent with the previous test: reduced kswapd reclaim and fewer THP fault fallbacks. Unpatched kswapd_shrink_node downgrade to order-0 (mean): 0 thp_fault_fallback (mean): 1217 pgscan_kswapd (mean): 6328 pgsteal_kswapd (mean): 5657 Patched kswapd_shrink_node downgrade to order-0 (mean): 28 thp_fault_fallback (mean): 738 pgscan_kswapd (mean): 3773 pgsteal_kswapd (mean): 3243 Link: https://lore.kernel.org/20260604061725.13800-1-jp.kobryn@linux.dev Signed-off-by: JP Kobryn (Meta) <jp.kobryn@linux.dev> Reviewed-by: Vlastimil Babka (SUSE) <vbabka@kernel.org> Cc: Brendan Jackman <jackmanb@google.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Michal Hocko <mhocko@suse.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Zi Yan <ziy@nvidia.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
154 lines
4.5 KiB
C
154 lines
4.5 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _LINUX_COMPACTION_H
|
|
#define _LINUX_COMPACTION_H
|
|
|
|
#include <linux/swap.h>
|
|
|
|
/*
|
|
* Determines how hard direct compaction should try to succeed.
|
|
* Lower value means higher priority, analogically to reclaim priority.
|
|
*/
|
|
enum compact_priority {
|
|
COMPACT_PRIO_SYNC_FULL,
|
|
MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_FULL,
|
|
COMPACT_PRIO_SYNC_LIGHT,
|
|
MIN_COMPACT_COSTLY_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
|
|
DEF_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
|
|
COMPACT_PRIO_ASYNC,
|
|
INIT_COMPACT_PRIORITY = COMPACT_PRIO_ASYNC
|
|
};
|
|
|
|
/* Return values for compact_zone() and try_to_compact_pages() */
|
|
/* When adding new states, please adjust include/trace/events/compaction.h */
|
|
enum compact_result {
|
|
/* For more detailed tracepoint output - internal to compaction */
|
|
COMPACT_NOT_SUITABLE_ZONE,
|
|
/*
|
|
* compaction didn't start as it was not possible or direct reclaim
|
|
* was more suitable
|
|
*/
|
|
COMPACT_SKIPPED,
|
|
/* compaction didn't start as it was deferred due to past failures */
|
|
COMPACT_DEFERRED,
|
|
|
|
/* For more detailed tracepoint output - internal to compaction */
|
|
COMPACT_NO_SUITABLE_PAGE,
|
|
/* compaction should continue to another pageblock */
|
|
COMPACT_CONTINUE,
|
|
|
|
/*
|
|
* The full zone was compacted scanned but wasn't successful to compact
|
|
* suitable pages.
|
|
*/
|
|
COMPACT_COMPLETE,
|
|
/*
|
|
* direct compaction has scanned part of the zone but wasn't successful
|
|
* to compact suitable pages.
|
|
*/
|
|
COMPACT_PARTIAL_SKIPPED,
|
|
|
|
/* compaction terminated prematurely due to lock contentions */
|
|
COMPACT_CONTENDED,
|
|
|
|
/*
|
|
* direct compaction terminated after concluding that the allocation
|
|
* should now succeed
|
|
*/
|
|
COMPACT_SUCCESS,
|
|
};
|
|
|
|
struct alloc_context; /* in mm/internal.h */
|
|
|
|
/*
|
|
* Number of free order-0 pages that should be available above given watermark
|
|
* to make sure compaction has reasonable chance of not running out of free
|
|
* pages that it needs to isolate as migration target during its work.
|
|
*/
|
|
static inline unsigned long compact_gap(unsigned int order)
|
|
{
|
|
/*
|
|
* Although all the isolations for migration are temporary, compaction
|
|
* free scanner may have up to 1 << order pages on its list and then
|
|
* try to split an (order - 1) free page. At that point, a gap of
|
|
* 1 << order might not be enough, so it's safer to require twice that
|
|
* amount. Note that the number of pages on the list is also
|
|
* effectively limited by COMPACT_CLUSTER_MAX, as that's the maximum
|
|
* that the migrate scanner can have isolated on migrate list, and free
|
|
* scanner is only invoked when the number of isolated free pages is
|
|
* lower than that.
|
|
*/
|
|
return min(2UL << order, COMPACT_CLUSTER_MAX);
|
|
}
|
|
|
|
static inline int current_is_kcompactd(void)
|
|
{
|
|
return current->flags & PF_KCOMPACTD;
|
|
}
|
|
|
|
#ifdef CONFIG_COMPACTION
|
|
|
|
extern unsigned int extfrag_for_order(struct zone *zone, unsigned int order);
|
|
extern int fragmentation_index(struct zone *zone, unsigned int order);
|
|
extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
|
|
unsigned int order, unsigned int alloc_flags,
|
|
const struct alloc_context *ac, enum compact_priority prio,
|
|
struct page **page);
|
|
extern void reset_isolation_suitable(pg_data_t *pgdat);
|
|
extern bool compaction_suitable(struct zone *zone, int order,
|
|
unsigned long watermark, int highest_zoneidx);
|
|
|
|
extern void compaction_defer_reset(struct zone *zone, int order,
|
|
bool alloc_success);
|
|
|
|
bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
|
|
int alloc_flags, gfp_t gfp_mask);
|
|
|
|
extern void __meminit kcompactd_run(int nid);
|
|
extern void __meminit kcompactd_stop(int nid);
|
|
extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx);
|
|
|
|
#else
|
|
static inline void reset_isolation_suitable(pg_data_t *pgdat)
|
|
{
|
|
}
|
|
|
|
static inline bool compaction_suitable(struct zone *zone, int order,
|
|
unsigned long watermark,
|
|
int highest_zoneidx)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
static inline void kcompactd_run(int nid)
|
|
{
|
|
}
|
|
static inline void kcompactd_stop(int nid)
|
|
{
|
|
}
|
|
|
|
static inline void wakeup_kcompactd(pg_data_t *pgdat,
|
|
int order, int highest_zoneidx)
|
|
{
|
|
}
|
|
|
|
#endif /* CONFIG_COMPACTION */
|
|
|
|
struct node;
|
|
#if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
|
|
extern int compaction_register_node(struct node *node);
|
|
extern void compaction_unregister_node(struct node *node);
|
|
|
|
#else
|
|
|
|
static inline int compaction_register_node(struct node *node)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
static inline void compaction_unregister_node(struct node *node)
|
|
{
|
|
}
|
|
#endif /* CONFIG_COMPACTION && CONFIG_SYSFS && CONFIG_NUMA */
|
|
|
|
#endif /* _LINUX_COMPACTION_H */
|