Files
linux-stable-mirror/include/linux/compaction.h
T
JP Kobryn 32a2b73ec2 mm/compaction: cap compact_gap() at COMPACT_CLUSTER_MAX
compact_gap() returns 2 << order, which is used as watermark headroom in
__compaction_suitable() and as a threshold in kswapd reclaim decisions. 
The computed value scales exponentially by order.  For order-9 THP
allocations this evaluates to 1024 pages, but the compaction free
scanner's working set is bounded by COMPACT_CLUSTER_MAX (32 pages).  The
scanner stops isolating free pages once it matches the migration batch. 
The current gap over-reserves by 32x.

On fragmented production hosts, kswapd will try to reclaim up to the gap,
but it only reaches that threshold in 18% of attempts.  As a result,
reclaim continues in the majority of cases despite many lower-order free
pages being available.  The over-sized gap also causes 46% of order-9
compaction suitability checks to fail unnecessarily: the zone has
sufficient free pages for the scanner to operate, but not enough to clear
the inflated threshold.

Cap compact_gap() at COMPACT_CLUSTER_MAX so the watermark headroom
reflects the scanner's actual capacity.  This function is used by two key
heuristics.  The first is when kswapd can stop high-order reclaim and
downgrade to order-0 balancing, allowing kcompactd to be woken for the
original higher allocation order.  The second is zone suitability
checking, where the smaller gap allows compaction to start sooner.

Note that orders 0-4 are unaffected since their gap is already less than
or equal to COMPACT_CLUSTER_MAX.

A/B test on v6.13-based instagram production hosts (64GB, 60s
measurement):

Unpatched (43 hosts)
pgscan_kswapd (mean/host): ~1.6M
reclaim efficiency (steal/scan): 83.8%
per-compaction success (success/stall): 2.1%
THP success (alloc/alloc+fallback): 4.9%
forced lru_add_drain (mean/host): ~107K

Patched (59 hosts)
pgscan_kswapd (mean/host): ~449K
reclaim efficiency (steal/scan): 91.0%
per-compaction success (success/stall): 28.3%
THP success (alloc/alloc+fallback): 17.2%
forced lru_add_drain (mean/host): ~64K

Additional tests were also performed using a workload of similar shape and
based on mm-new at the time of testing.  Across three 60s runs, the patch
showed improvements consistent with the previous test: reduced kswapd
reclaim and fewer THP fault fallbacks.

Unpatched
kswapd_shrink_node downgrade to order-0 (mean): 0
thp_fault_fallback (mean): 1217
pgscan_kswapd (mean): 6328
pgsteal_kswapd (mean): 5657

Patched
kswapd_shrink_node downgrade to order-0 (mean): 28
thp_fault_fallback (mean): 738
pgscan_kswapd (mean): 3773
pgsteal_kswapd (mean): 3243

Link: https://lore.kernel.org/20260604061725.13800-1-jp.kobryn@linux.dev
Signed-off-by: JP Kobryn (Meta) <jp.kobryn@linux.dev>
Reviewed-by: Vlastimil Babka (SUSE) <vbabka@kernel.org>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2026-06-08 18:21:32 -07:00

154 lines
4.5 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_COMPACTION_H
#define _LINUX_COMPACTION_H
#include <linux/swap.h>
/*
* Determines how hard direct compaction should try to succeed.
* Lower value means higher priority, analogically to reclaim priority.
*/
enum compact_priority {
COMPACT_PRIO_SYNC_FULL,
MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_FULL,
COMPACT_PRIO_SYNC_LIGHT,
MIN_COMPACT_COSTLY_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
DEF_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
COMPACT_PRIO_ASYNC,
INIT_COMPACT_PRIORITY = COMPACT_PRIO_ASYNC
};
/* Return values for compact_zone() and try_to_compact_pages() */
/* When adding new states, please adjust include/trace/events/compaction.h */
enum compact_result {
/* For more detailed tracepoint output - internal to compaction */
COMPACT_NOT_SUITABLE_ZONE,
/*
* compaction didn't start as it was not possible or direct reclaim
* was more suitable
*/
COMPACT_SKIPPED,
/* compaction didn't start as it was deferred due to past failures */
COMPACT_DEFERRED,
/* For more detailed tracepoint output - internal to compaction */
COMPACT_NO_SUITABLE_PAGE,
/* compaction should continue to another pageblock */
COMPACT_CONTINUE,
/*
* The full zone was compacted scanned but wasn't successful to compact
* suitable pages.
*/
COMPACT_COMPLETE,
/*
* direct compaction has scanned part of the zone but wasn't successful
* to compact suitable pages.
*/
COMPACT_PARTIAL_SKIPPED,
/* compaction terminated prematurely due to lock contentions */
COMPACT_CONTENDED,
/*
* direct compaction terminated after concluding that the allocation
* should now succeed
*/
COMPACT_SUCCESS,
};
struct alloc_context; /* in mm/internal.h */
/*
* Number of free order-0 pages that should be available above given watermark
* to make sure compaction has reasonable chance of not running out of free
* pages that it needs to isolate as migration target during its work.
*/
static inline unsigned long compact_gap(unsigned int order)
{
/*
* Although all the isolations for migration are temporary, compaction
* free scanner may have up to 1 << order pages on its list and then
* try to split an (order - 1) free page. At that point, a gap of
* 1 << order might not be enough, so it's safer to require twice that
* amount. Note that the number of pages on the list is also
* effectively limited by COMPACT_CLUSTER_MAX, as that's the maximum
* that the migrate scanner can have isolated on migrate list, and free
* scanner is only invoked when the number of isolated free pages is
* lower than that.
*/
return min(2UL << order, COMPACT_CLUSTER_MAX);
}
static inline int current_is_kcompactd(void)
{
return current->flags & PF_KCOMPACTD;
}
#ifdef CONFIG_COMPACTION
extern unsigned int extfrag_for_order(struct zone *zone, unsigned int order);
extern int fragmentation_index(struct zone *zone, unsigned int order);
extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
unsigned int order, unsigned int alloc_flags,
const struct alloc_context *ac, enum compact_priority prio,
struct page **page);
extern void reset_isolation_suitable(pg_data_t *pgdat);
extern bool compaction_suitable(struct zone *zone, int order,
unsigned long watermark, int highest_zoneidx);
extern void compaction_defer_reset(struct zone *zone, int order,
bool alloc_success);
bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
int alloc_flags, gfp_t gfp_mask);
extern void __meminit kcompactd_run(int nid);
extern void __meminit kcompactd_stop(int nid);
extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx);
#else
static inline void reset_isolation_suitable(pg_data_t *pgdat)
{
}
static inline bool compaction_suitable(struct zone *zone, int order,
unsigned long watermark,
int highest_zoneidx)
{
return false;
}
static inline void kcompactd_run(int nid)
{
}
static inline void kcompactd_stop(int nid)
{
}
static inline void wakeup_kcompactd(pg_data_t *pgdat,
int order, int highest_zoneidx)
{
}
#endif /* CONFIG_COMPACTION */
struct node;
#if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
extern int compaction_register_node(struct node *node);
extern void compaction_unregister_node(struct node *node);
#else
static inline int compaction_register_node(struct node *node)
{
return 0;
}
static inline void compaction_unregister_node(struct node *node)
{
}
#endif /* CONFIG_COMPACTION && CONFIG_SYSFS && CONFIG_NUMA */
#endif /* _LINUX_COMPACTION_H */