mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2026-06-21 15:43:21 +02:00
e1bf796284
The IOCB_DONTCACHE writeback path in generic_write_sync() calls
filemap_flush_range() on every write, submitting writeback inline in
the writer's context. Perf lock contention profiling shows the
performance problem is not lock contention but the writeback submission
work itself — walking the page tree and submitting I/O blocks the writer
for milliseconds, inflating p99.9 latency from 23ms (buffered) to 93ms
(dontcache).
Replace the inline filemap_flush_range() call with a flusher kick that
drains dirty pages in the background. This moves writeback submission
completely off the writer's hot path.
To avoid flushing unrelated buffered dirty data, add a dedicated
WB_start_dontcache bit and wb_check_start_dontcache() handler that uses
the per-wb WB_DONTCACHE_DIRTY counter to determine how many pages to
write back. The flusher writes back that many pages from the oldest dirty
inodes (not restricted to dontcache-specific inodes). This helps
preserve I/O batching while limiting the scope of expedited writeback.
Like WB_start_all, the WB_start_dontcache bit coalesces multiple
DONTCACHE writes into a single flusher wakeup without per-write
allocations. Use test_and_clear_bit to atomically consume the kick
request before reading the dirty counter and starting writeback, so that
concurrent DONTCACHE writes during writeback can re-set the bit and
schedule a follow-up flusher run.
Read the dirty counter with wb_stat_sum() (aggregating per-CPU batches)
rather than wb_stat() (which reads only the global counter) to ensure
small writes below the percpu batch threshold are visible to the flusher.
In filemap_dontcache_kick_writeback(), set the WB_start_dontcache bit
inside the unlocked_inode_to_wb_begin/end section for correct cgroup
writeback domain targeting, but defer the wb_wakeup() call until after
the section ends, since wb_wakeup() uses spin_unlock_irq() which would
unconditionally re-enable interrupts while the i_pages xa_lock may still
be held under irqsave during a cgroup writeback switch. Pin the wb with
wb_get() inside the RCU critical section before calling wb_wakeup()
outside it, since cgroup bdi_writeback structures are RCU-freed and the
wb pointer could become invalid after unlocked_inode_to_wb_end() drops
the RCU read lock.
Also add WB_REASON_DONTCACHE as a new writeback reason for tracing
visibility.
dontcache-bench results (same host, T6F_SKL_1920GBF, 251 GiB RAM,
xfs on NVMe, fio io_uring):
Buffered and direct I/O paths are unaffected by this patchset. All
improvements are confined to the dontcache path:
Single-stream throughput (MB/s):
                          Before     After    Change
  seq-write/dontcache        298       897     +201%
  rand-write/dontcache       131       236      +80%
Tail latency improvements (seq-write/dontcache):
p99: 135,266 us -> 23,986 us (-82%)
p99.9: 8,925,479 us -> 28,443 us (-99.7%)
Multi-writer (4 jobs, sequential write):
                                Before     After    Change
  dontcache aggregate (MB/s)     2,529     4,532      +79%
  dontcache p99 (us)             8,553     1,002      -88%
  dontcache p99.9 (us)         109,314     1,057      -99%
Dontcache multi-writer throughput now matches buffered (4,532 vs
4,616 MB/s).
32-file write (Axboe test):
                                Before     After    Change
  dontcache aggregate (MB/s)     1,548     3,499     +126%
  dontcache p99 (us)            10,170       602      -94%
  Peak dirty pages (MB)          1,837       213      -88%
Dontcache now reaches 81% of buffered throughput (was 35%).
Competing writers (dontcache vs buffered, separate files):
                      Before     After
  buffered writer        868       433 MB/s
  dontcache writer       415       433 MB/s
  Aggregate            1,284       866 MB/s
Previously the buffered writer starved the dontcache writer 2:1.
With per-bdi_writeback tracking, both writers now receive equal
bandwidth. The aggregate matches the buffered-vs-buffered baseline
(863 MB/s), indicating fair sharing regardless of I/O mode.
The dontcache writer's p99.9 latency collapsed from 119 ms to
33 ms (-73%), eliminating the severe periodic stalls seen in the
baseline. Both writers now share identical latency profiles,
matching the buffered-vs-buffered pattern.
The per-bdi_writeback dirty tracking dramatically reduces peak dirty
pages in dontcache workloads, with the 32-file test dropping from
1.8 GB to 213 MB. Dontcache sequential write throughput triples and
multi-writer throughput reaches parity with buffered I/O, with tail
latencies collapsing by 1-2 orders of magnitude.
Assisted-by: Claude:claude-opus-4-6
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20260511-dontcache-v7-3-2848ddce8090@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
308 lines
8.8 KiB
C
308 lines
8.8 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef __LINUX_BACKING_DEV_DEFS_H
|
|
#define __LINUX_BACKING_DEV_DEFS_H
|
|
|
|
#include <linux/list.h>
|
|
#include <linux/radix-tree.h>
|
|
#include <linux/rbtree.h>
|
|
#include <linux/spinlock.h>
|
|
#include <linux/percpu_counter.h>
|
|
#include <linux/percpu-refcount.h>
|
|
#include <linux/flex_proportions.h>
|
|
#include <linux/timer.h>
|
|
#include <linux/workqueue.h>
|
|
#include <linux/kref.h>
|
|
#include <linux/refcount.h>
|
|
|
|
/*
 * Forward declarations of types defined elsewhere.  struct device and
 * struct dentry are used below only through pointers, so incomplete
 * types are sufficient.
 */
struct page;
struct device;
struct dentry;
|
|
|
|
/*
 * Bits in bdi_writeback.state
 *
 * The enumerators are bit numbers (0, 1, 2, ...), not masks.  The state
 * word they index must always be accessed with atomic bitops.
 */
enum wb_state {
	WB_registered,		/* bdi_register() was done */
	WB_writeback_running,	/* Writeback is in progress */
	WB_has_dirty_io,	/* Dirty inodes on ->b_{dirty|io|more_io} */
	WB_start_all,		/* nr_pages == 0 (all) work pending */
	WB_start_dontcache,	/* dontcache writeback pending */
};
|
|
|
|
/*
 * Indices into bdi_writeback.stat[].
 *
 * NR_WB_STAT_ITEMS sizes that array and therefore must remain the last
 * enumerator; add new counters before it.
 */
enum wb_stat_item {
	WB_RECLAIMABLE,
	WB_WRITEBACK,
	WB_DIRTIED,
	WB_WRITTEN,
	WB_DONTCACHE_DIRTY,
	NR_WB_STAT_ITEMS
};
|
|
|
|
/* Evaluates to 8 * (1 + ilog2(nr_cpu_ids)); re-evaluated at every use. */
#define WB_STAT_BATCH	(8 * (1 + ilog2(nr_cpu_ids)))
|
|
|
|
/*
 * why some writeback work was initiated
 *
 * WB_REASON_MAX is a count of the reasons above it and must stay last.
 */
enum wb_reason {
	WB_REASON_BACKGROUND,
	WB_REASON_VMSCAN,
	WB_REASON_SYNC,
	WB_REASON_PERIODIC,
	WB_REASON_FS_FREE_SPACE,
	/*
	 * There is no bdi forker thread any more and the work is done by
	 * an emergency worker.  However, this value is visible to userland
	 * through tracepoints and we keep exposing exactly the same
	 * information, so the name no longer matches what it describes.
	 */
	WB_REASON_FORKER_THREAD,
	WB_REASON_FOREIGN_FLUSH,
	WB_REASON_DONTCACHE,

	WB_REASON_MAX,
};
|
|
|
|
struct wb_completion {
|
|
atomic_t cnt;
|
|
wait_queue_head_t *waitq;
|
|
unsigned long progress_stamp; /* The jiffies when slow progress is detected */
|
|
unsigned long wait_start; /* The jiffies when waiting for the writeback work to finish */
|
|
};
|
|
|
|
/*
 * Compound-literal initializer for a struct wb_completion: the count
 * starts at 1 and @_waitq is stored as the wait queue pointer.  Members
 * not named here are zero-initialized.
 */
#define __WB_COMPLETION_INIT(_waitq)	\
	(struct wb_completion){ .cnt = ATOMIC_INIT(1), .waitq = (_waitq) }
|
|
|
|
/*
 * If one wants to wait for one or more wb_writeback_works, each work's
 * ->done should be set to a wb_completion defined using the following
 * macro.  Once all work items are issued with wb_queue_work(), the caller
 * can wait for the completion of all using wb_wait_for_completion().  Work
 * items which are waited upon aren't freed automatically on completion.
 *
 * @bdi is a pointer; its ->wb_waitq becomes the completion's wait queue.
 */
#define WB_COMPLETION_INIT(bdi)		__WB_COMPLETION_INIT(&(bdi)->wb_waitq)
|
|
|
|
/*
 * Declare a struct wb_completion named @cmpl, initialized with
 * WB_COMPLETION_INIT(@bdi).
 */
#define DEFINE_WB_COMPLETION(cmpl, bdi)	\
	struct wb_completion cmpl = WB_COMPLETION_INIT(bdi)
|
|
|
|
/*
|
|
* Each wb (bdi_writeback) can perform writeback operations, is measured
|
|
* and throttled, independently. Without cgroup writeback, each bdi
|
|
* (bdi_writeback) is served by its embedded bdi->wb.
|
|
*
|
|
* On the default hierarchy, blkcg implicitly enables memcg. This allows
|
|
* using memcg's page ownership for attributing writeback IOs, and every
|
|
* memcg - blkcg combination can be served by its own wb by assigning a
|
|
* dedicated wb to each memcg, which enables isolation across different
|
|
* cgroups and propagation of IO back pressure down from the IO layer upto
|
|
* the tasks which are generating the dirty pages to be written back.
|
|
*
|
|
* A cgroup wb is indexed on its bdi by the ID of the associated memcg,
|
|
* refcounted with the number of inodes attached to it, and pins the memcg
|
|
* and the corresponding blkcg. As the corresponding blkcg for a memcg may
|
|
* change as blkcg is disabled and enabled higher up in the hierarchy, a wb
|
|
* is tested for blkcg after lookup and removed from index on mismatch so
|
|
* that a new wb for the combination can be created.
|
|
*
|
|
* Each bdi_writeback that is not embedded into the backing_dev_info must hold
|
|
* a reference to the parent backing_dev_info. See cgwb_create() for details.
|
|
*/
|
|
struct bdi_writeback {
|
|
struct backing_dev_info *bdi; /* our parent bdi */
|
|
|
|
unsigned long state; /* Always use atomic bitops on this */
|
|
unsigned long last_old_flush; /* last old data flush */
|
|
|
|
struct list_head b_dirty; /* dirty inodes */
|
|
struct list_head b_io; /* parked for writeback */
|
|
struct list_head b_more_io; /* parked for more writeback */
|
|
struct list_head b_dirty_time; /* time stamps are dirty */
|
|
spinlock_t list_lock; /* protects the b_* lists */
|
|
|
|
atomic_t writeback_inodes; /* number of inodes under writeback */
|
|
struct percpu_counter stat[NR_WB_STAT_ITEMS];
|
|
|
|
unsigned long bw_time_stamp; /* last time write bw is updated */
|
|
unsigned long dirtied_stamp;
|
|
unsigned long written_stamp; /* pages written at bw_time_stamp */
|
|
unsigned long write_bandwidth; /* the estimated write bandwidth */
|
|
unsigned long avg_write_bandwidth; /* further smoothed write bw, > 0 */
|
|
|
|
/*
|
|
* The base dirty throttle rate, re-calculated on every 200ms.
|
|
* All the bdi tasks' dirty rate will be curbed under it.
|
|
* @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
|
|
* in small steps and is much more smooth/stable than the latter.
|
|
*/
|
|
unsigned long dirty_ratelimit;
|
|
unsigned long balanced_dirty_ratelimit;
|
|
|
|
struct fprop_local_percpu completions;
|
|
int dirty_exceeded;
|
|
enum wb_reason start_all_reason;
|
|
|
|
spinlock_t work_lock; /* protects work_list & dwork scheduling */
|
|
struct list_head work_list;
|
|
struct delayed_work dwork; /* work item used for writeback */
|
|
struct delayed_work bw_dwork; /* work item used for bandwidth estimate */
|
|
|
|
struct list_head bdi_node; /* anchored at bdi->wb_list */
|
|
|
|
#ifdef CONFIG_CGROUP_WRITEBACK
|
|
struct percpu_ref refcnt; /* used only for !root wb's */
|
|
struct fprop_local_percpu memcg_completions;
|
|
struct cgroup_subsys_state *memcg_css; /* the associated memcg */
|
|
struct cgroup_subsys_state *blkcg_css; /* and blkcg */
|
|
struct list_head memcg_node; /* anchored at memcg->cgwb_list */
|
|
struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */
|
|
struct list_head b_attached; /* attached inodes, protected by list_lock */
|
|
struct list_head offline_node; /* anchored at offline_cgwbs */
|
|
struct work_struct switch_work; /* work used to perform inode switching
|
|
* to this wb */
|
|
struct llist_head switch_wbs_ctxs; /* queued contexts for
|
|
* writeback switching */
|
|
|
|
union {
|
|
struct work_struct release_work;
|
|
struct rcu_head rcu;
|
|
};
|
|
#endif
|
|
};
|
|
|
|
struct backing_dev_info {
|
|
u64 id;
|
|
struct rb_node rb_node; /* keyed by ->id */
|
|
struct list_head bdi_list;
|
|
/* max readahead in PAGE_SIZE units */
|
|
unsigned long __data_racy ra_pages;
|
|
|
|
unsigned long io_pages; /* max allowed IO size */
|
|
|
|
struct kref refcnt; /* Reference counter for the structure */
|
|
unsigned int capabilities; /* Device capabilities */
|
|
unsigned int min_ratio;
|
|
unsigned int max_ratio, max_prop_frac;
|
|
|
|
/*
|
|
* Sum of avg_write_bw of wbs with dirty inodes. > 0 if there are
|
|
* any dirty wbs, which is depended upon by bdi_has_dirty().
|
|
*/
|
|
atomic_long_t tot_write_bandwidth;
|
|
/*
|
|
* Jiffies when last process was dirty throttled on this bdi. Used by
|
|
* blk-wbt.
|
|
*/
|
|
unsigned long last_bdp_sleep;
|
|
|
|
struct bdi_writeback wb; /* the root writeback info for this bdi */
|
|
struct list_head wb_list; /* list of all wbs */
|
|
#ifdef CONFIG_CGROUP_WRITEBACK
|
|
struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
|
|
struct mutex cgwb_release_mutex; /* protect shutdown of wb structs */
|
|
struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */
|
|
#endif
|
|
wait_queue_head_t wb_waitq;
|
|
|
|
struct device *dev;
|
|
char dev_name[64];
|
|
struct device *owner;
|
|
|
|
#ifdef CONFIG_DEBUG_FS
|
|
struct dentry *debug_dir;
|
|
#endif
|
|
};
|
|
|
|
/*
 * Small cookie pairing a "locked" indication with a flags word.
 * NOTE(review): presumably @flags holds saved IRQ flags for the users of
 * this cookie -- confirm against the callers, which are not in this header.
 */
struct wb_lock_cookie {
	bool locked;
	unsigned long flags;
};
|
|
|
|
#ifdef CONFIG_CGROUP_WRITEBACK
|
|
|
|
/**
|
|
* wb_tryget - try to increment a wb's refcount
|
|
* @wb: bdi_writeback to get
|
|
*/
|
|
static inline bool wb_tryget(struct bdi_writeback *wb)
|
|
{
|
|
if (wb != &wb->bdi->wb)
|
|
return percpu_ref_tryget(&wb->refcnt);
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* wb_get - increment a wb's refcount
|
|
* @wb: bdi_writeback to get
|
|
*/
|
|
static inline void wb_get(struct bdi_writeback *wb)
|
|
{
|
|
if (wb != &wb->bdi->wb)
|
|
percpu_ref_get(&wb->refcnt);
|
|
}
|
|
|
|
/**
|
|
* wb_put_many - decrement a wb's refcount
|
|
* @wb: bdi_writeback to put
|
|
* @nr: number of references to put
|
|
*/
|
|
static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr)
|
|
{
|
|
if (WARN_ON_ONCE(!wb->bdi)) {
|
|
/*
|
|
* A driver bug might cause a file to be removed before bdi was
|
|
* initialized.
|
|
*/
|
|
return;
|
|
}
|
|
|
|
if (wb != &wb->bdi->wb)
|
|
percpu_ref_put_many(&wb->refcnt, nr);
|
|
}
|
|
|
|
/**
 * wb_put - decrement a wb's refcount
 * @wb: bdi_writeback to put
 *
 * Shorthand for wb_put_many(@wb, 1).
 */
static inline void wb_put(struct bdi_writeback *wb)
{
	wb_put_many(wb, 1);
}
|
|
|
|
/**
|
|
* wb_dying - is a wb dying?
|
|
* @wb: bdi_writeback of interest
|
|
*
|
|
* Returns whether @wb is unlinked and being drained.
|
|
*/
|
|
static inline bool wb_dying(struct bdi_writeback *wb)
|
|
{
|
|
return percpu_ref_is_dying(&wb->refcnt);
|
|
}
|
|
|
|
#else /* CONFIG_CGROUP_WRITEBACK */
|
|
|
|
/* !CONFIG_CGROUP_WRITEBACK: no wb refcounting, so a tryget always succeeds. */
static inline bool wb_tryget(struct bdi_writeback *wb)
{
	return true;
}
|
|
|
|
/* !CONFIG_CGROUP_WRITEBACK: no wb refcounting, nothing to do. */
static inline void wb_get(struct bdi_writeback *wb)
{
}
|
|
|
|
/* !CONFIG_CGROUP_WRITEBACK: no wb refcounting, nothing to do. */
static inline void wb_put(struct bdi_writeback *wb)
{
}
|
|
|
|
/* !CONFIG_CGROUP_WRITEBACK: no wb refcounting, nothing to do. */
static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr)
{
}
|
|
|
|
/* !CONFIG_CGROUP_WRITEBACK: a wb is never reported as dying. */
static inline bool wb_dying(struct bdi_writeback *wb)
{
	return false;
}
|
|
|
|
#endif /* CONFIG_CGROUP_WRITEBACK */
|
|
|
|
#endif /* __LINUX_BACKING_DEV_DEFS_H */
|