commit 3b68df9781
Author: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date:   2026-02-23 11:19:19 +01:00

rseq: slice ext: Ensure rseq feature size differs from original rseq size
Before rseq became extensible, its original size was 32 bytes even
though the active rseq area was only 20 bytes. This had the following
impact on the evolution of the userspace ecosystem:

* GNU libc versions 2.35 through 2.39 expose a __rseq_size symbol set
  to 32, even though the size of the active rseq area is really 20.
* GNU libc 2.40 changes __rseq_size to 20, making it reflect the size
  of the active rseq area.
* Starting from glibc 2.41, __rseq_size corresponds to
  AT_RSEQ_FEATURE_SIZE from getauxval(3).

This means that users of __rseq_size can always expect it to
correspond to the size of the active rseq area, except for the value
32, for which the active rseq area is only 20 bytes.
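
A userspace consumer can normalize away this single corner case along
these lines (a minimal sketch; __rseq_size is the symbol exported by
glibc >= 2.35, and the special-casing of 32 follows from the history
above):

  #include <sys/rseq.h>	/* glibc >= 2.35: __rseq_size */

  /*
   * Sketch: size of the active rseq area, special-casing the
   * historical value 32 which glibc 2.35 to 2.39 reported even
   * though only 20 bytes were active.
   */
  static inline unsigned int rseq_active_area_size(void)
  {
  	return __rseq_size == 32 ? 20 : __rseq_size;
  }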

Exposing a 32-byte feature size would make life needlessly painful
for userspace. Therefore, add a reserved field at the end of the
rseq area to bump the feature size to 33 bytes. This reserved field
is expected to be replaced by whatever field comes next, which will
presumably be larger than 1 byte.

The effect of this change is to increase the allocation size and
alignment from 32 to 64 bytes before we actually have fields using
that memory.

Clarify the allocation size and alignment requirements in the struct
rseq uapi comment.
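
For illustration, the tail of the uapi struct rseq might then look
like this (a sketch only; the name and type of the reserved field are
assumptions, while the 'end' flexible array member marking the end of
the active area is what the kernel code below takes the offset of):

  #include <linux/types.h>

  struct rseq {
  	/* ... existing fields occupying the first 32 bytes ... */

  	/*
  	 * Assumed 1-byte placeholder bumping the feature size to 33
  	 * so it can never be mistaken for the original 32-byte size;
  	 * to be replaced by whatever real field comes next.
  	 */
  	__u8 reserved;

  	/* End of the active rseq area. */
  	char end[];
  };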

Make getauxval(AT_RSEQ_ALIGN) return the value of the active rseq
area size rounded up to the next power of 2, which guarantees that
the rseq structure will always be aligned on the nearest power of
two large enough to contain it, even as it grows. Change the
alignment check in the rseq registration accordingly.
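
For userspace allocating its own rseq area, the resulting contract
can be honored like this (a minimal sketch, assuming headers that
define the existing AT_RSEQ_* auxv entries; rounding the allocation
size up to the alignment is an assumption made to satisfy
aligned_alloc()):

  #include <stdlib.h>
  #include <string.h>
  #include <sys/auxv.h>	/* getauxval(), AT_RSEQ_* */

  /*
   * Sketch: allocate a zeroed rseq area honoring the advertised
   * feature size and alignment. With a 33-byte feature size and the
   * new AT_RSEQ_ALIGN semantics this yields a 64-byte-aligned,
   * 64-byte block.
   */
  static void *alloc_rseq_area(void)
  {
  	unsigned long size  = getauxval(AT_RSEQ_FEATURE_SIZE);
  	unsigned long align = getauxval(AT_RSEQ_ALIGN);
  	/* aligned_alloc() wants size to be a multiple of align. */
  	unsigned long alloc_size = (size + align - 1) & ~(align - 1);
  	void *area = aligned_alloc(align, alloc_size);

  	if (area)
  		memset(area, 0, alloc_size);
  	return area;
  }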

This will minimize the number of ABI corner cases we need to
document and the games userspace has to play. The rule stays simple
when __rseq_size != 32:

  #define rseq_field_available(field)	(__rseq_size >= offsetofend(struct rseq_abi, field))
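
For instance, gating use of the real mm_cid field, which ends at
offset 28, could look like this (a sketch, assuming kernel/glibc
headers recent enough to declare mm_cid; offsetofend() is the
kernel-style helper spelled out for userspace, and struct rseq is the
uapi name of what the kernel calls struct rseq_abi):

  #include <stdbool.h>
  #include <stddef.h>	/* offsetof() */
  #include <sys/rseq.h>	/* __rseq_size, struct rseq */

  #define offsetofend(type, member) \
  	(offsetof(type, member) + sizeof(((type *)0)->member))

  #define rseq_field_available(field) \
  	(__rseq_size >= offsetofend(struct rseq, field))

  /* Only valid once __rseq_size != 32 has been ruled out, as above. */
  static bool have_mm_cid(void)
  {
  	return rseq_field_available(mm_cid);
  }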

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260220200642.1317826-3-mathieu.desnoyers@efficios.com

/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
#ifndef _LINUX_RSEQ_H
#define _LINUX_RSEQ_H

#ifdef CONFIG_RSEQ
#include <linux/sched.h>
#include <uapi/linux/rseq.h>

void __rseq_handle_slowpath(struct pt_regs *regs);

/* Invoked from resume_user_mode_work() */
static inline void rseq_handle_slowpath(struct pt_regs *regs)
{
	if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
		if (current->rseq.event.slowpath)
			__rseq_handle_slowpath(regs);
	} else {
		/* '&' is intentional to spare one conditional branch */
		if (current->rseq.event.sched_switch & current->rseq.event.has_rseq)
			__rseq_handle_slowpath(regs);
	}
}

void __rseq_signal_deliver(int sig, struct pt_regs *regs);

/*
 * Invoked from signal delivery to fixup based on the register context before
 * switching to the signal delivery context.
 */
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs)
{
	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
		/* '&' is intentional to spare one conditional branch */
		if (current->rseq.event.has_rseq & current->rseq.event.user_irq)
			__rseq_signal_deliver(ksig->sig, regs);
	} else {
		if (current->rseq.event.has_rseq)
			__rseq_signal_deliver(ksig->sig, regs);
	}
}

static inline void rseq_raise_notify_resume(struct task_struct *t)
{
	set_tsk_thread_flag(t, TIF_RSEQ);
}

/* Invoked from context switch to force evaluation on exit to user */
static __always_inline void rseq_sched_switch_event(struct task_struct *t)
{
	struct rseq_event *ev = &t->rseq.event;

	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
		/*
		 * Avoid a boat load of conditionals by using simple logic
		 * to determine whether NOTIFY_RESUME needs to be raised.
		 *
		 * It's required when the CPU or MM CID has changed or
		 * the entry was from user space.
		 */
		bool raise = (ev->user_irq | ev->ids_changed) & ev->has_rseq;

		if (raise) {
			ev->sched_switch = true;
			rseq_raise_notify_resume(t);
		}
	} else {
		if (ev->has_rseq) {
			t->rseq.event.sched_switch = true;
			rseq_raise_notify_resume(t);
		}
	}
}

/*
 * Invoked from __set_task_cpu() when a task migrates or from
 * mm_cid_schedin() when the CID changes to enforce an IDs update.
 *
 * This does not raise TIF_NOTIFY_RESUME as that happens in
 * rseq_sched_switch_event().
 */
static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t)
{
	t->rseq.event.ids_changed = true;
}

/* Enforce a full update after RSEQ registration and when execve() failed */
static inline void rseq_force_update(void)
{
	if (current->rseq.event.has_rseq) {
		current->rseq.event.ids_changed = true;
		current->rseq.event.sched_switch = true;
		rseq_raise_notify_resume(current);
	}
}

/*
 * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode,
 * which clears TIF_NOTIFY_RESUME on architectures that don't use the
 * generic TIF bits and therefore can't provide a separate TIF_RSEQ flag.
 *
 * User space RSEQ is not updated at that point: __rseq_handle_slowpath()
 * does nothing when invoked with a NULL register state, and updating it
 * there would only mean doing it again before returning to user space.
 *
 * After returning from guest mode, before exiting to userspace, hypervisors
 * must invoke this function to re-raise TIF_NOTIFY_RESUME if necessary.
 */
static inline void rseq_virt_userspace_exit(void)
{
	/*
	 * The generic optimization for deferring RSEQ updates until the next
	 * exit relies on having a dedicated TIF_RSEQ.
	 */
	if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) &&
	    current->rseq.event.sched_switch)
		rseq_raise_notify_resume(current);
}

static inline void rseq_reset(struct task_struct *t)
{
	memset(&t->rseq, 0, sizeof(t->rseq));
	t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
}

static inline void rseq_execve(struct task_struct *t)
{
	rseq_reset(t);
}

/*
 * If the parent process has a registered restartable sequences area,
 * the child inherits it. Unregister rseq for a clone with CLONE_VM set.
 *
 * On fork, keep the IDs (CPU, MM CID) of the parent, which avoids a fault
 * on the COW page on exit to user space when the child stays on the same
 * CPU as the parent. That's obviously not guaranteed, but in overcommit
 * scenarios it is more likely and optimizes for the fork/exec case without
 * taking the fault.
 */
static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
{
	if (clone_flags & CLONE_VM)
		rseq_reset(t);
	else
		t->rseq = current->rseq;
}

/*
 * Value returned by getauxval(AT_RSEQ_ALIGN) and expected by rseq
 * registration. This is the active rseq area size rounded up to the next
* power of 2, which guarantees that the rseq structure will always be
* aligned on the nearest power of two large enough to contain it, even
* as it grows.
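 *
 * For example, a 33-byte active area yields 1U << get_count_order(33),
 * i.e. a 64-byte allocation size and alignment.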
*/
static inline unsigned int rseq_alloc_align(void)
{
	return 1U << get_count_order(offsetof(struct rseq, end));
}

#else /* CONFIG_RSEQ */
static inline void rseq_handle_slowpath(struct pt_regs *regs) { }
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
static inline void rseq_sched_switch_event(struct task_struct *t) { }
static inline void rseq_sched_set_ids_changed(struct task_struct *t) { }
static inline void rseq_force_update(void) { }
static inline void rseq_virt_userspace_exit(void) { }
static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
static inline void rseq_execve(struct task_struct *t) { }
#endif /* !CONFIG_RSEQ */

#ifdef CONFIG_DEBUG_RSEQ
void rseq_syscall(struct pt_regs *regs);
#else /* CONFIG_DEBUG_RSEQ */
static inline void rseq_syscall(struct pt_regs *regs) { }
#endif /* !CONFIG_DEBUG_RSEQ */

#ifdef CONFIG_RSEQ_SLICE_EXTENSION
void rseq_syscall_enter_work(long syscall);
int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3);
#else /* CONFIG_RSEQ_SLICE_EXTENSION */
static inline void rseq_syscall_enter_work(long syscall) { }
static inline int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
{
	return -ENOTSUPP;
}
#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */

#endif /* _LINUX_RSEQ_H */