mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2026-03-03 18:28:01 +01:00
Before rseq became extensible, its original size was 32 bytes even though the active rseq area was only 20 bytes. This had the following impact in terms of userspace ecosystem evolution: * The GNU libc between 2.35 and 2.39 exposes a __rseq_size symbol set to 32, even though the size of the active rseq area is really 20. * The GNU libc 2.40 changes this __rseq_size to 20, thus making it express the active rseq area. * Starting from glibc 2.41, __rseq_size corresponds to the AT_RSEQ_FEATURE_SIZE from getauxval(3). This means that users of __rseq_size can always expect it to correspond to the active rseq area, except for the value 32, for which the active rseq area is 20 bytes. Exposing a 32-byte feature size would make life needlessly painful for userspace. Therefore, add a reserved field at the end of the rseq area to bump the feature size to 33 bytes. This reserved field is expected to be replaced with whatever field will come next, expecting that this field will be larger than 1 byte. The effect of this change is to increase the size from 32 to 64 bytes before we actually have fields using that memory. Clarify the allocation size and alignment requirements in the struct rseq uapi comment. Change the value returned by getauxval(AT_RSEQ_ALIGN) to return the value of the active rseq area size rounded up to the next power of 2, which guarantees that the rseq structure will always be aligned on the nearest power of two large enough to contain it, even as it grows. Change the alignment check in the rseq registration accordingly. This will minimize the amount of ABI corner-cases we need to document and require userspace to play games with. 
The rule stays simple when __rseq_size != 32: #define rseq_field_available(field) (__rseq_size >= offsetofend(struct rseq_abi, field)) Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://patch.msgid.link/20260220200642.1317826-3-mathieu.desnoyers@efficios.com
190 lines
5.9 KiB
C
190 lines
5.9 KiB
C
/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
|
|
#ifndef _LINUX_RSEQ_H
|
|
#define _LINUX_RSEQ_H
|
|
|
|
#ifdef CONFIG_RSEQ
|
|
#include <linux/sched.h>
|
|
|
|
#include <uapi/linux/rseq.h>
|
|
|
|
void __rseq_handle_slowpath(struct pt_regs *regs);
|
|
|
|
/* Invoked from resume_user_mode_work() */
|
|
static inline void rseq_handle_slowpath(struct pt_regs *regs)
|
|
{
|
|
if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
|
|
if (current->rseq.event.slowpath)
|
|
__rseq_handle_slowpath(regs);
|
|
} else {
|
|
/* '&' is intentional to spare one conditional branch */
|
|
if (current->rseq.event.sched_switch & current->rseq.event.has_rseq)
|
|
__rseq_handle_slowpath(regs);
|
|
}
|
|
}
|
|
|
|
void __rseq_signal_deliver(int sig, struct pt_regs *regs);
|
|
|
|
/*
|
|
* Invoked from signal delivery to fixup based on the register context before
|
|
* switching to the signal delivery context.
|
|
*/
|
|
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs)
|
|
{
|
|
if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
|
|
/* '&' is intentional to spare one conditional branch */
|
|
if (current->rseq.event.has_rseq & current->rseq.event.user_irq)
|
|
__rseq_signal_deliver(ksig->sig, regs);
|
|
} else {
|
|
if (current->rseq.event.has_rseq)
|
|
__rseq_signal_deliver(ksig->sig, regs);
|
|
}
|
|
}
|
|
|
|
/*
 * Arm the per-task rseq work flag so the rseq state is (re)evaluated on the
 * next exit to user mode. NOTE(review): on architectures without the generic
 * TIF bits, TIF_RSEQ presumably maps onto TIF_NOTIFY_RESUME — see
 * rseq_virt_userspace_exit(); confirm against the arch headers.
 */
static inline void rseq_raise_notify_resume(struct task_struct *t)
{
	set_tsk_thread_flag(t, TIF_RSEQ);
}
|
|
|
|
/* Invoked from context switch to force evaluation on exit to user */
|
|
static __always_inline void rseq_sched_switch_event(struct task_struct *t)
|
|
{
|
|
struct rseq_event *ev = &t->rseq.event;
|
|
|
|
if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
|
|
/*
|
|
* Avoid a boat load of conditionals by using simple logic
|
|
* to determine whether NOTIFY_RESUME needs to be raised.
|
|
*
|
|
* It's required when the CPU or MM CID has changed or
|
|
* the entry was from user space.
|
|
*/
|
|
bool raise = (ev->user_irq | ev->ids_changed) & ev->has_rseq;
|
|
|
|
if (raise) {
|
|
ev->sched_switch = true;
|
|
rseq_raise_notify_resume(t);
|
|
}
|
|
} else {
|
|
if (ev->has_rseq) {
|
|
t->rseq.event.sched_switch = true;
|
|
rseq_raise_notify_resume(t);
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
 * Invoked from __set_task_cpu() when a task migrates or from
 * mm_cid_schedin() when the CID changes to enforce an IDs update.
 *
 * This does not raise TIF_NOTIFY_RESUME as that happens in
 * rseq_sched_switch_event().
 */
static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t)
{
	/* Consumed by rseq_sched_switch_event() to decide whether to raise work */
	t->rseq.event.ids_changed = true;
}
|
|
|
|
/* Enforce a full update after RSEQ registration and when execve() failed */
|
|
static inline void rseq_force_update(void)
|
|
{
|
|
if (current->rseq.event.has_rseq) {
|
|
current->rseq.event.ids_changed = true;
|
|
current->rseq.event.sched_switch = true;
|
|
rseq_raise_notify_resume(current);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* KVM/HYPERV invoke resume_user_mode_work() before entering guest mode,
|
|
* which clears TIF_NOTIFY_RESUME on architectures that don't use the
|
|
* generic TIF bits and therefore can't provide a separate TIF_RSEQ flag.
|
|
*
|
|
* To avoid updating user space RSEQ in that case just to do it eventually
|
|
* again before returning to user space, because __rseq_handle_slowpath()
|
|
* does nothing when invoked with NULL register state.
|
|
*
|
|
* After returning from guest mode, before exiting to userspace, hypervisors
|
|
* must invoke this function to re-raise TIF_NOTIFY_RESUME if necessary.
|
|
*/
|
|
static inline void rseq_virt_userspace_exit(void)
|
|
{
|
|
/*
|
|
* The generic optimization for deferring RSEQ updates until the next
|
|
* exit relies on having a dedicated TIF_RSEQ.
|
|
*/
|
|
if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) &&
|
|
current->rseq.event.sched_switch)
|
|
rseq_raise_notify_resume(current);
|
|
}
|
|
|
|
/*
 * Clear all rseq state of @t and mark the CPU ID as uninitialized, so a
 * subsequent registration starts from a clean slate.
 */
static inline void rseq_reset(struct task_struct *t)
{
	memset(&t->rseq, 0, sizeof(t->rseq));
	/* Must follow the memset: distinguishes "no IDs yet" from CPU 0 */
	t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
}
|
|
|
|
/* execve() starts with no rseq registration; drop any inherited state */
static inline void rseq_execve(struct task_struct *t)
{
	rseq_reset(t);
}
|
|
|
|
/*
|
|
* If parent process has a registered restartable sequences area, the
|
|
* child inherits. Unregister rseq for a clone with CLONE_VM set.
|
|
*
|
|
* On fork, keep the IDs (CPU, MMCID) of the parent, which avoids a fault
|
|
* on the COW page on exit to user space, when the child stays on the same
|
|
* CPU as the parent. That's obviously not guaranteed, but in overcommit
|
|
* scenarios it is more likely and optimizes for the fork/exec case without
|
|
* taking the fault.
|
|
*/
|
|
static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
|
|
{
|
|
if (clone_flags & CLONE_VM)
|
|
rseq_reset(t);
|
|
else
|
|
t->rseq = current->rseq;
|
|
}
|
|
|
|
/*
 * Value returned by getauxval(AT_RSEQ_ALIGN) and expected by rseq
 * registration. This is the active rseq area size rounded up to next
 * power of 2, which guarantees that the rseq structure will always be
 * aligned on the nearest power of two large enough to contain it, even
 * as it grows.
 */
static inline unsigned int rseq_alloc_align(void)
{
	/* offsetof(struct rseq, end) is the active area size; round up to 2^n */
	return 1U << get_count_order(offsetof(struct rseq, end));
}
|
|
|
|
#else /* CONFIG_RSEQ */
|
|
/* Empty stubs so callers need no #ifdef CONFIG_RSEQ guards */
static inline void rseq_handle_slowpath(struct pt_regs *regs) { }
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
static inline void rseq_sched_switch_event(struct task_struct *t) { }
static inline void rseq_sched_set_ids_changed(struct task_struct *t) { }
static inline void rseq_force_update(void) { }
static inline void rseq_virt_userspace_exit(void) { }
static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
static inline void rseq_execve(struct task_struct *t) { }
|
|
#endif /* !CONFIG_RSEQ */
|
|
|
|
#ifdef CONFIG_DEBUG_RSEQ
|
|
void rseq_syscall(struct pt_regs *regs);
|
|
#else /* CONFIG_DEBUG_RSEQ */
|
|
static inline void rseq_syscall(struct pt_regs *regs) { }
|
|
#endif /* !CONFIG_DEBUG_RSEQ */
|
|
|
|
#ifdef CONFIG_RSEQ_SLICE_EXTENSION
|
|
void rseq_syscall_enter_work(long syscall);
|
|
int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3);
|
|
#else /* CONFIG_RSEQ_SLICE_EXTENSION */
|
|
/* Stubs when slice extension support is compiled out */
static inline void rseq_syscall_enter_work(long syscall) { }
static inline int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
{
	/* Feature not configured: reject the prctl() request */
	return -ENOTSUPP;
}
|
|
#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
|
|
|
|
#endif /* _LINUX_RSEQ_H */
|