Files
Thomas Gleixner dd0a046069 rseq: Implement syscall entry work for time slice extensions
The kernel sets SYSCALL_WORK_RSEQ_SLICE when it grants a time slice
extension. This allows the kernel to handle the rseq_slice_yield() syscall,
which is used by user space to relinquish the CPU after finishing the
critical section for which it requested an extension.

In case the kernel state is still GRANTED, the kernel resets both kernel
and user space state with a set of sanity checks. If the kernel state is
already cleared, then this raced against the timer or some other interrupt
and just clears the work bit.

Doing it in syscall entry work allows catching misbehaving user space,
which issues an arbitrary syscall, i.e. not rseq_slice_yield(), from the
critical section. Contrary to the initial strict requirement to use
rseq_slice_yield(), arbitrary syscalls are no longer considered a violation
of the ABI contract. This allows onion architecture applications, which
cannot control the code inside a critical section, to utilize this as well.

If the code detects inconsistent user space state, that results in a
SIGSEGV for the application.

If the grant was still active and the task was not preempted yet, the work
code reschedules immediately before continuing through the syscall.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251215155709.005777059@linutronix.de
2026-01-22 11:11:18 +01:00

178 lines
5.5 KiB
C

/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
#ifndef _LINUX_RSEQ_H
#define _LINUX_RSEQ_H
#ifdef CONFIG_RSEQ
#include <linux/sched.h>
#include <uapi/linux/rseq.h>
void __rseq_handle_slowpath(struct pt_regs *regs);

/*
 * Invoked from resume_user_mode_work().
 *
 * Hands @regs to __rseq_handle_slowpath() only when the event state of
 * current asks for it:
 *
 *  - CONFIG_GENERIC_ENTRY=y: event.slowpath is set
 *  - CONFIG_GENERIC_ENTRY=n: event.sched_switch & event.has_rseq is non-zero
 */
static inline void rseq_handle_slowpath(struct pt_regs *regs)
{
	if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
		if (!current->rseq.event.slowpath)
			return;
	} else {
		/* Bitwise '&' on purpose: it spares one conditional branch */
		if (!(current->rseq.event.sched_switch & current->rseq.event.has_rseq))
			return;
	}

	__rseq_handle_slowpath(regs);
}
void __rseq_signal_deliver(int sig, struct pt_regs *regs);

/*
 * Invoked from signal delivery, before switching to the signal delivery
 * context, so that the fixup can be done based on the register context.
 *
 * __rseq_signal_deliver() is called with the signal number of @ksig when:
 *
 *  - CONFIG_GENERIC_IRQ_ENTRY=y: event.has_rseq & event.user_irq is non-zero
 *  - CONFIG_GENERIC_IRQ_ENTRY=n: event.has_rseq is set
 */
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs)
{
	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
		/* Bitwise '&' on purpose: it spares one conditional branch */
		if (!(current->rseq.event.has_rseq & current->rseq.event.user_irq))
			return;
	} else {
		if (!current->rseq.event.has_rseq)
			return;
	}

	__rseq_signal_deliver(ksig->sig, regs);
}
/* Set the TIF_RSEQ thread flag of task @t */
static inline void rseq_raise_notify_resume(struct task_struct *t)
{
	set_tsk_thread_flag(t, TIF_RSEQ);
}
/*
 * Invoked from context switch to force evaluation on exit to user.
 *
 * When the conditions below are met, event.sched_switch of @t is set and
 * the notification is raised via rseq_raise_notify_resume().
 */
static __always_inline void rseq_sched_switch_event(struct task_struct *t)
{
	struct rseq_event *ev = &t->rseq.event;

	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
		/*
		 * Plain bit logic instead of a boat load of conditionals
		 * decides whether NOTIFY_RESUME has to be raised.
		 *
		 * That is the case when the CPU or MM CID has changed or
		 * when the entry was from user space, provided that the
		 * task has rseq at all.
		 */
		if (!((ev->user_irq | ev->ids_changed) & ev->has_rseq))
			return;
	} else {
		if (!ev->has_rseq)
			return;
	}

	ev->sched_switch = true;
	rseq_raise_notify_resume(t);
}
/*
 * Enforce an IDs update for task @t by marking event.ids_changed.
 *
 * Invoked from __set_task_cpu() when a task migrates and from
 * mm_cid_schedin() when the CID changes.
 *
 * TIF_NOTIFY_RESUME is deliberately not raised here. That is left to
 * rseq_sched_switch_event().
 */
static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t)
{
	t->rseq.event.ids_changed = true;
}
/*
 * Enforce a full update after RSEQ registration and when execve() failed.
 *
 * Does nothing unless current has rseq. Otherwise both event.ids_changed and
 * event.sched_switch are set and the notification is raised for current.
 */
static inline void rseq_force_update(void)
{
	if (!current->rseq.event.has_rseq)
		return;

	current->rseq.event.ids_changed = true;
	current->rseq.event.sched_switch = true;
	rseq_raise_notify_resume(current);
}
/*
 * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode.
 * On architectures which don't use the generic TIF bits, and therefore
 * can't provide a separate TIF_RSEQ flag, that clears TIF_NOTIFY_RESUME.
 *
 * __rseq_handle_slowpath() does nothing when invoked with NULL register
 * state. That avoids updating user space RSEQ in that case just to do it
 * eventually again before returning to user space.
 *
 * Hypervisors must therefore invoke this function after returning from
 * guest mode and before exiting to user space, so that TIF_NOTIFY_RESUME
 * is raised again if necessary.
 */
static inline void rseq_virt_userspace_exit(void)
{
	/*
	 * Deferring RSEQ updates until the next exit is a generic
	 * optimization which relies on having a dedicated TIF_RSEQ. With
	 * the generic TIF bits this function has nothing to re-raise.
	 */
	if (IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS))
		return;

	if (current->rseq.event.sched_switch)
		rseq_raise_notify_resume(current);
}
/*
 * Wipe the complete rseq state of task @t and mark its CPU ID as
 * uninitialized.
 */
static inline void rseq_reset(struct task_struct *t)
{
	/* Zero everything first, then set the one non-zero default */
	memset(&t->rseq, 0, sizeof(t->rseq));

	t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
}
/* execve() hook: the rseq state of task @t is reset via rseq_reset() */
static inline void rseq_execve(struct task_struct *t)
{
	rseq_reset(t);
}
/*
 * Set up the rseq state of the new task @t according to @clone_flags.
 *
 * Without CLONE_VM the child inherits the rseq state of the parent, which
 * covers a registered restartable sequences area. A clone with CLONE_VM
 * set gets rseq unregistered.
 *
 * The inherited state includes the IDs (CPU, MMCID) of the parent. As long
 * as the child stays on the same CPU as the parent, that avoids a fault on
 * the COW page on exit to user space. That is obviously not guaranteed, but
 * in overcommit scenarios it is more likely, and it optimizes the fork/exec
 * case by not taking the fault.
 */
static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
{
	if (!(clone_flags & CLONE_VM)) {
		/* fork(): plain copy of the parent's state, IDs included */
		t->rseq = current->rseq;
		return;
	}

	/* Shared VM: start out unregistered */
	rseq_reset(t);
}
#else /* CONFIG_RSEQ */
/*
 * CONFIG_RSEQ=n: the hooks are provided as empty inline functions with the
 * same signatures as their CONFIG_RSEQ=y counterparts.
 */
static inline void rseq_handle_slowpath(struct pt_regs *regs) { }
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
static inline void rseq_sched_switch_event(struct task_struct *t) { }
static inline void rseq_sched_set_ids_changed(struct task_struct *t) { }
static inline void rseq_force_update(void) { }
static inline void rseq_virt_userspace_exit(void) { }
static inline void rseq_execve(struct task_struct *t) { }
static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
#endif /* !CONFIG_RSEQ */
/*
 * rseq_syscall() is only implemented (out of line) for CONFIG_DEBUG_RSEQ=y.
 * Otherwise it is an empty inline function.
 */
#ifdef CONFIG_DEBUG_RSEQ
void rseq_syscall(struct pt_regs *regs);
#else /* CONFIG_DEBUG_RSEQ */
static inline void rseq_syscall(struct pt_regs *regs)
{
}
#endif /* !CONFIG_DEBUG_RSEQ */
#ifdef CONFIG_RSEQ_SLICE_EXTENSION
/* Implemented out of line when time slice extensions are enabled */
void rseq_syscall_enter_work(long syscall);
int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3);
#else /* CONFIG_RSEQ_SLICE_EXTENSION */
/* Time slice extensions are compiled out: there is no syscall entry work */
static inline void rseq_syscall_enter_work(long syscall) { }

/*
 * Time slice extensions are compiled out: reject every request.
 *
 * @arg2 and @arg3 are ignored.
 *
 * Returns -EOPNOTSUPP. The kernel internal ENOTSUPP has no user space
 * errno definition and therefore must not be used for a value which can
 * end up as a system call return value.
 *
 * NOTE(review): judging by its name this function backs a prctl() option,
 * so the return value presumably reaches user space - confirm against the
 * caller.
 */
static inline int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
{
	return -EOPNOTSUPP;
}
#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
#endif /* _LINUX_RSEQ_H */