Files
linux-stable-mirror/include/linux/entry-common.h
T
Linus Torvalds c43267e679 Merge tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux
Pull arm64 updates from Catalin Marinas:
 "The biggest changes are MPAM enablement in drivers/resctrl and new PMU
  support under drivers/perf.

  On the core side, FEAT_LSUI lets futex atomic operations run with EL0
  permissions, avoiding PAN toggling.

  The rest is mostly TLB invalidation refactoring, further generic entry
  work, sysreg updates and a few fixes.

  Core features:

   - Add support for FEAT_LSUI, allowing futex atomic operations without
     toggling Privileged Access Never (PAN)

   - Further refactor the arm64 exception handling code towards the
     generic entry infrastructure

   - Optimise __READ_ONCE() with CONFIG_LTO=y and allow alias analysis
     through it

  Memory management:

   - Refactor the arm64 TLB invalidation API and implementation for
     better control over barrier placement and level-hinted invalidation

   - Enable batched TLB flushes during memory hot-unplug

   - Fix rodata=full block mapping support for realm guests (when
     BBML2_NOABORT is available)

  Perf and PMU:

   - Add support for a whole bunch of system PMUs featured in NVIDIA's
     Tegra410 SoC (cspmu extensions for the fabric and PCIe, new drivers
     for CPU/C2C memory latency PMUs)

   - Clean up iomem resource handling in the Arm CMN driver

   - Fix signedness handling of AA64DFR0.{PMUVer,PerfMon}

  MPAM (Memory Partitioning And Monitoring):

   - Add architecture context-switch and hiding of the feature from KVM

   - Add interface to allow MPAM to be exposed to user-space using
     resctrl

   - Add errata workaround for some existing platforms

   - Add documentation for using MPAM and what shape of platforms can
     use resctrl

  Miscellaneous:

   - Check DAIF (and PMR, where relevant) at task-switch time

   - Skip TFSR_EL1 checks and barriers in synchronous MTE tag check mode
     (only relevant to asynchronous or asymmetric tag check modes)

   - Remove a duplicate allocation in the kexec code

   - Remove redundant save/restore of SCS SP on entry to/from EL0

   - Generate the KERNEL_HWCAP_ definitions from the arm64 hwcap
     descriptions

   - Add kselftest coverage for cmpbr_sigill()

   - Update sysreg definitions"

* tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux: (109 commits)
  arm64: rsi: use linear-map alias for realm config buffer
  arm64: Kconfig: fix duplicate word in CMDLINE help text
  arm64: mte: Skip TFSR_EL1 checks and barriers in synchronous tag check mode
  arm64/sysreg: Update ID_AA64SMFR0_EL1 description to DDI0601 2025-12
  arm64/sysreg: Update ID_AA64ZFR0_EL1 description to DDI0601 2025-12
  arm64/sysreg: Update ID_AA64FPFR0_EL1 description to DDI0601 2025-12
  arm64/sysreg: Update ID_AA64ISAR2_EL1 description to DDI0601 2025-12
  arm64/sysreg: Update ID_AA64ISAR0_EL1 description to DDI0601 2025-12
  arm64/hwcap: Generate the KERNEL_HWCAP_ definitions for the hwcaps
  arm64: kexec: Remove duplicate allocation for trans_pgd
  ACPI: AGDI: fix missing newline in error message
  arm64: Check DAIF (and PMR) at task-switch time
  arm64: entry: Use split preemption logic
  arm64: entry: Use irqentry_{enter_from,exit_to}_kernel_mode()
  arm64: entry: Consistently prefix arm64-specific wrappers
  arm64: entry: Don't preempt with SError or Debug masked
  entry: Split preemption from irqentry_exit_to_kernel_mode()
  entry: Split kernel mode logic from irqentry_{enter,exit}()
  entry: Move irqentry_enter() prototype later
  entry: Remove local_irq_{enable,disable}_exit_to_user()
  ...
2026-04-14 16:48:56 -07:00

334 lines
10 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_ENTRYCOMMON_H
#define __LINUX_ENTRYCOMMON_H
#include <linux/audit.h>
#include <linux/irq-entry-common.h>
#include <linux/livepatch.h>
#include <linux/ptrace.h>
#include <linux/resume_user_mode.h>
#include <linux/seccomp.h>
#include <linux/sched.h>
#include <asm/entry-common.h>
#include <asm/syscall.h>
/* Fallback: _TIF_UPROBE is an empty mask unless it was already defined. */
#ifndef _TIF_UPROBE
# define _TIF_UPROBE (0)
#endif
/*
 * Architecture specific SYSCALL_WORK flags handled in
 * syscall_enter_from_user_mode(). They are OR'ed into SYSCALL_WORK_ENTER
 * below; the default contributes no extra bits.
 */
#ifndef ARCH_SYSCALL_WORK_ENTER
# define ARCH_SYSCALL_WORK_ENTER (0)
#endif
/*
 * Architecture specific SYSCALL_WORK flags handled in
 * syscall_exit_to_user_mode(). They are OR'ed into SYSCALL_WORK_EXIT
 * below; the default contributes no extra bits.
 */
#ifndef ARCH_SYSCALL_WORK_EXIT
# define ARCH_SYSCALL_WORK_EXIT (0)
#endif
/*
 * Mask of syscall_work bits which make
 * syscall_enter_from_user_mode_work() call syscall_trace_enter().
 */
#define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP | \
SYSCALL_WORK_SYSCALL_TRACEPOINT | \
SYSCALL_WORK_SYSCALL_TRACE | \
SYSCALL_WORK_SYSCALL_EMU | \
SYSCALL_WORK_SYSCALL_AUDIT | \
SYSCALL_WORK_SYSCALL_USER_DISPATCH | \
SYSCALL_WORK_SYSCALL_RSEQ_SLICE | \
ARCH_SYSCALL_WORK_ENTER)
/*
 * Mask of syscall_work bits which make
 * syscall_exit_to_user_mode_work() call syscall_exit_work().
 */
#define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \
SYSCALL_WORK_SYSCALL_TRACE | \
SYSCALL_WORK_SYSCALL_AUDIT | \
SYSCALL_WORK_SYSCALL_USER_DISPATCH | \
SYSCALL_WORK_SYSCALL_EXIT_TRAP | \
ARCH_SYSCALL_WORK_EXIT)
/**
 * arch_ptrace_report_syscall_entry - Architecture specific ptrace_report_syscall_entry() wrapper
 * @regs: Pointer to the register state at syscall entry
 *
 * Invoked from syscall_trace_enter() to wrap ptrace_report_syscall_entry().
 *
 * This allows architecture specific ptrace_report_syscall_entry()
 * implementations. If not defined by the architecture this falls back to
 * ptrace_report_syscall_entry().
 *
 * Returns: The fallback passes through the return value of
 * ptrace_report_syscall_entry(). A non-zero value makes
 * syscall_trace_enter() return -1.
 */
static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs);
/*
 * Generic fallback, compiled only when no macro named
 * arch_ptrace_report_syscall_entry is defined.
 */
#ifndef arch_ptrace_report_syscall_entry
static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs)
{
return ptrace_report_syscall_entry(regs);
}
#endif
/*
 * Out-of-line helpers used by the inline entry/exit code in this header.
 *
 * syscall_user_dispatch(): called from syscall_trace_enter() when
 * SYSCALL_WORK_SYSCALL_USER_DISPATCH is set; a true return makes
 * syscall_trace_enter() return -1.
 */
bool syscall_user_dispatch(struct pt_regs *regs);
/*
 * trace_syscall_enter(): called from syscall_trace_enter() when
 * SYSCALL_WORK_SYSCALL_TRACEPOINT is set; its return value replaces the
 * syscall number used from that point on.
 */
long trace_syscall_enter(struct pt_regs *regs, long syscall);
/*
 * trace_syscall_exit(): called from syscall_exit_work() when
 * SYSCALL_WORK_SYSCALL_TRACEPOINT is set, with the value obtained from
 * syscall_get_return_value() as @ret.
 */
void trace_syscall_exit(struct pt_regs *regs, long ret);
/*
 * Hand the syscall number and its first four arguments to
 * audit_syscall_entry(). Nothing is done when audit_context() evaluates
 * to false.
 */
static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
	unsigned long argv[6];

	/* Expected common case: no audit context, nothing to record. */
	if (likely(!audit_context()))
		return;

	syscall_get_arguments(current, regs, argv);
	audit_syscall_entry(syscall, argv[0], argv[1], argv[2], argv[3]);
}
/**
 * syscall_trace_enter - Run the syscall entry work selected by @work
 * @regs: Pointer to the register state at syscall entry
 * @work: Snapshot of the thread's syscall_work flags
 *
 * The work items run in a fixed order: syscall user dispatch, rseq slice
 * handling, ptrace, seccomp, tracepoint and finally audit.
 *
 * Returns: -1 when syscall user dispatch took the syscall, when the
 * ptrace report returned non-zero, when SYSCALL_WORK_SYSCALL_EMU is set
 * or when __secure_computing() returned -1. Otherwise a non-zero
 * __secure_computing() result if there is one, else the (possibly
 * modified) syscall number.
 */
static __always_inline long syscall_trace_enter(struct pt_regs *regs, unsigned long work)
{
	const unsigned long ptrace_mask = SYSCALL_WORK_SYSCALL_TRACE |
					  SYSCALL_WORK_SYSCALL_EMU;
	long ret = 0;
	long nr;

	/*
	 * Syscall User Dispatch has to be handled before anything else:
	 * the ABI of a dispatched syscall may not make sense to any of
	 * the other syscall_work features.
	 */
	if ((work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) && syscall_user_dispatch(regs))
		return -1L;

	/*
	 * User space was granted a time slice extension and now gives up
	 * the CPU. Stop the slice timer here so that an extra round
	 * through hrtimer_interrupt() is avoided.
	 */
	if (work & SYSCALL_WORK_SYSCALL_RSEQ_SLICE)
		rseq_syscall_enter_work(syscall_get_nr(current, regs));

	/* ptrace: a non-zero report or SYSCALL_EMU skips the syscall */
	if (work & ptrace_mask) {
		ret = arch_ptrace_report_syscall_entry(regs);
		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
			return -1L;
	}

	/* seccomp runs after ptrace so it sees any changes made by a tracer */
	if (work & SYSCALL_WORK_SECCOMP) {
		ret = __secure_computing();
		if (ret == -1L)
			return -1L;
	}

	/* ptrace or seccomp may have rewritten the syscall number */
	nr = syscall_get_nr(current, regs);

	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
		nr = trace_syscall_enter(regs, nr);

	syscall_enter_audit(regs, nr);

	/* A non-zero result from above takes precedence over the number */
	if (ret)
		return ret;

	return nr;
}
/**
 * syscall_enter_from_user_mode_work - Check and handle work before invoking
 *				       a syscall
 * @regs:	Pointer to current's pt_regs
 * @syscall:	The syscall number
 *
 * To be called by the architecture specific syscall entry code with
 * interrupts enabled, i.e. after enter_from_user_mode() was invoked,
 * interrupts were enabled and any extra architecture specific work was
 * done.
 *
 * Work items handled here:
 *
 *  1) syscall_work flag dependent invocations of
 *     ptrace_report_syscall_entry(), __secure_computing(), trace_sys_enter()
 *  2) Invocation of audit_syscall_entry()
 *
 * Returns: The original or a modified syscall number
 *
 * A return value of -1 means that the syscall should be skipped. The
 * caller may invoke syscall_set_error() or syscall_set_return_value()
 * first in that case. If neither of them is called and -1 is returned,
 * the syscall fails with ENOSYS.
 */
static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);

	/* No entry work pending: hand the syscall number back untouched. */
	if (!(work & SYSCALL_WORK_ENTER))
		return syscall;

	return syscall_trace_enter(regs, work);
}
/**
 * syscall_enter_from_user_mode - Establish state and check and handle work
 *				  before invoking a syscall
 * @regs:	Pointer to current's pt_regs
 * @syscall:	The syscall number
 *
 * To be called by the architecture specific syscall entry code with
 * interrupts disabled. The calling code has to be non-instrumentable.
 * On return all state is correct, interrupts are enabled and the
 * functions which follow can be instrumented.
 *
 * Combines enter_from_user_mode() and
 * syscall_enter_from_user_mode_work() for the case that no architecture
 * specific work has to be done between the two.
 *
 * Returns: The original or a modified syscall number. See
 * syscall_enter_from_user_mode_work() for further explanation.
 */
static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
	long nr;

	enter_from_user_mode(regs);

	/* Interrupt enabling and the entry work sit inside the instrumentation markers. */
	instrumentation_begin();
	local_irq_enable();
	nr = syscall_enter_from_user_mode_work(regs, syscall);
	instrumentation_end();

	return nr;
}
/*
 * Decide whether syscall exit has to be reported as a single step.
 *
 * With SYSCALL_EMU set the only reason to report would be SINGLESTEP
 * (i.e. PTRACE_SYSEMU_SINGLESTEP), but that syscall instruction was
 * already reported in syscall_enter_from_user_mode(), so nothing is
 * reported again. Otherwise the result follows SYSCALL_EXIT_TRAP.
 */
static __always_inline bool report_single_step(unsigned long work)
{
	return !(work & SYSCALL_WORK_SYSCALL_EMU) &&
	       (work & SYSCALL_WORK_SYSCALL_EXIT_TRAP);
}
/**
 * arch_ptrace_report_syscall_exit - Architecture specific ptrace_report_syscall_exit()
 * @regs: Pointer to the register state at syscall exit
 * @step: Indicates a single-step exit rather than a normal syscall exit
 *
 * Invoked from syscall_exit_work() to wrap ptrace_report_syscall_exit().
 *
 * This allows architecture specific ptrace_report_syscall_exit()
 * implementations. If not defined by the architecture this falls back to
 * ptrace_report_syscall_exit().
 */
static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
int step);
/*
 * Generic fallback, compiled only when no macro named
 * arch_ptrace_report_syscall_exit is defined.
 */
#ifndef arch_ptrace_report_syscall_exit
static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
int step)
{
ptrace_report_syscall_exit(regs, step);
}
#endif
/**
 * syscall_exit_work - Handle work before returning to user mode
 * @regs:	Pointer to current pt_regs
 * @work:	Current thread syscall work
 *
 * Performs the one-time syscall specific exit work: audit, the exit
 * tracepoint and the ptrace exit report, in that order.
 */
static __always_inline void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
	bool single_step;

	/*
	 * A syscall which was rolled back by syscall user dispatching is
	 * hidden from the tracers below, for the same reason the entry
	 * side skipped them in syscall_trace_enter(): the ABI of such a
	 * syscall is unknown.
	 */
	if ((work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) &&
	    unlikely(current->syscall_dispatch.on_dispatch)) {
		current->syscall_dispatch.on_dispatch = false;
		return;
	}

	audit_syscall_exit(regs);

	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) {
		long retval = syscall_get_return_value(current, regs);

		trace_syscall_exit(regs, retval);
	}

	/* Report to ptrace for a single step or when syscall tracing is on */
	single_step = report_single_step(work);
	if (single_step || (work & SYSCALL_WORK_SYSCALL_TRACE))
		arch_ptrace_report_syscall_exit(regs, single_step);
}
/**
 * syscall_exit_to_user_mode_work - Handle one time work before returning to user mode
 * @regs:	Pointer to current's pt_regs
 *
 * This is step 1 of syscall_exit_to_user_mode() and uses the same calling
 * convention.
 *
 * Callers have to invoke steps 2-3 of syscall_exit_to_user_mode()
 * afterwards.
 */
static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	unsigned long exit_work = READ_ONCE(current_thread_info()->syscall_work);
	unsigned long sysnr = syscall_get_nr(current, regs);

	CT_WARN_ON(ct_state() != CT_STATE_KERNEL);

	/* With PROVE_LOCKING: warn if IRQs were left disabled, then re-enable them */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
	    WARN(irqs_disabled(), "syscall %lu left IRQs disabled", sysnr))
		local_irq_enable();

	rseq_debug_syscall_return(regs);

	/*
	 * One-time syscall specific work. When any of these work items is
	 * enabled it has to run exactly once per syscall exit, with
	 * interrupts enabled.
	 */
	if (unlikely(exit_work & SYSCALL_WORK_EXIT))
		syscall_exit_work(regs, exit_work);
}
/**
 * syscall_exit_to_user_mode - Handle work before returning to user mode
 * @regs: Pointer to current's pt_regs
 *
 * Invoked with interrupts enabled and fully valid @regs. Returns with all
 * work handled, interrupts disabled such that the caller can immediately
 * switch to user mode. Called from architecture specific syscall and ret
 * from fork code.
 *
 * The call order is:
 * 1) One-time syscall exit work:
 * - rseq syscall exit
 * - audit
 * - syscall tracing
 * - ptrace (single stepping)
 *
 * 2) Preparatory work
 * - Disable interrupts
 * - Exit to user mode loop (common TIF handling). Invokes
 * arch_exit_to_user_mode_work() for architecture specific TIF work
 * - Architecture specific one time work arch_exit_to_user_mode_prepare()
 * - Address limit and lockdep checks
 *
 * 3) Final transition (lockdep, tracing, context tracking, RCU), i.e. the
 * functionality in exit_to_user_mode().
 *
 * This is a combination of syscall_exit_to_user_mode_work() (1), disabling
 * interrupts followed by syscall_exit_to_user_mode_prepare() (2) and
 * exit_to_user_mode() (3). This function is preferred unless there is a
 * compelling architectural reason to invoke the functions separately.
 */
static __always_inline void syscall_exit_to_user_mode(struct pt_regs *regs)
{
instrumentation_begin();
/* Step 1: one-time syscall exit work */
syscall_exit_to_user_mode_work(regs);
/* Step 2: disable interrupts, then the preparatory work */
local_irq_disable();
syscall_exit_to_user_mode_prepare(regs);
instrumentation_end();
/* Step 3: final transition, called after instrumentation_end() */
exit_to_user_mode();
}
#endif