Files
linux-stable-mirror/kernel/futex/syscalls.c
T
Thomas Gleixner 3ca9595d9f futex: Add support for unlocking robust futexes
Unlocking robust non-PI futexes happens in user space with the following
sequence:

  1)	robust_list_set_op_pending(mutex);
  2)	robust_list_remove(mutex);

  	lval = 0;
  3)	lval = atomic_xchg(lock, lval);
  4)	if (lval & WAITERS)
  5)		sys_futex(WAKE,....);
  6)	robust_list_clear_op_pending();

That opens a window between #3 and #6 where the mutex could be acquired by
some other task which observes that it is the last user and:

  A) unmaps the mutex memory
  B) maps a different file, which ends up covering the same address

When the original task exits before reaching #6 then the kernel robust list
handling observes the pending op entry and tries to fix up user space.

If the newly mapped data contains the TID of the exiting thread at the
address of the mutex/futex, the kernel will set the owner died bit in that
memory, thereby corrupting unrelated data.

PI futexes have a similar problem both for the non-contended user space
unlock and the in kernel unlock:

  1)	robust_list_set_op_pending(mutex);
  2)	robust_list_remove(mutex);

  	lval = gettid();
  3)	if (!atomic_try_cmpxchg(lock, lval, 0))
  4)		sys_futex(UNLOCK_PI,....);
  5)	robust_list_clear_op_pending();

Address the first part of the problem where the futexes have waiters and
need to enter the kernel anyway. Add a new FUTEX_ROBUST_UNLOCK flag, which
is valid for the sys_futex() FUTEX_UNLOCK_PI, FUTEX_WAKE, FUTEX_WAKE_BITSET
operations.

This deliberately omits FUTEX_WAKE_OP from this treatment as it's unclear
whether this is needed and there is no usage of it in glibc either to
investigate.

For the futex2 syscall family this needs to be implemented with a new
syscall.

The sys_futex() case [ab]uses the @uaddr2 argument to hand the pointer to
robust_list_head::list_pending_op into the kernel. This argument is only
evaluated when the FUTEX_ROBUST_UNLOCK bit is set and is therefore backward
compatible.

This is an explicit argument to avoid the lookup of the robust list pointer
and retrieving the pending op pointer from there. User space has the
pointer already available so it can just put it into the @uaddr2
argument. Aside from that, this allows the usage of multiple robust lists in
the future without any changes to the internal functions as they just operate
on the provided pointer.

This requires a second flag FUTEX_ROBUST_LIST32 which indicates that the
robust list pointer points to an u32 and not to an u64. This is required
for two reasons:

    1) sys_futex() has no compat variant

    2) The gaming emulators use both 64-bit and compat 32-bit robust
       lists in the same 64-bit application

As a consequence 32-bit applications have to set this flag unconditionally
so they can run on a 64-bit kernel in compat mode unmodified. 32-bit
kernels return an error code when the flag is not set. 64-bit kernels will
happily clear the full 64 bits if user space fails to set it.

In case of FUTEX_UNLOCK_PI this clears the robust list pending op when the
unlock succeeded. In case of errors, the user space value is still locked
by the caller and therefore the above cannot happen.

In case of FUTEX_WAKE* this does the unlock of the futex in the kernel and
clears the robust list pending op when the unlock was successful. If not,
the user space value is still locked and user space has to deal with the
returned error. That means that the unlocking of non-PI robust futexes has
to use the same try_cmpxchg() unlock scheme as PI futexes.

If the clearing of the pending list op fails (fault) then the kernel clears
the registered robust list pointer if it matches, to prevent exit() from
trying to handle invalid data. That's a valid paranoid decision because
the robust list head usually sits in the TLS and if the TLS is no longer
accessible then the chance for fixing up the resulting mess is very close
to zero.

The problem of non-contended unlocks still exists and will be addressed
separately.

Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: André Almeida <andrealmeid@igalia.com>
Link: https://patch.msgid.link/20260602090535.670514505@kernel.org
2026-06-03 11:38:51 +02:00

527 lines
14 KiB
C

// SPDX-License-Identifier: GPL-2.0-or-later
#include <linux/syscalls.h>
#include <linux/time_namespace.h>
#include "futex.h"
/*
* Support for robust futexes: the kernel cleans up held futexes at
* thread exit time.
*
* Implementation: user-space maintains a per-thread list of locks it
* is holding. Upon do_exit(), the kernel carefully walks this list,
* and marks all locks that are owned by this thread with the
* FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
* always manipulated with the lock held, so the list is private and
* per-thread. Userspace also maintains a per-thread 'list_op_pending'
* field, to allow the kernel to clean up if the thread dies after
* acquiring the lock, but just before it could have added itself to
* the list. There can only be one such pending lock.
*/
/**
 * sys_set_robust_list() - Register the robust-futex list head of the caller
 * @head:	user space pointer to the list head
 * @len:	size of the list head as user space sees it
 *
 * Return: 0 on success, -EINVAL when @len differs from sizeof(*head).
 */
SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, size_t, len)
{
	const bool len_matches = (len == sizeof(*head));

	/* Only a single list head layout is known, so the size has to match it. */
	if (unlikely(!len_matches))
		return -EINVAL;

	current->futex.robust_list = head;
	return 0;
}
/*
 * Return the robust list head pointer stored in @p: the compat pointer when
 * @compat is set and CONFIG_COMPAT is enabled, the native pointer otherwise.
 */
static inline void __user *futex_task_robust_list(struct task_struct *p, bool compat)
{
	void __user *head = p->futex.robust_list;

#ifdef CONFIG_COMPAT
	if (compat)
		head = p->futex.compat_robust_list;
#endif
	return head;
}
/*
 * Fetch the robust list head pointer of a task on behalf of the
 * get_robust_list() syscalls.
 *
 * @pid:	task to query, looked up with find_task_by_vpid(); zero selects
 *		the calling task
 * @compat:	passed on to futex_task_robust_list() to choose between the
 *		compat and the native list head
 *
 * Returns the user space pointer stored in the task, or an ERR_PTR() value:
 * -ESRCH when no task matches @pid, -EPERM when ptrace_may_access() denies
 * read access, or the error returned by down_read_killable().
 */
static void __user *futex_get_robust_list_common(int pid, bool compat)
{
struct task_struct *p = current;
void __user *head;
int ret;
/*
 * Look the task up and take a reference on it inside the RCU guard. The
 * reference is dropped with put_task_struct() on every exit path below.
 */
scoped_guard(rcu) {
if (pid) {
p = find_task_by_vpid(pid);
if (!p)
return (void __user *)ERR_PTR(-ESRCH);
}
get_task_struct(p);
}
/*
* Hold exec_update_lock to serialize with concurrent exec()
* so ptrace_may_access() is checked against stable credentials
*/
ret = down_read_killable(&p->signal->exec_update_lock);
if (ret)
goto err_put;
/* Preset the error code for the access check failure path. */
ret = -EPERM;
if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
goto err_unlock;
/* The pointer is read while exec_update_lock is still held. */
head = futex_task_robust_list(p, compat);
up_read(&p->signal->exec_update_lock);
put_task_struct(p);
return head;
/* Error unwinding: release in reverse order of acquisition. */
err_unlock:
up_read(&p->signal->exec_update_lock);
err_put:
put_task_struct(p);
return (void __user *)ERR_PTR(ret);
}
/**
 * sys_get_robust_list() - Retrieve the robust-futex list head of a task
 * @pid:	pid of the task to query [zero for the current task]
 * @head_ptr:	user space location which receives the list head pointer
 * @len_ptr:	user space location which receives the list head size
 *
 * Return: 0 on success, the lookup error of
 * futex_get_robust_list_common(), or -EFAULT when writing the results to
 * user space fails.
 */
SYSCALL_DEFINE3(get_robust_list, int, pid,
		struct robust_list_head __user * __user *, head_ptr,
		size_t __user *, len_ptr)
{
	struct robust_list_head __user *head;

	head = futex_get_robust_list_common(pid, false);
	if (IS_ERR(head))
		return PTR_ERR(head);

	/* The size is stored first, then the pointer itself. */
	if (put_user(sizeof(*head), len_ptr) != 0)
		return -EFAULT;

	return put_user(head, head_ptr);
}
/*
 * do_futex() - Dispatch a sys_futex() operation to its implementation
 * @uaddr:	first futex address
 * @op:		operation word; the command is @op & FUTEX_CMD_MASK, the
 *		remaining bits are translated by futex_to_flags()
 * @val:	command specific value
 * @timeout:	optional timeout, only handed to the waiting/locking commands
 * @uaddr2:	second user pointer; meaning depends on the command
 * @val2:	command specific value (used by requeue and wake-op)
 * @val3:	command specific value (bitset or compare value)
 *
 * Returns the result of the selected implementation, or -ENOSYS for an
 * unknown command or a flag which is not accepted for the command.
 */
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
	      u32 __user *uaddr2, u32 val2, u32 val3)
{
	const unsigned int flags = futex_to_flags(op);
	const int cmd = op & FUTEX_CMD_MASK;

	/* The realtime clock flag is accepted by three commands only. */
	if (flags & FLAGS_CLOCKRT) {
		switch (cmd) {
		case FUTEX_WAIT_BITSET:
		case FUTEX_WAIT_REQUEUE_PI:
		case FUTEX_LOCK_PI2:
			break;
		default:
			return -ENOSYS;
		}
	}

	/* Robust unlock is accepted by the wake and PI unlock commands only. */
	if (flags & FLAGS_ROBUST_UNLOCK) {
		switch (cmd) {
		case FUTEX_WAKE:
		case FUTEX_WAKE_BITSET:
		case FUTEX_UNLOCK_PI:
			break;
		default:
			return -ENOSYS;
		}
	}

	switch (cmd) {
	case FUTEX_WAIT:
		/* The plain variant matches any bitset. */
		return futex_wait(uaddr, flags, val, timeout, FUTEX_BITSET_MATCH_ANY);
	case FUTEX_WAIT_BITSET:
		return futex_wait(uaddr, flags, val, timeout, val3);

	case FUTEX_WAKE:
		/* The plain variant matches any bitset. */
		return futex_wake(uaddr, flags, uaddr2, val, FUTEX_BITSET_MATCH_ANY);
	case FUTEX_WAKE_BITSET:
		return futex_wake(uaddr, flags, uaddr2, val, val3);

	case FUTEX_REQUEUE:
		return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, NULL, 0);
	case FUTEX_CMP_REQUEUE:
		return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 0);
	case FUTEX_CMP_REQUEUE_PI:
		return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 1);

	case FUTEX_WAKE_OP:
		return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);

	case FUTEX_LOCK_PI:
		/* FUTEX_LOCK_PI always runs with the realtime clock flag set. */
		return futex_lock_pi(uaddr, flags | FLAGS_CLOCKRT, timeout, 0);
	case FUTEX_LOCK_PI2:
		return futex_lock_pi(uaddr, flags, timeout, 0);
	case FUTEX_TRYLOCK_PI:
		return futex_lock_pi(uaddr, flags, NULL, 1);
	case FUTEX_UNLOCK_PI:
		return futex_unlock_pi(uaddr, flags, uaddr2);

	case FUTEX_WAIT_REQUEUE_PI:
		/* The user supplied @val3 is ignored; any bitset matches. */
		return futex_wait_requeue_pi(uaddr, flags, val, timeout,
					     FUTEX_BITSET_MATCH_ANY, uaddr2);
	default:
		break;
	}

	return -ENOSYS;
}
/* Tell whether @cmd is one of the commands which take a timeout argument. */
static __always_inline bool futex_cmd_has_timeout(u32 cmd)
{
	return cmd == FUTEX_WAIT ||
	       cmd == FUTEX_LOCK_PI ||
	       cmd == FUTEX_LOCK_PI2 ||
	       cmd == FUTEX_WAIT_BITSET ||
	       cmd == FUTEX_WAIT_REQUEUE_PI;
}
/*
 * Convert the timespec @ts into the ktime_t value stored in @t.
 *
 * For FUTEX_WAIT the converted value is added to the current ktime_get()
 * reading. For every other command except FUTEX_LOCK_PI the value is passed
 * through timens_ktime_to_host(CLOCK_MONOTONIC, ...) unless @op carries
 * FUTEX_CLOCK_REALTIME.
 *
 * Returns 0 on success, or -EINVAL when @ts is not a valid timespec, in
 * which case @t is left untouched.
 */
static __always_inline int
futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
{
	ktime_t expires;

	if (!timespec64_valid(ts))
		return -EINVAL;

	expires = timespec64_to_ktime(*ts);

	if (cmd == FUTEX_WAIT) {
		expires = ktime_add_safe(ktime_get(), expires);
	} else if (!(cmd == FUTEX_LOCK_PI || (op & FUTEX_CLOCK_REALTIME))) {
		expires = timens_ktime_to_host(CLOCK_MONOTONIC, expires);
	}

	*t = expires;
	return 0;
}
/*
 * sys_futex - Entry point of the classic futex syscall
 *
 * Reads and converts the optional timeout for the commands which take one
 * and hands everything over to do_futex().
 */
SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
		const struct __kernel_timespec __user *, utime,
		u32 __user *, uaddr2, u32, val3)
{
	const int cmd = op & FUTEX_CMD_MASK;
	struct timespec64 ts;
	ktime_t *tp = NULL;
	ktime_t t;

	/*
	 * @utime is also forwarded as the val2 argument of do_futex(), so it
	 * is only interpreted as a timespec for commands taking a timeout.
	 */
	if (utime && futex_cmd_has_timeout(cmd)) {
		int ret;

		if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))) ||
		    get_timespec64(&ts, utime))
			return -EFAULT;

		ret = futex_init_timeout(cmd, op, &ts, &t);
		if (ret)
			return ret;

		tp = &t;
	}

	return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
}
/**
 * futex_parse_waitv - Copy and validate a waitv array from user space
 * @futexv:	Kernel side array of waiters to be filled
 * @uwaitv:	User space array to be parsed
 * @nr_futexes:	Number of entries in @uwaitv and @futexv
 * @wake:	Wake handler installed in each entry's futex_q
 * @wake_data:	Data for the wake handler
 *
 * Return: 0 on success, -EFAULT when an entry cannot be copied from user
 * space, -EINVAL when an entry carries invalid flags, a non-zero reserved
 * field or a value rejected by futex_validate_input().
 */
int futex_parse_waitv(struct futex_vector *futexv,
		      struct futex_waitv __user *uwaitv,
		      unsigned int nr_futexes, futex_wake_fn *wake,
		      void *wake_data)
{
	unsigned int idx = 0;

	while (idx < nr_futexes) {
		struct futex_vector *entry = &futexv[idx];
		struct futex_waitv uw;
		unsigned int flags;

		if (copy_from_user(&uw, &uwaitv[idx], sizeof(uw)))
			return -EFAULT;

		/* Unknown flag bits and a used reserved field are rejected. */
		if ((uw.flags & ~FUTEX2_VALID_MASK) || uw.__reserved)
			return -EINVAL;

		flags = futex2_to_flags(uw.flags);
		if (!futex_flags_valid(flags) ||
		    !futex_validate_input(flags, uw.val))
			return -EINVAL;

		/* Note that the stored flags are the translated ones. */
		entry->w.flags = flags;
		entry->w.val = uw.val;
		entry->w.uaddr = uw.uaddr;

		entry->q = futex_q_init;
		entry->q.wake = wake;
		entry->q.wake_data = wake_data;

		idx++;
	}

	return 0;
}
/*
 * Read the user supplied timeout and arm the on-stack sleeper @to with it.
 *
 * Returns 0 when there is no timeout or the timer was set up, -EINVAL for a
 * clock other than CLOCK_REALTIME/CLOCK_MONOTONIC or an invalid timespec,
 * and -EFAULT when the timespec cannot be read from user space.
 */
static int futex2_setup_timeout(struct __kernel_timespec __user *timeout,
				clockid_t clockid, struct hrtimer_sleeper *to)
{
	int sleeper_flags, init_op;
	struct timespec64 ts;
	ktime_t expires;
	int err;

	if (!timeout)
		return 0;

	switch (clockid) {
	case CLOCK_REALTIME:
		sleeper_flags = FLAGS_CLOCKRT;
		init_op = FUTEX_CLOCK_REALTIME;
		break;
	case CLOCK_MONOTONIC:
		sleeper_flags = 0;
		init_op = 0;
		break;
	default:
		return -EINVAL;
	}

	if (get_timespec64(&ts, timeout))
		return -EFAULT;

	/*
	 * futex_waitv has no opcode of its own, so borrow FUTEX_WAIT_BITSET
	 * which uses an absolute timeout as well.
	 */
	err = futex_init_timeout(FUTEX_WAIT_BITSET, init_op, &ts, &expires);
	if (err)
		return err;

	futex_setup_timer(&expires, to, sleeper_flags, 0);
	return 0;
}
/* Cancel the timer embedded in the on-stack sleeper @to and destroy it. */
static inline void futex2_destroy_timeout(struct hrtimer_sleeper *to)
{
	struct hrtimer *timer = &to->timer;

	hrtimer_cancel(timer);
	destroy_hrtimer_on_stack(timer);
}
/**
 * sys_futex_waitv - Wait on a list of futexes
 * @waiters:	List of futexes to wait on
 * @nr_futexes:	Length of @waiters
 * @flags:	Flag for timeout (monotonic/realtime)
 * @timeout:	Optional absolute timeout.
 * @clockid:	Clock to be used for the timeout, realtime or monotonic.
 *
 * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes
 * if a futex_wake() is performed at any uaddr. The syscall returns immediately
 * if any waiter has *uaddr != val. *timeout is an optional timeout value for
 * the operation. Each waiter has individual flags. The `flags` argument for
 * the syscall should be used solely for specifying the timeout as realtime, if
 * needed. Flags for private futexes, sizes, etc. should be used on the
 * individual flags of each waiter.
 *
 * Returns the array index of one of the woken futexes. No further information
 * is provided: any number of other futexes may also have been woken by the
 * same event, and if more than one futex was woken, the returned index may
 * refer to any one of them. (It is not necessarily the futex with the
 * smallest index, nor the one most recently woken, nor...)
 */
SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
		unsigned int, nr_futexes, unsigned int, flags,
		struct __kernel_timespec __user *, timeout, clockid_t, clockid)
{
	struct futex_vector *futexv;
	struct hrtimer_sleeper to;
	int ret;

	/* This syscall supports no flags for now */
	if (flags)
		return -EINVAL;

	if (!waiters || !nr_futexes || nr_futexes > FUTEX_WAITV_MAX)
		return -EINVAL;

	if (timeout) {
		ret = futex2_setup_timeout(timeout, clockid, &to);
		if (ret)
			return ret;
	}

	futexv = kzalloc_objs(*futexv, nr_futexes);
	if (futexv) {
		ret = futex_parse_waitv(futexv, waiters, nr_futexes,
					futex_wake_mark, NULL);
		if (ret == 0)
			ret = futex_wait_multiple(futexv, nr_futexes,
						  timeout ? &to : NULL);
		kfree(futexv);
	} else {
		ret = -ENOMEM;
	}

	/* The sleeper was only set up when a timeout was supplied. */
	if (timeout)
		futex2_destroy_timeout(&to);

	return ret;
}
/*
 * sys_futex_wake - Wake a number of futexes
 * @uaddr:	Address of the futex(es) to wake
 * @mask:	bitmask
 * @nr:		Number of the futexes to wake
 * @flags:	FUTEX2 flags
 *
 * Identical to the traditional FUTEX_WAKE_BITSET op, except it is part of the
 * futex2 family of calls.
 *
 * Returns -EINVAL for unknown or invalid @flags or a @mask rejected by
 * futex_validate_input(), otherwise the result of futex_wake().
 */
SYSCALL_DEFINE4(futex_wake,
		void __user *, uaddr,
		unsigned long, mask,
		int, nr,
		unsigned int, flags)
{
	unsigned int kflags;

	if (flags & ~FUTEX2_VALID_MASK)
		return -EINVAL;

	/* Translate the FUTEX2 ABI flags into the internal representation. */
	kflags = futex2_to_flags(flags);
	if (!futex_flags_valid(kflags) || !futex_validate_input(kflags, mask))
		return -EINVAL;

	return futex_wake(uaddr, FLAGS_STRICT | kflags, NULL, nr, mask);
}
/*
 * sys_futex_wait - Wait on a futex
 * @uaddr:	Address of the futex to wait on
 * @val:	Value of @uaddr
 * @mask:	bitmask
 * @flags:	FUTEX2 flags
 * @timeout:	Optional absolute timeout
 * @clockid:	Clock to be used for the timeout, realtime or monotonic
 *
 * Identical to the traditional FUTEX_WAIT_BITSET op, except it is part of the
 * futex2 family of calls.
 *
 * Returns -EINVAL for unknown or invalid @flags or when @val or @mask are
 * rejected by futex_validate_input(), the error of the timeout setup, or
 * otherwise the result of __futex_wait().
 */
SYSCALL_DEFINE6(futex_wait,
		void __user *, uaddr,
		unsigned long, val,
		unsigned long, mask,
		unsigned int, flags,
		struct __kernel_timespec __user *, timeout,
		clockid_t, clockid)
{
	struct hrtimer_sleeper *sleeper = NULL;
	struct hrtimer_sleeper to;
	unsigned int kflags;
	int ret;

	if (flags & ~FUTEX2_VALID_MASK)
		return -EINVAL;

	/* Translate the FUTEX2 ABI flags into the internal representation. */
	kflags = futex2_to_flags(flags);
	if (!futex_flags_valid(kflags))
		return -EINVAL;

	if (!(futex_validate_input(kflags, val) &&
	      futex_validate_input(kflags, mask)))
		return -EINVAL;

	if (timeout) {
		ret = futex2_setup_timeout(timeout, clockid, &to);
		if (ret)
			return ret;
		sleeper = &to;
	}

	ret = __futex_wait(uaddr, kflags, val, sleeper, mask);

	/* The sleeper only exists when a timeout was supplied. */
	if (sleeper)
		futex2_destroy_timeout(sleeper);

	return ret;
}
/*
 * sys_futex_requeue - Requeue a waiter from one futex to another
 * @waiters:	array describing the source and destination futex
 * @flags:	unused, must be zero
 * @nr_wake:	number of futexes to wake
 * @nr_requeue:	number of futexes to requeue
 *
 * Identical to the traditional FUTEX_CMP_REQUEUE op, except it is part of the
 * futex2 family of calls.
 *
 * Returns -EINVAL for non-zero @flags, a NULL @waiters pointer or differing
 * flags of the two entries, the parse error of futex_parse_waitv(), or
 * otherwise the result of futex_requeue().
 */
SYSCALL_DEFINE4(futex_requeue,
		struct futex_waitv __user *, waiters,
		unsigned int, flags,
		int, nr_wake,
		int, nr_requeue)
{
	struct futex_vector futexes[2];
	struct futex_vector *src = &futexes[0];
	struct futex_vector *dst = &futexes[1];
	u32 cmpval;
	int ret;

	if (flags || !waiters)
		return -EINVAL;

	ret = futex_parse_waitv(futexes, waiters, 2, futex_wake_mark, NULL);
	if (ret)
		return ret;

	/*
	 * For now mandate both flags are identical, like the sys_futex()
	 * interface has. If/when we merge the variable sized futex support,
	 * that patch can modify this test to allow a difference in size.
	 */
	if (src->w.flags != dst->w.flags)
		return -EINVAL;

	/* The value of the source entry is the compare value of the requeue. */
	cmpval = src->w.val;

	return futex_requeue(u64_to_user_ptr(src->w.uaddr), src->w.flags,
			     u64_to_user_ptr(dst->w.uaddr), dst->w.flags,
			     nr_wake, nr_requeue, &cmpval, 0);
}
#ifdef CONFIG_COMPAT
/*
 * Compat variant of sys_set_robust_list(): stores @head as the compat robust
 * list head of the caller. Returns -EINVAL when @len is not the size of the
 * compat list head, 0 otherwise.
 */
COMPAT_SYSCALL_DEFINE2(set_robust_list, struct compat_robust_list_head __user *, head,
		       compat_size_t, len)
{
	const bool len_matches = (len == sizeof(*head));

	if (unlikely(!len_matches))
		return -EINVAL;

	current->futex.compat_robust_list = head;
	return 0;
}
/*
 * Compat variant of sys_get_robust_list(): reports the compat robust list
 * head of the task selected by @pid. The size of the compat list head is
 * written to @len_ptr and the pointer, converted with ptr_to_compat(), to
 * @head_ptr. Returns the lookup error or -EFAULT when a write fails.
 */
COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
		       compat_uptr_t __user *, head_ptr,
		       compat_size_t __user *, len_ptr)
{
	struct compat_robust_list_head __user *head;

	head = futex_get_robust_list_common(pid, true);
	if (IS_ERR(head))
		return PTR_ERR(head);

	if (put_user(sizeof(*head), len_ptr) != 0)
		return -EFAULT;

	return put_user(ptr_to_compat(head), head_ptr);
}
#endif /* CONFIG_COMPAT */
#ifdef CONFIG_COMPAT_32BIT_TIME
/*
 * sys_futex_time32 - futex syscall entry point taking an old_timespec32
 *
 * Reads the optional timeout with get_old_timespec32() for the commands
 * which take one and hands everything over to do_futex().
 */
SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
		const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
		u32, val3)
{
	const int cmd = op & FUTEX_CMD_MASK;
	struct timespec64 ts;
	ktime_t *tp = NULL;
	ktime_t t;

	/*
	 * @utime is also forwarded as the val2 argument of do_futex(), so it
	 * is only interpreted as a timespec for commands taking a timeout.
	 */
	if (utime && futex_cmd_has_timeout(cmd)) {
		int ret;

		if (get_old_timespec32(&ts, utime))
			return -EFAULT;

		ret = futex_init_timeout(cmd, op, &ts, &t);
		if (ret)
			return ret;

		tp = &t;
	}

	return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
}
#endif /* CONFIG_COMPAT_32BIT_TIME */