Files
linux-stable-mirror/include/uapi/linux/futex.h
T
Thomas Gleixner 3ca9595d9f futex: Add support for unlocking robust futexes
Unlocking robust non-PI futexes happens in user space with the following
sequence:

  1)	robust_list_set_op_pending(mutex);
  2)	robust_list_remove(mutex);

  	lval = 0;
  3)	lval = atomic_xchg(lock, lval);
  4)	if (lval & WAITERS)
  5)		sys_futex(WAKE,....);
  6)	robust_list_clear_op_pending();

That opens a window between #3 and #6 where the mutex could be acquired by
some other task which observes that it is the last user and:

  A) unmaps the mutex memory
  B) maps a different file, which ends up covering the same address

When the original task exits before reaching #6 then the kernel robust list
handling observes the pending op entry and tries to fix up user space.

In case that the newly mapped data contains the TID of the exiting thread
at the address of the mutex/futex the kernel will set the owner died bit in
that memory and therefore corrupting unrelated data.

PI futexes have a similar problem both for the non-contented user space
unlock and the in kernel unlock:

  1)	robust_list_set_op_pending(mutex);
  2)	robust_list_remove(mutex);

  	lval = gettid();
  3)	if (!atomic_try_cmpxchg(lock, lval, 0))
  4)		sys_futex(UNLOCK_PI,....);
  5)	robust_list_clear_op_pending();

Address the first part of the problem where the futexes have waiters and
need to enter the kernel anyway. Add a new FUTEX_ROBUST_UNLOCK flag, which
is valid for the sys_futex() FUTEX_UNLOCK_PI, FUTEX_WAKE, FUTEX_WAKE_BITSET
operations.

This deliberately omits FUTEX_WAKE_OP from this treatment as it's unclear
whether this is needed and there is no usage of it in glibc either to
investigate.

For the futex2 syscall family this needs to be implemented with a new
syscall.

The sys_futex() case [ab]uses the @uaddr2 argument to hand the pointer to
robust_list_head::list_pending_op into the kernel. This argument is only
evaluated when the FUTEX_ROBUST_UNLOCK bit is set and is therefore backward
compatible.

This is an explicit argument to avoid the lookup of the robust list pointer
and retrieving the pending op pointer from there. User space has the
pointer already available so it can just put it into the @uaddr2
argument. Aside of that this allows the usage of multiple robust lists in
the future without any changes to the internal functions as they just operate
on the provided pointer.

This requires a second flag FUTEX_ROBUST_LIST32 which indicates that the
robust list pointer points to an u32 and not to an u64. This is required
for two reasons:

    1) sys_futex() has no compat variant

    2) The gaming emulators use both both 64-bit and compat 32-bit robust
       lists in the same 64-bit application

As a consequence 32-bit applications have to set this flag unconditionally
so they can run on a 64-bit kernel in compat mode unmodified. 32-bit
kernels return an error code when the flag is not set. 64-bit kernels will
happily clear the full 64 bits if user space fails to set it.

In case of FUTEX_UNLOCK_PI this clears the robust list pending op when the
unlock succeeded. In case of errors, the user space value is still locked
by the caller and therefore the above cannot happen.

In case of FUTEX_WAKE* this does the unlock of the futex in the kernel and
clears the robust list pending op when the unlock was successful. If not,
the user space value is still locked and user space has to deal with the
returned error. That means that the unlocking of non-PI robust futexes has
to use the same try_cmpxchg() unlock scheme as PI futexes.

If the clearing of the pending list op fails (fault) then the kernel clears
the registered robust list pointer if it matches to prevent that exit()
will try to handle invalid data. That's a valid paranoid decision because
the robust list head sits usually in the TLS and if the TLS is not longer
accessible then the chance for fixing up the resulting mess is very close
to zero.

The problem of non-contended unlocks still exists and will be addressed
separately.

Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: André Almeida <andrealmeid@igalia.com>
Link: https://patch.msgid.link/20260602090535.670514505@kernel.org
2026-06-03 11:38:51 +02:00

243 lines
7.7 KiB
C

/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _UAPI_LINUX_FUTEX_H
#define _UAPI_LINUX_FUTEX_H
#include <linux/compiler.h>
#include <linux/types.h>
/* Second argument to futex syscall */
#define FUTEX_WAIT 0
#define FUTEX_WAKE 1
#define FUTEX_FD 2
#define FUTEX_REQUEUE 3
#define FUTEX_CMP_REQUEUE 4
#define FUTEX_WAKE_OP 5
#define FUTEX_LOCK_PI 6
#define FUTEX_UNLOCK_PI 7
#define FUTEX_TRYLOCK_PI 8
#define FUTEX_WAIT_BITSET 9
#define FUTEX_WAKE_BITSET 10
#define FUTEX_WAIT_REQUEUE_PI 11
#define FUTEX_CMP_REQUEUE_PI 12
#define FUTEX_LOCK_PI2 13
#define FUTEX_PRIVATE_FLAG 128
#define FUTEX_CLOCK_REALTIME 256
#define FUTEX_ROBUST_UNLOCK 512
#define FUTEX_ROBUST_LIST32 1024
#define FUTEX_CMD_MASK ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME | \
FUTEX_ROBUST_UNLOCK | FUTEX_ROBUST_LIST32)
#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
#define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
#define FUTEX_REQUEUE_PRIVATE (FUTEX_REQUEUE | FUTEX_PRIVATE_FLAG)
#define FUTEX_CMP_REQUEUE_PRIVATE (FUTEX_CMP_REQUEUE | FUTEX_PRIVATE_FLAG)
#define FUTEX_WAKE_OP_PRIVATE (FUTEX_WAKE_OP | FUTEX_PRIVATE_FLAG)
#define FUTEX_LOCK_PI_PRIVATE (FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG)
#define FUTEX_LOCK_PI2_PRIVATE (FUTEX_LOCK_PI2 | FUTEX_PRIVATE_FLAG)
#define FUTEX_UNLOCK_PI_PRIVATE (FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG)
#define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG)
#define FUTEX_WAIT_BITSET_PRIVATE (FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG)
#define FUTEX_WAKE_BITSET_PRIVATE (FUTEX_WAKE_BITSET | FUTEX_PRIVATE_FLAG)
#define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | FUTEX_PRIVATE_FLAG)
#define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | FUTEX_PRIVATE_FLAG)
/*
* Operations to unlock a futex, clear the robust list pending op pointer and
* wake waiters.
*/
#define FUTEX_UNLOCK_PI_LIST64 (FUTEX_UNLOCK_PI | FUTEX_ROBUST_UNLOCK)
#define FUTEX_UNLOCK_PI_LIST64_PRIVATE (FUTEX_UNLOCK_PI_LIST64 | FUTEX_PRIVATE_FLAG)
#define FUTEX_UNLOCK_PI_LIST32 (FUTEX_UNLOCK_PI | FUTEX_ROBUST_UNLOCK | \
FUTEX_ROBUST_LIST32)
#define FUTEX_UNLOCK_PI_LIST32_PRIVATE (FUTEX_UNLOCK_PI_LIST32 | FUTEX_PRIVATE_FLAG)
#define FUTEX_UNLOCK_WAKE_LIST64 (FUTEX_WAKE | FUTEX_ROBUST_UNLOCK)
#define FUTEX_UNLOCK_WAKE_LIST64_PRIVATE (FUTEX_UNLOCK_WAKE_LIST64 | FUTEX_PRIVATE_FLAG)
#define FUTEX_UNLOCK_WAKE_LIST32 (FUTEX_WAKE | FUTEX_ROBUST_UNLOCK | \
FUTEX_ROBUST_LIST32)
#define FUTEX_UNLOCK_WAKE_LIST32_PRIVATE (FUTEX_UNLOCK_WAKE_LIST32 | FUTEX_PRIVATE_FLAG)
#define FUTEX_UNLOCK_BITSET_LIST64 (FUTEX_WAKE_BITSET | FUTEX_ROBUST_UNLOCK)
#define FUTEX_UNLOCK_BITSET_LIST64_PRIVATE (FUTEX_UNLOCK_BITSET_LIST64 | FUTEX_PRIVATE_FLAG)
#define FUTEX_UNLOCK_BITSET_LIST32 (FUTEX_WAKE_BITSET | FUTEX_ROBUST_UNLOCK | \
FUTEX_ROBUST_LIST32)
#define FUTEX_UNLOCK_BITSET_LIST32_PRIVATE (FUTEX_UNLOCK_BITSET_LIST32 | FUTEX_PRIVATE_FLAG)
/*
* Flags for futex2 syscalls.
*
* NOTE: these are not pure flags, they can also be seen as:
*
* union {
* u32 flags;
* struct {
* u32 size : 2,
* numa : 1,
* : 4,
* private : 1;
* };
* };
*/
#define FUTEX2_SIZE_U8 0x00
#define FUTEX2_SIZE_U16 0x01
#define FUTEX2_SIZE_U32 0x02
#define FUTEX2_SIZE_U64 0x03
#define FUTEX2_NUMA 0x04
#define FUTEX2_MPOL 0x08
/* 0x10 */
/* 0x20 */
/* 0x40 */
#define FUTEX2_PRIVATE FUTEX_PRIVATE_FLAG
#define FUTEX2_SIZE_MASK 0x03
/* do not use */
#define FUTEX_32 FUTEX2_SIZE_U32 /* historical accident :-( */
/*
* When FUTEX2_NUMA doubles the futex word, the second word is a node value.
* The special value -1 indicates no-node. This is the same value as
* NUMA_NO_NODE, except that value is not ABI, this is.
*/
#define FUTEX_NO_NODE (-1)
/*
* Max numbers of elements in a futex_waitv array
*/
#define FUTEX_WAITV_MAX 128
/**
* struct futex_waitv - A waiter for vectorized wait
* @val: Expected value at uaddr
* @uaddr: User address to wait on
* @flags: Flags for this waiter
* @__reserved: Reserved member to preserve data alignment. Should be 0.
*/
struct futex_waitv {
__u64 val;
__u64 uaddr;
__u32 flags;
__u32 __reserved;
};
/*
* Support for robust futexes: the kernel cleans up held futexes at
* thread exit time.
*/
/*
* Per-lock list entry - embedded in user-space locks, somewhere close
* to the futex field. (Note: user-space uses a double-linked list to
* achieve O(1) list add and remove, but the kernel only needs to know
* about the forward link)
*
* NOTE: this structure is part of the syscall ABI, and must not be
* changed.
*/
struct robust_list {
struct robust_list __user *next;
};
/*
* Per-thread list head:
*
* NOTE: this structure is part of the syscall ABI, and must only be
* changed if the change is first communicated with the glibc folks.
* (When an incompatible change is done, we'll increase the structure
* size, which glibc will detect)
*/
struct robust_list_head {
/*
* The head of the list. Points back to itself if empty:
*/
struct robust_list list;
/*
* This relative offset is set by user-space, it gives the kernel
* the relative position of the futex field to examine. This way
* we keep userspace flexible, to freely shape its data-structure,
* without hardcoding any particular offset into the kernel:
*/
long futex_offset;
/*
* The death of the thread may race with userspace setting
* up a lock's links. So to handle this race, userspace first
* sets this field to the address of the to-be-taken lock,
* then does the lock acquire, and then adds itself to the
* list, and then clears this field. Hence the kernel will
* always have full knowledge of all locks that the thread
* _might_ have taken. We check the owner TID in any case,
* so only truly owned locks will be handled.
*/
struct robust_list __user *list_op_pending;
};
/*
* Are there any waiters for this robust futex:
*/
#define FUTEX_WAITERS 0x80000000
/*
* The kernel signals via this bit that a thread holding a futex
* has exited without unlocking the futex. The kernel also does
* a FUTEX_WAKE on such futexes, after setting the bit, to wake
* up any possible waiters:
*/
#define FUTEX_OWNER_DIED 0x40000000
/*
* The rest of the robust-futex field is for the TID:
*/
#define FUTEX_TID_MASK 0x3fffffff
/*
* This limit protects against a deliberately circular list.
* (Not worth introducing an rlimit for it)
*/
#define ROBUST_LIST_LIMIT 2048
/* Modifiers for robust_list_head::list_op_pending */
#define FUTEX_ROBUST_MOD_PI (0x1UL)
#define FUTEX_ROBUST_MOD_MASK (FUTEX_ROBUST_MOD_PI)
/*
* bitset with all bits set for the FUTEX_xxx_BITSET OPs to request a
* match of any bit.
*/
#define FUTEX_BITSET_MATCH_ANY 0xffffffff
#define FUTEX_OP_SET 0 /* *(int *)UADDR2 = OPARG; */
#define FUTEX_OP_ADD 1 /* *(int *)UADDR2 += OPARG; */
#define FUTEX_OP_OR 2 /* *(int *)UADDR2 |= OPARG; */
#define FUTEX_OP_ANDN 3 /* *(int *)UADDR2 &= ~OPARG; */
#define FUTEX_OP_XOR 4 /* *(int *)UADDR2 ^= OPARG; */
#define FUTEX_OP_OPARG_SHIFT 8 /* Use (1 << OPARG) instead of OPARG. */
#define FUTEX_OP_CMP_EQ 0 /* if (oldval == CMPARG) wake */
#define FUTEX_OP_CMP_NE 1 /* if (oldval != CMPARG) wake */
#define FUTEX_OP_CMP_LT 2 /* if (oldval < CMPARG) wake */
#define FUTEX_OP_CMP_LE 3 /* if (oldval <= CMPARG) wake */
#define FUTEX_OP_CMP_GT 4 /* if (oldval > CMPARG) wake */
#define FUTEX_OP_CMP_GE 5 /* if (oldval >= CMPARG) wake */
/* FUTEX_WAKE_OP will perform atomically
int oldval = *(int *)UADDR2;
*(int *)UADDR2 = oldval OP OPARG;
if (oldval CMP CMPARG)
wake UADDR2; */
#define FUTEX_OP(op, oparg, cmp, cmparg) \
(((op & 0xf) << 28) | ((cmp & 0xf) << 24) \
| ((oparg & 0xfff) << 12) | (cmparg & 0xfff))
#endif /* _UAPI_LINUX_FUTEX_H */