Files
linux-stable-mirror/include/linux/futex.h
T
Thomas Gleixner 7010c39d8f futex: Provide infrastructure to plug the non contended robust futex unlock race
When the FUTEX_ROBUST_UNLOCK mechanism is used for unlocking (PI-)futexes,
then the unlock sequence in user space looks like this:

  1)	robust_list_set_op_pending(mutex);
  2)	robust_list_remove(mutex);

  	lval = gettid();
  3)	if (atomic_try_cmpxchg(&mutex->lock, lval, 0))
  4)		robust_list_clear_op_pending();
  	else
  5)		sys_futex(OP | FUTEX_ROBUST_UNLOCK, ....);

That still leaves a minimal race window between #3 and #4 where the mutex
could be acquired by some other task, which observes that it is the last
user and:

  1) unmaps the mutex memory
  2) maps a different file, which ends up covering the same address

When the original task then exits before reaching #4, the kernel robust
list handling observes the pending op entry and tries to fix up user space.

In case that the newly mapped data contains the TID of the exiting thread
at the address of the mutex/futex the kernel will set the owner died bit in
that memory and therefore corrupt unrelated data.

On X86 this boils down to this simplified assembly sequence:

		mov		%esi,%eax	// Load TID into EAX
        	xor		%ecx,%ecx	// Set ECX to 0
   #3		lock cmpxchg	%ecx,(%rdi)	// Try the TID -> 0 transition
	.Lstart:
		jnz    		.Lend
   #4 		movq		%rcx,(%rdx)	// Clear list_op_pending
	.Lend:

If the cmpxchg() succeeds and the task is interrupted before it can clear
list_op_pending in the robust list head (#4) and the task crashes in a
signal handler or gets killed then it ends up in do_exit() and subsequently
in the robust list handling, which then might run into the unmap/map issue
described above.

This is only relevant when user space was interrupted and a signal is
pending. The fix-up has to be done before signal delivery is attempted
because:

   1) The signal might be fatal so get_signal() ends up in do_exit()

   2) The signal handler might crash or the task is killed before returning
      from the handler. At that point the instruction pointer in pt_regs is
      no longer the instruction pointer of the initially interrupted unlock
      sequence.

The right place to handle this is in __exit_to_user_mode_loop() before
invoking arch_do_signal_or_restart() as this obviously covers both
scenarios.

As this is only relevant when the task was interrupted in user space, this
is tied to RSEQ and the generic entry code as RSEQ keeps track of user
space interrupts unconditionally even if the task does not have a RSEQ
region installed. That makes the decision very lightweight:

       if (current->rseq.user_irq && within(regs, csr->unlock_ip_range))
       		futex_fixup_robust_unlock(regs, csr);

futex_fixup_robust_unlock() then invokes an architecture-specific function
to return the pending op pointer or NULL. The function evaluates the
register content to decide whether the pending ops pointer in the robust
list head needs to be cleared.

Assuming the above unlock sequence, then on x86 this decision is the
trivial evaluation of the zero flag:

	return regs->eflags & X86_EFLAGS_ZF ? regs->dx : NULL;

Other architectures might need to do more complex evaluations due to LLSC,
but the approach is valid in general. The size of the pointer is determined
from the matching range struct, which covers both 32-bit and 64-bit builds
including COMPAT.

The unlock sequence is going to be placed in the VDSO so that the kernel
can keep everything synchronized, especially the register usage. The
resulting code sequence for user space is:

   if (__vdso_futex_robust_list$SZ_try_unlock(lock, tid, &pending_op) != tid)
 	err = sys_futex($OP | FUTEX_ROBUST_UNLOCK,....);

Both the VDSO unlock and the kernel side unlock ensure that the pending_op
pointer is always cleared when the lock becomes unlocked.

Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: André Almeida <andrealmeid@igalia.com>
Link: https://patch.msgid.link/20260602090535.773669210@kernel.org
2026-06-03 11:38:52 +02:00

163 lines
4.8 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FUTEX_H
#define _LINUX_FUTEX_H
#include <linux/sched.h>
#include <linux/ktime.h>
#include <linux/mm_types.h>
#include <uapi/linux/futex.h>
struct inode;
struct task_struct;
/*
* Futexes are matched on equal values of this key.
* The key type depends on whether it's a shared or private mapping.
* Don't rearrange members without looking at hash_futex().
*
* offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
* The two low-order bits of offset encode the kind of key:
* 00 : Private process futex (PTHREAD_PROCESS_PRIVATE)
* (no reference on an inode or mm)
* 01 : Shared futex (PTHREAD_PROCESS_SHARED)
* mapped on a file (reference on the underlying inode)
* 10 : Shared futex (PTHREAD_PROCESS_SHARED)
* (but private mapping on an mm, and reference taken on it)
*/
/* Flag values kept in the two low-order bits of a key's offset member. */
#define FUT_OFF_INODE 1 /* Bit 0 is set if the key has a reference on an inode */
#define FUT_OFF_MMSHARED 2 /* Bit 1 is set if the key has a reference on an mm */
/*
 * Key on which futexes are matched. The same storage is exposed through
 * three views: 'shared', 'private' and the generic 'both'.
 *
 * NOTE(review): the three views look deliberately laid out member for member
 * (64-bit slot, unsigned long, unsigned int) so that 'both' can address
 * either of the other two - confirm before changing any type or order.
 */
union futex_key {
/* View for futexes on shared mappings. */
struct {
/* NOTE(review): presumably identifies the backing inode - confirm */
u64 i_seq;
unsigned long pgoff;
/* The low two bits carry the FUT_OFF_* flags. */
unsigned int offset;
/* unsigned int node; */
} shared;
/* View for private futexes. */
struct {
/*
 * NOTE(review): __tmp presumably widens the pointer slot to 64 bit so
 * that it overlays shared.i_seq / both.ptr on 32-bit builds - confirm.
 */
union {
struct mm_struct *mm;
u64 __tmp;
};
unsigned long address;
/* The low two bits carry the FUT_OFF_* flags. */
unsigned int offset;
/* unsigned int node; */
} private;
/* Generic view, usable without knowing which kind of key is stored. */
struct {
u64 ptr;
unsigned long word;
unsigned int offset;
unsigned int node; /* NOT hashed! */
} both;
};
/*
 * Compound literal for an empty key. Only .both.ptr is named; the remaining
 * members of 'both' are implicitly zero-initialized.
 */
#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = 0ULL } }
#ifdef CONFIG_FUTEX
/*
 * Values of tsk->futex.state. futex_init_task() puts every task into
 * FUTEX_STATE_OK.
 *
 * NOTE(review): EXITING and DEAD are presumably set by the futex exit
 * handling; their users are not visible in this header.
 */
enum {
FUTEX_STATE_OK,
FUTEX_STATE_EXITING,
FUTEX_STATE_DEAD,
};
/*
 * Bring the futex state embedded in @tsk into its initial condition.
 *
 * The complete tsk->futex structure is cleared and then the members which
 * need more than all-zeroes are set up one by one.
 */
static inline void futex_init_task(struct task_struct *tsk)
{
	/* Must come first, it would wipe the initialization done below */
	memset(&tsk->futex, 0, sizeof(tsk->futex));

	tsk->futex.state = FUTEX_STATE_OK;
	mutex_init(&tsk->futex.exit_mutex);
	INIT_LIST_HEAD(&tsk->futex.pi_state_list);
}
/*
 * Futex core functions, implemented outside of this header.
 *
 * NOTE(review): judging by the names, the three helpers below are the futex
 * cleanup hooks for task exit and exec - confirm against the callers.
 */
void futex_exit_recursive(struct task_struct *tsk);
void futex_exit_release(struct task_struct *tsk);
void futex_exec_release(struct task_struct *tsk);
/* With CONFIG_FUTEX=n the stubs of the two functions below return -EINVAL. */
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3);
int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4);
#ifdef CONFIG_FUTEX_PRIVATE_HASH
/*
 * Private futex hash management, implemented outside of this header.
 * futex_hash_allocate_default() reports a status code, futex_hash_free()
 * returns nothing.
 */
int futex_hash_allocate_default(void);
void futex_hash_free(struct mm_struct *mm);
#else /* CONFIG_FUTEX_PRIVATE_HASH */
/*
 * Stubs for kernels without a private hash. The signatures mirror the real
 * functions exactly, so that code which (mis)uses a return value fails to
 * build in every configuration and not only in one of them.
 */
static inline int futex_hash_allocate_default(void) { return 0; }
static inline void futex_hash_free(struct mm_struct *mm) { }
#endif /* !CONFIG_FUTEX_PRIVATE_HASH */
#else /* CONFIG_FUTEX */
/* CONFIG_FUTEX=n: there is no per-task futex state, so these hooks do nothing. */
static inline void futex_init_task(struct task_struct *tsk) { }
static inline void futex_exit_recursive(struct task_struct *tsk) { }
static inline void futex_exit_release(struct task_struct *tsk) { }
static inline void futex_exec_release(struct task_struct *tsk) { }
/* CONFIG_FUTEX=n stub: ignores all arguments and always returns -EINVAL. */
static inline long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3)
{
return -EINVAL;
}
/* CONFIG_FUTEX=n stub: ignores all arguments and always returns -EINVAL. */
static inline int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
{
return -EINVAL;
}
/*
 * CONFIG_FUTEX=n stubs of the private hash API. futex_hash_free() is void
 * like the real function declared for CONFIG_FUTEX_PRIVATE_HASH=y, so that
 * the signature is the same in every configuration.
 */
static inline int futex_hash_allocate_default(void) { return 0; }
static inline void futex_hash_free(struct mm_struct *mm) { }
#endif /* !CONFIG_FUTEX */
#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
#include <asm/futex_robust.h>
/* NOTE(review): presumably resets the unlock ranges stored in @fd; the definition is not visible here. */
void futex_reset_cs_ranges(struct futex_mm_data *fd);
/*
 * Out-of-line part of futex_fixup_robust_unlock(). The inline wrapper calls
 * it only after the instruction pointer in @regs was found inside @csr.
 */
void __futex_fixup_robust_unlock(struct pt_regs *regs, struct futex_unlock_cs_range *csr);
/*
 * Check whether the instruction pointer saved in @regs lies inside the
 * half-open range [csr->start_ip, csr->start_ip + csr->len).
 *
 * Returns true if it does, false otherwise.
 */
static inline bool futex_within_robust_unlock(struct pt_regs *regs,
					      struct futex_unlock_cs_range *csr)
{
	unsigned long ip = instruction_pointer(regs);

	/* Below the start of the range? */
	if (ip < csr->start_ip)
		return false;

	/* The end of the range is exclusive */
	return ip < csr->start_ip + csr->len;
}
static inline void futex_fixup_robust_unlock(struct pt_regs *regs)
{
struct futex_unlock_cs_range *csr;
/*
* Avoid dereferencing current->mm if not returning from interrupt.
* current->rseq.event is going to be used subsequently, so bringing the
* cache line in is not a big deal.
*/
if (!current->rseq.event.user_irq)
return;
csr = current->mm->futex.unlock.cs_ranges;
/* The loop is optimized out for !COMPAT */
for (int r = 0; r < FUTEX_ROBUST_MAX_CS_RANGES; r++, csr++) {
if (unlikely(futex_within_robust_unlock(regs, csr))) {
__futex_fixup_robust_unlock(regs, csr);
return;
}
}
}
static inline void futex_set_vdso_cs_range(struct futex_mm_data *fd, unsigned int idx,
unsigned long start, unsigned long end, bool sz32)
{
fd->unlock.cs_ranges[idx].start_ip = start;
fd->unlock.cs_ranges[idx].len = end - start;
fd->unlock.cs_ranges[idx].pop_size32 = sz32;
}
#else /* CONFIG_FUTEX_ROBUST_UNLOCK */
/* CONFIG_FUTEX_ROBUST_UNLOCK=n: there are no unlock ranges to check, so this is a no-op. */
static inline void futex_fixup_robust_unlock(struct pt_regs *regs) { }
#endif /* !CONFIG_FUTEX_ROBUST_UNLOCK */
/*
 * futex_mm_init() is a real function only when the private hash or the
 * robust unlock support is built in; otherwise it is an empty stub.
 * NOTE(review): presumably because only these two features keep futex state
 * in struct mm_struct - confirm against the implementation.
 */
#if defined(CONFIG_FUTEX_PRIVATE_HASH) || defined(CONFIG_FUTEX_ROBUST_UNLOCK)
void futex_mm_init(struct mm_struct *mm);
#else
static inline void futex_mm_init(struct mm_struct *mm) { }
#endif
#endif /* _LINUX_FUTEX_H */