Merge tag 'locking-futex-2025-12-10' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull futex updates from Ingo Molnar: - Standardize on ktime_t in restart_block::time as well (Thomas Weißschuh) - Futex selftests: - Add robust list testcases (André Almeida) - Formatting fixes/cleanups (Carlos Llamas) * tag 'locking-futex-2025-12-10' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: futex: Store time as ktime_t in restart block selftests/futex: Create test for robust list selftests/futex: Skip tests if shmget unsupported selftests/futex: Add newline to ksft_exit_fail_msg() selftests/futex: Remove unused test_futex_mpol()
2025-12-11 20:38:43 +01:00 · 2025-12-10 17:21:30 +09:00
parent 2137cb863b c42ba5a87b
commit 0048fbb401
8 changed files with 567 additions and 15 deletions
--- a/include/linux/restart_block.h
+++ b/include/linux/restart_block.h
@@ -32,7 +32,7 @@ struct restart_block {
 			u32 val;
 			u32 flags;
 			u32 bitset;
-			u64 time;
+			ktime_t time;
 			u32 __user *uaddr2;
 		} futex;
 		/* For nanosleep */
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -738,12 +738,11 @@ int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time
 static long futex_wait_restart(struct restart_block *restart)
 {
 	u32 __user *uaddr = restart->futex.uaddr;
-	ktime_t t, *tp = NULL;
+	ktime_t *tp = NULL;
+
+	if (restart->futex.flags & FLAGS_HAS_TIMEOUT)
+		tp = &restart->futex.time;

-	if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
-		t = restart->futex.time;
-		tp = &t;
-	}
 	restart->fn = do_no_restart_syscall;

 	return (long)futex_wait(uaddr, restart->futex.flags,
--- a/tools/testing/selftests/futex/functional/.gitignore
+++ b/tools/testing/selftests/futex/functional/.gitignore
@@ -12,3 +12,4 @@ futex_wait_uninitialized_heap
 futex_wait_wouldblock
 futex_waitv
 futex_numa
+robust_list
--- a/tools/testing/selftests/futex/functional/Makefile
+++ b/tools/testing/selftests/futex/functional/Makefile
@@ -22,7 +22,8 @@ TEST_GEN_PROGS := \
 	futex_priv_hash \
 	futex_numa_mpol \
 	futex_waitv \
-	futex_numa
+	futex_numa \
+	robust_list

 TEST_PROGS := run.sh

--- a/tools/testing/selftests/futex/functional/futex_numa_mpol.c
+++ b/tools/testing/selftests/futex/functional/futex_numa_mpol.c
@@ -131,11 +131,6 @@ static void test_futex(void *futex_ptr, int err_value)
 	__test_futex(futex_ptr, err_value, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA);
 }

-static void test_futex_mpol(void *futex_ptr, int err_value)
-{
-	__test_futex(futex_ptr, err_value, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA | FUTEX2_MPOL);
-}
-
 TEST(futex_numa_mpol)
 {
 	struct futex32_numa *futex_numa;
--- a/tools/testing/selftests/futex/functional/futex_wait.c
+++ b/tools/testing/selftests/futex/functional/futex_wait.c
@@ -71,6 +71,8 @@ TEST(anon_page)
 	/* Testing an anon page shared memory */
 	shm_id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0666);
 	if (shm_id < 0) {
+		if (errno == ENOSYS)
+			ksft_exit_skip("shmget syscall not supported\n");
 		perror("shmget");
 		exit(1);
 	}
@@ -108,14 +110,14 @@ TEST(file_backed)
 	/* Testing a file backed shared memory */
 	fd = open(SHM_PATH, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
 	if (fd < 0)
-		ksft_exit_fail_msg("open");
+		ksft_exit_fail_msg("open\n");

 	if (ftruncate(fd, sizeof(f_private)))
-		ksft_exit_fail_msg("ftruncate");
+		ksft_exit_fail_msg("ftruncate\n");

 	shm = mmap(NULL, sizeof(f_private), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 	if (shm == MAP_FAILED)
-		ksft_exit_fail_msg("mmap");
+		ksft_exit_fail_msg("mmap\n");

 	memcpy(shm, &f_private, sizeof(f_private));

--- a/tools/testing/selftests/futex/functional/futex_waitv.c
+++ b/tools/testing/selftests/futex/functional/futex_waitv.c
@@ -86,6 +86,8 @@ TEST(shared_waitv)
 		int shm_id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0666);

 		if (shm_id < 0) {
+			if (errno == ENOSYS)
+				ksft_exit_skip("shmget syscall not supported\n");
 			perror("shmget");
 			exit(1);
 		}
--- a/tools/testing/selftests/futex/functional/robust_list.c
+++ b/tools/testing/selftests/futex/functional/robust_list.c
@@ -0,0 +1,552 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2025 Igalia S.L.
+ *
+ * Robust list test by André Almeida <andrealmeid@igalia.com>
+ *
+ * The robust list uAPI allows userspace to create "robust" locks, in the sense
+ * that if the lock holder thread dies, the remaining threads that are waiting
+ * for the lock won't block forever, waiting for a lock that will never be
+ * released.
+ *
+ * This is achieve by userspace setting a list where a thread can enter all the
+ * locks (futexes) that it is holding. The robust list is a linked list, and
+ * userspace register the start of the list with the syscall set_robust_list().
+ * If such thread eventually dies, the kernel will walk this list, waking up one
+ * thread waiting for each futex and marking the futex word with the flag
+ * FUTEX_OWNER_DIED.
+ *
+ * See also
+ *	man set_robust_list
+ *	Documententation/locking/robust-futex-ABI.rst
+ *	Documententation/locking/robust-futexes.rst
+ */
+
+#define _GNU_SOURCE
+
+#include "futextest.h"
+#include "../../kselftest_harness.h"
+
+#include <errno.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdatomic.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+
+#define STACK_SIZE (1024 * 1024)
+
+#define FUTEX_TIMEOUT 3
+
+#define SLEEP_US 100
+
+static pthread_barrier_t barrier, barrier2;
+
+static int set_robust_list(struct robust_list_head *head, size_t len)
+{
+	return syscall(SYS_set_robust_list, head, len);
+}
+
+static int get_robust_list(int pid, struct robust_list_head **head, size_t *len_ptr)
+{
+	return syscall(SYS_get_robust_list, pid, head, len_ptr);
+}
+
+/*
+ * Basic lock struct, contains just the futex word and the robust list element
+ * Real implementations have also a *prev to easily walk in the list
+ */
+struct lock_struct {
+	_Atomic(unsigned int)	futex;
+	struct robust_list	list;
+};
+
+/*
+ * Helper function to spawn a child thread. Returns -1 on error, pid on success
+ */
+static int create_child(int (*fn)(void *arg), void *arg)
+{
+	char *stack;
+	pid_t pid;
+
+	stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE,
+		     MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
+	if (stack == MAP_FAILED)
+		return -1;
+
+	stack += STACK_SIZE;
+
+	pid = clone(fn, stack, CLONE_VM | SIGCHLD, arg);
+
+	if (pid == -1)
+		return -1;
+
+	return pid;
+}
+
+/*
+ * Helper function to prepare and register a robust list
+ */
+static int set_list(struct robust_list_head *head)
+{
+	int ret;
+
+	ret = set_robust_list(head, sizeof(*head));
+	if (ret)
+		return ret;
+
+	head->futex_offset = (size_t) offsetof(struct lock_struct, futex) -
+			     (size_t) offsetof(struct lock_struct, list);
+	head->list.next = &head->list;
+	head->list_op_pending = NULL;
+
+	return 0;
+}
+
+/*
+ * A basic (and incomplete) mutex lock function with robustness
+ */
+static int mutex_lock(struct lock_struct *lock, struct robust_list_head *head, bool error_inject)
+{
+	_Atomic(unsigned int) *futex = &lock->futex;
+	unsigned int zero = 0;
+	pid_t tid = gettid();
+	int ret = -1;
+
+	/*
+	 * Set list_op_pending before starting the lock, so the kernel can catch
+	 * the case where the thread died during the lock operation
+	 */
+	head->list_op_pending = &lock->list;
+
+	if (atomic_compare_exchange_strong(futex, &zero, tid)) {
+		/*
+		 * We took the lock, insert it in the robust list
+		 */
+		struct robust_list *list = &head->list;
+
+		/* Error injection to test list_op_pending */
+		if (error_inject)
+			return 0;
+
+		while (list->next != &head->list)
+			list = list->next;
+
+		list->next = &lock->list;
+		lock->list.next = &head->list;
+
+		ret = 0;
+	} else {
+		/*
+		 * We didn't take the lock, wait until the owner wakes (or dies)
+		 */
+		struct timespec to;
+
+		to.tv_sec = FUTEX_TIMEOUT;
+		to.tv_nsec = 0;
+
+		tid = atomic_load(futex);
+		/* Kernel ignores futexes without the waiters flag */
+		tid |= FUTEX_WAITERS;
+		atomic_store(futex, tid);
+
+		ret = futex_wait((futex_t *) futex, tid, &to, 0);
+
+		/*
+		 * A real mutex_lock() implementation would loop here to finally
+		 * take the lock. We don't care about that, so we stop here.
+		 */
+	}
+
+	head->list_op_pending = NULL;
+
+	return ret;
+}
+
+/*
+ * This child thread will succeed taking the lock, and then will exit holding it
+ */
+static int child_fn_lock(void *arg)
+{
+	struct lock_struct *lock = arg;
+	struct robust_list_head head;
+	int ret;
+
+	ret = set_list(&head);
+	if (ret) {
+		ksft_test_result_fail("set_robust_list error\n");
+		return ret;
+	}
+
+	ret = mutex_lock(lock, &head, false);
+	if (ret) {
+		ksft_test_result_fail("mutex_lock error\n");
+		return ret;
+	}
+
+	pthread_barrier_wait(&barrier);
+
+	/*
+	 * There's a race here: the parent thread needs to be inside
+	 * futex_wait() before the child thread dies, otherwise it will miss the
+	 * wakeup from handle_futex_death() that this child will emit. We wait a
+	 * little bit just to make sure that this happens.
+	 */
+	usleep(SLEEP_US);
+
+	return 0;
+}
+
+/*
+ * Spawns a child thread that will set a robust list, take the lock, register it
+ * in the robust list and die. The parent thread will wait on this futex, and
+ * should be waken up when the child exits.
+ */
+TEST(test_robustness)
+{
+	struct lock_struct lock = { .futex = 0 };
+	_Atomic(unsigned int) *futex = &lock.futex;
+	struct robust_list_head head;
+	int ret, pid, wstatus;
+
+	ret = set_list(&head);
+	ASSERT_EQ(ret, 0);
+
+	/*
+	 * Lets use a barrier to ensure that the child thread takes the lock
+	 * before the parent
+	 */
+	ret = pthread_barrier_init(&barrier, NULL, 2);
+	ASSERT_EQ(ret, 0);
+
+	pid = create_child(&child_fn_lock, &lock);
+	ASSERT_NE(pid, -1);
+
+	pthread_barrier_wait(&barrier);
+	ret = mutex_lock(&lock, &head, false);
+
+	/*
+	 * futex_wait() should return 0 and the futex word should be marked with
+	 * FUTEX_OWNER_DIED
+	 */
+	ASSERT_EQ(ret, 0);
+
+	ASSERT_TRUE(*futex & FUTEX_OWNER_DIED);
+
+	wait(&wstatus);
+	pthread_barrier_destroy(&barrier);
+
+	/* Pass only if the child hasn't return error */
+	if (!WEXITSTATUS(wstatus))
+		ksft_test_result_pass("%s\n", __func__);
+}
+
+/*
+ * The only valid value for len is sizeof(*head)
+ */
+TEST(test_set_robust_list_invalid_size)
+{
+	struct robust_list_head head;
+	size_t head_size = sizeof(head);
+	int ret;
+
+	ret = set_robust_list(&head, head_size);
+	ASSERT_EQ(ret, 0);
+
+	ret = set_robust_list(&head, head_size * 2);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EINVAL);
+
+	ret = set_robust_list(&head, head_size - 1);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EINVAL);
+
+	ret = set_robust_list(&head, 0);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EINVAL);
+
+	ksft_test_result_pass("%s\n", __func__);
+}
+
+/*
+ * Test get_robust_list with pid = 0, getting the list of the running thread
+ */
+TEST(test_get_robust_list_self)
+{
+	struct robust_list_head head, head2, *get_head;
+	size_t head_size = sizeof(head), len_ptr;
+	int ret;
+
+	ret = set_robust_list(&head, head_size);
+	ASSERT_EQ(ret, 0);
+
+	ret = get_robust_list(0, &get_head, &len_ptr);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(get_head, &head);
+	ASSERT_EQ(head_size, len_ptr);
+
+	ret = set_robust_list(&head2, head_size);
+	ASSERT_EQ(ret, 0);
+
+	ret = get_robust_list(0, &get_head, &len_ptr);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(get_head, &head2);
+	ASSERT_EQ(head_size, len_ptr);
+
+	ksft_test_result_pass("%s\n", __func__);
+}
+
+static int child_list(void *arg)
+{
+	struct robust_list_head *head = arg;
+	int ret;
+
+	ret = set_robust_list(head, sizeof(*head));
+	if (ret) {
+		ksft_test_result_fail("set_robust_list error\n");
+		return -1;
+	}
+
+	/*
+	 * After setting the list head, wait until the main thread can call
+	 * get_robust_list() for this thread before exiting.
+	 */
+	pthread_barrier_wait(&barrier);
+	pthread_barrier_wait(&barrier2);
+
+	return 0;
+}
+
+/*
+ * Test get_robust_list from another thread. We use two barriers here to ensure
+ * that:
+ *   1) the child thread set the list before we try to get it from the
+ * parent
+ *   2) the child thread still alive when we try to get the list from it
+ */
+TEST(test_get_robust_list_child)
+{
+	struct robust_list_head head, *get_head;
+	int ret, wstatus;
+	size_t len_ptr;
+	pid_t tid;
+
+	ret = pthread_barrier_init(&barrier, NULL, 2);
+	ret = pthread_barrier_init(&barrier2, NULL, 2);
+	ASSERT_EQ(ret, 0);
+
+	tid = create_child(&child_list, &head);
+	ASSERT_NE(tid, -1);
+
+	pthread_barrier_wait(&barrier);
+
+	ret = get_robust_list(tid, &get_head, &len_ptr);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(&head, get_head);
+
+	pthread_barrier_wait(&barrier2);
+
+	wait(&wstatus);
+	pthread_barrier_destroy(&barrier);
+	pthread_barrier_destroy(&barrier2);
+
+	/* Pass only if the child hasn't return error */
+	if (!WEXITSTATUS(wstatus))
+		ksft_test_result_pass("%s\n", __func__);
+}
+
+static int child_fn_lock_with_error(void *arg)
+{
+	struct lock_struct *lock = arg;
+	struct robust_list_head head;
+	int ret;
+
+	ret = set_list(&head);
+	if (ret) {
+		ksft_test_result_fail("set_robust_list error\n");
+		return -1;
+	}
+
+	ret = mutex_lock(lock, &head, true);
+	if (ret) {
+		ksft_test_result_fail("mutex_lock error\n");
+		return -1;
+	}
+
+	pthread_barrier_wait(&barrier);
+
+	/* See comment at child_fn_lock() */
+	usleep(SLEEP_US);
+
+	return 0;
+}
+
+/*
+ * Same as robustness test, but inject an error where the mutex_lock() exits
+ * earlier, just after setting list_op_pending and taking the lock, to test the
+ * list_op_pending mechanism
+ */
+TEST(test_set_list_op_pending)
+{
+	struct lock_struct lock = { .futex = 0 };
+	_Atomic(unsigned int) *futex = &lock.futex;
+	struct robust_list_head head;
+	int ret, wstatus;
+
+	ret = set_list(&head);
+	ASSERT_EQ(ret, 0);
+
+	ret = pthread_barrier_init(&barrier, NULL, 2);
+	ASSERT_EQ(ret, 0);
+
+	ret = create_child(&child_fn_lock_with_error, &lock);
+	ASSERT_NE(ret, -1);
+
+	pthread_barrier_wait(&barrier);
+	ret = mutex_lock(&lock, &head, false);
+
+	ASSERT_EQ(ret, 0);
+
+	ASSERT_TRUE(*futex & FUTEX_OWNER_DIED);
+
+	wait(&wstatus);
+	pthread_barrier_destroy(&barrier);
+
+	/* Pass only if the child hasn't return error */
+	if (!WEXITSTATUS(wstatus))
+		ksft_test_result_pass("%s\n", __func__);
+	else
+		ksft_test_result_fail("%s\n", __func__);
+}
+
+#define CHILD_NR 10
+
+static int child_lock_holder(void *arg)
+{
+	struct lock_struct *locks = arg;
+	struct robust_list_head head;
+	int i;
+
+	set_list(&head);
+
+	for (i = 0; i < CHILD_NR; i++) {
+		locks[i].futex = 0;
+		mutex_lock(&locks[i], &head, false);
+	}
+
+	pthread_barrier_wait(&barrier);
+	pthread_barrier_wait(&barrier2);
+
+	/* See comment at child_fn_lock() */
+	usleep(SLEEP_US);
+
+	return 0;
+}
+
+static int child_wait_lock(void *arg)
+{
+	struct lock_struct *lock = arg;
+	struct robust_list_head head;
+	int ret;
+
+	pthread_barrier_wait(&barrier2);
+	ret = mutex_lock(lock, &head, false);
+
+	if (ret) {
+		ksft_test_result_fail("mutex_lock error\n");
+		return -1;
+	}
+
+	if (!(lock->futex & FUTEX_OWNER_DIED)) {
+		ksft_test_result_fail("futex not marked with FUTEX_OWNER_DIED\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Test a robust list of more than one element. All the waiters should wake when
+ * the holder dies
+ */
+TEST(test_robust_list_multiple_elements)
+{
+	struct lock_struct locks[CHILD_NR];
+	pid_t pids[CHILD_NR + 1];
+	int i, ret, wstatus;
+
+	ret = pthread_barrier_init(&barrier, NULL, 2);
+	ASSERT_EQ(ret, 0);
+	ret = pthread_barrier_init(&barrier2, NULL, CHILD_NR + 1);
+	ASSERT_EQ(ret, 0);
+
+	pids[0] = create_child(&child_lock_holder, &locks);
+
+	/* Wait until the locker thread takes the look */
+	pthread_barrier_wait(&barrier);
+
+	for (i = 0; i < CHILD_NR; i++)
+		pids[i+1] = create_child(&child_wait_lock, &locks[i]);
+
+	/* Wait for all children to return */
+	ret = 0;
+
+	for (i = 0; i < CHILD_NR; i++) {
+		waitpid(pids[i], &wstatus, 0);
+		if (WEXITSTATUS(wstatus))
+			ret = -1;
+	}
+
+	pthread_barrier_destroy(&barrier);
+	pthread_barrier_destroy(&barrier2);
+
+	/* Pass only if the child hasn't return error */
+	if (!ret)
+		ksft_test_result_pass("%s\n", __func__);
+}
+
+static int child_circular_list(void *arg)
+{
+	static struct robust_list_head head;
+	struct lock_struct a, b, c;
+	int ret;
+
+	ret = set_list(&head);
+	if (ret) {
+		ksft_test_result_fail("set_list error\n");
+		return -1;
+	}
+
+	head.list.next = &a.list;
+
+	/*
+	 * The last element should point to head list, but we short circuit it
+	 */
+	a.list.next = &b.list;
+	b.list.next = &c.list;
+	c.list.next = &a.list;
+
+	return 0;
+}
+
+/*
+ * Create a circular robust list. The kernel should be able to destroy the list
+ * while processing it so it won't be trapped in an infinite loop while handling
+ * a process exit
+ */
+TEST(test_circular_list)
+{
+	int wstatus;
+
+	create_child(child_circular_list, NULL);
+
+	wait(&wstatus);
+
+	/* Pass only if the child hasn't return error */
+	if (!WEXITSTATUS(wstatus))
+		ksft_test_result_pass("%s\n", __func__);
+}
+
+TEST_HARNESS_MAIN