mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2026-05-26 11:40:24 +02:00
c8134b5f13
Add a new clone3() flag CLONE_PIDFD_AUTOKILL that ties a child's lifetime to the pidfd returned from clone3(). When the last reference to the struct file created by clone3() is closed the kernel sends SIGKILL to the child. A pidfd obtained via pidfd_open() for the same process does not keep the child alive and does not trigger autokill - only the specific struct file from clone3() has this property. This is useful for container runtimes, service managers, and sandboxed subprocess execution - any scenario where the child must die if the parent crashes or abandons the pidfd. CLONE_PIDFD_AUTOKILL requires both CLONE_PIDFD (the whole point is tying lifetime to the pidfd file) and CLONE_AUTOREAP (a killed child with no one to reap it would become a zombie). CLONE_THREAD is rejected because autokill targets a process not a thread. The clone3 pidfd is identified by the PIDFD_AUTOKILL file flag set on the struct file at clone3() time. The pidfs .release handler checks this flag and sends SIGKILL via do_send_sig_info(SIGKILL, SEND_SIG_PRIV, ...) only when it is set. Files from pidfd_open() or open_by_handle_at() are distinct struct files that do not carry this flag. dup()/fork() share the same struct file so they extend the child's lifetime until the last reference drops. CLONE_PIDFD_AUTOKILL uses a privilege model based on CLONE_NNP: without CLONE_NNP the child could escalate privileges via setuid/setgid exec after being spawned, so the caller must have CAP_SYS_ADMIN in its user namespace. With CLONE_NNP the child can never gain new privileges so unprivileged usage is allowed. This is a deliberate departure from the pdeath_signal model which is reset during secureexec and commit_creds() rendering it useless for container runtimes that need to deprivilege themselves. Link: https://patch.msgid.link/20260226-work-pidfs-autoreap-v5-3-d148b984a989@kernel.org Reviewed-by: Oleg Nesterov <oleg@redhat.com> Signed-off-by: Christian Brauner <brauner@kernel.org>
153 lines
6.4 KiB
C
153 lines
6.4 KiB
C
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
|
|
#ifndef _UAPI_LINUX_SCHED_H
|
|
#define _UAPI_LINUX_SCHED_H
|
|
|
|
#include <linux/types.h>
|
|
|
|
/*
 * cloning flags:
 *
 * CSIGNAL is a mask covering the low byte, not a flag. Every CLONE_* value
 * in this group is a distinct single bit, from 0x00000100 up to 0x80000000,
 * so together with CSIGNAL they fill a 32-bit word exactly.
 */
#define CSIGNAL		0x000000ff	/* signal mask to be sent at exit */
#define CLONE_VM	0x00000100	/* set if VM shared between processes */
#define CLONE_FS	0x00000200	/* set if fs info shared between processes */
#define CLONE_FILES	0x00000400	/* set if open files shared between processes */
#define CLONE_SIGHAND	0x00000800	/* set if signal handlers and blocked signals shared */
#define CLONE_PIDFD	0x00001000	/* set if a pidfd should be placed in parent */
#define CLONE_PTRACE	0x00002000	/* set if we want to let tracing continue on the child too */
#define CLONE_VFORK	0x00004000	/* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT	0x00008000	/* set if we want to have the same parent as the cloner */
#define CLONE_THREAD	0x00010000	/* Same thread group? */
#define CLONE_NEWNS	0x00020000	/* New mount namespace group */
#define CLONE_SYSVSEM	0x00040000	/* share system V SEM_UNDO semantics */
#define CLONE_SETTLS	0x00080000	/* create a new TLS for the child */
#define CLONE_PARENT_SETTID	0x00100000	/* set the TID in the parent */
#define CLONE_CHILD_CLEARTID	0x00200000	/* clear the TID in the child */
#define CLONE_DETACHED		0x00400000	/* Unused, ignored */
#define CLONE_UNTRACED		0x00800000	/* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID	0x01000000	/* set the TID in the child */
#define CLONE_NEWCGROUP		0x02000000	/* New cgroup namespace */
#define CLONE_NEWUTS		0x04000000	/* New utsname namespace */
#define CLONE_NEWIPC		0x08000000	/* New ipc namespace */
#define CLONE_NEWUSER		0x10000000	/* New user namespace */
#define CLONE_NEWPID		0x20000000	/* New pid namespace */
#define CLONE_NEWNET		0x40000000	/* New network namespace */
#define CLONE_IO		0x80000000	/* Clone io context */
|
|
|
|
/*
 * Flags for the clone3() syscall.
 *
 * These live at bit 32 and above, so they are unsigned long long constants
 * and need a 64-bit flags word.
 *
 * Per the commit that introduced it, CLONE_PIDFD_AUTOKILL must be combined
 * with CLONE_PIDFD and CLONE_AUTOREAP, is rejected together with
 * CLONE_THREAD, and requires CAP_SYS_ADMIN unless CLONE_NNP is also set.
 */
#define CLONE_CLEAR_SIGHAND (1ULL << 32) /* Clear any signal handler and reset to SIG_DFL. */
#define CLONE_INTO_CGROUP (1ULL << 33) /* Clone into a specific cgroup given the right permissions. */
#define CLONE_AUTOREAP (1ULL << 34) /* Auto-reap child on exit. */
#define CLONE_NNP (1ULL << 35) /* Set no_new_privs on child. */
#define CLONE_PIDFD_AUTOKILL (1ULL << 36) /* Kill child when clone pidfd closes. */
|
|
|
|
/*
 * cloning flags intersect with CSIGNAL so can be used with unshare and clone3
 * syscalls only:
 *
 * The value below lies inside the low byte that CSIGNAL masks.
 */
#define CLONE_NEWTIME	0x00000080	/* New time namespace */
|
|
|
|
#ifndef __ASSEMBLY__
|
|
/**
|
|
* struct clone_args - arguments for the clone3 syscall
|
|
* @flags: Flags for the new process as listed above.
|
|
* All flags are valid except for CSIGNAL and
|
|
* CLONE_DETACHED.
|
|
* @pidfd: If CLONE_PIDFD is set, a pidfd will be
|
|
* returned in this argument.
|
|
* @child_tid: If CLONE_CHILD_SETTID is set, the TID of the
|
|
* child process will be returned in the child's
|
|
* memory.
|
|
* @parent_tid: If CLONE_PARENT_SETTID is set, the TID of
|
|
* the child process will be returned in the
|
|
* parent's memory.
|
|
* @exit_signal: The exit_signal the parent process will be
|
|
* sent when the child exits.
|
|
* @stack: Specify the location of the stack for the
|
|
* child process.
|
|
* Note, @stack is expected to point to the
|
|
* lowest address. The stack direction will be
|
|
* determined by the kernel and set up
|
|
* appropriately based on @stack_size.
|
|
* @stack_size: The size of the stack for the child process.
|
|
* @tls: If CLONE_SETTLS is set, the tls descriptor
|
|
* is set to tls.
|
|
* @set_tid: Pointer to an array of type *pid_t. The size
|
|
* of the array is defined using @set_tid_size.
|
|
* This array is used to select PIDs/TIDs for
|
|
* newly created processes. The first element in
|
|
* this defines the PID in the most nested PID
|
|
* namespace. Each additional element in the array
|
|
* defines the PID in the parent PID namespace of
|
|
* the original PID namespace. If the array has
|
|
* less entries than the number of currently
|
|
* nested PID namespaces only the PIDs in the
|
|
* corresponding namespaces are set.
|
|
* @set_tid_size: This defines the size of the array referenced
|
|
* in @set_tid. This cannot be larger than the
|
|
* kernel's limit of nested PID namespaces.
|
|
* @cgroup: If CLONE_INTO_CGROUP is specified set this to
|
|
* a file descriptor for the cgroup.
|
|
*
|
|
* The structure is versioned by size and thus extensible.
|
|
* New struct members must go at the end of the struct and
|
|
* must be properly 64bit aligned.
|
|
*/
|
|
struct clone_args {
|
|
__aligned_u64 flags;
|
|
__aligned_u64 pidfd;
|
|
__aligned_u64 child_tid;
|
|
__aligned_u64 parent_tid;
|
|
__aligned_u64 exit_signal;
|
|
__aligned_u64 stack;
|
|
__aligned_u64 stack_size;
|
|
__aligned_u64 tls;
|
|
__aligned_u64 set_tid;
|
|
__aligned_u64 set_tid_size;
|
|
__aligned_u64 cgroup;
|
|
};
|
|
#endif
|
|
|
|
/*
 * Byte sizes of each published layout of struct clone_args. The struct is
 * versioned by size, so each constant identifies one layout revision.
 */
#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
|
|
|
|
/*
 * Scheduling policies
 *
 * Plain small integers, not bit flags. The value 4 is skipped: it is held
 * back for SCHED_ISO, which has no definition here.
 */
#define SCHED_NORMAL		0
#define SCHED_FIFO		1
#define SCHED_RR		2
#define SCHED_BATCH		3
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE		5
#define SCHED_DEADLINE		6
#define SCHED_EXT		7
|
|
|
|
/*
 * Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork.
 * The bit sits far above the small policy numbers, so it cannot collide with them.
 */
#define SCHED_RESET_ON_FORK     0x40000000
|
|
|
|
/*
 * For the sched_{set,get}attr() calls
 *
 * Each SCHED_FLAG_* in the first group is a single bit. The composite
 * masks after it are built only by OR-ing those bits together, and
 * SCHED_FLAG_ALL covers every flag defined here.
 */
#define SCHED_FLAG_RESET_ON_FORK	0x01
#define SCHED_FLAG_RECLAIM		0x02
#define SCHED_FLAG_DL_OVERRUN		0x04
#define SCHED_FLAG_KEEP_POLICY		0x08
#define SCHED_FLAG_KEEP_PARAMS		0x10
#define SCHED_FLAG_UTIL_CLAMP_MIN	0x20
#define SCHED_FLAG_UTIL_CLAMP_MAX	0x40

#define SCHED_FLAG_KEEP_ALL	(SCHED_FLAG_KEEP_POLICY | \
				 SCHED_FLAG_KEEP_PARAMS)

#define SCHED_FLAG_UTIL_CLAMP	(SCHED_FLAG_UTIL_CLAMP_MIN | \
				 SCHED_FLAG_UTIL_CLAMP_MAX)

#define SCHED_FLAG_ALL	(SCHED_FLAG_RESET_ON_FORK	| \
			 SCHED_FLAG_RECLAIM		| \
			 SCHED_FLAG_DL_OVERRUN		| \
			 SCHED_FLAG_KEEP_ALL		| \
			 SCHED_FLAG_UTIL_CLAMP)
|
|
|
|
#endif /* _UAPI_LINUX_SCHED_H */
|