Files
linux-stable-mirror/include/uapi/linux/sched.h
T
Linus Torvalds 7c8a4671dc Merge tag 'vfs-7.1-rc1.mount.v2' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs mount updates from Christian Brauner:

 - Add FSMOUNT_NAMESPACE flag to fsmount() that creates a new mount
   namespace with the newly created filesystem attached to a copy of the
   real rootfs. This returns a namespace file descriptor instead of an
   O_PATH mount fd, similar to how OPEN_TREE_NAMESPACE works for
   open_tree().

   This allows creating a new filesystem and immediately placing it in a
   new mount namespace in a single operation, which is useful for
   container runtimes and other namespace-based isolation mechanisms.

   This accompanies OPEN_TREE_NAMESPACE and avoids a needless detour via
   OPEN_TREE_NAMESPACE to get the same effect. Will be especially useful
   when you mount an actual filesystem to be used as the container
   rootfs.

 - Currently, creating a new mount namespace always copies the entire
   mount tree from the caller's namespace. For containers and sandboxes
   that intend to build their mount table from scratch this is wasteful:
   they inherit a potentially large mount tree only to immediately tear
   it down.

   This series adds support for creating a mount namespace that contains
   only a clone of the root mount, with none of the child mounts. Two
   new flags are introduced:

     - CLONE_EMPTY_MNTNS (0x400000000) for clone3(), using the 64-bit flag space
     - UNSHARE_EMPTY_MNTNS (0x00100000) for unshare()

   Both flags imply CLONE_NEWNS. The resulting namespace contains a
   single nullfs root mount with an immutable empty directory. The
   intended workflow is to then mount a real filesystem (e.g., tmpfs)
   over the root and build the mount table from there.

 - Allow MOVE_MOUNT_BENEATH to target the caller's rootfs, allowing to
   switch out the rootfs without pivot_root(2).

   The traditional approach to switching the rootfs involves
   pivot_root(2) or a chroot_fs_refs()-based mechanism that atomically
   updates fs->root for all tasks sharing the same fs_struct. This has
   consequences for fork(), unshare(CLONE_FS), and setns().

   This series instead decomposes root-switching into individually
   atomic, locally-scoped steps:

	fd_tree = open_tree(-EBADF, "/newroot", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
	fchdir(fd_tree);
	move_mount(fd_tree, "", AT_FDCWD, "/", MOVE_MOUNT_BENEATH | MOVE_MOUNT_F_EMPTY_PATH);
	chroot(".");
	umount2(".", MNT_DETACH);

   Since each step only modifies the caller's own state, the
   fork/unshare/setns races are eliminated by design.

   A key step to making this possible is to remove the locked mount
   restriction. Originally MOVE_MOUNT_BENEATH doesn't support mounting
   beneath a mount that is locked. The locked mount protects the
   underlying mount from being revealed. This is a core mechanism of
   unshare(CLONE_NEWUSER | CLONE_NEWNS). The mounts in the new mount
   namespace become locked. That effectively makes the new mount table
   useless as the caller cannot ever get rid of any of the mounts no
   matter how useless they are.

   We can lift this restriction though. We simply transfer the locked
   property from the top mount to the mount beneath. This works because
   what we care about is to protect the underlying mount aka the parent.
   The mount mounted between the parent and the top mount takes over the
   job of protecting the parent mount from the top mount mount. This
   leaves us free to remove the locked property from the top mount which
   can consequently be unmounted:

	unshare(CLONE_NEWUSER | CLONE_NEWNS)

   and we inherit a clone of procfs on /proc then currently we cannot
   unmount it as:

	umount -l /proc

   will fail with EINVAL because the procfs mount is locked.

   After this series we can now do:

	mount --beneath -t tmpfs tmpfs /proc
	umount -l /proc

   after which a tmpfs mount has been placed beneath the procfs mount.
   The tmpfs mount has become locked and the procfs mount has become
   unlocked.

   This means you can safely modify an inherited mount table after
   unprivileged namespace creation.

   Afterwards we simply make it possible to move a mount beneath the
   rootfs allowing to upgrade the rootfs.

   Removing the locked restriction makes this very useful for containers
   created with unshare(CLONE_NEWUSER | CLONE_NEWNS) to reshuffle an
   inherited mount table safely and MOVE_MOUNT_BENEATH makes it possible
   to switch out the rootfs instead of using the costly pivot_root(2).

* tag 'vfs-7.1-rc1.mount.v2' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  selftests/namespaces: remove unused utils.h include from listns_efault_test
  selftests/fsmount_ns: add missing TARGETS and fix cap test
  selftests/empty_mntns: fix wrong CLONE_EMPTY_MNTNS hex value in comment
  selftests/empty_mntns: fix statmount_alloc() signature mismatch
  selftests/statmount: remove duplicate wait_for_pid()
  mount: always duplicate mount
  selftests/filesystems: add MOVE_MOUNT_BENEATH rootfs tests
  move_mount: allow MOVE_MOUNT_BENEATH on the rootfs
  move_mount: transfer MNT_LOCKED
  selftests/filesystems: add clone3 tests for empty mount namespaces
  selftests/filesystems: add tests for empty mount namespaces
  namespace: allow creating empty mount namespaces
  selftests: add FSMOUNT_NAMESPACE tests
  selftests/statmount: add statmount_alloc() helper
  tools: update mount.h header
  mount: add FSMOUNT_NAMESPACE
  mount: simplify __do_loopback()
  mount: start iterating from start of rbtree
2026-04-14 19:59:25 -07:00

163 lines
6.7 KiB
C

/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _UAPI_LINUX_SCHED_H
#define _UAPI_LINUX_SCHED_H
#include <linux/types.h>
/*
* cloning flags:
*/
#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */
#define CLONE_VM 0x00000100 /* set if VM shared between processes */
#define CLONE_FS 0x00000200 /* set if fs info shared between processes */
#define CLONE_FILES 0x00000400 /* set if open files shared between processes */
#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */
#define CLONE_PIDFD 0x00001000 /* set if a pidfd should be placed in parent */
#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */
#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
#define CLONE_THREAD 0x00010000 /* Same thread group? */
#define CLONE_NEWNS 0x00020000 /* New mount namespace group */
#define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */
#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */
#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */
#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */
#define CLONE_DETACHED 0x00400000 /* Unused, ignored */
#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
#define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */
#define CLONE_NEWUTS 0x04000000 /* New utsname namespace */
#define CLONE_NEWIPC 0x08000000 /* New ipc namespace */
#define CLONE_NEWUSER 0x10000000 /* New user namespace */
#define CLONE_NEWPID 0x20000000 /* New pid namespace */
#define CLONE_NEWNET 0x40000000 /* New network namespace */
#define CLONE_IO 0x80000000 /* Clone io context */
/* Flags for the clone3() syscall. */
#define CLONE_CLEAR_SIGHAND (1ULL << 32) /* Clear any signal handler and reset to SIG_DFL. */
#define CLONE_INTO_CGROUP (1ULL << 33) /* Clone into a specific cgroup given the right permissions. */
#define CLONE_AUTOREAP (1ULL << 34) /* Auto-reap child on exit. */
#define CLONE_NNP (1ULL << 35) /* Set no_new_privs on child. */
#define CLONE_PIDFD_AUTOKILL (1ULL << 36) /* Kill child when clone pidfd closes. */
#define CLONE_EMPTY_MNTNS (1ULL << 37) /* Create an empty mount namespace. */
/*
* cloning flags intersect with CSIGNAL so can be used with unshare and clone3
* syscalls only:
*/
#define CLONE_NEWTIME 0x00000080 /* New time namespace */
/*
* unshare flags share the bit space with clone flags but only apply to the
* unshare syscall:
*/
#define UNSHARE_EMPTY_MNTNS 0x00100000 /* Unshare an empty mount namespace. */
#ifndef __ASSEMBLY__
/**
* struct clone_args - arguments for the clone3 syscall
* @flags: Flags for the new process as listed above.
* All flags are valid except for CSIGNAL and
* CLONE_DETACHED.
* @pidfd: If CLONE_PIDFD is set, a pidfd will be
* returned in this argument.
* @child_tid: If CLONE_CHILD_SETTID is set, the TID of the
* child process will be returned in the child's
* memory.
* @parent_tid: If CLONE_PARENT_SETTID is set, the TID of
* the child process will be returned in the
* parent's memory.
* @exit_signal: The exit_signal the parent process will be
* sent when the child exits.
* @stack: Specify the location of the stack for the
* child process.
* Note, @stack is expected to point to the
* lowest address. The stack direction will be
* determined by the kernel and set up
* appropriately based on @stack_size.
* @stack_size: The size of the stack for the child process.
* @tls: If CLONE_SETTLS is set, the tls descriptor
* is set to tls.
* @set_tid: Pointer to an array of type *pid_t. The size
* of the array is defined using @set_tid_size.
* This array is used to select PIDs/TIDs for
* newly created processes. The first element in
* this defines the PID in the most nested PID
* namespace. Each additional element in the array
* defines the PID in the parent PID namespace of
* the original PID namespace. If the array has
* less entries than the number of currently
* nested PID namespaces only the PIDs in the
* corresponding namespaces are set.
* @set_tid_size: This defines the size of the array referenced
* in @set_tid. This cannot be larger than the
* kernel's limit of nested PID namespaces.
* @cgroup: If CLONE_INTO_CGROUP is specified set this to
* a file descriptor for the cgroup.
*
* The structure is versioned by size and thus extensible.
* New struct members must go at the end of the struct and
* must be properly 64bit aligned.
*/
struct clone_args {
__aligned_u64 flags;
__aligned_u64 pidfd;
__aligned_u64 child_tid;
__aligned_u64 parent_tid;
__aligned_u64 exit_signal;
__aligned_u64 stack;
__aligned_u64 stack_size;
__aligned_u64 tls;
__aligned_u64 set_tid;
__aligned_u64 set_tid_size;
__aligned_u64 cgroup;
};
#endif
#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
/*
* Scheduling policies
*/
#define SCHED_NORMAL 0
#define SCHED_FIFO 1
#define SCHED_RR 2
#define SCHED_BATCH 3
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE 5
#define SCHED_DEADLINE 6
#define SCHED_EXT 7
/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
#define SCHED_RESET_ON_FORK 0x40000000
/*
* For the sched_{set,get}attr() calls
*/
#define SCHED_FLAG_RESET_ON_FORK 0x01
#define SCHED_FLAG_RECLAIM 0x02
#define SCHED_FLAG_DL_OVERRUN 0x04
#define SCHED_FLAG_KEEP_POLICY 0x08
#define SCHED_FLAG_KEEP_PARAMS 0x10
#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20
#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40
#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \
SCHED_FLAG_KEEP_PARAMS)
#define SCHED_FLAG_UTIL_CLAMP (SCHED_FLAG_UTIL_CLAMP_MIN | \
SCHED_FLAG_UTIL_CLAMP_MAX)
#define SCHED_FLAG_ALL (SCHED_FLAG_RESET_ON_FORK | \
SCHED_FLAG_RECLAIM | \
SCHED_FLAG_DL_OVERRUN | \
SCHED_FLAG_KEEP_ALL | \
SCHED_FLAG_UTIL_CLAMP)
/* Only for sched_getattr() own flag param, if task is SCHED_DEADLINE */
#define SCHED_GETATTR_FLAG_DL_DYNAMIC 0x01
#endif /* _UAPI_LINUX_SCHED_H */