Files
linux-stable-mirror/include/linux/binfmts.h
T
Christian Brauner 38205ecbe6 exec: free the old mm outside the exec locks
exec_mmap() installs the new mm and then tears the old one down while
still holding exec_update_lock for writing -- and with cred_guard_mutex
held all the way to setup_new_exec():

	setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
	mm_update_next_owner(old_mm);
	mmput(old_mm);

Neither lock is needed for this. exec_update_lock only exists to make the
mm swap atomic with the later commit_creds(), so that permission-checking
readers (proc, ptrace, the futex robust list, perf, kcmp, mm_access())
never observe the new mm together with the old credentials. Those readers
all operate on task->mm, i.e. the new mm after the swap; none looks at the
detached old mm, its ->owner or signal->maxrss. cred_guard_mutex guards
credential calculation and is equally irrelevant here.

The cost is real: __mmput() runs exit_mmap() over the entire old address
space and can block in exit_aio() waiting for in-flight AIO, all while
holding exec_update_lock for writing and cred_guard_mutex. For execve() of
a large process this blocks ptrace_attach() and every exec_update_lock
reader for the duration of the teardown.

Stash the old mm in bprm->old_mm and release it from setup_new_exec()
after both locks are dropped. setup_new_exec() still runs before
setup_arg_pages() and the segment mappings, so the old address space is
freed before the new one is populated and peak memory is unchanged. The
ordering constraints are kept: old_mm's mmap_lock is still dropped in
exec_mmap() before mm_update_next_owner() (required since commit
31a78f23ba ("mm owner: fix race between swapoff and exit")), and
mm_update_next_owner() still precedes mmput(); both run in the execing
task's context, as mm_update_next_owner() requires.

If exec swaps the mm but fails before setup_new_exec() runs the old mm
would leak, so add a backstop in free_bprm(). The lazy-tlb case
(old_mm == NULL, e.g. kernel_execve()) has no address space to
free and is left in exec_mmap().

Link: https://patch.msgid.link/20260522-work-exit_mm-v1-1-bd32d5a560bb@kernel.org
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
2026-05-26 11:02:02 +02:00

154 lines
4.8 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BINFMTS_H
#define _LINUX_BINFMTS_H
#include <linux/sched.h>
#include <linux/unistd.h>
#include <asm/exec.h>
#include <uapi/linux/binfmts.h>
struct filename;
struct coredump_params;
#define CORENAME_MAX_SIZE 128
/*
* This structure is used to hold the arguments that are used when loading binaries.
*/
struct linux_binprm {
#ifdef CONFIG_MMU
struct vm_area_struct *vma;
unsigned long vma_pages;
unsigned long argmin; /* rlimit marker for copy_strings() */
#else
# define MAX_ARG_PAGES 32
struct page *page[MAX_ARG_PAGES];
#endif
struct mm_struct *mm;
struct mm_struct *old_mm; /* replaced address space, freed by setup_new_exec() */
/* user_ns published to task->exec_state at execve, narrowed by would_dump(). */
struct user_namespace *user_ns;
unsigned long p; /* current top of mem */
unsigned int
/* Should an execfd be passed to userspace? */
have_execfd:1,
/* Use the creds of a script (see binfmt_misc) */
execfd_creds:1,
/*
* Set by bprm_creds_for_exec hook to indicate a
* privilege-gaining exec has happened. Used to set
* AT_SECURE auxv for glibc.
*/
secureexec:1,
/*
* Set when errors can no longer be returned to the
* original userspace.
*/
point_of_no_return:1,
/* Set when "comm" must come from the dentry. */
comm_from_dentry:1,
/*
* Set by user space to check executability according to the
* caller's environment.
*/
is_check:1;
struct file *executable; /* Executable to pass to the interpreter */
struct file *interpreter;
struct file *file;
struct cred *cred; /* new credentials */
int unsafe; /* how unsafe this exec is (mask of LSM_UNSAFE_*) */
unsigned int per_clear; /* bits to clear in current->personality */
int argc, envc;
const char *filename; /* Name of binary as seen by procps */
const char *interp; /* Name of the binary really executed. Most
of the time same as filename, but could be
different for binfmt_{misc,script} */
const char *fdpath; /* generated filename for execveat */
unsigned interp_flags;
int execfd; /* File descriptor of the executable */
unsigned long exec;
struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */
char buf[BINPRM_BUF_SIZE];
} __randomize_layout;
#define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
#define BINPRM_FLAGS_ENFORCE_NONDUMP (1 << BINPRM_FLAGS_ENFORCE_NONDUMP_BIT)
/* filename of the binary will be inaccessible after exec */
#define BINPRM_FLAGS_PATH_INACCESSIBLE_BIT 2
#define BINPRM_FLAGS_PATH_INACCESSIBLE (1 << BINPRM_FLAGS_PATH_INACCESSIBLE_BIT)
/* preserve argv0 for the interpreter */
#define BINPRM_FLAGS_PRESERVE_ARGV0_BIT 3
#define BINPRM_FLAGS_PRESERVE_ARGV0 (1 << BINPRM_FLAGS_PRESERVE_ARGV0_BIT)
/*
* This structure defines the functions that are used to load the binary formats that
* linux accepts.
*/
struct linux_binfmt {
struct list_head lh;
struct module *module;
int (*load_binary)(struct linux_binprm *);
#ifdef CONFIG_COREDUMP
int (*core_dump)(struct coredump_params *cprm);
unsigned long min_coredump; /* minimal dump size */
#endif
} __randomize_layout;
#if IS_ENABLED(CONFIG_BINFMT_MISC)
struct binfmt_misc {
struct list_head entries;
rwlock_t entries_lock;
bool enabled;
} __randomize_layout;
extern struct binfmt_misc init_binfmt_misc;
#endif
extern void __register_binfmt(struct linux_binfmt *fmt, int insert);
/* Registration of default binfmt handlers */
static inline void register_binfmt(struct linux_binfmt *fmt)
{
__register_binfmt(fmt, 0);
}
/* Same as above, but adds a new binfmt at the top of the list */
static inline void insert_binfmt(struct linux_binfmt *fmt)
{
__register_binfmt(fmt, 1);
}
extern void unregister_binfmt(struct linux_binfmt *);
extern int __must_check remove_arg_zero(struct linux_binprm *);
extern int begin_new_exec(struct linux_binprm * bprm);
extern void setup_new_exec(struct linux_binprm * bprm);
extern void finalize_exec(struct linux_binprm *bprm);
extern void would_dump(struct linux_binprm *, struct file *);
extern int suid_dumpable;
/* Stack area protections */
#define EXSTACK_DEFAULT 0 /* Whatever the arch defaults to */
#define EXSTACK_DISABLE_X 1 /* Disable executable stacks */
#define EXSTACK_ENABLE_X 2 /* Enable executable stacks */
extern int setup_arg_pages(struct linux_binprm * bprm,
unsigned long stack_top,
int executable_stack);
extern int transfer_args_to_stack(struct linux_binprm *bprm,
unsigned long *sp_location);
extern int bprm_change_interp(const char *interp, struct linux_binprm *bprm);
int copy_string_kernel(const char *arg, struct linux_binprm *bprm);
extern void set_binfmt(struct linux_binfmt *new);
extern ssize_t read_code(struct file *, unsigned long, loff_t, size_t);
int kernel_execve(const char *filename,
const char *const *argv, const char *const *envp);
#endif /* _LINUX_BINFMTS_H */