mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2026-03-03 18:28:01 +01:00
Pull vfs nullfs update from Christian Brauner:
"Add a completely catatonic minimal pseudo filesystem called "nullfs"
and make pivot_root() work in the initramfs.
Currently pivot_root() does not work on the real rootfs because it
cannot be unmounted. Userspace has to recursively delete initramfs
contents manually before continuing boot, using the fragile
switch_root sequence (overmount + chroot).
Add nullfs, a minimal immutable filesystem that serves as the true
root of the mount hierarchy. The mutable rootfs (tmpfs/ramfs) is
mounted on top of it. This allows userspace to simply:
chdir(new_root);
pivot_root(".", ".");
umount2(".", MNT_DETACH);
without the traditional switch_root workarounds. systemd already
handles this correctly. It tries pivot_root() first and falls back
to MS_MOVE only when that fails.
This also means rootfs mounts in unprivileged namespaces no longer
need MNT_LOCKED, since the immutable nullfs guarantees nothing can be
revealed by unmounting the covering mount.
nullfs is a single-instance filesystem (get_tree_single()) marked
SB_NOUSER | SB_I_NOEXEC | SB_I_NODEV with an immutable empty root
directory. This means sooner or later it can be used to overmount
other directories to hide their contents without any additional
protection needed.
We enable it unconditionally. If we see any real regression we'll
hide it behind a boot option.
nullfs has extensions beyond this in the future. It will serve as a
concept to support the creation of completely empty mount namespaces -
which is work coming up in the next cycle"
* tag 'vfs-7.0-rc1.nullfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
fs: use nullfs unconditionally as the real rootfs
docs: mention nullfs
fs: add immutable rootfs
fs: add init_pivot_root()
fs: ensure that internal tmpfs mount gets mount id zero
522 lines
11 KiB
C
522 lines
11 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
#include <linux/module.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/ctype.h>
|
|
#include <linux/fd.h>
|
|
#include <linux/tty.h>
|
|
#include <linux/suspend.h>
|
|
#include <linux/root_dev.h>
|
|
#include <linux/security.h>
|
|
#include <linux/delay.h>
|
|
#include <linux/mount.h>
|
|
#include <linux/device.h>
|
|
#include <linux/init.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/initrd.h>
|
|
#include <linux/async.h>
|
|
#include <linux/fs_struct.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/ramfs.h>
|
|
#include <linux/shmem_fs.h>
|
|
#include <linux/ktime.h>
|
|
|
|
#include <linux/nfs_fs.h>
|
|
#include <linux/nfs_fs_sb.h>
|
|
#include <linux/nfs_mount.h>
|
|
#include <linux/raid/detect.h>
|
|
#include <uapi/linux/mount.h>
|
|
|
|
#include "do_mounts.h"
|
|
|
|
int root_mountflags = MS_RDONLY | MS_SILENT;
|
|
static char __initdata saved_root_name[64];
|
|
static int root_wait;
|
|
|
|
dev_t ROOT_DEV;
|
|
|
|
static int __init readonly(char *str)
|
|
{
|
|
if (*str)
|
|
return 0;
|
|
root_mountflags |= MS_RDONLY;
|
|
return 1;
|
|
}
|
|
|
|
static int __init readwrite(char *str)
|
|
{
|
|
if (*str)
|
|
return 0;
|
|
root_mountflags &= ~MS_RDONLY;
|
|
return 1;
|
|
}
|
|
|
|
__setup("ro", readonly);
|
|
__setup("rw", readwrite);
|
|
|
|
static int __init root_dev_setup(char *line)
|
|
{
|
|
strscpy(saved_root_name, line, sizeof(saved_root_name));
|
|
return 1;
|
|
}
|
|
|
|
__setup("root=", root_dev_setup);
|
|
|
|
static int __init rootwait_setup(char *str)
|
|
{
|
|
if (*str)
|
|
return 0;
|
|
root_wait = -1;
|
|
return 1;
|
|
}
|
|
|
|
__setup("rootwait", rootwait_setup);
|
|
|
|
static int __init rootwait_timeout_setup(char *str)
|
|
{
|
|
int sec;
|
|
|
|
if (kstrtoint(str, 0, &sec) || sec < 0) {
|
|
pr_warn("ignoring invalid rootwait value\n");
|
|
goto ignore;
|
|
}
|
|
|
|
if (check_mul_overflow(sec, MSEC_PER_SEC, &root_wait)) {
|
|
pr_warn("ignoring excessive rootwait value\n");
|
|
goto ignore;
|
|
}
|
|
|
|
return 1;
|
|
|
|
ignore:
|
|
/* Fallback to indefinite wait */
|
|
root_wait = -1;
|
|
|
|
return 1;
|
|
}
|
|
|
|
__setup("rootwait=", rootwait_timeout_setup);
|
|
|
|
static char * __initdata root_mount_data;
|
|
static int __init root_data_setup(char *str)
|
|
{
|
|
root_mount_data = str;
|
|
return 1;
|
|
}
|
|
|
|
static char * __initdata root_fs_names;
|
|
static int __init fs_names_setup(char *str)
|
|
{
|
|
root_fs_names = str;
|
|
return 1;
|
|
}
|
|
|
|
static unsigned int __initdata root_delay;
|
|
static int __init root_delay_setup(char *str)
|
|
{
|
|
if (kstrtouint(str, 0, &root_delay))
|
|
return 0;
|
|
return 1;
|
|
}
|
|
|
|
__setup("rootflags=", root_data_setup);
|
|
__setup("rootfstype=", fs_names_setup);
|
|
__setup("rootdelay=", root_delay_setup);
|
|
|
|
/* This can return zero length strings. Caller should check */
|
|
static int __init split_fs_names(char *page, size_t size)
|
|
{
|
|
int count = 1;
|
|
char *p = page;
|
|
|
|
strscpy(p, root_fs_names, size);
|
|
while (*p++) {
|
|
if (p[-1] == ',') {
|
|
p[-1] = '\0';
|
|
count++;
|
|
}
|
|
}
|
|
|
|
return count;
|
|
}
|
|
|
|
static int __init do_mount_root(const char *name, const char *fs,
|
|
const int flags, const void *data)
|
|
{
|
|
struct super_block *s;
|
|
struct page *p = NULL;
|
|
char *data_page = NULL;
|
|
int ret;
|
|
|
|
if (data) {
|
|
/* init_mount() requires a full page as fifth argument */
|
|
p = alloc_page(GFP_KERNEL);
|
|
if (!p)
|
|
return -ENOMEM;
|
|
data_page = page_address(p);
|
|
strscpy_pad(data_page, data, PAGE_SIZE);
|
|
}
|
|
|
|
ret = init_mount(name, "/root", fs, flags, data_page);
|
|
if (ret)
|
|
goto out;
|
|
|
|
init_chdir("/root");
|
|
s = current->fs->pwd.dentry->d_sb;
|
|
ROOT_DEV = s->s_dev;
|
|
printk(KERN_INFO
|
|
"VFS: Mounted root (%s filesystem)%s on device %u:%u.\n",
|
|
s->s_type->name,
|
|
sb_rdonly(s) ? " readonly" : "",
|
|
MAJOR(ROOT_DEV), MINOR(ROOT_DEV));
|
|
|
|
out:
|
|
if (p)
|
|
put_page(p);
|
|
return ret;
|
|
}
|
|
|
|
void __init mount_root_generic(char *name, char *pretty_name, int flags)
|
|
{
|
|
struct page *page = alloc_page(GFP_KERNEL);
|
|
char *fs_names = page_address(page);
|
|
char *p;
|
|
char b[BDEVNAME_SIZE];
|
|
int num_fs, i;
|
|
|
|
scnprintf(b, BDEVNAME_SIZE, "unknown-block(%u,%u)",
|
|
MAJOR(ROOT_DEV), MINOR(ROOT_DEV));
|
|
if (root_fs_names)
|
|
num_fs = split_fs_names(fs_names, PAGE_SIZE);
|
|
else
|
|
num_fs = list_bdev_fs_names(fs_names, PAGE_SIZE);
|
|
retry:
|
|
for (i = 0, p = fs_names; i < num_fs; i++, p += strlen(p)+1) {
|
|
int err;
|
|
|
|
if (!*p)
|
|
continue;
|
|
err = do_mount_root(name, p, flags, root_mount_data);
|
|
switch (err) {
|
|
case 0:
|
|
goto out;
|
|
case -EACCES:
|
|
case -EINVAL:
|
|
#ifdef CONFIG_BLOCK
|
|
init_flush_fput();
|
|
#endif
|
|
continue;
|
|
}
|
|
/*
|
|
* Allow the user to distinguish between failed sys_open
|
|
* and bad superblock on root device.
|
|
* and give them a list of the available devices
|
|
*/
|
|
printk("VFS: Cannot open root device \"%s\" or %s: error %d\n",
|
|
pretty_name, b, err);
|
|
printk("Please append a correct \"root=\" boot option; here are the available partitions:\n");
|
|
printk_all_partitions();
|
|
|
|
if (root_fs_names)
|
|
num_fs = list_bdev_fs_names(fs_names, PAGE_SIZE);
|
|
if (!num_fs)
|
|
pr_err("Can't find any bdev filesystem to be used for mount!\n");
|
|
else {
|
|
pr_err("List of all bdev filesystems:\n");
|
|
for (i = 0, p = fs_names; i < num_fs; i++, p += strlen(p)+1)
|
|
pr_err(" %s", p);
|
|
pr_err("\n");
|
|
}
|
|
|
|
panic("VFS: Unable to mount root fs on %s", b);
|
|
}
|
|
if (!(flags & SB_RDONLY)) {
|
|
flags |= SB_RDONLY;
|
|
goto retry;
|
|
}
|
|
|
|
printk("List of all partitions:\n");
|
|
printk_all_partitions();
|
|
printk("No filesystem could mount root, tried: ");
|
|
for (i = 0, p = fs_names; i < num_fs; i++, p += strlen(p)+1)
|
|
printk(" %s", p);
|
|
printk("\n");
|
|
panic("VFS: Unable to mount root fs on \"%s\" or %s", pretty_name, b);
|
|
out:
|
|
put_page(page);
|
|
}
|
|
|
|
#ifdef CONFIG_ROOT_NFS
|
|
|
|
#define NFSROOT_TIMEOUT_MIN 5
|
|
#define NFSROOT_TIMEOUT_MAX 30
|
|
#define NFSROOT_RETRY_MAX 5
|
|
|
|
static void __init mount_nfs_root(void)
|
|
{
|
|
char *root_dev, *root_data;
|
|
unsigned int timeout;
|
|
int try;
|
|
|
|
if (nfs_root_data(&root_dev, &root_data))
|
|
goto fail;
|
|
|
|
/*
|
|
* The server or network may not be ready, so try several
|
|
* times. Stop after a few tries in case the client wants
|
|
* to fall back to other boot methods.
|
|
*/
|
|
timeout = NFSROOT_TIMEOUT_MIN;
|
|
for (try = 1; ; try++) {
|
|
if (!do_mount_root(root_dev, "nfs", root_mountflags, root_data))
|
|
return;
|
|
if (try > NFSROOT_RETRY_MAX)
|
|
break;
|
|
|
|
/* Wait, in case the server refused us immediately */
|
|
ssleep(timeout);
|
|
timeout <<= 1;
|
|
if (timeout > NFSROOT_TIMEOUT_MAX)
|
|
timeout = NFSROOT_TIMEOUT_MAX;
|
|
}
|
|
fail:
|
|
pr_err("VFS: Unable to mount root fs via NFS.\n");
|
|
}
|
|
#else
|
|
static inline void mount_nfs_root(void)
|
|
{
|
|
}
|
|
#endif /* CONFIG_ROOT_NFS */
|
|
|
|
#ifdef CONFIG_CIFS_ROOT
|
|
|
|
#define CIFSROOT_TIMEOUT_MIN 5
|
|
#define CIFSROOT_TIMEOUT_MAX 30
|
|
#define CIFSROOT_RETRY_MAX 5
|
|
|
|
static void __init mount_cifs_root(void)
|
|
{
|
|
char *root_dev, *root_data;
|
|
unsigned int timeout;
|
|
int try;
|
|
|
|
if (cifs_root_data(&root_dev, &root_data))
|
|
goto fail;
|
|
|
|
timeout = CIFSROOT_TIMEOUT_MIN;
|
|
for (try = 1; ; try++) {
|
|
if (!do_mount_root(root_dev, "cifs", root_mountflags,
|
|
root_data))
|
|
return;
|
|
if (try > CIFSROOT_RETRY_MAX)
|
|
break;
|
|
|
|
ssleep(timeout);
|
|
timeout <<= 1;
|
|
if (timeout > CIFSROOT_TIMEOUT_MAX)
|
|
timeout = CIFSROOT_TIMEOUT_MAX;
|
|
}
|
|
fail:
|
|
pr_err("VFS: Unable to mount root fs via SMB.\n");
|
|
}
|
|
#else
|
|
static inline void mount_cifs_root(void)
|
|
{
|
|
}
|
|
#endif /* CONFIG_CIFS_ROOT */
|
|
|
|
static bool __init fs_is_nodev(char *fstype)
|
|
{
|
|
struct file_system_type *fs = get_fs_type(fstype);
|
|
bool ret = false;
|
|
|
|
if (fs) {
|
|
ret = !(fs->fs_flags & FS_REQUIRES_DEV);
|
|
put_filesystem(fs);
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
static int __init mount_nodev_root(char *root_device_name)
|
|
{
|
|
char *fs_names, *fstype;
|
|
int err = -EINVAL;
|
|
int num_fs, i;
|
|
|
|
fs_names = (void *)__get_free_page(GFP_KERNEL);
|
|
if (!fs_names)
|
|
return -EINVAL;
|
|
num_fs = split_fs_names(fs_names, PAGE_SIZE);
|
|
|
|
for (i = 0, fstype = fs_names; i < num_fs;
|
|
i++, fstype += strlen(fstype) + 1) {
|
|
if (!*fstype)
|
|
continue;
|
|
if (!fs_is_nodev(fstype))
|
|
continue;
|
|
err = do_mount_root(root_device_name, fstype, root_mountflags,
|
|
root_mount_data);
|
|
if (!err)
|
|
break;
|
|
}
|
|
|
|
free_page((unsigned long)fs_names);
|
|
return err;
|
|
}
|
|
|
|
#ifdef CONFIG_BLOCK
|
|
static void __init mount_block_root(char *root_device_name)
|
|
{
|
|
int err = create_dev("/dev/root", ROOT_DEV);
|
|
|
|
if (err < 0)
|
|
pr_emerg("Failed to create /dev/root: %d\n", err);
|
|
mount_root_generic("/dev/root", root_device_name, root_mountflags);
|
|
}
|
|
#else
|
|
static inline void mount_block_root(char *root_device_name)
|
|
{
|
|
}
|
|
#endif /* CONFIG_BLOCK */
|
|
|
|
void __init mount_root(char *root_device_name)
|
|
{
|
|
switch (ROOT_DEV) {
|
|
case Root_NFS:
|
|
mount_nfs_root();
|
|
break;
|
|
case Root_CIFS:
|
|
mount_cifs_root();
|
|
break;
|
|
case Root_Generic:
|
|
mount_root_generic(root_device_name, root_device_name,
|
|
root_mountflags);
|
|
break;
|
|
case 0:
|
|
if (root_device_name && root_fs_names &&
|
|
mount_nodev_root(root_device_name) == 0)
|
|
break;
|
|
fallthrough;
|
|
default:
|
|
mount_block_root(root_device_name);
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* wait for any asynchronous scanning to complete */
|
|
static void __init wait_for_root(char *root_device_name)
|
|
{
|
|
ktime_t end;
|
|
|
|
if (ROOT_DEV != 0)
|
|
return;
|
|
|
|
pr_info("Waiting for root device %s...\n", root_device_name);
|
|
|
|
end = ktime_add_ms(ktime_get_raw(), root_wait);
|
|
|
|
while (!driver_probe_done() ||
|
|
early_lookup_bdev(root_device_name, &ROOT_DEV) < 0) {
|
|
msleep(5);
|
|
if (root_wait > 0 && ktime_after(ktime_get_raw(), end))
|
|
break;
|
|
}
|
|
|
|
async_synchronize_full();
|
|
|
|
}
|
|
|
|
static dev_t __init parse_root_device(char *root_device_name)
|
|
{
|
|
int error;
|
|
dev_t dev;
|
|
|
|
if (!strncmp(root_device_name, "mtd", 3) ||
|
|
!strncmp(root_device_name, "ubi", 3))
|
|
return Root_Generic;
|
|
if (strcmp(root_device_name, "/dev/nfs") == 0)
|
|
return Root_NFS;
|
|
if (strcmp(root_device_name, "/dev/cifs") == 0)
|
|
return Root_CIFS;
|
|
if (strcmp(root_device_name, "/dev/ram") == 0)
|
|
return Root_RAM0;
|
|
|
|
error = early_lookup_bdev(root_device_name, &dev);
|
|
if (error) {
|
|
if (error == -EINVAL && root_wait) {
|
|
pr_err("Disabling rootwait; root= is invalid.\n");
|
|
root_wait = 0;
|
|
}
|
|
return 0;
|
|
}
|
|
return dev;
|
|
}
|
|
|
|
/*
|
|
* Prepare the namespace - decide what/where to mount, load ramdisks, etc.
|
|
*/
|
|
void __init prepare_namespace(void)
|
|
{
|
|
if (root_delay) {
|
|
printk(KERN_INFO "Waiting %d sec before mounting root device...\n",
|
|
root_delay);
|
|
ssleep(root_delay);
|
|
}
|
|
|
|
/*
|
|
* wait for the known devices to complete their probing
|
|
*
|
|
* Note: this is a potential source of long boot delays.
|
|
* For example, it is not atypical to wait 5 seconds here
|
|
* for the touchpad of a laptop to initialize.
|
|
*/
|
|
wait_for_device_probe();
|
|
|
|
md_run_setup();
|
|
|
|
if (saved_root_name[0])
|
|
ROOT_DEV = parse_root_device(saved_root_name);
|
|
|
|
initrd_load();
|
|
|
|
if (root_wait)
|
|
wait_for_root(saved_root_name);
|
|
mount_root(saved_root_name);
|
|
devtmpfs_mount();
|
|
|
|
if (init_pivot_root(".", ".")) {
|
|
pr_err("VFS: Failed to pivot into new rootfs\n");
|
|
return;
|
|
}
|
|
if (init_umount(".", MNT_DETACH)) {
|
|
pr_err("VFS: Failed to unmount old rootfs\n");
|
|
return;
|
|
}
|
|
pr_info("VFS: Pivoted into new rootfs\n");
|
|
}
|
|
|
|
static bool is_tmpfs;
|
|
static int rootfs_init_fs_context(struct fs_context *fc)
|
|
{
|
|
if (IS_ENABLED(CONFIG_TMPFS) && is_tmpfs)
|
|
return shmem_init_fs_context(fc);
|
|
|
|
return ramfs_init_fs_context(fc);
|
|
}
|
|
|
|
struct file_system_type rootfs_fs_type = {
|
|
.name = "rootfs",
|
|
.init_fs_context = rootfs_init_fs_context,
|
|
.kill_sb = kill_anon_super,
|
|
};
|
|
|
|
void __init init_rootfs(void)
|
|
{
|
|
if (IS_ENABLED(CONFIG_TMPFS)) {
|
|
if (!saved_root_name[0] && !root_fs_names)
|
|
is_tmpfs = true;
|
|
else if (root_fs_names && !!strstr(root_fs_names, "tmpfs"))
|
|
is_tmpfs = true;
|
|
}
|
|
}
|