summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/ramfs-rootfs-initramfs.rst26
-rw-r--r--fs/Makefile2
-rw-r--r--fs/init.c17
-rw-r--r--fs/internal.h1
-rw-r--r--fs/mount.h1
-rw-r--r--fs/namespace.c159
-rw-r--r--fs/nullfs.c70
-rw-r--r--include/linux/init_syscalls.h1
-rw-r--r--include/uapi/linux/magic.h1
-rw-r--r--init/do_mounts.c12
10 files changed, 216 insertions, 74 deletions
diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.rst b/Documentation/filesystems/ramfs-rootfs-initramfs.rst
index a9d271e171c3..165117a721ce 100644
--- a/Documentation/filesystems/ramfs-rootfs-initramfs.rst
+++ b/Documentation/filesystems/ramfs-rootfs-initramfs.rst
@@ -76,10 +76,10 @@ What is rootfs?
---------------
Rootfs is a special instance of ramfs (or tmpfs, if that's enabled), which is
-always present in 2.6 systems. You can't unmount rootfs for approximately the
-same reason you can't kill the init process; rather than having special code
-to check for and handle an empty list, it's smaller and simpler for the kernel
-to just make sure certain lists can't become empty.
+always present in Linux systems. The kernel uses an immutable empty filesystem
+called nullfs as the true root of the VFS hierarchy, with the mutable rootfs
+(tmpfs/ramfs) mounted on top of it. This allows pivot_root() and unmounting
+of the initramfs to work normally.
Most systems just mount another filesystem over rootfs and ignore it. The
amount of space an empty instance of ramfs takes up is tiny.
@@ -121,16 +121,14 @@ All this differs from the old initrd in several ways:
program. See the switch_root utility, below.)
- When switching another root device, initrd would pivot_root and then
- umount the ramdisk. But initramfs is rootfs: you can neither pivot_root
- rootfs, nor unmount it. Instead delete everything out of rootfs to
- free up the space (find -xdev / -exec rm '{}' ';'), overmount rootfs
- with the new root (cd /newmount; mount --move . /; chroot .), attach
- stdin/stdout/stderr to the new /dev/console, and exec the new init.
-
- Since this is a remarkably persnickety process (and involves deleting
- commands before you can run them), the klibc package introduced a helper
- program (utils/run_init.c) to do all this for you. Most other packages
- (such as busybox) have named this command "switch_root".
+ umount the ramdisk. With nullfs as the true root, pivot_root() works
+ normally from the initramfs. Userspace can simply do::
+
+ chdir(new_root);
+ pivot_root(".", ".");
+ umount2(".", MNT_DETACH);
+
+ This is the preferred method for switching root filesystems.
Populating initramfs:
---------------------
diff --git a/fs/Makefile b/fs/Makefile
index f238cc5ea2e9..cf4a745e9679 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -16,7 +16,7 @@ obj-y := open.o read_write.o file_table.o super.o \
stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
fs_dirent.o fs_context.o fs_parser.o fsopen.o init.o \
kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \
- file_attr.o fserror.o
+ file_attr.o fserror.o nullfs.o
obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o
obj-$(CONFIG_PROC_FS) += proc_namespace.o
diff --git a/fs/init.c b/fs/init.c
index e0f5429c0a49..e33b2690d851 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -13,6 +13,23 @@
#include <linux/security.h>
#include "internal.h"
+/*
+ * init_pivot_root - pivot_root() variant for in-kernel init code.
+ * @new_root: kernel-space path string of the directory to become the new root
+ * @put_old: kernel-space path string of the directory that receives the old root
+ *
+ * Resolves both strings with kern_path() and hands the resulting paths to
+ * path_pivot_root(), which performs the checks and the pivot itself.
+ *
+ * Return: the kern_path() error if either lookup fails, otherwise whatever
+ * path_pivot_root() returns.
+ */
+int __init init_pivot_root(const char *new_root, const char *put_old)
+{
+ /*
+ * Both paths carry the __free(path_put) cleanup attribute; presumably
+ * they are released automatically on every return path - confirm.
+ */
+ struct path new_path __free(path_put) = {};
+ struct path old_path __free(path_put) = {};
+ int ret;
+
+ /* Same lookup flags the pivot_root() syscall uses for its user paths. */
+ ret = kern_path(new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new_path);
+ if (ret)
+ return ret;
+
+ ret = kern_path(put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old_path);
+ if (ret)
+ return ret;
+
+ return path_pivot_root(&new_path, &old_path);
+}
+
int __init init_mount(const char *dev_name, const char *dir_name,
const char *type_page, unsigned long flags, void *data_page)
{
diff --git a/fs/internal.h b/fs/internal.h
index 18a062c1b5b0..9514d80ef5c4 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -90,6 +90,7 @@ extern bool may_mount(void);
int path_mount(const char *dev_name, const struct path *path,
const char *type_page, unsigned long flags, void *data_page);
int path_umount(const struct path *path, int flags);
+int path_pivot_root(struct path *new, struct path *old);
int show_path(struct seq_file *m, struct dentry *root);
diff --git a/fs/mount.h b/fs/mount.h
index 2d28ef2a3aed..e0816c11a198 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -5,6 +5,7 @@
#include <linux/ns_common.h>
#include <linux/fs_pin.h>
+extern struct file_system_type nullfs_fs_type;
extern struct list_head notify_list;
struct mnt_namespace {
diff --git a/fs/namespace.c b/fs/namespace.c
index c58674a20cad..53d1055c1825 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -221,7 +221,7 @@ static int mnt_alloc_id(struct mount *mnt)
int res;
xa_lock(&mnt_id_xa);
- res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, XA_LIMIT(1, INT_MAX), GFP_KERNEL);
+ res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, xa_limit_31b, GFP_KERNEL);
if (!res)
mnt->mnt_id_unique = ++mnt_id_ctr;
xa_unlock(&mnt_id_xa);
@@ -4498,36 +4498,8 @@ bool path_is_under(const struct path *path1, const struct path *path2)
}
EXPORT_SYMBOL(path_is_under);
-/*
- * pivot_root Semantics:
- * Moves the root file system of the current process to the directory put_old,
- * makes new_root as the new root file system of the current process, and sets
- * root/cwd of all processes which had them on the current root to new_root.
- *
- * Restrictions:
- * The new_root and put_old must be directories, and must not be on the
- * same file system as the current process root. The put_old must be
- * underneath new_root, i.e. adding a non-zero number of /.. to the string
- * pointed to by put_old must yield the same directory as new_root. No other
- * file system may be mounted on put_old. After all, new_root is a mountpoint.
- *
- * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
- * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
- * in this situation.
- *
- * Notes:
- * - we don't move root/cwd if they are not at the root (reason: if something
- * cared enough to change them, it's probably wrong to force them elsewhere)
- * - it's okay to pick a root that isn't the root of a file system, e.g.
- * /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
- * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
- * first.
- */
-SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
- const char __user *, put_old)
+int path_pivot_root(struct path *new, struct path *old)
{
- struct path new __free(path_put) = {};
- struct path old __free(path_put) = {};
struct path root __free(path_put) = {};
struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
int error;
@@ -4535,28 +4507,18 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
if (!may_mount())
return -EPERM;
- error = user_path_at(AT_FDCWD, new_root,
- LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
- if (error)
- return error;
-
- error = user_path_at(AT_FDCWD, put_old,
- LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
- if (error)
- return error;
-
- error = security_sb_pivotroot(&old, &new);
+ error = security_sb_pivotroot(old, new);
if (error)
return error;
get_fs_root(current->fs, &root);
- LOCK_MOUNT(old_mp, &old);
+ LOCK_MOUNT(old_mp, old);
old_mnt = old_mp.parent;
if (IS_ERR(old_mnt))
return PTR_ERR(old_mnt);
- new_mnt = real_mount(new.mnt);
+ new_mnt = real_mount(new->mnt);
root_mnt = real_mount(root.mnt);
ex_parent = new_mnt->mnt_parent;
root_parent = root_mnt->mnt_parent;
@@ -4568,7 +4530,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
return -EINVAL;
if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
return -EINVAL;
- if (d_unlinked(new.dentry))
+ if (d_unlinked(new->dentry))
return -ENOENT;
if (new_mnt == root_mnt || old_mnt == root_mnt)
return -EBUSY; /* loop, on the same file system */
@@ -4576,15 +4538,15 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
return -EINVAL; /* not a mountpoint */
if (!mnt_has_parent(root_mnt))
return -EINVAL; /* absolute root */
- if (!path_mounted(&new))
+ if (!path_mounted(new))
return -EINVAL; /* not a mountpoint */
if (!mnt_has_parent(new_mnt))
return -EINVAL; /* absolute root */
/* make sure we can reach put_old from new_root */
- if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, &new))
+ if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, new))
return -EINVAL;
/* make certain new is below the root */
- if (!is_path_reachable(new_mnt, new.dentry, &root))
+ if (!is_path_reachable(new_mnt, new->dentry, &root))
return -EINVAL;
lock_mount_hash();
umount_mnt(new_mnt);
@@ -4603,10 +4565,55 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
unlock_mount_hash();
mnt_notify_add(root_mnt);
mnt_notify_add(new_mnt);
- chroot_fs_refs(&root, &new);
+ chroot_fs_refs(&root, new);
return 0;
}
+/*
+ * pivot_root Semantics:
+ * Moves the root file system of the current process to the directory put_old,
+ * makes new_root the new root file system of the current process, and sets
+ * root/cwd of all processes which had them on the current root to new_root.
+ *
+ * Restrictions:
+ * The new_root and put_old must be directories, and must not be on the
+ * same file system as the current process root. The put_old must be
+ * underneath new_root, i.e. adding a non-zero number of /.. to the string
+ * pointed to by put_old must yield the same directory as new_root. No other
+ * file system may be mounted on put_old. After all, new_root is a mountpoint.
+ *
+ * The immutable nullfs filesystem is mounted as the true root of the VFS
+ * hierarchy. The mutable rootfs (tmpfs/ramfs) is layered on top of this,
+ * allowing pivot_root() to work normally from initramfs.
+ *
+ * Notes:
+ * - we don't move root/cwd if they are not at the root (reason: if something
+ * cared enough to change them, it's probably wrong to force them elsewhere)
+ * - it's okay to pick a root that isn't the root of a file system, e.g.
+ * /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
+ * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
+ * first.
+ *
+ * This syscall entry point only resolves the two user-space path strings;
+ * the permission check, the security hook and the pivot itself are all done
+ * by path_pivot_root().
+ */
+SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
+ const char __user *, put_old)
+{
+ /*
+ * Both paths carry the __free(path_put) cleanup attribute; presumably
+ * they are released automatically on every return path - confirm.
+ */
+ struct path new __free(path_put) = {};
+ struct path old __free(path_put) = {};
+ int error;
+
+ /* Look up new_root relative to the current working directory. */
+ error = user_path_at(AT_FDCWD, new_root,
+ LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
+ if (error)
+ return error;
+
+ /* Look up put_old with the same flags. */
+ error = user_path_at(AT_FDCWD, put_old,
+ LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
+ if (error)
+ return error;
+
+ return path_pivot_root(&new, &old);
+}
+
static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
{
unsigned int flags = mnt->mnt.mnt_flags;
@@ -5969,24 +5976,62 @@ struct mnt_namespace init_mnt_ns = {
static void __init init_mount_tree(void)
{
- struct vfsmount *mnt;
- struct mount *m;
+ struct vfsmount *mnt, *nullfs_mnt;
+ struct mount *mnt_root;
struct path root;
+ /*
+ * We create two mounts:
+ *
+ * (1) nullfs with mount id 1
+ * (2) mutable rootfs with mount id 2
+ *
+ * with (2) mounted on top of (1).
+ */
+ nullfs_mnt = vfs_kern_mount(&nullfs_fs_type, 0, "nullfs", NULL);
+ if (IS_ERR(nullfs_mnt))
+ panic("VFS: Failed to create nullfs");
+
mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options);
if (IS_ERR(mnt))
panic("Can't create rootfs");
- m = real_mount(mnt);
- init_mnt_ns.root = m;
- init_mnt_ns.nr_mounts = 1;
- mnt_add_to_ns(&init_mnt_ns, m);
+ VFS_WARN_ON_ONCE(real_mount(nullfs_mnt)->mnt_id != 1);
+ VFS_WARN_ON_ONCE(real_mount(mnt)->mnt_id != 2);
+
+ /* The namespace root is the nullfs mnt. */
+ mnt_root = real_mount(nullfs_mnt);
+ init_mnt_ns.root = mnt_root;
+
+ /* Mount mutable rootfs on top of nullfs. */
+ root.mnt = nullfs_mnt;
+ root.dentry = nullfs_mnt->mnt_root;
+
+ LOCK_MOUNT_EXACT(mp, &root);
+ if (unlikely(IS_ERR(mp.parent)))
+ panic("VFS: Failed to mount rootfs on nullfs");
+ scoped_guard(mount_writer)
+ attach_mnt(real_mount(mnt), mp.parent, mp.mp);
+
+ pr_info("VFS: Finished mounting rootfs on nullfs\n");
+
+ /*
+ * We've dropped all locks here but that's fine. Not only are we
+ * the only task that's running, there's no other mount
+ * namespace in existence and the initial mount namespace is
+ * completely empty until we add the mounts we just created.
+ */
+ for (struct mount *p = mnt_root; p; p = next_mnt(p, mnt_root)) {
+ mnt_add_to_ns(&init_mnt_ns, p);
+ init_mnt_ns.nr_mounts++;
+ }
+
init_task.nsproxy->mnt_ns = &init_mnt_ns;
get_mnt_ns(&init_mnt_ns);
- root.mnt = mnt;
- root.dentry = mnt->mnt_root;
-
+ /* The root and pwd always point to the mutable rootfs. */
+ root.mnt = mnt;
+ root.dentry = mnt->mnt_root;
set_fs_pwd(current->fs, &root);
set_fs_root(current->fs, &root);
diff --git a/fs/nullfs.c b/fs/nullfs.c
new file mode 100644
index 000000000000..fdbd3e5d3d71
--- /dev/null
+++ b/fs/nullfs.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */
+#include <linux/fs/super_types.h>
+#include <linux/fs_context.h>
+#include <linux/magic.h>
+
+/* Superblock operations for nullfs: only statfs is set, to simple_statfs. */
+static const struct super_operations nullfs_super_operations = {
+ .statfs = simple_statfs,
+};
+
+/*
+ * Fill in the nullfs superblock @s and give it a single root inode.
+ *
+ * The root inode is set up by make_empty_dir_inode(), gets inode number 1
+ * and is flagged S_IMMUTABLE. @fc is not used here.
+ *
+ * Return: 0 on success, -ENOMEM if new_inode() or d_make_root() fails.
+ */
+static int nullfs_fs_fill_super(struct super_block *s, struct fs_context *fc)
+{
+ struct inode *inode;
+
+ /* Block size is one page; no export or xattr handlers are installed. */
+ s->s_maxbytes = MAX_LFS_FILESIZE;
+ s->s_blocksize = PAGE_SIZE;
+ s->s_blocksize_bits = PAGE_SHIFT;
+ s->s_magic = NULL_FS_MAGIC;
+ s->s_op = &nullfs_super_operations;
+ s->s_export_op = NULL;
+ s->s_xattr = NULL;
+ s->s_time_gran = 1;
+ s->s_d_flags = 0;
+
+ inode = new_inode(s);
+ if (!inode)
+ return -ENOMEM;
+
+ /* nullfs is permanently empty... */
+ make_empty_dir_inode(inode);
+ simple_inode_init_ts(inode);
+ inode->i_ino = 1;
+ /* ... and immutable. */
+ inode->i_flags |= S_IMMUTABLE;
+
+ /*
+ * NOTE(review): the inode is not released here when d_make_root()
+ * fails - presumably d_make_root() consumes it; confirm.
+ */
+ s->s_root = d_make_root(inode);
+ if (!s->s_root)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/*
+ * For now this is a single global instance. If needed we can make it
+ * mountable by userspace at which point we will need to make it
+ * multi-instance.
+ *
+ * The superblock is obtained through get_tree_single() with
+ * nullfs_fs_fill_super() as the fill callback; its result is returned
+ * unchanged.
+ */
+static int nullfs_fs_get_tree(struct fs_context *fc)
+{
+ return get_tree_single(fc, nullfs_fs_fill_super);
+}
+
+/* Filesystem context operations for nullfs: only get_tree is provided. */
+static const struct fs_context_operations nullfs_fs_context_ops = {
+ .get_tree = nullfs_fs_get_tree,
+};
+
+/*
+ * Initialise a filesystem context for nullfs.
+ *
+ * Installs nullfs_fs_context_ops, marks the context as global and presets
+ * the superblock flags (SB_NOUSER) and internal flags (SB_I_NOEXEC and
+ * SB_I_NODEV).
+ *
+ * Return: always 0.
+ */
+static int nullfs_init_fs_context(struct fs_context *fc)
+{
+ fc->ops = &nullfs_fs_context_ops;
+ fc->global = true;
+ fc->sb_flags = SB_NOUSER;
+ fc->s_iflags = SB_I_NOEXEC | SB_I_NODEV;
+ return 0;
+}
+
+/*
+ * Filesystem type descriptor for nullfs. Not static: it is declared extern
+ * in fs/mount.h and passed to vfs_kern_mount() by init_mount_tree() in
+ * fs/namespace.c.
+ */
+struct file_system_type nullfs_fs_type = {
+ .name = "nullfs",
+ .init_fs_context = nullfs_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
diff --git a/include/linux/init_syscalls.h b/include/linux/init_syscalls.h
index 92045d18cbfc..28776ee28d8e 100644
--- a/include/linux/init_syscalls.h
+++ b/include/linux/init_syscalls.h
@@ -17,3 +17,4 @@ int __init init_mkdir(const char *pathname, umode_t mode);
int __init init_rmdir(const char *pathname);
int __init init_utimes(char *filename, struct timespec64 *ts);
int __init init_dup(struct file *file);
+int __init init_pivot_root(const char *new_root, const char *put_old);
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 638ca21b7a90..4f2da935a76c 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -104,5 +104,6 @@
#define SECRETMEM_MAGIC 0x5345434d /* "SECM" */
#define PID_FS_MAGIC 0x50494446 /* "PIDF" */
#define GUEST_MEMFD_MAGIC 0x474d454d /* "GMEM" */
+#define NULL_FS_MAGIC 0x4E554C4C /* "NULL" */
#endif /* __LINUX_MAGIC_H__ */
diff --git a/init/do_mounts.c b/init/do_mounts.c
index 9c8a547075a7..55ed3ac0b70f 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -483,8 +483,16 @@ void __init prepare_namespace(void)
wait_for_root(saved_root_name);
mount_root(saved_root_name);
devtmpfs_mount();
- init_mount(".", "/", NULL, MS_MOVE, NULL);
- init_chroot(".");
+
+ if (init_pivot_root(".", ".")) {
+ pr_err("VFS: Failed to pivot into new rootfs\n");
+ return;
+ }
+ if (init_umount(".", MNT_DETACH)) {
+ pr_err("VFS: Failed to unmount old rootfs\n");
+ return;
+ }
+ pr_info("VFS: Pivoted into new rootfs\n");
}
static bool is_tmpfs;