10 files changed, 216 insertions, 74 deletions
diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.rst b/Documentation/filesystems/ramfs-rootfs-initramfs.rst
index a9d271e171c3..165117a721ce 100644
--- a/Documentation/filesystems/ramfs-rootfs-initramfs.rst
+++ b/Documentation/filesystems/ramfs-rootfs-initramfs.rst
@@ -76,10 +76,10 @@ What is rootfs?
 ---------------
 
 Rootfs is a special instance of ramfs (or tmpfs, if that's enabled), which is
-always present in 2.6 systems.  You can't unmount rootfs for approximately the
-same reason you can't kill the init process; rather than having special code
-to check for and handle an empty list, it's smaller and simpler for the kernel
-to just make sure certain lists can't become empty.
+always present in Linux systems.  The kernel uses an immutable empty filesystem
+called nullfs as the true root of the VFS hierarchy, with the mutable rootfs
+(tmpfs/ramfs) mounted on top of it.  This allows pivot_root() and unmounting
+of the initramfs to work normally.
 
 Most systems just mount another filesystem over rootfs and ignore it.  The
 amount of space an empty instance of ramfs takes up is tiny.
@@ -121,16 +121,14 @@ All this differs from the old initrd in several ways:
     program.  See the switch_root utility, below.)
 
   - When switching another root device, initrd would pivot_root and then
-    umount the ramdisk.  But initramfs is rootfs: you can neither pivot_root
-    rootfs, nor unmount it.  Instead delete everything out of rootfs to
-    free up the space (find -xdev / -exec rm '{}' ';'), overmount rootfs
-    with the new root (cd /newmount; mount --move . /; chroot .), attach
-    stdin/stdout/stderr to the new /dev/console, and exec the new init.
-
-    Since this is a remarkably persnickety process (and involves deleting
-    commands before you can run them), the klibc package introduced a helper
-    program (utils/run_init.c) to do all this for you.  Most other packages
-    (such as busybox) have named this command "switch_root".
+    umount the ramdisk.  With nullfs as the true root, pivot_root() works
+    normally from the initramfs.  Userspace can simply do::
+
+      chdir(new_root);
+      pivot_root(".", ".");
+      umount2(".", MNT_DETACH);
+
+    This is the preferred method for switching root filesystems.
 
 Populating initramfs:
 ---------------------
diff --git a/fs/Makefile b/fs/Makefile
index f238cc5ea2e9..cf4a745e9679 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -16,7 +16,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
 		fs_dirent.o fs_context.o fs_parser.o fsopen.o init.o \
 		kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \
-		file_attr.o fserror.o
+		file_attr.o fserror.o nullfs.o
 
 obj-$(CONFIG_BUFFER_HEAD)	+= buffer.o mpage.o
 obj-$(CONFIG_PROC_FS)		+= proc_namespace.o
diff --git a/fs/init.c b/fs/init.c
index e0f5429c0a49..e33b2690d851 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -13,6 +13,23 @@
 #include <linux/security.h>
 #include "internal.h"
 
+int __init init_pivot_root(const char *new_root, const char *put_old)
+{
+	struct path new_path __free(path_put) = {};	/* auto path_put() */
+	struct path old_path __free(path_put) = {};	/* auto path_put() */
+	int ret;
+
+	ret = kern_path(new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new_path);
+	if (ret)
+		return ret;
+
+	ret = kern_path(put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old_path);
+	if (ret)
+		return ret;
+
+	return path_pivot_root(&new_path, &old_path);	/* as pivot_root(2) */
+}
+
 int __init init_mount(const char *dev_name, const char *dir_name,
 		const char *type_page, unsigned long flags, void *data_page)
 {
diff --git a/fs/internal.h b/fs/internal.h
index 18a062c1b5b0..9514d80ef5c4 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -90,6 +90,7 @@ extern bool may_mount(void);
 int path_mount(const char *dev_name, const struct path *path,
 		const char *type_page, unsigned long flags, void *data_page);
 int path_umount(const struct path *path, int flags);
+int path_pivot_root(struct path *new, struct path *old);
 
 int show_path(struct seq_file *m, struct dentry *root);
 
diff --git a/fs/mount.h b/fs/mount.h
index 2d28ef2a3aed..e0816c11a198 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -5,6 +5,7 @@
 #include <linux/ns_common.h>
 #include <linux/fs_pin.h>
 
+extern struct file_system_type nullfs_fs_type;
 extern struct list_head notify_list;
 
 struct mnt_namespace {
diff --git a/fs/namespace.c b/fs/namespace.c
index c58674a20cad..53d1055c1825 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -221,7 +221,7 @@ static int mnt_alloc_id(struct mount *mnt)
 	int res;
 
 	xa_lock(&mnt_id_xa);
-	res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, XA_LIMIT(1, INT_MAX), GFP_KERNEL);
+	res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, xa_limit_31b, GFP_KERNEL);
 	if (!res)
 		mnt->mnt_id_unique = ++mnt_id_ctr;
 	xa_unlock(&mnt_id_xa);
@@ -4498,36 +4498,8 @@ bool path_is_under(const struct path *path1, const struct path *path2)
 }
 EXPORT_SYMBOL(path_is_under);
 
-/*
- * pivot_root Semantics:
- * Moves the root file system of the current process to the directory put_old,
- * makes new_root as the new root file system of the current process, and sets
- * root/cwd of all processes which had them on the current root to new_root.
- *
- * Restrictions:
- * The new_root and put_old must be directories, and  must not be on the
- * same file  system as the current process root. The put_old  must  be
- * underneath new_root,  i.e. adding a non-zero number of /.. to the string
- * pointed to by put_old must yield the same directory as new_root. No other
- * file system may be mounted on put_old. After all, new_root is a mountpoint.
- *
- * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
- * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
- * in this situation.
- *
- * Notes:
- *  - we don't move root/cwd if they are not at the root (reason: if something
- *    cared enough to change them, it's probably wrong to force them elsewhere)
- *  - it's okay to pick a root that isn't the root of a file system, e.g.
- *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
- *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
- *    first.
- */
-SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
-		const char __user *, put_old)
+int path_pivot_root(struct path *new, struct path *old)
 {
-	struct path new __free(path_put) = {};
-	struct path old __free(path_put) = {};
 	struct path root __free(path_put) = {};
 	struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
 	int error;
@@ -4535,28 +4507,18 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	if (!may_mount())
 		return -EPERM;
 
-	error = user_path_at(AT_FDCWD, new_root,
-			     LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
-	if (error)
-		return error;
-
-	error = user_path_at(AT_FDCWD, put_old,
-			     LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
-	if (error)
-		return error;
-
-	error = security_sb_pivotroot(&old, &new);
+	error = security_sb_pivotroot(old, new);
 	if (error)
 		return error;
 
 	get_fs_root(current->fs, &root);
 
-	LOCK_MOUNT(old_mp, &old);
+	LOCK_MOUNT(old_mp, old);
 	old_mnt = old_mp.parent;
 	if (IS_ERR(old_mnt))
 		return PTR_ERR(old_mnt);
 
-	new_mnt = real_mount(new.mnt);
+	new_mnt = real_mount(new->mnt);
 	root_mnt = real_mount(root.mnt);
 	ex_parent = new_mnt->mnt_parent;
 	root_parent = root_mnt->mnt_parent;
@@ -4568,7 +4530,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 		return -EINVAL;
 	if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
 		return -EINVAL;
-	if (d_unlinked(new.dentry))
+	if (d_unlinked(new->dentry))
 		return -ENOENT;
 	if (new_mnt == root_mnt || old_mnt == root_mnt)
 		return -EBUSY; /* loop, on the same file system  */
@@ -4576,15 +4538,15 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 		return -EINVAL; /* not a mountpoint */
 	if (!mnt_has_parent(root_mnt))
 		return -EINVAL; /* absolute root */
-	if (!path_mounted(&new))
+	if (!path_mounted(new))
 		return -EINVAL; /* not a mountpoint */
 	if (!mnt_has_parent(new_mnt))
 		return -EINVAL; /* absolute root */
 	/* make sure we can reach put_old from new_root */
-	if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, &new))
+	if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, new))
 		return -EINVAL;
 	/* make certain new is below the root */
-	if (!is_path_reachable(new_mnt, new.dentry, &root))
+	if (!is_path_reachable(new_mnt, new->dentry, &root))
 		return -EINVAL;
 	lock_mount_hash();
 	umount_mnt(new_mnt);
@@ -4603,10 +4565,55 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	unlock_mount_hash();
 	mnt_notify_add(root_mnt);
 	mnt_notify_add(new_mnt);
-	chroot_fs_refs(&root, &new);
+	chroot_fs_refs(&root, new);
 	return 0;
 }
 
+/*
+ * pivot_root Semantics:
+ * Moves the root file system of the current process to the directory put_old,
+ * makes new_root the new root file system of the current process, and sets
+ * root/cwd of all processes which had them on the current root to new_root.
+ *
+ * Restrictions:
+ * The new_root and put_old must be directories, and must not be on the same
+ * file system as the current process root. The put_old must be underneath
+ * new_root, i.e. adding a non-zero number of /.. to the string pointed to by
+ * put_old must yield the same directory as new_root. No other file system
+ * may be mounted on put_old. After all, new_root is a mountpoint.
+ *
+ * The immutable nullfs filesystem is mounted as the true root of the VFS
+ * hierarchy. The mutable rootfs (tmpfs/ramfs) is layered on top of this,
+ * allowing pivot_root() to work normally from initramfs.
+ *
+ * Notes:
+ *  - we don't move root/cwd if they are not at the root (reason: if something
+ *    cared enough to change them, it's probably wrong to force them elsewhere)
+ *  - it's okay to pick a root that isn't the root of a file system, e.g.
+ *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
+ *    though, so you may need to mount --bind /nfs/my_root /nfs/my_root first.
+ *  - only path lookup happens here; path_pivot_root() does the actual pivot
+ */
+SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
+		const char __user *, put_old)
+{
+	struct path new __free(path_put) = {};
+	struct path old __free(path_put) = {};
+	int error;
+
+	error = user_path_at(AT_FDCWD, new_root,
+			     LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
+	if (error)
+		return error;
+
+	error = user_path_at(AT_FDCWD, put_old,
+			     LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
+	if (error)
+		return error;
+
+	return path_pivot_root(&new, &old);
+}
+
 static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
 {
 	unsigned int flags = mnt->mnt.mnt_flags;
@@ -5969,24 +5976,62 @@ struct mnt_namespace init_mnt_ns = {
 
 static void __init init_mount_tree(void)
 {
-	struct vfsmount *mnt;
-	struct mount *m;
+	struct vfsmount *mnt, *nullfs_mnt;
+	struct mount *mnt_root;
 	struct path root;
 
+	/*
+	 * We create two mounts:
+	 *
+	 * (1) nullfs with mount id 1
+	 * (2) mutable rootfs with mount id 2
+	 *
+	 * with (2) mounted on top of (1).
+	 */
+	nullfs_mnt = vfs_kern_mount(&nullfs_fs_type, 0, "nullfs", NULL);
+	if (IS_ERR(nullfs_mnt))
+		panic("VFS: Failed to create nullfs");
+
 	mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options);
 	if (IS_ERR(mnt))
 		panic("Can't create rootfs");
 
-	m = real_mount(mnt);
-	init_mnt_ns.root = m;
-	init_mnt_ns.nr_mounts = 1;
-	mnt_add_to_ns(&init_mnt_ns, m);
+	VFS_WARN_ON_ONCE(real_mount(nullfs_mnt)->mnt_id != 1);
+	VFS_WARN_ON_ONCE(real_mount(mnt)->mnt_id != 2);
+
+	/* The namespace root is the nullfs mnt. */
+	mnt_root		= real_mount(nullfs_mnt);
+	init_mnt_ns.root	= mnt_root;
+
+	/* Mount mutable rootfs on top of nullfs. */
+	root.mnt		= nullfs_mnt;
+	root.dentry		= nullfs_mnt->mnt_root;
+
+	LOCK_MOUNT_EXACT(mp, &root);
+	if (unlikely(IS_ERR(mp.parent)))
+		panic("VFS: Failed to mount rootfs on nullfs");
+	scoped_guard(mount_writer)
+		attach_mnt(real_mount(mnt), mp.parent, mp.mp);
+
+	pr_info("VFS: Finished mounting rootfs on nullfs\n");
+
+	/*
+	 * We've dropped all locks here but that's fine. Not only are we
+	 * the only task that's running, there's also no other mount
+	 * namespace in existence and the initial mount namespace is
+	 * completely empty until we add the mounts we just created.
+	 */
+	for (struct mount *p = mnt_root; p; p = next_mnt(p, mnt_root)) {
+		mnt_add_to_ns(&init_mnt_ns, p);
+		init_mnt_ns.nr_mounts++;
+	}
+
 	init_task.nsproxy->mnt_ns = &init_mnt_ns;
 	get_mnt_ns(&init_mnt_ns);
 
-	root.mnt = mnt;
-	root.dentry = mnt->mnt_root;
-
+	/* The root and pwd always point to the mutable rootfs. */
+	root.mnt	= mnt;
+	root.dentry	= mnt->mnt_root;
 	set_fs_pwd(current->fs, &root);
 	set_fs_root(current->fs, &root);
 
diff --git a/fs/nullfs.c b/fs/nullfs.c
new file mode 100644
index 000000000000..fdbd3e5d3d71
--- /dev/null
+++ b/fs/nullfs.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */
+#include <linux/fs/super_types.h>
+#include <linux/fs_context.h>
+#include <linux/magic.h>
+
+static const struct super_operations nullfs_super_operations = {
+	.statfs	= simple_statfs,	/* the only super operation nullfs sets */
+};
+
+static int nullfs_fs_fill_super(struct super_block *s, struct fs_context *fc)
+{
+	struct inode *inode;
+
+	s->s_maxbytes		= MAX_LFS_FILESIZE;
+	s->s_blocksize		= PAGE_SIZE;
+	s->s_blocksize_bits	= PAGE_SHIFT;
+	s->s_magic		= NULL_FS_MAGIC;	/* "NULL" in ASCII */
+	s->s_op			= &nullfs_super_operations;
+	s->s_export_op		= NULL;
+	s->s_xattr		= NULL;
+	s->s_time_gran		= 1;
+	s->s_d_flags		= 0;
+
+	inode = new_inode(s);
+	if (!inode)
+		return -ENOMEM;
+
+	/* The root directory is the sole inode: permanently empty... */
+	make_empty_dir_inode(inode);
+	simple_inode_init_ts(inode);
+	inode->i_ino	= 1;
+	/* ... and immutable. */
+	inode->i_flags |= S_IMMUTABLE;
+
+	s->s_root = d_make_root(inode);
+	if (!s->s_root)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/*
+ * For now this is a single global instance (hence get_tree_single()). If
+ * needed we can make it mountable by userspace, at which point we will
+ * need to make it multi-instance.
+ */
+static int nullfs_fs_get_tree(struct fs_context *fc)
+{
+	return get_tree_single(fc, nullfs_fs_fill_super);
+}
+
+static const struct fs_context_operations nullfs_fs_context_ops = {
+	.get_tree	= nullfs_fs_get_tree,	/* no option parsing hooks set */
+};
+
+static int nullfs_init_fs_context(struct fs_context *fc)
+{
+	fc->ops		= &nullfs_fs_context_ops;
+	fc->global	= true;
+	fc->sb_flags	= SB_NOUSER;	/* not mountable by userspace */
+	fc->s_iflags	= SB_I_NOEXEC | SB_I_NODEV;
+	return 0;
+}
+
+struct file_system_type nullfs_fs_type = {	/* used by init_mount_tree() */
+	.name			= "nullfs",
+	.init_fs_context	= nullfs_init_fs_context,
+	.kill_sb		= kill_anon_super,
+};
diff --git a/include/linux/init_syscalls.h b/include/linux/init_syscalls.h
index 92045d18cbfc..28776ee28d8e 100644
--- a/include/linux/init_syscalls.h
+++ b/include/linux/init_syscalls.h
@@ -17,3 +17,4 @@ int __init init_mkdir(const char *pathname, umode_t mode);
 int __init init_rmdir(const char *pathname);
 int __init init_utimes(char *filename, struct timespec64 *ts);
 int __init init_dup(struct file *file);
+int __init init_pivot_root(const char *new_root, const char *put_old);
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 638ca21b7a90..4f2da935a76c 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -104,5 +104,6 @@
 #define SECRETMEM_MAGIC		0x5345434d	/* "SECM" */
 #define PID_FS_MAGIC		0x50494446	/* "PIDF" */
 #define GUEST_MEMFD_MAGIC	0x474d454d	/* "GMEM" */
+#define NULL_FS_MAGIC		0x4E554C4C	/* "NULL" */
 
 #endif /* __LINUX_MAGIC_H__ */
diff --git a/init/do_mounts.c b/init/do_mounts.c
index 9c8a547075a7..55ed3ac0b70f 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -483,8 +483,16 @@ void __init prepare_namespace(void)
 		wait_for_root(saved_root_name);
 	mount_root(saved_root_name);
 	devtmpfs_mount();
-	init_mount(".", "/", NULL, MS_MOVE, NULL);
-	init_chroot(".");
+
+	if (init_pivot_root(".", ".")) {
+		pr_err("VFS: Failed to pivot into new rootfs\n");
+		return;
+	}
+	if (init_umount(".", MNT_DETACH)) {
+		pr_err("VFS: Failed to unmount old rootfs\n");
+		return;
+	}
+	pr_info("VFS: Pivoted into new rootfs\n");
 }
 
 static bool is_tmpfs;