snarkmaster/locked_mount_via_propagation.cpp

## locked_mount_via_propagation.cpp
/*
This shows one of two possible methods (as of Linux v6.4.9) for setting up a
MNT_LOCK_READONLY mount, which is a mount that cannot be remounted
read-write even by a fully privileged super-user.  The same principle applies
to locking NODEV, NOSUID, and NOEXEC [1].

Hopefully, at some point the Linux kernel will support setting locked bits
via `mount_setattr` [2], but as of v6.4.9, this appears to be the simplest way.

This `attach_recursive_mnt`-based method works best if you can have a
persistent daemon, which exchanges FDs with the actual mounting process over
AF_UNIX sockets using SCM_RIGHTS.  Then, you do NOT need a `clone` per mount.

If you're going to spawn a helper process per mount anyway, then the
`copy_mnt_ns` method is better, see:
https://gist.github.com/snarkmaster/5ca6b668499bf2c9010fd68227d64887

This works as follows:
 - As seen in [1], whether a mount is locked is a bit on `struct mount`. Once
   locked, these bits cannot be cleared.
 - The call chain `move_mount` -> `do_move_mount` -> `attach_recursive_mnt`
   will trigger `lock_mnt_tree` if the mount's user namespace differs from
   the calling process's user namespace, see [3].

Mount locking is a standard part of the user NS security model, per `man
mount_namespaces`:

> A mount namespace has an owner user namespace.  A mount namespace whose
> owner user namespace is different from the owner user namespace of its
> parent mount namespace is considered a less privileged mount namespace.
>
> The mount(2) flags MS_RDONLY, MS_NOSUID, MS_NOEXEC, and the "atime" flags
> (MS_NOATIME, MS_NODIRATIME, MS_RELATIME) settings become locked when
> propagated from a more privileged to a less privileged mount namespace,
> and may not be changed in the less privileged mount namespace.

The only innovation here is that we generate a locked mount (via mount
propagation), and ship it back to the originating namespace, showing that
mounts can be locked even within the outermost user NS.

Demo:

mkdir src dest
touch src/foo
g++ -Wall -o locked_mount_via_propagation locked_mount_via_propagation.cpp
./locked_mount_via_propagation src dest
mount -o rw,remount dest
mount: /tmp/tmp.TDKtNTRrOZ/dest: permission denied.

This is NOT production code. Some problems deliberately left for brevity:
  - doing work in post-clone code -- risky / incompatible with threaded code
    and sanitizers,
  - not cleaning up stray mounts on error paths,
  - `fflush` / `perror` in the `clone` code paths,
  - failing to use AT_SYMLINK_NOFOLLOW / AT_RECURSIVE as appropriate,
  - not closing FDs or setting CLOEXEC by default.

Credit: sargun@sargun.me for the clone3 / CLONE_FILES trick + waitid(P_PIDFD)

References:
[1] https://elixir.bootlin.com/linux/v6.4.9/source/fs/namespace.c#L2069
[2]
https://lore.kernel.org/linux-fsdevel/20230810090044.1252084-1-sargun@sargun.me
[3] https://elixir.bootlin.com/linux/v6.4.9/source/fs/namespace.c#L2261
*/
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#include <functional>
#include <vector>

// START: Syscall stubs to aid compilation with older userspace

namespace {

// Raw `clone3` syscall.  stdout and stderr are flushed before the call
// (presumably so the child does not re-emit output that was still buffered).
// The syscall's result is narrowed to `pid_t` and returned unchanged; -1
// means failure, with the cause in `errno`.
static pid_t sys_clone3(struct clone_args *args, size_t size) {
  fflush(stdout);
  fflush(stderr);
  const long rc = syscall(__NR_clone3, args, size);
  return static_cast<pid_t>(rc);
}

// Raw `open_tree` syscall: forwards `dfd`, `filename` and `flags` verbatim
// and returns the result as an `int` (-1 on failure, `errno` set).
int sys_open_tree(int dfd, const char *filename, unsigned int flags) {
  const long rc = syscall(__NR_open_tree, dfd, filename, flags);
  return static_cast<int>(rc);
}

// Raw `mount_setattr` syscall.  The size of `struct mount_attr` as compiled
// here is supplied as the syscall's trailing size argument, so callers only
// pass the struct pointer.  Returns -1 on failure with `errno` set.
static inline int sys_mount_setattr(
    int dfd, const char *path, unsigned int flags, struct mount_attr *attr) {
  const size_t attrSize = sizeof(*attr);
  const long rc = syscall(__NR_mount_setattr, dfd, path, flags, attr, attrSize);
  return static_cast<int>(rc);
}

// Raw `move_mount` syscall: all five arguments are passed through untouched.
// Returns -1 on failure with `errno` set.
int sys_move_mount(
    int from_dfd,
    const char *from_path,
    int to_dfd,
    const char *to_path,
    unsigned int flags) {
  const long rc =
      syscall(__NR_move_mount, from_dfd, from_path, to_dfd, to_path, flags);
  return static_cast<int>(rc);
}

// Raw `waitid` syscall, including the kernel-only fifth `rusage` argument.
// Returns -1 on failure with `errno` set.
int sys_waitid(
    int which,
    pid_t pid,
    siginfo_t *info,
    int options,
    struct rusage *ru) {
  const long rc = syscall(__NR_waitid, which, pid, info, options, ru);
  return static_cast<int>(rc);
}

} // namespace

// END: Syscall stubs

// Evaluates `x` exactly once and yields its value.  If that value is -1 (the
// failure sentinel of the syscall wrappers used in this file), the source text
// of `x` is reported through `perror` and the process terminates at once via
// `_exit(-errno)`.
//
// `errno` is copied before `perror` runs: `perror` itself sets `errno` if
// writing to stderr fails, which would otherwise corrupt the exit status.
// The body is an immediately-invoked `[&]` lambda so the macro can be used as
// an expression while still referring to the caller's locals.
#define ERR_EXIT(x)                 \
  ([&]() {                          \
    auto ret = (x);                 \
    if (ret == -1) {                \
      const int savedErrno = errno; \
      perror(#x);                   \
      _exit(-savedErrno);           \
    }                               \
    return ret;                     \
  }())

// Runs `childFn` in a child created with `clone3` and returns file descriptors
// that the child installed for the caller.
//
// How it works: `numFds` placeholder memfds are opened first.  The child is
// always cloned with CLONE_FILES (sharing the descriptor table with the
// caller) and CLONE_PIDFD, plus whatever `cloneFlags` adds.  In the child,
// `childFn(outFds, numFds)` must store `numFds` open descriptors into
// `outFds`; each one is then `dup2`'ed over the matching placeholder and the
// child exits with status 0.
//
// Parameters:
//   numFds     - how many descriptors the child has to produce.
//   childFn    - callback executed in the child only.
//   cloneFlags - extra `clone3` flags, e.g. CLONE_NEWUSER | CLONE_NEWNS.
//
// Returns the placeholder descriptor numbers, which now refer to whatever the
// child duplicated onto them.  Any failure (placeholder creation, clone, wait,
// or a child that did not exit with status 0) terminates the process.
std::vector<int> cloneChildAndAwaitFds(
    size_t numFds,
    std::function<void(int*, size_t)> childFn,
    __u64 cloneFlags) {
  std::vector<int> fdsToReplace;
  fdsToReplace.reserve(numFds);
  for (size_t i = 0; i < numFds; ++i) {
    fdsToReplace.push_back(ERR_EXIT(memfd_create("fd_replaced_by_child", 0)));
  }

  // Scratch space for the child's results.  It is allocated before cloning so
  // that the child does not need to allocate, and it replaces a variable-length
  // array, which is not standard C++.
  std::vector<int> outFds(numFds, -1);

  int childPidfd = -1;
  struct clone_args cloneArgs = {};
  cloneArgs.flags = CLONE_PIDFD | CLONE_FILES | cloneFlags;
  cloneArgs.pidfd = reinterpret_cast<__u64>(&childPidfd);
  cloneArgs.exit_signal = SIGCHLD;

  if (0 == ERR_EXIT(sys_clone3(&cloneArgs, sizeof(cloneArgs)))) {
    // Child: produce the descriptors, publish them over the placeholders, and
    // leave without running destructors or atexit handlers.
    childFn(outFds.data(), numFds);
    for (size_t i = 0; i < numFds; ++i) {
      ERR_EXIT(dup2(outFds[i], fdsToReplace[i]));
    }
    _exit(0);
  }

  siginfo_t info = {};
  ERR_EXIT(sys_waitid(P_PIDFD, childPidfd, &info, WEXITED, nullptr));
  close(childPidfd);

  // For waitid, `si_code` says how the child ended and `si_status` holds the
  // plain exit code (or signal number); it is not a wait-status word, so the
  // WIFEXITED / WEXITSTATUS macros must not be applied to it.
  if (info.si_code != CLD_EXITED || info.si_status != 0) {
    fprintf(
        stderr,
        "cloneChildAndAwaitFds: child failed (si_code=%d, si_status=%d)\n",
        info.si_code,
        info.si_status);
    _exit(1);
  }
  return fdsToReplace;
}

// Demo driver: attaches a read-only clone of `src` at `dest` and then checks
// that the read-only flag can no longer be cleared.
//
// Steps, in order:
//   1. Create a scratch "tunnel" directory and attach a clone of it on top of
//      itself, with MS_SLAVE propagation requested on the clone.
//   2. Run a helper in new user + mount namespaces.  It returns a handle to
//      its view of the tunnel plus an FD for its mount namespace, and creates
//      the `mountpoint` subdirectory inside the tunnel.
//   3. Attach a read-only clone of `src` at `mountpoint` under the tunnel.
//   4. Run a second helper that joins the first helper's mount namespace and
//      clones whatever is mounted at `mountpoint` there.
//   5. Attach that clone at `dest`, detach the tunnel, and verify that
//      clearing MOUNT_ATTR_RDONLY on `dest` fails with EPERM.
//
// Returns 0 on success, 1 on bad usage or when the final verification does not
// see EPERM.  Any syscall wrapped in ERR_EXIT terminates the process on error.
int main(int argc, char** argv) {
  if (argc < 3) {
    // argv[0] can be missing when the program is exec'ed with an empty argv.
    const char* progName = (argc > 0 && argv[0] != nullptr)
        ? argv[0]
        : "locked_mount_via_propagation";
    fprintf(stderr, "Usage: %s src dest\n", progName);
    return 1;
  }
  const char* src = argv[1];
  const char* dest = argv[2];

  char tempDir[] = "/tmp/mnt_prop_tunnel.XXXXXX";
  if (mkdtemp(tempDir) == nullptr) {
    perror("mkdtemp");
    _exit(1);
  }

  // Step 1: detached clone of the tunnel directory, re-attached over itself.
  auto origTreeFd = ERR_EXIT(sys_open_tree(AT_FDCWD, tempDir, OPEN_TREE_CLONE));
  {
    struct mount_attr attr = {};
    attr.propagation = MS_SLAVE;
    ERR_EXIT(sys_mount_setattr(origTreeFd, "", AT_EMPTY_PATH, &attr));
    ERR_EXIT(sys_move_mount(
        origTreeFd, "", AT_FDCWD, tempDir, MOVE_MOUNT_F_EMPTY_PATH));
  }

  // Step 2: helper in fresh user + mount namespaces.  Every call is checked so
  // that a failure is reported where it happens instead of surfacing later as
  // an unrelated error in the parent.
  auto helperOutFds = cloneChildAndAwaitFds(
      2,
      [&](int* outFds, size_t numFds) {
        if (numFds != 2) { _exit(1); }
        outFds[0] = ERR_EXIT(sys_open_tree(AT_FDCWD, tempDir, 0));
        outFds[1] = ERR_EXIT(open("/proc/self/ns/mnt", O_RDONLY));
        ERR_EXIT(chdir(tempDir));
        ERR_EXIT(mkdir("mountpoint", 0777));
      },
      CLONE_NEWUSER | CLONE_NEWNS);

  // Step 3: read-only clone of `src`, attached below the tunnel.
  {
    auto srcFd = ERR_EXIT(sys_open_tree(AT_FDCWD, src, OPEN_TREE_CLONE));
    struct mount_attr attr = {};
    attr.attr_set = MOUNT_ATTR_RDONLY;
    ERR_EXIT(sys_mount_setattr(srcFd, "", AT_EMPTY_PATH, &attr));
    ERR_EXIT(sys_move_mount(
        srcFd, "", origTreeFd, "mountpoint", MOVE_MOUNT_F_EMPTY_PATH));
  }

  // Step 4: second helper enters the first helper's mount namespace and clones
  // the mount found at `mountpoint` relative to that helper's tunnel handle.
  auto lockedDestFdVec = cloneChildAndAwaitFds(
      1,
      [&](int* outFds, size_t numFds) {
        if (numFds != 1) { _exit(1); }
        ERR_EXIT(setns(helperOutFds[1], CLONE_NEWNS));
        outFds[0] = ERR_EXIT(sys_open_tree(
            helperOutFds[0], "mountpoint", OPEN_TREE_CLONE));
      },
      0);

  // Step 5: attach the clone at `dest` and drop the tunnel mount.
  ERR_EXIT(sys_move_mount(
      lockedDestFdVec[0], "", AT_FDCWD, dest, MOVE_MOUNT_F_EMPTY_PATH));

  ERR_EXIT(umount2(tempDir, MNT_DETACH));

  // Verification: clearing read-only must be refused.  This is an explicit
  // check rather than an `assert`, so it still runs when NDEBUG is defined.
  struct mount_attr clearRdOnly = {};
  clearRdOnly.attr_clr = MOUNT_ATTR_RDONLY;
  const int clearRc = sys_mount_setattr(AT_FDCWD, dest, 0, &clearRdOnly);
  const int clearErrno = errno;
  if (clearRc != -1 || clearErrno != EPERM) {
    fprintf(
        stderr,
        "%s: clearing MOUNT_ATTR_RDONLY was not refused with EPERM "
        "(rc=%d, errno=%d)\n",
        dest,
        clearRc,
        clearRc == -1 ? clearErrno : 0);
    return 1;
  }
  return 0;
}
	/*
	This shows one of two possible methods (as of Linux v6.4.9) for setting up a
	MNT_LOCK_READONLY mount, which is a mount that cannot be remounted
	read-write even by a fully privileged super-user. The same principle applies
	to locking NODEV, NOSUID, and NOEXEC [1].

	Hopefully, at some point the Linux kernel will support setting locked bits
	via `mount_setattr` [2], but as of v6.4.9, this appears to be the simplest way.

	This `attach_recursive_mnt`-based method works best if you can have a
	persistent daemon, which exchanges FDs with the actual mounting process over
	AF_UNIX sockets using SCM_RIGHTS. Then, you do NOT need a `clone` per mount.

	If you're going to spawn a helper process per mount anyway, then the
	`copy_mnt_ns` method is better, see:
	https://gist.github.com/snarkmaster/5ca6b668499bf2c9010fd68227d64887

	This works as follows:
	- As seen in [1], whether a mount is locked is a bit on `struct mount`. Once
	locked, these bits cannot be cleared.
	- The call chain `move_mount` -> `do_move_mount` -> `attach_recursive_mnt`
	will trigger `lock_mnt_tree` if the mount's user namespace differs from
	the calling process's user namespace, see [3].

	Mount locking is a standard part of the user NS security model, per `man
	mount_namespaces`:

	> A mount namespace has an owner user namespace. A mount namespace whose
	> owner user namespace is different from the owner user namespace of its
	> parent mount namespace is considered a less privileged mount namespace.
	>
	> The mount(2) flags MS_RDONLY, MS_NOSUID, MS_NOEXEC, and the "atime" flags
	> (MS_NOATIME, MS_NODIRATIME, MS_RELATIME) settings become locked when
	> propagated from a more privileged to a less privileged mount namespace,
	> and may not be changed in the less privileged mount namespace.

	The only innovation here is that we generate a locked mount (via mount
	propagation), and ship it back to the originating namespace, showing that
	mounts can be locked even within the outermost user NS.

	Demo:

	mkdir src dest
	touch src/foo
	g++ -Wall -o locked_mount_via_propagation locked_mount_via_propagation.cpp
	./locked_mount_via_propagation src dest
	mount -o rw,remount dest
	mount: /tmp/tmp.TDKtNTRrOZ/dest: permission denied.

	This is NOT production code. Some problems deliberately left for brevity:
	- doing work in post-clone code -- risky / incompatible with threaded code
	and sanitizers,
	- not cleaning up stray mounts on error paths,
	- `fflush` / `perror` in the `clone` code paths,
	- failing to use AT_SYMLINK_NOFOLLOW / AT_RECURSIVE as appropriate,
	- not closing FDs or setting CLOEXEC by default.

	Credit: sargun@sargun.me for the clone3 / CLONE_FILES trick + waitid(P_PIDFD)

	References:
	[1] https://elixir.bootlin.com/linux/v6.4.9/source/fs/namespace.c#L2069
	[2]
	https://lore.kernel.org/linux-fsdevel/20230810090044.1252084-1-sargun@sargun.me
	[3] https://elixir.bootlin.com/linux/v6.4.9/source/fs/namespace.c#L2261
	*/
	#include <assert.h>
	#include <errno.h>
	#include <fcntl.h>
	#include <linux/sched.h>
	#include <linux/types.h>
	#include <sched.h>
	#include <signal.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <sys/mount.h>
	#include <sys/stat.h>
	#include <sys/syscall.h>
	#include <sys/types.h>
	#include <sys/wait.h>
	#include <unistd.h>

	#include <functional>
	#include <vector>

	// START: Syscall stubs to aid compilation with older userspace

	namespace {

	static pid_t sys_clone3(struct clone_args *args, size_t size) {
	fflush(stdout);
	fflush(stderr);
	return syscall(__NR_clone3, args, size);
	}

	int sys_open_tree(int dfd, const char *filename, unsigned int flags) {
	return syscall(__NR_open_tree, dfd, filename, flags);
	}

	static inline int sys_mount_setattr(
	int dfd, const char path, unsigned int flags, struct mount_attr attr) {
	return syscall(__NR_mount_setattr, dfd, path, flags, attr, sizeof(*attr));
	}

	int sys_move_mount(
	int from_dfd,
	const char *from_path,
	int to_dfd,
	const char *to_path,
	unsigned int flags) {
	return syscall(__NR_move_mount, from_dfd, from_path, to_dfd, to_path, flags);
	}

	int sys_waitid(
	int which,
	pid_t pid,
	siginfo_t *info,
	int options,
	struct rusage *ru) {
	return syscall(__NR_waitid, which, pid, info, options, ru);
	}

	} // namespace

	// END: Syscall stubs

	// Evaluates `x` exactly once and yields its value.  If that value is -1 (the
	// failure sentinel of the syscall wrappers used in this file), the source
	// text of `x` is reported through `perror` and the process terminates at
	// once via `_exit(-errno)`.
	//
	// `errno` is copied before `perror` runs: `perror` itself sets `errno` if
	// writing to stderr fails, which would otherwise corrupt the exit status.
	// The body is an immediately-invoked `[&]` lambda so the macro can be used
	// as an expression while still referring to the caller's locals.
	#define ERR_EXIT(x)                 \
	  ([&]() {                          \
	    auto ret = (x);                 \
	    if (ret == -1) {                \
	      const int savedErrno = errno; \
	      perror(#x);                   \
	      _exit(-savedErrno);           \
	    }                               \
	    return ret;                     \
	  }())

	// Runs `childFn` in a child created with `clone3` and returns file
	// descriptors that the child installed for the caller.
	//
	// How it works: `numFds` placeholder memfds are opened first.  The child is
	// always cloned with CLONE_FILES (sharing the descriptor table with the
	// caller) and CLONE_PIDFD, plus whatever `cloneFlags` adds.  In the child,
	// `childFn(outFds, numFds)` must store `numFds` open descriptors into
	// `outFds`; each one is then `dup2`'ed over the matching placeholder and the
	// child exits with status 0.
	//
	// Parameters:
	//   numFds     - how many descriptors the child has to produce.
	//   childFn    - callback executed in the child only.
	//   cloneFlags - extra `clone3` flags, e.g. CLONE_NEWUSER | CLONE_NEWNS.
	//
	// Returns the placeholder descriptor numbers, which now refer to whatever
	// the child duplicated onto them.  Any failure (placeholder creation, clone,
	// wait, or a child that did not exit with status 0) terminates the process.
	std::vector<int> cloneChildAndAwaitFds(
	    size_t numFds,
	    std::function<void(int*, size_t)> childFn,
	    __u64 cloneFlags) {
	  std::vector<int> fdsToReplace;
	  fdsToReplace.reserve(numFds);
	  for (size_t i = 0; i < numFds; ++i) {
	    fdsToReplace.push_back(ERR_EXIT(memfd_create("fd_replaced_by_child", 0)));
	  }

	  // Scratch space for the child's results.  It is allocated before cloning
	  // so that the child does not need to allocate, and it replaces a
	  // variable-length array, which is not standard C++.
	  std::vector<int> outFds(numFds, -1);

	  int childPidfd = -1;
	  struct clone_args cloneArgs = {};
	  cloneArgs.flags = CLONE_PIDFD | CLONE_FILES | cloneFlags;
	  cloneArgs.pidfd = reinterpret_cast<__u64>(&childPidfd);
	  cloneArgs.exit_signal = SIGCHLD;

	  if (0 == ERR_EXIT(sys_clone3(&cloneArgs, sizeof(cloneArgs)))) {
	    // Child: produce the descriptors, publish them over the placeholders,
	    // and leave without running destructors or atexit handlers.
	    childFn(outFds.data(), numFds);
	    for (size_t i = 0; i < numFds; ++i) {
	      ERR_EXIT(dup2(outFds[i], fdsToReplace[i]));
	    }
	    _exit(0);
	  }

	  siginfo_t info = {};
	  ERR_EXIT(sys_waitid(P_PIDFD, childPidfd, &info, WEXITED, nullptr));
	  close(childPidfd);

	  // For waitid, `si_code` says how the child ended and `si_status` holds the
	  // plain exit code (or signal number); it is not a wait-status word, so the
	  // WIFEXITED / WEXITSTATUS macros must not be applied to it.
	  if (info.si_code != CLD_EXITED || info.si_status != 0) {
	    fprintf(
	        stderr,
	        "cloneChildAndAwaitFds: child failed (si_code=%d, si_status=%d)\n",
	        info.si_code,
	        info.si_status);
	    _exit(1);
	  }
	  return fdsToReplace;
	}

	// Demo driver: attaches a read-only clone of `src` at `dest` and then checks
	// that the read-only flag can no longer be cleared.
	//
	// Steps, in order:
	//   1. Create a scratch "tunnel" directory and attach a clone of it on top
	//      of itself, with MS_SLAVE propagation requested on the clone.
	//   2. Run a helper in new user + mount namespaces.  It returns a handle to
	//      its view of the tunnel plus an FD for its mount namespace, and
	//      creates the `mountpoint` subdirectory inside the tunnel.
	//   3. Attach a read-only clone of `src` at `mountpoint` under the tunnel.
	//   4. Run a second helper that joins the first helper's mount namespace and
	//      clones whatever is mounted at `mountpoint` there.
	//   5. Attach that clone at `dest`, detach the tunnel, and verify that
	//      clearing MOUNT_ATTR_RDONLY on `dest` fails with EPERM.
	//
	// Returns 0 on success, 1 on bad usage or when the final verification does
	// not see EPERM.  Any syscall wrapped in ERR_EXIT terminates the process on
	// error.
	int main(int argc, char** argv) {
	  if (argc < 3) {
	    // argv[0] can be missing when the program is exec'ed with an empty argv.
	    const char* progName = (argc > 0 && argv[0] != nullptr)
	        ? argv[0]
	        : "locked_mount_via_propagation";
	    fprintf(stderr, "Usage: %s src dest\n", progName);
	    return 1;
	  }
	  const char* src = argv[1];
	  const char* dest = argv[2];

	  char tempDir[] = "/tmp/mnt_prop_tunnel.XXXXXX";
	  if (mkdtemp(tempDir) == nullptr) {
	    perror("mkdtemp");
	    _exit(1);
	  }

	  // Step 1: detached clone of the tunnel directory, re-attached over itself.
	  auto origTreeFd = ERR_EXIT(sys_open_tree(AT_FDCWD, tempDir, OPEN_TREE_CLONE));
	  {
	    struct mount_attr attr = {};
	    attr.propagation = MS_SLAVE;
	    ERR_EXIT(sys_mount_setattr(origTreeFd, "", AT_EMPTY_PATH, &attr));
	    ERR_EXIT(sys_move_mount(
	        origTreeFd, "", AT_FDCWD, tempDir, MOVE_MOUNT_F_EMPTY_PATH));
	  }

	  // Step 2: helper in fresh user + mount namespaces.  Every call is checked
	  // so that a failure is reported where it happens instead of surfacing
	  // later as an unrelated error in the parent.
	  auto helperOutFds = cloneChildAndAwaitFds(
	      2,
	      [&](int* outFds, size_t numFds) {
	        if (numFds != 2) { _exit(1); }
	        outFds[0] = ERR_EXIT(sys_open_tree(AT_FDCWD, tempDir, 0));
	        outFds[1] = ERR_EXIT(open("/proc/self/ns/mnt", O_RDONLY));
	        ERR_EXIT(chdir(tempDir));
	        ERR_EXIT(mkdir("mountpoint", 0777));
	      },
	      CLONE_NEWUSER | CLONE_NEWNS);

	  // Step 3: read-only clone of `src`, attached below the tunnel.
	  {
	    auto srcFd = ERR_EXIT(sys_open_tree(AT_FDCWD, src, OPEN_TREE_CLONE));
	    struct mount_attr attr = {};
	    attr.attr_set = MOUNT_ATTR_RDONLY;
	    ERR_EXIT(sys_mount_setattr(srcFd, "", AT_EMPTY_PATH, &attr));
	    ERR_EXIT(sys_move_mount(
	        srcFd, "", origTreeFd, "mountpoint", MOVE_MOUNT_F_EMPTY_PATH));
	  }

	  // Step 4: second helper enters the first helper's mount namespace and
	  // clones the mount found at `mountpoint` relative to that helper's tunnel
	  // handle.
	  auto lockedDestFdVec = cloneChildAndAwaitFds(
	      1,
	      [&](int* outFds, size_t numFds) {
	        if (numFds != 1) { _exit(1); }
	        ERR_EXIT(setns(helperOutFds[1], CLONE_NEWNS));
	        outFds[0] = ERR_EXIT(sys_open_tree(
	            helperOutFds[0], "mountpoint", OPEN_TREE_CLONE));
	      },
	      0);

	  // Step 5: attach the clone at `dest` and drop the tunnel mount.
	  ERR_EXIT(sys_move_mount(
	      lockedDestFdVec[0], "", AT_FDCWD, dest, MOVE_MOUNT_F_EMPTY_PATH));

	  ERR_EXIT(umount2(tempDir, MNT_DETACH));

	  // Verification: clearing read-only must be refused.  This is an explicit
	  // check rather than an `assert`, so it still runs when NDEBUG is defined.
	  struct mount_attr clearRdOnly = {};
	  clearRdOnly.attr_clr = MOUNT_ATTR_RDONLY;
	  const int clearRc = sys_mount_setattr(AT_FDCWD, dest, 0, &clearRdOnly);
	  const int clearErrno = errno;
	  if (clearRc != -1 || clearErrno != EPERM) {
	    fprintf(
	        stderr,
	        "%s: clearing MOUNT_ATTR_RDONLY was not refused with EPERM "
	        "(rc=%d, errno=%d)\n",
	        dest,
	        clearRc,
	        clearRc == -1 ? clearErrno : 0);
	    return 1;
	  }
	  return 0;
	}