snarkmaster/locked_mount_via_newns.cpp

## locked_mount_via_newns.cpp
/*
This shows one of two possible methods (as of Linux v6.4.9) for setting up a
MNT_LOCK_READONLY mount, which is a mount that cannot be remounted
read-write even by a fully privileged super-user.  The same principle applies
to locking NODEV, NOSUID, and NOEXEC [1].

Hopefully, at some point the Linux kernel will support setting locked bits
via `mount_setattr` -- see [2].

This works as follows:
 - As seen in [1], whether a mount is locked is a bit on `struct mount`. Once
   locked, these bits cannot be cleared [3].
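 - When we create the Helper child, it unshares into a new user + mount
   namespace. Every mount it inherits (including our read-only bind mount)
   becomes locked there, and a clone of that mount stays locked after it is
   handed back to the parent.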

Mount locking is a standard part of the user NS security model, per `man
mount_namespaces`:

> A mount namespace has an owner user namespace.  A mount namespace whose
> owner user namespace is different from the owner user namespace of its
> parent mount namespace is considered a less privileged mount namespace.
>
> The mount(2) flags MS_RDONLY, MS_NOSUID, MS_NOEXEC, and the "atime" flags
> (MS_NOATIME, MS_NODIRATIME, MS_RELATIME) settings become locked when
> propagated from a more privileged to a less privileged mount namespace,
> and may not be changed in the less privileged mount namespace.

The only innovation here is that after we generate a locked mount (by
creating a new user & mount NS), we ship it back to the originating
namespace, showing that mounts remain locked even once they return to the
outermost user NS.

This is NOT production code. Some problems deliberately left for brevity:
  - doing work (including `perror` without flushing!) in post-fork -- risky,
    incompatible with threads & sanitizers
  - read / write without EINTR / retries,
  - `SCM_RIGHTS` would be cleaner than `pidfd_getfd`,
  - failing to use AT_SYMLINK_NOFOLLOW / AT_RECURSIVE as appropriate,
  - not cleaning up stray mounts, especially on error paths,
  - not closing FDs.

Demo:

gcc -Wall -o locked_mount_via_newns locked_mount_via_newns.cpp
sudo env PREV_PWD=$(pwd) unshare -m  # `unshare` not required, can be `bash`
cd $(mktemp -d)
mkdir src dest temp
"$PREV_PWD"/locked_mount_via_newns src dest temp

$ touch src/foo
$ ls {src,dest,temp}/*
dest/foo  src/foo  temp/foo
$ touch dest/bar temp/bar
touch: cannot touch 'dest/bar': Read-only file system
touch: cannot touch 'temp/bar': Read-only file system
$ mount -o rw,remount temp
$ mount -o rw,remount dest
mount: /tmp/tmp.GnXKyj5evn/dest: permission denied.

References:
[1] https://elixir.bootlin.com/linux/v6.4.9/source/fs/namespace.c#L2069
[2]
https://lore.kernel.org/linux-fsdevel/20230810090044.1252084-1-sargun@sargun.me
[3] https://elixir.bootlin.com/linux/v6.4.9/source/fs/namespace.c#L2561
*/
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/types.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/mount.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

namespace {
// Issues the raw `SYS_pidfd_open` syscall with `(pid, flags)` and returns the
// result narrowed to `int`. `syscall` reports failure as -1 with `errno` set.
int pidfd_open(pid_t pid, unsigned int flags) noexcept {
  const long rc = ::syscall(SYS_pidfd_open, pid, flags);
  return static_cast<int>(rc);
}

// Issues the raw `SYS_pidfd_getfd` syscall with `(pidfd, targetfd, flags)` and
// returns the result narrowed to `int`. `syscall` reports failure as -1 with
// `errno` set.
int pidfd_getfd(int pidfd, int targetfd, unsigned int flags) noexcept {
  const long rc = ::syscall(SYS_pidfd_getfd, pidfd, targetfd, flags);
  return static_cast<int>(rc);
}
} // namespace

// Evaluates `x` exactly once and yields its value. If that value is -1, prints
// a `perror` message labelled with the text of `x` and terminates the process
// with `_exit(-errno)`.
//
// `errno` is snapshotted right after `x` is evaluated: `perror` is allowed to
// modify `errno`, and the exit status must describe the failure of `x`, not a
// failure of the diagnostic write.
//
// Expands to an immediately-invoked `[&]` lambda, so it can only be used
// inside a function body.
#define ERR_EXIT(x)                 \
  ([&]() {                          \
    auto ret = (x);                 \
    if (ret == -1) {                \
      const int savedErrno = errno; \
      perror(#x);                   \
      _exit(-savedErrno);           \
    }                               \
    return ret;                     \
  }())

// Demo entry point. Usage: `locked_mount_via_newns src dest temp`.
//
// Steps, in order:
//   1. Bind-mount `src` read-only at `temp`.
//   2. Fork a helper that unshares into a new user + mount namespace, clones
//      the `temp` mount there, and parks the clone at its FD 0.
//   3. Pull the helper's FD 0 into this process with `pidfd_getfd` and attach
//      it at `dest`.
//   4. Check that clearing read-only on `dest` is refused with EPERM.
//
// Returns 0 on success and 1 on a usage or handshake error. Any call wrapped
// in `ERR_EXIT` that returns -1 terminates the process from inside the macro.
int main(int argc, char** argv) {
  if (argc < 4) {
    fprintf(stderr, "Usage: %s src dest temp\n", argv[0]);
    return 1;
  }
  const char* src = argv[1];
  const char* dest = argv[2];
  const char* temp = argv[3];

  // Set up a read-only bind-mount of `src` and attach it to `temp`
  auto treeFd = ERR_EXIT(open_tree(AT_FDCWD, src, OPEN_TREE_CLONE));
  struct mount_attr attr { .attr_set = MOUNT_ATTR_RDONLY };
  ERR_EXIT(mount_setattr(
      treeFd, "", AT_EMPTY_PATH, &attr, sizeof(struct mount_attr)));
  ERR_EXIT(move_mount(
      treeFd, "", AT_FDCWD, temp, MOVE_MOUNT_F_EMPTY_PATH));

  // `fork` child writes `kSigil` to the pipe once its FD 0 is a locked tree
  const char kSigil[] = "DoNeSiGiL";
  // Signed copy of the sigil size, so `read` / `write` results (ssize_t) are
  // compared without a signed/unsigned mismatch.
  const ssize_t kSigilLen = sizeof(kSigil);
  int pipeFds[2];
  ERR_EXIT(pipe2(pipeFds, O_CLOEXEC));

  // The child is the "Helper" described in the file docblock.
  pid_t child = ERR_EXIT(fork());
  if (child == 0) {
    close(pipeFds[0]);  // Below, we use `EPIPE`-on-write as a signal to exit.

    // CRUCIAL: As we clone the mount namespace, all pre-existing mounts
    // (including `temp` above) become locked.
    ERR_EXIT(unshare(CLONE_NEWUSER | CLONE_NEWNS));

    auto lockedTreeFd = ERR_EXIT(open_tree(AT_FDCWD, temp, OPEN_TREE_CLONE));
    // Put the unattached mount at FD 0, and tell parent to `pidfd_getfd` it.
    ERR_EXIT(dup2(lockedTreeFd, 0));
    if (write(pipeFds[1], kSigil, sizeof(kSigil)) != kSigilLen) {
      perror("write kSigil");
      _exit(1);
    }
    // Parent will close the pipe after grabbing FD 0. Waiting for EPIPE
    // guarantees the child exits even if the parent crashes. If SIGPIPE is
    // not ignored, the failing write delivers SIGPIPE instead, which
    // terminates the child too.
    while (true) {
      sleep(1);
      if (-1 == write(pipeFds[1], kSigil, 1) && errno == EPIPE) {
        break;
      }
    }
    // The helper's job is done. Exit here so it can never fall through and
    // run the parent-only code below.
    _exit(0);
  }

  // Wait for child to prepare mount FD
  char readSigil[sizeof(kSigil)];
  if (read(pipeFds[0], readSigil, sizeof(readSigil)) != kSigilLen ||
      0 != strncmp(kSigil, readSigil, sizeof(kSigil))) {
    perror("read kSigil");
    return 1;
  }

  // Retrieve child FD 0, which is the new, now-locked, tree
  auto pidFd = ERR_EXIT(pidfd_open(child, 0));
  auto lockedFd = ERR_EXIT(pidfd_getfd(pidFd, 0, 0));

  // Reap child
  close(pipeFds[0]);
  kill(child, SIGKILL);
  waitpid(child, nullptr, 0);

  ERR_EXIT(move_mount(lockedFd, "", AT_FDCWD, dest, MOVE_MOUNT_F_EMPTY_PATH));

  // Try to clear read-only on `dest`; this is expected to fail with EPERM.
  // The syscall is made outside `assert` so it still runs under NDEBUG, and
  // `errno` is captured immediately so nothing can overwrite it first.
  struct mount_attr clearRdOnly = { .attr_clr = MOUNT_ATTR_RDONLY };
  const int clearRet =
      mount_setattr(AT_FDCWD, dest, 0, &clearRdOnly, sizeof(clearRdOnly));
  const int clearErrno = errno;
  assert(clearRet == -1);
  assert(clearErrno == EPERM);
  (void)clearRet;    // Silence unused-variable warnings under NDEBUG.
  (void)clearErrno;

  return 0;
}
	/*
	This shows one of two possible methods (as of Linux v6.4.9) for setting up a
	MNT_LOCK_READONLY mount, which is a mount that cannot be remounted
	read-write even by a fully privileged super-user. The same principle applies
	to locking NODEV, NOSUID, and NOEXEC [1].

	Hopefully, at some point the Linux kernel will support setting locked bits
	via `mount_setattr` -- see [2].

	This works as follows:
	- As seen in [1], whether a mount is locked is a bit on `struct mount`. Once
	locked, these bits cannot be cleared [3].
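	- When we create the Helper child, it unshares into a new user + mount
	namespace. Every mount it inherits (including our read-only bind mount)
	becomes locked there, and a clone of that mount stays locked after it is
	handed back to the parent.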

	Mount locking is a standard part of the user NS security model, per `man
	mount_namespaces`:

	> A mount namespace has an owner user namespace. A mount namespace whose
	> owner user namespace is different from the owner user namespace of its
	> parent mount namespace is considered a less privileged mount namespace.
	>
	> The mount(2) flags MS_RDONLY, MS_NOSUID, MS_NOEXEC, and the "atime" flags
	> (MS_NOATIME, MS_NODIRATIME, MS_RELATIME) settings become locked when
	> propagated from a more privileged to a less privileged mount namespace,
	> and may not be changed in the less privileged mount namespace.

	The only innovation here is that after we generate a locked mount (by
	creating a new user & mount NS), we ship it back to the originating
	namespace, showing that mounts remain locked even once they return to the
	outermost user NS.

	This is NOT production code. Some problems deliberately left for brevity:
	- doing work (including `perror` without flushing!) in post-fork -- risky,
	incompatible with threads & sanitizers
	- read / write without EINTR / retries,
	- `SCM_RIGHTS` would be cleaner than `pidfd_getfd`,
	- failing to use AT_SYMLINK_NOFOLLOW / AT_RECURSIVE as appropriate,
	- not cleaning up stray mounts, especially on error paths,
	- not closing FDs.

	Demo:

	gcc -Wall -o locked_mount_via_newns locked_mount_via_newns.cpp
	sudo env PREV_PWD=$(pwd) unshare -m # `unshare` not required, can be `bash`
	cd $(mktemp -d)
	mkdir src dest temp
	"$PREV_PWD"/locked_mount_via_newns src dest temp

	$ touch src/foo
	$ ls {src,dest,temp}/*
	dest/foo src/foo temp/foo
	$ touch dest/bar temp/bar
	touch: cannot touch 'dest/bar': Read-only file system
	touch: cannot touch 'temp/bar': Read-only file system
	$ mount -o rw,remount temp
	$ mount -o rw,remount dest
	mount: /tmp/tmp.GnXKyj5evn/dest: permission denied.

	References:
	[1] https://elixir.bootlin.com/linux/v6.4.9/source/fs/namespace.c#L2069
	[2]
	https://lore.kernel.org/linux-fsdevel/20230810090044.1252084-1-sargun@sargun.me
	[3] https://elixir.bootlin.com/linux/v6.4.9/source/fs/namespace.c#L2561
	*/
	#include <assert.h>
	#include <errno.h>
	#include <fcntl.h>
	#include <linux/types.h>
	#include <sched.h>
	#include <signal.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/mount.h>
	#include <sys/syscall.h>
	#include <sys/types.h>
	#include <sys/wait.h>
	#include <unistd.h>

	namespace {
	// Issues the raw `SYS_pidfd_open` syscall with `(pid, flags)` and returns the
	// result narrowed to `int`. `syscall` reports failure as -1 with `errno` set.
	int pidfd_open(pid_t pid, unsigned int flags) noexcept {
	  const long rc = ::syscall(SYS_pidfd_open, pid, flags);
	  return static_cast<int>(rc);
	}

	// Issues the raw `SYS_pidfd_getfd` syscall with `(pidfd, targetfd, flags)` and
	// returns the result narrowed to `int`. `syscall` reports failure as -1 with
	// `errno` set.
	int pidfd_getfd(int pidfd, int targetfd, unsigned int flags) noexcept {
	  const long rc = ::syscall(SYS_pidfd_getfd, pidfd, targetfd, flags);
	  return static_cast<int>(rc);
	}
	} // namespace

	// Evaluates `x` exactly once and yields its value. If that value is -1, prints
	// a `perror` message labelled with the text of `x` and terminates the process
	// with `_exit(-errno)`.
	//
	// `errno` is snapshotted right after `x` is evaluated: `perror` is allowed to
	// modify `errno`, and the exit status must describe the failure of `x`, not a
	// failure of the diagnostic write.
	//
	// Expands to an immediately-invoked `[&]` lambda, so it can only be used
	// inside a function body.
	#define ERR_EXIT(x)                 \
	  ([&]() {                          \
	    auto ret = (x);                 \
	    if (ret == -1) {                \
	      const int savedErrno = errno; \
	      perror(#x);                   \
	      _exit(-savedErrno);           \
	    }                               \
	    return ret;                     \
	  }())

	// Demo entry point. Usage: `locked_mount_via_newns src dest temp`.
	//
	// Steps, in order:
	//   1. Bind-mount `src` read-only at `temp`.
	//   2. Fork a helper that unshares into a new user + mount namespace, clones
	//      the `temp` mount there, and parks the clone at its FD 0.
	//   3. Pull the helper's FD 0 into this process with `pidfd_getfd` and attach
	//      it at `dest`.
	//   4. Check that clearing read-only on `dest` is refused with EPERM.
	//
	// Returns 0 on success and 1 on a usage or handshake error. Any call wrapped
	// in `ERR_EXIT` that returns -1 terminates the process from inside the macro.
	int main(int argc, char** argv) {
	  if (argc < 4) {
	    fprintf(stderr, "Usage: %s src dest temp\n", argv[0]);
	    return 1;
	  }
	  const char* src = argv[1];
	  const char* dest = argv[2];
	  const char* temp = argv[3];

	  // Set up a read-only bind-mount of `src` and attach it to `temp`
	  auto treeFd = ERR_EXIT(open_tree(AT_FDCWD, src, OPEN_TREE_CLONE));
	  struct mount_attr attr { .attr_set = MOUNT_ATTR_RDONLY };
	  ERR_EXIT(mount_setattr(
	      treeFd, "", AT_EMPTY_PATH, &attr, sizeof(struct mount_attr)));
	  ERR_EXIT(move_mount(
	      treeFd, "", AT_FDCWD, temp, MOVE_MOUNT_F_EMPTY_PATH));

	  // `fork` child writes `kSigil` to the pipe once its FD 0 is a locked tree
	  const char kSigil[] = "DoNeSiGiL";
	  // Signed copy of the sigil size, so `read` / `write` results (ssize_t) are
	  // compared without a signed/unsigned mismatch.
	  const ssize_t kSigilLen = sizeof(kSigil);
	  int pipeFds[2];
	  ERR_EXIT(pipe2(pipeFds, O_CLOEXEC));

	  // The child is the "Helper" described in the file docblock.
	  pid_t child = ERR_EXIT(fork());
	  if (child == 0) {
	    close(pipeFds[0]);  // Below, we use `EPIPE`-on-write as a signal to exit.

	    // CRUCIAL: As we clone the mount namespace, all pre-existing mounts
	    // (including `temp` above) become locked.
	    ERR_EXIT(unshare(CLONE_NEWUSER | CLONE_NEWNS));

	    auto lockedTreeFd = ERR_EXIT(open_tree(AT_FDCWD, temp, OPEN_TREE_CLONE));
	    // Put the unattached mount at FD 0, and tell parent to `pidfd_getfd` it.
	    ERR_EXIT(dup2(lockedTreeFd, 0));
	    if (write(pipeFds[1], kSigil, sizeof(kSigil)) != kSigilLen) {
	      perror("write kSigil");
	      _exit(1);
	    }
	    // Parent will close the pipe after grabbing FD 0. Waiting for EPIPE
	    // guarantees the child exits even if the parent crashes. If SIGPIPE is
	    // not ignored, the failing write delivers SIGPIPE instead, which
	    // terminates the child too.
	    while (true) {
	      sleep(1);
	      if (-1 == write(pipeFds[1], kSigil, 1) && errno == EPIPE) {
	        break;
	      }
	    }
	    // The helper's job is done. Exit here so it can never fall through and
	    // run the parent-only code below.
	    _exit(0);
	  }

	  // Wait for child to prepare mount FD
	  char readSigil[sizeof(kSigil)];
	  if (read(pipeFds[0], readSigil, sizeof(readSigil)) != kSigilLen ||
	      0 != strncmp(kSigil, readSigil, sizeof(kSigil))) {
	    perror("read kSigil");
	    return 1;
	  }

	  // Retrieve child FD 0, which is the new, now-locked, tree
	  auto pidFd = ERR_EXIT(pidfd_open(child, 0));
	  auto lockedFd = ERR_EXIT(pidfd_getfd(pidFd, 0, 0));

	  // Reap child
	  close(pipeFds[0]);
	  kill(child, SIGKILL);
	  waitpid(child, nullptr, 0);

	  ERR_EXIT(move_mount(lockedFd, "", AT_FDCWD, dest, MOVE_MOUNT_F_EMPTY_PATH));

	  // Try to clear read-only on `dest`; this is expected to fail with EPERM.
	  // The syscall is made outside `assert` so it still runs under NDEBUG, and
	  // `errno` is captured immediately so nothing can overwrite it first.
	  struct mount_attr clearRdOnly = { .attr_clr = MOUNT_ATTR_RDONLY };
	  const int clearRet =
	      mount_setattr(AT_FDCWD, dest, 0, &clearRdOnly, sizeof(clearRdOnly));
	  const int clearErrno = errno;
	  assert(clearRet == -1);
	  assert(clearErrno == EPERM);
	  (void)clearRet;    // Silence unused-variable warnings under NDEBUG.
	  (void)clearErrno;

	  return 0;
	}