Skip to content

Instantly share code, notes, and snippets.

@snarkmaster
Last active August 10, 2023 22:37
Show Gist options
  • Save snarkmaster/c2d4765b19e69f1626cafc076e60f8ac to your computer and use it in GitHub Desktop.
Save snarkmaster/c2d4765b19e69f1626cafc076e60f8ac to your computer and use it in GitHub Desktop.
Linux demo: Creating irrevocably read-only bind mounts (`MNT_LOCK_READONLY`) via mount propagation into another user NS
/*
This shows one of two possible methods (as of Linux v6.4.9) for setting up a
MNT_LOCK_READONLY mount, which is a mount that cannot be remounted
read-write even by a fully privileged super-user. The same principle applies
to locking NODEV, NOSUID, and NOEXEC [1].
Hopefully, at some point the Linux kernel will support setting locked bits
via `mount_setattr` [2], but as of v6.4.9, this appears to be the simplest way.
This `attach_recursive_mnt`-based method works best if you can have a
persistent daemon, which exchanges file descriptors with the actual mounting
process over AF_UNIX sockets using SCM_RIGHTS. Then, you do NOT need a `clone` per mount.
If you're going to spawn a helper process per mount anyway, then the
`copy_mnt_ns` method is better, see:
https://gist.github.com/snarkmaster/5ca6b668499bf2c9010fd68227d64887
This works as follows:
- As seen in [1], whether a mount is locked is a bit on `struct mount`. Once
locked, these bits cannot be cleared.
- The call chain `move_mount` -> `do_move_mount` -> `attach_recursive_mnt`
will trigger `lock_mnt_tree` if the mount's user namespace differs from
the calling process's user namespace, see [3].
Mount locking is a standard part of the user NS security model, per `man
mount_namespaces`:
> A mount namespace has an owner user namespace. A mount namespace whose
> owner user namespace is different from the owner user namespace of its
> parent mount namespace is considered a less privileged mount namespace.
>
> The mount(2) flags MS_RDONLY, MS_NOSUID, MS_NOEXEC, and the "atime" flags
> (MS_NOATIME, MS_NODIRATIME, MS_RELATIME) settings become locked when
> propagated from a more privileged to a less privileged mount namespace,
> and may not be changed in the less privileged mount namespace.
The only innovation here is that we generate a locked mount (via mount
propagation), and ship it back to the originating namespace, showing that
mounts can be locked even within the outermost user NS.
Demo:
mkdir src dest
touch src/foo
g++ -Wall -o locked_mount_via_propagation locked_mount_via_propagation.cpp
./locked_mount_via_propagation src dest
mount -o rw,remount dest
mount: /tmp/tmp.TDKtNTRrOZ/dest: permission denied.
This is NOT production code. Some problems deliberately left for brevity:
- doing work in post-clone code -- risky / incompatible with threaded code
and sanitizers,
- not cleaning up stray mounts on error paths,
- `fflush` / `perror` in the `clone` code paths,
- failing to use AT_SYMLINK_NOFOLLOW / AT_RECURSIVE as appropriate,
- not closing FDs or setting CLOEXEC by default.
Credit: sargun@sargun.me for the clone3 / CLONE_FILES trick + waitid(P_PIDFD)
References:
[1] https://elixir.bootlin.com/linux/v6.4.9/source/fs/namespace.c#L2069
[2]
https://lore.kernel.org/linux-fsdevel/20230810090044.1252084-1-sargun@sargun.me
[3] https://elixir.bootlin.com/linux/v6.4.9/source/fs/namespace.c#L2261
*/
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <functional>
#include <vector>
// START: Syscall stubs to aid compilation with older userspace
namespace {

// Raw clone3(2). Both stdio streams are flushed first -- presumably so that
// output buffered before the clone is not emitted a second time by the child.
pid_t sys_clone3(struct clone_args *args, size_t size) {
  fflush(stdout);
  fflush(stderr);
  const long rc = syscall(__NR_clone3, args, size);
  return static_cast<pid_t>(rc);
}

// Raw open_tree(2): returns the syscall's result narrowed to int.
int sys_open_tree(int dfd, const char *filename, unsigned int flags) {
  const long rc = syscall(__NR_open_tree, dfd, filename, flags);
  return static_cast<int>(rc);
}

// Raw mount_setattr(2). The size argument the kernel expects is always
// passed as sizeof(struct mount_attr).
int sys_mount_setattr(
    int dfd, const char *path, unsigned int flags, struct mount_attr *attr) {
  const long rc =
      syscall(__NR_mount_setattr, dfd, path, flags, attr, sizeof(*attr));
  return static_cast<int>(rc);
}

// Raw move_mount(2).
int sys_move_mount(
    int from_dfd,
    const char *from_path,
    int to_dfd,
    const char *to_path,
    unsigned int flags) {
  const long rc =
      syscall(__NR_move_mount, from_dfd, from_path, to_dfd, to_path, flags);
  return static_cast<int>(rc);
}

// Raw waitid(2), including the kernel-only trailing `struct rusage *`
// argument.
int sys_waitid(
    int which,
    pid_t pid,
    siginfo_t *info,
    int options,
    struct rusage *ru) {
  const long rc = syscall(__NR_waitid, which, pid, info, options, ru);
  return static_cast<int>(rc);
}

} // namespace
// END: Syscall stubs
// ERR_EXIT(expr): evaluates `expr` exactly once and yields its value.
//
// If the value equals -1 (the failure sentinel of the syscall wrappers used
// in this file), the source text of `expr` and the errno description are
// printed with perror(), and the process terminates immediately via
// _exit(-errno). Only the low 8 bits of that status reach the parent, so it
// is non-zero but not the errno value itself.
//
// errno is captured BEFORE perror() runs: perror() performs I/O and may
// itself overwrite errno, which would otherwise corrupt the exit status.
//
// Implemented as an immediately-invoked lambda capturing by reference, so it
// can only be used inside a function body. The local is given an unusual
// name so that it does not shadow identifiers used inside `expr`.
#define ERR_EXIT(x)                          \
  ([&]() {                                   \
    auto errExitRet_ = (x);                  \
    if (errExitRet_ == -1) {                 \
      const int errExitErrno_ = errno;       \
      perror(#x);                            \
      _exit(-errExitErrno_);                 \
    }                                        \
    return errExitRet_;                      \
  }())
// Runs `childFn` in a freshly cloned child process and hands the file
// descriptors it produces back to the caller.
//
// How it works:
//  - `numFds` placeholder FDs are created up front with memfd_create().
//  - The child is created with clone3() using CLONE_FILES (plus the caller's
//    `cloneFlags`), i.e. parent and child share one FD table.
//  - The child calls `childFn(outFds, numFds)`, which must store `numFds`
//    FDs into `outFds`. Each one is then dup2()'d over the corresponding
//    placeholder, so the parent can use it under the placeholder's number.
//  - The parent waits for the child through its pidfd and requires a clean
//    `_exit(0)`.
//
// Parameters:
//   numFds     - number of FDs `childFn` is expected to produce.
//   childFn    - runs only in the child; receives the output array and its
//                length.
//   cloneFlags - extra clone3 flags OR-ed with CLONE_PIDFD | CLONE_FILES.
//
// Returns the placeholder FD numbers, which now refer to the child's FDs.
//
// Errors: a failing syscall prints a diagnostic and terminates the calling
// process via ERR_EXIT. If the child does not exit normally with status 0,
// the parent prints a diagnostic and calls _exit(1).
std::vector<int> cloneChildAndAwaitFds(
    size_t numFds,
    std::function<void(int*, size_t)> childFn,
    __u64 cloneFlags) {
  std::vector<int> fdsToReplace;
  fdsToReplace.reserve(numFds);
  for (size_t i = 0; i < numFds; ++i) {
    fdsToReplace.push_back(ERR_EXIT(memfd_create("fd_replaced_by_child", 0)));
  }

  // Allocated before cloning so the child does not have to allocate memory.
  // Slots start at -1, so a slot `childFn` forgot to fill makes dup2() fail
  // cleanly instead of duplicating an arbitrary descriptor.
  std::vector<int> outFds(numFds, -1);

  int childPidfd = -1;
  struct clone_args cloneArgs = {};
  cloneArgs.flags = CLONE_PIDFD | CLONE_FILES | cloneFlags;
  cloneArgs.pidfd = reinterpret_cast<__u64>(&childPidfd);
  cloneArgs.exit_signal = SIGCHLD;

  if (0 == ERR_EXIT(sys_clone3(&cloneArgs, sizeof(cloneArgs)))) {
    // Child: produce the FDs, publish them over the placeholders, and leave
    // via _exit().
    childFn(outFds.data(), numFds);
    for (size_t i = 0; i < numFds; ++i) {
      ERR_EXIT(dup2(outFds[i], fdsToReplace[i]));
    }
    _exit(0);
  }

  // Parent: wait for the child to terminate.
  siginfo_t info = {};
  ERR_EXIT(sys_waitid(P_PIDFD, childPidfd, &info, WEXITED, nullptr));
  close(childPidfd);

  // With waitid(), `si_code` says how the child ended and `si_status` holds
  // the raw exit code (or signal number). The wait()-style WIFEXITED /
  // WEXITSTATUS macros do not apply to it.
  if (info.si_code != CLD_EXITED || info.si_status != 0) {
    fprintf(
        stderr,
        "cloneChildAndAwaitFds: child failed (si_code=%d, si_status=%d)\n",
        info.si_code,
        info.si_status);
    _exit(1);
  }
  return fdsToReplace;
}
// Demo entry point. Usage: <prog> src dest
//
// Mounts a read-only clone of `src` at `dest` and then checks that clearing
// the read-only attribute on `dest` is refused with EPERM. Per the header
// comment of this file, the lock is obtained by letting the mount propagate
// into a mount namespace owned by a different user namespace and then
// shipping a clone of it back.
//
// Returns 0 on success, 1 on a usage error or when the final remount
// attempt did not fail with EPERM. Any failing syscall terminates the
// process (ERR_EXIT).
int main(int argc, char** argv) {
  if (argc < 3) {
    fprintf(stderr, "Usage: %s src dest\n", argv[0]);
    return 1;
  }
  const char* src = argv[1];
  const char* dest = argv[2];

  // Scratch directory that hosts the intermediate mounts.
  char tempDir[] = "/tmp/mnt_prop_tunnel.XXXXXX";
  if (mkdtemp(tempDir) == nullptr) {
    perror("mkdtemp");
    _exit(1);
  }

  // Step 1: clone the tree at `tempDir`, set its propagation to MS_SLAVE,
  // and attach the clone on top of `tempDir` itself.
  auto origTreeFd = ERR_EXIT(sys_open_tree(AT_FDCWD, tempDir, OPEN_TREE_CLONE));
  {
    struct mount_attr attr = {};
    attr.propagation = MS_SLAVE;
    ERR_EXIT(sys_mount_setattr(origTreeFd, "", AT_EMPTY_PATH, &attr));
    ERR_EXIT(sys_move_mount(
        origTreeFd, "", AT_FDCWD, tempDir, MOVE_MOUNT_F_EMPTY_PATH));
  }

  // Step 2: a helper child in a NEW user + mount namespace hands back
  //   [0] an FD for its view of the `tempDir` tree, and
  //   [1] an FD for its mount namespace, used for setns() in step 4,
  // and creates the `mountpoint` directory inside `tempDir`.
  auto helperOutFds = cloneChildAndAwaitFds(
      2,
      [&](int* outFds, size_t numFds) {
        if (numFds != 2) {
          _exit(1);
        }
        outFds[0] = ERR_EXIT(sys_open_tree(AT_FDCWD, tempDir, 0));
        // Every later step depends on these calls, so fail loudly here
        // instead of hitting a confusing error further down.
        outFds[1] = ERR_EXIT(open("/proc/self/ns/mnt", O_RDONLY));
        ERR_EXIT(chdir(tempDir));
        ERR_EXIT(mkdir("mountpoint", 0777));
      },
      CLONE_NEWUSER | CLONE_NEWNS);

  // Step 3: in the original namespace, attach a read-only clone of `src` at
  // `mountpoint` under the tree from step 1.
  {
    auto srcFd = ERR_EXIT(sys_open_tree(AT_FDCWD, src, OPEN_TREE_CLONE));
    struct mount_attr attr = {};
    attr.attr_set = MOUNT_ATTR_RDONLY;
    ERR_EXIT(sys_mount_setattr(srcFd, "", AT_EMPTY_PATH, &attr));
    ERR_EXIT(sys_move_mount(
        srcFd, "", origTreeFd, "mountpoint", MOVE_MOUNT_F_EMPTY_PATH));
  }

  // Step 4: a second child (no new namespaces of its own) enters the
  // helper's mount namespace and clones what is mounted at `mountpoint`
  // there.
  auto lockedDestFdVec = cloneChildAndAwaitFds(
      1,
      [&](int* outFds, size_t numFds) {
        if (numFds != 1) {
          _exit(1);
        }
        ERR_EXIT(setns(helperOutFds[1], CLONE_NEWNS));
        outFds[0] = ERR_EXIT(sys_open_tree(
            helperOutFds[0], "mountpoint", OPEN_TREE_CLONE));
      },
      0);

  // Step 5: attach the clone from step 4 at `dest`, then lazily detach the
  // scratch mount.
  ERR_EXIT(sys_move_mount(
      lockedDestFdVec[0], "", AT_FDCWD, dest, MOVE_MOUNT_F_EMPTY_PATH));
  ERR_EXIT(umount2(tempDir, MNT_DETACH));

  // Step 6: verify the lock. The syscall is issued unconditionally rather
  // than inside assert(), so the check still runs in NDEBUG builds.
  struct mount_attr clearRdOnly = {};
  clearRdOnly.attr_clr = MOUNT_ATTR_RDONLY;
  const int clearRet = sys_mount_setattr(AT_FDCWD, dest, 0, &clearRdOnly);
  const int clearErrno = errno;
  if (clearRet != -1 || clearErrno != EPERM) {
    fprintf(
        stderr,
        "Expected clearing MOUNT_ATTR_RDONLY on %s to fail with EPERM; "
        "got ret=%d errno=%d\n",
        dest,
        clearRet,
        clearErrno);
    return 1;
  }
  return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment