Skip to content

Instantly share code, notes, and snippets.

@tiqwab
Created October 15, 2021 02:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tiqwab/de9605f496b8915a3551ba27bc0a31a9 to your computer and use it in GitHub Desktop.
Save tiqwab/de9605f496b8915a3551ba27bc0a31a9 to your computer and use it in GitHub Desktop.
#define _GNU_SOURCE
#include <errno.h>
#include <limits.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/wait.h>
/* Print 'msg' followed by the description of the current errno value
   (via perror()) and terminate the process with EXIT_FAILURE. */
#define errExit(msg)            \
    do {                        \
        perror(msg);            \
        exit(EXIT_FAILURE);     \
    } while (0)
/*
 * Invoke the pivot_root system call directly through syscall().
 *
 * new_root: path handed to the kernel as the new root.
 * put_old:  path handed to the kernel as the location for the old root.
 *
 * Returns whatever syscall() returns, narrowed to int.
 */
static int
pivot_root(const char *new_root, const char *put_old)
{
    long rc = syscall(SYS_pivot_root, new_root, put_old);

    return (int) rc;
}
/* Size in bytes (1 MiB) of the stack area used for the cloned child. */
#define STACK_SIZE (1024 * 1024)
/*
 * Startup function for the cloned child.
 *
 * 'arg' is a NULL-terminated vector of strings:
 *   args[0]   directory that becomes the new root filesystem;
 *   args[1]   path of the program to execute after the pivot;
 *   args[1..] argument vector passed to that program.
 *
 * On success this function does not return, because execv() replaces
 * the process image.  Any fatal error terminates the child with
 * EXIT_FAILURE.  Failures while detaching or removing the old root are
 * only reported, not treated as fatal.
 */
static int
child(void *arg)
{
    char **args = arg;
    char *new_root = args[0];
    const char *put_old = "/oldrootfs";
    char path[PATH_MAX];
    int len;

    /* Both the new root and a command are required; without this check a
       missing command would only surface as an obscure execv() failure. */
    if (new_root == NULL || args[1] == NULL) {
        fprintf(stderr, "child: a new root directory and a command are required\n");
        exit(EXIT_FAILURE);
    }

    /* Ensure that 'new_root' and its parent mount don't have
       shared propagation (which would cause pivot_root() to
       return an error), and prevent propagation of mount
       events to the initial mount namespace. */
    if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) == -1)
        errExit("mount-MS_PRIVATE");

    /* Ensure that 'new_root' is a mount point. */
    if (mount(new_root, new_root, NULL, MS_BIND, NULL) == -1)
        errExit("mount-MS_BIND");

    /* Mount proc inside the new root.  Reject a truncated path instead of
       silently mounting on whatever prefix happened to fit in the buffer. */
    len = snprintf(path, sizeof(path), "%s/proc", new_root);
    if (len < 0)
        errExit("snprintf");
    if ((size_t) len >= sizeof(path)) {
        errno = ENAMETOOLONG;
        errExit("snprintf");
    }
    if (mount(path, path, "proc", 0, NULL) == -1)
        errExit("mount-proc");

    /* Create directory to which old root will be pivoted. */
    len = snprintf(path, sizeof(path), "%s/%s", new_root, put_old);
    if (len < 0)
        errExit("snprintf");
    if ((size_t) len >= sizeof(path)) {
        errno = ENAMETOOLONG;
        errExit("snprintf");
    }
    /* A directory left behind by an earlier run is acceptable; any other
       mkdir() failure is fatal. */
    if (mkdir(path, 0777) == -1 && errno != EEXIST)
        errExit("mkdir");

    /* And pivot the root filesystem. */
    if (pivot_root(new_root, path) == -1)
        errExit("pivot_root");

    /* Switch the current working directory to "/". */
    if (chdir("/") == -1)
        errExit("chdir");

    /* Unmount old root and remove mount point (best effort: failures are
       reported but do not stop the command from being executed). */
    if (umount2(put_old, MNT_DETACH) == -1)
        perror("umount2");
    if (rmdir(put_old) == -1)
        perror("rmdir");

    /* Execute the command specified in args[1]; execv() returns only
       on failure. */
    execv(args[1], &args[1]);
    errExit("execv");
}
/*
 * Entry point.
 *
 * Usage:
 *   prog test                          print "test" and exit successfully
 *   prog <new-root> <command> [args]   run <command> in a child created in a
 *                                      new mount namespace, with <new-root>
 *                                      as its root filesystem
 *
 * The exit status is that of the child when it exits normally, and
 * EXIT_FAILURE on a usage error, a setup failure, or an abnormal child
 * termination.
 */
int
main(int argc, char *argv[])
{
    const char *prog = (argc > 0 && argv[0] != NULL) ? argv[0]
                                                     : "pivot_root_sample";

    /* argv[1] is inspected below, so it must exist. */
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <new-root> <command> [args...]\n"
                        "       %s test\n", prog, prog);
        exit(EXIT_FAILURE);
    }

    /* Exact match: a prefix comparison would mistake a root directory
       such as "testroot" for the self-test mode. */
    if (strcmp(argv[1], "test") == 0) {
        printf("test\n");
        exit(EXIT_SUCCESS);
    }

    /* The child needs both a new root and a command to execute. */
    if (argc < 3) {
        fprintf(stderr, "Usage: %s <new-root> <command> [args...]\n", prog);
        exit(EXIT_FAILURE);
    }

    /* Create a child process in a new mount namespace. */
    char *stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
    if (stack == MAP_FAILED)
        errExit("mmap");

    /* clone() is given the address just past the end of the mapping. */
    pid_t pid = clone(child, stack + STACK_SIZE,
                      CLONE_NEWNS | SIGCHLD, &argv[1]);
    if (pid == -1)
        errExit("clone");

    /* Parent falls through to here; wait for the child and propagate its
       exit status so that failures inside the child are visible. */
    int status;
    if (waitpid(pid, &status, 0) == -1)
        errExit("waitpid");

    if (WIFEXITED(status))
        exit(WEXITSTATUS(status));

    exit(EXIT_FAILURE);
}
@tiqwab
Copy link
Author

tiqwab commented Oct 24, 2021

# prepare rootfs
$ mkdir rootfs
$ sudo docker export $(sudo docker create ubuntu:18.04) | tar -C rootfs -xvf -

# compile pivot_root_sample
$ cc -o pivot_root_sample pivot_root_sample.c

# We can execute pivot_root_sample even though it is outside the container rootfs
$ sudo ./pivot_root_sample rootfs /proc/self/exe test
child_pid: 91919
test output

@tiqwab
Copy link
Author

tiqwab commented Oct 24, 2021

例えば /bin/ls -l / だと / を open する様子が確認できる。

$ sudo strace -p 92011
strace: Process 92011 attached
...
openat(AT_FDCWD, "/", O_RDONLY|O_NONBLOCK|O_CLOEXEC|O_DIRECTORY) = 3
...

これが /proc/self/exe test だと対応する openat システムコールは呼ばれない。

$ sudo strace -p 92074
strace: Process 92074 attached
restart_syscall(<... resuming interrupted read ...>) = 0
execve("/proc/self/exe", ["/proc/self/exe", "test"], 0x7ffc85c5cea0 /* 15 vars */) = 0
brk(NULL)                               = 0x5593154e0000
access("/etc/ld.so.nohwcap", F_OK)      = -1 ENOENT (No such file or directory)
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=7274, ...}) = 0
mmap(NULL, 7274, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f0facf39000
close(3)                                = 0
access("/etc/ld.so.nohwcap", F_OK)      = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\260\34\2\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=2030544, ...}) = 0
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f0facf37000
mmap(NULL, 4131552, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f0fac923000
mprotect(0x7f0facb0a000, 2097152, PROT_NONE) = 0
mmap(0x7f0facd0a000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1e7000) = 0x7f0facd0a000
mmap(0x7f0facd10000, 15072, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f0facd10000
close(3)                                = 0
arch_prctl(ARCH_SET_FS, 0x7f0facf384c0) = 0
mprotect(0x7f0facd0a000, 16384, PROT_READ) = 0
mprotect(0x5593148ba000, 4096, PROT_READ) = 0
mprotect(0x7f0facf3b000, 4096, PROT_READ) = 0
munmap(0x7f0facf39000, 7274)            = 0
fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(0x88, 0xd), ...}) = 0
brk(NULL)                               = 0x5593154e0000
brk(0x559315501000)                     = 0x559315501000
write(1, "test output\n", 12)           = 12
exit_group(0)                           = ?
+++ exited with 0 +++

既にそのプロセスで open しているファイルを実行する場合、新たに open する必要はない。
/proc/self/exe の場合 ls -l /proc/self/fd 等には出てこないがそのプロセスでは実行している以上ファイルを利用しているので open せずとも実行できる。open しないのでパスを辿る必要もなく rootfs 下のファイルでなくても実行できてしまう... ということか?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment