Last active
April 17, 2026 06:10
-
-
Save rrampage/92f0eb6bf56d7bb403aff069cc8f1d6b to your computer and use it in GitHub Desktop.
A userspace sandbox which uses SOCKS proxy to restrict network access (inspired by oniux)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #define _GNU_SOURCE | |
| /* | |
| * sockpuppet.c - single-file Linux sandbox + userspace network broker | |
| * | |
| * Quick build: | |
| * gcc -O2 -g -Wall -Wextra -Wformat -Wformat=2 -Wconversion \ | |
| * -Wimplicit-fallthrough -Werror=format-security \ | |
| * -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=3 -D_GLIBCXX_ASSERTIONS \ | |
| * -fstack-clash-protection -fstack-protector-strong \ | |
| * -Wl,-z,relro -Wl,-z,now -Wl,--as-needed \ | |
| * -Wl,--no-copy-dt-needed-entries sockpuppet.c -o sockpuppet | |
| * | |
| * Common usage: | |
| * ./sockpuppet /bin/sh | |
| * Run a command with non-interactive stdio: stdin from /dev/null and | |
| * stdout/stderr relayed through the parent. | |
| * | |
| * ./sockpuppet --interactive /bin/sh | |
| * Run with a private PTY for shells, REPLs, and full-screen terminal apps. | |
| * | |
| * ./sockpuppet --allow-host=127.0.0.1:8080/tcp curl http://10.0.1.1:8080 | |
| * Allow the sandbox to reach a host-local service through the 10.0.1.x | |
| * gateway mapping. | |
| * | |
| * ./sockpuppet --socks socks5://127.0.0.1:1080 curl https://example.com | |
| * Route outbound traffic through a SOCKS5 proxy. | |
| * | |
| * ./sockpuppet --unsafe-share-cwd ... | |
| * Allow running from /, /root, or /home/... when you intentionally want | |
| * the current working directory exposed inside the sandbox. | |
| */ | |
| #include <arpa/inet.h> | |
| #include <dirent.h> | |
| #include <errno.h> | |
| #include <fcntl.h> | |
| #include <linux/audit.h> | |
| #include <linux/capability.h> | |
| #include <linux/filter.h> | |
| #include <linux/if.h> | |
| #include <linux/if_tun.h> | |
| #include <linux/netlink.h> | |
| #include <linux/rtnetlink.h> | |
| #include <linux/seccomp.h> | |
| #include <netdb.h> | |
| #include <netinet/in.h> | |
| #include <netinet/ip.h> | |
| #include <netinet/tcp.h> | |
| #include <netinet/udp.h> | |
| #include <stddef.h> | |
| #include <poll.h> | |
| #include <sched.h> | |
| #include <signal.h> | |
| #include <limits.h> | |
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <string.h> | |
| #include <sys/epoll.h> | |
| #include <sys/ioctl.h> | |
| #include <sys/mount.h> | |
| #include <sys/prctl.h> | |
| #include <sys/random.h> | |
| #include <sys/resource.h> | |
| #include <sys/socket.h> | |
| #include <sys/stat.h> | |
| #include <sys/syscall.h> | |
| #include <sys/xattr.h> | |
| #include <termios.h> | |
| #include <sys/types.h> | |
| #include <sys/wait.h> | |
| #include <time.h> | |
| #include <unistd.h> | |
| #define MAX_TCP 128 | |
| #define MAX_UDP 64 | |
| #define MAX_EVENTS 64 | |
| #define EPOLL_TIMEOUT_MS 100 | |
| #define TCP_PENDING_WRITE_CAP 262144 | |
| #define BROKER_MEMORY_LOW "67108864" | |
| #define BROKER_MEMORY_HIGH "134217728" | |
| #define BROKER_MEMORY_MAX "201326592" | |
| #define BROKER_PIDS_MAX "32" | |
| #define BROKER_CPU_WEIGHT "200" | |
| #define PAYLOAD_MEMORY_HIGH "805306368" | |
| #define PAYLOAD_MEMORY_MAX "1073741824" | |
| #define PAYLOAD_PIDS_MAX "128" | |
| #define PAYLOAD_CPU_WEIGHT "100" | |
| #define PAYLOAD_CPU_MAX "200000 100000" | |
| #define SCOPE_MEMORY_MAX "1280M" | |
| #define SCOPE_MEMORY_HIGH "896M" | |
| #define SCOPE_TASKS_MAX "160" | |
| #define SCOPE_CPU_QUOTA "300%" | |
| #define SCOPE_NOFILE "8192" | |
| #define RLIMIT_PARENT_NOFILE 4096 | |
| #define RLIMIT_CHILD_NOFILE 1024 | |
| /* Epoll event data wrapper */ | |
| enum fd_type { | |
| FD_TUN = 1, | |
| FD_STDOUT_RELAY, | |
| FD_STDERR_RELAY, | |
| FD_INTERACTIVE_TTY, | |
| FD_INTERACTIVE_PTY, | |
| FD_TCP, | |
| FD_UDP_RELAY, | |
| FD_UDP_CTRL | |
| }; | |
| struct epoll_wrapper { | |
| enum fd_type type; | |
| int fd; | |
| void *flow; /* Points to tcp_flow or udp_flow */ | |
| }; | |
| static int g_epfd = -1; /* Global epoll fd */ | |
| /* SOCKS5 proxy configuration */ | |
| struct socks_config { | |
| char host[256]; | |
| int port; | |
| char username[256]; | |
| char password[256]; | |
| int enabled; | |
| int addr_valid; | |
| struct sockaddr_in addr; | |
| }; | |
| enum socks_io_state { | |
| SOCKS_IO_NONE = 0, | |
| SOCKS_IO_CONNECTING, | |
| SOCKS_IO_METHOD, | |
| SOCKS_IO_AUTH, | |
| SOCKS_IO_REQUEST, | |
| SOCKS_IO_READY, | |
| SOCKS_IO_FAILED | |
| }; | |
| struct socks_io { | |
| int active; | |
| int is_udp; | |
| int connect_pending; | |
| enum socks_io_state state; | |
| uint32_t target_ip; | |
| uint16_t target_port; | |
| uint8_t txbuf[512]; | |
| size_t tx_off; | |
| size_t tx_len; | |
| uint8_t rxbuf[512]; | |
| size_t rx_len; | |
| }; | |
| static struct socks_config socks_proxy = {0}; | |
| static int unsafe_share_cwd = 0; | |
| static int interactive_stdio = 0; | |
| static int verbose = 0; /* Verbose debug output */ | |
| static volatile sig_atomic_t interactive_resize_pending = 0; | |
| struct interactive_session { | |
| int active; | |
| int host_tty_fd; | |
| int pty_master_fd; | |
| int pty_slave_fd; | |
| struct termios host_termios; | |
| struct winsize host_winsize; | |
| int host_termios_saved; | |
| int host_winsize_saved; | |
| }; | |
| /* Debug macro - only prints if verbose mode enabled */ | |
| #define DBG(fmt, ...) \ | |
| do { \ | |
| if (verbose) \ | |
| fprintf(stderr, "[sockpuppet] " fmt "\n", ##__VA_ARGS__); \ | |
| } while (0) | |
| /* Host gateway configuration - map 10.0.1.x to 127.0.0.x */ | |
| #define HOST_PING_IP 0x0100000a /* 10.0.0.1 - only for ping */ | |
| #define HOST_GATEWAY_BASE 0x0001000a /* 10.0.1.0 network byte order */ | |
| #define HOST_GATEWAY_MASK 0x00ffffff /* /24 mask for 10.0.1.x */ | |
| #define LOCALHOST_BASE 0x0000007f /* 127.0.0.0 network byte order */ | |
| #define MAX_HOST_RULES 64 | |
| struct host_rule { | |
| uint8_t last_octet; /* x in 127.0.0.x (1-255), 0 = wildcard */ | |
| uint16_t port; /* port number, 0 = all ports */ | |
| int proto; /* IPPROTO_TCP, IPPROTO_UDP, or 0 for both */ | |
| int wildcard_ip; /* match all 127.0.0.x */ | |
| int wildcard_port; /* match all ports */ | |
| }; | |
| static struct host_rule host_rules[MAX_HOST_RULES]; | |
| static int host_rule_count = 0; | |
| static int host_allow_all = 0; /* --host=* */ | |
| /* Check if IP is in gateway range (10.0.1.0/24) */ | |
| static int is_gateway_ip(uint32_t ip) { | |
| return (ip & HOST_GATEWAY_MASK) == HOST_GATEWAY_BASE; | |
| } | |
| /* Extract last octet from gateway IP (10.0.1.x -> x) */ | |
| static uint8_t gateway_last_octet(uint32_t ip) { | |
| return (uint8_t)((ip >> 24) & 0xff); | |
| } | |
| /* Convert gateway IP to localhost (10.0.1.x -> 127.0.0.x) */ | |
| static uint32_t gateway_to_localhost(uint32_t gw_ip) { | |
| uint8_t last = gateway_last_octet(gw_ip); | |
| return LOCALHOST_BASE | ((uint32_t)last << 24); | |
| } | |
| /* Check if gateway access is allowed for given IP, port, and protocol */ | |
| static int is_gateway_allowed(uint32_t gw_ip, uint16_t port, int proto) { | |
| if (!is_gateway_ip(gw_ip)) | |
| return 0; | |
| if (host_allow_all) | |
| return 1; | |
| uint8_t last = gateway_last_octet(gw_ip); | |
| for (int i = 0; i < host_rule_count; i++) { | |
| struct host_rule *r = &host_rules[i]; | |
| int ip_match = r->wildcard_ip || (r->last_octet == last); | |
| int port_match = r->wildcard_port || (r->port == port); | |
| int proto_match = (r->proto == 0) || (r->proto == proto); | |
| if (ip_match && port_match && proto_match) | |
| return 1; | |
| } | |
| return 0; | |
| } | |
| /* Rate limiting */ | |
| #define MAX_CONNECTS_PER_SEC 50 | |
| #define TCP_HALF_OPEN_TIMEOUT_SEC 10 | |
| #define TCP_IDLE_TIMEOUT_SEC 120 | |
| static struct timespec rate_limit_last = {0}; | |
| static double rate_limit_tokens = (double)MAX_CONNECTS_PER_SEC; | |
| static double monotonic_elapsed_seconds(struct timespec now, | |
| struct timespec then) { | |
| time_t sec = now.tv_sec - then.tv_sec; | |
| long nsec = now.tv_nsec - then.tv_nsec; | |
| return (double)sec + ((double)nsec / 1000000000.0); | |
| } | |
| static int check_rate_limit(void) { | |
| struct timespec now; | |
| if (clock_gettime(CLOCK_MONOTONIC, &now) < 0) | |
| return 0; | |
| if (rate_limit_last.tv_sec == 0 && rate_limit_last.tv_nsec == 0) { | |
| rate_limit_last = now; | |
| } else { | |
| double elapsed = monotonic_elapsed_seconds(now, rate_limit_last); | |
| if (elapsed > 0.0) { | |
| rate_limit_tokens += elapsed * (double)MAX_CONNECTS_PER_SEC; | |
| if (rate_limit_tokens > (double)MAX_CONNECTS_PER_SEC) | |
| rate_limit_tokens = (double)MAX_CONNECTS_PER_SEC; | |
| rate_limit_last = now; | |
| } | |
| } | |
| if (rate_limit_tokens < 1.0) | |
| return 0; | |
| rate_limit_tokens -= 1.0; | |
| return 1; | |
| } | |
| /* TCP connection states */ | |
| enum tcp_state { | |
| SP_TCP_CLOSED = 0, | |
| SP_TCP_SYN_RECEIVED, | |
| SP_TCP_ESTABLISHED, | |
| SP_TCP_FIN_WAIT_1, | |
| SP_TCP_FIN_WAIT_2, | |
| SP_TCP_CLOSE_WAIT, | |
| SP_TCP_CLOSING, | |
| SP_TCP_LAST_ACK, | |
| SP_TCP_TIME_WAIT | |
| }; | |
| struct tcp_flow { | |
| uint32_t cli_ip; | |
| uint16_t cli_port; | |
| uint32_t srv_ip; | |
| uint16_t srv_port; | |
| uint32_t cli_isn; | |
| uint32_t srv_isn; | |
| uint32_t cli_next; | |
| uint32_t srv_next; | |
| int sock; | |
| enum tcp_state state; | |
| time_t last_active; | |
| /* TCP timestamp option (RFC 7323) */ | |
| int ts_ok; /* Timestamps negotiated */ | |
| uint32_t ts_recent; /* Last TSval received from client */ | |
| uint8_t pending_write[TCP_PENDING_WRITE_CAP]; | |
| size_t pending_write_off; | |
| size_t pending_write_len; | |
| int pending_fin; | |
| uint32_t pending_fin_seq; | |
| int backend_ready; | |
| struct socks_io socks; | |
| struct epoll_wrapper ew; /* Epoll registration */ | |
| }; | |
| static struct tcp_flow tcp_flows[MAX_TCP]; | |
| struct udp_flow { | |
| uint32_t cli_ip; | |
| uint16_t cli_port; | |
| uint32_t srv_ip; | |
| uint16_t srv_port; | |
| int tcp_ctrl; /* SOCKS5 TCP control connection (must stay open) */ | |
| int udp_relay; /* UDP socket to SOCKS relay */ | |
| int udp_staging; /* UDP socket bound before SOCKS UDP ASSOCIATE completes */ | |
| time_t last_used; /* Last activity timestamp */ | |
| struct sockaddr_in relay_addr; /* Expected relay source for validation */ | |
| struct socks_io socks; | |
| uint8_t pending_data[65535]; | |
| size_t pending_len; | |
| int pending_set; | |
| unsigned long dropped_backpressure; | |
| struct epoll_wrapper ew; /* Epoll registration */ | |
| struct epoll_wrapper ctrl_ew; /* Epoll registration for SOCKS control TCP */ | |
| }; | |
| static struct udp_flow udp_flows[MAX_UDP]; | |
| struct cgroup_ctx { | |
| int active; | |
| int cpu_enabled; | |
| int memory_enabled; | |
| int pids_enabled; | |
| char root[PATH_MAX]; | |
| char broker[PATH_MAX]; | |
| char payload[PATH_MAX]; | |
| }; | |
| static struct cgroup_ctx g_cgroup = {0}; | |
| /* I/O buffer for event loop reads - safe as static since writes are via | |
| opaque syscalls (read/recv) that act as compiler barriers. */ | |
| static uint8_t g_io_buf[65536]; | |
| /* ---------- utilities ---------- */ | |
| static void die(const char *msg) { | |
| perror(msg); | |
| exit(1); | |
| } | |
| static ssize_t write_all(int fd, const void *buf, size_t len) { | |
| const uint8_t *p = buf; | |
| size_t off = 0; | |
| while (off < len) { | |
| ssize_t w = write(fd, p + off, len - off); | |
| if (w > 0) { | |
| off += (size_t)w; | |
| continue; | |
| } | |
| if (w < 0 && errno == EINTR) | |
| continue; | |
| return -1; | |
| } | |
| return (ssize_t)off; | |
| } | |
| static int tun_write_packet(int tunfd, const uint8_t *buf, size_t len, | |
| const char *what) { | |
| for (;;) { | |
| ssize_t n = write(tunfd, buf, len); | |
| if (n == (ssize_t)len) | |
| return 0; | |
| if (n > 0) { | |
| DBG("TUN write short (%s): %zd/%zu, dropping packet", what, n, len); | |
| return -1; | |
| } | |
| if (errno == EINTR) | |
| continue; | |
| if (errno == EAGAIN || errno == EWOULDBLOCK) { | |
| DBG("TUN write would block (%s), dropping packet", what); | |
| return -1; | |
| } | |
| DBG("TUN write failed (%s): %s", what, strerror(errno)); | |
| return -1; | |
| } | |
| } | |
| static void interactive_handle_sigwinch(int signo) { | |
| (void)signo; | |
| interactive_resize_pending = 1; | |
| } | |
| static int parse_long_strict(const char *s, long min, long max, long *out) { | |
| char *end = NULL; | |
| long value; | |
| if (s == NULL || *s == '\0') | |
| return -1; | |
| errno = 0; | |
| value = strtol(s, &end, 10); | |
| if (errno != 0 || end == NULL || *end != '\0' || value < min || value > max) | |
| return -1; | |
| *out = value; | |
| return 0; | |
| } | |
| /* Helper to suppress unused result warnings from FORTIFY_SOURCE */ | |
| #define IGNORE_RESULT(x) \ | |
| do { \ | |
| if (x) { \ | |
| } \ | |
| } while (0) | |
| static void write_file(const char *path, const char *data) { | |
| int fd = open(path, O_WRONLY); | |
| if (fd < 0) | |
| die(path); | |
| if (write(fd, data, strlen(data)) != (ssize_t)strlen(data)) | |
| die(path); | |
| close(fd); | |
| } | |
| static void set_rlimit_or_die(int resource, rlim_t soft, rlim_t hard, | |
| const char *name) { | |
| struct rlimit lim = {.rlim_cur = soft, .rlim_max = hard}; | |
| if (setrlimit(resource, &lim) < 0) { | |
| fprintf(stderr, "setrlimit(%s) failed: %s\n", name, strerror(errno)); | |
| exit(1); | |
| } | |
| DBG("RLIMIT %s set to soft=%llu hard=%llu", name, | |
| (unsigned long long)soft, (unsigned long long)hard); | |
| } | |
| static void apply_parent_rlimits(void) { | |
| set_rlimit_or_die(RLIMIT_CORE, 0, 0, "CORE"); | |
| set_rlimit_or_die(RLIMIT_MEMLOCK, 0, 0, "MEMLOCK"); | |
| set_rlimit_or_die(RLIMIT_NOFILE, RLIMIT_PARENT_NOFILE, RLIMIT_PARENT_NOFILE, | |
| "NOFILE"); | |
| } | |
| static void apply_child_rlimits(void) { | |
| set_rlimit_or_die(RLIMIT_CORE, 0, 0, "CORE"); | |
| set_rlimit_or_die(RLIMIT_MEMLOCK, 0, 0, "MEMLOCK"); | |
| set_rlimit_or_die(RLIMIT_NOFILE, RLIMIT_CHILD_NOFILE, RLIMIT_CHILD_NOFILE, | |
| "NOFILE"); | |
| } | |
| static int detect_delegated_cgroup_root(struct cgroup_ctx *ctx); | |
| static void pre_scan_verbose_flag(int argc, char **argv) { | |
| for (int i = 1; i < argc; ++i) { | |
| if (strcmp(argv[i], "--verbose") == 0 || strcmp(argv[i], "-v") == 0) { | |
| verbose = 1; | |
| continue; | |
| } | |
| if (strcmp(argv[i], "--socks") == 0) { | |
| ++i; | |
| continue; | |
| } | |
| if (strncmp(argv[i], "--socks-auth-file=", 18) == 0 || | |
| strncmp(argv[i], "--allow-host=", 13) == 0 || | |
| strcmp(argv[i], "--unsafe-share-cwd") == 0 || | |
| strcmp(argv[i], "--interactive") == 0) { | |
| continue; | |
| } | |
| if (argv[i][0] == '-') | |
| break; | |
| break; | |
| } | |
| } | |
| static int env_flag_enabled(const char *name) { | |
| const char *value = getenv(name); | |
| return value != NULL && *value != '\0' && strcmp(value, "0") != 0; | |
| } | |
| static int read_self_exe_path(char *out, size_t out_sz) { | |
| ssize_t n = readlink("/proc/self/exe", out, out_sz - 1); | |
| if (n < 0) | |
| return -1; | |
| if ((size_t)n >= out_sz - 1) { | |
| errno = ENAMETOOLONG; | |
| return -1; | |
| } | |
| out[n] = '\0'; | |
| return 0; | |
| } | |
| static int find_executable_in_path(const char *name, char *out, size_t out_sz) { | |
| const char *path = getenv("PATH"); | |
| const char *segment = path; | |
| if (!name || !*name || !out || out_sz == 0) { | |
| errno = EINVAL; | |
| return -1; | |
| } | |
| if (path == NULL || *path == '\0') { | |
| errno = ENOENT; | |
| return -1; | |
| } | |
| while (1) { | |
| const char *colon = strchr(segment, ':'); | |
| size_t seg_len = colon ? (size_t)(colon - segment) : strlen(segment); | |
| const char *dir = segment; | |
| char candidate[PATH_MAX]; | |
| int rc; | |
| if (seg_len == 0) { | |
| dir = "."; | |
| seg_len = 1; | |
| } | |
| rc = snprintf(candidate, sizeof(candidate), "%.*s/%s", (int)seg_len, dir, | |
| name); | |
| if (rc >= 0 && (size_t)rc < sizeof(candidate) && | |
| access(candidate, X_OK) == 0) { | |
| if (snprintf(out, out_sz, "%s", candidate) >= (int)out_sz) { | |
| errno = ENAMETOOLONG; | |
| return -1; | |
| } | |
| return 0; | |
| } | |
| if (!colon) | |
| break; | |
| segment = colon + 1; | |
| } | |
| errno = ENOENT; | |
| return -1; | |
| } | |
| static void maybe_reexec_under_systemd_scope(int argc, char **argv) { | |
| struct cgroup_ctx probe; | |
| char systemd_run_path[PATH_MAX]; | |
| char self_path[PATH_MAX]; | |
| char **new_argv; | |
| char *term_env = NULL; | |
| size_t extra_args = 0; | |
| size_t argc_sz = (size_t)(argc > 0 ? argc : 0); | |
| size_t idx = 0; | |
| int use_user_scope = 1; | |
| const char *term = getenv("TERM"); | |
| struct stat st; | |
| if (detect_delegated_cgroup_root(&probe)) { | |
| DBG("scope bootstrap skipped: already in delegated subtree"); | |
| return; | |
| } | |
| if (getenv("_SOCKPUPPET_IN_SCOPE") != NULL) { | |
| fprintf(stderr, | |
| "[sockpuppet] scope bootstrap claimed success but no delegated " | |
| "cgroup root is active\n"); | |
| exit(1); | |
| } | |
| if (stat("/run/systemd/system", &st) < 0) { | |
| DBG("scope bootstrap skipped: systemd not available"); | |
| return; | |
| } | |
| if (find_executable_in_path("systemd-run", systemd_run_path, | |
| sizeof(systemd_run_path)) < 0) { | |
| DBG("scope bootstrap skipped: systemd-run not available"); | |
| return; | |
| } | |
| if (read_self_exe_path(self_path, sizeof(self_path)) < 0) | |
| die("readlink /proc/self/exe"); | |
| if (env_flag_enabled("SOCKPUPPET_SCOPE_SYSTEM") && geteuid() == 0) | |
| use_user_scope = 0; | |
| if (term != NULL && *term != '\0') | |
| ++extra_args; | |
| new_argv = calloc(argc_sz + extra_args + 20U, sizeof(*new_argv)); | |
| if (new_argv == NULL) | |
| die("calloc systemd-run argv"); | |
| new_argv[idx++] = systemd_run_path; | |
| if (use_user_scope) | |
| new_argv[idx++] = "--user"; | |
| new_argv[idx++] = "--scope"; | |
| new_argv[idx++] = "--quiet"; | |
| new_argv[idx++] = "--same-dir"; | |
| new_argv[idx++] = "--collect"; | |
| new_argv[idx++] = "--property=Delegate=yes"; | |
| new_argv[idx++] = "--property=MemoryMax=" SCOPE_MEMORY_MAX; | |
| new_argv[idx++] = "--property=MemoryHigh=" SCOPE_MEMORY_HIGH; | |
| new_argv[idx++] = "--property=TasksMax=" SCOPE_TASKS_MAX; | |
| new_argv[idx++] = "--property=CPUQuota=" SCOPE_CPU_QUOTA; | |
| new_argv[idx++] = "--property=LimitCORE=0"; | |
| new_argv[idx++] = "--property=LimitNOFILE=" SCOPE_NOFILE; | |
| new_argv[idx++] = "--setenv=_SOCKPUPPET_IN_SCOPE=1"; | |
| if (term != NULL && *term != '\0') { | |
| size_t term_len = strlen(term); | |
| term_env = malloc(term_len + sizeof("--setenv=TERM=")); | |
| if (term_env == NULL) | |
| die("malloc TERM env"); | |
| if (snprintf(term_env, term_len + sizeof("--setenv=TERM="), | |
| "--setenv=TERM=%s", term) >= | |
| (int)(term_len + sizeof("--setenv=TERM="))) | |
| die("TERM env too long"); | |
| new_argv[idx++] = term_env; | |
| } | |
| new_argv[idx++] = "--"; | |
| new_argv[idx++] = self_path; | |
| for (int i = 1; i < argc; ++i) | |
| new_argv[idx++] = argv[i]; | |
| new_argv[idx] = NULL; | |
| execv(systemd_run_path, new_argv); | |
| DBG("scope bootstrap skipped: exec systemd-run failed (%s)", | |
| strerror(errno)); | |
| free(term_env); | |
| free(new_argv); | |
| } | |
| static int read_self_cgroup_path(char *out, size_t out_sz) { | |
| FILE *fp = fopen("/proc/self/cgroup", "r"); | |
| char line[1024]; | |
| if (!fp) | |
| return -1; | |
| while (fgets(line, sizeof(line), fp) != NULL) { | |
| if (strncmp(line, "0::", 3) != 0) | |
| continue; | |
| char *path = line + 3; | |
| char *nl = strchr(path, '\n'); | |
| if (nl) | |
| *nl = '\0'; | |
| if (*path == '\0') | |
| path = "/"; | |
| if (snprintf(out, out_sz, "%s", path) >= (int)out_sz) { | |
| fclose(fp); | |
| return -1; | |
| } | |
| fclose(fp); | |
| return 0; | |
| } | |
| fclose(fp); | |
| return -1; | |
| } | |
| static int cgroup_write_file(const char *dir, const char *name, | |
| const char *value) { | |
| char path[PATH_MAX]; | |
| int fd; | |
| if (snprintf(path, sizeof(path), "%s/%s", dir, name) >= (int)sizeof(path)) | |
| return -1; | |
| fd = open(path, O_WRONLY | O_CLOEXEC); | |
| if (fd < 0) | |
| return -1; | |
| size_t len = strlen(value); | |
| ssize_t w = write(fd, value, len); | |
| int saved = errno; | |
| close(fd); | |
| if (w != (ssize_t)len) { | |
| errno = saved ? saved : EIO; | |
| return -1; | |
| } | |
| return 0; | |
| } | |
| static int cgroup_mkdir_leaf(const char *path) { | |
| if (mkdir(path, 0755) < 0 && errno != EEXIST) | |
| return -1; | |
| return 0; | |
| } | |
| static int cgroup_move_pid(const char *leaf, pid_t pid) { | |
| char pidbuf[32]; | |
| if (snprintf(pidbuf, sizeof(pidbuf), "%ld", (long)pid) >= (int)sizeof(pidbuf)) | |
| return -1; | |
| return cgroup_write_file(leaf, "cgroup.procs", pidbuf); | |
| } | |
| static int cgroup_has_controller(const char *controllers, const char *needle) { | |
| size_t nlen = strlen(needle); | |
| const char *p = controllers; | |
| while (*p) { | |
| while (*p == ' ' || *p == '\t' || *p == '\n') | |
| ++p; | |
| if (!*p) | |
| break; | |
| const char *start = p; | |
| while (*p && *p != ' ' && *p != '\t' && *p != '\n') | |
| ++p; | |
| size_t len = (size_t)(p - start); | |
| if (len == nlen && memcmp(start, needle, nlen) == 0) | |
| return 1; | |
| } | |
| return 0; | |
| } | |
| static int cgroup_enable_controllers(struct cgroup_ctx *ctx, int want_cpu, | |
| int want_memory, int want_pids) { | |
| char path[PATH_MAX]; | |
| char controllers[1024]; | |
| FILE *fp; | |
| char enable[128] = {0}; | |
| size_t off = 0; | |
| if (snprintf(path, sizeof(path), "%s/cgroup.controllers", ctx->root) >= | |
| (int)sizeof(path)) | |
| return -1; | |
| fp = fopen(path, "r"); | |
| if (!fp) | |
| return -1; | |
| if (!fgets(controllers, sizeof(controllers), fp)) { | |
| fclose(fp); | |
| return -1; | |
| } | |
| fclose(fp); | |
| ctx->cpu_enabled = want_cpu && cgroup_has_controller(controllers, "cpu"); | |
| ctx->memory_enabled = | |
| want_memory && cgroup_has_controller(controllers, "memory"); | |
| ctx->pids_enabled = want_pids && cgroup_has_controller(controllers, "pids"); | |
| if (ctx->cpu_enabled) | |
| off += (size_t)snprintf(enable + off, sizeof(enable) - off, "%s+cpu", | |
| off > 0 ? " " : ""); | |
| if (ctx->memory_enabled) | |
| off += (size_t)snprintf(enable + off, sizeof(enable) - off, "%s+memory", | |
| off > 0 ? " " : ""); | |
| if (ctx->pids_enabled) | |
| off += (size_t)snprintf(enable + off, sizeof(enable) - off, "%s+pids", | |
| off > 0 ? " " : ""); | |
| if (off == 0) | |
| return 0; | |
| if (off >= sizeof(enable)) | |
| return -1; | |
| if (cgroup_write_file(ctx->root, "cgroup.subtree_control", enable) < 0) | |
| return -1; | |
| DBG("cgroup controllers enabled:%s%s%s", ctx->cpu_enabled ? " cpu" : "", | |
| ctx->memory_enabled ? " memory" : "", ctx->pids_enabled ? " pids" : ""); | |
| return 0; | |
| } | |
| static int detect_delegated_cgroup_root(struct cgroup_ctx *ctx) { | |
| char rel[PATH_MAX]; | |
| char ctrl_path[PATH_MAX]; | |
| char subtree_path[PATH_MAX]; | |
| char delegate[8] = {0}; | |
| ssize_t xrc; | |
| memset(ctx, 0, sizeof(*ctx)); | |
| if (read_self_cgroup_path(rel, sizeof(rel)) < 0) | |
| return 0; | |
| if (strcmp(rel, "/") == 0) { | |
| if (snprintf(ctx->root, sizeof(ctx->root), "/sys/fs/cgroup") >= | |
| (int)sizeof(ctx->root)) | |
| return 0; | |
| } else { | |
| if (snprintf(ctx->root, sizeof(ctx->root), "/sys/fs/cgroup%s", rel) >= | |
| (int)sizeof(ctx->root)) | |
| return 0; | |
| } | |
| if (snprintf(ctrl_path, sizeof(ctrl_path), "%s/cgroup.controllers", ctx->root) >= | |
| (int)sizeof(ctrl_path) || | |
| snprintf(subtree_path, sizeof(subtree_path), "%s/cgroup.subtree_control", | |
| ctx->root) >= (int)sizeof(subtree_path)) | |
| return 0; | |
| if (access(ctrl_path, R_OK) < 0 || access(subtree_path, W_OK) < 0) | |
| return 0; | |
| xrc = getxattr(ctx->root, "user.delegate", delegate, sizeof(delegate) - 1); | |
| if (xrc > 0) { | |
| delegate[xrc] = '\0'; | |
| DBG("cgroup user.delegate=%s", delegate); | |
| } | |
| if (snprintf(ctx->broker, sizeof(ctx->broker), "%s/broker", ctx->root) >= | |
| (int)sizeof(ctx->broker) || | |
| snprintf(ctx->payload, sizeof(ctx->payload), "%s/payload", ctx->root) >= | |
| (int)sizeof(ctx->payload)) | |
| return 0; | |
| ctx->active = 1; | |
| return 1; | |
| } | |
| static int cgroup_root_has_foreign_procs(const struct cgroup_ctx *ctx) { | |
| char path[PATH_MAX]; | |
| FILE *fp; | |
| long pid; | |
| pid_t self = getpid(); | |
| if (snprintf(path, sizeof(path), "%s/cgroup.procs", ctx->root) >= | |
| (int)sizeof(path)) | |
| return 1; | |
| fp = fopen(path, "r"); | |
| if (!fp) | |
| return 1; | |
| while (fscanf(fp, "%ld", &pid) == 1) { | |
| if ((pid_t)pid != self) { | |
| fclose(fp); | |
| return 1; | |
| } | |
| } | |
| fclose(fp); | |
| return 0; | |
| } | |
| static void cgroup_rollback_setup(const struct cgroup_ctx *ctx) { | |
| if (cgroup_move_pid(ctx->root, getpid()) < 0) { | |
| DBG("cgroup containment rollback: failed to move self back to root (%s)", | |
| strerror(errno)); | |
| } | |
| if (rmdir(ctx->broker) < 0 && errno != ENOENT && errno != ENOTEMPTY) { | |
| DBG("cgroup containment rollback: failed to remove broker leaf (%s)", | |
| strerror(errno)); | |
| } | |
| if (rmdir(ctx->payload) < 0 && errno != ENOENT && errno != ENOTEMPTY) { | |
| DBG("cgroup containment rollback: failed to remove payload leaf (%s)", | |
| strerror(errno)); | |
| } | |
| } | |
| static void cgroup_apply_broker_limits(const struct cgroup_ctx *ctx) { | |
| if (ctx->memory_enabled) { | |
| if (cgroup_write_file(ctx->broker, "memory.low", BROKER_MEMORY_LOW) < 0 || | |
| cgroup_write_file(ctx->broker, "memory.high", BROKER_MEMORY_HIGH) < 0 || | |
| cgroup_write_file(ctx->broker, "memory.max", BROKER_MEMORY_MAX) < 0 || | |
| cgroup_write_file(ctx->broker, "memory.oom.group", "1") < 0) | |
| die("cgroup broker memory limits"); | |
| } | |
| if (ctx->pids_enabled && | |
| cgroup_write_file(ctx->broker, "pids.max", BROKER_PIDS_MAX) < 0) | |
| die("cgroup broker pids.max"); | |
| if (ctx->cpu_enabled && | |
| cgroup_write_file(ctx->broker, "cpu.weight", BROKER_CPU_WEIGHT) < 0) | |
| die("cgroup broker cpu.weight"); | |
| } | |
| static void cgroup_apply_payload_limits(const struct cgroup_ctx *ctx) { | |
| char path[PATH_MAX]; | |
| if (ctx->memory_enabled) { | |
| if (cgroup_write_file(ctx->payload, "memory.high", PAYLOAD_MEMORY_HIGH) < 0 || | |
| cgroup_write_file(ctx->payload, "memory.max", PAYLOAD_MEMORY_MAX) < 0 || | |
| cgroup_write_file(ctx->payload, "memory.oom.group", "1") < 0) | |
| die("cgroup payload memory limits"); | |
| if (snprintf(path, sizeof(path), "%s/memory.swap.max", ctx->payload) < | |
| (int)sizeof(path) && | |
| access(path, F_OK) == 0) { | |
| if (cgroup_write_file(ctx->payload, "memory.swap.max", "0") < 0) | |
| die("cgroup payload memory.swap.max"); | |
| } else { | |
| DBG("cgroup payload memory.swap.max not available, skipping"); | |
| } | |
| } | |
| if (ctx->pids_enabled && | |
| cgroup_write_file(ctx->payload, "pids.max", PAYLOAD_PIDS_MAX) < 0) | |
| die("cgroup payload pids.max"); | |
| if (ctx->cpu_enabled) { | |
| if (cgroup_write_file(ctx->payload, "cpu.weight", PAYLOAD_CPU_WEIGHT) < 0) | |
| die("cgroup payload cpu.weight"); | |
| if (snprintf(path, sizeof(path), "%s/cpu.max", ctx->payload) < | |
| (int)sizeof(path) && | |
| access(path, F_OK) == 0) { | |
| if (cgroup_write_file(ctx->payload, "cpu.max", PAYLOAD_CPU_MAX) < 0) | |
| die("cgroup payload cpu.max"); | |
| } else { | |
| DBG("cgroup payload cpu.max not available, skipping"); | |
| } | |
| } | |
| } | |
| static void cgroup_setup_containment(void) { | |
| struct cgroup_ctx ctx; | |
| if (!detect_delegated_cgroup_root(&ctx)) { | |
| DBG("cgroup containment inactive: no delegated writable subtree"); | |
| return; | |
| } | |
| DBG("cgroup containment active under %s", ctx.root); | |
| if (cgroup_root_has_foreign_procs(&ctx)) { | |
| DBG("cgroup containment inactive: delegated root contains foreign pids"); | |
| return; | |
| } | |
| if (cgroup_mkdir_leaf(ctx.broker) < 0) { | |
| if (errno == EACCES || errno == EPERM || errno == EROFS) { | |
| DBG("cgroup containment inactive: cannot create broker leaf (%s)", | |
| strerror(errno)); | |
| return; | |
| } | |
| die("cgroup mkdir broker"); | |
| } | |
| if (cgroup_move_pid(ctx.broker, getpid()) < 0) | |
| die("cgroup move self to broker"); | |
| if (cgroup_enable_controllers(&ctx, 1, 1, 1) < 0) { | |
| if (errno == EBUSY || errno == EACCES || errno == EPERM || | |
| errno == EROFS || errno == EOPNOTSUPP) { | |
| DBG("cgroup containment inactive: cannot enable controllers (%s)", | |
| strerror(errno)); | |
| cgroup_rollback_setup(&ctx); | |
| return; | |
| } | |
| die("cgroup enable controllers"); | |
| } | |
| if (cgroup_mkdir_leaf(ctx.payload) < 0) | |
| die("cgroup mkdir payload"); | |
| cgroup_apply_broker_limits(&ctx); | |
| cgroup_apply_payload_limits(&ctx); | |
| DBG("cgroup broker leaf: %s", ctx.broker); | |
| DBG("cgroup payload leaf: %s", ctx.payload); | |
| g_cgroup = ctx; | |
| } | |
| static void cgroup_move_child_to_payload(pid_t pid) { | |
| if (!g_cgroup.active) | |
| return; | |
| if (cgroup_move_pid(g_cgroup.payload, pid) < 0) | |
| die("cgroup move child to payload"); | |
| } | |
| static void mkdir_if_missing(const char *path, mode_t mode) { | |
| if (mkdir(path, mode) < 0 && errno != EEXIST) | |
| die(path); | |
| } | |
| #if defined(__has_include) | |
| #if __has_include(<linux/landlock.h>) | |
| #include <linux/landlock.h> | |
| #define SP_HAVE_LANDLOCK 1 | |
| #endif | |
| #endif | |
| #ifndef SP_HAVE_LANDLOCK | |
| #define SP_HAVE_LANDLOCK 0 | |
| #endif | |
| struct fs_sandbox { | |
| char resolved_cwd[PATH_MAX]; | |
| char final_root[PATH_MAX]; | |
| }; | |
| static void write_text_file(const char *path, const char *data) { | |
| int fd = open(path, O_CREAT | O_WRONLY | O_TRUNC | O_CLOEXEC, 0644); | |
| if (fd < 0) | |
| die(path); | |
| size_t len = strlen(data); | |
| if (write(fd, data, len) != (ssize_t)len) | |
| die("write"); | |
| close(fd); | |
| } | |
| static int path_exists(const char *path) { | |
| struct stat st; | |
| return stat(path, &st) == 0; | |
| } | |
| static void mkdir_parents(const char *path, mode_t mode) { | |
| char tmp[PATH_MAX]; | |
| size_t len = strlen(path); | |
| if (len >= sizeof(tmp)) | |
| die("mkdir_parents"); | |
| memcpy(tmp, path, len + 1); | |
| for (char *p = tmp + 1; *p; ++p) { | |
| if (*p != '/') | |
| continue; | |
| *p = '\0'; | |
| mkdir_if_missing(tmp, mode); | |
| *p = '/'; | |
| } | |
| mkdir_if_missing(tmp, mode); | |
| } | |
| static void ensure_parent_dir(const char *path, mode_t mode) { | |
| char tmp[PATH_MAX]; | |
| char *slash; | |
| if (strlen(path) >= sizeof(tmp)) | |
| die("ensure_parent_dir"); | |
| strcpy(tmp, path); | |
| slash = strrchr(tmp, '/'); | |
| if (!slash) | |
| return; | |
| if (slash == tmp) { | |
| mkdir_if_missing("/", mode); | |
| return; | |
| } | |
| *slash = '\0'; | |
| mkdir_parents(tmp, mode); | |
| } | |
| static int path_contains(const char *base, const char *path) { | |
| size_t len = strlen(base); | |
| if (strcmp(base, "/") == 0) | |
| return 1; | |
| if (strncmp(base, path, len) != 0) | |
| return 0; | |
| return path[len] == '\0' || path[len] == '/'; | |
| } | |
| static void path_append(char *dst, size_t dst_size, const char *root, | |
| const char *suffix) { | |
| if (snprintf(dst, dst_size, "%s%s", root, suffix) >= (int)dst_size) | |
| die("path too long"); | |
| } | |
| static void stage_path(char *dst, size_t dst_size, const char *base, | |
| const char *suffix) { | |
| path_append(dst, dst_size, base, suffix); | |
| } | |
| static void normalize_absolute_path(const char *path, char *out, | |
| size_t out_size) { | |
| const char *segments[PATH_MAX / 2]; | |
| size_t count = 0; | |
| char tmp[PATH_MAX]; | |
| char *save = NULL; | |
| char *tok; | |
| if (strlen(path) >= sizeof(tmp)) | |
| die("normalize path"); | |
| strcpy(tmp, path); | |
| for (tok = strtok_r(tmp, "/", &save); tok; tok = strtok_r(NULL, "/", &save)) { | |
| if (strcmp(tok, ".") == 0 || *tok == '\0') | |
| continue; | |
| if (strcmp(tok, "..") == 0) { | |
| if (count > 0) | |
| --count; | |
| continue; | |
| } | |
| segments[count++] = tok; | |
| } | |
| if (snprintf(out, out_size, "/") >= (int)out_size) | |
| die("normalize path"); | |
| for (size_t i = 0; i < count; ++i) { | |
| size_t used = strlen(out); | |
| if (snprintf(out + used, out_size - used, "%s%s", i == 0 ? "" : "/", | |
| segments[i]) >= (int)(out_size - used)) | |
| die("normalize path"); | |
| } | |
| } | |
| static void resolve_bind_target(const char *path, char *resolved, | |
| size_t resolved_size) { | |
| struct stat st; | |
| char link[PATH_MAX]; | |
| char base[PATH_MAX]; | |
| char combined[PATH_MAX]; | |
| ssize_t len; | |
| if (lstat(path, &st) < 0 || !S_ISLNK(st.st_mode)) { | |
| if (strlen(path) >= resolved_size) | |
| die("resolve bind target"); | |
| strcpy(resolved, path); | |
| return; | |
| } | |
| len = readlink(path, link, sizeof(link) - 1); | |
| if (len < 0) | |
| die("readlink"); | |
| link[len] = '\0'; | |
| if (link[0] == '/') { | |
| normalize_absolute_path(link, resolved, resolved_size); | |
| return; | |
| } | |
| if (strlen(path) >= sizeof(base)) | |
| die("resolve bind target"); | |
| strcpy(base, path); | |
| char *slash = strrchr(base, '/'); | |
| if (!slash) | |
| die("resolve bind target"); | |
| if (slash == base) { | |
| base[1] = '\0'; | |
| } else { | |
| *slash = '\0'; | |
| } | |
| if (snprintf(combined, sizeof(combined), "%s/%s", base, link) >= | |
| (int)sizeof(combined)) | |
| die("resolve bind target"); | |
| normalize_absolute_path(combined, resolved, resolved_size); | |
| } | |
| static void bind_mount_dir(const char *src, const char *dst, int readonly) { | |
| if (!path_exists(src)) | |
| return; | |
| mkdir_parents(dst, 0755); | |
| if (mount(src, dst, NULL, MS_BIND | MS_REC, NULL) < 0) | |
| die(src); | |
| if (!readonly) | |
| return; | |
| if (mount(NULL, dst, NULL, MS_BIND | MS_REMOUNT | MS_RDONLY | MS_REC, NULL) < | |
| 0) | |
| die(dst); | |
| } | |
| static void bind_mount_file(const char *src, const char *dst) { | |
| if (!path_exists(src)) | |
| return; | |
| ensure_parent_dir(dst, 0755); | |
| if (!path_exists(dst)) { | |
| int fd = open(dst, O_CREAT | O_WRONLY | O_TRUNC | O_CLOEXEC, 0644); | |
| if (fd < 0) | |
| die(dst); | |
| close(fd); | |
| } | |
| if (mount(src, dst, NULL, MS_BIND, NULL) < 0) | |
| die(src); | |
| } | |
| static void mount_procfs(const char *root) { | |
| char path[PATH_MAX]; | |
| path_append(path, sizeof(path), root, "/proc"); | |
| mkdir_parents(path, 0555); | |
| if (mount("proc", path, "proc", MS_NOSUID | MS_NODEV | MS_NOEXEC, "hidepid=2") < 0) | |
| die("mount proc"); | |
| } | |
| static void mount_private_tmp(const char *root) { | |
| char path[PATH_MAX]; | |
| path_append(path, sizeof(path), root, "/tmp"); | |
| mkdir_parents(path, 01777); | |
| if (mount("tmpfs", path, "tmpfs", MS_NODEV | MS_NOSUID, | |
| "mode=1777,size=64m") < 0) | |
| die("mount tmpfs /tmp"); | |
| } | |
| static void mount_minimal_dev(const char *root) { | |
| static const char *const dev_files[] = { | |
| "/dev/null", "/dev/zero", "/dev/full", "/dev/random", "/dev/urandom", | |
| }; | |
| char dev_root[PATH_MAX]; | |
| path_append(dev_root, sizeof(dev_root), root, "/dev"); | |
| mkdir_parents("/dev-min", 0755); | |
| for (size_t i = 0; i < sizeof(dev_files) / sizeof(dev_files[0]); ++i) { | |
| char src[PATH_MAX]; | |
| char dst[PATH_MAX]; | |
| path_append(src, sizeof(src), "/oldroot", dev_files[i]); | |
| path_append(dst, sizeof(dst), "/dev-min", dev_files[i] + 4); | |
| bind_mount_file(src, dst); | |
| } | |
| bind_mount_file("/oldroot/dev/tty", "/dev-min/tty"); | |
| mkdir_parents("/dev-min/pts", 0755); | |
| if (mount("devpts", "/dev-min/pts", "devpts", | |
| MS_NOSUID | MS_NOEXEC, "newinstance,ptmxmode=0666,mode=0620") < 0) | |
| die("mount devpts"); | |
| unlink("/dev-min/ptmx"); | |
| if (symlink("pts/ptmx", "/dev-min/ptmx") < 0) | |
| die("symlink /dev/ptmx"); | |
| mkdir_parents(dev_root, 0755); | |
| if (mount("/dev-min", dev_root, NULL, MS_BIND | MS_REC, NULL) < 0) | |
| die("mount /dev"); | |
| } | |
| static void mount_runtime_tree(const struct fs_sandbox *sandbox) { | |
| static const char *const runtime_dirs[] = { | |
| "/bin", "/sbin", "/usr", "/lib", "/lib64", "/etc", | |
| }; | |
| for (size_t i = 0; i < sizeof(runtime_dirs) / sizeof(runtime_dirs[0]); ++i) { | |
| char src[PATH_MAX]; | |
| char dst[PATH_MAX]; | |
| const char *path = runtime_dirs[i]; | |
| if (path_contains(sandbox->resolved_cwd, path)) | |
| continue; | |
| path_append(src, sizeof(src), "/oldroot", path); | |
| path_append(dst, sizeof(dst), sandbox->final_root, path); | |
| bind_mount_dir(src, dst, 1); | |
| } | |
| } | |
| static void mount_resolv_conf(const char *root) { | |
| char resolv_dst[PATH_MAX]; | |
| char mount_dst[PATH_MAX]; | |
| path_append(resolv_dst, sizeof(resolv_dst), root, "/etc/resolv.conf"); | |
| resolve_bind_target(resolv_dst, mount_dst, sizeof(mount_dst)); | |
| write_text_file("/resolv.conf.tmp", "nameserver 8.8.8.8\n"); | |
| bind_mount_file("/resolv.conf.tmp", mount_dst); | |
| } | |
| static void mount_overlay_cwd(struct fs_sandbox *sandbox) { | |
| char lower_src[PATH_MAX]; | |
| char opts[4096]; | |
| path_append(lower_src, sizeof(lower_src), "/oldroot", sandbox->resolved_cwd); | |
| bind_mount_dir(lower_src, "/lower", 0); | |
| mkdir_parents("/upper", 0700); | |
| mkdir_parents("/work", 0700); | |
| mkdir_parents("/merged", 0755); | |
| if (snprintf(opts, sizeof(opts), | |
| "lowerdir=/lower,upperdir=/upper,workdir=/work,userxattr") >= | |
| (int)sizeof(opts)) | |
| die("overlay options too long"); | |
| if (mount("overlay", "/merged", "overlay", 0, opts) < 0) { | |
| if (errno == EINVAL && | |
| (strcmp(sandbox->resolved_cwd, "/") == 0 || | |
| strcmp(sandbox->resolved_cwd, "/tmp") == 0)) { | |
| if (strcmp(sandbox->resolved_cwd, "/tmp") == 0) { | |
| mkdir_parents("/merged", 0755); | |
| if (mount("/lower", "/merged", NULL, MS_BIND | MS_REC, NULL) < 0) | |
| die("/merged"); | |
| } else { | |
| bind_mount_dir("/lower", "/merged", 1); | |
| } | |
| return; | |
| } | |
| die("mount overlay"); | |
| } | |
| } | |
| static void setup_final_root(struct fs_sandbox *sandbox) { | |
| char dst[PATH_MAX]; | |
| if (strcmp(sandbox->resolved_cwd, "/") == 0) { | |
| strcpy(sandbox->final_root, "/merged"); | |
| } else { | |
| strcpy(sandbox->final_root, "/sandbox"); | |
| mkdir_parents(sandbox->final_root, 0755); | |
| mount_runtime_tree(sandbox); | |
| if (!path_contains(sandbox->resolved_cwd, "/tmp")) | |
| mount_private_tmp(sandbox->final_root); | |
| path_append(dst, sizeof(dst), sandbox->final_root, sandbox->resolved_cwd); | |
| mkdir_parents(dst, 0755); | |
| if (mount("/merged", dst, NULL, MS_BIND | MS_REC, NULL) < 0) | |
| die("mount cwd overlay"); | |
| } | |
| if (strcmp(sandbox->resolved_cwd, "/") == 0 && | |
| !path_contains(sandbox->resolved_cwd, "/tmp")) | |
| mount_private_tmp(sandbox->final_root); | |
| mount_procfs(sandbox->final_root); | |
| mount_minimal_dev(sandbox->final_root); | |
| mount_resolv_conf(sandbox->final_root); | |
| } | |
| static void prepare_fs_sandbox(struct fs_sandbox *sandbox, const char *cwd, | |
| const char *base) { | |
| char path[PATH_MAX]; | |
| if (!unsafe_share_cwd && (strcmp(cwd, "/") == 0 || strncmp(cwd, "/home/", 6) == 0 || strcmp(cwd, "/root") == 0)) { | |
| fprintf(stderr, "Unsafe working directory %s. Use --unsafe-share-cwd\\n", cwd); | |
| exit(1); | |
| } | |
| if (!realpath(cwd, sandbox->resolved_cwd)) | |
| die("realpath cwd"); | |
| if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) | |
| die("mount MS_PRIVATE"); | |
| if (mount("tmpfs", base, "tmpfs", MS_NODEV | MS_NOSUID, | |
| "mode=0700,size=128m") < 0) | |
| die("mount tmpfs overlay base"); | |
| stage_path(path, sizeof(path), base, "/oldroot"); | |
| mkdir_parents(path, 0755); | |
| stage_path(path, sizeof(path), base, "/dev/net"); | |
| mkdir_parents(path, 0755); | |
| stage_path(path, sizeof(path), base, "/lower"); | |
| mkdir_parents(path, 0755); | |
| stage_path(path, sizeof(path), base, "/upper"); | |
| mkdir_parents(path, 0700); | |
| stage_path(path, sizeof(path), base, "/work"); | |
| mkdir_parents(path, 0700); | |
| stage_path(path, sizeof(path), base, "/merged"); | |
| mkdir_parents(path, 0755); | |
| stage_path(path, sizeof(path), base, "/sandbox"); | |
| mkdir_parents(path, 0755); | |
| stage_path(path, sizeof(path), base, "/dev-min"); | |
| mkdir_parents(path, 0755); | |
| stage_path(path, sizeof(path), base, "/resolv.conf.tmp"); | |
| ensure_parent_dir(path, 0755); | |
| if (chdir(base) < 0) | |
| die("chdir overlay base"); | |
| if (syscall(SYS_pivot_root, ".", "oldroot") < 0) | |
| die("pivot_root"); | |
| if (chdir("/") < 0) | |
| die("chdir /"); | |
| mkdir_parents("/dev", 0755); | |
| mkdir_parents("/dev/net", 0755); | |
| if (path_exists("/oldroot/dev/net")) { | |
| if (mount("/oldroot/dev/net", "/dev/net", NULL, MS_BIND | MS_REC, NULL) < 0) | |
| die("bind /dev/net"); | |
| } | |
| path_append(path, sizeof(path), "/oldroot", base); | |
| if (path_exists(path) && | |
| mount("tmpfs", path, "tmpfs", MS_NODEV | MS_NOSUID, | |
| "mode=0000,size=4k") < 0) | |
| die("hide overlay base"); | |
| mount_overlay_cwd(sandbox); | |
| setup_final_root(sandbox); | |
| } | |
| static void enter_fs_sandbox(const struct fs_sandbox *sandbox) { | |
| if (chroot(sandbox->final_root) < 0) | |
| die("chroot sandbox"); | |
| if (chdir(sandbox->resolved_cwd) < 0) | |
| die("chdir sandbox cwd"); | |
| } | |
| static int ensure_no_new_privs(void) { | |
| return prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); | |
| } | |
| static uint64_t landlock_read_exec_rights(void) { | |
| #if SP_HAVE_LANDLOCK | |
| return LANDLOCK_ACCESS_FS_EXECUTE | LANDLOCK_ACCESS_FS_READ_FILE | | |
| LANDLOCK_ACCESS_FS_READ_DIR; | |
| #else | |
| return 0; | |
| #endif | |
| } | |
| static uint64_t landlock_write_rights_for_abi(int abi) { | |
| uint64_t rights = 0; | |
| #if SP_HAVE_LANDLOCK | |
| (void)abi; | |
| rights = LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_REMOVE_DIR | | |
| LANDLOCK_ACCESS_FS_REMOVE_FILE | LANDLOCK_ACCESS_FS_MAKE_CHAR | | |
| LANDLOCK_ACCESS_FS_MAKE_DIR | LANDLOCK_ACCESS_FS_MAKE_REG | | |
| LANDLOCK_ACCESS_FS_MAKE_SOCK | LANDLOCK_ACCESS_FS_MAKE_FIFO | | |
| LANDLOCK_ACCESS_FS_MAKE_BLOCK | LANDLOCK_ACCESS_FS_MAKE_SYM; | |
| #ifdef LANDLOCK_ACCESS_FS_REFER | |
| if (abi >= 2) | |
| rights |= LANDLOCK_ACCESS_FS_REFER; | |
| #endif | |
| #ifdef LANDLOCK_ACCESS_FS_TRUNCATE | |
| if (abi >= 3) | |
| rights |= LANDLOCK_ACCESS_FS_TRUNCATE; | |
| #endif | |
| #else | |
| (void)abi; | |
| #endif | |
| return rights; | |
| } | |
| static int add_landlock_rule(int ruleset_fd, const char *path, | |
| uint64_t allowed_access) { | |
| #if SP_HAVE_LANDLOCK && defined(__NR_landlock_add_rule) | |
| int dirfd = open(path, O_PATH | O_CLOEXEC); | |
| if (dirfd < 0) | |
| return -1; | |
| struct landlock_path_beneath_attr rule = { | |
| .allowed_access = allowed_access, | |
| .parent_fd = dirfd, | |
| }; | |
| int rc = (int)syscall(__NR_landlock_add_rule, ruleset_fd, | |
| LANDLOCK_RULE_PATH_BENEATH, &rule, 0); | |
| close(dirfd); | |
| return rc; | |
| #else | |
| (void)ruleset_fd; | |
| (void)path; | |
| (void)allowed_access; | |
| errno = ENOSYS; | |
| return -1; | |
| #endif | |
| } | |
| static int apply_landlock_policy(const char *cwd_path) { | |
| #if SP_HAVE_LANDLOCK && defined(__NR_landlock_create_ruleset) && \ | |
| defined(__NR_landlock_restrict_self) | |
| int abi = (int)syscall(__NR_landlock_create_ruleset, NULL, 0, | |
| LANDLOCK_CREATE_RULESET_VERSION); | |
| if (abi < 0) { | |
| if (errno == ENOSYS || errno == EOPNOTSUPP) { | |
| fprintf(stderr, "[sockpuppet] Warning: Landlock not supported by kernel, continuing without filesystem sandbox\\n"); | |
| return 0; | |
| } | |
| return -1; | |
| } | |
| uint64_t read_exec = landlock_read_exec_rights(); | |
| uint64_t write_rights = landlock_write_rights_for_abi(abi); | |
| struct landlock_ruleset_attr ruleset = { | |
| .handled_access_fs = read_exec | write_rights, | |
| }; | |
| int ruleset_fd = (int)syscall(__NR_landlock_create_ruleset, &ruleset, | |
| sizeof(ruleset), 0); | |
| if (ruleset_fd < 0) { | |
| if (errno == ENOSYS || errno == EOPNOTSUPP) { | |
| fprintf(stderr, "[sockpuppet] Warning: Landlock not supported by kernel, continuing without filesystem sandbox\\n"); | |
| return 0; | |
| } | |
| return -1; | |
| } | |
| if (add_landlock_rule(ruleset_fd, "/", read_exec) < 0 || | |
| add_landlock_rule(ruleset_fd, cwd_path, read_exec | write_rights) < 0 || | |
| add_landlock_rule(ruleset_fd, "/tmp", read_exec | write_rights) < 0 || | |
| add_landlock_rule(ruleset_fd, "/dev", LANDLOCK_ACCESS_FS_READ_DIR) < 0 || | |
| add_landlock_rule(ruleset_fd, "/dev/pts", | |
| LANDLOCK_ACCESS_FS_READ_DIR | | |
| LANDLOCK_ACCESS_FS_READ_FILE | | |
| LANDLOCK_ACCESS_FS_WRITE_FILE) < 0 || | |
| add_landlock_rule(ruleset_fd, "/dev/null", | |
| LANDLOCK_ACCESS_FS_READ_FILE | | |
| LANDLOCK_ACCESS_FS_WRITE_FILE) < 0 || | |
| add_landlock_rule(ruleset_fd, "/dev/tty", | |
| LANDLOCK_ACCESS_FS_READ_FILE | | |
| LANDLOCK_ACCESS_FS_WRITE_FILE) < 0 || | |
| add_landlock_rule(ruleset_fd, "/dev/ptmx", | |
| LANDLOCK_ACCESS_FS_READ_FILE | | |
| LANDLOCK_ACCESS_FS_WRITE_FILE) < 0 || | |
| add_landlock_rule(ruleset_fd, "/dev/zero", | |
| LANDLOCK_ACCESS_FS_READ_FILE) < 0 || | |
| add_landlock_rule(ruleset_fd, "/dev/full", | |
| LANDLOCK_ACCESS_FS_READ_FILE) < 0 || | |
| add_landlock_rule(ruleset_fd, "/dev/random", | |
| LANDLOCK_ACCESS_FS_READ_FILE) < 0 || | |
| add_landlock_rule(ruleset_fd, "/dev/urandom", | |
| LANDLOCK_ACCESS_FS_READ_FILE) < 0) { | |
| close(ruleset_fd); | |
| return -1; | |
| } | |
| if (ensure_no_new_privs() < 0) { | |
| close(ruleset_fd); | |
| return -1; | |
| } | |
| if (syscall(__NR_landlock_restrict_self, ruleset_fd, 0) < 0) { | |
| close(ruleset_fd); | |
| return -1; | |
| } | |
| close(ruleset_fd); | |
| return 0; | |
| #else | |
| (void)cwd_path; | |
| return 0; | |
| #endif | |
| } | |
| #ifndef SECCOMP_RET_KILL_PROCESS | |
| #define SECCOMP_RET_KILL_PROCESS SECCOMP_RET_KILL | |
| #endif | |
| #ifndef __X32_SYSCALL_BIT | |
| #define __X32_SYSCALL_BIT 0x40000000U | |
| #endif | |
| #define SP_CLONE_NAMESPACE_FLAGS_BASE \ | |
| (CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS | CLONE_NEWIPC | \ | |
| CLONE_NEWUTS | CLONE_NEWPID) | |
| #ifdef CLONE_NEWCGROUP | |
| #define SP_CLONE_NAMESPACE_FLAGS_CGROUP | CLONE_NEWCGROUP | |
| #else | |
| #define SP_CLONE_NAMESPACE_FLAGS_CGROUP | |
| #endif | |
| #ifdef CLONE_NEWTIME | |
| #define SP_CLONE_NAMESPACE_FLAGS_TIME | CLONE_NEWTIME | |
| #else | |
| #define SP_CLONE_NAMESPACE_FLAGS_TIME | |
| #endif | |
| #define SP_CLONE_NAMESPACE_FLAGS \ | |
| (SP_CLONE_NAMESPACE_FLAGS_BASE SP_CLONE_NAMESPACE_FLAGS_CGROUP \ | |
| SP_CLONE_NAMESPACE_FLAGS_TIME) | |
| #define SP_SECCOMP_KILL SECCOMP_RET_KILL_PROCESS | |
| #define SP_SECCOMP_DENY_NR(nr) \ | |
| BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (unsigned int)(nr), 0, 1), \ | |
| BPF_STMT(BPF_RET | BPF_K, SP_SECCOMP_KILL) | |
| #define SP_SECCOMP_ERRNO_NR(nr, err) \ | |
| BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (unsigned int)(nr), 0, 1), \ | |
| BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | ((err) & SECCOMP_RET_DATA)) | |
| #define SP_SECCOMP_ALLOW() BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW) | |
| static int apply_child_seccomp(void) { | |
| #if defined(__x86_64__) | |
| static const struct sock_filter filter[] = { | |
| BPF_STMT(BPF_LD | BPF_W | BPF_ABS, | |
| (unsigned int)offsetof(struct seccomp_data, arch)), | |
| BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 1, 0), | |
| BPF_STMT(BPF_RET | BPF_K, SP_SECCOMP_KILL), | |
| BPF_STMT(BPF_LD | BPF_W | BPF_ABS, | |
| (unsigned int)offsetof(struct seccomp_data, nr)), | |
| BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, __X32_SYSCALL_BIT, 0, 1), | |
| BPF_STMT(BPF_RET | BPF_K, SP_SECCOMP_KILL), | |
| #ifdef __NR_unshare | |
| SP_SECCOMP_DENY_NR(__NR_unshare), | |
| #endif | |
| #ifdef __NR_setns | |
| SP_SECCOMP_DENY_NR(__NR_setns), | |
| #endif | |
| #ifdef __NR_mount | |
| SP_SECCOMP_DENY_NR(__NR_mount), | |
| #endif | |
| #ifdef __NR_umount2 | |
| SP_SECCOMP_DENY_NR(__NR_umount2), | |
| #endif | |
| #ifdef __NR_pivot_root | |
| SP_SECCOMP_DENY_NR(__NR_pivot_root), | |
| #endif | |
| #ifdef __NR_open_tree | |
| SP_SECCOMP_DENY_NR(__NR_open_tree), | |
| #endif | |
| #ifdef __NR_move_mount | |
| SP_SECCOMP_DENY_NR(__NR_move_mount), | |
| #endif | |
| #ifdef __NR_fsopen | |
| SP_SECCOMP_DENY_NR(__NR_fsopen), | |
| #endif | |
| #ifdef __NR_fsconfig | |
| SP_SECCOMP_DENY_NR(__NR_fsconfig), | |
| #endif | |
| #ifdef __NR_fsmount | |
| SP_SECCOMP_DENY_NR(__NR_fsmount), | |
| #endif | |
| #ifdef __NR_fspick | |
| SP_SECCOMP_DENY_NR(__NR_fspick), | |
| #endif | |
| #ifdef __NR_mount_setattr | |
| SP_SECCOMP_DENY_NR(__NR_mount_setattr), | |
| #endif | |
| #ifdef __NR_bpf | |
| SP_SECCOMP_DENY_NR(__NR_bpf), | |
| #endif | |
| #ifdef __NR_perf_event_open | |
| SP_SECCOMP_DENY_NR(__NR_perf_event_open), | |
| #endif | |
| #ifdef __NR_userfaultfd | |
| SP_SECCOMP_DENY_NR(__NR_userfaultfd), | |
| #endif | |
| #ifdef __NR_ptrace | |
| SP_SECCOMP_DENY_NR(__NR_ptrace), | |
| #endif | |
| #ifdef __NR_init_module | |
| SP_SECCOMP_DENY_NR(__NR_init_module), | |
| #endif | |
| #ifdef __NR_finit_module | |
| SP_SECCOMP_DENY_NR(__NR_finit_module), | |
| #endif | |
| #ifdef __NR_delete_module | |
| SP_SECCOMP_DENY_NR(__NR_delete_module), | |
| #endif | |
| #ifdef __NR_kexec_load | |
| SP_SECCOMP_DENY_NR(__NR_kexec_load), | |
| #endif | |
| #ifdef __NR_io_uring_setup | |
| SP_SECCOMP_ERRNO_NR(__NR_io_uring_setup, EPERM), | |
| #endif | |
| #ifdef __NR_io_uring_enter | |
| SP_SECCOMP_ERRNO_NR(__NR_io_uring_enter, EPERM), | |
| #endif | |
| #ifdef __NR_io_uring_register | |
| SP_SECCOMP_ERRNO_NR(__NR_io_uring_register, EPERM), | |
| #endif | |
| #ifdef __NR_process_vm_readv | |
| SP_SECCOMP_DENY_NR(__NR_process_vm_readv), | |
| #endif | |
| #ifdef __NR_process_vm_writev | |
| SP_SECCOMP_DENY_NR(__NR_process_vm_writev), | |
| #endif | |
| #ifdef __NR_keyctl | |
| SP_SECCOMP_DENY_NR(__NR_keyctl), | |
| #endif | |
| #ifdef __NR_add_key | |
| SP_SECCOMP_DENY_NR(__NR_add_key), | |
| #endif | |
| #ifdef __NR_request_key | |
| SP_SECCOMP_DENY_NR(__NR_request_key), | |
| #endif | |
| #ifdef __NR_ioctl | |
| BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (unsigned int)__NR_ioctl, 0, 6), | |
| BPF_STMT(BPF_LD | BPF_W | BPF_ABS, | |
| (unsigned int)offsetof(struct seccomp_data, args[1])), | |
| BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (unsigned int)TIOCSTI, 0, 1), | |
| BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | (EPERM & SECCOMP_RET_DATA)), | |
| BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (unsigned int)TIOCLINUX, 0, 1), | |
| BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | (EPERM & SECCOMP_RET_DATA)), | |
| SP_SECCOMP_ALLOW(), | |
| #endif | |
| #ifdef __NR_clone | |
| BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (unsigned int)__NR_clone, 0, 4), | |
| BPF_STMT(BPF_LD | BPF_W | BPF_ABS, | |
| (unsigned int)offsetof(struct seccomp_data, args[0])), | |
| BPF_STMT(BPF_ALU | BPF_AND | BPF_K, (unsigned int)SP_CLONE_NAMESPACE_FLAGS), | |
| BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 1, 0), | |
| BPF_STMT(BPF_RET | BPF_K, SP_SECCOMP_KILL), | |
| #endif | |
| BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), | |
| }; | |
| struct sock_fprog prog = { | |
| .len = (unsigned short)(sizeof(filter) / sizeof(filter[0])), | |
| .filter = (struct sock_filter *)filter, | |
| }; | |
| if (ensure_no_new_privs() < 0) | |
| return -1; | |
| return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); | |
| #else | |
| fprintf(stderr, "[sockpuppet] Warning: Seccomp not supported on this architecture, continuing without syscall filtering\\n"); | |
| return 0; | |
| #endif | |
| } | |
| /* ---------- epoll helpers ---------- */ | |
| static void epoll_add_tcp(struct tcp_flow *f) { | |
| if (f->sock < 0 || g_epfd < 0) | |
| return; | |
| f->ew.type = FD_TCP; | |
| f->ew.fd = f->sock; | |
| f->ew.flow = f; | |
| struct epoll_event ev = {.events = EPOLLIN, .data.ptr = &f->ew}; | |
| if (epoll_ctl(g_epfd, EPOLL_CTL_ADD, f->sock, &ev) < 0) | |
| die("epoll_ctl add tcp"); | |
| } | |
| static void epoll_mod_tcp(struct tcp_flow *f, uint32_t events) { | |
| if (f->sock < 0 || g_epfd < 0) | |
| return; | |
| struct epoll_event ev = {.events = events, .data.ptr = &f->ew}; | |
| if (epoll_ctl(g_epfd, EPOLL_CTL_MOD, f->sock, &ev) < 0) | |
| perror("epoll_ctl mod tcp"); | |
| } | |
| static void epoll_add_udp(struct udp_flow *f) { | |
| if (f->udp_relay < 0 || g_epfd < 0) | |
| return; | |
| f->ew.type = FD_UDP_RELAY; | |
| f->ew.fd = f->udp_relay; | |
| f->ew.flow = f; | |
| struct epoll_event ev = {.events = EPOLLIN | EPOLLRDHUP | EPOLLERR | EPOLLHUP, | |
| .data.ptr = &f->ew}; | |
| if (epoll_ctl(g_epfd, EPOLL_CTL_ADD, f->udp_relay, &ev) < 0) | |
| die("epoll_ctl add udp"); | |
| } | |
| static void epoll_mod_udp(struct udp_flow *f, uint32_t events) { | |
| if (f->udp_relay < 0 || g_epfd < 0) | |
| return; | |
| struct epoll_event ev = {.events = events, .data.ptr = &f->ew}; | |
| if (epoll_ctl(g_epfd, EPOLL_CTL_MOD, f->udp_relay, &ev) < 0) | |
| perror("epoll_ctl mod udp"); | |
| } | |
| static void epoll_add_udp_ctrl(struct udp_flow *f, uint32_t events) { | |
| if (f->tcp_ctrl < 0 || g_epfd < 0) | |
| return; | |
| f->ctrl_ew.type = FD_UDP_CTRL; | |
| f->ctrl_ew.fd = f->tcp_ctrl; | |
| f->ctrl_ew.flow = f; | |
| struct epoll_event ev = {.events = events, .data.ptr = &f->ctrl_ew}; | |
| if (epoll_ctl(g_epfd, EPOLL_CTL_ADD, f->tcp_ctrl, &ev) < 0) | |
| die("epoll_ctl add udp ctrl"); | |
| } | |
| static void epoll_mod_udp_ctrl(struct udp_flow *f, uint32_t events) { | |
| if (f->tcp_ctrl < 0 || g_epfd < 0) | |
| return; | |
| struct epoll_event ev = {.events = events, .data.ptr = &f->ctrl_ew}; | |
| if (epoll_ctl(g_epfd, EPOLL_CTL_MOD, f->tcp_ctrl, &ev) < 0) | |
| perror("epoll_ctl mod udp ctrl"); | |
| } | |
| static void epoll_del(int fd) { | |
| if (fd >= 0 && g_epfd >= 0) | |
| epoll_ctl(g_epfd, EPOLL_CTL_DEL, fd, NULL); | |
| } | |
| static int set_nonblocking(int fd) { | |
| int flags = fcntl(fd, F_GETFL, 0); | |
| if (flags < 0) | |
| return -1; | |
| return fcntl(fd, F_SETFL, flags | O_NONBLOCK); | |
| } | |
| static int interactive_open_pty_master(void) { | |
| return posix_openpt(O_RDWR | O_NOCTTY | O_CLOEXEC); | |
| } | |
| static int interactive_sync_winsize(struct interactive_session *session) { | |
| if (!session || !session->active || session->host_tty_fd < 0 || | |
| session->pty_master_fd < 0) | |
| return 0; | |
| if (ioctl(session->host_tty_fd, TIOCGWINSZ, &session->host_winsize) < 0) { | |
| if (errno == ENOTTY) | |
| return 0; | |
| return -1; | |
| } | |
| session->host_winsize_saved = 1; | |
| if (ioctl(session->pty_master_fd, TIOCSWINSZ, &session->host_winsize) < 0) | |
| return -1; | |
| return 0; | |
| } | |
| static void interactive_restore_terminal(struct interactive_session *session) { | |
| if (!session || session->host_tty_fd < 0 || !session->host_termios_saved) | |
| return; | |
| IGNORE_RESULT(tcsetattr(session->host_tty_fd, TCSAFLUSH, | |
| &session->host_termios)); | |
| } | |
| static void interactive_close_session(struct interactive_session *session) { | |
| if (!session) | |
| return; | |
| interactive_restore_terminal(session); | |
| if (session->pty_master_fd >= 0) { | |
| close(session->pty_master_fd); | |
| session->pty_master_fd = -1; | |
| } | |
| if (session->pty_slave_fd >= 0) { | |
| close(session->pty_slave_fd); | |
| session->pty_slave_fd = -1; | |
| } | |
| if (session->host_tty_fd >= 0) { | |
| close(session->host_tty_fd); | |
| session->host_tty_fd = -1; | |
| } | |
| session->active = 0; | |
| } | |
| static int interactive_parent_setup(struct interactive_session *session) { | |
| struct termios raw; | |
| memset(session, 0, sizeof(*session)); | |
| session->host_tty_fd = -1; | |
| session->pty_master_fd = -1; | |
| session->pty_slave_fd = -1; | |
| session->host_tty_fd = open("/dev/tty", O_RDWR | O_NOCTTY | O_CLOEXEC); | |
| if (session->host_tty_fd < 0) | |
| return -1; | |
| if (!isatty(session->host_tty_fd)) { | |
| errno = ENOTTY; | |
| return -1; | |
| } | |
| session->pty_master_fd = interactive_open_pty_master(); | |
| if (session->pty_master_fd < 0) | |
| return -1; | |
| if (grantpt(session->pty_master_fd) < 0) | |
| return -1; | |
| if (unlockpt(session->pty_master_fd) < 0) | |
| return -1; | |
| #ifdef TIOCGPTPEER | |
| session->pty_slave_fd = | |
| ioctl(session->pty_master_fd, TIOCGPTPEER, O_RDWR | O_NOCTTY | O_CLOEXEC); | |
| if (session->pty_slave_fd < 0) | |
| return -1; | |
| #else | |
| errno = ENOTSUP; | |
| return -1; | |
| #endif | |
| if (tcgetattr(session->host_tty_fd, &session->host_termios) < 0) | |
| return -1; | |
| session->host_termios_saved = 1; | |
| raw = session->host_termios; | |
| cfmakeraw(&raw); | |
| if (tcsetattr(session->host_tty_fd, TCSAFLUSH, &raw) < 0) | |
| return -1; | |
| session->active = 1; | |
| if (interactive_sync_winsize(session) < 0) | |
| return -1; | |
| return 0; | |
| } | |
| static void interactive_child_setup(const struct interactive_session *session) { | |
| if (!session || session->pty_slave_fd < 0) { | |
| errno = EBADF; | |
| die("interactive child setup"); | |
| } | |
| if (setsid() < 0) | |
| die("setsid"); | |
| if (ioctl(session->pty_slave_fd, TIOCSCTTY, 0) < 0) | |
| die("TIOCSCTTY"); | |
| if (dup2(session->pty_slave_fd, STDIN_FILENO) < 0) | |
| die("dup2 stdin"); | |
| if (dup2(session->pty_slave_fd, STDOUT_FILENO) < 0) | |
| die("dup2 stdout"); | |
| if (dup2(session->pty_slave_fd, STDERR_FILENO) < 0) | |
| die("dup2 stderr"); | |
| } | |
| static int start_nonblocking_connect(int fd, const struct sockaddr_in *addr) { | |
| if (set_nonblocking(fd) < 0) | |
| return -1; | |
| if (connect(fd, (const struct sockaddr *)addr, sizeof(*addr)) == 0) | |
| return 0; | |
| if (errno == EINPROGRESS) | |
| return 1; | |
| return -1; | |
| } | |
| static int socket_connect_complete(int fd) { | |
| int so_error = 0; | |
| socklen_t len = sizeof(so_error); | |
| if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &so_error, &len) < 0) | |
| return -1; | |
| if (so_error != 0) { | |
| errno = so_error; | |
| return -1; | |
| } | |
| return 0; | |
| } | |
| static void socks_io_reset(struct socks_io *io) { memset(io, 0, sizeof(*io)); } | |
| static int socks_has_pending_tx(const struct socks_io *io) { | |
| return io->tx_off < io->tx_len; | |
| } | |
| static size_t socks_response_need(const struct socks_io *io) { | |
| if (io->state == SOCKS_IO_METHOD || io->state == SOCKS_IO_AUTH) | |
| return 2; | |
| if (io->state != SOCKS_IO_REQUEST) | |
| return 0; | |
| if (io->rx_len < 4) | |
| return 4; | |
| if (io->rxbuf[3] == 0x01) | |
| return 10; | |
| if (io->rxbuf[3] == 0x04) | |
| return 22; | |
| if (io->rxbuf[3] == 0x03) { | |
| if (io->rx_len < 5) | |
| return 5; | |
| size_t total = (size_t)(4 + 1 + io->rxbuf[4] + 2); | |
| if (total > sizeof(io->rxbuf)) | |
| return sizeof(io->rxbuf) + 1; /* Trigger failure check */ | |
| return total; | |
| } | |
| return 4; | |
| } | |
| static void socks_consume_rx(struct socks_io *io, size_t used) { | |
| if (used >= io->rx_len) { | |
| io->rx_len = 0; | |
| return; | |
| } | |
| memmove(io->rxbuf, io->rxbuf + used, io->rx_len - used); | |
| io->rx_len -= used; | |
| } | |
| static int socks_queue_send(struct socks_io *io, const uint8_t *data, size_t len) { | |
| if (len > sizeof(io->txbuf)) | |
| return -1; | |
| memcpy(io->txbuf, data, len); | |
| io->tx_off = 0; | |
| io->tx_len = len; | |
| return 0; | |
| } | |
| static int socks_queue_greeting(struct socks_io *io, const struct socks_config *cfg) { | |
| uint8_t greeting[4]; | |
| if (cfg->username[0] != '\0') { | |
| greeting[0] = 0x05; | |
| greeting[1] = 0x02; | |
| greeting[2] = 0x00; | |
| greeting[3] = 0x02; | |
| return socks_queue_send(io, greeting, 4); | |
| } | |
| greeting[0] = 0x05; | |
| greeting[1] = 0x01; | |
| greeting[2] = 0x00; | |
| return socks_queue_send(io, greeting, 3); | |
| } | |
| static int socks_queue_auth(struct socks_io *io, const struct socks_config *cfg) { | |
| size_t ulen = strlen(cfg->username); | |
| size_t plen = strlen(cfg->password); | |
| uint8_t auth[513]; | |
| size_t off = 0; | |
| if (ulen > 255 || plen > 255) | |
| return -1; | |
| auth[off++] = 0x01; | |
| auth[off++] = (uint8_t)ulen; | |
| memcpy(auth + off, cfg->username, ulen); | |
| off += ulen; | |
| auth[off++] = (uint8_t)plen; | |
| memcpy(auth + off, cfg->password, plen); | |
| off += plen; | |
| return socks_queue_send(io, auth, off); | |
| } | |
| static int socks_queue_request(struct socks_io *io) { | |
| uint8_t req[10]; | |
| uint16_t port_be = htons(io->target_port); | |
| req[0] = 0x05; | |
| req[1] = io->is_udp ? 0x03 : 0x01; | |
| req[2] = 0x00; | |
| req[3] = 0x01; | |
| memcpy(req + 4, &io->target_ip, 4); | |
| memcpy(req + 8, &port_be, 2); | |
| return socks_queue_send(io, req, sizeof(req)); | |
| } | |
| static int socks_begin_handshake(struct socks_io *io, | |
| const struct socks_config *cfg) { | |
| if (socks_queue_greeting(io, cfg) < 0) | |
| return -1; | |
| io->state = SOCKS_IO_METHOD; | |
| return 0; | |
| } | |
| static int socks_flush_tx(int fd, struct socks_io *io) { | |
| while (socks_has_pending_tx(io)) { | |
| ssize_t sent = | |
| send(fd, io->txbuf + io->tx_off, io->tx_len - io->tx_off, MSG_NOSIGNAL); | |
| if (sent > 0) { | |
| io->tx_off += (size_t)sent; | |
| continue; | |
| } | |
| if (sent < 0 && errno == EINTR) | |
| continue; | |
| if (sent < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) | |
| return 0; | |
| return -1; | |
| } | |
| io->tx_off = 0; | |
| io->tx_len = 0; | |
| return 0; | |
| } | |
| static int socks_process_rx(struct socks_io *io, const struct socks_config *cfg, | |
| struct sockaddr_in *relay_addr) { | |
| for (;;) { | |
| size_t need = socks_response_need(io); | |
| if (io->state == SOCKS_IO_READY) | |
| return 1; | |
| if (need > sizeof(io->rxbuf)) { | |
| io->state = SOCKS_IO_FAILED; | |
| return -1; | |
| } | |
| if (need == 0 || io->rx_len < need) | |
| return 0; | |
| if (io->state == SOCKS_IO_METHOD) { | |
| if (io->rxbuf[0] != 0x05) { | |
| io->state = SOCKS_IO_FAILED; | |
| return -1; | |
| } | |
| if (io->rxbuf[1] == 0x00) { | |
| socks_consume_rx(io, 2); | |
| if (socks_queue_request(io) < 0) { | |
| io->state = SOCKS_IO_FAILED; | |
| return -1; | |
| } | |
| io->state = SOCKS_IO_REQUEST; | |
| continue; | |
| } | |
| if (io->rxbuf[1] == 0x02 && cfg->username[0] != '\0') { | |
| socks_consume_rx(io, 2); | |
| if (socks_queue_auth(io, cfg) < 0) { | |
| io->state = SOCKS_IO_FAILED; | |
| return -1; | |
| } | |
| io->state = SOCKS_IO_AUTH; | |
| continue; | |
| } | |
| io->state = SOCKS_IO_FAILED; | |
| return -1; | |
| } | |
| if (io->state == SOCKS_IO_AUTH) { | |
| if (io->rxbuf[0] != 0x01 || io->rxbuf[1] != 0x00) { | |
| io->state = SOCKS_IO_FAILED; | |
| return -1; | |
| } | |
| socks_consume_rx(io, 2); | |
| if (socks_queue_request(io) < 0) { | |
| io->state = SOCKS_IO_FAILED; | |
| return -1; | |
| } | |
| io->state = SOCKS_IO_REQUEST; | |
| continue; | |
| } | |
| if (io->state == SOCKS_IO_REQUEST) { | |
| uint8_t atyp; | |
| size_t used = need; | |
| if (io->rxbuf[0] != 0x05 || io->rxbuf[1] != 0x00) { | |
| io->state = SOCKS_IO_FAILED; | |
| return -1; | |
| } | |
| atyp = io->rxbuf[3]; | |
| if (relay_addr) { | |
| memset(relay_addr, 0, sizeof(*relay_addr)); | |
| relay_addr->sin_family = AF_INET; | |
| if (atyp == 0x01 && need >= 10) { | |
| memcpy(&relay_addr->sin_addr.s_addr, io->rxbuf + 4, 4); | |
| relay_addr->sin_port = htons( | |
| (uint16_t)((io->rxbuf[8] << 8) | io->rxbuf[9])); | |
| } else if (atyp == 0x03 && need >= 7) { | |
| relay_addr->sin_addr = cfg->addr.sin_addr; | |
| relay_addr->sin_port = htons( | |
| (uint16_t)((io->rxbuf[need - 2] << 8) | io->rxbuf[need - 1])); | |
| } else if (atyp == 0x04) { | |
| io->state = SOCKS_IO_FAILED; | |
| return -1; | |
| } else { | |
| io->state = SOCKS_IO_FAILED; | |
| return -1; | |
| } | |
| if (relay_addr->sin_addr.s_addr == 0 || | |
| relay_addr->sin_addr.s_addr == htonl(0x7f000001)) { | |
| relay_addr->sin_addr = cfg->addr.sin_addr; | |
| } | |
| DBG("SOCKS UDP relay ready at %s:%d", inet_ntoa(relay_addr->sin_addr), | |
| ntohs(relay_addr->sin_port)); | |
| } | |
| socks_consume_rx(io, used); | |
| io->state = SOCKS_IO_READY; | |
| return 1; | |
| } | |
| } | |
| } | |
| static int socks_recv_and_process(int fd, struct socks_io *io, | |
| const struct socks_config *cfg, | |
| struct sockaddr_in *relay_addr) { | |
| for (;;) { | |
| ssize_t r = recv(fd, io->rxbuf + io->rx_len, sizeof(io->rxbuf) - io->rx_len, | |
| 0); | |
| if (r > 0) { | |
| io->rx_len += (size_t)r; | |
| if (socks_process_rx(io, cfg, relay_addr) != 0) | |
| return (io->state == SOCKS_IO_READY) ? 1 : -1; | |
| continue; | |
| } | |
| if (r == 0) | |
| return -1; | |
| if (errno == EINTR) | |
| continue; | |
| if (errno == EAGAIN || errno == EWOULDBLOCK) | |
| return 0; | |
| return -1; | |
| } | |
| } | |
| /* Drop all capabilities (for rootless mode) */ | |
| static void drop_caps(void) { | |
| struct __user_cap_header_struct hdr = { | |
| .version = _LINUX_CAPABILITY_VERSION_3, | |
| .pid = 0, | |
| }; | |
| struct __user_cap_data_struct data[2] = {{0}}; | |
| struct __user_cap_data_struct verify[2] = {{0}}; | |
| /* Drop bounding set first while we still have CAP_SETPCAP. | |
| * EINVAL is expected for capability numbers the kernel doesn't know. */ | |
| for (int cap = 0; cap <= CAP_LAST_CAP; cap++) { | |
| if (prctl(PR_CAPBSET_DROP, cap, 0, 0, 0) < 0 && errno != EINVAL) { | |
| perror("PR_CAPBSET_DROP"); | |
| exit(1); | |
| } | |
| } | |
| /* Clear ambient capabilities - EINVAL expected if not supported */ | |
| if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0) < 0 && | |
| errno != EINVAL) { | |
| perror("PR_CAP_AMBIENT_CLEAR_ALL"); | |
| exit(1); | |
| } | |
| /* Verify bounding set is empty */ | |
| for (int cap = 0; cap <= CAP_LAST_CAP; cap++) { | |
| int rc = prctl(PR_CAPBSET_READ, cap, 0, 0, 0); | |
| if (rc > 0) { | |
| fprintf(stderr, "Capability %d survived drop\n", cap); | |
| exit(1); | |
| } | |
| } | |
| /* Now clear all capability sets */ | |
| if (syscall(SYS_capset, &hdr, data) < 0) { | |
| perror("capset"); | |
| exit(1); | |
| } | |
| if (syscall(SYS_capget, &hdr, verify) < 0) { | |
| perror("capget verify"); | |
| exit(1); | |
| } | |
| if (verify[0].effective || verify[0].permitted || verify[0].inheritable || | |
| verify[1].effective || verify[1].permitted || verify[1].inheritable) { | |
| fprintf(stderr, "capabilities survived drop\n"); | |
| exit(1); | |
| } | |
| /* Disable core dumps (prevents leaking sensitive data) */ | |
| if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) { | |
| perror("PR_SET_DUMPABLE"); | |
| exit(1); | |
| } | |
| } | |
| /* Parse SOCKS URL: [socks5://]host:port */ | |
| static void parse_socks_auth_file(const char *path) { | |
| FILE *f = fopen(path, "r"); | |
| if (!f) die("fopen socks auth file"); | |
| char buf[256]; | |
| if (!fgets(buf, sizeof(buf), f)) die("read socks auth file"); | |
| fclose(f); | |
| char *newline = strchr(buf, '\n'); | |
| if (newline) *newline = '\0'; | |
| char *colon = strchr(buf, ':'); | |
| if (!colon) die("invalid auth file format"); | |
| *colon = '\0'; | |
| snprintf(socks_proxy.username, sizeof(socks_proxy.username), "%s", buf); | |
| snprintf(socks_proxy.password, sizeof(socks_proxy.password), "%s", colon + 1); | |
| } | |
| static int parse_socks_url(const char *url, struct socks_config *cfg) { | |
| const char *p = url; | |
| const char *colon; | |
| long port; | |
| memset(cfg, 0, sizeof(*cfg)); | |
| cfg->port = 1080; /* default SOCKS port */ | |
| /* Skip protocol prefix if present */ | |
| if (strncmp(p, "socks5://", 9) == 0) | |
| p += 9; | |
| else if (strncmp(p, "socks://", 8) == 0) | |
| p += 8; | |
| else if (strncmp(p, "socks5h://", 10) == 0) | |
| p += 10; | |
| if (strchr(p, '@')) { | |
| fprintf(stderr, "Credentials in SOCKS URL are not allowed for security reasons.\\n"); | |
| exit(1); | |
| } | |
| /* Parse host:port */ | |
| colon = strrchr(p, ':'); | |
| if (colon) { | |
| size_t hlen = (size_t)(colon - p); | |
| if (hlen == 0 || hlen >= sizeof(cfg->host) || | |
| parse_long_strict(colon + 1, 1, 65535, &port) < 0) { | |
| fprintf(stderr, "Invalid SOCKS proxy: %s\n", url); | |
| exit(1); | |
| } | |
| memcpy(cfg->host, p, hlen); | |
| cfg->host[hlen] = '\0'; | |
| cfg->port = (int)port; | |
| } else { | |
| size_t hlen = strlen(p); | |
| if (hlen == 0 || hlen >= sizeof(cfg->host)) { | |
| fprintf(stderr, "Invalid SOCKS proxy: %s\n", url); | |
| exit(1); | |
| } | |
| memcpy(cfg->host, p, hlen); | |
| cfg->host[hlen] = '\0'; | |
| } | |
| cfg->enabled = (cfg->host[0] != '\0'); | |
| return cfg->enabled; | |
| } | |
| static int resolve_socks_proxy(struct socks_config *cfg) { | |
| struct sockaddr_in proxy_addr = { | |
| .sin_family = AF_INET, | |
| .sin_port = htons((uint16_t)cfg->port), | |
| }; | |
| if (!cfg->enabled) | |
| return 0; | |
| if (cfg->addr_valid) | |
| return 0; | |
| if (inet_pton(AF_INET, cfg->host, &proxy_addr.sin_addr) <= 0) { | |
| struct addrinfo hints = {.ai_family = AF_INET, .ai_socktype = SOCK_STREAM}; | |
| struct addrinfo *res; | |
| if (getaddrinfo(cfg->host, NULL, &hints, &res) != 0) | |
| return -1; | |
| proxy_addr.sin_addr = ((struct sockaddr_in *)res->ai_addr)->sin_addr; | |
| freeaddrinfo(res); | |
| } | |
| cfg->addr = proxy_addr; | |
| cfg->addr_valid = 1; | |
| return 0; | |
| } | |
| extern char **environ; | |
| static void close_extra_fds_for_exec(void) { | |
| #ifdef __NR_close_range | |
| if (syscall(__NR_close_range, 3U, ~0U, 0U) == 0) | |
| return; | |
| if (errno != ENOSYS && errno != EINVAL) | |
| die("close_range"); | |
| #endif | |
| DIR *dir = opendir("/proc/self/fd"); | |
| if (dir != NULL) { | |
| int scan_fd = dirfd(dir); | |
| struct dirent *ent; | |
| int *fds = NULL; | |
| size_t fds_len = 0; | |
| size_t fds_cap = 0; | |
| while ((ent = readdir(dir)) != NULL) { | |
| char *end = NULL; | |
| long fd = strtol(ent->d_name, &end, 10); | |
| if (end == NULL || *end != '\0') | |
| continue; | |
| if (fd <= 2 || fd == scan_fd) | |
| continue; | |
| if (fds_len == fds_cap) { | |
| size_t new_cap = (fds_cap == 0) ? 16 : fds_cap * 2; | |
| int *new_fds = realloc(fds, new_cap * sizeof(*new_fds)); | |
| if (new_fds == NULL) { | |
| free(fds); | |
| closedir(dir); | |
| die("realloc close fd list"); | |
| } | |
| fds = new_fds; | |
| fds_cap = new_cap; | |
| } | |
| fds[fds_len++] = (int)fd; | |
| } | |
| closedir(dir); | |
| for (size_t i = 0; i < fds_len; ++i) | |
| close(fds[i]); | |
| free(fds); | |
| return; | |
| } | |
| long maxfd = sysconf(_SC_OPEN_MAX); | |
| if (maxfd < 0) | |
| maxfd = 256; | |
| for (int fd = 3; fd < maxfd; ++fd) | |
| close(fd); | |
| } | |
| static int env_name_matches(const char *entry, const char *name) { | |
| const char *eq = strchr(entry, '='); | |
| size_t len = eq != NULL ? (size_t)(eq - entry) : strlen(entry); | |
| return strlen(name) == len && strncmp(entry, name, len) == 0; | |
| } | |
| static int should_keep_env(const char *entry) { | |
| static const char *const keep[] = { | |
| "PATH", "HOME", "USER", "LOGNAME", "SHELL", | |
| "TERM", "LANG", "TZ", NULL, | |
| }; | |
| if (strncmp(entry, "LC_", 3) == 0 && strchr(entry, '=') != NULL) | |
| return 1; | |
| for (size_t i = 0; keep[i] != NULL; ++i) { | |
| if (env_name_matches(entry, keep[i])) | |
| return 1; | |
| } | |
| return 0; | |
| } | |
| static char *make_env_entry(const char *name, const char *value) { | |
| size_t name_len = strlen(name); | |
| size_t value_len = strlen(value); | |
| char *entry = malloc(name_len + 1 + value_len + 1); | |
| if (entry == NULL) | |
| die("malloc env"); | |
| memcpy(entry, name, name_len); | |
| entry[name_len] = '='; | |
| memcpy(entry + name_len + 1, value, value_len + 1); | |
| return entry; | |
| } | |
| static char **build_sanitized_envp(const char *cwd) { | |
| static const char default_path[] = | |
| "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"; | |
| size_t keep_count = 0; | |
| int have_path = 0; | |
| for (char **env = environ; env != NULL && *env != NULL; ++env) { | |
| if (!should_keep_env(*env)) | |
| continue; | |
| if (env_name_matches(*env, "PWD") || env_name_matches(*env, "TMPDIR")) | |
| continue; | |
| if (env_name_matches(*env, "PATH")) | |
| have_path = 1; | |
| ++keep_count; | |
| } | |
| char **envp = calloc(keep_count + (have_path ? 0U : 1U) + 4U, sizeof(*envp)); | |
| size_t idx = 0; | |
| if (envp == NULL) | |
| die("calloc envp"); | |
| for (char **env = environ; env != NULL && *env != NULL; ++env) { | |
| if (!should_keep_env(*env)) | |
| continue; | |
| if (env_name_matches(*env, "PWD") || env_name_matches(*env, "TMPDIR")) | |
| continue; | |
| envp[idx] = strdup(*env); | |
| if (envp[idx] == NULL) | |
| die("strdup env"); | |
| ++idx; | |
| } | |
| if (!have_path) { | |
| envp[idx++] = make_env_entry("PATH", default_path); | |
| } | |
| envp[idx++] = make_env_entry("PWD", cwd); | |
| envp[idx++] = make_env_entry("TMPDIR", "/tmp"); | |
| envp[idx++] = make_env_entry("HOME", "/tmp/home"); | |
| envp[idx] = NULL; | |
| mkdir("/tmp/home", 0700); | |
| return envp; | |
| } | |
| static void copy_ifname(char dst[IFNAMSIZ], const char *src) { | |
| int rc = snprintf(dst, IFNAMSIZ, "%s", src); | |
| if (rc < 0 || rc >= IFNAMSIZ) { | |
| errno = ENAMETOOLONG; | |
| die("ifname too long"); | |
| } | |
| } | |
| /* ---------- FD passing ---------- */ | |
| static void send_fd(int sock, int fd) { | |
| struct msghdr msg; | |
| memset(&msg, 0, sizeof(msg)); | |
| char byte = 'X'; | |
| struct iovec iov = {&byte, 1}; | |
| msg.msg_iov = &iov; | |
| msg.msg_iovlen = 1; | |
| char cbuf[CMSG_SPACE(sizeof(int))]; | |
| msg.msg_control = cbuf; | |
| msg.msg_controllen = sizeof(cbuf); | |
| struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); | |
| cmsg->cmsg_level = SOL_SOCKET; | |
| cmsg->cmsg_type = SCM_RIGHTS; | |
| cmsg->cmsg_len = CMSG_LEN(sizeof(int)); | |
| memcpy(CMSG_DATA(cmsg), &fd, sizeof(int)); | |
| if (sendmsg(sock, &msg, 0) < 0) | |
| die("sendmsg"); | |
| } | |
| static int recv_fd(int sock) { | |
| struct msghdr msg; | |
| memset(&msg, 0, sizeof(msg)); | |
| char byte; | |
| struct iovec iov = {&byte, 1}; | |
| msg.msg_iov = &iov; | |
| msg.msg_iovlen = 1; | |
| char cbuf[CMSG_SPACE(sizeof(int))]; | |
| msg.msg_control = cbuf; | |
| msg.msg_controllen = sizeof(cbuf); | |
| ssize_t n = recvmsg(sock, &msg, MSG_CMSG_CLOEXEC); | |
| if (n < 0) | |
| die("recvmsg"); | |
| if (n == 0 || (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) != 0) | |
| die("recv_fd truncated"); | |
| struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); | |
| int fd; | |
| if (cmsg == NULL || cmsg->cmsg_level != SOL_SOCKET || | |
| cmsg->cmsg_type != SCM_RIGHTS || | |
| cmsg->cmsg_len < CMSG_LEN(sizeof(int))) | |
| die("recv_fd invalid ancillary data"); | |
| memcpy(&fd, CMSG_DATA(cmsg), sizeof(int)); | |
| return fd; | |
| } | |
| /* ---------- TUN and interface helpers for child process ---------- */ | |
| static int tun_create(const char *name) { | |
| int fd = open("/dev/net/tun", O_RDWR | O_CLOEXEC); | |
| if (fd < 0) | |
| die("open /dev/net/tun"); | |
| struct ifreq ifr; | |
| memset(&ifr, 0, sizeof(ifr)); | |
| ifr.ifr_flags = IFF_TUN | IFF_NO_PI; | |
| copy_ifname(ifr.ifr_name, name); | |
| if (ioctl(fd, TUNSETIFF, &ifr) < 0) | |
| die("TUNSETIFF"); | |
| return fd; | |
| } | |
| static void if_up(const char *ifname) { | |
| int s = socket(AF_INET, SOCK_DGRAM, 0); | |
| if (s < 0) | |
| die("socket"); | |
| struct ifreq ifr; | |
| memset(&ifr, 0, sizeof(ifr)); | |
| copy_ifname(ifr.ifr_name, ifname); | |
| if (ioctl(s, SIOCGIFFLAGS, &ifr) < 0) | |
| die("SIOCGIFFLAGS"); | |
| ifr.ifr_flags |= IFF_UP | IFF_RUNNING; | |
| if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0) | |
| die("SIOCSIFFLAGS"); | |
| close(s); | |
| } | |
| /* point-to-point address */ | |
| static void if_addr_ptp(const char *ifname, const char *local, | |
| const char *peer) { | |
| int s = socket(AF_INET, SOCK_DGRAM, 0); | |
| struct ifreq ifr = {0}; | |
| struct sockaddr_in addr = {.sin_family = AF_INET}; | |
| if (s < 0) | |
| die("socket"); | |
| copy_ifname(ifr.ifr_name, ifname); | |
| if (inet_pton(AF_INET, local, &addr.sin_addr) != 1) | |
| die("inet_pton local"); | |
| memcpy(&ifr.ifr_addr, &addr, sizeof(addr)); | |
| if (ioctl(s, SIOCSIFADDR, &ifr) < 0) | |
| die("SIOCSIFADDR"); | |
| if (inet_pton(AF_INET, peer, &addr.sin_addr) != 1) | |
| die("inet_pton peer"); | |
| memcpy(&ifr.ifr_dstaddr, &addr, sizeof(addr)); | |
| if (ioctl(s, SIOCSIFDSTADDR, &ifr) < 0) | |
| die("SIOCSIFDSTADDR"); | |
| close(s); | |
| } | |
| static int if_index(const char *ifname) { | |
| int s = socket(AF_INET, SOCK_DGRAM, 0); | |
| if (s < 0) | |
| die("socket"); | |
| struct ifreq ifr; | |
| memset(&ifr, 0, sizeof(ifr)); | |
| copy_ifname(ifr.ifr_name, ifname); | |
| if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) | |
| die("SIOCGIFINDEX"); | |
| close(s); | |
| return ifr.ifr_ifindex; | |
| } | |
| static void add_default_route(const char *ifname, const char *gw) { | |
| int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); | |
| struct { | |
| struct nlmsghdr nlh; | |
| struct rtmsg rtm; | |
| char buf[256]; | |
| } req = {0}; | |
| if (fd < 0) | |
| die("socket"); | |
| req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); | |
| req.nlh.nlmsg_type = RTM_NEWROUTE; | |
| req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE; | |
| req.rtm.rtm_family = AF_INET; | |
| req.rtm.rtm_table = RT_TABLE_MAIN; | |
| req.rtm.rtm_protocol = RTPROT_BOOT; | |
| req.rtm.rtm_scope = RT_SCOPE_UNIVERSE; | |
| req.rtm.rtm_type = RTN_UNICAST; | |
| struct rtattr *rta; | |
| rta = (void *)req.buf; | |
| rta->rta_type = RTA_GATEWAY; | |
| rta->rta_len = RTA_LENGTH(4); | |
| if (inet_pton(AF_INET, gw, RTA_DATA(rta)) != 1) | |
| die("inet_pton gateway"); | |
| req.nlh.nlmsg_len += rta->rta_len; | |
| rta = (void *)((char *)rta + rta->rta_len); | |
| rta->rta_type = RTA_OIF; | |
| rta->rta_len = RTA_LENGTH(4); | |
| *(int *)RTA_DATA(rta) = if_index(ifname); | |
| req.nlh.nlmsg_len += rta->rta_len; | |
| if (send(fd, &req, req.nlh.nlmsg_len, 0) != (ssize_t)req.nlh.nlmsg_len) | |
| die("send netlink route"); | |
| close(fd); | |
| } | |
| static void if_up_netlink(const char *ifname) { | |
| int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); | |
| if (fd < 0) | |
| die("socket"); | |
| struct { | |
| struct nlmsghdr nlh; | |
| struct ifinfomsg ifi; | |
| } req = {0}; | |
| req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); | |
| req.nlh.nlmsg_type = RTM_NEWLINK; | |
| req.nlh.nlmsg_flags = NLM_F_REQUEST; | |
| req.ifi.ifi_family = AF_UNSPEC; | |
| req.ifi.ifi_index = if_index(ifname); | |
| req.ifi.ifi_flags = IFF_UP | IFF_RUNNING; | |
| req.ifi.ifi_change = IFF_UP | IFF_RUNNING; | |
| if (send(fd, &req, req.nlh.nlmsg_len, 0) < 0) | |
| die("send netlink"); | |
| close(fd); | |
| } | |
| /* ----------- TCP/IP helpers ------------------ */ | |
| static uint32_t csum16_partial(const void *buf, size_t len); | |
| static uint16_t csum16_fold(uint32_t sum) { | |
| while (sum >> 16) | |
| sum = (sum & 0xffffU) + (sum >> 16); | |
| return (uint16_t)~sum; | |
| } | |
| static uint16_t csum16(const void *buf, size_t len) { | |
| return csum16_fold(csum16_partial(buf, len)); | |
| } | |
| /* ---------- Persistent UDP Flow Management ---------- */ | |
| static void udp_close_flow(struct udp_flow *f); | |
| static struct udp_flow *udp_find(uint32_t cip, uint16_t cport, uint32_t sip, | |
| uint16_t sport) { | |
| for (int i = 0; i < MAX_UDP; i++) { | |
| struct udp_flow *f = &udp_flows[i]; | |
| /* Full 4-tuple match for proper flow isolation */ | |
| if ((f->udp_relay >= 0 || f->udp_staging >= 0 || f->tcp_ctrl >= 0) && | |
| f->cli_ip == cip && | |
| f->cli_port == cport && | |
| f->srv_ip == sip && f->srv_port == sport) | |
| return f; | |
| } | |
| return NULL; | |
| } | |
| static struct udp_flow *udp_alloc(void) { | |
| /* First try to find an empty slot */ | |
| for (int i = 0; i < MAX_UDP; i++) { | |
| if (udp_flows[i].udp_relay < 0 && udp_flows[i].udp_staging < 0 && | |
| udp_flows[i].tcp_ctrl < 0) | |
| return &udp_flows[i]; | |
| } | |
| /* Otherwise evict oldest */ | |
| struct udp_flow *oldest = &udp_flows[0]; | |
| for (int i = 1; i < MAX_UDP; i++) { | |
| if (udp_flows[i].last_used < oldest->last_used) | |
| oldest = &udp_flows[i]; | |
| } | |
| if (oldest->tcp_ctrl >= 0) | |
| epoll_del(oldest->tcp_ctrl); | |
| if (oldest->tcp_ctrl >= 0) | |
| close(oldest->tcp_ctrl); | |
| if (oldest->udp_relay >= 0) { | |
| epoll_del(oldest->udp_relay); | |
| close(oldest->udp_relay); | |
| } | |
| if (oldest->udp_staging >= 0) | |
| close(oldest->udp_staging); | |
| memset(oldest, 0, sizeof(*oldest)); | |
| return oldest; | |
| } | |
| static uint32_t udp_ctrl_events(const struct udp_flow *f) { | |
| uint32_t events = EPOLLIN | EPOLLRDHUP | EPOLLERR | EPOLLHUP; | |
| if (f->socks.connect_pending || socks_has_pending_tx(&f->socks)) | |
| events |= EPOLLOUT; | |
| return events; | |
| } | |
| static void udp_update_events(struct udp_flow *f) { | |
| uint32_t events = EPOLLIN | EPOLLRDHUP | EPOLLERR | EPOLLHUP; | |
| if (f->udp_relay >= 0 && f->pending_set) | |
| events |= EPOLLOUT; | |
| epoll_mod_udp(f, events); | |
| } | |
| static int udp_queue_pending(struct udp_flow *f, const uint8_t *data, size_t len) { | |
| if (len > sizeof(f->pending_data)) | |
| return -1; | |
| if (f->pending_set) { | |
| f->dropped_backpressure++; | |
| DBG("UDP pending queue full; dropping newest datagram (drops=%lu)", | |
| f->dropped_backpressure); | |
| return 1; | |
| } | |
| memcpy(f->pending_data, data, len); | |
| f->pending_len = len; | |
| f->pending_set = 1; | |
| f->last_used = time(NULL); | |
| udp_update_events(f); | |
| return 0; | |
| } | |
| static int udp_socks_setup(struct udp_flow *f, struct socks_config *cfg) { | |
| int tcp_sock = socket(AF_INET, SOCK_STREAM, 0); | |
| int udp_sock = -1; | |
| struct sockaddr_in local = {.sin_family = AF_INET, | |
| .sin_addr.s_addr = htonl(INADDR_ANY), | |
| .sin_port = 0}; | |
| socklen_t locallen = sizeof(local); | |
| int rc; | |
| if (tcp_sock < 0) | |
| return -1; | |
| udp_sock = socket(AF_INET, SOCK_DGRAM, 0); | |
| if (udp_sock < 0) { | |
| close(tcp_sock); | |
| return -1; | |
| } | |
| if (bind(udp_sock, (struct sockaddr *)&local, sizeof(local)) < 0) { | |
| close(udp_sock); | |
| close(tcp_sock); | |
| return -1; | |
| } | |
| if (getsockname(udp_sock, (struct sockaddr *)&local, &locallen) < 0) { | |
| close(udp_sock); | |
| close(tcp_sock); | |
| return -1; | |
| } | |
| if (set_nonblocking(udp_sock) < 0) { | |
| close(udp_sock); | |
| close(tcp_sock); | |
| return -1; | |
| } | |
| socks_io_reset(&f->socks); | |
| f->socks.active = 1; | |
| f->socks.is_udp = 1; | |
| f->socks.target_ip = 0; | |
| f->socks.target_port = ntohs(local.sin_port); | |
| rc = start_nonblocking_connect(tcp_sock, &cfg->addr); | |
| if (rc < 0) { | |
| close(udp_sock); | |
| close(tcp_sock); | |
| return -1; | |
| } | |
| f->tcp_ctrl = tcp_sock; | |
| f->udp_relay = -1; | |
| f->udp_staging = udp_sock; | |
| f->socks.connect_pending = (rc > 0); | |
| f->socks.state = f->socks.connect_pending ? SOCKS_IO_CONNECTING | |
| : SOCKS_IO_METHOD; | |
| if (!f->socks.connect_pending && socks_begin_handshake(&f->socks, cfg) < 0) { | |
| close(udp_sock); | |
| close(tcp_sock); | |
| f->tcp_ctrl = -1; | |
| f->udp_staging = -1; | |
| return -1; | |
| } | |
| epoll_add_udp_ctrl(f, udp_ctrl_events(f)); | |
| return 0; | |
| } | |
| static int udp_direct_setup(struct udp_flow *f, uint32_t dst_ip, | |
| uint16_t dst_port) { | |
| int udp_sock = socket(AF_INET, SOCK_DGRAM, 0); | |
| if (udp_sock < 0) | |
| return -1; | |
| struct sockaddr_in dst = { | |
| .sin_family = AF_INET, | |
| .sin_port = htons(dst_port), | |
| .sin_addr.s_addr = dst_ip, | |
| }; | |
| if (connect(udp_sock, (struct sockaddr *)&dst, sizeof(dst)) < 0) { | |
| close(udp_sock); | |
| return -1; | |
| } | |
| if (set_nonblocking(udp_sock) < 0) { | |
| close(udp_sock); | |
| return -1; | |
| } | |
| f->tcp_ctrl = -1; | |
| f->udp_relay = udp_sock; | |
| f->relay_addr = dst; | |
| epoll_add_udp(f); | |
| udp_update_events(f); | |
| return 0; | |
| } | |
| static int udp_try_send_socks(struct udp_flow *f, const uint8_t *payload, | |
| size_t payload_len) { | |
| uint8_t pkt[65536]; | |
| struct in_addr dst_addr = {.s_addr = f->srv_ip}; | |
| if (payload_len > sizeof(pkt) - 10) | |
| return -1; | |
| pkt[0] = 0; | |
| pkt[1] = 0; | |
| pkt[2] = 0; /* RSV, FRAG */ | |
| pkt[3] = 0x01; /* ATYP = IPv4 */ | |
| memcpy(&pkt[4], &f->srv_ip, 4); | |
| pkt[8] = (uint8_t)(f->srv_port >> 8); | |
| pkt[9] = (uint8_t)(f->srv_port & 0xff); | |
| memcpy(&pkt[10], payload, payload_len); | |
| DBG("UDP via SOCKS relay %s:%d -> %s:%d len=%zu", | |
| inet_ntoa(f->relay_addr.sin_addr), ntohs(f->relay_addr.sin_port), | |
| inet_ntoa(dst_addr), f->srv_port, payload_len); | |
| for (;;) { | |
| ssize_t n = sendto(f->udp_relay, pkt, 10 + payload_len, 0, | |
| (struct sockaddr *)&f->relay_addr, | |
| sizeof(f->relay_addr)); | |
| if (n == (ssize_t)(10 + payload_len)) { | |
| f->last_used = time(NULL); | |
| return 0; | |
| } | |
| if (n >= 0) { | |
| DBG("SOCKS UDP short send: %zd/%zu, closing flow", n, 10 + payload_len); | |
| return -1; | |
| } | |
| if (errno == EINTR) | |
| continue; | |
| if (errno == EAGAIN || errno == EWOULDBLOCK) { | |
| int qrc = udp_queue_pending(f, payload, payload_len); | |
| if (qrc < 0) | |
| return -1; | |
| return 0; | |
| } | |
| if (errno == ENOBUFS || errno == ENOMEM) { | |
| f->dropped_backpressure++; | |
| DBG("SOCKS UDP send drop due to %s (drops=%lu)", strerror(errno), | |
| f->dropped_backpressure); | |
| return 0; | |
| } | |
| return -1; | |
| } | |
| } | |
| static int udp_try_send_direct(struct udp_flow *f, const uint8_t *payload, | |
| size_t payload_len) { | |
| for (;;) { | |
| ssize_t n = send(f->udp_relay, payload, payload_len, MSG_NOSIGNAL); | |
| if (n == (ssize_t)payload_len) { | |
| f->last_used = time(NULL); | |
| return 0; | |
| } | |
| if (n >= 0) { | |
| DBG("UDP short send: %zd/%zu, closing flow", n, payload_len); | |
| return -1; | |
| } | |
| if (errno == EINTR) | |
| continue; | |
| if (errno == EAGAIN || errno == EWOULDBLOCK) { | |
| int qrc = udp_queue_pending(f, payload, payload_len); | |
| if (qrc < 0) | |
| return -1; | |
| return 0; | |
| } | |
| if (errno == ENOBUFS || errno == ENOMEM) { | |
| f->dropped_backpressure++; | |
| DBG("UDP send drop due to %s (drops=%lu)", strerror(errno), | |
| f->dropped_backpressure); | |
| return 0; | |
| } | |
| return -1; | |
| } | |
| } | |
| static int udp_open_relay_socket(struct udp_flow *f, | |
| const struct sockaddr_in *relay_addr) { | |
| int udp_sock = f->udp_staging; | |
| if (udp_sock < 0) | |
| return -1; | |
| f->udp_relay = udp_sock; | |
| f->udp_staging = -1; | |
| f->relay_addr = *relay_addr; | |
| epoll_add_udp(f); | |
| udp_update_events(f); | |
| return 0; | |
| } | |
| static int udp_flush_pending(struct udp_flow *f) { | |
| if (f->udp_relay < 0 || !f->pending_set) | |
| return 0; | |
| size_t len = f->pending_len; | |
| uint8_t payload[65535]; | |
| int rc; | |
| memcpy(payload, f->pending_data, len); | |
| f->pending_set = 0; | |
| f->pending_len = 0; | |
| DBG("Flushing pending UDP datagram len=%zu", len); | |
| if (f->tcp_ctrl >= 0) | |
| rc = udp_try_send_socks(f, payload, len); | |
| else | |
| rc = udp_try_send_direct(f, payload, len); | |
| if (rc < 0) | |
| return -1; | |
| udp_update_events(f); | |
| return 0; | |
| } | |
| /* Inject UDP packet into TUN (response from server to client) */ | |
| static void udp_inject_tun(int tunfd, struct udp_flow *f, const uint8_t *data, | |
| size_t len) { | |
| uint8_t out[65536]; | |
| if (len > sizeof(out) - sizeof(struct iphdr) - sizeof(struct udphdr)) | |
| return; | |
| struct iphdr *ip = (struct iphdr *)out; | |
| struct udphdr *udp = (struct udphdr *)(out + sizeof(*ip)); | |
| memset(ip, 0, sizeof(*ip)); | |
| ip->version = 4; | |
| ip->ihl = 5; | |
| ip->ttl = 64; | |
| ip->protocol = IPPROTO_UDP; | |
| ip->saddr = f->srv_ip; | |
| ip->daddr = f->cli_ip; | |
| ip->tot_len = htons((uint16_t)(sizeof(*ip) + sizeof(*udp) + len)); | |
| ip->check = csum16(ip, sizeof(*ip)); | |
| udp->source = htons(f->srv_port); | |
| udp->dest = htons(f->cli_port); | |
| udp->len = htons((uint16_t)(sizeof(*udp) + len)); | |
| udp->check = 0; | |
| memcpy(out + sizeof(*ip) + sizeof(*udp), data, len); | |
| IGNORE_RESULT(tun_write_packet(tunfd, out, sizeof(*ip) + sizeof(*udp) + len, | |
| "UDP inject")); | |
| } | |
| static void handle_udp(int tunfd, uint8_t *pkt, ssize_t len) { | |
| (void)tunfd; | |
| if (len <= 0) | |
| return; | |
| size_t ulen = (size_t)len; | |
| struct iphdr *ip = (struct iphdr *)pkt; | |
| size_t iphl = ip->ihl * 4; | |
| if (ip->version != 4) | |
| return; | |
| if (iphl < sizeof(struct iphdr) || iphl > 60 || iphl > ulen) | |
| return; | |
| size_t ip_total_len = (size_t)ntohs(ip->tot_len); | |
| if (ip_total_len < iphl || ip_total_len > ulen) | |
| return; | |
| /* Reject IP fragments */ | |
| if (ntohs(ip->frag_off) & (IP_MF | IP_OFFMASK)) { | |
| DBG("UDP: dropping IP fragment (frag_off=0x%04x)", ntohs(ip->frag_off)); | |
| return; | |
| } | |
| if (ulen < iphl + sizeof(struct udphdr)) | |
| return; | |
| struct udphdr *udp = (struct udphdr *)(pkt + iphl); | |
| size_t udp_len = (size_t)ntohs(udp->len); | |
| if (udp_len < sizeof(struct udphdr) || udp_len > ip_total_len - iphl) | |
| return; | |
| uint16_t dport = ntohs(udp->dest); | |
| uint16_t sport = ntohs(udp->source); | |
| uint8_t *payload = pkt + iphl + sizeof(struct udphdr); | |
| size_t plen = udp_len - sizeof(struct udphdr); | |
| /* Check for host gateway access (10.0.1.x -> 127.0.0.x) */ | |
| uint32_t target_ip = ip->daddr; | |
| int is_gateway = is_gateway_ip(target_ip); | |
| if (is_gateway) { | |
| if (!is_gateway_allowed(target_ip, dport, IPPROTO_UDP)) { | |
| DBG("[parent] UDP to 10.0.1.%d:%d blocked", gateway_last_octet(target_ip), | |
| dport); | |
| return; | |
| } | |
| target_ip = gateway_to_localhost(ip->daddr); | |
| DBG("[parent] UDP gateway: 10.0.1.%d:%d -> 127.0.0.%d:%d", | |
| gateway_last_octet(ip->daddr), dport, gateway_last_octet(ip->daddr), | |
| dport); | |
| } | |
| if (socks_proxy.enabled && !is_gateway) { | |
| /* Find or create persistent UDP flow */ | |
| struct udp_flow *f = udp_find(ip->saddr, sport, ip->daddr, dport); | |
| if (!f) { | |
| f = udp_alloc(); | |
| memset(f, 0, sizeof(*f)); | |
| f->cli_ip = ip->saddr; | |
| f->cli_port = sport; | |
| f->srv_ip = ip->daddr; | |
| f->srv_port = dport; | |
| f->tcp_ctrl = -1; | |
| f->udp_relay = -1; | |
| f->udp_staging = -1; | |
| f->pending_set = 0; | |
| f->pending_len = 0; | |
| socks_io_reset(&f->socks); | |
| if (udp_socks_setup(f, &socks_proxy) < 0) { | |
| f->udp_relay = -1; | |
| return; | |
| } | |
| } | |
| /* Update source port for response routing (may differ on reused flow) */ | |
| f->cli_port = sport; | |
| if (f->udp_relay >= 0) { | |
| if (udp_try_send_socks(f, payload, plen) < 0) { | |
| udp_close_flow(f); | |
| return; | |
| } | |
| udp_update_events(f); | |
| } else { | |
| int qrc = udp_queue_pending(f, payload, plen); | |
| if (qrc < 0) { | |
| udp_close_flow(f); | |
| return; | |
| } | |
| } | |
| } else { | |
| /* Direct UDP now uses persistent non-blocking flows too. */ | |
| struct udp_flow *f = udp_find(ip->saddr, sport, ip->daddr, dport); | |
| if (!f) { | |
| f = udp_alloc(); | |
| memset(f, 0, sizeof(*f)); | |
| f->cli_ip = ip->saddr; | |
| f->cli_port = sport; | |
| f->srv_ip = ip->daddr; | |
| f->srv_port = dport; | |
| f->tcp_ctrl = -1; | |
| f->udp_relay = -1; | |
| f->udp_staging = -1; | |
| if (udp_direct_setup(f, target_ip, dport) < 0) { | |
| f->udp_relay = -1; | |
| return; | |
| } | |
| } | |
| f->cli_port = sport; | |
| if (udp_try_send_direct(f, payload, plen) < 0) { | |
| udp_close_flow(f); | |
| return; | |
| } | |
| udp_update_events(f); | |
| DBG("UDP queued %zu bytes to %s:%d", plen, | |
| is_gateway ? "127.0.0.1" : "remote", dport); | |
| } | |
| } | |
| /* Handle ICMP echo request (ping) to gateway - responds directly */ | |
| static void handle_icmp(int tunfd, uint8_t *pkt, ssize_t len) { | |
| if (len <= 0) | |
| return; | |
| size_t ulen = (size_t)len; | |
| struct iphdr *ip = (struct iphdr *)pkt; | |
| size_t iphl = ip->ihl * 4; | |
| if (ip->version != 4) | |
| return; | |
| if (iphl < sizeof(struct iphdr) || iphl > 60 || iphl > ulen) | |
| return; | |
| size_t ip_total_len = (size_t)ntohs(ip->tot_len); | |
| if (ip_total_len < iphl || ip_total_len > ulen) | |
| return; | |
| /* Reject IP fragments */ | |
| if (ntohs(ip->frag_off) & (IP_MF | IP_OFFMASK)) { | |
| DBG("ICMP: dropping IP fragment (frag_off=0x%04x)", ntohs(ip->frag_off)); | |
| return; | |
| } | |
| if (ip_total_len < iphl + 8) /* ICMP header is 8 bytes minimum */ | |
| return; | |
| /* Only respond to ping on 10.0.0.1 (always allowed) */ | |
| if (ip->daddr != HOST_PING_IP) | |
| return; | |
| uint8_t *icmp = pkt + iphl; | |
| uint8_t type = icmp[0]; | |
| /* Only respond to echo request (type 8) */ | |
| if (type != 8) | |
| return; | |
| DBG("ICMP echo request to gateway - sending reply"); | |
| /* Build echo reply */ | |
| uint8_t out[65536]; | |
| size_t icmp_len = ip_total_len - iphl; | |
| struct iphdr *rip = (struct iphdr *)out; | |
| memset(rip, 0, sizeof(*rip)); | |
| rip->version = 4; | |
| rip->ihl = 5; | |
| rip->ttl = 64; | |
| rip->protocol = IPPROTO_ICMP; | |
| rip->saddr = ip->daddr; /* Gateway IP */ | |
| rip->daddr = ip->saddr; /* Client IP */ | |
| rip->tot_len = htons((uint16_t)(sizeof(*rip) + icmp_len)); | |
| rip->check = csum16(rip, sizeof(*rip)); | |
| /* Copy ICMP data and change type to echo reply (0) */ | |
| memcpy(out + sizeof(*rip), icmp, icmp_len); | |
| out[sizeof(*rip)] = 0; /* Type = echo reply */ | |
| /* Recalculate ICMP checksum */ | |
| uint8_t *ricmp = out + sizeof(*rip); | |
| ricmp[2] = 0; | |
| ricmp[3] = 0; | |
| uint16_t icmp_csum = csum16(ricmp, icmp_len); | |
| ricmp[2] = (uint8_t)(icmp_csum & 0xff); | |
| ricmp[3] = (uint8_t)(icmp_csum >> 8); | |
| IGNORE_RESULT( | |
| tun_write_packet(tunfd, out, sizeof(*rip) + icmp_len, "ICMP reply")); | |
| } | |
| static struct tcp_flow *tcp_find(uint32_t cip, uint16_t cport, uint32_t sip, | |
| uint16_t sport) { | |
| for (int i = 0; i < MAX_TCP; i++) { | |
| struct tcp_flow *f = &tcp_flows[i]; | |
| if (f->sock >= 0 && f->cli_ip == cip && f->cli_port == cport && | |
| f->srv_ip == sip && f->srv_port == sport) | |
| return f; | |
| } | |
| return NULL; | |
| } | |
| static struct tcp_flow *tcp_alloc(void) { | |
| for (int i = 0; i < MAX_TCP; i++) { | |
| if (tcp_flows[i].sock < 0) | |
| return &tcp_flows[i]; | |
| } | |
| struct tcp_flow *oldest = NULL; | |
| for (int i = 0; i < MAX_TCP; i++) { | |
| if (tcp_flows[i].state == SP_TCP_ESTABLISHED) | |
| continue; | |
| if (oldest == NULL || tcp_flows[i].last_active < oldest->last_active) | |
| oldest = &tcp_flows[i]; | |
| } | |
| if (oldest == NULL) | |
| return NULL; | |
| if (oldest->sock >= 0) { | |
| if (g_epfd >= 0) | |
| epoll_ctl(g_epfd, EPOLL_CTL_DEL, oldest->sock, NULL); | |
| close(oldest->sock); | |
| } | |
| oldest->sock = -1; | |
| oldest->state = SP_TCP_CLOSED; | |
| oldest->pending_write_len = 0; | |
| oldest->pending_fin = 0; | |
| oldest->backend_ready = 0; | |
| memset(&oldest->socks, 0, sizeof(oldest->socks)); | |
| return oldest; | |
| } | |
| static uint32_t csum16_partial(const void *buf, size_t len) { | |
| const uint8_t *p = buf; | |
| uint32_t sum = 0; | |
| while (len > 1) { | |
| /* Little-endian integer construction from bytes */ | |
| sum += (uint32_t)p[0] | ((uint32_t)p[1] << 8); | |
| p += 2; | |
| len -= 2; | |
| } | |
| if (len) | |
| sum += (uint32_t)p[0]; | |
| return sum; | |
| } | |
| static uint32_t checksum_add_bytes(const void *buf, size_t len) { | |
| const uint8_t *p = buf; | |
| uint32_t sum = 0; | |
| while (len > 1) { | |
| sum += ((uint32_t)p[0] << 8) | (uint32_t)p[1]; | |
| p += 2; | |
| len -= 2; | |
| } | |
| if (len > 0) | |
| sum += (uint32_t)p[0] << 8; | |
| return sum; | |
| } | |
| static uint32_t checksum_add_ipv4_pseudo(uint32_t saddr, uint32_t daddr, | |
| uint8_t proto, uint16_t len) { | |
| uint8_t pseudo[12]; | |
| pseudo[0] = (uint8_t)((ntohl(saddr) >> 24) & 0xff); | |
| pseudo[1] = (uint8_t)((ntohl(saddr) >> 16) & 0xff); | |
| pseudo[2] = (uint8_t)((ntohl(saddr) >> 8) & 0xff); | |
| pseudo[3] = (uint8_t)(ntohl(saddr) & 0xff); | |
| pseudo[4] = (uint8_t)((ntohl(daddr) >> 24) & 0xff); | |
| pseudo[5] = (uint8_t)((ntohl(daddr) >> 16) & 0xff); | |
| pseudo[6] = (uint8_t)((ntohl(daddr) >> 8) & 0xff); | |
| pseudo[7] = (uint8_t)(ntohl(daddr) & 0xff); | |
| pseudo[8] = 0; | |
| pseudo[9] = proto; | |
| pseudo[10] = (uint8_t)(len >> 8); | |
| pseudo[11] = (uint8_t)(len & 0xff); | |
| return checksum_add_bytes(pseudo, sizeof(pseudo)); | |
| } | |
| static uint16_t checksum_finish(uint32_t sum) { | |
| while (sum >> 16) | |
| sum = (sum & 0xffffU) + (sum >> 16); | |
| return (uint16_t)~sum; | |
| } | |
| static int tcp_checksum_valid(const struct iphdr *ip, const struct tcphdr *tcp, | |
| size_t tcp_len) { | |
| uint32_t sum; | |
| if (tcp_len < sizeof(struct tcphdr)) | |
| return 0; | |
| sum = checksum_add_ipv4_pseudo(ip->saddr, ip->daddr, IPPROTO_TCP, | |
| (uint16_t)tcp_len); | |
| sum += checksum_add_bytes(tcp, tcp_len); | |
| return checksum_finish(sum) == 0; | |
| } | |
| static int udp_checksum_valid(const struct iphdr *ip, const struct udphdr *udp, | |
| size_t udp_len) { | |
| uint32_t sum; | |
| if (udp->check == 0) | |
| return 1; | |
| if (udp_len < sizeof(struct udphdr)) | |
| return 0; | |
| sum = checksum_add_ipv4_pseudo(ip->saddr, ip->daddr, IPPROTO_UDP, | |
| (uint16_t)udp_len); | |
| sum += checksum_add_bytes(udp, udp_len); | |
| return checksum_finish(sum) == 0; | |
| } | |
| static int icmp_checksum_valid(const uint8_t *icmp, size_t icmp_len) { | |
| uint32_t sum; | |
| if (icmp_len == 0) | |
| return 0; | |
| sum = checksum_add_bytes(icmp, icmp_len); | |
| return checksum_finish(sum) == 0; | |
| } | |
| static uint16_t tcp_checksum(struct iphdr *ip, struct tcphdr *tcp, | |
| size_t tcp_len, const uint8_t *payload, | |
| size_t plen) { | |
| struct { | |
| uint32_t src; | |
| uint32_t dst; | |
| uint8_t zero; | |
| uint8_t proto; | |
| uint16_t len; | |
| } __attribute__((packed)) pseudo; | |
| memset(&pseudo, 0, sizeof(pseudo)); | |
| pseudo.src = ip->saddr; | |
| pseudo.dst = ip->daddr; | |
| pseudo.zero = 0; | |
| pseudo.proto = IPPROTO_TCP; | |
| pseudo.len = htons((uint16_t)(tcp_len + plen)); | |
| uint32_t sum = 0; | |
| uint32_t p1 = csum16_partial(&pseudo, sizeof(pseudo)); | |
| uint32_t p2 = csum16_partial(tcp, tcp_len); | |
| uint32_t p3 = plen ? csum16_partial(payload, plen) : 0; | |
| sum = p1 + p2 + p3; | |
| while (sum >> 16) | |
| sum = (sum & 0xffff) + (sum >> 16); | |
| return (uint16_t)~sum; | |
| } | |
| static uint16_t ip_checksum(const void *buf, size_t len) { | |
| const uint16_t *p = buf; | |
| uint32_t sum = 0; | |
| while (len > 1) { | |
| sum += *p++; | |
| len -= 2; | |
| } | |
| if (len) | |
| sum += *(const uint8_t *)p; | |
| while (sum >> 16) | |
| sum = (sum & 0xffff) + (sum >> 16); | |
| return (uint16_t)~sum; | |
| } | |
| static uint16_t tcp_advertised_window(const struct tcp_flow *f) { | |
| size_t free_bytes = 0; | |
| if (f->pending_write_len < TCP_PENDING_WRITE_CAP) | |
| free_bytes = TCP_PENDING_WRITE_CAP - f->pending_write_len; | |
| if (free_bytes > 65535) | |
| free_bytes = 65535; | |
| return (uint16_t)free_bytes; | |
| } | |
| /* Send a TCP packet from server to client */ | |
| static void send_tcp_packet(int tunfd, struct tcp_flow *f, uint8_t flags, | |
| const uint8_t *payload, size_t plen) { | |
| uint8_t out[65536]; | |
| struct iphdr *ip = (struct iphdr *)out; | |
| struct tcphdr *tcp = (struct tcphdr *)(out + sizeof(*ip)); | |
| size_t tcp_hdr_len = sizeof(*tcp); | |
| size_t total_len = sizeof(*ip) + tcp_hdr_len + plen; | |
| memset(ip, 0, sizeof(*ip)); | |
| ip->version = 4; | |
| ip->ihl = 5; | |
| ip->ttl = 64; | |
| ip->protocol = IPPROTO_TCP; | |
| ip->saddr = f->srv_ip; | |
| ip->daddr = f->cli_ip; | |
| ip->tot_len = htons((uint16_t)total_len); | |
| memset(tcp, 0, sizeof(*tcp)); | |
| tcp->source = htons(f->srv_port); | |
| tcp->dest = htons(f->cli_port); | |
| tcp->seq = htonl(f->srv_next); | |
| tcp->ack_seq = htonl(f->cli_next); | |
| tcp->doff = (tcp_hdr_len / 4) & 0xF; | |
| tcp->ack = 1; | |
| if (flags & 0x08) | |
| tcp->psh = 1; /* PSH */ | |
| if (flags & 0x01) | |
| tcp->fin = 1; /* FIN */ | |
| tcp->window = htons(tcp_advertised_window(f)); | |
| if (plen > 0) | |
| memcpy(out + sizeof(*ip) + tcp_hdr_len, payload, plen); | |
| tcp->check = tcp_checksum(ip, tcp, tcp_hdr_len, payload, plen); | |
| ip->check = ip_checksum(ip, sizeof(*ip)); | |
| IGNORE_RESULT(tun_write_packet(tunfd, out, total_len, "TCP packet")); | |
| /* Update sequence number for data sent */ | |
| if (plen > 0) | |
| f->srv_next += (uint32_t)plen; | |
| if ((flags & 0x01) != 0) | |
| f->srv_next++; | |
| } | |
| /* ---------- TCP Option Parsing ---------- */ | |
| struct tcp_options { | |
| uint16_t mss; | |
| uint8_t wscale; | |
| uint32_t tsval; | |
| uint32_t tsecr; | |
| int ts_present; | |
| int sack_permitted; | |
| }; | |
| /* Parse TCP options from the options portion of TCP header. | |
| Returns 0 on success, -1 if options are malformed. */ | |
| static int parse_tcp_options(const uint8_t *opts, size_t len, | |
| struct tcp_options *out) { | |
| memset(out, 0, sizeof(*out)); | |
| out->mss = 536; /* Default MSS per RFC 879 */ | |
| size_t i = 0; | |
| while (i < len) { | |
| uint8_t kind = opts[i]; | |
| if (kind == 0) /* End of option list */ | |
| break; | |
| if (kind == 1) { /* NOP */ | |
| i++; | |
| continue; | |
| } | |
| /* All other options have length field */ | |
| if (i + 1 >= len) | |
| return -1; | |
| uint8_t optlen = opts[i + 1]; | |
| if (optlen < 2 || i + optlen > len) | |
| return -1; | |
| switch (kind) { | |
| case 2: /* MSS */ | |
| if (optlen == 4) { | |
| out->mss = (uint16_t)((opts[i + 2] << 8) | opts[i + 3]); | |
| } | |
| break; | |
| case 3: /* Window Scale */ | |
| if (optlen == 3) { | |
| out->wscale = opts[i + 2]; | |
| } | |
| break; | |
| case 4: /* SACK Permitted */ | |
| if (optlen == 2) { | |
| out->sack_permitted = 1; | |
| } | |
| break; | |
| case 8: /* Timestamp */ | |
| if (optlen == 10) { | |
| out->ts_present = 1; | |
| out->tsval = (uint32_t)((opts[i + 2] << 24) | (opts[i + 3] << 16) | | |
| (opts[i + 4] << 8) | opts[i + 5]); | |
| out->tsecr = (uint32_t)((opts[i + 6] << 24) | (opts[i + 7] << 16) | | |
| (opts[i + 8] << 8) | opts[i + 9]); | |
| } | |
| break; | |
| } | |
| i += optlen; | |
| } | |
| return 0; | |
| } | |
| /* Build TCP options for SYN-ACK response. | |
| Returns the number of bytes written to buf. */ | |
| static size_t build_synack_options(const struct tcp_options *client_opts, | |
| uint8_t *buf, uint32_t our_tsval) { | |
| size_t off = 0; | |
| /* MSS option (kind=2, len=4) - always include */ | |
| buf[off++] = 2; | |
| buf[off++] = 4; | |
| buf[off++] = 0x05; /* MSS = 1460 */ | |
| buf[off++] = 0xb4; | |
| /* Window scale option if client requested (kind=3, len=3) */ | |
| if (client_opts->wscale > 0) { | |
| buf[off++] = 1; /* NOP for alignment */ | |
| buf[off++] = 3; /* kind = Window Scale */ | |
| buf[off++] = 3; /* length = 3 */ | |
| buf[off++] = 0; /* our shift count = 0 (advertise x1 scaling) */ | |
| } | |
| /* Timestamp option if client requested (kind=8, len=10) */ | |
| if (client_opts->ts_present) { | |
| buf[off++] = 8; | |
| buf[off++] = 10; | |
| buf[off++] = (uint8_t)(our_tsval >> 24); | |
| buf[off++] = (uint8_t)(our_tsval >> 16); | |
| buf[off++] = (uint8_t)(our_tsval >> 8); | |
| buf[off++] = (uint8_t)(our_tsval); | |
| buf[off++] = (uint8_t)(client_opts->tsval >> 24); | |
| buf[off++] = (uint8_t)(client_opts->tsval >> 16); | |
| buf[off++] = (uint8_t)(client_opts->tsval >> 8); | |
| buf[off++] = (uint8_t)(client_opts->tsval); | |
| } | |
| /* Pad to 4-byte boundary with NOPs */ | |
| while (off % 4 != 0) | |
| buf[off++] = 1; /* NOP */ | |
| return off; | |
| } | |
| /* Send a TCP RST packet */ | |
| static void send_tcp_rst(int tunfd, uint32_t saddr, uint32_t daddr, | |
| uint16_t sport, uint16_t dport, uint32_t seq, | |
| uint32_t ack_seq) { | |
| uint8_t out[64]; | |
| struct iphdr *ip = (struct iphdr *)out; | |
| struct tcphdr *tcp = (struct tcphdr *)(out + sizeof(*ip)); | |
| memset(out, 0, sizeof(out)); | |
| ip->version = 4; | |
| ip->ihl = 5; | |
| ip->ttl = 64; | |
| ip->protocol = IPPROTO_TCP; | |
| ip->saddr = saddr; | |
| ip->daddr = daddr; | |
| ip->tot_len = htons(sizeof(*ip) + sizeof(*tcp)); | |
| tcp->source = htons(sport); | |
| tcp->dest = htons(dport); | |
| tcp->seq = htonl(seq); | |
| tcp->ack_seq = htonl(ack_seq); | |
| tcp->doff = 5; | |
| tcp->rst = 1; | |
| tcp->ack = 1; | |
| tcp->window = 0; | |
| tcp->check = tcp_checksum(ip, tcp, sizeof(*tcp), NULL, 0); | |
| ip->check = ip_checksum(ip, sizeof(*ip)); | |
| IGNORE_RESULT( | |
| tun_write_packet(tunfd, out, sizeof(*ip) + sizeof(*tcp), "TCP RST")); | |
| } | |
| /* Send RST for a given flow and clean it up */ | |
| static void tcp_flow_rst(int tunfd, struct tcp_flow *f) { | |
| if (f->sock >= 0) { | |
| epoll_del(f->sock); | |
| close(f->sock); | |
| } | |
| f->pending_write_off = 0; | |
| f->pending_write_len = 0; | |
| f->pending_fin = 0; | |
| f->pending_fin_seq = 0; | |
| f->backend_ready = 0; | |
| socks_io_reset(&f->socks); | |
| send_tcp_rst(tunfd, f->srv_ip, f->cli_ip, f->srv_port, f->cli_port, | |
| f->srv_next, f->cli_next); | |
| f->sock = -1; | |
| f->state = SP_TCP_CLOSED; | |
| } | |
| static void tcp_update_events(struct tcp_flow *f) { | |
| uint32_t events = EPOLLIN | EPOLLRDHUP | EPOLLERR | EPOLLHUP; | |
| if (!f->backend_ready || f->pending_write_len > 0 || | |
| socks_has_pending_tx(&f->socks)) | |
| events |= EPOLLOUT; | |
| epoll_mod_tcp(f, events); | |
| } | |
| static int tcp_finish_pending_fin(int tunfd, struct tcp_flow *f) { | |
| if (!f->backend_ready || f->pending_write_len > 0 || !f->pending_fin || | |
| f->pending_fin_seq != f->cli_next) | |
| return 0; | |
| f->pending_fin = 0; | |
| f->pending_fin_seq = 0; | |
| f->cli_next++; | |
| send_tcp_packet(tunfd, f, 0, NULL, 0); | |
| if (shutdown(f->sock, SHUT_WR) < 0 && errno != ENOTCONN && errno != EPIPE) { | |
| tcp_flow_rst(tunfd, f); | |
| return -1; | |
| } | |
| f->state = SP_TCP_CLOSE_WAIT; | |
| f->last_active = time(NULL); | |
| return 1; | |
| } | |
| static int tcp_queue_pending_write(struct tcp_flow *f, const uint8_t *data, | |
| size_t len) { | |
| if (f->pending_write_off != 0 && f->pending_write_len > 0) { | |
| memmove(f->pending_write, f->pending_write + f->pending_write_off, | |
| f->pending_write_len); | |
| f->pending_write_off = 0; | |
| } | |
| if (f->pending_write_len + len > sizeof(f->pending_write)) | |
| return -1; | |
| memcpy(f->pending_write + f->pending_write_len, data, len); | |
| f->pending_write_len += len; | |
| return 0; | |
| } | |
| static int tcp_flush_pending_write(int tunfd, struct tcp_flow *f) { | |
| size_t total_sent = 0; | |
| while (f->pending_write_len > 0) { | |
| ssize_t sent = send(f->sock, f->pending_write + f->pending_write_off, | |
| f->pending_write_len, MSG_NOSIGNAL); | |
| if (sent > 0) { | |
| f->pending_write_off += (size_t)sent; | |
| f->pending_write_len -= (size_t)sent; | |
| total_sent += (size_t)sent; | |
| continue; | |
| } | |
| if (sent == 0) { | |
| tcp_flow_rst(tunfd, f); | |
| return -1; | |
| } | |
| if (sent < 0 && errno == EINTR) | |
| continue; | |
| if (sent < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) | |
| break; | |
| tcp_flow_rst(tunfd, f); | |
| return -1; | |
| } | |
| if (total_sent > 0) { | |
| f->cli_next += (uint32_t)total_sent; | |
| if (tcp_finish_pending_fin(tunfd, f) < 0) | |
| return -1; | |
| if (f->sock >= 0 && f->pending_fin == 0) | |
| send_tcp_packet(tunfd, f, 0, NULL, 0); | |
| } | |
| if (f->pending_write_len == 0) | |
| f->pending_write_off = 0; | |
| if (f->sock >= 0) | |
| tcp_update_events(f); | |
| return 0; | |
| } | |
| static void handle_tcp(int tunfd, uint8_t *pkt, ssize_t len) { | |
| if (len <= 0) | |
| return; | |
| size_t ulen = (size_t)len; | |
| struct iphdr *ip = (struct iphdr *)pkt; | |
| size_t iphl = ip->ihl * 4; | |
| if (ip->version != 4) | |
| return; | |
| if (iphl < sizeof(struct iphdr) || iphl > 60 || iphl > ulen) | |
| return; | |
| size_t ip_total_len = (size_t)ntohs(ip->tot_len); | |
| if (ip_total_len < iphl || ip_total_len > ulen) | |
| return; | |
| /* Reject IP fragments */ | |
| if (ntohs(ip->frag_off) & (IP_MF | IP_OFFMASK)) { | |
| DBG("TCP: dropping IP fragment (frag_off=0x%04x)", ntohs(ip->frag_off)); | |
| return; | |
| } | |
| if (ip_total_len < iphl + sizeof(struct tcphdr)) | |
| return; | |
| struct tcphdr *tcp = (struct tcphdr *)(pkt + iphl); | |
| size_t tcp_hdr_len = (size_t)tcp->doff * 4; | |
| if (tcp_hdr_len < sizeof(struct tcphdr) || tcp_hdr_len > 60 || | |
| ip_total_len < iphl + tcp_hdr_len) | |
| return; | |
| uint32_t cip = ip->saddr; | |
| uint32_t sip = ip->daddr; | |
| uint16_t cport = ntohs(tcp->source); | |
| uint16_t sport = ntohs(tcp->dest); | |
| /* ---------- RST ---------- */ | |
| if (tcp->rst) { | |
| struct tcp_flow *f = tcp_find(cip, cport, sip, sport); | |
| if (f && f->sock >= 0) { | |
| epoll_del(f->sock); | |
| close(f->sock); | |
| f->sock = -1; | |
| f->state = SP_TCP_CLOSED; | |
| } | |
| return; | |
| } | |
| /* ---------- SYN ---------- */ | |
| if (tcp->syn && !tcp->ack) { | |
| struct tcp_flow *f = tcp_find(cip, cport, sip, sport); | |
| /* Parse TCP options from SYN */ | |
| struct tcp_options cli_opts; | |
| if (tcp_hdr_len > sizeof(struct tcphdr)) { | |
| const uint8_t *opt_start = (const uint8_t *)tcp + sizeof(struct tcphdr); | |
| size_t opt_len = tcp_hdr_len - sizeof(struct tcphdr); | |
| if (parse_tcp_options(opt_start, opt_len, &cli_opts) < 0) | |
| return; | |
| } else { | |
| memset(&cli_opts, 0, sizeof(cli_opts)); | |
| } | |
| if (!f) { | |
| /* Rate limit new connections */ | |
| if (!check_rate_limit()) { | |
| return; /* Too many connections - drop SYN silently */ | |
| } | |
| /* first SYN */ | |
| f = tcp_alloc(); | |
| if (!f) { | |
| send_tcp_rst(tunfd, sip, cip, sport, cport, 0, ntohl(tcp->seq) + 1U); | |
| return; | |
| } | |
| memset(f, 0, sizeof(*f)); | |
| f->cli_ip = cip; | |
| f->cli_port = cport; | |
| f->srv_ip = sip; | |
| f->srv_port = sport; | |
| f->cli_isn = ntohl(tcp->seq); | |
| f->cli_next = f->cli_isn + 1; | |
| uint32_t isn; | |
| if (getrandom(&isn, sizeof(isn), 0) != (ssize_t)sizeof(isn)) | |
| die("getrandom"); | |
| f->srv_isn = isn; | |
| f->srv_next = f->srv_isn + 1; | |
| /* Store timestamp negotiation state */ | |
| f->ts_ok = cli_opts.ts_present; | |
| if (cli_opts.ts_present) { | |
| f->ts_recent = cli_opts.tsval; | |
| } | |
| int s; | |
| int connect_rc; | |
| struct sockaddr_in dst; | |
| int use_gateway = | |
| is_gateway_ip(sip) && is_gateway_allowed(sip, sport, IPPROTO_TCP); | |
| if (use_gateway) { | |
| /* Gateway access - connect to localhost (10.0.1.x -> 127.0.0.x) */ | |
| uint32_t local_ip = gateway_to_localhost(sip); | |
| DBG("[parent] TCP gateway: 10.0.1.%d:%d -> 127.0.0.%d:%d", | |
| gateway_last_octet(sip), sport, gateway_last_octet(sip), sport); | |
| s = socket(AF_INET, SOCK_STREAM, 0); | |
| if (s < 0) | |
| return; | |
| memset(&dst, 0, sizeof(dst)); | |
| dst.sin_family = AF_INET; | |
| dst.sin_port = htons(sport); | |
| dst.sin_addr.s_addr = local_ip; | |
| } else if (socks_proxy.enabled) { | |
| /* Connect via SOCKS5 proxy */ | |
| s = socket(AF_INET, SOCK_STREAM, 0); | |
| if (s < 0) | |
| return; | |
| dst = socks_proxy.addr; | |
| } else { | |
| /* Direct connection */ | |
| s = socket(AF_INET, SOCK_STREAM, 0); | |
| if (s < 0) | |
| return; | |
| memset(&dst, 0, sizeof(dst)); | |
| dst.sin_family = AF_INET; | |
| dst.sin_port = htons(sport); | |
| dst.sin_addr.s_addr = sip; | |
| } | |
| connect_rc = start_nonblocking_connect(s, &dst); | |
| if (connect_rc < 0) { | |
| close(s); | |
| f->sock = -1; | |
| send_tcp_rst(tunfd, sip, cip, sport, cport, 0, f->cli_next); | |
| return; | |
| } | |
| f->sock = s; | |
| f->backend_ready = 0; | |
| socks_io_reset(&f->socks); | |
| if (socks_proxy.enabled && !use_gateway) { | |
| f->socks.active = 1; | |
| f->socks.is_udp = 0; | |
| f->socks.target_ip = sip; | |
| f->socks.target_port = sport; | |
| f->socks.connect_pending = (connect_rc > 0); | |
| f->socks.state = f->socks.connect_pending ? SOCKS_IO_CONNECTING | |
| : SOCKS_IO_METHOD; | |
| if (!f->socks.connect_pending && | |
| socks_begin_handshake(&f->socks, &socks_proxy) < 0) { | |
| close(s); | |
| f->sock = -1; | |
| send_tcp_rst(tunfd, sip, cip, sport, cport, 0, f->cli_next); | |
| return; | |
| } | |
| } else if (connect_rc == 0) { | |
| f->backend_ready = 1; | |
| } | |
| f->state = SP_TCP_SYN_RECEIVED; | |
| f->last_active = time(NULL); | |
| epoll_add_tcp(f); | |
| tcp_update_events(f); | |
| } | |
| /* Build SYN-ACK with mirrored options */ | |
| uint8_t out[128]; | |
| struct iphdr *rip = (struct iphdr *)out; | |
| struct tcphdr *rtcp = (struct tcphdr *)(out + sizeof(*rip)); | |
| /* Build TCP options mirroring client's capabilities */ | |
| uint8_t opts[24]; | |
| uint32_t our_tsval = (uint32_t)time(NULL); | |
| size_t opts_len = build_synack_options(&cli_opts, opts, our_tsval); | |
| memset(rip, 0, sizeof(*rip)); | |
| rip->version = 4; | |
| rip->ihl = 5; | |
| rip->ttl = 64; | |
| rip->protocol = IPPROTO_TCP; | |
| rip->saddr = sip; | |
| rip->daddr = cip; | |
| memset(rtcp, 0, sizeof(*rtcp)); | |
| rtcp->source = htons(sport); | |
| rtcp->dest = htons(cport); | |
| rtcp->seq = htonl(f->srv_isn); | |
| rtcp->ack_seq = htonl(f->cli_next); | |
| rtcp->syn = 1; | |
| rtcp->ack = 1; | |
| size_t full_tcp_len = sizeof(struct tcphdr) + opts_len; | |
| rtcp->doff = (full_tcp_len / 4) & 0xF; | |
| rtcp->window = htons(tcp_advertised_window(f)); | |
| memcpy((uint8_t *)rtcp + sizeof(*rtcp), opts, opts_len); | |
| rip->tot_len = htons((uint16_t)(sizeof(*rip) + full_tcp_len)); | |
| rtcp->check = tcp_checksum(rip, rtcp, full_tcp_len, NULL, 0); | |
| rip->check = ip_checksum(rip, sizeof(*rip)); | |
| IGNORE_RESULT(tun_write_packet(tunfd, out, sizeof(*rip) + full_tcp_len, | |
| "TCP SYN-ACK")); | |
| return; | |
| } | |
| /* ---------- ACK / DATA ---------- */ | |
| if (tcp->ack && !tcp->syn) { | |
| struct tcp_flow *f = tcp_find(cip, cport, sip, sport); | |
| if (!f || f->sock < 0) | |
| return; | |
| uint32_t seq = ntohl(tcp->seq); | |
| /* Calculate payload */ | |
| size_t payload_off = iphl + tcp_hdr_len; | |
| size_t payload_len = 0; | |
| if (ip_total_len > payload_off) | |
| payload_len = ip_total_len - payload_off; | |
| if (f->state == SP_TCP_SYN_RECEIVED) { | |
| if (ntohl(tcp->ack_seq) != f->srv_next) { | |
| DBG("TCP: dropping invalid handshake ACK (%u != %u)", | |
| ntohl(tcp->ack_seq), f->srv_next); | |
| return; | |
| } | |
| f->state = SP_TCP_ESTABLISHED; | |
| tcp_update_events(f); | |
| } | |
| /* Update activity time */ | |
| f->last_active = time(NULL); | |
| uint32_t pending_end = f->cli_next + (uint32_t)f->pending_write_len; | |
| if (!f->backend_ready || f->pending_write_len > 0) { | |
| uint8_t *payload = pkt + payload_off; | |
| size_t append_off = 0; | |
| size_t append_len = 0; | |
| if (payload_len > 0 && seq <= pending_end && | |
| seq + payload_len > pending_end) { | |
| append_off = (size_t)(pending_end - seq); | |
| append_len = payload_len - append_off; | |
| } else if (payload_len > 0 && seq == pending_end) { | |
| append_len = payload_len; | |
| } | |
| if (append_len > 0 && | |
| tcp_queue_pending_write(f, payload + append_off, append_len) < 0) { | |
| tcp_flow_rst(tunfd, f); | |
| return; | |
| } | |
| if (tcp->fin) { | |
| f->pending_fin = 1; | |
| f->pending_fin_seq = seq + (uint32_t)payload_len; | |
| } | |
| if (f->backend_ready && (payload_len > 0 || tcp->fin)) | |
| send_tcp_packet(tunfd, f, 0, NULL, 0); | |
| return; | |
| } | |
| /* Forward payload to real server and ACK only bytes the backend accepted. */ | |
| if (payload_len > 0) { | |
| uint8_t *payload = pkt + payload_off; | |
| size_t payload_off_trim = 0; | |
| if (seq < f->cli_next) { | |
| if (seq + payload_len <= f->cli_next) { | |
| send_tcp_packet(tunfd, f, 0, NULL, 0); | |
| return; | |
| } | |
| payload_off_trim = (size_t)(f->cli_next - seq); | |
| seq = f->cli_next; | |
| payload += payload_off_trim; | |
| payload_len -= payload_off_trim; | |
| } | |
| if (seq != f->cli_next) { | |
| send_tcp_packet(tunfd, f, 0, NULL, 0); | |
| return; | |
| } | |
| size_t total_sent = 0; | |
| while (total_sent < payload_len) { | |
| ssize_t sent = send(f->sock, payload + total_sent, | |
| payload_len - total_sent, MSG_NOSIGNAL); | |
| if (sent == 0) { | |
| tcp_flow_rst(tunfd, f); | |
| return; | |
| } | |
| if (sent < 0) { | |
| if (errno == EINTR) | |
| continue; | |
| if (errno == EAGAIN || errno == EWOULDBLOCK) | |
| break; | |
| /* Connection error - send RST and clean up */ | |
| tcp_flow_rst(tunfd, f); | |
| return; | |
| } | |
| total_sent += (size_t)sent; | |
| } | |
| if (total_sent > 0) { | |
| f->cli_next += (uint32_t)total_sent; | |
| /* Send ACK back to client */ | |
| send_tcp_packet(tunfd, f, 0, NULL, 0); | |
| } | |
| if (total_sent < payload_len) { | |
| if (tcp_queue_pending_write(f, payload + total_sent, | |
| payload_len - total_sent) < 0) { | |
| tcp_flow_rst(tunfd, f); | |
| return; | |
| } | |
| tcp_update_events(f); | |
| } | |
| } | |
| /* Handle FIN from client */ | |
| if (tcp->fin) { | |
| if (f->pending_write_len > 0 || seq != f->cli_next) { | |
| f->pending_fin = 1; | |
| f->pending_fin_seq = seq + (uint32_t)payload_len; | |
| send_tcp_packet(tunfd, f, 0, NULL, 0); | |
| return; | |
| } | |
| f->cli_next++; | |
| /* ACK the client FIN but keep the backend open for reads. */ | |
| send_tcp_packet(tunfd, f, 0, NULL, 0); | |
| if (shutdown(f->sock, SHUT_WR) < 0 && errno != ENOTCONN && | |
| errno != EPIPE) { | |
| tcp_flow_rst(tunfd, f); | |
| return; | |
| } | |
| f->state = SP_TCP_CLOSE_WAIT; | |
| f->last_active = time(NULL); | |
| } | |
| return; | |
| } | |
| } | |
| static void dispatch_tun_ipv4_packet(int tunfd, uint8_t *pkt, size_t len) { | |
| struct iphdr *ip; | |
| size_t iphl; | |
| size_t ip_total_len; | |
| if (len < sizeof(struct iphdr)) | |
| return; | |
| ip = (struct iphdr *)pkt; | |
| if (ip->version != 4) | |
| return; | |
| iphl = (size_t)ip->ihl * 4; | |
| if (iphl < sizeof(struct iphdr) || iphl > 60 || iphl > len) { | |
| DBG("Dropping packet: bad IHL=%zu", iphl); | |
| return; | |
| } | |
| if (ip_checksum(ip, iphl) != 0) { | |
| DBG("Dropping packet: bad IP checksum"); | |
| return; | |
| } | |
| ip_total_len = (size_t)ntohs(ip->tot_len); | |
| if (ip_total_len < iphl || ip_total_len > len) | |
| return; | |
| if ((ntohs(ip->frag_off) & (IP_MF | IP_OFFMASK)) != 0) { | |
| DBG("Dropping packet: dropping IP fragment (frag_off=0x%04x)", | |
| ntohs(ip->frag_off)); | |
| return; | |
| } | |
| if (ip->protocol == IPPROTO_TCP) { | |
| if (ip_total_len < iphl + sizeof(struct tcphdr)) | |
| return; | |
| const struct tcphdr *tcp = (const struct tcphdr *)(pkt + iphl); | |
| size_t tcp_len = ip_total_len - iphl; | |
| size_t tcp_hdr_len = (size_t)tcp->doff * 4; | |
| if (tcp_hdr_len < sizeof(struct tcphdr) || tcp_hdr_len > tcp_len) | |
| return; | |
| if (!tcp_checksum_valid(ip, tcp, tcp_len)) { | |
| DBG("Dropping packet: bad TCP checksum"); | |
| return; | |
| } | |
| handle_tcp(tunfd, pkt, (ssize_t)ip_total_len); | |
| return; | |
| } | |
| if (ip->protocol == IPPROTO_UDP) { | |
| if (ip_total_len < iphl + sizeof(struct udphdr)) | |
| return; | |
| const struct udphdr *udp = (const struct udphdr *)(pkt + iphl); | |
| size_t udp_len = (size_t)ntohs(udp->len); | |
| if (udp_len < sizeof(struct udphdr) || udp_len > ip_total_len - iphl) | |
| return; | |
| if (!udp_checksum_valid(ip, udp, udp_len)) { | |
| DBG("Dropping packet: bad UDP checksum"); | |
| return; | |
| } | |
| handle_udp(tunfd, pkt, (ssize_t)ip_total_len); | |
| return; | |
| } | |
| if (ip->protocol == IPPROTO_ICMP) { | |
| const uint8_t *icmp = pkt + iphl; | |
| size_t icmp_len = ip_total_len - iphl; | |
| if (!icmp_checksum_valid(icmp, icmp_len)) { | |
| DBG("Dropping packet: bad ICMP checksum"); | |
| return; | |
| } | |
| handle_icmp(tunfd, pkt, (ssize_t)ip_total_len); | |
| } | |
| } | |
| static void udp_close_flow(struct udp_flow *f) { | |
| if (f->udp_relay >= 0) { | |
| epoll_del(f->udp_relay); | |
| close(f->udp_relay); | |
| f->udp_relay = -1; | |
| } | |
| if (f->udp_staging >= 0) { | |
| close(f->udp_staging); | |
| f->udp_staging = -1; | |
| } | |
| if (f->tcp_ctrl >= 0) { | |
| epoll_del(f->tcp_ctrl); | |
| close(f->tcp_ctrl); | |
| f->tcp_ctrl = -1; | |
| } | |
| f->pending_set = 0; | |
| f->pending_len = 0; | |
| socks_io_reset(&f->socks); | |
| } | |
| static int interactive_maybe_sync_winsize(struct interactive_session *session) { | |
| if (!session || !session->active) | |
| return 0; | |
| if (!interactive_resize_pending) | |
| return 0; | |
| interactive_resize_pending = 0; | |
| return interactive_sync_winsize(session); | |
| } | |
| static int event_loop(int tunfd, pid_t pid, int stdout_fd, int stderr_fd, | |
| struct interactive_session *session) { | |
| struct epoll_event events[MAX_EVENTS]; | |
| int child_status = -1; | |
| int interactive_pty_active = 0; | |
| time_t child_exited_at = 0; | |
| /* Register TUN fd (static wrapper on stack - never removed) */ | |
| struct epoll_wrapper tun_ew = {.type = FD_TUN, .fd = tunfd, .flow = NULL}; | |
| struct epoll_event tun_ev = {.events = EPOLLIN, .data.ptr = &tun_ew}; | |
| if (epoll_ctl(g_epfd, EPOLL_CTL_ADD, tunfd, &tun_ev) < 0) | |
| die("epoll_ctl TUN"); | |
| int pipes_active = (stdout_fd >= 0 ? 1 : 0) + (stderr_fd >= 0 ? 1 : 0); | |
| struct epoll_wrapper stdout_ew = {.type = FD_STDOUT_RELAY, .fd = stdout_fd, .flow = NULL}; | |
| struct epoll_wrapper stderr_ew = {.type = FD_STDERR_RELAY, .fd = stderr_fd, .flow = NULL}; | |
| if (stdout_fd >= 0) { | |
| struct epoll_event ev = {.events = EPOLLIN | EPOLLRDHUP, .data.ptr = &stdout_ew}; | |
| epoll_ctl(g_epfd, EPOLL_CTL_ADD, stdout_fd, &ev); | |
| } | |
| if (stderr_fd >= 0) { | |
| struct epoll_event ev = {.events = EPOLLIN | EPOLLRDHUP, .data.ptr = &stderr_ew}; | |
| epoll_ctl(g_epfd, EPOLL_CTL_ADD, stderr_fd, &ev); | |
| } | |
| struct epoll_wrapper interactive_tty_ew = { | |
| .type = FD_INTERACTIVE_TTY, .fd = -1, .flow = NULL}; | |
| struct epoll_wrapper interactive_pty_ew = { | |
| .type = FD_INTERACTIVE_PTY, .fd = -1, .flow = NULL}; | |
| if (session && session->active) { | |
| struct epoll_event tty_ev = { | |
| .events = EPOLLIN | EPOLLRDHUP | EPOLLHUP | EPOLLERR, | |
| .data.ptr = &interactive_tty_ew}; | |
| struct epoll_event pty_ev = { | |
| .events = EPOLLIN | EPOLLRDHUP | EPOLLHUP | EPOLLERR, | |
| .data.ptr = &interactive_pty_ew}; | |
| interactive_tty_ew.fd = session->host_tty_fd; | |
| interactive_pty_ew.fd = session->pty_master_fd; | |
| if (epoll_ctl(g_epfd, EPOLL_CTL_ADD, session->host_tty_fd, &tty_ev) < 0) | |
| die("epoll_ctl interactive tty"); | |
| if (epoll_ctl(g_epfd, EPOLL_CTL_ADD, session->pty_master_fd, &pty_ev) < 0) | |
| die("epoll_ctl interactive pty"); | |
| interactive_pty_active = 1; | |
| } | |
| for (;;) { | |
| if (interactive_maybe_sync_winsize(session) < 0) | |
| break; | |
| /* Check child status */ | |
| int status; | |
| if (child_status == -1 && waitpid(pid, &status, WNOHANG) > 0) { | |
| if (WIFEXITED(status) || WIFSIGNALED(status)) { | |
| child_status = status; | |
| child_exited_at = time(NULL); | |
| if (pipes_active <= 0 && interactive_pty_active <= 0) | |
| break; | |
| } | |
| } | |
| if (child_status != -1 && interactive_pty_active > 0 && | |
| child_exited_at != 0 && session && session->pty_master_fd >= 0 && | |
| (time(NULL) - child_exited_at) >= 1) { | |
| epoll_ctl(g_epfd, EPOLL_CTL_DEL, session->pty_master_fd, NULL); | |
| close(session->pty_master_fd); | |
| session->pty_master_fd = -1; | |
| interactive_pty_active = 0; | |
| if (pipes_active <= 0) | |
| break; | |
| } | |
| int n = epoll_wait(g_epfd, events, MAX_EVENTS, EPOLL_TIMEOUT_MS); | |
| if (n < 0) { | |
| if (errno == EINTR) | |
| continue; | |
| break; | |
| } | |
| if (interactive_maybe_sync_winsize(session) < 0) | |
| break; | |
| for (int i = 0; i < n; i++) { | |
| struct epoll_wrapper *ew = events[i].data.ptr; | |
| if (!ew) | |
| continue; | |
| switch (ew->type) { | |
| case FD_STDOUT_RELAY: | |
| case FD_STDERR_RELAY: { | |
| ssize_t r = read(ew->fd, g_io_buf, sizeof(g_io_buf)); | |
| if (r > 0) { | |
| int out_fd = (ew->type == FD_STDOUT_RELAY) ? STDOUT_FILENO : STDERR_FILENO; | |
| ssize_t w = write(out_fd, g_io_buf, (size_t)r); | |
| (void)w; | |
| } else if (r == 0 || (r < 0 && errno != EAGAIN && errno != EWOULDBLOCK) || (events[i].events & (EPOLLHUP | EPOLLRDHUP))) { | |
| epoll_ctl(g_epfd, EPOLL_CTL_DEL, ew->fd, NULL); | |
| close(ew->fd); | |
| ew->fd = -1; | |
| pipes_active--; | |
| if (child_status != -1 && pipes_active <= 0) { | |
| goto loop_end; | |
| } | |
| } | |
| break; | |
| } | |
| case FD_INTERACTIVE_TTY: | |
| case FD_INTERACTIVE_PTY: { | |
| int dst_fd = -1; | |
| ssize_t r; | |
| if (!session || !session->active || ew->fd < 0) | |
| break; | |
| r = read(ew->fd, g_io_buf, sizeof(g_io_buf)); | |
| if (r > 0) { | |
| dst_fd = (ew->type == FD_INTERACTIVE_TTY) ? session->pty_master_fd | |
| : session->host_tty_fd; | |
| if (write_all(dst_fd, g_io_buf, (size_t)r) < 0 && | |
| errno != EPIPE && errno != EIO) | |
| goto loop_end; | |
| } else if (r == 0 || | |
| (r < 0 && errno != EAGAIN && errno != EWOULDBLOCK && | |
| errno != EINTR) || | |
| (events[i].events & (EPOLLHUP | EPOLLRDHUP | EPOLLERR))) { | |
| epoll_ctl(g_epfd, EPOLL_CTL_DEL, ew->fd, NULL); | |
| if (ew->type == FD_INTERACTIVE_TTY) { | |
| /* Keep the tty fd open so terminal restore still has a target. */ | |
| } else { | |
| close(ew->fd); | |
| session->pty_master_fd = -1; | |
| interactive_pty_active = 0; | |
| } | |
| ew->fd = -1; | |
| if (child_status != -1 && interactive_pty_active <= 0 && | |
| pipes_active <= 0) { | |
| goto loop_end; | |
| } | |
| } | |
| break; | |
| } | |
| case FD_TUN: { | |
| /* Handle TUN packets (outgoing from child) */ | |
| ssize_t r = read(tunfd, g_io_buf, sizeof(g_io_buf)); | |
| if (r > 0) | |
| dispatch_tun_ipv4_packet(tunfd, g_io_buf, (size_t)r); | |
| break; | |
| } | |
| case FD_TCP: { | |
| /* Handle TCP server socket responses */ | |
| struct tcp_flow *f = ew->flow; | |
| if (!f || f->sock < 0) | |
| break; | |
| if (!f->backend_ready) { | |
| if ((events[i].events & (EPOLLOUT | EPOLLERR | EPOLLHUP | | |
| EPOLLRDHUP)) && | |
| ((!f->socks.active) || f->socks.connect_pending)) { | |
| if (socket_connect_complete(f->sock) < 0) { | |
| tcp_flow_rst(tunfd, f); | |
| break; | |
| } | |
| if (f->socks.active) { | |
| f->socks.connect_pending = 0; | |
| if (socks_begin_handshake(&f->socks, &socks_proxy) < 0) { | |
| tcp_flow_rst(tunfd, f); | |
| break; | |
| } | |
| } else { | |
| f->backend_ready = 1; | |
| } | |
| tcp_update_events(f); | |
| } | |
| if (f->sock < 0) | |
| break; | |
| if (f->socks.active && !f->backend_ready) { | |
| if ((events[i].events & EPOLLOUT) && socks_has_pending_tx(&f->socks)) { | |
| if (socks_flush_tx(f->sock, &f->socks) < 0) { | |
| tcp_flow_rst(tunfd, f); | |
| break; | |
| } | |
| tcp_update_events(f); | |
| } | |
| if ((events[i].events & EPOLLIN) != 0) { | |
| int rc = socks_recv_and_process(f->sock, &f->socks, &socks_proxy, | |
| NULL); | |
| if (rc < 0) { | |
| tcp_flow_rst(tunfd, f); | |
| break; | |
| } | |
| if (rc > 0) { | |
| f->backend_ready = 1; | |
| f->last_active = time(NULL); | |
| if (f->pending_write_len > 0 && | |
| tcp_flush_pending_write(tunfd, f) < 0) | |
| break; | |
| if (f->sock >= 0 && tcp_finish_pending_fin(tunfd, f) < 0) | |
| break; | |
| if (f->sock >= 0) | |
| tcp_update_events(f); | |
| } else if (f->sock >= 0) { | |
| tcp_update_events(f); | |
| } | |
| } | |
| if (!f->backend_ready) | |
| break; | |
| } | |
| } | |
| if (f->sock >= 0 && (events[i].events & EPOLLOUT) && | |
| f->pending_write_len > 0) { | |
| if (tcp_flush_pending_write(tunfd, f) < 0) | |
| break; | |
| } | |
| if (f->sock >= 0 && tcp_finish_pending_fin(tunfd, f) < 0) | |
| break; | |
| if (f->sock >= 0 && | |
| (f->state == SP_TCP_ESTABLISHED || | |
| f->state == SP_TCP_CLOSE_WAIT) && (events[i].events & EPOLLIN)) { | |
| ssize_t r = recv(f->sock, g_io_buf, sizeof(g_io_buf) - 64, 0); | |
| if (r > 0) { | |
| /* Forward data to client */ | |
| send_tcp_packet(tunfd, f, 0x08, g_io_buf, (size_t)r); | |
| f->last_active = time(NULL); | |
| } else if (r == 0) { | |
| if (f->pending_write_len > 0) { | |
| tcp_flow_rst(tunfd, f); | |
| break; | |
| } | |
| /* Server closed connection - send FIN to client */ | |
| send_tcp_packet(tunfd, f, 0x01, NULL, 0); | |
| epoll_del(f->sock); | |
| close(f->sock); | |
| f->sock = -1; | |
| f->state = SP_TCP_CLOSED; | |
| } else if (errno != EAGAIN && errno != EWOULDBLOCK && | |
| errno != EINTR) { | |
| tcp_flow_rst(tunfd, f); | |
| } | |
| } | |
| break; | |
| } | |
| case FD_UDP_CTRL: { | |
| struct udp_flow *f = ew->flow; | |
| if (!f || f->tcp_ctrl < 0) | |
| break; | |
| if ((events[i].events & (EPOLLERR | EPOLLHUP | EPOLLRDHUP)) && | |
| !f->socks.connect_pending && f->udp_relay >= 0) { | |
| udp_close_flow(f); | |
| break; | |
| } | |
| if (f->socks.connect_pending && | |
| (events[i].events & (EPOLLOUT | EPOLLERR | EPOLLHUP | | |
| EPOLLRDHUP))) { | |
| if (socket_connect_complete(f->tcp_ctrl) < 0) { | |
| udp_close_flow(f); | |
| break; | |
| } | |
| f->socks.connect_pending = 0; | |
| if (socks_begin_handshake(&f->socks, &socks_proxy) < 0) { | |
| udp_close_flow(f); | |
| break; | |
| } | |
| epoll_mod_udp_ctrl(f, udp_ctrl_events(f)); | |
| } | |
| if (f->tcp_ctrl < 0) | |
| break; | |
| if ((events[i].events & EPOLLOUT) && socks_has_pending_tx(&f->socks)) { | |
| if (socks_flush_tx(f->tcp_ctrl, &f->socks) < 0) { | |
| udp_close_flow(f); | |
| break; | |
| } | |
| epoll_mod_udp_ctrl(f, udp_ctrl_events(f)); | |
| } | |
| if ((events[i].events & EPOLLIN) != 0) { | |
| struct sockaddr_in relay_addr; | |
| int rc = socks_recv_and_process(f->tcp_ctrl, &f->socks, &socks_proxy, | |
| &relay_addr); | |
| if (rc < 0) { | |
| udp_close_flow(f); | |
| break; | |
| } | |
| if (rc > 0) { | |
| if (udp_open_relay_socket(f, &relay_addr) < 0) { | |
| udp_close_flow(f); | |
| break; | |
| } | |
| epoll_mod_udp_ctrl(f, EPOLLIN | EPOLLRDHUP | EPOLLERR | EPOLLHUP); | |
| f->last_used = time(NULL); | |
| if (udp_flush_pending(f) < 0) { | |
| udp_close_flow(f); | |
| break; | |
| } | |
| udp_update_events(f); | |
| } else { | |
| epoll_mod_udp_ctrl(f, udp_ctrl_events(f)); | |
| } | |
| } | |
| break; | |
| } | |
| case FD_UDP_RELAY: { | |
| /* Handle UDP relay responses (incoming from SOCKS proxy) */ | |
| struct udp_flow *f = ew->flow; | |
| if (f && f->udp_relay >= 0) { | |
| if ((events[i].events & EPOLLOUT) && f->pending_set) { | |
| if (udp_flush_pending(f) < 0) { | |
| udp_close_flow(f); | |
| break; | |
| } | |
| } | |
| struct sockaddr_in from; | |
| socklen_t fromlen = sizeof(from); | |
| ssize_t r = recvfrom(f->udp_relay, g_io_buf, sizeof(g_io_buf), 0, | |
| (struct sockaddr *)&from, &fromlen); | |
| /* Direct and SOCKS relay traffic must return from the exact source | |
| * address and port the broker bound to for the flow. */ | |
| if (r > 0 && | |
| (from.sin_addr.s_addr != f->relay_addr.sin_addr.s_addr || | |
| from.sin_port != f->relay_addr.sin_port)) { | |
| DBG("[parent] UDP source mismatch: got %s:%d, expected %s:%d", | |
| inet_ntoa(from.sin_addr), ntohs(from.sin_port), | |
| inet_ntoa(f->relay_addr.sin_addr), | |
| ntohs(f->relay_addr.sin_port)); | |
| break; /* Packet from unexpected source */ | |
| } | |
| if (r > 0 && f->tcp_ctrl < 0) { | |
| udp_inject_tun(tunfd, f, g_io_buf, (size_t)r); | |
| f->last_used = time(NULL); | |
| } else if (r > 10) { | |
| DBG("UDP relay received %zd bytes from %s:%d", r, | |
| inet_ntoa(from.sin_addr), ntohs(from.sin_port)); | |
| /* Validate FRAG field (byte 2) - we don't support fragmentation */ | |
| if (g_io_buf[2] != 0) | |
| break; | |
| /* Strip SOCKS5 UDP header */ | |
| size_t hdr_len = 10; | |
| if (g_io_buf[3] == 0x03) | |
| hdr_len = 4 + 1 + g_io_buf[4] + 2; | |
| else if (g_io_buf[3] == 0x04) | |
| hdr_len = 4 + 16 + 2; | |
| if ((size_t)r > hdr_len) { | |
| udp_inject_tun(tunfd, f, g_io_buf + hdr_len, (size_t)r - hdr_len); | |
| f->last_used = time(NULL); | |
| } | |
| } else if (r == 0) { | |
| /* Relay closed */ | |
| udp_close_flow(f); | |
| break; | |
| } | |
| udp_update_events(f); | |
| } | |
| break; | |
| } | |
| } | |
| } | |
| /* Cleanup stale TCP flows */ | |
| time_t now = time(NULL); | |
| for (int i = 0; i < MAX_TCP; i++) { | |
| struct tcp_flow *f = &tcp_flows[i]; | |
| if (f->sock < 0) continue; | |
| int timeout = TCP_IDLE_TIMEOUT_SEC; | |
| if (f->state == SP_TCP_SYN_RECEIVED) timeout = TCP_HALF_OPEN_TIMEOUT_SEC; | |
| else if (f->state != SP_TCP_ESTABLISHED && f->state != SP_TCP_CLOSE_WAIT) | |
| timeout = 10; /* Quick cleanup for FIN_WAIT / CLOSING / LAST_ACK / TIME_WAIT */ | |
| if ((now - f->last_active) > timeout) { | |
| if (f->state == SP_TCP_ESTABLISHED || f->state == SP_TCP_CLOSE_WAIT) { | |
| send_tcp_packet(tunfd, f, 0x01, NULL, 0); /* FIN */ | |
| f->state = SP_TCP_FIN_WAIT_1; | |
| f->last_active = now; | |
| shutdown(f->sock, SHUT_WR); | |
| tcp_update_events(f); | |
| } else { | |
| /* Force close for other states or if already closing and timed out */ | |
| epoll_del(f->sock); | |
| close(f->sock); | |
| f->sock = -1; | |
| f->state = SP_TCP_CLOSED; | |
| } | |
| } | |
| } | |
| /* Cleanup stale UDP flows (idle for >30 seconds) */ | |
| for (int i = 0; i < MAX_UDP; i++) { | |
| if ((udp_flows[i].udp_relay >= 0 || udp_flows[i].tcp_ctrl >= 0) && | |
| (now - udp_flows[i].last_used) > 30) { | |
| udp_close_flow(&udp_flows[i]); | |
| } | |
| } | |
| loop_end: | |
| if (child_status != -1 && pipes_active <= 0 && interactive_pty_active <= 0) | |
| break; | |
| } | |
| return child_status; | |
| } | |
| /* ---------- main ---------- */ | |
| int main(int argc, char **argv) { | |
| /* Pre-scan verbose and bootstrap into a delegated scope before side effects. */ | |
| int cmd_start = 1; | |
| pre_scan_verbose_flag(argc, argv); | |
| maybe_reexec_under_systemd_scope(argc, argv); | |
| struct interactive_session interactive_session = {0}; | |
| interactive_session.host_tty_fd = -1; | |
| interactive_session.pty_master_fd = -1; | |
| interactive_session.pty_slave_fd = -1; | |
| for (int i = 1; i < argc; i++) { | |
| if (strcmp(argv[i], "--socks") == 0 && i + 1 < argc) { | |
| parse_socks_url(argv[i + 1], &socks_proxy); | |
| if (socks_proxy.enabled) { | |
| fprintf(stderr, "Using SOCKS5 proxy: %s:%d%s\n", socks_proxy.host, | |
| socks_proxy.port, | |
| socks_proxy.username[0] ? " (with auth)" : ""); | |
| } | |
| i++; /* skip next arg (the proxy URL) */ | |
| cmd_start = i + 1; | |
| } else if (strcmp(argv[i], "--unsafe-share-cwd") == 0) { | |
| unsafe_share_cwd = 1; | |
| cmd_start = i + 1; | |
| } else if (strcmp(argv[i], "--interactive") == 0) { | |
| interactive_stdio = 1; | |
| cmd_start = i + 1; | |
| } else if (strncmp(argv[i], "--socks-auth-file=", 18) == 0) { | |
| parse_socks_auth_file(argv[i] + 18); | |
| cmd_start = i + 1; | |
| } else if (strcmp(argv[i], "--verbose") == 0 || | |
| strcmp(argv[i], "-v") == 0) { | |
| verbose = 1; | |
| cmd_start = i + 1; | |
| } else if (strncmp(argv[i], "--allow-host=", 13) == 0) { | |
| const char *spec = argv[i] + 13; | |
| if (host_rule_count < MAX_HOST_RULES) { | |
| struct host_rule *r = &host_rules[host_rule_count]; | |
| memset(r, 0, sizeof(*r)); | |
| /* Make a mutable copy for parsing */ | |
| char buf[128]; | |
| strncpy(buf, spec, sizeof(buf) - 1); | |
| buf[sizeof(buf) - 1] = '\0'; | |
| /* Parse protocol suffix /tcp or /udp */ | |
| char *slash = strchr(buf, '/'); | |
| if (slash) { | |
| *slash = '\0'; | |
| if (strcmp(slash + 1, "tcp") == 0) | |
| r->proto = IPPROTO_TCP; | |
| else if (strcmp(slash + 1, "udp") == 0) | |
| r->proto = IPPROTO_UDP; | |
| else { | |
| fprintf(stderr, "Invalid protocol: %s (use /tcp or /udp)\n", | |
| slash + 1); | |
| cmd_start = i + 1; | |
| continue; | |
| } | |
| } | |
| /* Parse 127.0.0.X:PORT format */ | |
| if (strncmp(buf, "127.0.0.", 8) == 0) { | |
| char *colon = strchr(buf + 8, ':'); | |
| if (colon) { | |
| long last_octet; | |
| long port_val; | |
| *colon = '\0'; | |
| const char *port_str = colon + 1; | |
| if (parse_long_strict(buf + 8, 0, 255, &last_octet) < 0) { | |
| fprintf(stderr, "Invalid IP: %s (must be 127.0.0.X)\n", spec); | |
| cmd_start = i + 1; | |
| continue; | |
| } | |
| r->last_octet = (uint8_t)last_octet; | |
| if (strcmp(port_str, "*") == 0) { | |
| fprintf(stderr, "Wildcard port not allowed\\n"); | |
| cmd_start = i + 1; | |
| continue; | |
| } else { | |
| if (parse_long_strict(port_str, 1, 65535, &port_val) < 0) { | |
| fprintf(stderr, "Invalid port: %s\\n", port_str); | |
| cmd_start = i + 1; | |
| continue; | |
| } | |
| r->port = (uint16_t)port_val; | |
| } | |
| host_rule_count++; | |
| const char *proto_str = r->proto == IPPROTO_TCP ? "/tcp" | |
| : r->proto == IPPROTO_UDP ? "/udp" | |
| : ""; | |
| DBG("Host gateway: 127.0.0.%d:%d%s", r->last_octet, r->port, proto_str); | |
| } else { | |
| fprintf(stderr, "Invalid format: %s (expected 127.0.0.X:PORT)\n", | |
| spec); | |
| } | |
| } else { | |
| fprintf(stderr, "Invalid IP: %s (must be 127.0.0.X)\n", spec); | |
| } | |
| } | |
| cmd_start = i + 1; | |
| } else { | |
| /* First non-flag argument is the command */ | |
| cmd_start = i; | |
| break; | |
| } | |
| } | |
| if (cmd_start >= argc) { | |
| fprintf(stderr, "usage: %s [OPTIONS] <cmd> [args...]\n\n", argv[0]); | |
| fprintf(stderr, "Options:\n"); | |
| fprintf(stderr, " --socks <proxy> SOCKS5 proxy\n"); | |
| fprintf(stderr, " --socks-auth-file= File with proxy credentials\n"); | |
| fprintf(stderr, " --unsafe-share-cwd Allow unsafe sandbox source paths\n"); | |
| fprintf(stderr, " --interactive Attach child to a private PTY\n"); | |
| fprintf(stderr, " -v, --verbose Print debug info\n"); | |
| fprintf(stderr, | |
| "\nHost gateway (child accesses 10.0.1.x -> host 127.0.0.x):\n"); | |
| fprintf(stderr, " --allow-host=127.0.0.X:PORT/PROTO Allow IP:PORT\n"); | |
| fprintf(stderr, "\nExamples:\n"); | |
| fprintf(stderr, " %s --allow-host=127.0.0.1:8080/tcp curl 10.0.1.1:8080\n", argv[0]); | |
| return 1; | |
| } | |
| if (socks_proxy.enabled && resolve_socks_proxy(&socks_proxy) < 0) { | |
| fprintf(stderr, "Failed to resolve SOCKS proxy %s:%d\n", socks_proxy.host, | |
| socks_proxy.port); | |
| return 1; | |
| } | |
| /* Shift argv to command */ | |
| argv = &argv[cmd_start]; | |
| argc -= cmd_start; | |
| int sp[2], ctl[2], sync[2]; | |
| if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, sp) < 0) | |
| die("socketpair sp"); | |
| if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ctl) < 0) | |
| die("socketpair ctl"); | |
| if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, sync) < 0) | |
| die("socketpair sync"); | |
| char overlay_base[] = "/tmp/.sockpuppet-overlay-XXXXXX"; | |
| if (!mkdtemp(overlay_base)) | |
| die("mkdtemp overlay base"); | |
| uid_t uid = getuid(); | |
| gid_t gid = getgid(); | |
| for (int i = 0; i < MAX_TCP; i++) | |
| tcp_flows[i].sock = -1; | |
| for (int i = 0; i < MAX_UDP; i++) { | |
| udp_flows[i].udp_relay = -1; | |
| udp_flows[i].udp_staging = -1; | |
| udp_flows[i].tcp_ctrl = -1; | |
| } | |
| /* Create epoll instance for event loop */ | |
| g_epfd = epoll_create1(EPOLL_CLOEXEC); | |
| if (g_epfd < 0) | |
| die("epoll_create1"); | |
| if (interactive_stdio) { | |
| struct sigaction sa; | |
| if (interactive_parent_setup(&interactive_session) < 0) { | |
| fprintf(stderr, "Interactive mode requires a usable parent tty and PTY support (%s)\n", | |
| strerror(errno)); | |
| interactive_close_session(&interactive_session); | |
| return 1; | |
| } | |
| memset(&sa, 0, sizeof(sa)); | |
| sigemptyset(&sa.sa_mask); | |
| sa.sa_handler = interactive_handle_sigwinch; | |
| if (sigaction(SIGWINCH, &sa, NULL) < 0) { | |
| interactive_close_session(&interactive_session); | |
| die("sigaction SIGWINCH"); | |
| } | |
| } | |
| int stdout_pipe[2] = {-1, -1}; | |
| int stderr_pipe[2] = {-1, -1}; | |
| if (!interactive_stdio) { | |
| if (pipe(stdout_pipe) < 0 || pipe(stderr_pipe) < 0) die("pipe"); | |
| fcntl(stdout_pipe[0], F_SETFL, fcntl(stdout_pipe[0], F_GETFL) | O_NONBLOCK); | |
| fcntl(stderr_pipe[0], F_SETFL, fcntl(stderr_pipe[0], F_GETFL) | O_NONBLOCK); | |
| } | |
| apply_parent_rlimits(); | |
| cgroup_setup_containment(); | |
| pid_t pid = fork(); | |
| if (pid < 0) { | |
| interactive_close_session(&interactive_session); | |
| die("fork"); | |
| } | |
| if (pid == 0) { | |
| /* ---------- child ---------- */ | |
| close(sp[0]); | |
| close(ctl[0]); | |
| close(sync[0]); | |
| close(g_epfd); | |
| g_epfd = -1; | |
| if (interactive_session.host_tty_fd >= 0) | |
| close(interactive_session.host_tty_fd); | |
| if (interactive_session.pty_master_fd >= 0) | |
| close(interactive_session.pty_master_fd); | |
| if (unshare(CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS | CLONE_NEWIPC | | |
| CLONE_NEWUTS | CLONE_NEWPID) < 0) | |
| die("unshare"); | |
| DBG("Created namespaces: user, net, mnt, ipc, uts, pid"); | |
| /* CLONE_NEWPID requires a second fork - the child becomes PID 1 */ | |
| pid_t inner_pid = fork(); | |
| if (inner_pid < 0) | |
| die("fork (inner)"); | |
| if (inner_pid > 0) { | |
| /* Intermediate process: wait for inner child and exit with its status */ | |
| int status; | |
| waitpid(inner_pid, &status, 0); | |
| _exit(WIFEXITED(status) ? WEXITSTATUS(status) : 1); | |
| } | |
| /* ---------- inner child (PID 1 in new namespace) ---------- */ | |
| IGNORE_RESULT(write(sync[1], "1", 1)); | |
| IGNORE_RESULT(read(sync[1], &uid, sizeof(uid))); | |
| IGNORE_RESULT(read(sync[1], &gid, sizeof(gid))); | |
| close(sync[1]); | |
| char cwd[PATH_MAX]; | |
| struct fs_sandbox fs_sandbox; | |
| if (!getcwd(cwd, sizeof(cwd))) | |
| die("getcwd"); | |
| prepare_fs_sandbox(&fs_sandbox, cwd, overlay_base); | |
| int sockfd = recv_fd(ctl[1]); | |
| close(sp[1]); | |
| int tunfd = tun_create("tun0"); | |
| /* network config inside child netns */ | |
| if_up_netlink("lo"); | |
| if_addr_ptp("tun0", "10.0.0.2", "10.0.0.1"); | |
| if_up("tun0"); | |
| add_default_route("tun0", "10.0.0.1"); | |
| DBG("Network setup: tun0 (10.0.0.2 -> 10.0.0.1), lo up"); | |
| fcntl(tunfd, F_SETFD, FD_CLOEXEC); | |
| send_fd(ctl[1], tunfd); | |
| close(tunfd); | |
| close(sockfd); | |
| close(ctl[1]); | |
| enter_fs_sandbox(&fs_sandbox); | |
| /* Privilege dropping is mandatory */ | |
| drop_caps(); | |
| DBG("Dropped capabilities"); | |
| if (apply_landlock_policy(fs_sandbox.resolved_cwd) < 0) | |
| die("landlock_restrict_self"); | |
| if (apply_child_seccomp() < 0) | |
| die("seccomp"); | |
| char **envp = build_sanitized_envp(fs_sandbox.resolved_cwd); | |
| if (interactive_stdio) { | |
| interactive_child_setup(&interactive_session); | |
| if (interactive_session.pty_slave_fd > STDERR_FILENO) | |
| close(interactive_session.pty_slave_fd); | |
| } else { | |
| dup2(stdout_pipe[1], STDOUT_FILENO); | |
| dup2(stderr_pipe[1], STDERR_FILENO); | |
| int null_fd = open("/dev/null", O_RDONLY); | |
| if (null_fd >= 0) { | |
| dup2(null_fd, STDIN_FILENO); | |
| close(null_fd); | |
| } | |
| close(stdout_pipe[0]); | |
| close(stdout_pipe[1]); | |
| close(stderr_pipe[0]); | |
| close(stderr_pipe[1]); | |
| } | |
| close_extra_fds_for_exec(); | |
| apply_child_rlimits(); | |
| DBG("Executing: %s", argv[0]); | |
| execvpe(argv[0], argv, envp); | |
| die("exec"); | |
| } | |
| /* ---------- parent ---------- */ | |
| cgroup_move_child_to_payload(pid); | |
| close(ctl[1]); | |
| close(sync[1]); | |
| if (interactive_session.pty_slave_fd >= 0) { | |
| close(interactive_session.pty_slave_fd); | |
| interactive_session.pty_slave_fd = -1; | |
| } | |
| char tmp; | |
| IGNORE_RESULT(read(sync[0], &tmp, 1)); | |
| char path[128], map[64]; | |
| snprintf(path, sizeof(path), "/proc/%d/setgroups", pid); | |
| write_file(path, "deny"); | |
| snprintf(path, sizeof(path), "/proc/%d/uid_map", pid); | |
| snprintf(map, sizeof(map), "%d %d 1\n", uid, uid); | |
| write_file(path, map); | |
| snprintf(path, sizeof(path), "/proc/%d/gid_map", pid); | |
| snprintf(map, sizeof(map), "%d %d 1\n", gid, gid); | |
| write_file(path, map); | |
| IGNORE_RESULT(write(sync[0], &uid, sizeof(uid))); | |
| IGNORE_RESULT(write(sync[0], &gid, sizeof(gid))); | |
| send_fd(ctl[0], sp[1]); | |
| close(sp[1]); | |
| if (!interactive_stdio) { | |
| close(stdout_pipe[1]); | |
| close(stderr_pipe[1]); | |
| } | |
| int tunfd = recv_fd(ctl[0]); | |
| int status = | |
| event_loop(tunfd, pid, stdout_pipe[0], stderr_pipe[0], &interactive_session); | |
| interactive_close_session(&interactive_session); | |
| if (status < 0 && waitpid(pid, &status, 0) < 0) | |
| status = 1; | |
| if (rmdir(overlay_base) < 0 && errno != ENOENT) | |
| perror("rmdir overlay base"); | |
| if (WIFEXITED(status)) | |
| return WEXITSTATUS(status); | |
| return 1; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment