Skip to content

Instantly share code, notes, and snippets.

@rrampage
Last active April 17, 2026 06:10
Show Gist options
  • Select an option

  • Save rrampage/92f0eb6bf56d7bb403aff069cc8f1d6b to your computer and use it in GitHub Desktop.

Select an option

Save rrampage/92f0eb6bf56d7bb403aff069cc8f1d6b to your computer and use it in GitHub Desktop.
A userspace sandbox which uses SOCKS proxy to restrict network access (inspired by oniux)
#define _GNU_SOURCE
/*
* sockpuppet.c - single-file Linux sandbox + userspace network broker
*
* Quick build:
* gcc -O2 -g -Wall -Wextra -Wformat -Wformat=2 -Wconversion \
* -Wimplicit-fallthrough -Werror=format-security \
* -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=3 -D_GLIBCXX_ASSERTIONS \
* -fstack-clash-protection -fstack-protector-strong \
* -Wl,-z,relro -Wl,-z,now -Wl,--as-needed \
* -Wl,--no-copy-dt-needed-entries sockpuppet.c -o sockpuppet
*
* Common usage:
* ./sockpuppet /bin/sh
* Run a command with non-interactive stdio: stdin from /dev/null and
* stdout/stderr relayed through the parent.
*
* ./sockpuppet --interactive /bin/sh
* Run with a private PTY for shells, REPLs, and full-screen terminal apps.
*
* ./sockpuppet --allow-host=127.0.0.1:8080/tcp curl http://10.0.1.1:8080
* Allow the sandbox to reach a host-local service through the 10.0.1.x
* gateway mapping.
*
* ./sockpuppet --socks socks5://127.0.0.1:1080 curl https://example.com
* Route outbound traffic through a SOCKS5 proxy.
*
* ./sockpuppet --unsafe-share-cwd ...
* Allow running from /, /root, or /home/... when you intentionally want
* the current working directory exposed inside the sandbox.
*/
#include <arpa/inet.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/filter.h>
#include <linux/if.h>
#include <linux/if_tun.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/seccomp.h>
#include <netdb.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <stddef.h>
#include <poll.h>
#include <sched.h>
#include <signal.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/ioctl.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/random.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/xattr.h>
#include <termios.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>
#define MAX_TCP 128
#define MAX_UDP 64
#define MAX_EVENTS 64
#define EPOLL_TIMEOUT_MS 100
#define TCP_PENDING_WRITE_CAP 262144
#define BROKER_MEMORY_LOW "67108864"
#define BROKER_MEMORY_HIGH "134217728"
#define BROKER_MEMORY_MAX "201326592"
#define BROKER_PIDS_MAX "32"
#define BROKER_CPU_WEIGHT "200"
#define PAYLOAD_MEMORY_HIGH "805306368"
#define PAYLOAD_MEMORY_MAX "1073741824"
#define PAYLOAD_PIDS_MAX "128"
#define PAYLOAD_CPU_WEIGHT "100"
#define PAYLOAD_CPU_MAX "200000 100000"
#define SCOPE_MEMORY_MAX "1280M"
#define SCOPE_MEMORY_HIGH "896M"
#define SCOPE_TASKS_MAX "160"
#define SCOPE_CPU_QUOTA "300%"
#define SCOPE_NOFILE "8192"
#define RLIMIT_PARENT_NOFILE 4096
#define RLIMIT_CHILD_NOFILE 1024
/* Epoll event data wrapper */
/* Tags every fd registered with epoll so the event loop can dispatch on kind. */
enum fd_type {
  FD_TUN = 1,          /* TUN device carrying guest packets */
  FD_STDOUT_RELAY,     /* child stdout pipe relayed by the parent */
  FD_STDERR_RELAY,     /* child stderr pipe relayed by the parent */
  FD_INTERACTIVE_TTY,  /* host terminal in --interactive mode */
  FD_INTERACTIVE_PTY,  /* PTY master in --interactive mode */
  FD_TCP,              /* proxied TCP backend socket */
  FD_UDP_RELAY,        /* UDP socket toward the SOCKS relay */
  FD_UDP_CTRL          /* SOCKS5 TCP control connection for UDP ASSOCIATE */
};
/* Stored in epoll_event.data.ptr: identifies the fd and its owning flow. */
struct epoll_wrapper {
  enum fd_type type;
  int fd;
  void *flow; /* Points to tcp_flow or udp_flow */
};
static int g_epfd = -1; /* Global epoll fd */
/* SOCKS5 proxy configuration */
struct socks_config {
  char host[256];          /* proxy hostname/IP as given on the command line */
  int port;
  char username[256];      /* SOCKS username/password auth (RFC 1929) */
  char password[256];
  int enabled;             /* nonzero once --socks was parsed */
  int addr_valid;          /* nonzero when addr below holds a resolved address */
  struct sockaddr_in addr; /* resolved proxy endpoint */
};
/* Progress of the non-blocking SOCKS5 handshake for one flow. */
enum socks_io_state {
  SOCKS_IO_NONE = 0,
  SOCKS_IO_CONNECTING, /* TCP connect() to the proxy in flight */
  SOCKS_IO_METHOD,     /* method-selection exchange */
  SOCKS_IO_AUTH,       /* username/password sub-negotiation */
  SOCKS_IO_REQUEST,    /* CONNECT / UDP ASSOCIATE request */
  SOCKS_IO_READY,      /* handshake complete, data may flow */
  SOCKS_IO_FAILED
};
/* Per-flow handshake buffers and state machine position. */
struct socks_io {
  int active;
  int is_udp;           /* UDP ASSOCIATE rather than CONNECT */
  int connect_pending;  /* connect() returned EINPROGRESS */
  enum socks_io_state state;
  uint32_t target_ip;   /* final destination (network byte order) */
  uint16_t target_port;
  uint8_t txbuf[512];   /* pending handshake bytes to send */
  size_t tx_off;
  size_t tx_len;
  uint8_t rxbuf[512];   /* partial handshake reply received so far */
  size_t rx_len;
};
static struct socks_config socks_proxy = {0};
static int unsafe_share_cwd = 0;   /* --unsafe-share-cwd */
static int interactive_stdio = 0;  /* --interactive */
static int verbose = 0; /* Verbose debug output */
static volatile sig_atomic_t interactive_resize_pending = 0; /* set by SIGWINCH */
/* Saved host terminal state so it can be restored after an interactive run. */
struct interactive_session {
  int active;
  int host_tty_fd;
  int pty_master_fd;
  int pty_slave_fd;
  struct termios host_termios;  /* original host termios to restore */
  struct winsize host_winsize;  /* original host window size */
  int host_termios_saved;
  int host_winsize_saved;
};
/* Debug macro - only prints if verbose mode enabled */
#define DBG(fmt, ...) \
  do { \
    if (verbose) \
      fprintf(stderr, "[sockpuppet] " fmt "\n", ##__VA_ARGS__); \
  } while (0)
/* Host gateway configuration - map 10.0.1.x to 127.0.0.x */
#define HOST_PING_IP 0x0100000a /* 10.0.0.1 - only for ping */
#define HOST_GATEWAY_BASE 0x0001000a /* 10.0.1.0 network byte order */
#define HOST_GATEWAY_MASK 0x00ffffff /* /24 mask for 10.0.1.x */
#define LOCALHOST_BASE 0x0000007f /* 127.0.0.0 network byte order */
#define MAX_HOST_RULES 64
struct host_rule {
  uint8_t last_octet; /* x in 127.0.0.x (1-255), 0 = wildcard */
  uint16_t port; /* port number, 0 = all ports */
  int proto; /* IPPROTO_TCP, IPPROTO_UDP, or 0 for both */
  int wildcard_ip; /* match all 127.0.0.x */
  int wildcard_port; /* match all ports */
};
static struct host_rule host_rules[MAX_HOST_RULES];
static int host_rule_count = 0;
static int host_allow_all = 0; /* --host=* */

/* True when addr (network byte order) lies in the 10.0.1.0/24 gateway net. */
static int is_gateway_ip(uint32_t addr) {
  return HOST_GATEWAY_BASE == (addr & HOST_GATEWAY_MASK);
}

/* Extract x from 10.0.1.x (network byte order: x is the top byte). */
static uint8_t gateway_last_octet(uint32_t addr) {
  return (uint8_t)(0xff & (addr >> 24));
}

/* Map 10.0.1.x onto the matching loopback address 127.0.0.x. */
static uint32_t gateway_to_localhost(uint32_t gw_ip) {
  uint32_t octet = gateway_last_octet(gw_ip);
  return (octet << 24) | LOCALHOST_BASE;
}

/* Decide whether a sandboxed connection to gw_ip:port/proto may be bridged
 * to the host: either --host=* is set, or some configured rule matches. */
static int is_gateway_allowed(uint32_t gw_ip, uint16_t port, int proto) {
  if (!is_gateway_ip(gw_ip))
    return 0;
  if (host_allow_all)
    return 1;
  uint8_t octet = gateway_last_octet(gw_ip);
  for (int i = 0; i < host_rule_count; i++) {
    const struct host_rule *rule = &host_rules[i];
    if (!rule->wildcard_ip && rule->last_octet != octet)
      continue;
    if (!rule->wildcard_port && rule->port != port)
      continue;
    if (rule->proto != 0 && rule->proto != proto)
      continue;
    return 1;
  }
  return 0;
}
/* Rate limiting */
#define MAX_CONNECTS_PER_SEC 50
#define TCP_HALF_OPEN_TIMEOUT_SEC 10
#define TCP_IDLE_TIMEOUT_SEC 120
static struct timespec rate_limit_last = {0};
static double rate_limit_tokens = (double)MAX_CONNECTS_PER_SEC;

/* Seconds between two CLOCK_MONOTONIC samples (now - then). */
static double monotonic_elapsed_seconds(struct timespec now,
                                        struct timespec then) {
  double whole = (double)(now.tv_sec - then.tv_sec);
  double frac = (double)(now.tv_nsec - then.tv_nsec) / 1000000000.0;
  return whole + frac;
}

/* Token-bucket limiter for new connections: refills at
 * MAX_CONNECTS_PER_SEC tokens per second, capped at the same value.
 * Returns 1 and consumes a token when a connection may proceed, 0 when
 * the caller must drop the attempt (also 0 if the clock read fails). */
static int check_rate_limit(void) {
  struct timespec now;
  if (clock_gettime(CLOCK_MONOTONIC, &now) < 0)
    return 0;
  int never_sampled =
      (rate_limit_last.tv_sec == 0 && rate_limit_last.tv_nsec == 0);
  if (never_sampled) {
    rate_limit_last = now;
  } else {
    double dt = monotonic_elapsed_seconds(now, rate_limit_last);
    if (dt > 0.0) {
      rate_limit_tokens += dt * (double)MAX_CONNECTS_PER_SEC;
      if (rate_limit_tokens > (double)MAX_CONNECTS_PER_SEC)
        rate_limit_tokens = (double)MAX_CONNECTS_PER_SEC;
      rate_limit_last = now;
    }
  }
  if (rate_limit_tokens >= 1.0) {
    rate_limit_tokens -= 1.0;
    return 1;
  }
  return 0;
}
/* TCP connection states */
/* Userspace TCP state machine (subset of RFC 793 states, SP_ prefixed). */
enum tcp_state {
  SP_TCP_CLOSED = 0,
  SP_TCP_SYN_RECEIVED,
  SP_TCP_ESTABLISHED,
  SP_TCP_FIN_WAIT_1,
  SP_TCP_FIN_WAIT_2,
  SP_TCP_CLOSE_WAIT,
  SP_TCP_CLOSING,
  SP_TCP_LAST_ACK,
  SP_TCP_TIME_WAIT
};
/* One proxied TCP connection: guest endpoint state plus backend socket. */
struct tcp_flow {
  uint32_t cli_ip;    /* client (guest) address, network byte order */
  uint16_t cli_port;
  uint32_t srv_ip;    /* server (destination) address */
  uint16_t srv_port;
  uint32_t cli_isn;   /* initial sequence numbers for both directions */
  uint32_t srv_isn;
  uint32_t cli_next;  /* next expected sequence numbers */
  uint32_t srv_next;
  int sock;           /* backend socket toward server or SOCKS proxy */
  enum tcp_state state;
  time_t last_active; /* for idle/half-open timeouts */
  /* TCP timestamp option (RFC 7323) */
  int ts_ok; /* Timestamps negotiated */
  uint32_t ts_recent; /* Last TSval received from client */
  uint8_t pending_write[TCP_PENDING_WRITE_CAP]; /* bytes queued to backend */
  size_t pending_write_off;
  size_t pending_write_len;
  int pending_fin;          /* FIN from guest queued behind pending data */
  uint32_t pending_fin_seq;
  int backend_ready;        /* backend connect/SOCKS handshake finished */
  struct socks_io socks;
  struct epoll_wrapper ew; /* Epoll registration */
};
static struct tcp_flow tcp_flows[MAX_TCP];
/* One proxied UDP "association" (guest src/dst pair) through SOCKS. */
struct udp_flow {
  uint32_t cli_ip;
  uint16_t cli_port;
  uint32_t srv_ip;
  uint16_t srv_port;
  int tcp_ctrl; /* SOCKS5 TCP control connection (must stay open) */
  int udp_relay; /* UDP socket to SOCKS relay */
  int udp_staging; /* UDP socket bound before SOCKS UDP ASSOCIATE completes */
  time_t last_used; /* Last activity timestamp */
  struct sockaddr_in relay_addr; /* Expected relay source for validation */
  struct socks_io socks;
  uint8_t pending_data[65535]; /* one datagram held until the relay is ready */
  size_t pending_len;
  int pending_set;
  unsigned long dropped_backpressure; /* datagrams dropped while congested */
  struct epoll_wrapper ew; /* Epoll registration */
  struct epoll_wrapper ctrl_ew; /* Epoll registration for SOCKS control TCP */
};
static struct udp_flow udp_flows[MAX_UDP];
/* Paths of the delegated cgroup subtree used to contain broker + payload. */
struct cgroup_ctx {
  int active;         /* containment is set up and usable */
  int cpu_enabled;    /* which controllers were available and enabled */
  int memory_enabled;
  int pids_enabled;
  char root[PATH_MAX];    /* delegated subtree root */
  char broker[PATH_MAX];  /* leaf for this (broker) process */
  char payload[PATH_MAX]; /* leaf for the sandboxed child */
};
static struct cgroup_ctx g_cgroup = {0};
/* I/O buffer for event loop reads - safe as static since writes are via
opaque syscalls (read/recv) that act as compiler barriers. */
static uint8_t g_io_buf[65536];
/* ---------- utilities ---------- */
/* Print msg with the errno description and terminate the process. */
static void die(const char *msg) {
  perror(msg);
  exit(EXIT_FAILURE);
}
/* Write exactly len bytes to fd, retrying on EINTR and short writes.
 * Returns len on success, -1 on any error or zero-length write. */
static ssize_t write_all(int fd, const void *buf, size_t len) {
  const uint8_t *bytes = buf;
  size_t done = 0;
  while (done < len) {
    ssize_t n = write(fd, bytes + done, len - done);
    if (n < 0) {
      if (errno == EINTR)
        continue;
      return -1;
    }
    if (n == 0)
      return -1;
    done += (size_t)n;
  }
  return (ssize_t)done;
}
/* Write one packet to the TUN device. TUN writes are all-or-nothing, so a
 * short write means the packet is corrupt on the wire and is dropped.
 * Retries only on EINTR; EAGAIN drops the packet (best-effort datapath).
 * Returns 0 when the full packet was written, -1 when it was dropped.
 * `what` is a label used only in debug output. */
static int tun_write_packet(int tunfd, const uint8_t *buf, size_t len,
                            const char *what) {
  for (;;) {
    ssize_t n = write(tunfd, buf, len);
    if (n == (ssize_t)len)
      return 0;
    if (n > 0) {
      /* partial packet: cannot be completed, drop it */
      DBG("TUN write short (%s): %zd/%zu, dropping packet", what, n, len);
      return -1;
    }
    if (errno == EINTR)
      continue;
    if (errno == EAGAIN || errno == EWOULDBLOCK) {
      DBG("TUN write would block (%s), dropping packet", what);
      return -1;
    }
    DBG("TUN write failed (%s): %s", what, strerror(errno));
    return -1;
  }
}
/* SIGWINCH handler: defer the resize to the event loop (async-signal-safe). */
static void interactive_handle_sigwinch(int signo) {
  (void)signo;
  interactive_resize_pending = 1;
}
/* Parse a base-10 long from s, accepting only a fully-consumed, in-range
 * value. Returns 0 and stores the result in *out, or -1 on any error
 * (NULL/empty input, trailing garbage, overflow, or out of [min, max]). */
static int parse_long_strict(const char *s, long min, long max, long *out) {
  if (s == NULL || *s == '\0')
    return -1;
  errno = 0;
  char *end = NULL;
  long parsed = strtol(s, &end, 10);
  if (errno != 0 || end == NULL || *end != '\0')
    return -1;
  if (parsed < min || parsed > max)
    return -1;
  *out = parsed;
  return 0;
}
/* Helper to suppress unused result warnings from FORTIFY_SOURCE */
#define IGNORE_RESULT(x) \
do { \
if (x) { \
} \
} while (0)
static void write_file(const char *path, const char *data) {
int fd = open(path, O_WRONLY);
if (fd < 0)
die(path);
if (write(fd, data, strlen(data)) != (ssize_t)strlen(data))
die(path);
close(fd);
}
/* Set one rlimit to the given soft/hard pair; exits on failure.
 * `name` is only used for the error/debug message. */
static void set_rlimit_or_die(int resource, rlim_t soft, rlim_t hard,
                              const char *name) {
  struct rlimit lim = {.rlim_cur = soft, .rlim_max = hard};
  if (setrlimit(resource, &lim) < 0) {
    fprintf(stderr, "setrlimit(%s) failed: %s\n", name, strerror(errno));
    exit(1);
  }
  DBG("RLIMIT %s set to soft=%llu hard=%llu", name,
      (unsigned long long)soft, (unsigned long long)hard);
}
/* Broker process limits: no core dumps, no mlock, bounded fd table. */
static void apply_parent_rlimits(void) {
  set_rlimit_or_die(RLIMIT_CORE, 0, 0, "CORE");
  set_rlimit_or_die(RLIMIT_MEMLOCK, 0, 0, "MEMLOCK");
  set_rlimit_or_die(RLIMIT_NOFILE, RLIMIT_PARENT_NOFILE, RLIMIT_PARENT_NOFILE,
                    "NOFILE");
}
/* Sandboxed-child limits: same policy with a smaller fd table. */
static void apply_child_rlimits(void) {
  set_rlimit_or_die(RLIMIT_CORE, 0, 0, "CORE");
  set_rlimit_or_die(RLIMIT_MEMLOCK, 0, 0, "MEMLOCK");
  set_rlimit_or_die(RLIMIT_NOFILE, RLIMIT_CHILD_NOFILE, RLIMIT_CHILD_NOFILE,
                    "NOFILE");
}
static int detect_delegated_cgroup_root(struct cgroup_ctx *ctx);
/* Early scan of argv for --verbose/-v so DBG works before full option
 * parsing. Walks the leading run of known flags (skipping the value of
 * --socks) and stops at the first unrecognized token. */
static void pre_scan_verbose_flag(int argc, char **argv) {
  for (int i = 1; i < argc; ++i) {
    const char *arg = argv[i];
    if (strcmp(arg, "--verbose") == 0 || strcmp(arg, "-v") == 0) {
      verbose = 1;
    } else if (strcmp(arg, "--socks") == 0) {
      ++i; /* skip the proxy URL that follows */
    } else if (strncmp(arg, "--socks-auth-file=", 18) == 0 ||
               strncmp(arg, "--allow-host=", 13) == 0 ||
               strcmp(arg, "--unsafe-share-cwd") == 0 ||
               strcmp(arg, "--interactive") == 0) {
      /* known flag: keep scanning */
    } else {
      /* first unknown flag or the command word ends the pre-scan */
      break;
    }
  }
}
/* True when env var `name` is set, non-empty, and not exactly "0". */
static int env_flag_enabled(const char *name) {
  const char *v = getenv(name);
  if (v == NULL || *v == '\0')
    return 0;
  return strcmp(v, "0") != 0;
}
/* Resolve /proc/self/exe into out (NUL-terminated). Returns 0 on success,
 * -1 with errno set on failure (including possible truncation, reported as
 * ENAMETOOLONG).
 * Fix: guard out_sz < 2. The original computed `out_sz - 1` unchecked, so
 * out_sz == 0 underflowed to SIZE_MAX and let readlink() write far past
 * the buffer. */
static int read_self_exe_path(char *out, size_t out_sz) {
  if (out == NULL || out_sz < 2) {
    errno = EINVAL;
    return -1;
  }
  ssize_t n = readlink("/proc/self/exe", out, out_sz - 1);
  if (n < 0)
    return -1;
  if ((size_t)n >= out_sz - 1) {
    /* buffer exactly filled: the link may have been truncated */
    errno = ENAMETOOLONG;
    return -1;
  }
  out[n] = '\0';
  return 0;
}
/* Search $PATH for an executable called `name` and copy its full path into
 * out. An empty PATH segment means the current directory (POSIX). Returns 0
 * on success; -1 with errno = EINVAL (bad args), ENOENT (no PATH / not
 * found), or ENAMETOOLONG (result does not fit in out). */
static int find_executable_in_path(const char *name, char *out, size_t out_sz) {
  if (!name || !*name || !out || out_sz == 0) {
    errno = EINVAL;
    return -1;
  }
  const char *path = getenv("PATH");
  if (path == NULL || *path == '\0') {
    errno = ENOENT;
    return -1;
  }
  const char *cursor = path;
  for (;;) {
    const char *sep = strchr(cursor, ':');
    size_t dir_len = sep ? (size_t)(sep - cursor) : strlen(cursor);
    const char *dir = cursor;
    char full[PATH_MAX];
    if (dir_len == 0) {
      /* empty segment == current directory */
      dir = ".";
      dir_len = 1;
    }
    int written = snprintf(full, sizeof(full), "%.*s/%s", (int)dir_len, dir,
                           name);
    if (written >= 0 && (size_t)written < sizeof(full) &&
        access(full, X_OK) == 0) {
      if (snprintf(out, out_sz, "%s", full) >= (int)out_sz) {
        errno = ENAMETOOLONG;
        return -1;
      }
      return 0;
    }
    if (sep == NULL)
      break;
    cursor = sep + 1;
  }
  errno = ENOENT;
  return -1;
}
/* If we are not already inside a delegated cgroup subtree, re-exec ourselves
 * under `systemd-run --scope` with Delegate=yes plus resource caps, so the
 * later cgroup containment has a writable subtree to work in. On success
 * execv() never returns; on any precondition failure this silently returns
 * and the program continues without a scope. Exits with an error only when
 * _SOCKPUPPET_IN_SCOPE is set but no delegated subtree is visible (the
 * bootstrap claimed success yet delegation did not happen). */
static void maybe_reexec_under_systemd_scope(int argc, char **argv) {
  struct cgroup_ctx probe;
  char systemd_run_path[PATH_MAX];
  char self_path[PATH_MAX];
  char **new_argv;
  char *term_env = NULL;
  size_t extra_args = 0;
  size_t argc_sz = (size_t)(argc > 0 ? argc : 0);
  size_t idx = 0;
  int use_user_scope = 1;
  const char *term = getenv("TERM");
  struct stat st;
  if (detect_delegated_cgroup_root(&probe)) {
    DBG("scope bootstrap skipped: already in delegated subtree");
    return;
  }
  /* marker is set below before exec; seeing it here means the re-exec ran
     but delegation still failed — bail rather than loop forever */
  if (getenv("_SOCKPUPPET_IN_SCOPE") != NULL) {
    fprintf(stderr,
            "[sockpuppet] scope bootstrap claimed success but no delegated "
            "cgroup root is active\n");
    exit(1);
  }
  if (stat("/run/systemd/system", &st) < 0) {
    DBG("scope bootstrap skipped: systemd not available");
    return;
  }
  if (find_executable_in_path("systemd-run", systemd_run_path,
                              sizeof(systemd_run_path)) < 0) {
    DBG("scope bootstrap skipped: systemd-run not available");
    return;
  }
  if (read_self_exe_path(self_path, sizeof(self_path)) < 0)
    die("readlink /proc/self/exe");
  /* --user scope by default; system scope only when explicitly requested
     and running as root */
  if (env_flag_enabled("SOCKPUPPET_SCOPE_SYSTEM") && geteuid() == 0)
    use_user_scope = 0;
  if (term != NULL && *term != '\0')
    ++extra_args; /* room for --setenv=TERM=... */
  new_argv = calloc(argc_sz + extra_args + 20U, sizeof(*new_argv));
  if (new_argv == NULL)
    die("calloc systemd-run argv");
  new_argv[idx++] = systemd_run_path;
  if (use_user_scope)
    new_argv[idx++] = "--user";
  new_argv[idx++] = "--scope";
  new_argv[idx++] = "--quiet";
  new_argv[idx++] = "--same-dir";
  new_argv[idx++] = "--collect";
  new_argv[idx++] = "--property=Delegate=yes";
  new_argv[idx++] = "--property=MemoryMax=" SCOPE_MEMORY_MAX;
  new_argv[idx++] = "--property=MemoryHigh=" SCOPE_MEMORY_HIGH;
  new_argv[idx++] = "--property=TasksMax=" SCOPE_TASKS_MAX;
  new_argv[idx++] = "--property=CPUQuota=" SCOPE_CPU_QUOTA;
  new_argv[idx++] = "--property=LimitCORE=0";
  new_argv[idx++] = "--property=LimitNOFILE=" SCOPE_NOFILE;
  new_argv[idx++] = "--setenv=_SOCKPUPPET_IN_SCOPE=1";
  if (term != NULL && *term != '\0') {
    size_t term_len = strlen(term);
    term_env = malloc(term_len + sizeof("--setenv=TERM="));
    if (term_env == NULL)
      die("malloc TERM env");
    if (snprintf(term_env, term_len + sizeof("--setenv=TERM="),
                 "--setenv=TERM=%s", term) >=
        (int)(term_len + sizeof("--setenv=TERM=")))
      die("TERM env too long");
    new_argv[idx++] = term_env;
  }
  new_argv[idx++] = "--";
  new_argv[idx++] = self_path;
  for (int i = 1; i < argc; ++i)
    new_argv[idx++] = argv[i];
  new_argv[idx] = NULL;
  execv(systemd_run_path, new_argv);
  /* exec failed: clean up and continue without a scope */
  DBG("scope bootstrap skipped: exec systemd-run failed (%s)",
      strerror(errno));
  free(term_env);
  free(new_argv);
}
/* Read this process's cgroup-v2 path (the "0::" entry of
 * /proc/self/cgroup) into out. An empty path is reported as "/".
 * Returns 0 on success, -1 on open failure, no v2 entry, or truncation. */
static int read_self_cgroup_path(char *out, size_t out_sz) {
  char line[1024];
  int rc = -1;
  FILE *fp = fopen("/proc/self/cgroup", "r");
  if (fp == NULL)
    return -1;
  while (fgets(line, sizeof(line), fp) != NULL) {
    if (strncmp(line, "0::", 3) != 0)
      continue; /* only the unified (v2) hierarchy entry */
    char *entry = line + 3;
    char *newline = strchr(entry, '\n');
    if (newline != NULL)
      *newline = '\0';
    if (*entry == '\0')
      entry = "/";
    if (snprintf(out, out_sz, "%s", entry) < (int)out_sz)
      rc = 0;
    break;
  }
  fclose(fp);
  return rc;
}
/* Write `value` to the existing file dir/name in a single write().
 * Returns 0 on a complete write, -1 otherwise (errno preserved from the
 * failing write even though close() may run in between). */
static int cgroup_write_file(const char *dir, const char *name,
                             const char *value) {
  char path[PATH_MAX];
  if (snprintf(path, sizeof(path), "%s/%s", dir, name) >= (int)sizeof(path))
    return -1;
  int fd = open(path, O_WRONLY | O_CLOEXEC);
  if (fd < 0)
    return -1;
  size_t len = strlen(value);
  ssize_t written = write(fd, value, len);
  int write_errno = errno; /* close() below may clobber errno */
  close(fd);
  if (written == (ssize_t)len)
    return 0;
  errno = write_errno ? write_errno : EIO;
  return -1;
}
/* Create a cgroup leaf directory; an already-existing leaf is success. */
static int cgroup_mkdir_leaf(const char *path) {
  if (mkdir(path, 0755) == 0 || errno == EEXIST)
    return 0;
  return -1;
}
/* Move `pid` into the cgroup leaf by writing it to cgroup.procs.
 * Returns 0 on success, -1 on formatting or write failure. */
static int cgroup_move_pid(const char *leaf, pid_t pid) {
  char pid_text[32];
  int n = snprintf(pid_text, sizeof(pid_text), "%ld", (long)pid);
  if (n >= (int)sizeof(pid_text))
    return -1;
  return cgroup_write_file(leaf, "cgroup.procs", pid_text);
}
/* True when `needle` appears as a whole whitespace-separated token in a
 * cgroup.controllers-style string (e.g. "cpu memory pids\n"). */
static int cgroup_has_controller(const char *controllers, const char *needle) {
  size_t want = strlen(needle);
  const char *cursor = controllers;
  for (;;) {
    /* skip separators */
    while (*cursor == ' ' || *cursor == '\t' || *cursor == '\n')
      ++cursor;
    if (*cursor == '\0')
      return 0;
    const char *token = cursor;
    while (*cursor && *cursor != ' ' && *cursor != '\t' && *cursor != '\n')
      ++cursor;
    if ((size_t)(cursor - token) == want &&
        strncmp(token, needle, want) == 0)
      return 1;
  }
}
/* Enable the requested controllers (intersected with what cgroup.controllers
 * actually offers) in the root's cgroup.subtree_control, recording which ones
 * were enabled in ctx->{cpu,memory,pids}_enabled. Returns 0 on success or
 * when nothing needed enabling, -1 on read/write failure. */
static int cgroup_enable_controllers(struct cgroup_ctx *ctx, int want_cpu,
                                     int want_memory, int want_pids) {
  char path[PATH_MAX];
  char controllers[1024];
  FILE *fp;
  char enable[128] = {0};
  size_t off = 0;
  if (snprintf(path, sizeof(path), "%s/cgroup.controllers", ctx->root) >=
      (int)sizeof(path))
    return -1;
  fp = fopen(path, "r");
  if (!fp)
    return -1;
  if (!fgets(controllers, sizeof(controllers), fp)) {
    fclose(fp);
    return -1;
  }
  fclose(fp);
  /* request only controllers that are both wanted and available */
  ctx->cpu_enabled = want_cpu && cgroup_has_controller(controllers, "cpu");
  ctx->memory_enabled =
      want_memory && cgroup_has_controller(controllers, "memory");
  ctx->pids_enabled = want_pids && cgroup_has_controller(controllers, "pids");
  /* build e.g. "+cpu +memory +pids" (space-separated after the first) */
  if (ctx->cpu_enabled)
    off += (size_t)snprintf(enable + off, sizeof(enable) - off, "%s+cpu",
                            off > 0 ? " " : "");
  if (ctx->memory_enabled)
    off += (size_t)snprintf(enable + off, sizeof(enable) - off, "%s+memory",
                            off > 0 ? " " : "");
  if (ctx->pids_enabled)
    off += (size_t)snprintf(enable + off, sizeof(enable) - off, "%s+pids",
                            off > 0 ? " " : "");
  if (off == 0)
    return 0; /* nothing available to enable */
  if (off >= sizeof(enable))
    return -1;
  if (cgroup_write_file(ctx->root, "cgroup.subtree_control", enable) < 0)
    return -1;
  DBG("cgroup controllers enabled:%s%s%s", ctx->cpu_enabled ? " cpu" : "",
      ctx->memory_enabled ? " memory" : "", ctx->pids_enabled ? " pids" : "");
  return 0;
}
/* Probe whether this process sits in a delegated, writable cgroup-v2
 * subtree. Fills ctx->root (mounted path under /sys/fs/cgroup) plus the
 * broker/payload leaf paths and sets ctx->active. Returns 1 when delegation
 * is usable (cgroup.controllers readable AND cgroup.subtree_control
 * writable), 0 otherwise. The user.delegate xattr is only logged, never
 * required. */
static int detect_delegated_cgroup_root(struct cgroup_ctx *ctx) {
  char rel[PATH_MAX];
  char ctrl_path[PATH_MAX];
  char subtree_path[PATH_MAX];
  char delegate[8] = {0};
  ssize_t xrc;
  memset(ctx, 0, sizeof(*ctx));
  if (read_self_cgroup_path(rel, sizeof(rel)) < 0)
    return 0;
  if (strcmp(rel, "/") == 0) {
    if (snprintf(ctx->root, sizeof(ctx->root), "/sys/fs/cgroup") >=
        (int)sizeof(ctx->root))
      return 0;
  } else {
    if (snprintf(ctx->root, sizeof(ctx->root), "/sys/fs/cgroup%s", rel) >=
        (int)sizeof(ctx->root))
      return 0;
  }
  if (snprintf(ctrl_path, sizeof(ctrl_path), "%s/cgroup.controllers", ctx->root) >=
      (int)sizeof(ctrl_path) ||
      snprintf(subtree_path, sizeof(subtree_path), "%s/cgroup.subtree_control",
               ctx->root) >= (int)sizeof(subtree_path))
    return 0;
  /* writable subtree_control is the delegation test */
  if (access(ctrl_path, R_OK) < 0 || access(subtree_path, W_OK) < 0)
    return 0;
  xrc = getxattr(ctx->root, "user.delegate", delegate, sizeof(delegate) - 1);
  if (xrc > 0) {
    delegate[xrc] = '\0';
    DBG("cgroup user.delegate=%s", delegate);
  }
  if (snprintf(ctx->broker, sizeof(ctx->broker), "%s/broker", ctx->root) >=
      (int)sizeof(ctx->broker) ||
      snprintf(ctx->payload, sizeof(ctx->payload), "%s/payload", ctx->root) >=
      (int)sizeof(ctx->payload))
    return 0;
  ctx->active = 1;
  return 1;
}
/* Return 1 if the delegated root's cgroup.procs lists any pid other than
 * ourselves (or if the file cannot be read — fail closed), 0 when we are
 * the only occupant and may safely repartition the subtree. */
static int cgroup_root_has_foreign_procs(const struct cgroup_ctx *ctx) {
  char path[PATH_MAX];
  FILE *fp;
  long pid;
  pid_t self = getpid();
  if (snprintf(path, sizeof(path), "%s/cgroup.procs", ctx->root) >=
      (int)sizeof(path))
    return 1;
  fp = fopen(path, "r");
  if (!fp)
    return 1; /* unreadable: assume foreign occupants */
  while (fscanf(fp, "%ld", &pid) == 1) {
    if ((pid_t)pid != self) {
      fclose(fp);
      return 1;
    }
  }
  fclose(fp);
  return 0;
}
/* Best-effort undo of a partial containment setup: move ourselves back to
 * the delegated root and remove the broker/payload leaves. Failures are
 * only logged. */
static void cgroup_rollback_setup(const struct cgroup_ctx *ctx) {
  if (cgroup_move_pid(ctx->root, getpid()) < 0) {
    DBG("cgroup containment rollback: failed to move self back to root (%s)",
        strerror(errno));
  }
  if (rmdir(ctx->broker) < 0 && errno != ENOENT && errno != ENOTEMPTY) {
    DBG("cgroup containment rollback: failed to remove broker leaf (%s)",
        strerror(errno));
  }
  if (rmdir(ctx->payload) < 0 && errno != ENOENT && errno != ENOTEMPTY) {
    DBG("cgroup containment rollback: failed to remove payload leaf (%s)",
        strerror(errno));
  }
}
/* Apply memory/pids/cpu limits (BROKER_* constants) to the broker leaf.
 * Each controller is configured only if it was enabled; any write failure
 * is fatal. memory.oom.group=1 makes the OOM killer take the whole leaf. */
static void cgroup_apply_broker_limits(const struct cgroup_ctx *ctx) {
  if (ctx->memory_enabled) {
    if (cgroup_write_file(ctx->broker, "memory.low", BROKER_MEMORY_LOW) < 0 ||
        cgroup_write_file(ctx->broker, "memory.high", BROKER_MEMORY_HIGH) < 0 ||
        cgroup_write_file(ctx->broker, "memory.max", BROKER_MEMORY_MAX) < 0 ||
        cgroup_write_file(ctx->broker, "memory.oom.group", "1") < 0)
      die("cgroup broker memory limits");
  }
  if (ctx->pids_enabled &&
      cgroup_write_file(ctx->broker, "pids.max", BROKER_PIDS_MAX) < 0)
    die("cgroup broker pids.max");
  if (ctx->cpu_enabled &&
      cgroup_write_file(ctx->broker, "cpu.weight", BROKER_CPU_WEIGHT) < 0)
    die("cgroup broker cpu.weight");
}
/* Apply PAYLOAD_* limits to the payload leaf. memory.swap.max and cpu.max
 * are optional interfaces: probed with access() first and skipped (with a
 * debug note) when the kernel does not expose them. */
static void cgroup_apply_payload_limits(const struct cgroup_ctx *ctx) {
  char path[PATH_MAX];
  if (ctx->memory_enabled) {
    if (cgroup_write_file(ctx->payload, "memory.high", PAYLOAD_MEMORY_HIGH) < 0 ||
        cgroup_write_file(ctx->payload, "memory.max", PAYLOAD_MEMORY_MAX) < 0 ||
        cgroup_write_file(ctx->payload, "memory.oom.group", "1") < 0)
      die("cgroup payload memory limits");
    if (snprintf(path, sizeof(path), "%s/memory.swap.max", ctx->payload) <
        (int)sizeof(path) &&
        access(path, F_OK) == 0) {
      if (cgroup_write_file(ctx->payload, "memory.swap.max", "0") < 0)
        die("cgroup payload memory.swap.max");
    } else {
      DBG("cgroup payload memory.swap.max not available, skipping");
    }
  }
  if (ctx->pids_enabled &&
      cgroup_write_file(ctx->payload, "pids.max", PAYLOAD_PIDS_MAX) < 0)
    die("cgroup payload pids.max");
  if (ctx->cpu_enabled) {
    if (cgroup_write_file(ctx->payload, "cpu.weight", PAYLOAD_CPU_WEIGHT) < 0)
      die("cgroup payload cpu.weight");
    if (snprintf(path, sizeof(path), "%s/cpu.max", ctx->payload) <
        (int)sizeof(path) &&
        access(path, F_OK) == 0) {
      if (cgroup_write_file(ctx->payload, "cpu.max", PAYLOAD_CPU_MAX) < 0)
        die("cgroup payload cpu.max");
    } else {
      DBG("cgroup payload cpu.max not available, skipping");
    }
  }
}
/* Orchestrate cgroup containment: detect a delegated subtree, ensure we are
 * its only occupant, create broker/payload leaves, move ourselves into the
 * broker leaf (required before enabling subtree controllers — the v2
 * "no internal processes" rule), enable controllers, and apply limits.
 * Non-fatal environment problems (no delegation, EACCES/EPERM, etc.) leave
 * containment inactive; unexpected errors are fatal.
 * NOTE(review): the "containment active" DBG fires before the foreign-pid
 * check, so it can be followed by an "inactive" message. */
static void cgroup_setup_containment(void) {
  struct cgroup_ctx ctx;
  if (!detect_delegated_cgroup_root(&ctx)) {
    DBG("cgroup containment inactive: no delegated writable subtree");
    return;
  }
  DBG("cgroup containment active under %s", ctx.root);
  if (cgroup_root_has_foreign_procs(&ctx)) {
    DBG("cgroup containment inactive: delegated root contains foreign pids");
    return;
  }
  if (cgroup_mkdir_leaf(ctx.broker) < 0) {
    if (errno == EACCES || errno == EPERM || errno == EROFS) {
      DBG("cgroup containment inactive: cannot create broker leaf (%s)",
          strerror(errno));
      return;
    }
    die("cgroup mkdir broker");
  }
  if (cgroup_move_pid(ctx.broker, getpid()) < 0)
    die("cgroup move self to broker");
  if (cgroup_enable_controllers(&ctx, 1, 1, 1) < 0) {
    if (errno == EBUSY || errno == EACCES || errno == EPERM ||
        errno == EROFS || errno == EOPNOTSUPP) {
      DBG("cgroup containment inactive: cannot enable controllers (%s)",
          strerror(errno));
      cgroup_rollback_setup(&ctx);
      return;
    }
    die("cgroup enable controllers");
  }
  if (cgroup_mkdir_leaf(ctx.payload) < 0)
    die("cgroup mkdir payload");
  cgroup_apply_broker_limits(&ctx);
  cgroup_apply_payload_limits(&ctx);
  DBG("cgroup broker leaf: %s", ctx.broker);
  DBG("cgroup payload leaf: %s", ctx.payload);
  g_cgroup = ctx;
}
/* Place the sandboxed child into the payload leaf (no-op when containment
 * is inactive); failure to move an existing leaf is fatal. */
static void cgroup_move_child_to_payload(pid_t pid) {
  if (!g_cgroup.active)
    return;
  if (cgroup_move_pid(g_cgroup.payload, pid) < 0)
    die("cgroup move child to payload");
}
/* mkdir that tolerates an already-existing directory; dies otherwise. */
static void mkdir_if_missing(const char *path, mode_t mode) {
  if (mkdir(path, mode) < 0 && errno != EEXIST)
    die(path);
}
#if defined(__has_include)
#if __has_include(<linux/landlock.h>)
#include <linux/landlock.h>
#define SP_HAVE_LANDLOCK 1
#endif
#endif
#ifndef SP_HAVE_LANDLOCK
#define SP_HAVE_LANDLOCK 0
#endif
/* Paths computed while building the sandbox filesystem. */
struct fs_sandbox {
  char resolved_cwd[PATH_MAX]; /* realpath() of the working directory */
  char final_root[PATH_MAX];   /* directory that becomes the new root */
};
/* Create (or truncate) a file and write `data`; dies on any failure. */
static void write_text_file(const char *path, const char *data) {
  int fd = open(path, O_CREAT | O_WRONLY | O_TRUNC | O_CLOEXEC, 0644);
  if (fd < 0)
    die(path);
  size_t len = strlen(data);
  if (write(fd, data, len) != (ssize_t)len)
    die("write");
  close(fd);
}
/* True when `path` can be stat()ed (follows symlinks). */
static int path_exists(const char *path) {
  struct stat info;
  return stat(path, &info) == 0;
}
/* mkdir -p: create every component of `path` with `mode`, tolerating
 * components that already exist; dies on other errors or overlong paths. */
static void mkdir_parents(const char *path, mode_t mode) {
  char tmp[PATH_MAX];
  size_t len = strlen(path);
  if (len >= sizeof(tmp))
    die("mkdir_parents");
  memcpy(tmp, path, len + 1);
  /* create each intermediate directory by temporarily terminating at '/' */
  for (char *p = tmp + 1; *p; ++p) {
    if (*p != '/')
      continue;
    *p = '\0';
    mkdir_if_missing(tmp, mode);
    *p = '/';
  }
  mkdir_if_missing(tmp, mode);
}
/* Ensure the parent directory of `path` exists (creating ancestors as
 * needed). A path with no '/' or a direct child of "/" needs no work. */
static void ensure_parent_dir(const char *path, mode_t mode) {
  char tmp[PATH_MAX];
  char *slash;
  if (strlen(path) >= sizeof(tmp))
    die("ensure_parent_dir");
  strcpy(tmp, path);
  slash = strrchr(tmp, '/');
  if (!slash)
    return;
  if (slash == tmp) {
    mkdir_if_missing("/", mode); /* parent is the root itself */
    return;
  }
  *slash = '\0';
  mkdir_parents(tmp, mode);
}
/* True when `path` equals `base` or lies underneath it as a path prefix
 * ("/usr" contains "/usr" and "/usr/bin" but not "/usrlocal"). The root
 * "/" contains everything. */
static int path_contains(const char *base, const char *path) {
  if (base[0] == '/' && base[1] == '\0')
    return 1;
  size_t base_len = strlen(base);
  if (strncmp(path, base, base_len) != 0)
    return 0;
  char boundary = path[base_len];
  return boundary == '\0' || boundary == '/';
}
/* dst = root + suffix, dying if the result would not fit. */
static void path_append(char *dst, size_t dst_size, const char *root,
                        const char *suffix) {
  if (snprintf(dst, dst_size, "%s%s", root, suffix) >= (int)dst_size)
    die("path too long");
}
/* Alias of path_append used when building staging paths under the tmpfs
 * base, kept separate for readability at call sites. */
static void stage_path(char *dst, size_t dst_size, const char *base,
                       const char *suffix) {
  path_append(dst, dst_size, base, suffix);
}
static void normalize_absolute_path(const char *path, char *out,
size_t out_size) {
const char *segments[PATH_MAX / 2];
size_t count = 0;
char tmp[PATH_MAX];
char *save = NULL;
char *tok;
if (strlen(path) >= sizeof(tmp))
die("normalize path");
strcpy(tmp, path);
for (tok = strtok_r(tmp, "/", &save); tok; tok = strtok_r(NULL, "/", &save)) {
if (strcmp(tok, ".") == 0 || *tok == '\0')
continue;
if (strcmp(tok, "..") == 0) {
if (count > 0)
--count;
continue;
}
segments[count++] = tok;
}
if (snprintf(out, out_size, "/") >= (int)out_size)
die("normalize path");
for (size_t i = 0; i < count; ++i) {
size_t used = strlen(out);
if (snprintf(out + used, out_size - used, "%s%s", i == 0 ? "" : "/",
segments[i]) >= (int)(out_size - used))
die("normalize path");
}
}
/* Resolve ONE level of symlink for a bind-mount destination: if `path` is a
 * symlink, produce the (lexically normalized) absolute path it points to —
 * relative targets are resolved against the link's directory. Non-symlinks
 * (and unstat-able paths) are copied through unchanged. Dies on overflow. */
static void resolve_bind_target(const char *path, char *resolved,
                                size_t resolved_size) {
  struct stat st;
  char link[PATH_MAX];
  char base[PATH_MAX];
  char combined[PATH_MAX];
  ssize_t len;
  if (lstat(path, &st) < 0 || !S_ISLNK(st.st_mode)) {
    if (strlen(path) >= resolved_size)
      die("resolve bind target");
    strcpy(resolved, path);
    return;
  }
  len = readlink(path, link, sizeof(link) - 1);
  if (len < 0)
    die("readlink");
  link[len] = '\0';
  if (link[0] == '/') {
    /* absolute target: normalize and done */
    normalize_absolute_path(link, resolved, resolved_size);
    return;
  }
  /* relative target: join with the link's parent directory */
  if (strlen(path) >= sizeof(base))
    die("resolve bind target");
  strcpy(base, path);
  char *slash = strrchr(base, '/');
  if (!slash)
    die("resolve bind target");
  if (slash == base) {
    base[1] = '\0'; /* parent is "/" */
  } else {
    *slash = '\0';
  }
  if (snprintf(combined, sizeof(combined), "%s/%s", base, link) >=
      (int)sizeof(combined))
    die("resolve bind target");
  normalize_absolute_path(combined, resolved, resolved_size);
}
/* Recursively bind-mount directory src onto dst (created as needed),
 * optionally remounting read-only. Missing src is silently skipped. */
static void bind_mount_dir(const char *src, const char *dst, int readonly) {
  if (!path_exists(src))
    return;
  mkdir_parents(dst, 0755);
  if (mount(src, dst, NULL, MS_BIND | MS_REC, NULL) < 0)
    die(src);
  if (!readonly)
    return;
  /* a bind mount needs a second remount pass to become read-only */
  if (mount(NULL, dst, NULL, MS_BIND | MS_REMOUNT | MS_RDONLY | MS_REC, NULL) <
      0)
    die(dst);
}
/* Bind-mount file src onto dst, creating dst (and its parents) as an empty
 * file if needed. Missing src is silently skipped. */
static void bind_mount_file(const char *src, const char *dst) {
  if (!path_exists(src))
    return;
  ensure_parent_dir(dst, 0755);
  if (!path_exists(dst)) {
    int fd = open(dst, O_CREAT | O_WRONLY | O_TRUNC | O_CLOEXEC, 0644);
    if (fd < 0)
      die(dst);
    close(fd);
  }
  if (mount(src, dst, NULL, MS_BIND, NULL) < 0)
    die(src);
}
/* Mount a fresh procfs at root/proc with hidepid=2 (processes of other
 * users invisible) and nosuid/nodev/noexec. */
static void mount_procfs(const char *root) {
  char path[PATH_MAX];
  path_append(path, sizeof(path), root, "/proc");
  mkdir_parents(path, 0555);
  if (mount("proc", path, "proc", MS_NOSUID | MS_NODEV | MS_NOEXEC, "hidepid=2") < 0)
    die("mount proc");
}
/* Give the sandbox a private 64 MiB tmpfs as /tmp. */
static void mount_private_tmp(const char *root) {
  char path[PATH_MAX];
  path_append(path, sizeof(path), root, "/tmp");
  mkdir_parents(path, 01777);
  if (mount("tmpfs", path, "tmpfs", MS_NODEV | MS_NOSUID,
            "mode=1777,size=64m") < 0)
    die("mount tmpfs /tmp");
}
/* Build a minimal /dev at root/dev: bind a whitelist of host device nodes
 * (null/zero/full/random/urandom, plus /dev/tty) into a staging dir, add a
 * private devpts instance with a pts/ptmx symlink, then bind the staging
 * tree onto root/dev. */
static void mount_minimal_dev(const char *root) {
  static const char *const dev_files[] = {
      "/dev/null", "/dev/zero", "/dev/full", "/dev/random", "/dev/urandom",
  };
  char dev_root[PATH_MAX];
  path_append(dev_root, sizeof(dev_root), root, "/dev");
  mkdir_parents("/dev-min", 0755);
  for (size_t i = 0; i < sizeof(dev_files) / sizeof(dev_files[0]); ++i) {
    char src[PATH_MAX];
    char dst[PATH_MAX];
    path_append(src, sizeof(src), "/oldroot", dev_files[i]);
    /* dev_files[i] + 4 strips the "/dev" prefix */
    path_append(dst, sizeof(dst), "/dev-min", dev_files[i] + 4);
    bind_mount_file(src, dst);
  }
  bind_mount_file("/oldroot/dev/tty", "/dev-min/tty");
  mkdir_parents("/dev-min/pts", 0755);
  /* newinstance: PTYs created here are invisible to the host's devpts */
  if (mount("devpts", "/dev-min/pts", "devpts",
            MS_NOSUID | MS_NOEXEC, "newinstance,ptmxmode=0666,mode=0620") < 0)
    die("mount devpts");
  unlink("/dev-min/ptmx");
  if (symlink("pts/ptmx", "/dev-min/ptmx") < 0)
    die("symlink /dev/ptmx");
  mkdir_parents(dev_root, 0755);
  if (mount("/dev-min", dev_root, NULL, MS_BIND | MS_REC, NULL) < 0)
    die("mount /dev");
}
/* Bind the standard runtime directories read-only into the new root,
 * skipping any that live inside the (overlay-backed) working directory. */
static void mount_runtime_tree(const struct fs_sandbox *sandbox) {
  static const char *const runtime_dirs[] = {
      "/bin", "/sbin", "/usr", "/lib", "/lib64", "/etc",
  };
  for (size_t i = 0; i < sizeof(runtime_dirs) / sizeof(runtime_dirs[0]); ++i) {
    char src[PATH_MAX];
    char dst[PATH_MAX];
    const char *path = runtime_dirs[i];
    if (path_contains(sandbox->resolved_cwd, path))
      continue; /* already covered by the cwd overlay */
    path_append(src, sizeof(src), "/oldroot", path);
    path_append(dst, sizeof(dst), sandbox->final_root, path);
    bind_mount_dir(src, dst, 1);
  }
}
/* Replace the sandbox's /etc/resolv.conf with a fixed "nameserver 8.8.8.8"
 * file, resolving one level of symlink (e.g. systemd-resolved stubs) so
 * the bind lands on the real destination. */
static void mount_resolv_conf(const char *root) {
  char resolv_dst[PATH_MAX];
  char mount_dst[PATH_MAX];
  path_append(resolv_dst, sizeof(resolv_dst), root, "/etc/resolv.conf");
  resolve_bind_target(resolv_dst, mount_dst, sizeof(mount_dst));
  write_text_file("/resolv.conf.tmp", "nameserver 8.8.8.8\n");
  bind_mount_file("/resolv.conf.tmp", mount_dst);
}
/* Overlay the working directory: host cwd is the read-only lower layer,
 * writes go to a tmpfs upper layer, result appears at /merged. When the
 * kernel rejects the overlay (EINVAL) for cwd == "/" or "/tmp", fall back
 * to a plain bind ("/tmp" writable, "/" read-only). */
static void mount_overlay_cwd(struct fs_sandbox *sandbox) {
  char lower_src[PATH_MAX];
  char opts[4096];
  path_append(lower_src, sizeof(lower_src), "/oldroot", sandbox->resolved_cwd);
  bind_mount_dir(lower_src, "/lower", 0);
  mkdir_parents("/upper", 0700);
  mkdir_parents("/work", 0700);
  mkdir_parents("/merged", 0755);
  if (snprintf(opts, sizeof(opts),
               "lowerdir=/lower,upperdir=/upper,workdir=/work,userxattr") >=
      (int)sizeof(opts))
    die("overlay options too long");
  if (mount("overlay", "/merged", "overlay", 0, opts) < 0) {
    if (errno == EINVAL &&
        (strcmp(sandbox->resolved_cwd, "/") == 0 ||
         strcmp(sandbox->resolved_cwd, "/tmp") == 0)) {
      if (strcmp(sandbox->resolved_cwd, "/tmp") == 0) {
        mkdir_parents("/merged", 0755);
        if (mount("/lower", "/merged", NULL, MS_BIND | MS_REC, NULL) < 0)
          die("/merged");
      } else {
        bind_mount_dir("/lower", "/merged", 1);
      }
      return;
    }
    die("mount overlay");
  }
}
/* Assemble the final sandbox root. For cwd == "/" the overlay itself is
 * the root; otherwise a fresh /sandbox tree is populated with the system
 * dirs and the overlay is bound at the cwd's path inside it. Then the
 * shared pseudo-filesystems (proc, dev, resolv.conf) are layered on top. */
static void setup_final_root(struct fs_sandbox *sandbox) {
  char dst[PATH_MAX];
  if (strcmp(sandbox->resolved_cwd, "/") == 0) {
    strcpy(sandbox->final_root, "/merged");
  } else {
    strcpy(sandbox->final_root, "/sandbox");
    mkdir_parents(sandbox->final_root, 0755);
    mount_runtime_tree(sandbox);
    /* A private /tmp, unless the cwd already lives under /tmp (the overlay
     * mount below would otherwise be shadowed). */
    if (!path_contains(sandbox->resolved_cwd, "/tmp"))
      mount_private_tmp(sandbox->final_root);
    path_append(dst, sizeof(dst), sandbox->final_root, sandbox->resolved_cwd);
    mkdir_parents(dst, 0755);
    if (mount("/merged", dst, NULL, MS_BIND | MS_REC, NULL) < 0)
      die("mount cwd overlay");
  }
  /* NOTE(review): for cwd == "/" this re-checks path_contains("/", "/tmp");
   * whether the private /tmp is mounted here depends on path_contains's
   * semantics for the root path — verify against its definition. */
  if (strcmp(sandbox->resolved_cwd, "/") == 0 &&
      !path_contains(sandbox->resolved_cwd, "/tmp"))
    mount_private_tmp(sandbox->final_root);
  mount_procfs(sandbox->final_root);
  mount_minimal_dev(sandbox->final_root);
  mount_resolv_conf(sandbox->final_root);
}
/* Build the sandbox filesystem: make mounts private, create a tmpfs
 * staging area under `base`, pivot_root into it, hide the staging path,
 * and construct the overlay + final root. Only /dev/net (for the TUN
 * device) is carried over from the old root. Dies on any failure.
 * Refuses to run from "/", "/root", or "/home/*" unless the user passed
 * --unsafe-share-cwd, since the overlay would expose that whole tree. */
static void prepare_fs_sandbox(struct fs_sandbox *sandbox, const char *cwd,
                               const char *base) {
  char path[PATH_MAX];
  if (!unsafe_share_cwd && (strcmp(cwd, "/") == 0 || strncmp(cwd, "/home/", 6) == 0 || strcmp(cwd, "/root") == 0)) {
    /* Fixed: the message previously ended in a literal "\n" (escaped
     * backslash) instead of a newline. */
    fprintf(stderr, "Unsafe working directory %s. Use --unsafe-share-cwd\n", cwd);
    exit(1);
  }
  if (!realpath(cwd, sandbox->resolved_cwd))
    die("realpath cwd");
  /* Stop mount events from propagating back to the host namespace. */
  if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0)
    die("mount MS_PRIVATE");
  if (mount("tmpfs", base, "tmpfs", MS_NODEV | MS_NOSUID,
            "mode=0700,size=128m") < 0)
    die("mount tmpfs overlay base");
  /* Pre-create every directory the later mount steps rely on. */
  stage_path(path, sizeof(path), base, "/oldroot");
  mkdir_parents(path, 0755);
  stage_path(path, sizeof(path), base, "/dev/net");
  mkdir_parents(path, 0755);
  stage_path(path, sizeof(path), base, "/lower");
  mkdir_parents(path, 0755);
  stage_path(path, sizeof(path), base, "/upper");
  mkdir_parents(path, 0700);
  stage_path(path, sizeof(path), base, "/work");
  mkdir_parents(path, 0700);
  stage_path(path, sizeof(path), base, "/merged");
  mkdir_parents(path, 0755);
  stage_path(path, sizeof(path), base, "/sandbox");
  mkdir_parents(path, 0755);
  stage_path(path, sizeof(path), base, "/dev-min");
  mkdir_parents(path, 0755);
  stage_path(path, sizeof(path), base, "/resolv.conf.tmp");
  ensure_parent_dir(path, 0755);
  if (chdir(base) < 0)
    die("chdir overlay base");
  /* Swap roots: the tmpfs becomes "/", the old root appears at /oldroot. */
  if (syscall(SYS_pivot_root, ".", "oldroot") < 0)
    die("pivot_root");
  if (chdir("/") < 0)
    die("chdir /");
  mkdir_parents("/dev", 0755);
  mkdir_parents("/dev/net", 0755);
  if (path_exists("/oldroot/dev/net")) {
    if (mount("/oldroot/dev/net", "/dev/net", NULL, MS_BIND | MS_REC, NULL) < 0)
      die("bind /dev/net");
  }
  /* Mask the staging directory as seen through /oldroot so the sandboxed
   * process cannot peek into its own scaffolding. */
  path_append(path, sizeof(path), "/oldroot", base);
  if (path_exists(path) &&
      mount("tmpfs", path, "tmpfs", MS_NODEV | MS_NOSUID,
            "mode=0000,size=4k") < 0)
    die("hide overlay base");
  mount_overlay_cwd(sandbox);
  setup_final_root(sandbox);
}
/* Confine the process to the prepared root and land in the working
 * directory as it appears inside the sandbox. Dies on failure. */
static void enter_fs_sandbox(const struct fs_sandbox *sandbox) {
  if (chroot(sandbox->final_root) != 0)
    die("chroot sandbox");
  if (chdir(sandbox->resolved_cwd) != 0)
    die("chdir sandbox cwd");
}
/* Irreversibly forbid privilege escalation across execve; required before
 * installing seccomp filters or Landlock rulesets without CAP_SYS_ADMIN.
 * Returns prctl's result (0 on success, -1 on error). */
static int ensure_no_new_privs(void) {
  int rc = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
  return rc;
}
/* Landlock access mask for read + execute (file read, dir listing,
 * execution); zero when built without Landlock support. */
static uint64_t landlock_read_exec_rights(void) {
#if SP_HAVE_LANDLOCK
  uint64_t rights = LANDLOCK_ACCESS_FS_EXECUTE;
  rights |= LANDLOCK_ACCESS_FS_READ_FILE;
  rights |= LANDLOCK_ACCESS_FS_READ_DIR;
  return rights;
#else
  return 0;
#endif
}
/* Landlock access mask for write-type operations, gated on the kernel's
 * reported ABI level: REFER (rename/link across dirs) arrived in ABI 2,
 * TRUNCATE in ABI 3. Returns 0 when built without Landlock support. */
static uint64_t landlock_write_rights_for_abi(int abi) {
  uint64_t rights = 0;
#if SP_HAVE_LANDLOCK
  (void)abi;
  rights = LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_REMOVE_DIR |
           LANDLOCK_ACCESS_FS_REMOVE_FILE | LANDLOCK_ACCESS_FS_MAKE_CHAR |
           LANDLOCK_ACCESS_FS_MAKE_DIR | LANDLOCK_ACCESS_FS_MAKE_REG |
           LANDLOCK_ACCESS_FS_MAKE_SOCK | LANDLOCK_ACCESS_FS_MAKE_FIFO |
           LANDLOCK_ACCESS_FS_MAKE_BLOCK | LANDLOCK_ACCESS_FS_MAKE_SYM;
/* The #ifdef guards cover old UAPI headers; the runtime abi check covers
 * old kernels running under new headers. */
#ifdef LANDLOCK_ACCESS_FS_REFER
  if (abi >= 2)
    rights |= LANDLOCK_ACCESS_FS_REFER;
#endif
#ifdef LANDLOCK_ACCESS_FS_TRUNCATE
  if (abi >= 3)
    rights |= LANDLOCK_ACCESS_FS_TRUNCATE;
#endif
#else
  (void)abi;
#endif
  return rights;
}
/* Grant `allowed_access` beneath `path` in an open Landlock ruleset.
 * Opens the path with O_PATH (no read permission needed) and adds a
 * PATH_BENEATH rule. Returns 0 on success, -1 with errno on failure
 * (ENOSYS when built without Landlock). */
static int add_landlock_rule(int ruleset_fd, const char *path,
                             uint64_t allowed_access) {
#if SP_HAVE_LANDLOCK && defined(__NR_landlock_add_rule)
  int dirfd = open(path, O_PATH | O_CLOEXEC);
  if (dirfd < 0)
    return -1;
  struct landlock_path_beneath_attr rule = {
      .allowed_access = allowed_access,
      .parent_fd = dirfd,
  };
  /* Raw syscall: glibc has no wrapper for landlock_add_rule. */
  int rc = (int)syscall(__NR_landlock_add_rule, ruleset_fd,
                        LANDLOCK_RULE_PATH_BENEATH, &rule, 0);
  close(dirfd);
  return rc;
#else
  (void)ruleset_fd;
  (void)path;
  (void)allowed_access;
  errno = ENOSYS;
  return -1;
#endif
}
/* Restrict the filesystem with Landlock: read/exec everywhere, but write
 * access only under the working directory, /tmp, and the terminal/random
 * device nodes. Degrades gracefully (returns 0 with a warning) when the
 * kernel lacks Landlock; returns -1 on genuine failure. */
static int apply_landlock_policy(const char *cwd_path) {
#if SP_HAVE_LANDLOCK && defined(__NR_landlock_create_ruleset) && \
    defined(__NR_landlock_restrict_self)
  /* Probe the kernel's Landlock ABI version first. */
  int abi = (int)syscall(__NR_landlock_create_ruleset, NULL, 0,
                         LANDLOCK_CREATE_RULESET_VERSION);
  if (abi < 0) {
    if (errno == ENOSYS || errno == EOPNOTSUPP) {
      /* Fixed: warning previously ended in a literal "\n" (escaped
       * backslash) instead of a newline. */
      fprintf(stderr, "[sockpuppet] Warning: Landlock not supported by kernel, continuing without filesystem sandbox\n");
      return 0;
    }
    return -1;
  }
  uint64_t read_exec = landlock_read_exec_rights();
  uint64_t write_rights = landlock_write_rights_for_abi(abi);
  struct landlock_ruleset_attr ruleset = {
      .handled_access_fs = read_exec | write_rights,
  };
  int ruleset_fd = (int)syscall(__NR_landlock_create_ruleset, &ruleset,
                                sizeof(ruleset), 0);
  if (ruleset_fd < 0) {
    if (errno == ENOSYS || errno == EOPNOTSUPP) {
      /* Same newline fix as above. */
      fprintf(stderr, "[sockpuppet] Warning: Landlock not supported by kernel, continuing without filesystem sandbox\n");
      return 0;
    }
    return -1;
  }
  /* Read/exec everywhere; writes only where explicitly granted below. */
  if (add_landlock_rule(ruleset_fd, "/", read_exec) < 0 ||
      add_landlock_rule(ruleset_fd, cwd_path, read_exec | write_rights) < 0 ||
      add_landlock_rule(ruleset_fd, "/tmp", read_exec | write_rights) < 0 ||
      add_landlock_rule(ruleset_fd, "/dev", LANDLOCK_ACCESS_FS_READ_DIR) < 0 ||
      add_landlock_rule(ruleset_fd, "/dev/pts",
                        LANDLOCK_ACCESS_FS_READ_DIR |
                            LANDLOCK_ACCESS_FS_READ_FILE |
                            LANDLOCK_ACCESS_FS_WRITE_FILE) < 0 ||
      add_landlock_rule(ruleset_fd, "/dev/null",
                        LANDLOCK_ACCESS_FS_READ_FILE |
                            LANDLOCK_ACCESS_FS_WRITE_FILE) < 0 ||
      add_landlock_rule(ruleset_fd, "/dev/tty",
                        LANDLOCK_ACCESS_FS_READ_FILE |
                            LANDLOCK_ACCESS_FS_WRITE_FILE) < 0 ||
      add_landlock_rule(ruleset_fd, "/dev/ptmx",
                        LANDLOCK_ACCESS_FS_READ_FILE |
                            LANDLOCK_ACCESS_FS_WRITE_FILE) < 0 ||
      add_landlock_rule(ruleset_fd, "/dev/zero",
                        LANDLOCK_ACCESS_FS_READ_FILE) < 0 ||
      add_landlock_rule(ruleset_fd, "/dev/full",
                        LANDLOCK_ACCESS_FS_READ_FILE) < 0 ||
      add_landlock_rule(ruleset_fd, "/dev/random",
                        LANDLOCK_ACCESS_FS_READ_FILE) < 0 ||
      add_landlock_rule(ruleset_fd, "/dev/urandom",
                        LANDLOCK_ACCESS_FS_READ_FILE) < 0) {
    close(ruleset_fd);
    return -1;
  }
  /* no_new_privs is mandatory before restrict_self for unprivileged use. */
  if (ensure_no_new_privs() < 0) {
    close(ruleset_fd);
    return -1;
  }
  if (syscall(__NR_landlock_restrict_self, ruleset_fd, 0) < 0) {
    close(ruleset_fd);
    return -1;
  }
  close(ruleset_fd);
  return 0;
#else
  (void)cwd_path;
  return 0;
#endif
}
/* Older kernel headers lack KILL_PROCESS; fall back to the thread-level
 * kill action. */
#ifndef SECCOMP_RET_KILL_PROCESS
#define SECCOMP_RET_KILL_PROCESS SECCOMP_RET_KILL
#endif
/* Bit set in the syscall nr for x32-ABI calls on x86_64; used to reject
 * the entire x32 syscall space in the filter. */
#ifndef __X32_SYSCALL_BIT
#define __X32_SYSCALL_BIT 0x40000000U
#endif
/* Namespace-creating clone flags the child must never use. The CGROUP and
 * TIME fragments expand to "| FLAG" or nothing, so they can be pasted
 * directly into the parenthesized expression below. */
#define SP_CLONE_NAMESPACE_FLAGS_BASE \
  (CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS | CLONE_NEWIPC | \
   CLONE_NEWUTS | CLONE_NEWPID)
#ifdef CLONE_NEWCGROUP
#define SP_CLONE_NAMESPACE_FLAGS_CGROUP | CLONE_NEWCGROUP
#else
#define SP_CLONE_NAMESPACE_FLAGS_CGROUP
#endif
#ifdef CLONE_NEWTIME
#define SP_CLONE_NAMESPACE_FLAGS_TIME | CLONE_NEWTIME
#else
#define SP_CLONE_NAMESPACE_FLAGS_TIME
#endif
#define SP_CLONE_NAMESPACE_FLAGS \
  (SP_CLONE_NAMESPACE_FLAGS_BASE SP_CLONE_NAMESPACE_FLAGS_CGROUP \
   SP_CLONE_NAMESPACE_FLAGS_TIME)
#define SP_SECCOMP_KILL SECCOMP_RET_KILL_PROCESS
/* Two-instruction BPF fragment: if nr matches, return the kill action. */
#define SP_SECCOMP_DENY_NR(nr) \
  BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (unsigned int)(nr), 0, 1), \
      BPF_STMT(BPF_RET | BPF_K, SP_SECCOMP_KILL)
/* Like DENY, but fail the syscall with `err` instead of killing. */
#define SP_SECCOMP_ERRNO_NR(nr, err) \
  BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (unsigned int)(nr), 0, 1), \
      BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | ((err) & SECCOMP_RET_DATA))
#define SP_SECCOMP_ALLOW() BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
/* Install the child's seccomp-BPF filter (x86_64 only): kill on wrong
 * arch or x32 syscalls, kill on mount/namespace/module/tracing syscalls,
 * EPERM on io_uring and on TIOCSTI/TIOCLINUX ioctls (terminal injection),
 * kill clone() calls requesting new namespaces, allow everything else.
 * Returns prctl's result; on other architectures warns and returns 0. */
static int apply_child_seccomp(void) {
#if defined(__x86_64__)
  static const struct sock_filter filter[] = {
      /* Verify the architecture before trusting the syscall number. */
      BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
               (unsigned int)offsetof(struct seccomp_data, arch)),
      BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 1, 0),
      BPF_STMT(BPF_RET | BPF_K, SP_SECCOMP_KILL),
      BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
               (unsigned int)offsetof(struct seccomp_data, nr)),
      /* Reject the whole x32 ABI (nr >= __X32_SYSCALL_BIT). */
      BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, __X32_SYSCALL_BIT, 0, 1),
      BPF_STMT(BPF_RET | BPF_K, SP_SECCOMP_KILL),
/* Namespace and mount manipulation. */
#ifdef __NR_unshare
      SP_SECCOMP_DENY_NR(__NR_unshare),
#endif
#ifdef __NR_setns
      SP_SECCOMP_DENY_NR(__NR_setns),
#endif
#ifdef __NR_mount
      SP_SECCOMP_DENY_NR(__NR_mount),
#endif
#ifdef __NR_umount2
      SP_SECCOMP_DENY_NR(__NR_umount2),
#endif
#ifdef __NR_pivot_root
      SP_SECCOMP_DENY_NR(__NR_pivot_root),
#endif
#ifdef __NR_open_tree
      SP_SECCOMP_DENY_NR(__NR_open_tree),
#endif
#ifdef __NR_move_mount
      SP_SECCOMP_DENY_NR(__NR_move_mount),
#endif
#ifdef __NR_fsopen
      SP_SECCOMP_DENY_NR(__NR_fsopen),
#endif
#ifdef __NR_fsconfig
      SP_SECCOMP_DENY_NR(__NR_fsconfig),
#endif
#ifdef __NR_fsmount
      SP_SECCOMP_DENY_NR(__NR_fsmount),
#endif
#ifdef __NR_fspick
      SP_SECCOMP_DENY_NR(__NR_fspick),
#endif
#ifdef __NR_mount_setattr
      SP_SECCOMP_DENY_NR(__NR_mount_setattr),
#endif
/* Kernel introspection / code injection. */
#ifdef __NR_bpf
      SP_SECCOMP_DENY_NR(__NR_bpf),
#endif
#ifdef __NR_perf_event_open
      SP_SECCOMP_DENY_NR(__NR_perf_event_open),
#endif
#ifdef __NR_userfaultfd
      SP_SECCOMP_DENY_NR(__NR_userfaultfd),
#endif
#ifdef __NR_ptrace
      SP_SECCOMP_DENY_NR(__NR_ptrace),
#endif
#ifdef __NR_init_module
      SP_SECCOMP_DENY_NR(__NR_init_module),
#endif
#ifdef __NR_finit_module
      SP_SECCOMP_DENY_NR(__NR_finit_module),
#endif
#ifdef __NR_delete_module
      SP_SECCOMP_DENY_NR(__NR_delete_module),
#endif
#ifdef __NR_kexec_load
      SP_SECCOMP_DENY_NR(__NR_kexec_load),
#endif
/* io_uring bypasses the filter's per-syscall view; soft-fail with EPERM
 * so programs can fall back to regular syscalls. */
#ifdef __NR_io_uring_setup
      SP_SECCOMP_ERRNO_NR(__NR_io_uring_setup, EPERM),
#endif
#ifdef __NR_io_uring_enter
      SP_SECCOMP_ERRNO_NR(__NR_io_uring_enter, EPERM),
#endif
#ifdef __NR_io_uring_register
      SP_SECCOMP_ERRNO_NR(__NR_io_uring_register, EPERM),
#endif
#ifdef __NR_process_vm_readv
      SP_SECCOMP_DENY_NR(__NR_process_vm_readv),
#endif
#ifdef __NR_process_vm_writev
      SP_SECCOMP_DENY_NR(__NR_process_vm_writev),
#endif
#ifdef __NR_keyctl
      SP_SECCOMP_DENY_NR(__NR_keyctl),
#endif
#ifdef __NR_add_key
      SP_SECCOMP_DENY_NR(__NR_add_key),
#endif
#ifdef __NR_request_key
      SP_SECCOMP_DENY_NR(__NR_request_key),
#endif
/* ioctl: block TIOCSTI/TIOCLINUX (inject input into the controlling
 * terminal), allow every other request. */
#ifdef __NR_ioctl
      BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (unsigned int)__NR_ioctl, 0, 6),
      BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
               (unsigned int)offsetof(struct seccomp_data, args[1])),
      BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (unsigned int)TIOCSTI, 0, 1),
      BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | (EPERM & SECCOMP_RET_DATA)),
      BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (unsigned int)TIOCLINUX, 0, 1),
      BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | (EPERM & SECCOMP_RET_DATA)),
      SP_SECCOMP_ALLOW(),
#endif
/* clone: kill if any namespace-creating flag is requested. */
#ifdef __NR_clone
      BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (unsigned int)__NR_clone, 0, 4),
      BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
               (unsigned int)offsetof(struct seccomp_data, args[0])),
      BPF_STMT(BPF_ALU | BPF_AND | BPF_K, (unsigned int)SP_CLONE_NAMESPACE_FLAGS),
      BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 1, 0),
      BPF_STMT(BPF_RET | BPF_K, SP_SECCOMP_KILL),
#endif
      BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
  };
  struct sock_fprog prog = {
      .len = (unsigned short)(sizeof(filter) / sizeof(filter[0])),
      .filter = (struct sock_filter *)filter,
  };
  if (ensure_no_new_privs() < 0)
    return -1;
  return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
#else
  /* Fixed: warning previously ended in a literal "\n" (escaped backslash)
   * instead of a newline. */
  fprintf(stderr, "[sockpuppet] Warning: Seccomp not supported on this architecture, continuing without syscall filtering\n");
  return 0;
#endif
}
/* ---------- epoll helpers ---------- */
/* Register a TCP flow's socket with the global epoll instance for reads.
 * The embedded ep_watch (f->ew) is used as the epoll user-data so events
 * can be routed back to the flow. Dies on epoll_ctl failure. */
static void epoll_add_tcp(struct tcp_flow *f) {
  if (f->sock < 0 || g_epfd < 0)
    return;
  f->ew.type = FD_TCP;
  f->ew.fd = f->sock;
  f->ew.flow = f;
  struct epoll_event ev = {.events = EPOLLIN, .data.ptr = &f->ew};
  if (epoll_ctl(g_epfd, EPOLL_CTL_ADD, f->sock, &ev) < 0)
    die("epoll_ctl add tcp");
}
/* Change the event mask for an already-registered TCP flow socket.
 * Failure is logged but non-fatal (the flow may already be closing). */
static void epoll_mod_tcp(struct tcp_flow *f, uint32_t events) {
  if (f->sock < 0 || g_epfd < 0)
    return;
  struct epoll_event ev = {.events = events, .data.ptr = &f->ew};
  if (epoll_ctl(g_epfd, EPOLL_CTL_MOD, f->sock, &ev) < 0)
    perror("epoll_ctl mod tcp");
}
/* Register a UDP flow's relay socket with the global epoll instance,
 * watching for data and hangup/error conditions. Dies on failure. */
static void epoll_add_udp(struct udp_flow *f) {
  if (f->udp_relay < 0 || g_epfd < 0)
    return;
  f->ew.type = FD_UDP_RELAY;
  f->ew.fd = f->udp_relay;
  f->ew.flow = f;
  struct epoll_event ev = {.events = EPOLLIN | EPOLLRDHUP | EPOLLERR | EPOLLHUP,
                           .data.ptr = &f->ew};
  if (epoll_ctl(g_epfd, EPOLL_CTL_ADD, f->udp_relay, &ev) < 0)
    die("epoll_ctl add udp");
}
/* Change the event mask for a UDP flow's relay socket; non-fatal on error. */
static void epoll_mod_udp(struct udp_flow *f, uint32_t events) {
  if (f->udp_relay < 0 || g_epfd < 0)
    return;
  struct epoll_event ev = {.events = events, .data.ptr = &f->ew};
  if (epoll_ctl(g_epfd, EPOLL_CTL_MOD, f->udp_relay, &ev) < 0)
    perror("epoll_ctl mod udp");
}
/* Register the TCP control connection of a UDP-associate flow (the SOCKS5
 * control channel that keeps the relay alive). Uses the separate
 * ctrl_ew watch so relay and control events stay distinguishable. */
static void epoll_add_udp_ctrl(struct udp_flow *f, uint32_t events) {
  if (f->tcp_ctrl < 0 || g_epfd < 0)
    return;
  f->ctrl_ew.type = FD_UDP_CTRL;
  f->ctrl_ew.fd = f->tcp_ctrl;
  f->ctrl_ew.flow = f;
  struct epoll_event ev = {.events = events, .data.ptr = &f->ctrl_ew};
  if (epoll_ctl(g_epfd, EPOLL_CTL_ADD, f->tcp_ctrl, &ev) < 0)
    die("epoll_ctl add udp ctrl");
}
/* Change the event mask for a UDP flow's control connection; non-fatal. */
static void epoll_mod_udp_ctrl(struct udp_flow *f, uint32_t events) {
  if (f->tcp_ctrl < 0 || g_epfd < 0)
    return;
  struct epoll_event ev = {.events = events, .data.ptr = &f->ctrl_ew};
  if (epoll_ctl(g_epfd, EPOLL_CTL_MOD, f->tcp_ctrl, &ev) < 0)
    perror("epoll_ctl mod udp ctrl");
}
/* Best-effort removal of a descriptor from the event loop; errors (e.g.
 * fd never registered) are deliberately ignored. */
static void epoll_del(int fd) {
  if (fd < 0 || g_epfd < 0)
    return;
  epoll_ctl(g_epfd, EPOLL_CTL_DEL, fd, NULL);
}
/* Add O_NONBLOCK to a descriptor's file status flags.
 * Returns 0 on success, -1 on fcntl failure. */
static int set_nonblocking(int fd) {
  int current = fcntl(fd, F_GETFL, 0);
  if (current == -1)
    return -1;
  return fcntl(fd, F_SETFL, current | O_NONBLOCK);
}
/* Allocate a pseudo-terminal master without adopting it as our
 * controlling terminal; close-on-exec so it never leaks into the child. */
static int interactive_open_pty_master(void) {
  const int open_flags = O_RDWR | O_NOCTTY | O_CLOEXEC;
  return posix_openpt(open_flags);
}
/* Copy the host terminal's window size onto the sandbox PTY so full-screen
 * programs render correctly. Returns 0 on success or when there is nothing
 * to do (inactive session, or host fd is not a tty), -1 on ioctl failure. */
static int interactive_sync_winsize(struct interactive_session *session) {
  if (!session || !session->active || session->host_tty_fd < 0 ||
      session->pty_master_fd < 0)
    return 0;
  if (ioctl(session->host_tty_fd, TIOCGWINSZ, &session->host_winsize) < 0) {
    /* Not a terminal: treat as "no size to propagate", not an error. */
    if (errno == ENOTTY)
      return 0;
    return -1;
  }
  session->host_winsize_saved = 1;
  if (ioctl(session->pty_master_fd, TIOCSWINSZ, &session->host_winsize) < 0)
    return -1;
  return 0;
}
/* Restore the host terminal attributes saved by interactive_parent_setup
 * (undoes raw mode). Best-effort: the tcsetattr result is ignored since
 * this runs on teardown paths. */
static void interactive_restore_terminal(struct interactive_session *session) {
  if (!session || session->host_tty_fd < 0 || !session->host_termios_saved)
    return;
  IGNORE_RESULT(tcsetattr(session->host_tty_fd, TCSAFLUSH,
                          &session->host_termios));
}
/* Tear down an interactive session: restore the host terminal state,
 * then release every descriptor the session owns. Safe to call on NULL
 * or on a partially initialized session. */
static void interactive_close_session(struct interactive_session *session) {
  if (session == NULL)
    return;
  interactive_restore_terminal(session);
  /* Close in the same order as setup teardown: master, slave, host tty. */
  int *fd_slots[] = {&session->pty_master_fd, &session->pty_slave_fd,
                     &session->host_tty_fd};
  for (size_t i = 0; i < sizeof(fd_slots) / sizeof(fd_slots[0]); ++i) {
    if (*fd_slots[i] >= 0) {
      close(*fd_slots[i]);
      *fd_slots[i] = -1;
    }
  }
  session->active = 0;
}
/* Prepare the parent side of an interactive run: open the host tty, switch
 * it to raw mode, and allocate a PTY pair whose slave the child will adopt.
 * Returns 0 on success, -1 with errno set on failure. On failure the
 * session may hold open fds / saved termios — the caller is expected to
 * invoke interactive_close_session() to clean up. */
static int interactive_parent_setup(struct interactive_session *session) {
  struct termios raw;
  memset(session, 0, sizeof(*session));
  session->host_tty_fd = -1;
  session->pty_master_fd = -1;
  session->pty_slave_fd = -1;
  session->host_tty_fd = open("/dev/tty", O_RDWR | O_NOCTTY | O_CLOEXEC);
  if (session->host_tty_fd < 0)
    return -1;
  if (!isatty(session->host_tty_fd)) {
    errno = ENOTTY;
    return -1;
  }
  session->pty_master_fd = interactive_open_pty_master();
  if (session->pty_master_fd < 0)
    return -1;
  if (grantpt(session->pty_master_fd) < 0)
    return -1;
  if (unlockpt(session->pty_master_fd) < 0)
    return -1;
#ifdef TIOCGPTPEER
  /* TIOCGPTPEER opens the slave via the master fd, avoiding a ptsname()
   * path lookup that could race or be spoofed across namespaces. */
  session->pty_slave_fd =
      ioctl(session->pty_master_fd, TIOCGPTPEER, O_RDWR | O_NOCTTY | O_CLOEXEC);
  if (session->pty_slave_fd < 0)
    return -1;
#else
  errno = ENOTSUP;
  return -1;
#endif
  /* Save the current terminal state before switching to raw mode so it
   * can be restored on exit. */
  if (tcgetattr(session->host_tty_fd, &session->host_termios) < 0)
    return -1;
  session->host_termios_saved = 1;
  raw = session->host_termios;
  cfmakeraw(&raw);
  if (tcsetattr(session->host_tty_fd, TCSAFLUSH, &raw) < 0)
    return -1;
  session->active = 1;
  if (interactive_sync_winsize(session) < 0)
    return -1;
  return 0;
}
/* Child-side PTY adoption: start a new session, make the PTY slave the
 * controlling terminal, and wire it to stdin/stdout/stderr. Dies on any
 * failure (runs in the forked child before exec). */
static void interactive_child_setup(const struct interactive_session *session) {
  if (!session || session->pty_slave_fd < 0) {
    errno = EBADF;
    die("interactive child setup");
  }
  /* New session so TIOCSCTTY can assign a controlling terminal. */
  if (setsid() < 0)
    die("setsid");
  if (ioctl(session->pty_slave_fd, TIOCSCTTY, 0) < 0)
    die("TIOCSCTTY");
  if (dup2(session->pty_slave_fd, STDIN_FILENO) < 0)
    die("dup2 stdin");
  if (dup2(session->pty_slave_fd, STDOUT_FILENO) < 0)
    die("dup2 stdout");
  if (dup2(session->pty_slave_fd, STDERR_FILENO) < 0)
    die("dup2 stderr");
}
/* Begin a connect() on a socket switched to non-blocking mode.
 * Returns 0 on immediate success, 1 when the connect is in progress
 * (EINPROGRESS), and -1 on any other failure. */
static int start_nonblocking_connect(int fd, const struct sockaddr_in *addr) {
  if (set_nonblocking(fd) < 0)
    return -1;
  int rc = connect(fd, (const struct sockaddr *)addr, sizeof(*addr));
  if (rc == 0)
    return 0;
  return (errno == EINPROGRESS) ? 1 : -1;
}
/* After a non-blocking connect reports writability, read SO_ERROR to
 * learn the outcome. Returns 0 on success; -1 with errno set to the
 * pending socket error (or the getsockopt error) on failure. */
static int socket_connect_complete(int fd) {
  int pending = 0;
  socklen_t optlen = sizeof(pending);
  if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &pending, &optlen) < 0)
    return -1;
  if (pending == 0)
    return 0;
  errno = pending;
  return -1;
}
/* Reset a SOCKS handshake state machine to its zeroed initial state. */
static void socks_io_reset(struct socks_io *io) { memset(io, 0, sizeof(*io)); }
/* True while queued handshake bytes remain unsent (tx_off < tx_len). */
static int socks_has_pending_tx(const struct socks_io *io) {
  return io->tx_off < io->tx_len;
}
/* How many bytes of the proxy's reply must be buffered before the current
 * handshake state can be processed. Method-select and auth replies are a
 * fixed 2 bytes. A CONNECT/UDP reply is 4 header bytes plus a variable
 * address: 10 total for IPv4 (0x01), 22 for IPv6 (0x04), and for a domain
 * (0x03) 4 + 1 length byte + name + 2 port bytes. Returning a value larger
 * than the rx buffer signals an unparseable reply to the caller. */
static size_t socks_response_need(const struct socks_io *io) {
  if (io->state == SOCKS_IO_METHOD || io->state == SOCKS_IO_AUTH)
    return 2;
  if (io->state != SOCKS_IO_REQUEST)
    return 0;
  if (io->rx_len < 4)
    return 4;
  if (io->rxbuf[3] == 0x01)
    return 10;
  if (io->rxbuf[3] == 0x04)
    return 22;
  if (io->rxbuf[3] == 0x03) {
    /* Need the length byte at rxbuf[4] before the total can be known. */
    if (io->rx_len < 5)
      return 5;
    size_t total = (size_t)(4 + 1 + io->rxbuf[4] + 2);
    if (total > sizeof(io->rxbuf))
      return sizeof(io->rxbuf) + 1; /* Trigger failure check */
    return total;
  }
  return 4;
}
/* Drop `used` bytes from the front of the receive buffer, shifting any
 * remaining bytes down. Consuming more than is buffered just empties it. */
static void socks_consume_rx(struct socks_io *io, size_t used) {
  if (used >= io->rx_len) {
    io->rx_len = 0;
  } else {
    size_t remaining = io->rx_len - used;
    memmove(io->rxbuf, io->rxbuf + used, remaining);
    io->rx_len = remaining;
  }
}
/* Stage an outbound handshake message in the tx buffer, replacing any
 * previous content. Returns -1 if the payload cannot fit. */
static int socks_queue_send(struct socks_io *io, const uint8_t *data, size_t len) {
  if (len > sizeof(io->txbuf))
    return -1;
  io->tx_off = 0;
  io->tx_len = len;
  memcpy(io->txbuf, data, len);
  return 0;
}
/* Queue the SOCKS5 method-selection greeting (RFC 1928). When credentials
 * are configured, offer both "no auth" (0x00) and username/password
 * (0x02); otherwise offer "no auth" only. */
static int socks_queue_greeting(struct socks_io *io, const struct socks_config *cfg) {
  uint8_t greeting[4];
  if (cfg->username[0] != '\0') {
    greeting[0] = 0x05; /* version */
    greeting[1] = 0x02; /* number of methods offered */
    greeting[2] = 0x00; /* NO AUTHENTICATION REQUIRED */
    greeting[3] = 0x02; /* USERNAME/PASSWORD */
    return socks_queue_send(io, greeting, 4);
  }
  greeting[0] = 0x05;
  greeting[1] = 0x01;
  greeting[2] = 0x00;
  return socks_queue_send(io, greeting, 3);
}
/* Queue a username/password sub-negotiation request (RFC 1929):
 * version 0x01, then length-prefixed username and password. Each field is
 * limited to 255 bytes by the one-byte length prefix; the 513-byte buffer
 * covers the worst case (1 + 1 + 255 + 1 + 255). */
static int socks_queue_auth(struct socks_io *io, const struct socks_config *cfg) {
  size_t ulen = strlen(cfg->username);
  size_t plen = strlen(cfg->password);
  uint8_t auth[513];
  size_t off = 0;
  if (ulen > 255 || plen > 255)
    return -1;
  auth[off++] = 0x01;
  auth[off++] = (uint8_t)ulen;
  memcpy(auth + off, cfg->username, ulen);
  off += ulen;
  auth[off++] = (uint8_t)plen;
  memcpy(auth + off, cfg->password, plen);
  off += plen;
  return socks_queue_send(io, auth, off);
}
/* Queue the SOCKS5 request: CONNECT (0x01) or UDP ASSOCIATE (0x03)
 * depending on io->is_udp, always with an IPv4 address (ATYP 0x01) taken
 * from io->target_ip/io->target_port. */
static int socks_queue_request(struct socks_io *io) {
  uint8_t req[10];
  uint16_t port_be = htons(io->target_port);
  req[0] = 0x05; /* version */
  req[1] = io->is_udp ? 0x03 : 0x01; /* command */
  req[2] = 0x00; /* reserved */
  req[3] = 0x01; /* ATYP: IPv4 */
  memcpy(req + 4, &io->target_ip, 4);
  memcpy(req + 8, &port_be, 2);
  return socks_queue_send(io, req, sizeof(req));
}
/* Start a SOCKS5 handshake: queue the greeting and move the state machine
 * into METHOD (awaiting the proxy's method selection). */
static int socks_begin_handshake(struct socks_io *io,
                                 const struct socks_config *cfg) {
  if (socks_queue_greeting(io, cfg) != 0)
    return -1;
  io->state = SOCKS_IO_METHOD;
  return 0;
}
/* Push queued handshake bytes to the proxy socket. Retries on EINTR,
 * returns 0 when the buffer drained or the socket would block (pending
 * bytes remain queued), -1 on a hard send error. MSG_NOSIGNAL avoids
 * SIGPIPE if the proxy closed the connection. */
static int socks_flush_tx(int fd, struct socks_io *io) {
  while (socks_has_pending_tx(io)) {
    ssize_t sent =
        send(fd, io->txbuf + io->tx_off, io->tx_len - io->tx_off, MSG_NOSIGNAL);
    if (sent > 0) {
      io->tx_off += (size_t)sent;
      continue;
    }
    if (sent < 0 && errno == EINTR)
      continue;
    if (sent < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
      return 0;
    return -1;
  }
  io->tx_off = 0;
  io->tx_len = 0;
  return 0;
}
/* Advance the SOCKS5 handshake state machine over buffered reply bytes.
 * Drives METHOD -> (AUTH) -> REQUEST -> READY, queueing the next outbound
 * message at each step (the caller flushes the tx buffer). Returns 1 when
 * the handshake is complete, 0 when more reply bytes are needed, and -1
 * on protocol failure (state set to SOCKS_IO_FAILED). For UDP-associate
 * handshakes, *relay_addr receives the proxy-reported relay endpoint. */
static int socks_process_rx(struct socks_io *io, const struct socks_config *cfg,
                            struct sockaddr_in *relay_addr) {
  for (;;) {
    size_t need = socks_response_need(io);
    if (io->state == SOCKS_IO_READY)
      return 1;
    /* A "need" larger than the buffer marks an unparseable reply
     * (oversized domain-form address). */
    if (need > sizeof(io->rxbuf)) {
      io->state = SOCKS_IO_FAILED;
      return -1;
    }
    if (need == 0 || io->rx_len < need)
      return 0;
    if (io->state == SOCKS_IO_METHOD) {
      if (io->rxbuf[0] != 0x05) {
        io->state = SOCKS_IO_FAILED;
        return -1;
      }
      /* Method 0x00: no authentication — go straight to the request. */
      if (io->rxbuf[1] == 0x00) {
        socks_consume_rx(io, 2);
        if (socks_queue_request(io) < 0) {
          io->state = SOCKS_IO_FAILED;
          return -1;
        }
        io->state = SOCKS_IO_REQUEST;
        continue;
      }
      /* Method 0x02: username/password — only if credentials exist. */
      if (io->rxbuf[1] == 0x02 && cfg->username[0] != '\0') {
        socks_consume_rx(io, 2);
        if (socks_queue_auth(io, cfg) < 0) {
          io->state = SOCKS_IO_FAILED;
          return -1;
        }
        io->state = SOCKS_IO_AUTH;
        continue;
      }
      io->state = SOCKS_IO_FAILED;
      return -1;
    }
    if (io->state == SOCKS_IO_AUTH) {
      /* RFC 1929 reply: version 0x01, status 0x00 on success. */
      if (io->rxbuf[0] != 0x01 || io->rxbuf[1] != 0x00) {
        io->state = SOCKS_IO_FAILED;
        return -1;
      }
      socks_consume_rx(io, 2);
      if (socks_queue_request(io) < 0) {
        io->state = SOCKS_IO_FAILED;
        return -1;
      }
      io->state = SOCKS_IO_REQUEST;
      continue;
    }
    if (io->state == SOCKS_IO_REQUEST) {
      uint8_t atyp;
      size_t used = need;
      /* REP must be 0x00 (succeeded). */
      if (io->rxbuf[0] != 0x05 || io->rxbuf[1] != 0x00) {
        io->state = SOCKS_IO_FAILED;
        return -1;
      }
      atyp = io->rxbuf[3];
      if (relay_addr) {
        memset(relay_addr, 0, sizeof(*relay_addr));
        relay_addr->sin_family = AF_INET;
        if (atyp == 0x01 && need >= 10) {
          memcpy(&relay_addr->sin_addr.s_addr, io->rxbuf + 4, 4);
          relay_addr->sin_port = htons(
              (uint16_t)((io->rxbuf[8] << 8) | io->rxbuf[9]));
        } else if (atyp == 0x03 && need >= 7) {
          /* Domain-form BND.ADDR: keep the proxy's own IP, take the port
           * from the last two bytes of the reply. */
          relay_addr->sin_addr = cfg->addr.sin_addr;
          relay_addr->sin_port = htons(
              (uint16_t)((io->rxbuf[need - 2] << 8) | io->rxbuf[need - 1]));
        } else if (atyp == 0x04) {
          /* IPv6 relay endpoints are not supported. */
          io->state = SOCKS_IO_FAILED;
          return -1;
        } else {
          io->state = SOCKS_IO_FAILED;
          return -1;
        }
        /* 0.0.0.0 or 127.0.0.1 means "same host as the proxy". */
        if (relay_addr->sin_addr.s_addr == 0 ||
            relay_addr->sin_addr.s_addr == htonl(0x7f000001)) {
          relay_addr->sin_addr = cfg->addr.sin_addr;
        }
        DBG("SOCKS UDP relay ready at %s:%d", inet_ntoa(relay_addr->sin_addr),
            ntohs(relay_addr->sin_port));
      }
      socks_consume_rx(io, used);
      io->state = SOCKS_IO_READY;
      return 1;
    }
  }
}
/* Read handshake bytes from the proxy and feed them to the state machine.
 * Returns 1 when the handshake completes, 0 when the socket would block
 * (more data needed later), -1 on EOF, read error, or protocol failure. */
static int socks_recv_and_process(int fd, struct socks_io *io,
                                  const struct socks_config *cfg,
                                  struct sockaddr_in *relay_addr) {
  for (;;) {
    ssize_t r = recv(fd, io->rxbuf + io->rx_len, sizeof(io->rxbuf) - io->rx_len,
                     0);
    if (r > 0) {
      io->rx_len += (size_t)r;
      /* Nonzero means the state machine finished, one way or the other. */
      if (socks_process_rx(io, cfg, relay_addr) != 0)
        return (io->state == SOCKS_IO_READY) ? 1 : -1;
      continue;
    }
    if (r == 0)
      return -1; /* peer closed mid-handshake */
    if (errno == EINTR)
      continue;
    if (errno == EAGAIN || errno == EWOULDBLOCK)
      return 0;
    return -1;
  }
}
/* Drop all capabilities (for rootless mode). Order matters: the bounding
 * set must be emptied while CAP_SETPCAP is still held, then ambient caps
 * cleared, then the effective/permitted/inheritable sets zeroed. Each
 * stage is verified; any unexpected failure aborts the process. */
static void drop_caps(void) {
  struct __user_cap_header_struct hdr = {
      .version = _LINUX_CAPABILITY_VERSION_3,
      .pid = 0,
  };
  struct __user_cap_data_struct data[2] = {{0}};
  struct __user_cap_data_struct verify[2] = {{0}};
  /* Drop bounding set first while we still have CAP_SETPCAP.
   * EINVAL is expected for capability numbers the kernel doesn't know. */
  for (int cap = 0; cap <= CAP_LAST_CAP; cap++) {
    if (prctl(PR_CAPBSET_DROP, cap, 0, 0, 0) < 0 && errno != EINVAL) {
      perror("PR_CAPBSET_DROP");
      exit(1);
    }
  }
  /* Clear ambient capabilities - EINVAL expected if not supported */
  if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0) < 0 &&
      errno != EINVAL) {
    perror("PR_CAP_AMBIENT_CLEAR_ALL");
    exit(1);
  }
  /* Verify bounding set is empty */
  for (int cap = 0; cap <= CAP_LAST_CAP; cap++) {
    int rc = prctl(PR_CAPBSET_READ, cap, 0, 0, 0);
    if (rc > 0) {
      fprintf(stderr, "Capability %d survived drop\n", cap);
      exit(1);
    }
  }
  /* Now clear all capability sets */
  if (syscall(SYS_capset, &hdr, data) < 0) {
    perror("capset");
    exit(1);
  }
  /* Read the sets back to confirm the kernel really cleared them. */
  if (syscall(SYS_capget, &hdr, verify) < 0) {
    perror("capget verify");
    exit(1);
  }
  if (verify[0].effective || verify[0].permitted || verify[0].inheritable ||
      verify[1].effective || verify[1].permitted || verify[1].inheritable) {
    fprintf(stderr, "capabilities survived drop\n");
    exit(1);
  }
  /* Disable core dumps (prevents leaking sensitive data) */
  if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) {
    perror("PR_SET_DUMPABLE");
    exit(1);
  }
}
/* Read "username:password" from the first line of `path` into the global
 * socks_proxy credentials. Splits at the FIRST ':' so the password may
 * itself contain colons. Dies on open/read errors or a missing ':'.
 * NOTE(review): consider warning when the file is group/world-readable.
 * (The stray "Parse SOCKS URL" comment that used to sit here belonged to
 * parse_socks_url below.) */
static void parse_socks_auth_file(const char *path) {
  FILE *f = fopen(path, "r");
  if (!f) die("fopen socks auth file");
  char buf[256];
  if (!fgets(buf, sizeof(buf), f)) die("read socks auth file");
  fclose(f);
  char *newline = strchr(buf, '\n');
  if (newline) *newline = '\0';
  char *colon = strchr(buf, ':');
  if (!colon) die("invalid auth file format");
  *colon = '\0';
  snprintf(socks_proxy.username, sizeof(socks_proxy.username), "%s", buf);
  snprintf(socks_proxy.password, sizeof(socks_proxy.password), "%s", colon + 1);
  /* Scrub the on-stack copy of the credentials. A plain memset before the
   * buffer dies could be optimized away, so write through volatile. */
  volatile char *scrub = buf;
  for (size_t i = 0; i < sizeof(buf); ++i)
    scrub[i] = '\0';
}
/* Parse a SOCKS proxy URL of the form [socks5://|socks://|socks5h://]
 * host[:port] into *cfg (default port 1080). Rejects URLs embedding
 * credentials (user:pass@host) — those must come from the auth file.
 * Exits with an error message on malformed input; returns nonzero when a
 * proxy was configured. */
static int parse_socks_url(const char *url, struct socks_config *cfg) {
  const char *p = url;
  const char *colon;
  long port;
  memset(cfg, 0, sizeof(*cfg));
  cfg->port = 1080; /* default SOCKS port */
  /* Skip protocol prefix if present */
  if (strncmp(p, "socks5://", 9) == 0)
    p += 9;
  else if (strncmp(p, "socks://", 8) == 0)
    p += 8;
  else if (strncmp(p, "socks5h://", 10) == 0)
    p += 10;
  if (strchr(p, '@')) {
    /* Fixed: the message previously ended in a literal "\n" (escaped
     * backslash) instead of a newline. */
    fprintf(stderr, "Credentials in SOCKS URL are not allowed for security reasons.\n");
    exit(1);
  }
  /* Parse host:port — strrchr so a future bracketed-IPv6 host keeps its
   * internal colons out of the port split. */
  colon = strrchr(p, ':');
  if (colon) {
    size_t hlen = (size_t)(colon - p);
    if (hlen == 0 || hlen >= sizeof(cfg->host) ||
        parse_long_strict(colon + 1, 1, 65535, &port) < 0) {
      fprintf(stderr, "Invalid SOCKS proxy: %s\n", url);
      exit(1);
    }
    memcpy(cfg->host, p, hlen);
    cfg->host[hlen] = '\0';
    cfg->port = (int)port;
  } else {
    size_t hlen = strlen(p);
    if (hlen == 0 || hlen >= sizeof(cfg->host)) {
      fprintf(stderr, "Invalid SOCKS proxy: %s\n", url);
      exit(1);
    }
    memcpy(cfg->host, p, hlen);
    cfg->host[hlen] = '\0';
  }
  cfg->enabled = (cfg->host[0] != '\0');
  return cfg->enabled;
}
/* Resolve cfg->host into a cached IPv4 sockaddr (cfg->addr). Does nothing
 * when the proxy is disabled or already resolved. Returns 0 on success,
 * -1 when DNS resolution fails. */
static int resolve_socks_proxy(struct socks_config *cfg) {
  if (!cfg->enabled || cfg->addr_valid)
    return 0;
  struct sockaddr_in proxy_addr;
  memset(&proxy_addr, 0, sizeof(proxy_addr));
  proxy_addr.sin_family = AF_INET;
  proxy_addr.sin_port = htons((uint16_t)cfg->port);
  if (inet_pton(AF_INET, cfg->host, &proxy_addr.sin_addr) <= 0) {
    /* Not a dotted-quad literal: fall back to an IPv4-only DNS lookup. */
    struct addrinfo hints = {.ai_family = AF_INET, .ai_socktype = SOCK_STREAM};
    struct addrinfo *res;
    if (getaddrinfo(cfg->host, NULL, &hints, &res) != 0)
      return -1;
    proxy_addr.sin_addr = ((struct sockaddr_in *)res->ai_addr)->sin_addr;
    freeaddrinfo(res);
  }
  cfg->addr = proxy_addr;
  cfg->addr_valid = 1;
  return 0;
}
extern char **environ;
/* Close every fd above stderr before exec, so nothing from the broker
 * leaks into the sandboxed program. Strategy: close_range() when
 * available, else enumerate /proc/self/fd (collecting first, since
 * closing while iterating would invalidate the directory stream), else
 * brute-force up to _SC_OPEN_MAX. */
static void close_extra_fds_for_exec(void) {
#ifdef __NR_close_range
  if (syscall(__NR_close_range, 3U, ~0U, 0U) == 0)
    return;
  /* ENOSYS/EINVAL: old kernel — fall through to the /proc scan. */
  if (errno != ENOSYS && errno != EINVAL)
    die("close_range");
#endif
  DIR *dir = opendir("/proc/self/fd");
  if (dir != NULL) {
    int scan_fd = dirfd(dir);
    struct dirent *ent;
    int *fds = NULL;
    size_t fds_len = 0;
    size_t fds_cap = 0;
    while ((ent = readdir(dir)) != NULL) {
      char *end = NULL;
      long fd = strtol(ent->d_name, &end, 10);
      /* Skip "." / ".." and anything non-numeric. */
      if (end == NULL || *end != '\0')
        continue;
      /* Keep stdio and the directory stream's own fd. */
      if (fd <= 2 || fd == scan_fd)
        continue;
      if (fds_len == fds_cap) {
        size_t new_cap = (fds_cap == 0) ? 16 : fds_cap * 2;
        int *new_fds = realloc(fds, new_cap * sizeof(*new_fds));
        if (new_fds == NULL) {
          free(fds);
          closedir(dir);
          die("realloc close fd list");
        }
        fds = new_fds;
        fds_cap = new_cap;
      }
      fds[fds_len++] = (int)fd;
    }
    closedir(dir);
    for (size_t i = 0; i < fds_len; ++i)
      close(fds[i]);
    free(fds);
    return;
  }
  /* Last resort: /proc unavailable (e.g. before procfs is mounted). */
  long maxfd = sysconf(_SC_OPEN_MAX);
  if (maxfd < 0)
    maxfd = 256;
  for (int fd = 3; fd < maxfd; ++fd)
    close(fd);
}
/* True when an environ entry ("NAME=value", or bare "NAME") names exactly
 * `name`. */
static int env_name_matches(const char *entry, const char *name) {
  const char *eq = strchr(entry, '=');
  size_t entry_name_len = (eq != NULL) ? (size_t)(eq - entry) : strlen(entry);
  if (strlen(name) != entry_name_len)
    return 0;
  return strncmp(entry, name, entry_name_len) == 0;
}
/* Allow-list filter for environment entries forwarded into the sandbox:
 * a fixed set of common variables plus any LC_* locale assignment. */
static int should_keep_env(const char *entry) {
  static const char *const keep[] = {
      "PATH", "HOME", "USER", "LOGNAME", "SHELL",
      "TERM", "LANG", "TZ", NULL,
  };
  if (strncmp(entry, "LC_", 3) == 0 && strchr(entry, '=') != NULL)
    return 1;
  for (const char *const *name = keep; *name != NULL; ++name) {
    if (env_name_matches(entry, *name))
      return 1;
  }
  return 0;
}
/* Heap-allocate a "name=value" environment entry; the caller owns the
 * returned string. Dies on allocation failure. */
static char *make_env_entry(const char *name, const char *value) {
  size_t name_len = strlen(name);
  size_t value_len = strlen(value);
  size_t total = name_len + 1 + value_len + 1;
  char *entry = malloc(total);
  if (entry == NULL)
    die("malloc env");
  snprintf(entry, total, "%s=%s", name, value);
  return entry;
}
/* Build a sanitized environment for the sandboxed program: keep only the
 * allow-listed variables, then force PWD, TMPDIR=/tmp, and HOME=/tmp/home
 * (creating /tmp/home). A default PATH is supplied when the host had none.
 *
 * Bug fix: HOME was previously copied from the host environment (it is in
 * the allow-list) AND appended as HOME=/tmp/home. getenv() returns the
 * FIRST match in envp, so the host's real HOME won and the sandbox
 * override never took effect — and it leaked the host home path. HOME is
 * now skipped during the copy, exactly like PWD and TMPDIR. */
static char **build_sanitized_envp(const char *cwd) {
  static const char default_path[] =
      "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin";
  size_t keep_count = 0;
  int have_path = 0;
  for (char **env = environ; env != NULL && *env != NULL; ++env) {
    if (!should_keep_env(*env))
      continue;
    /* PWD, TMPDIR, and HOME are always overridden below. */
    if (env_name_matches(*env, "PWD") || env_name_matches(*env, "TMPDIR") ||
        env_name_matches(*env, "HOME"))
      continue;
    if (env_name_matches(*env, "PATH"))
      have_path = 1;
    ++keep_count;
  }
  /* keep_count kept entries + optional PATH + PWD + TMPDIR + HOME + NULL. */
  char **envp = calloc(keep_count + (have_path ? 0U : 1U) + 4U, sizeof(*envp));
  size_t idx = 0;
  if (envp == NULL)
    die("calloc envp");
  for (char **env = environ; env != NULL && *env != NULL; ++env) {
    if (!should_keep_env(*env))
      continue;
    if (env_name_matches(*env, "PWD") || env_name_matches(*env, "TMPDIR") ||
        env_name_matches(*env, "HOME"))
      continue;
    envp[idx] = strdup(*env);
    if (envp[idx] == NULL)
      die("strdup env");
    ++idx;
  }
  if (!have_path) {
    envp[idx++] = make_env_entry("PATH", default_path);
  }
  envp[idx++] = make_env_entry("PWD", cwd);
  envp[idx++] = make_env_entry("TMPDIR", "/tmp");
  envp[idx++] = make_env_entry("HOME", "/tmp/home");
  envp[idx] = NULL;
  /* Best-effort: EEXIST is fine, and exec proceeds even without the dir. */
  mkdir("/tmp/home", 0700);
  return envp;
}
/* Copy an interface name into a fixed IFNAMSIZ buffer, dying with
 * ENAMETOOLONG if it does not fit. */
static void copy_ifname(char dst[IFNAMSIZ], const char *src) {
  int written = snprintf(dst, IFNAMSIZ, "%s", src);
  if (written < 0 || written >= IFNAMSIZ) {
    errno = ENAMETOOLONG;
    die("ifname too long");
  }
}
/* ---------- FD passing ---------- */
/* Pass a file descriptor over a Unix socket via SCM_RIGHTS ancillary
 * data. A single dummy payload byte is required for the message to be
 * delivered. Dies on sendmsg failure. */
static void send_fd(int sock, int fd) {
  struct msghdr msg;
  memset(&msg, 0, sizeof(msg));
  char byte = 'X';
  struct iovec iov = {&byte, 1};
  msg.msg_iov = &iov;
  msg.msg_iovlen = 1;
  char cbuf[CMSG_SPACE(sizeof(int))];
  msg.msg_control = cbuf;
  msg.msg_controllen = sizeof(cbuf);
  struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
  cmsg->cmsg_level = SOL_SOCKET;
  cmsg->cmsg_type = SCM_RIGHTS;
  cmsg->cmsg_len = CMSG_LEN(sizeof(int));
  memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
  if (sendmsg(sock, &msg, 0) < 0)
    die("sendmsg");
}
/* Receive a file descriptor passed via SCM_RIGHTS (counterpart of
 * send_fd). The received fd is marked close-on-exec. Dies on error,
 * truncated messages, or malformed ancillary data. */
static int recv_fd(int sock) {
  struct msghdr msg;
  memset(&msg, 0, sizeof(msg));
  char byte;
  struct iovec iov = {&byte, 1};
  msg.msg_iov = &iov;
  msg.msg_iovlen = 1;
  char cbuf[CMSG_SPACE(sizeof(int))];
  msg.msg_control = cbuf;
  msg.msg_controllen = sizeof(cbuf);
  ssize_t n = recvmsg(sock, &msg, MSG_CMSG_CLOEXEC);
  if (n < 0)
    die("recvmsg");
  /* CTRUNC would mean the fd's ancillary data was dropped. */
  if (n == 0 || (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) != 0)
    die("recv_fd truncated");
  struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
  int fd;
  if (cmsg == NULL || cmsg->cmsg_level != SOL_SOCKET ||
      cmsg->cmsg_type != SCM_RIGHTS ||
      cmsg->cmsg_len < CMSG_LEN(sizeof(int)))
    die("recv_fd invalid ancillary data");
  memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
  return fd;
}
/* ---------- TUN and interface helpers for child process ---------- */
/* Create (or attach to) a layer-3 TUN interface named `name` and return
 * its descriptor. IFF_NO_PI disables the 4-byte packet-info header so
 * reads/writes carry raw IP packets. Dies on failure. */
static int tun_create(const char *name) {
  int fd = open("/dev/net/tun", O_RDWR | O_CLOEXEC);
  if (fd < 0)
    die("open /dev/net/tun");
  struct ifreq ifr;
  memset(&ifr, 0, sizeof(ifr));
  ifr.ifr_flags = IFF_TUN | IFF_NO_PI;
  copy_ifname(ifr.ifr_name, name);
  if (ioctl(fd, TUNSETIFF, &ifr) < 0)
    die("TUNSETIFF");
  return fd;
}
static void if_up(const char *ifname) {
int s = socket(AF_INET, SOCK_DGRAM, 0);
if (s < 0)
die("socket");
struct ifreq ifr;
memset(&ifr, 0, sizeof(ifr));
copy_ifname(ifr.ifr_name, ifname);
if (ioctl(s, SIOCGIFFLAGS, &ifr) < 0)
die("SIOCGIFFLAGS");
ifr.ifr_flags |= IFF_UP | IFF_RUNNING;
if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0)
die("SIOCSIFFLAGS");
close(s);
}
/* Configure a point-to-point address pair on an interface: `local` as the
 * interface address (SIOCSIFADDR) and `peer` as the destination address
 * (SIOCSIFDSTADDR), as used for the TUN gateway link. Dies on failure. */
static void if_addr_ptp(const char *ifname, const char *local,
                        const char *peer) {
  int s = socket(AF_INET, SOCK_DGRAM, 0);
  struct ifreq ifr = {0};
  struct sockaddr_in addr = {.sin_family = AF_INET};
  if (s < 0)
    die("socket");
  copy_ifname(ifr.ifr_name, ifname);
  if (inet_pton(AF_INET, local, &addr.sin_addr) != 1)
    die("inet_pton local");
  /* ifr_addr is a generic struct sockaddr; copy the sockaddr_in into it. */
  memcpy(&ifr.ifr_addr, &addr, sizeof(addr));
  if (ioctl(s, SIOCSIFADDR, &ifr) < 0)
    die("SIOCSIFADDR");
  if (inet_pton(AF_INET, peer, &addr.sin_addr) != 1)
    die("inet_pton peer");
  memcpy(&ifr.ifr_dstaddr, &addr, sizeof(addr));
  if (ioctl(s, SIOCSIFDSTADDR, &ifr) < 0)
    die("SIOCSIFDSTADDR");
  close(s);
}
/* Resolve an interface name to its kernel ifindex using SIOCGIFINDEX. */
static int if_index(const char *ifname) {
  struct ifreq ifr = {0};
  int sock = socket(AF_INET, SOCK_DGRAM, 0);
  if (sock < 0)
    die("socket");
  copy_ifname(ifr.ifr_name, ifname);
  if (ioctl(sock, SIOCGIFINDEX, &ifr) < 0)
    die("SIOCGIFINDEX");
  close(sock);
  return ifr.ifr_ifindex;
}
/* Install the IPv4 default route (0.0.0.0/0) via gateway `gw` out of
 * interface `ifname`, using a hand-built rtnetlink RTM_NEWROUTE request.
 * Exits via die() on any failure. */
static void add_default_route(const char *ifname, const char *gw) {
  int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
  /* Message layout: netlink header + rtmsg + room for the two attributes. */
  struct {
    struct nlmsghdr nlh;
    struct rtmsg rtm;
    char buf[256];
  } req = {0};
  if (fd < 0)
    die("socket");
  req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
  req.nlh.nlmsg_type = RTM_NEWROUTE;
  req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE;
  req.rtm.rtm_family = AF_INET;
  req.rtm.rtm_table = RT_TABLE_MAIN;
  req.rtm.rtm_protocol = RTPROT_BOOT;
  req.rtm.rtm_scope = RT_SCOPE_UNIVERSE;
  req.rtm.rtm_type = RTN_UNICAST;
  /* rtm_dst_len stays 0 (from = {0}) => destination 0.0.0.0/0, i.e. the
   * default route. */
  struct rtattr *rta;
  rta = (void *)req.buf;
  /* First attribute: RTA_GATEWAY holds the next-hop IPv4 address. */
  rta->rta_type = RTA_GATEWAY;
  rta->rta_len = RTA_LENGTH(4);
  if (inet_pton(AF_INET, gw, RTA_DATA(rta)) != 1)
    die("inet_pton gateway");
  req.nlh.nlmsg_len += rta->rta_len;
  /* RTA_LENGTH(4) == 8 is already 4-byte aligned, so the second attribute
   * can be placed directly after the first without RTA_ALIGN padding. */
  rta = (void *)((char *)rta + rta->rta_len);
  /* Second attribute: RTA_OIF names the output interface by index. */
  rta->rta_type = RTA_OIF;
  rta->rta_len = RTA_LENGTH(4);
  *(int *)RTA_DATA(rta) = if_index(ifname);
  req.nlh.nlmsg_len += rta->rta_len;
  /* Fire-and-forget: NLM_F_ACK is not requested, so no reply is read and a
   * kernel-side rejection would go unnoticed. */
  if (send(fd, &req, req.nlh.nlmsg_len, 0) != (ssize_t)req.nlh.nlmsg_len)
    die("send netlink route");
  close(fd);
}
/* Bring `ifname` up via an rtnetlink RTM_NEWLINK message (the netlink
 * counterpart of if_up()).  ifi_change masks which flag bits apply. */
static void if_up_netlink(const char *ifname) {
  struct {
    struct nlmsghdr nlh;
    struct ifinfomsg ifi;
  } req = {0};
  int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
  if (fd < 0)
    die("socket");
  req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
  req.nlh.nlmsg_type = RTM_NEWLINK;
  req.nlh.nlmsg_flags = NLM_F_REQUEST;
  req.ifi.ifi_family = AF_UNSPEC;
  req.ifi.ifi_index = if_index(ifname);
  req.ifi.ifi_flags = IFF_UP | IFF_RUNNING;
  req.ifi.ifi_change = IFF_UP | IFF_RUNNING;
  if (send(fd, &req, req.nlh.nlmsg_len, 0) < 0)
    die("send netlink");
  close(fd);
}
/* ----------- TCP/IP helpers ------------------ */
static uint32_t csum16_partial(const void *buf, size_t len);
/* Fold a 32-bit ones'-complement accumulator into a final 16-bit Internet
 * checksum.  Two folds are always sufficient for any uint32_t input. */
static uint16_t csum16_fold(uint32_t sum) {
  sum = (sum & 0xffffU) + (sum >> 16);
  sum = (sum & 0xffffU) + (sum >> 16);
  return (uint16_t)~sum;
}
/* One-shot Internet checksum over buf[0..len): partial sum, then fold. */
static uint16_t csum16(const void *buf, size_t len) {
  uint32_t partial = csum16_partial(buf, len);
  return csum16_fold(partial);
}
/* ---------- Persistent UDP Flow Management ---------- */
static void udp_close_flow(struct udp_flow *f);
/* Look up an in-use UDP flow by its full 4-tuple (client ip:port ->
 * server ip:port).  Returns NULL when no live flow matches. */
static struct udp_flow *udp_find(uint32_t cip, uint16_t cport, uint32_t sip,
                                 uint16_t sport) {
  for (int i = 0; i < MAX_UDP; i++) {
    struct udp_flow *f = &udp_flows[i];
    /* A slot is live if any of its descriptors is open. */
    int in_use = f->udp_relay >= 0 || f->udp_staging >= 0 || f->tcp_ctrl >= 0;
    if (!in_use)
      continue;
    if (f->cli_ip != cip || f->cli_port != cport || f->srv_ip != sip ||
        f->srv_port != sport)
      continue;
    return f;
  }
  return NULL;
}
/* Return a UDP flow slot: the first unused one, or — when the table is
 * full — the least-recently-used slot after tearing down its descriptors.
 * The returned slot is always zeroed when reclaimed. */
static struct udp_flow *udp_alloc(void) {
  struct udp_flow *victim = NULL;
  for (int i = 0; i < MAX_UDP; i++) {
    struct udp_flow *f = &udp_flows[i];
    if (f->udp_relay < 0 && f->udp_staging < 0 && f->tcp_ctrl < 0)
      return f; /* free slot */
    if (victim == NULL || f->last_used < victim->last_used)
      victim = f;
  }
  /* All slots busy: evict the stalest flow. */
  if (victim->tcp_ctrl >= 0) {
    epoll_del(victim->tcp_ctrl);
    close(victim->tcp_ctrl);
  }
  if (victim->udp_relay >= 0) {
    epoll_del(victim->udp_relay);
    close(victim->udp_relay);
  }
  if (victim->udp_staging >= 0)
    close(victim->udp_staging);
  memset(victim, 0, sizeof(*victim));
  return victim;
}
/* Compute the epoll interest set for a flow's SOCKS TCP control socket:
 * EPOLLOUT is added only while a connect or handshake write is pending. */
static uint32_t udp_ctrl_events(const struct udp_flow *f) {
  const uint32_t base = EPOLLIN | EPOLLRDHUP | EPOLLERR | EPOLLHUP;
  int wants_out = f->socks.connect_pending || socks_has_pending_tx(&f->socks);
  return wants_out ? (base | EPOLLOUT) : base;
}
/* Refresh the epoll interest set for a flow's relay socket; watch for
 * writability only while a datagram is parked in the pending buffer. */
static void udp_update_events(struct udp_flow *f) {
  uint32_t events = EPOLLIN | EPOLLRDHUP | EPOLLERR | EPOLLHUP;
  int relay_blocked = f->udp_relay >= 0 && f->pending_set;
  if (relay_blocked)
    events |= EPOLLOUT;
  epoll_mod_udp(f, events);
}
/* Park one datagram for later transmission (single-slot queue).
 * Returns 0 when queued, 1 when the slot was occupied (datagram dropped,
 * drop counter bumped), and -1 when the datagram is too large to store. */
static int udp_queue_pending(struct udp_flow *f, const uint8_t *data, size_t len) {
  if (len > sizeof(f->pending_data))
    return -1;
  if (f->pending_set) {
    f->dropped_backpressure++;
    DBG("UDP pending queue full; dropping newest datagram (drops=%lu)",
        f->dropped_backpressure);
    return 1;
  }
  f->pending_len = len;
  f->pending_set = 1;
  memcpy(f->pending_data, data, len);
  f->last_used = time(NULL);
  /* Arm EPOLLOUT so the flush happens when the socket drains. */
  udp_update_events(f);
  return 0;
}
/* Prepare a UDP-over-SOCKS5 flow for `f`: open the TCP control connection
 * to the proxy and a local UDP socket bound to an ephemeral port.  The UDP
 * socket is parked in f->udp_staging until the handshake reply provides the
 * proxy's relay endpoint.  Returns 0 on success, -1 on failure (all fds are
 * closed and the flow's fd fields left/returned to -1). */
static int udp_socks_setup(struct udp_flow *f, struct socks_config *cfg) {
  int tcp_sock = socket(AF_INET, SOCK_STREAM, 0);
  int udp_sock = -1;
  /* Bind to INADDR_ANY:0 so the kernel picks an ephemeral port. */
  struct sockaddr_in local = {.sin_family = AF_INET,
                              .sin_addr.s_addr = htonl(INADDR_ANY),
                              .sin_port = 0};
  socklen_t locallen = sizeof(local);
  int rc;
  if (tcp_sock < 0)
    return -1;
  udp_sock = socket(AF_INET, SOCK_DGRAM, 0);
  if (udp_sock < 0) {
    close(tcp_sock);
    return -1;
  }
  if (bind(udp_sock, (struct sockaddr *)&local, sizeof(local)) < 0) {
    close(udp_sock);
    close(tcp_sock);
    return -1;
  }
  /* Learn which port the kernel chose; it is advertised to the proxy. */
  if (getsockname(udp_sock, (struct sockaddr *)&local, &locallen) < 0) {
    close(udp_sock);
    close(tcp_sock);
    return -1;
  }
  if (set_nonblocking(udp_sock) < 0) {
    close(udp_sock);
    close(tcp_sock);
    return -1;
  }
  socks_io_reset(&f->socks);
  f->socks.active = 1;
  f->socks.is_udp = 1;
  /* target_ip 0 with our local UDP port — presumably the UDP ASSOCIATE
   * request fields; confirm against socks_begin_handshake(). */
  f->socks.target_ip = 0;
  f->socks.target_port = ntohs(local.sin_port);
  rc = start_nonblocking_connect(tcp_sock, &cfg->addr);
  if (rc < 0) {
    close(udp_sock);
    close(tcp_sock);
    return -1;
  }
  f->tcp_ctrl = tcp_sock;
  f->udp_relay = -1; /* becomes valid once the relay address is learned */
  f->udp_staging = udp_sock;
  /* rc > 0 means the TCP connect is still in flight (EINPROGRESS). */
  f->socks.connect_pending = (rc > 0);
  f->socks.state = f->socks.connect_pending ? SOCKS_IO_CONNECTING
                                            : SOCKS_IO_METHOD;
  if (!f->socks.connect_pending && socks_begin_handshake(&f->socks, cfg) < 0) {
    close(udp_sock);
    close(tcp_sock);
    f->tcp_ctrl = -1;
    f->udp_staging = -1;
    return -1;
  }
  epoll_add_udp_ctrl(f, udp_ctrl_events(f));
  return 0;
}
/* Set up a direct (non-proxied) UDP flow: a connected, non-blocking UDP
 * socket aimed at dst_ip:dst_port (dst_ip in network byte order, dst_port
 * in host order).  Returns 0 on success, -1 on failure. */
static int udp_direct_setup(struct udp_flow *f, uint32_t dst_ip,
                            uint16_t dst_port) {
  struct sockaddr_in dst;
  memset(&dst, 0, sizeof(dst));
  dst.sin_family = AF_INET;
  dst.sin_port = htons(dst_port);
  dst.sin_addr.s_addr = dst_ip;
  int udp_sock = socket(AF_INET, SOCK_DGRAM, 0);
  if (udp_sock < 0)
    return -1;
  if (connect(udp_sock, (struct sockaddr *)&dst, sizeof(dst)) < 0 ||
      set_nonblocking(udp_sock) < 0) {
    close(udp_sock);
    return -1;
  }
  f->tcp_ctrl = -1;
  f->udp_relay = udp_sock;
  f->relay_addr = dst;
  epoll_add_udp(f);
  udp_update_events(f);
  return 0;
}
/* Forward one client datagram to the SOCKS5 UDP relay, prepending the
 * RFC 1928 UDP request header (RSV, FRAG, ATYP=IPv4, DST.ADDR, DST.PORT).
 * Returns 0 on success, when the datagram was queued for EPOLLOUT, or on a
 * counted soft drop (ENOBUFS/ENOMEM); returns -1 on fatal error (caller
 * tears down the flow). */
static int udp_try_send_socks(struct udp_flow *f, const uint8_t *payload,
                              size_t payload_len) {
  uint8_t pkt[65536];
  struct in_addr dst_addr = {.s_addr = f->srv_ip};
  char relay_buf[INET_ADDRSTRLEN];
  char dst_buf[INET_ADDRSTRLEN];
  if (payload_len > sizeof(pkt) - 10)
    return -1;
  pkt[0] = 0;
  pkt[1] = 0;
  pkt[2] = 0; /* RSV, FRAG */
  pkt[3] = 0x01; /* ATYP = IPv4 */
  memcpy(&pkt[4], &f->srv_ip, 4);
  pkt[8] = (uint8_t)(f->srv_port >> 8);
  pkt[9] = (uint8_t)(f->srv_port & 0xff);
  memcpy(&pkt[10], payload, payload_len);
  /* Bug fix: the previous code passed two inet_ntoa() results to a single
   * DBG() call; inet_ntoa() returns a static buffer, so both addresses
   * printed identically.  Use inet_ntop() into separate buffers instead. */
  if (inet_ntop(AF_INET, &f->relay_addr.sin_addr, relay_buf,
                sizeof(relay_buf)) == NULL)
    snprintf(relay_buf, sizeof(relay_buf), "?");
  if (inet_ntop(AF_INET, &dst_addr, dst_buf, sizeof(dst_buf)) == NULL)
    snprintf(dst_buf, sizeof(dst_buf), "?");
  DBG("UDP via SOCKS relay %s:%d -> %s:%d len=%zu", relay_buf,
      ntohs(f->relay_addr.sin_port), dst_buf, f->srv_port, payload_len);
  for (;;) {
    ssize_t n = sendto(f->udp_relay, pkt, 10 + payload_len, 0,
                       (struct sockaddr *)&f->relay_addr,
                       sizeof(f->relay_addr));
    if (n == (ssize_t)(10 + payload_len)) {
      f->last_used = time(NULL);
      return 0;
    }
    if (n >= 0) {
      /* Partial datagram send should be impossible; treat as fatal. */
      DBG("SOCKS UDP short send: %zd/%zu, closing flow", n, 10 + payload_len);
      return -1;
    }
    if (errno == EINTR)
      continue;
    if (errno == EAGAIN || errno == EWOULDBLOCK) {
      /* Socket buffer full: park the datagram until EPOLLOUT fires. */
      int qrc = udp_queue_pending(f, payload, payload_len);
      if (qrc < 0)
        return -1;
      return 0;
    }
    if (errno == ENOBUFS || errno == ENOMEM) {
      /* Transient kernel memory pressure: drop, count, keep the flow. */
      f->dropped_backpressure++;
      DBG("SOCKS UDP send drop due to %s (drops=%lu)", strerror(errno),
          f->dropped_backpressure);
      return 0;
    }
    return -1;
  }
}
/* Transmit one datagram on the flow's connected direct-UDP socket.
 * Returns 0 on success, queued (EAGAIN), or counted soft drop; -1 fatal. */
static int udp_try_send_direct(struct udp_flow *f, const uint8_t *payload,
                               size_t payload_len) {
  for (;;) {
    ssize_t n = send(f->udp_relay, payload, payload_len, MSG_NOSIGNAL);
    if (n >= 0) {
      if (n == (ssize_t)payload_len) {
        f->last_used = time(NULL);
        return 0;
      }
      /* A short datagram send should never happen; give up on the flow. */
      DBG("UDP short send: %zd/%zu, closing flow", n, payload_len);
      return -1;
    }
    switch (errno) {
    case EINTR:
      continue;
    case EAGAIN:
#if EWOULDBLOCK != EAGAIN
    case EWOULDBLOCK:
#endif
      /* Socket buffer full: park the datagram until EPOLLOUT. */
      return udp_queue_pending(f, payload, payload_len) < 0 ? -1 : 0;
    case ENOBUFS:
    case ENOMEM:
      /* Kernel memory pressure: count the drop but keep the flow alive. */
      f->dropped_backpressure++;
      DBG("UDP send drop due to %s (drops=%lu)", strerror(errno),
          f->dropped_backpressure);
      return 0;
    default:
      return -1;
    }
  }
}
/* Promote the staged UDP socket to the active relay socket once the SOCKS
 * handshake has provided the proxy's relay address.  Returns -1 if no
 * staged socket exists. */
static int udp_open_relay_socket(struct udp_flow *f,
                                 const struct sockaddr_in *relay_addr) {
  int staged = f->udp_staging;
  if (staged < 0)
    return -1;
  f->udp_staging = -1;
  f->udp_relay = staged;
  f->relay_addr = *relay_addr;
  epoll_add_udp(f);
  udp_update_events(f);
  return 0;
}
/* Retry transmission of the single parked datagram, if any, now that the
 * relay socket exists / became writable.  Returns -1 on fatal send error. */
static int udp_flush_pending(struct udp_flow *f) {
  uint8_t payload[65535];
  if (!f->pending_set || f->udp_relay < 0)
    return 0;
  size_t len = f->pending_len;
  memcpy(payload, f->pending_data, len);
  /* Clear the slot before sending: the send path may re-queue on EAGAIN. */
  f->pending_len = 0;
  f->pending_set = 0;
  DBG("Flushing pending UDP datagram len=%zu", len);
  int rc = (f->tcp_ctrl >= 0) ? udp_try_send_socks(f, payload, len)
                              : udp_try_send_direct(f, payload, len);
  if (rc < 0)
    return -1;
  udp_update_events(f);
  return 0;
}
/* Inject UDP packet into TUN (response from server to client) */
/* Inject a server->client UDP datagram into the TUN device by wrapping
 * `data` in freshly built IPv4 and UDP headers addressed from the flow's
 * server tuple to its client tuple.  Oversized payloads are dropped. */
static void udp_inject_tun(int tunfd, struct udp_flow *f, const uint8_t *data,
                           size_t len) {
  uint8_t out[65536];
  if (len > sizeof(out) - sizeof(struct iphdr) - sizeof(struct udphdr))
    return;
  struct iphdr *ip = (struct iphdr *)out;
  struct udphdr *udp = (struct udphdr *)(out + sizeof(*ip));
  memset(ip, 0, sizeof(*ip));
  ip->version = 4;
  ip->ihl = 5;
  ip->ttl = 64;
  ip->protocol = IPPROTO_UDP;
  ip->saddr = f->srv_ip;
  ip->daddr = f->cli_ip;
  ip->tot_len = htons((uint16_t)(sizeof(*ip) + sizeof(*udp) + len));
  ip->check = csum16(ip, sizeof(*ip));
  udp->source = htons(f->srv_port);
  udp->dest = htons(f->cli_port);
  udp->len = htons((uint16_t)(sizeof(*udp) + len));
  /* 0 means "no checksum", which is legal for UDP over IPv4 (RFC 768). */
  udp->check = 0;
  memcpy(out + sizeof(*ip) + sizeof(*udp), data, len);
  IGNORE_RESULT(tun_write_packet(tunfd, out, sizeof(*ip) + sizeof(*udp) + len,
                                 "UDP inject"));
}
/* Process one IPv4/UDP packet read from the TUN device: validate the IP and
 * UDP headers, apply the 10.0.1.x gateway policy, and forward the payload
 * through either a persistent SOCKS5 UDP flow or a direct connected UDP
 * socket.  Malformed or disallowed packets are dropped silently (or with a
 * DBG note). */
static void handle_udp(int tunfd, uint8_t *pkt, ssize_t len) {
  (void)tunfd;
  if (len <= 0)
    return;
  size_t ulen = (size_t)len;
  struct iphdr *ip = (struct iphdr *)pkt;
  size_t iphl = ip->ihl * 4;
  if (ip->version != 4)
    return;
  /* Header length must be sane (20..60 bytes) and inside the buffer. */
  if (iphl < sizeof(struct iphdr) || iphl > 60 || iphl > ulen)
    return;
  size_t ip_total_len = (size_t)ntohs(ip->tot_len);
  if (ip_total_len < iphl || ip_total_len > ulen)
    return;
  /* Reject IP fragments */
  if (ntohs(ip->frag_off) & (IP_MF | IP_OFFMASK)) {
    DBG("UDP: dropping IP fragment (frag_off=0x%04x)", ntohs(ip->frag_off));
    return;
  }
  if (ulen < iphl + sizeof(struct udphdr))
    return;
  struct udphdr *udp = (struct udphdr *)(pkt + iphl);
  size_t udp_len = (size_t)ntohs(udp->len);
  /* The UDP length field must cover its header and fit the IP payload. */
  if (udp_len < sizeof(struct udphdr) || udp_len > ip_total_len - iphl)
    return;
  uint16_t dport = ntohs(udp->dest);
  uint16_t sport = ntohs(udp->source);
  uint8_t *payload = pkt + iphl + sizeof(struct udphdr);
  size_t plen = udp_len - sizeof(struct udphdr);
  /* Check for host gateway access (10.0.1.x -> 127.0.0.x) */
  uint32_t target_ip = ip->daddr;
  int is_gateway = is_gateway_ip(target_ip);
  if (is_gateway) {
    /* Gateway destinations must be explicitly allow-listed per port/proto. */
    if (!is_gateway_allowed(target_ip, dport, IPPROTO_UDP)) {
      DBG("[parent] UDP to 10.0.1.%d:%d blocked", gateway_last_octet(target_ip),
          dport);
      return;
    }
    target_ip = gateway_to_localhost(ip->daddr);
    DBG("[parent] UDP gateway: 10.0.1.%d:%d -> 127.0.0.%d:%d",
        gateway_last_octet(ip->daddr), dport, gateway_last_octet(ip->daddr),
        dport);
  }
  if (socks_proxy.enabled && !is_gateway) {
    /* Find or create persistent UDP flow */
    struct udp_flow *f = udp_find(ip->saddr, sport, ip->daddr, dport);
    if (!f) {
      f = udp_alloc();
      memset(f, 0, sizeof(*f));
      f->cli_ip = ip->saddr;
      f->cli_port = sport;
      f->srv_ip = ip->daddr;
      f->srv_port = dport;
      f->tcp_ctrl = -1;
      f->udp_relay = -1;
      f->udp_staging = -1;
      f->pending_set = 0;
      f->pending_len = 0;
      socks_io_reset(&f->socks);
      if (udp_socks_setup(f, &socks_proxy) < 0) {
        f->udp_relay = -1;
        return;
      }
    }
    /* Update source port for response routing (may differ on reused flow) */
    f->cli_port = sport;
    if (f->udp_relay >= 0) {
      /* Relay endpoint already known: send straight through the proxy. */
      if (udp_try_send_socks(f, payload, plen) < 0) {
        udp_close_flow(f);
        return;
      }
      udp_update_events(f);
    } else {
      /* SOCKS handshake still in flight: park one datagram until then. */
      int qrc = udp_queue_pending(f, payload, plen);
      if (qrc < 0) {
        udp_close_flow(f);
        return;
      }
    }
  } else {
    /* Direct UDP now uses persistent non-blocking flows too. */
    struct udp_flow *f = udp_find(ip->saddr, sport, ip->daddr, dport);
    if (!f) {
      f = udp_alloc();
      memset(f, 0, sizeof(*f));
      f->cli_ip = ip->saddr;
      f->cli_port = sport;
      f->srv_ip = ip->daddr;
      f->srv_port = dport;
      f->tcp_ctrl = -1;
      f->udp_relay = -1;
      f->udp_staging = -1;
      /* target_ip is the (possibly gateway-remapped) destination. */
      if (udp_direct_setup(f, target_ip, dport) < 0) {
        f->udp_relay = -1;
        return;
      }
    }
    f->cli_port = sport;
    if (udp_try_send_direct(f, payload, plen) < 0) {
      udp_close_flow(f);
      return;
    }
    udp_update_events(f);
    DBG("UDP queued %zu bytes to %s:%d", plen,
        is_gateway ? "127.0.0.1" : "remote", dport);
  }
}
/* Handle ICMP echo request (ping) to gateway - responds directly */
/* Answer ICMP echo requests addressed to the gateway ping address
 * (HOST_PING_IP) by synthesizing an echo reply straight back into the TUN
 * device.  Every other ICMP packet is dropped. */
static void handle_icmp(int tunfd, uint8_t *pkt, ssize_t len) {
  if (len <= 0)
    return;
  size_t ulen = (size_t)len;
  struct iphdr *ip = (struct iphdr *)pkt;
  size_t iphl = ip->ihl * 4;
  if (ip->version != 4)
    return;
  /* Header length must be sane (20..60 bytes) and inside the buffer. */
  if (iphl < sizeof(struct iphdr) || iphl > 60 || iphl > ulen)
    return;
  size_t ip_total_len = (size_t)ntohs(ip->tot_len);
  if (ip_total_len < iphl || ip_total_len > ulen)
    return;
  /* Reject IP fragments */
  if (ntohs(ip->frag_off) & (IP_MF | IP_OFFMASK)) {
    DBG("ICMP: dropping IP fragment (frag_off=0x%04x)", ntohs(ip->frag_off));
    return;
  }
  if (ip_total_len < iphl + 8) /* ICMP header is 8 bytes minimum */
    return;
  /* Only respond to ping on 10.0.0.1 (always allowed) */
  if (ip->daddr != HOST_PING_IP)
    return;
  uint8_t *icmp = pkt + iphl;
  uint8_t type = icmp[0];
  /* Only respond to echo request (type 8) */
  if (type != 8)
    return;
  DBG("ICMP echo request to gateway - sending reply");
  /* Build echo reply */
  uint8_t out[65536];
  size_t icmp_len = ip_total_len - iphl;
  struct iphdr *rip = (struct iphdr *)out;
  memset(rip, 0, sizeof(*rip));
  rip->version = 4;
  rip->ihl = 5;
  rip->ttl = 64;
  rip->protocol = IPPROTO_ICMP;
  rip->saddr = ip->daddr; /* Gateway IP */
  rip->daddr = ip->saddr; /* Client IP */
  rip->tot_len = htons((uint16_t)(sizeof(*rip) + icmp_len));
  rip->check = csum16(rip, sizeof(*rip));
  /* Copy ICMP data and change type to echo reply (0) */
  memcpy(out + sizeof(*rip), icmp, icmp_len);
  out[sizeof(*rip)] = 0; /* Type = echo reply */
  /* Recalculate ICMP checksum */
  uint8_t *ricmp = out + sizeof(*rip);
  ricmp[2] = 0;
  ricmp[3] = 0;
  uint16_t icmp_csum = csum16(ricmp, icmp_len);
  /* csum16() sums byte pairs low-byte-first, so the result is written back
   * low byte first as well — consistent on either endianness. */
  ricmp[2] = (uint8_t)(icmp_csum & 0xff);
  ricmp[3] = (uint8_t)(icmp_csum >> 8);
  IGNORE_RESULT(
      tun_write_packet(tunfd, out, sizeof(*rip) + icmp_len, "ICMP reply"));
}
/* Look up an active TCP flow by its full 4-tuple; NULL when none match. */
static struct tcp_flow *tcp_find(uint32_t cip, uint16_t cport, uint32_t sip,
                                 uint16_t sport) {
  for (int i = 0; i < MAX_TCP; i++) {
    struct tcp_flow *f = &tcp_flows[i];
    if (f->sock < 0)
      continue; /* slot unused */
    if (f->cli_ip == cip && f->cli_port == cport && f->srv_ip == sip &&
        f->srv_port == sport)
      return f;
  }
  return NULL;
}
/* Return a TCP flow slot: the first free one, or the least-recently-active
 * non-ESTABLISHED slot (evicted and reset).  Returns NULL when every slot
 * is an established connection. */
static struct tcp_flow *tcp_alloc(void) {
  struct tcp_flow *victim = NULL;
  for (int i = 0; i < MAX_TCP; i++) {
    struct tcp_flow *f = &tcp_flows[i];
    if (f->sock < 0)
      return f; /* free slot */
    if (f->state == SP_TCP_ESTABLISHED)
      continue; /* never evict live connections */
    if (victim == NULL || f->last_active < victim->last_active)
      victim = f;
  }
  if (victim == NULL)
    return NULL;
  if (victim->sock >= 0) {
    if (g_epfd >= 0)
      epoll_ctl(g_epfd, EPOLL_CTL_DEL, victim->sock, NULL);
    close(victim->sock);
  }
  victim->sock = -1;
  victim->state = SP_TCP_CLOSED;
  victim->pending_write_len = 0;
  victim->pending_fin = 0;
  victim->backend_ready = 0;
  memset(&victim->socks, 0, sizeof(victim->socks));
  return victim;
}
/* Accumulate buf[0..len) as little-endian 16-bit words into a 32-bit sum
 * (no folding); an odd trailing byte contributes its raw value. */
static uint32_t csum16_partial(const void *buf, size_t len) {
  const uint8_t *bytes = buf;
  uint32_t acc = 0;
  size_t i;
  for (i = 0; i + 1 < len; i += 2)
    acc += (uint32_t)bytes[i] | ((uint32_t)bytes[i + 1] << 8);
  if (i < len)
    acc += (uint32_t)bytes[i];
  return acc;
}
/* Accumulate buf[0..len) as big-endian (network-order) 16-bit words; an odd
 * trailing byte is treated as the high byte of a final word. */
static uint32_t checksum_add_bytes(const void *buf, size_t len) {
  const uint8_t *bytes = buf;
  uint32_t acc = 0;
  size_t i;
  for (i = 0; i + 1 < len; i += 2)
    acc += ((uint32_t)bytes[i] << 8) | (uint32_t)bytes[i + 1];
  if (i < len)
    acc += (uint32_t)bytes[i] << 8;
  return acc;
}
/* Sum the IPv4 pseudo-header (src, dst, zero, proto, length) used by TCP
 * and UDP checksums.  Addresses arrive in network byte order; `len` is the
 * host-order transport-segment length. */
static uint32_t checksum_add_ipv4_pseudo(uint32_t saddr, uint32_t daddr,
                                         uint8_t proto, uint16_t len) {
  uint32_t s = ntohl(saddr);
  uint32_t d = ntohl(daddr);
  uint8_t pseudo[12] = {
      (uint8_t)(s >> 24), (uint8_t)(s >> 16), (uint8_t)(s >> 8), (uint8_t)s,
      (uint8_t)(d >> 24), (uint8_t)(d >> 16), (uint8_t)(d >> 8), (uint8_t)d,
      0,                  proto,              (uint8_t)(len >> 8),
      (uint8_t)len,
  };
  return checksum_add_bytes(pseudo, sizeof(pseudo));
}
/* Fold a 32-bit checksum accumulator to 16 bits and complement it.  Two
 * folds always suffice for any uint32_t value. */
static uint16_t checksum_finish(uint32_t sum) {
  sum = (sum & 0xffffU) + (sum >> 16);
  sum = (sum & 0xffffU) + (sum >> 16);
  return (uint16_t)~sum;
}
/* Verify an inbound TCP segment's checksum (pseudo-header + segment,
 * including the transmitted checksum field, must fold to zero). */
static int tcp_checksum_valid(const struct iphdr *ip, const struct tcphdr *tcp,
                              size_t tcp_len) {
  if (tcp_len < sizeof(struct tcphdr))
    return 0;
  uint32_t sum = checksum_add_ipv4_pseudo(ip->saddr, ip->daddr, IPPROTO_TCP,
                                          (uint16_t)tcp_len) +
                 checksum_add_bytes(tcp, tcp_len);
  return checksum_finish(sum) == 0;
}
/* Verify an inbound UDP datagram's checksum.  A zero checksum field means
 * "checksum not computed" and is accepted per RFC 768. */
static int udp_checksum_valid(const struct iphdr *ip, const struct udphdr *udp,
                              size_t udp_len) {
  if (udp->check == 0)
    return 1;
  if (udp_len < sizeof(struct udphdr))
    return 0;
  uint32_t sum = checksum_add_ipv4_pseudo(ip->saddr, ip->daddr, IPPROTO_UDP,
                                          (uint16_t)udp_len) +
                 checksum_add_bytes(udp, udp_len);
  return checksum_finish(sum) == 0;
}
/* Verify an ICMP message checksum; zero-length messages are invalid. */
static int icmp_checksum_valid(const uint8_t *icmp, size_t icmp_len) {
  if (icmp_len == 0)
    return 0;
  return checksum_finish(checksum_add_bytes(icmp, icmp_len)) == 0;
}
/* Compute an outbound TCP checksum over pseudo-header + TCP header +
 * optional payload.  Callers are expected to leave tcp->check zeroed
 * before calling (the builders here memset the header first). */
static uint16_t tcp_checksum(struct iphdr *ip, struct tcphdr *tcp,
                             size_t tcp_len, const uint8_t *payload,
                             size_t plen) {
  struct {
    uint32_t src;
    uint32_t dst;
    uint8_t zero;
    uint8_t proto;
    uint16_t len;
  } __attribute__((packed)) pseudo = {
      .src = ip->saddr,
      .dst = ip->daddr,
      .zero = 0,
      .proto = IPPROTO_TCP,
      .len = htons((uint16_t)(tcp_len + plen)),
  };
  uint32_t sum = csum16_partial(&pseudo, sizeof(pseudo));
  sum += csum16_partial(tcp, tcp_len);
  if (plen)
    sum += csum16_partial(payload, plen);
  sum = (sum & 0xffff) + (sum >> 16);
  sum = (sum & 0xffff) + (sum >> 16);
  return (uint16_t)~sum;
}
/* Standard IPv4 header checksum: ones'-complement sum of 16-bit words read
 * in host order (endian-agnostic when written back the same way).  `buf`
 * must be 16-bit aligned. */
static uint16_t ip_checksum(const void *buf, size_t len) {
  const uint16_t *words = buf;
  uint32_t acc = 0;
  for (; len > 1; len -= 2)
    acc += *words++;
  if (len)
    acc += *(const uint8_t *)words;
  acc = (acc & 0xffff) + (acc >> 16);
  acc = (acc & 0xffff) + (acc >> 16);
  return (uint16_t)~acc;
}
/* Advertise a receive window equal to the free space left in the flow's
 * pending-write buffer, clamped to the 16-bit TCP window field. */
static uint16_t tcp_advertised_window(const struct tcp_flow *f) {
  if (f->pending_write_len >= TCP_PENDING_WRITE_CAP)
    return 0;
  size_t free_bytes = TCP_PENDING_WRITE_CAP - f->pending_write_len;
  return (uint16_t)(free_bytes > 65535 ? 65535 : free_bytes);
}
/* Send a TCP packet from server to client */
/* Send a TCP segment from the emulated server side of flow `f` into the TUN
 * device.  `flags` carries raw TCP flag bits: 0x08 = PSH, 0x01 = FIN; ACK is
 * always set.  Advances f->srv_next by the payload length, plus one more if
 * a FIN was sent (FIN consumes a sequence number). */
static void send_tcp_packet(int tunfd, struct tcp_flow *f, uint8_t flags,
                            const uint8_t *payload, size_t plen) {
  uint8_t out[65536];
  struct iphdr *ip = (struct iphdr *)out;
  struct tcphdr *tcp = (struct tcphdr *)(out + sizeof(*ip));
  size_t tcp_hdr_len = sizeof(*tcp); /* no TCP options on data segments */
  size_t total_len = sizeof(*ip) + tcp_hdr_len + plen;
  memset(ip, 0, sizeof(*ip));
  ip->version = 4;
  ip->ihl = 5;
  ip->ttl = 64;
  ip->protocol = IPPROTO_TCP;
  ip->saddr = f->srv_ip;
  ip->daddr = f->cli_ip;
  ip->tot_len = htons((uint16_t)total_len);
  memset(tcp, 0, sizeof(*tcp));
  tcp->source = htons(f->srv_port);
  tcp->dest = htons(f->cli_port);
  tcp->seq = htonl(f->srv_next);
  tcp->ack_seq = htonl(f->cli_next);
  tcp->doff = (tcp_hdr_len / 4) & 0xF;
  tcp->ack = 1;
  if (flags & 0x08)
    tcp->psh = 1; /* PSH */
  if (flags & 0x01)
    tcp->fin = 1; /* FIN */
  /* Window tracks the free space in this flow's pending-write buffer. */
  tcp->window = htons(tcp_advertised_window(f));
  if (plen > 0)
    memcpy(out + sizeof(*ip) + tcp_hdr_len, payload, plen);
  /* tcp->check is still 0 from the memset when the checksum is computed. */
  tcp->check = tcp_checksum(ip, tcp, tcp_hdr_len, payload, plen);
  ip->check = ip_checksum(ip, sizeof(*ip));
  IGNORE_RESULT(tun_write_packet(tunfd, out, total_len, "TCP packet"));
  /* Update sequence number for data sent */
  if (plen > 0)
    f->srv_next += (uint32_t)plen;
  if ((flags & 0x01) != 0)
    f->srv_next++; /* FIN occupies one sequence number */
}
/* ---------- TCP Option Parsing ---------- */
/* Parsed view of the TCP options this broker cares about. */
struct tcp_options {
  uint16_t mss;       /* Maximum Segment Size (kind 2); defaults to 536 */
  uint8_t wscale;     /* Window-scale shift count (kind 3) */
  uint32_t tsval;     /* Timestamp value (kind 8) */
  uint32_t tsecr;     /* Timestamp echo reply (kind 8) */
  int ts_present;     /* Non-zero when a timestamp option was seen */
  int sack_permitted; /* Non-zero when SACK-permitted (kind 4) was seen */
};
/* Parse TCP options from the options portion of TCP header.
   Returns 0 on success, -1 if options are malformed. */
static int parse_tcp_options(const uint8_t *opts, size_t len,
                             struct tcp_options *out) {
  memset(out, 0, sizeof(*out));
  out->mss = 536; /* Default MSS per RFC 879 */
  size_t i = 0;
  while (i < len) {
    uint8_t kind = opts[i];
    if (kind == 0) /* End of option list */
      break;
    if (kind == 1) { /* NOP */
      i++;
      continue;
    }
    /* Every other option carries a length byte. */
    if (i + 1 >= len)
      return -1;
    uint8_t optlen = opts[i + 1];
    if (optlen < 2 || i + optlen > len)
      return -1;
    switch (kind) {
    case 2: /* MSS */
      if (optlen == 4)
        out->mss = (uint16_t)(((uint16_t)opts[i + 2] << 8) | opts[i + 3]);
      break;
    case 3: /* Window Scale */
      if (optlen == 3)
        out->wscale = opts[i + 2];
      break;
    case 4: /* SACK Permitted */
      if (optlen == 2)
        out->sack_permitted = 1;
      break;
    case 8: /* Timestamp */
      if (optlen == 10) {
        out->ts_present = 1;
        /* Bug fix: cast each byte to uint32_t BEFORE shifting.  opts[x]
         * promotes to (signed) int, and `byte << 24` with byte >= 0x80
         * overflows int — undefined behavior.  The old code cast only the
         * already-shifted result. */
        out->tsval = ((uint32_t)opts[i + 2] << 24) |
                     ((uint32_t)opts[i + 3] << 16) |
                     ((uint32_t)opts[i + 4] << 8) | (uint32_t)opts[i + 5];
        out->tsecr = ((uint32_t)opts[i + 6] << 24) |
                     ((uint32_t)opts[i + 7] << 16) |
                     ((uint32_t)opts[i + 8] << 8) | (uint32_t)opts[i + 9];
      }
      break;
    default: /* unknown option kinds are skipped by their length */
      break;
    }
    i += optlen;
  }
  return 0;
}
/* Build TCP options for SYN-ACK response.
Returns the number of bytes written to buf. */
static size_t build_synack_options(const struct tcp_options *client_opts,
                                   uint8_t *buf, uint32_t our_tsval) {
  size_t n = 0;
  /* MSS (kind=2, len=4): always advertise 1460. */
  buf[n++] = 2;
  buf[n++] = 4;
  buf[n++] = 0x05;
  buf[n++] = 0xb4;
  if (client_opts->wscale > 0) {
    /* NOP + Window Scale (kind=3, len=3); our shift is 0 => x1 scaling. */
    buf[n++] = 1;
    buf[n++] = 3;
    buf[n++] = 3;
    buf[n++] = 0;
  }
  if (client_opts->ts_present) {
    /* Timestamp (kind=8, len=10): our TSval, then echo the client's. */
    uint32_t vals[2] = {our_tsval, client_opts->tsval};
    buf[n++] = 8;
    buf[n++] = 10;
    for (int w = 0; w < 2; w++) {
      buf[n++] = (uint8_t)(vals[w] >> 24);
      buf[n++] = (uint8_t)(vals[w] >> 16);
      buf[n++] = (uint8_t)(vals[w] >> 8);
      buf[n++] = (uint8_t)vals[w];
    }
  }
  /* NOP-pad to a 32-bit boundary as doff counts 4-byte words. */
  while (n & 3)
    buf[n++] = 1;
  return n;
}
/* Send a TCP RST packet */
/* Emit a bare RST|ACK segment with the given addressing and sequence
 * numbers into the TUN device (used to refuse or abort connections). */
static void send_tcp_rst(int tunfd, uint32_t saddr, uint32_t daddr,
                         uint16_t sport, uint16_t dport, uint32_t seq,
                         uint32_t ack_seq) {
  uint8_t out[64];
  memset(out, 0, sizeof(out));
  struct iphdr *ip = (struct iphdr *)out;
  struct tcphdr *tcp = (struct tcphdr *)(out + sizeof(*ip));
  size_t total = sizeof(*ip) + sizeof(*tcp);
  ip->version = 4;
  ip->ihl = 5;
  ip->ttl = 64;
  ip->protocol = IPPROTO_TCP;
  ip->saddr = saddr;
  ip->daddr = daddr;
  ip->tot_len = htons((uint16_t)total);
  tcp->source = htons(sport);
  tcp->dest = htons(dport);
  tcp->seq = htonl(seq);
  tcp->ack_seq = htonl(ack_seq);
  tcp->doff = 5;
  tcp->rst = 1;
  tcp->ack = 1;
  tcp->window = 0;
  tcp->check = tcp_checksum(ip, tcp, sizeof(*tcp), NULL, 0);
  ip->check = ip_checksum(ip, sizeof(*ip));
  IGNORE_RESULT(tun_write_packet(tunfd, out, total, "TCP RST"));
}
/* Send RST for a given flow and clean it up */
/* Abort flow `f`: close its backend socket, discard all pending state,
 * notify the sandboxed client with an RST, and mark the slot CLOSED. */
static void tcp_flow_rst(int tunfd, struct tcp_flow *f) {
  int backend = f->sock;
  if (backend >= 0) {
    epoll_del(backend);
    close(backend);
  }
  f->pending_write_off = 0;
  f->pending_write_len = 0;
  f->pending_fin = 0;
  f->pending_fin_seq = 0;
  f->backend_ready = 0;
  socks_io_reset(&f->socks);
  send_tcp_rst(tunfd, f->srv_ip, f->cli_ip, f->srv_port, f->cli_port,
               f->srv_next, f->cli_next);
  f->sock = -1;
  f->state = SP_TCP_CLOSED;
}
/* Recompute the epoll interest set for the flow's backend socket: watch
 * for writability while the backend is not yet ready or data/handshake
 * bytes are waiting to be written. */
static void tcp_update_events(struct tcp_flow *f) {
  uint32_t events = EPOLLIN | EPOLLRDHUP | EPOLLERR | EPOLLHUP;
  int need_out = !f->backend_ready || f->pending_write_len > 0 ||
                 socks_has_pending_tx(&f->socks);
  if (need_out)
    events |= EPOLLOUT;
  epoll_mod_tcp(f, events);
}
/* Complete a deferred client FIN once all queued bytes have reached the
 * backend and the FIN's sequence number lines up with cli_next: ACK the
 * FIN, half-close the backend, and move to CLOSE_WAIT.  Returns 1 when the
 * FIN was consumed, 0 when not yet eligible, -1 when the flow was reset. */
static int tcp_finish_pending_fin(int tunfd, struct tcp_flow *f) {
  if (!f->backend_ready || f->pending_write_len > 0 || !f->pending_fin ||
      f->pending_fin_seq != f->cli_next)
    return 0;
  f->pending_fin = 0;
  f->pending_fin_seq = 0;
  f->cli_next++; /* the FIN consumes one sequence number */
  send_tcp_packet(tunfd, f, 0, NULL, 0); /* pure ACK of the FIN */
  /* Propagate the half-close; ENOTCONN/EPIPE just mean it already closed. */
  if (shutdown(f->sock, SHUT_WR) < 0 && errno != ENOTCONN && errno != EPIPE) {
    tcp_flow_rst(tunfd, f);
    return -1;
  }
  f->state = SP_TCP_CLOSE_WAIT;
  f->last_active = time(NULL);
  return 1;
}
/* Append client bytes to the flow's backend write queue, compacting queued
 * bytes to the buffer front first.  Returns -1 when the data won't fit. */
static int tcp_queue_pending_write(struct tcp_flow *f, const uint8_t *data,
                                   size_t len) {
  if (f->pending_write_len > 0 && f->pending_write_off != 0) {
    /* Slide the unsent tail down so the free space is contiguous. */
    memmove(f->pending_write, f->pending_write + f->pending_write_off,
            f->pending_write_len);
    f->pending_write_off = 0;
  }
  if (len > sizeof(f->pending_write) - f->pending_write_len)
    return -1;
  memcpy(f->pending_write + f->pending_write_len, data, len);
  f->pending_write_len += len;
  return 0;
}
/* Drain as much queued client->backend data as the backend socket will
 * accept.  The client is ACKed only for bytes the kernel actually took,
 * after which a deferred FIN may complete.  Returns 0 on success (possibly
 * with data still queued), -1 when the flow was reset. */
static int tcp_flush_pending_write(int tunfd, struct tcp_flow *f) {
  size_t total_sent = 0;
  while (f->pending_write_len > 0) {
    ssize_t sent = send(f->sock, f->pending_write + f->pending_write_off,
                        f->pending_write_len, MSG_NOSIGNAL);
    if (sent > 0) {
      f->pending_write_off += (size_t)sent;
      f->pending_write_len -= (size_t)sent;
      total_sent += (size_t)sent;
      continue;
    }
    if (sent == 0) {
      /* send() returning 0 for a nonzero length is unexpected; reset. */
      tcp_flow_rst(tunfd, f);
      return -1;
    }
    if (sent < 0 && errno == EINTR)
      continue;
    if (sent < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
      break; /* backend buffer full; EPOLLOUT will bring us back */
    tcp_flow_rst(tunfd, f);
    return -1;
  }
  if (total_sent > 0) {
    /* Advance the acknowledged client sequence by what was accepted. */
    f->cli_next += (uint32_t)total_sent;
    if (tcp_finish_pending_fin(tunfd, f) < 0)
      return -1;
    /* Send a pure ACK (also a window update) unless the FIN path did. */
    if (f->sock >= 0 && f->pending_fin == 0)
      send_tcp_packet(tunfd, f, 0, NULL, 0);
  }
  if (f->pending_write_len == 0)
    f->pending_write_off = 0;
  if (f->sock >= 0)
    tcp_update_events(f);
  return 0;
}
static void handle_tcp(int tunfd, uint8_t *pkt, ssize_t len) {
if (len <= 0)
return;
size_t ulen = (size_t)len;
struct iphdr *ip = (struct iphdr *)pkt;
size_t iphl = ip->ihl * 4;
if (ip->version != 4)
return;
if (iphl < sizeof(struct iphdr) || iphl > 60 || iphl > ulen)
return;
size_t ip_total_len = (size_t)ntohs(ip->tot_len);
if (ip_total_len < iphl || ip_total_len > ulen)
return;
/* Reject IP fragments */
if (ntohs(ip->frag_off) & (IP_MF | IP_OFFMASK)) {
DBG("TCP: dropping IP fragment (frag_off=0x%04x)", ntohs(ip->frag_off));
return;
}
if (ip_total_len < iphl + sizeof(struct tcphdr))
return;
struct tcphdr *tcp = (struct tcphdr *)(pkt + iphl);
size_t tcp_hdr_len = (size_t)tcp->doff * 4;
if (tcp_hdr_len < sizeof(struct tcphdr) || tcp_hdr_len > 60 ||
ip_total_len < iphl + tcp_hdr_len)
return;
uint32_t cip = ip->saddr;
uint32_t sip = ip->daddr;
uint16_t cport = ntohs(tcp->source);
uint16_t sport = ntohs(tcp->dest);
/* ---------- RST ---------- */
if (tcp->rst) {
struct tcp_flow *f = tcp_find(cip, cport, sip, sport);
if (f && f->sock >= 0) {
epoll_del(f->sock);
close(f->sock);
f->sock = -1;
f->state = SP_TCP_CLOSED;
}
return;
}
/* ---------- SYN ---------- */
if (tcp->syn && !tcp->ack) {
struct tcp_flow *f = tcp_find(cip, cport, sip, sport);
/* Parse TCP options from SYN */
struct tcp_options cli_opts;
if (tcp_hdr_len > sizeof(struct tcphdr)) {
const uint8_t *opt_start = (const uint8_t *)tcp + sizeof(struct tcphdr);
size_t opt_len = tcp_hdr_len - sizeof(struct tcphdr);
if (parse_tcp_options(opt_start, opt_len, &cli_opts) < 0)
return;
} else {
memset(&cli_opts, 0, sizeof(cli_opts));
}
if (!f) {
/* Rate limit new connections */
if (!check_rate_limit()) {
return; /* Too many connections - drop SYN silently */
}
/* first SYN */
f = tcp_alloc();
if (!f) {
send_tcp_rst(tunfd, sip, cip, sport, cport, 0, ntohl(tcp->seq) + 1U);
return;
}
memset(f, 0, sizeof(*f));
f->cli_ip = cip;
f->cli_port = cport;
f->srv_ip = sip;
f->srv_port = sport;
f->cli_isn = ntohl(tcp->seq);
f->cli_next = f->cli_isn + 1;
uint32_t isn;
if (getrandom(&isn, sizeof(isn), 0) != (ssize_t)sizeof(isn))
die("getrandom");
f->srv_isn = isn;
f->srv_next = f->srv_isn + 1;
/* Store timestamp negotiation state */
f->ts_ok = cli_opts.ts_present;
if (cli_opts.ts_present) {
f->ts_recent = cli_opts.tsval;
}
int s;
int connect_rc;
struct sockaddr_in dst;
int use_gateway =
is_gateway_ip(sip) && is_gateway_allowed(sip, sport, IPPROTO_TCP);
if (use_gateway) {
/* Gateway access - connect to localhost (10.0.1.x -> 127.0.0.x) */
uint32_t local_ip = gateway_to_localhost(sip);
DBG("[parent] TCP gateway: 10.0.1.%d:%d -> 127.0.0.%d:%d",
gateway_last_octet(sip), sport, gateway_last_octet(sip), sport);
s = socket(AF_INET, SOCK_STREAM, 0);
if (s < 0)
return;
memset(&dst, 0, sizeof(dst));
dst.sin_family = AF_INET;
dst.sin_port = htons(sport);
dst.sin_addr.s_addr = local_ip;
} else if (socks_proxy.enabled) {
/* Connect via SOCKS5 proxy */
s = socket(AF_INET, SOCK_STREAM, 0);
if (s < 0)
return;
dst = socks_proxy.addr;
} else {
/* Direct connection */
s = socket(AF_INET, SOCK_STREAM, 0);
if (s < 0)
return;
memset(&dst, 0, sizeof(dst));
dst.sin_family = AF_INET;
dst.sin_port = htons(sport);
dst.sin_addr.s_addr = sip;
}
connect_rc = start_nonblocking_connect(s, &dst);
if (connect_rc < 0) {
close(s);
f->sock = -1;
send_tcp_rst(tunfd, sip, cip, sport, cport, 0, f->cli_next);
return;
}
f->sock = s;
f->backend_ready = 0;
socks_io_reset(&f->socks);
if (socks_proxy.enabled && !use_gateway) {
f->socks.active = 1;
f->socks.is_udp = 0;
f->socks.target_ip = sip;
f->socks.target_port = sport;
f->socks.connect_pending = (connect_rc > 0);
f->socks.state = f->socks.connect_pending ? SOCKS_IO_CONNECTING
: SOCKS_IO_METHOD;
if (!f->socks.connect_pending &&
socks_begin_handshake(&f->socks, &socks_proxy) < 0) {
close(s);
f->sock = -1;
send_tcp_rst(tunfd, sip, cip, sport, cport, 0, f->cli_next);
return;
}
} else if (connect_rc == 0) {
f->backend_ready = 1;
}
f->state = SP_TCP_SYN_RECEIVED;
f->last_active = time(NULL);
epoll_add_tcp(f);
tcp_update_events(f);
}
/* Build SYN-ACK with mirrored options */
uint8_t out[128];
struct iphdr *rip = (struct iphdr *)out;
struct tcphdr *rtcp = (struct tcphdr *)(out + sizeof(*rip));
/* Build TCP options mirroring client's capabilities */
uint8_t opts[24];
uint32_t our_tsval = (uint32_t)time(NULL);
size_t opts_len = build_synack_options(&cli_opts, opts, our_tsval);
memset(rip, 0, sizeof(*rip));
rip->version = 4;
rip->ihl = 5;
rip->ttl = 64;
rip->protocol = IPPROTO_TCP;
rip->saddr = sip;
rip->daddr = cip;
memset(rtcp, 0, sizeof(*rtcp));
rtcp->source = htons(sport);
rtcp->dest = htons(cport);
rtcp->seq = htonl(f->srv_isn);
rtcp->ack_seq = htonl(f->cli_next);
rtcp->syn = 1;
rtcp->ack = 1;
size_t full_tcp_len = sizeof(struct tcphdr) + opts_len;
rtcp->doff = (full_tcp_len / 4) & 0xF;
rtcp->window = htons(tcp_advertised_window(f));
memcpy((uint8_t *)rtcp + sizeof(*rtcp), opts, opts_len);
rip->tot_len = htons((uint16_t)(sizeof(*rip) + full_tcp_len));
rtcp->check = tcp_checksum(rip, rtcp, full_tcp_len, NULL, 0);
rip->check = ip_checksum(rip, sizeof(*rip));
IGNORE_RESULT(tun_write_packet(tunfd, out, sizeof(*rip) + full_tcp_len,
"TCP SYN-ACK"));
return;
}
/* ---------- ACK / DATA ---------- */
if (tcp->ack && !tcp->syn) {
struct tcp_flow *f = tcp_find(cip, cport, sip, sport);
if (!f || f->sock < 0)
return;
uint32_t seq = ntohl(tcp->seq);
/* Calculate payload */
size_t payload_off = iphl + tcp_hdr_len;
size_t payload_len = 0;
if (ip_total_len > payload_off)
payload_len = ip_total_len - payload_off;
if (f->state == SP_TCP_SYN_RECEIVED) {
if (ntohl(tcp->ack_seq) != f->srv_next) {
DBG("TCP: dropping invalid handshake ACK (%u != %u)",
ntohl(tcp->ack_seq), f->srv_next);
return;
}
f->state = SP_TCP_ESTABLISHED;
tcp_update_events(f);
}
/* Update activity time */
f->last_active = time(NULL);
uint32_t pending_end = f->cli_next + (uint32_t)f->pending_write_len;
if (!f->backend_ready || f->pending_write_len > 0) {
uint8_t *payload = pkt + payload_off;
size_t append_off = 0;
size_t append_len = 0;
if (payload_len > 0 && seq <= pending_end &&
seq + payload_len > pending_end) {
append_off = (size_t)(pending_end - seq);
append_len = payload_len - append_off;
} else if (payload_len > 0 && seq == pending_end) {
append_len = payload_len;
}
if (append_len > 0 &&
tcp_queue_pending_write(f, payload + append_off, append_len) < 0) {
tcp_flow_rst(tunfd, f);
return;
}
if (tcp->fin) {
f->pending_fin = 1;
f->pending_fin_seq = seq + (uint32_t)payload_len;
}
if (f->backend_ready && (payload_len > 0 || tcp->fin))
send_tcp_packet(tunfd, f, 0, NULL, 0);
return;
}
/* Forward payload to real server and ACK only bytes the backend accepted. */
if (payload_len > 0) {
uint8_t *payload = pkt + payload_off;
size_t payload_off_trim = 0;
if (seq < f->cli_next) {
if (seq + payload_len <= f->cli_next) {
send_tcp_packet(tunfd, f, 0, NULL, 0);
return;
}
payload_off_trim = (size_t)(f->cli_next - seq);
seq = f->cli_next;
payload += payload_off_trim;
payload_len -= payload_off_trim;
}
if (seq != f->cli_next) {
send_tcp_packet(tunfd, f, 0, NULL, 0);
return;
}
size_t total_sent = 0;
while (total_sent < payload_len) {
ssize_t sent = send(f->sock, payload + total_sent,
payload_len - total_sent, MSG_NOSIGNAL);
if (sent == 0) {
tcp_flow_rst(tunfd, f);
return;
}
if (sent < 0) {
if (errno == EINTR)
continue;
if (errno == EAGAIN || errno == EWOULDBLOCK)
break;
/* Connection error - send RST and clean up */
tcp_flow_rst(tunfd, f);
return;
}
total_sent += (size_t)sent;
}
if (total_sent > 0) {
f->cli_next += (uint32_t)total_sent;
/* Send ACK back to client */
send_tcp_packet(tunfd, f, 0, NULL, 0);
}
if (total_sent < payload_len) {
if (tcp_queue_pending_write(f, payload + total_sent,
payload_len - total_sent) < 0) {
tcp_flow_rst(tunfd, f);
return;
}
tcp_update_events(f);
}
}
/* Handle FIN from client */
if (tcp->fin) {
if (f->pending_write_len > 0 || seq != f->cli_next) {
f->pending_fin = 1;
f->pending_fin_seq = seq + (uint32_t)payload_len;
send_tcp_packet(tunfd, f, 0, NULL, 0);
return;
}
f->cli_next++;
/* ACK the client FIN but keep the backend open for reads. */
send_tcp_packet(tunfd, f, 0, NULL, 0);
if (shutdown(f->sock, SHUT_WR) < 0 && errno != ENOTCONN &&
errno != EPIPE) {
tcp_flow_rst(tunfd, f);
return;
}
f->state = SP_TCP_CLOSE_WAIT;
f->last_active = time(NULL);
}
return;
}
}
/*
 * Validate a single IPv4 packet read from the TUN device and hand it to
 * the matching protocol handler (TCP, UDP or ICMP).  Anything malformed
 * (bad header length, bad checksum, truncated transport header) and any
 * IP fragment is silently dropped; unknown protocols are ignored.
 */
static void dispatch_tun_ipv4_packet(int tunfd, uint8_t *pkt, size_t len) {
  if (len < sizeof(struct iphdr))
    return;
  struct iphdr *ip = (struct iphdr *)pkt;
  if (ip->version != 4)
    return;
  size_t hdr_len = (size_t)ip->ihl * 4;
  if (hdr_len < sizeof(struct iphdr) || hdr_len > 60 || hdr_len > len) {
    DBG("Dropping packet: bad IHL=%zu", hdr_len);
    return;
  }
  if (ip_checksum(ip, hdr_len) != 0) {
    DBG("Dropping packet: bad IP checksum");
    return;
  }
  size_t tot_len = (size_t)ntohs(ip->tot_len);
  if (tot_len < hdr_len || tot_len > len)
    return;
  /* Fragment reassembly is not supported: reject MF or non-zero offset. */
  if ((ntohs(ip->frag_off) & (IP_MF | IP_OFFMASK)) != 0) {
    DBG("Dropping packet: dropping IP fragment (frag_off=0x%04x)",
        ntohs(ip->frag_off));
    return;
  }
  switch (ip->protocol) {
  case IPPROTO_TCP: {
    if (tot_len < hdr_len + sizeof(struct tcphdr))
      return;
    const struct tcphdr *tcp = (const struct tcphdr *)(pkt + hdr_len);
    size_t seg_len = tot_len - hdr_len;
    size_t tcp_hl = (size_t)tcp->doff * 4;
    if (tcp_hl < sizeof(struct tcphdr) || tcp_hl > seg_len)
      return;
    if (!tcp_checksum_valid(ip, tcp, seg_len)) {
      DBG("Dropping packet: bad TCP checksum");
      return;
    }
    handle_tcp(tunfd, pkt, (ssize_t)tot_len);
    return;
  }
  case IPPROTO_UDP: {
    if (tot_len < hdr_len + sizeof(struct udphdr))
      return;
    const struct udphdr *udp = (const struct udphdr *)(pkt + hdr_len);
    size_t dgram_len = (size_t)ntohs(udp->len);
    /* The UDP length field must cover its own header and fit in the
     * IP payload. */
    if (dgram_len < sizeof(struct udphdr) || dgram_len > tot_len - hdr_len)
      return;
    if (!udp_checksum_valid(ip, udp, dgram_len)) {
      DBG("Dropping packet: bad UDP checksum");
      return;
    }
    handle_udp(tunfd, pkt, (ssize_t)tot_len);
    return;
  }
  case IPPROTO_ICMP: {
    const uint8_t *icmp = pkt + hdr_len;
    size_t icmp_len = tot_len - hdr_len;
    if (!icmp_checksum_valid(icmp, icmp_len)) {
      DBG("Dropping packet: bad ICMP checksum");
      return;
    }
    handle_icmp(tunfd, pkt, (ssize_t)tot_len);
    return;
  }
  default:
    return;
  }
}
/* Close *fdp and mark it unused (-1); when `registered` is non-zero the
 * descriptor is first removed from the epoll set.  No-op for -1. */
static void udp_release_fd(int *fdp, int registered) {
  if (*fdp < 0)
    return;
  if (registered)
    epoll_del(*fdp);
  close(*fdp);
  *fdp = -1;
}
/* Tear down all descriptors and buffered state owned by a UDP flow:
 * the relay socket, the staging socket (never epoll-registered) and the
 * SOCKS TCP control connection, plus any pending datagram and SOCKS
 * handshake state. */
static void udp_close_flow(struct udp_flow *f) {
  udp_release_fd(&f->udp_relay, 1);
  udp_release_fd(&f->udp_staging, 0);
  udp_release_fd(&f->tcp_ctrl, 1);
  f->pending_set = 0;
  f->pending_len = 0;
  socks_io_reset(&f->socks);
}
/*
 * Propagate a pending SIGWINCH to the child's PTY.  Does nothing (and
 * leaves the pending flag untouched) unless an interactive session is
 * active and a resize was flagged; otherwise clears the flag and returns
 * the result of interactive_sync_winsize().
 */
static int interactive_maybe_sync_winsize(struct interactive_session *session) {
  int want_resize = session && session->active && interactive_resize_pending;
  if (!want_resize)
    return 0;
  interactive_resize_pending = 0;
  return interactive_sync_winsize(session);
}
/*
 * Parent-side broker loop.  Multiplexes, on one epoll instance:
 *   - the TUN fd (packets leaving the sandbox),
 *   - per-flow TCP backend sockets and UDP relay/control sockets,
 *   - the non-interactive stdout/stderr relay pipes,
 *   - the interactive host tty <-> PTY master pair (if enabled).
 * Runs until the child has exited AND all relay fds have drained, then
 * returns the child's wait status (-1 if it was never collected).
 *
 * tunfd      TUN device fd (read: child egress, write: injected replies).
 * pid        sandboxed child to reap with WNOHANG each iteration.
 * stdout_fd / stderr_fd  read ends of the relay pipes, or -1.
 * session    interactive PTY session, may be NULL/inactive.
 */
static int event_loop(int tunfd, pid_t pid, int stdout_fd, int stderr_fd,
                      struct interactive_session *session) {
  struct epoll_event events[MAX_EVENTS];
  int child_status = -1;             /* -1 until waitpid() reaps the child */
  int interactive_pty_active = 0;
  time_t child_exited_at = 0;
  /* Register TUN fd (static wrapper on stack - never removed) */
  struct epoll_wrapper tun_ew = {.type = FD_TUN, .fd = tunfd, .flow = NULL};
  struct epoll_event tun_ev = {.events = EPOLLIN, .data.ptr = &tun_ew};
  if (epoll_ctl(g_epfd, EPOLL_CTL_ADD, tunfd, &tun_ev) < 0)
    die("epoll_ctl TUN");
  /* Count of still-open relay pipes; the loop only terminates once this
   * reaches zero (so trailing child output is not lost). */
  int pipes_active = (stdout_fd >= 0 ? 1 : 0) + (stderr_fd >= 0 ? 1 : 0);
  struct epoll_wrapper stdout_ew = {.type = FD_STDOUT_RELAY, .fd = stdout_fd, .flow = NULL};
  struct epoll_wrapper stderr_ew = {.type = FD_STDERR_RELAY, .fd = stderr_fd, .flow = NULL};
  if (stdout_fd >= 0) {
    struct epoll_event ev = {.events = EPOLLIN | EPOLLRDHUP, .data.ptr = &stdout_ew};
    epoll_ctl(g_epfd, EPOLL_CTL_ADD, stdout_fd, &ev);
  }
  if (stderr_fd >= 0) {
    struct epoll_event ev = {.events = EPOLLIN | EPOLLRDHUP, .data.ptr = &stderr_ew};
    epoll_ctl(g_epfd, EPOLL_CTL_ADD, stderr_fd, &ev);
  }
  /* Interactive relay endpoints; fds stay -1 unless a session is active. */
  struct epoll_wrapper interactive_tty_ew = {
      .type = FD_INTERACTIVE_TTY, .fd = -1, .flow = NULL};
  struct epoll_wrapper interactive_pty_ew = {
      .type = FD_INTERACTIVE_PTY, .fd = -1, .flow = NULL};
  if (session && session->active) {
    struct epoll_event tty_ev = {
        .events = EPOLLIN | EPOLLRDHUP | EPOLLHUP | EPOLLERR,
        .data.ptr = &interactive_tty_ew};
    struct epoll_event pty_ev = {
        .events = EPOLLIN | EPOLLRDHUP | EPOLLHUP | EPOLLERR,
        .data.ptr = &interactive_pty_ew};
    interactive_tty_ew.fd = session->host_tty_fd;
    interactive_pty_ew.fd = session->pty_master_fd;
    if (epoll_ctl(g_epfd, EPOLL_CTL_ADD, session->host_tty_fd, &tty_ev) < 0)
      die("epoll_ctl interactive tty");
    if (epoll_ctl(g_epfd, EPOLL_CTL_ADD, session->pty_master_fd, &pty_ev) < 0)
      die("epoll_ctl interactive pty");
    interactive_pty_active = 1;
  }
  for (;;) {
    if (interactive_maybe_sync_winsize(session) < 0)
      break;
    /* Check child status */
    int status;
    if (child_status == -1 && waitpid(pid, &status, WNOHANG) > 0) {
      if (WIFEXITED(status) || WIFSIGNALED(status)) {
        child_status = status;
        child_exited_at = time(NULL);
        /* Keep looping while relays still have data to drain. */
        if (pipes_active <= 0 && interactive_pty_active <= 0)
          break;
      }
    }
    /* After the child exits, give the PTY master ~1s to drain its final
     * output, then force-close it so the loop can end. */
    if (child_status != -1 && interactive_pty_active > 0 &&
        child_exited_at != 0 && session && session->pty_master_fd >= 0 &&
        (time(NULL) - child_exited_at) >= 1) {
      epoll_ctl(g_epfd, EPOLL_CTL_DEL, session->pty_master_fd, NULL);
      close(session->pty_master_fd);
      session->pty_master_fd = -1;
      interactive_pty_active = 0;
      if (pipes_active <= 0)
        break;
    }
    int n = epoll_wait(g_epfd, events, MAX_EVENTS, EPOLL_TIMEOUT_MS);
    if (n < 0) {
      if (errno == EINTR)
        continue;
      break;
    }
    if (interactive_maybe_sync_winsize(session) < 0)
      break;
    for (int i = 0; i < n; i++) {
      struct epoll_wrapper *ew = events[i].data.ptr;
      if (!ew)
        continue;
      switch (ew->type) {
      case FD_STDOUT_RELAY:
      case FD_STDERR_RELAY: {
        /* Copy child stdout/stderr pipe data to our own stdout/stderr. */
        ssize_t r = read(ew->fd, g_io_buf, sizeof(g_io_buf));
        if (r > 0) {
          int out_fd = (ew->type == FD_STDOUT_RELAY) ? STDOUT_FILENO : STDERR_FILENO;
          ssize_t w = write(out_fd, g_io_buf, (size_t)r);
          (void)w;
        } else if (r == 0 || (r < 0 && errno != EAGAIN && errno != EWOULDBLOCK) || (events[i].events & (EPOLLHUP | EPOLLRDHUP))) {
          /* EOF or hard error: retire this pipe. */
          epoll_ctl(g_epfd, EPOLL_CTL_DEL, ew->fd, NULL);
          close(ew->fd);
          ew->fd = -1;
          pipes_active--;
          if (child_status != -1 && pipes_active <= 0) {
            goto loop_end;
          }
        }
        break;
      }
      case FD_INTERACTIVE_TTY:
      case FD_INTERACTIVE_PTY: {
        /* Bidirectional relay: host tty -> PTY master and back. */
        int dst_fd = -1;
        ssize_t r;
        if (!session || !session->active || ew->fd < 0)
          break;
        r = read(ew->fd, g_io_buf, sizeof(g_io_buf));
        if (r > 0) {
          dst_fd = (ew->type == FD_INTERACTIVE_TTY) ? session->pty_master_fd
                                                    : session->host_tty_fd;
          /* EPIPE/EIO just mean the other side went away mid-write. */
          if (write_all(dst_fd, g_io_buf, (size_t)r) < 0 &&
              errno != EPIPE && errno != EIO)
            goto loop_end;
        } else if (r == 0 ||
                   (r < 0 && errno != EAGAIN && errno != EWOULDBLOCK &&
                    errno != EINTR) ||
                   (events[i].events & (EPOLLHUP | EPOLLRDHUP | EPOLLERR))) {
          epoll_ctl(g_epfd, EPOLL_CTL_DEL, ew->fd, NULL);
          if (ew->type == FD_INTERACTIVE_TTY) {
            /* Keep the tty fd open so terminal restore still has a target. */
          } else {
            close(ew->fd);
            session->pty_master_fd = -1;
            interactive_pty_active = 0;
          }
          ew->fd = -1;
          if (child_status != -1 && interactive_pty_active <= 0 &&
              pipes_active <= 0) {
            goto loop_end;
          }
        }
        break;
      }
      case FD_TUN: {
        /* Handle TUN packets (outgoing from child) */
        ssize_t r = read(tunfd, g_io_buf, sizeof(g_io_buf));
        if (r > 0)
          dispatch_tun_ipv4_packet(tunfd, g_io_buf, (size_t)r);
        break;
      }
      case FD_TCP: {
        /* Handle TCP server socket responses */
        struct tcp_flow *f = ew->flow;
        if (!f || f->sock < 0)
          break;
        if (!f->backend_ready) {
          /* Phase 1: finish the non-blocking connect() (direct, or to the
           * SOCKS proxy when the flow is proxied). */
          if ((events[i].events & (EPOLLOUT | EPOLLERR | EPOLLHUP |
                                   EPOLLRDHUP)) &&
              ((!f->socks.active) || f->socks.connect_pending)) {
            if (socket_connect_complete(f->sock) < 0) {
              tcp_flow_rst(tunfd, f);
              break;
            }
            if (f->socks.active) {
              f->socks.connect_pending = 0;
              if (socks_begin_handshake(&f->socks, &socks_proxy) < 0) {
                tcp_flow_rst(tunfd, f);
                break;
              }
            } else {
              f->backend_ready = 1;
            }
            tcp_update_events(f);
          }
          if (f->sock < 0)
            break;
          /* Phase 2: drive the SOCKS handshake until the proxy reports
           * the tunnel is established (rc > 0). */
          if (f->socks.active && !f->backend_ready) {
            if ((events[i].events & EPOLLOUT) && socks_has_pending_tx(&f->socks)) {
              if (socks_flush_tx(f->sock, &f->socks) < 0) {
                tcp_flow_rst(tunfd, f);
                break;
              }
              tcp_update_events(f);
            }
            if ((events[i].events & EPOLLIN) != 0) {
              int rc = socks_recv_and_process(f->sock, &f->socks, &socks_proxy,
                                              NULL);
              if (rc < 0) {
                tcp_flow_rst(tunfd, f);
                break;
              }
              if (rc > 0) {
                /* Tunnel up: flush data/FIN queued during the handshake. */
                f->backend_ready = 1;
                f->last_active = time(NULL);
                if (f->pending_write_len > 0 &&
                    tcp_flush_pending_write(tunfd, f) < 0)
                  break;
                if (f->sock >= 0 && tcp_finish_pending_fin(tunfd, f) < 0)
                  break;
                if (f->sock >= 0)
                  tcp_update_events(f);
              } else if (f->sock >= 0) {
                tcp_update_events(f);
              }
            }
            if (!f->backend_ready)
              break;
          }
        }
        /* Phase 3: established flow - flush queued client data, then any
         * deferred FIN, then relay backend reads to the client. */
        if (f->sock >= 0 && (events[i].events & EPOLLOUT) &&
            f->pending_write_len > 0) {
          if (tcp_flush_pending_write(tunfd, f) < 0)
            break;
        }
        if (f->sock >= 0 && tcp_finish_pending_fin(tunfd, f) < 0)
          break;
        if (f->sock >= 0 &&
            (f->state == SP_TCP_ESTABLISHED ||
             f->state == SP_TCP_CLOSE_WAIT) && (events[i].events & EPOLLIN)) {
          /* Leave 64 bytes of headroom in g_io_buf for packet headers. */
          ssize_t r = recv(f->sock, g_io_buf, sizeof(g_io_buf) - 64, 0);
          if (r > 0) {
            /* Forward data to client */
            send_tcp_packet(tunfd, f, 0x08, g_io_buf, (size_t)r);
            f->last_active = time(NULL);
          } else if (r == 0) {
            if (f->pending_write_len > 0) {
              /* Backend closed while we still owe it data: reset. */
              tcp_flow_rst(tunfd, f);
              break;
            }
            /* Server closed connection - send FIN to client */
            send_tcp_packet(tunfd, f, 0x01, NULL, 0);
            epoll_del(f->sock);
            close(f->sock);
            f->sock = -1;
            f->state = SP_TCP_CLOSED;
          } else if (errno != EAGAIN && errno != EWOULDBLOCK &&
                     errno != EINTR) {
            tcp_flow_rst(tunfd, f);
          }
        }
        break;
      }
      case FD_UDP_CTRL: {
        /* SOCKS UDP ASSOCIATE control connection (TCP to the proxy). */
        struct udp_flow *f = ew->flow;
        if (!f || f->tcp_ctrl < 0)
          break;
        if ((events[i].events & (EPOLLERR | EPOLLHUP | EPOLLRDHUP)) &&
            !f->socks.connect_pending && f->udp_relay >= 0) {
          /* Control channel died after setup: the association is over. */
          udp_close_flow(f);
          break;
        }
        if (f->socks.connect_pending &&
            (events[i].events & (EPOLLOUT | EPOLLERR | EPOLLHUP |
                                 EPOLLRDHUP))) {
          if (socket_connect_complete(f->tcp_ctrl) < 0) {
            udp_close_flow(f);
            break;
          }
          f->socks.connect_pending = 0;
          if (socks_begin_handshake(&f->socks, &socks_proxy) < 0) {
            udp_close_flow(f);
            break;
          }
          epoll_mod_udp_ctrl(f, udp_ctrl_events(f));
        }
        if (f->tcp_ctrl < 0)
          break;
        if ((events[i].events & EPOLLOUT) && socks_has_pending_tx(&f->socks)) {
          if (socks_flush_tx(f->tcp_ctrl, &f->socks) < 0) {
            udp_close_flow(f);
            break;
          }
          epoll_mod_udp_ctrl(f, udp_ctrl_events(f));
        }
        if ((events[i].events & EPOLLIN) != 0) {
          struct sockaddr_in relay_addr;
          int rc = socks_recv_and_process(f->tcp_ctrl, &f->socks, &socks_proxy,
                                          &relay_addr);
          if (rc < 0) {
            udp_close_flow(f);
            break;
          }
          if (rc > 0) {
            /* Association granted: open the UDP relay socket toward the
             * address the proxy returned and flush any staged datagram. */
            if (udp_open_relay_socket(f, &relay_addr) < 0) {
              udp_close_flow(f);
              break;
            }
            epoll_mod_udp_ctrl(f, EPOLLIN | EPOLLRDHUP | EPOLLERR | EPOLLHUP);
            f->last_used = time(NULL);
            if (udp_flush_pending(f) < 0) {
              udp_close_flow(f);
              break;
            }
            udp_update_events(f);
          } else {
            epoll_mod_udp_ctrl(f, udp_ctrl_events(f));
          }
        }
        break;
      }
      case FD_UDP_RELAY: {
        /* Handle UDP relay responses (incoming from SOCKS proxy) */
        struct udp_flow *f = ew->flow;
        if (f && f->udp_relay >= 0) {
          if ((events[i].events & EPOLLOUT) && f->pending_set) {
            if (udp_flush_pending(f) < 0) {
              udp_close_flow(f);
              break;
            }
          }
          struct sockaddr_in from;
          socklen_t fromlen = sizeof(from);
          ssize_t r = recvfrom(f->udp_relay, g_io_buf, sizeof(g_io_buf), 0,
                               (struct sockaddr *)&from, &fromlen);
          /* Direct and SOCKS relay traffic must return from the exact source
           * address and port the broker bound to for the flow. */
          if (r > 0 &&
              (from.sin_addr.s_addr != f->relay_addr.sin_addr.s_addr ||
               from.sin_port != f->relay_addr.sin_port)) {
            DBG("[parent] UDP source mismatch: got %s:%d, expected %s:%d",
                inet_ntoa(from.sin_addr), ntohs(from.sin_port),
                inet_ntoa(f->relay_addr.sin_addr),
                ntohs(f->relay_addr.sin_port));
            break; /* Packet from unexpected source */
          }
          /* tcp_ctrl < 0 means a direct (non-SOCKS) flow: inject as-is. */
          if (r > 0 && f->tcp_ctrl < 0) {
            udp_inject_tun(tunfd, f, g_io_buf, (size_t)r);
            f->last_used = time(NULL);
          } else if (r > 10) {
            /* SOCKS flow: datagram carries a SOCKS5 UDP request header
             * (minimum 10 bytes for an IPv4 address). */
            DBG("UDP relay received %zd bytes from %s:%d", r,
                inet_ntoa(from.sin_addr), ntohs(from.sin_port));
            /* Validate FRAG field (byte 2) - we don't support fragmentation */
            if (g_io_buf[2] != 0)
              break;
            /* Strip SOCKS5 UDP header */
            size_t hdr_len = 10;
            if (g_io_buf[3] == 0x03)
              hdr_len = 4 + 1 + g_io_buf[4] + 2;
            else if (g_io_buf[3] == 0x04)
              hdr_len = 4 + 16 + 2;
            if ((size_t)r > hdr_len) {
              udp_inject_tun(tunfd, f, g_io_buf + hdr_len, (size_t)r - hdr_len);
              f->last_used = time(NULL);
            }
          } else if (r == 0) {
            /* Relay closed */
            udp_close_flow(f);
            break;
          }
          udp_update_events(f);
        }
        break;
      }
      }
    }
    /* Cleanup stale TCP flows */
    time_t now = time(NULL);
    for (int i = 0; i < MAX_TCP; i++) {
      struct tcp_flow *f = &tcp_flows[i];
      if (f->sock < 0) continue;
      int timeout = TCP_IDLE_TIMEOUT_SEC;
      if (f->state == SP_TCP_SYN_RECEIVED) timeout = TCP_HALF_OPEN_TIMEOUT_SEC;
      else if (f->state != SP_TCP_ESTABLISHED && f->state != SP_TCP_CLOSE_WAIT)
        timeout = 10; /* Quick cleanup for FIN_WAIT / CLOSING / LAST_ACK / TIME_WAIT */
      if ((now - f->last_active) > timeout) {
        if (f->state == SP_TCP_ESTABLISHED || f->state == SP_TCP_CLOSE_WAIT) {
          /* Graceful: FIN toward the client, half-close the backend. */
          send_tcp_packet(tunfd, f, 0x01, NULL, 0); /* FIN */
          f->state = SP_TCP_FIN_WAIT_1;
          f->last_active = now;
          shutdown(f->sock, SHUT_WR);
          tcp_update_events(f);
        } else {
          /* Force close for other states or if already closing and timed out */
          epoll_del(f->sock);
          close(f->sock);
          f->sock = -1;
          f->state = SP_TCP_CLOSED;
        }
      }
    }
    /* Cleanup stale UDP flows (idle for >30 seconds) */
    for (int i = 0; i < MAX_UDP; i++) {
      if ((udp_flows[i].udp_relay >= 0 || udp_flows[i].tcp_ctrl >= 0) &&
          (now - udp_flows[i].last_used) > 30) {
        udp_close_flow(&udp_flows[i]);
      }
    }
  loop_end:
    /* Terminate once the child is gone and every relay has drained. */
    if (child_status != -1 && pipes_active <= 0 && interactive_pty_active <= 0)
      break;
  }
  return child_status;
}
/* ---------- main ---------- */
/*
 * Entry point.  Parses the leading option flags, prepares the sandbox
 * plumbing (control/sync socketpairs, overlay directory, epoll instance,
 * optional interactive PTY), forks the sandboxed child, writes the
 * user-namespace uid/gid maps for it, then runs the network broker
 * event loop until the child exits.  Returns the child's exit status,
 * or 1 on setup failure / abnormal termination.
 */
int main(int argc, char **argv) {
  /* Pre-scan verbose and bootstrap into a delegated scope before side effects. */
  int cmd_start = 1;
  pre_scan_verbose_flag(argc, argv);
  maybe_reexec_under_systemd_scope(argc, argv);
  struct interactive_session interactive_session = {0};
  interactive_session.host_tty_fd = -1;
  interactive_session.pty_master_fd = -1;
  interactive_session.pty_slave_fd = -1;
  /* Option parsing: flags precede the command; the first non-flag
   * argument (and everything after it) is the command to sandbox. */
  for (int i = 1; i < argc; i++) {
    if (strcmp(argv[i], "--socks") == 0 && i + 1 < argc) {
      parse_socks_url(argv[i + 1], &socks_proxy);
      if (socks_proxy.enabled) {
        fprintf(stderr, "Using SOCKS5 proxy: %s:%d%s\n", socks_proxy.host,
                socks_proxy.port,
                socks_proxy.username[0] ? " (with auth)" : "");
      }
      i++; /* skip next arg (the proxy URL) */
      cmd_start = i + 1;
    } else if (strcmp(argv[i], "--unsafe-share-cwd") == 0) {
      unsafe_share_cwd = 1;
      cmd_start = i + 1;
    } else if (strcmp(argv[i], "--interactive") == 0) {
      interactive_stdio = 1;
      cmd_start = i + 1;
    } else if (strncmp(argv[i], "--socks-auth-file=", 18) == 0) {
      parse_socks_auth_file(argv[i] + 18);
      cmd_start = i + 1;
    } else if (strcmp(argv[i], "--verbose") == 0 ||
               strcmp(argv[i], "-v") == 0) {
      verbose = 1;
      cmd_start = i + 1;
    } else if (strncmp(argv[i], "--allow-host=", 13) == 0) {
      /* Host-gateway rule: 127.0.0.X:PORT with an optional /tcp or /udp
       * suffix.  Invalid specs are reported and skipped, not fatal. */
      const char *spec = argv[i] + 13;
      if (host_rule_count < MAX_HOST_RULES) {
        struct host_rule *r = &host_rules[host_rule_count];
        memset(r, 0, sizeof(*r));
        /* Make a mutable copy for parsing */
        char buf[128];
        strncpy(buf, spec, sizeof(buf) - 1);
        buf[sizeof(buf) - 1] = '\0';
        /* Parse protocol suffix /tcp or /udp */
        char *slash = strchr(buf, '/');
        if (slash) {
          *slash = '\0';
          if (strcmp(slash + 1, "tcp") == 0)
            r->proto = IPPROTO_TCP;
          else if (strcmp(slash + 1, "udp") == 0)
            r->proto = IPPROTO_UDP;
          else {
            fprintf(stderr, "Invalid protocol: %s (use /tcp or /udp)\n",
                    slash + 1);
            cmd_start = i + 1;
            continue;
          }
        }
        /* Parse 127.0.0.X:PORT format */
        if (strncmp(buf, "127.0.0.", 8) == 0) {
          char *colon = strchr(buf + 8, ':');
          if (colon) {
            long last_octet;
            long port_val;
            *colon = '\0';
            const char *port_str = colon + 1;
            if (parse_long_strict(buf + 8, 0, 255, &last_octet) < 0) {
              fprintf(stderr, "Invalid IP: %s (must be 127.0.0.X)\n", spec);
              cmd_start = i + 1;
              continue;
            }
            r->last_octet = (uint8_t)last_octet;
            if (strcmp(port_str, "*") == 0) {
              fprintf(stderr, "Wildcard port not allowed\n");
              cmd_start = i + 1;
              continue;
            } else {
              if (parse_long_strict(port_str, 1, 65535, &port_val) < 0) {
                fprintf(stderr, "Invalid port: %s\n", port_str);
                cmd_start = i + 1;
                continue;
              }
              r->port = (uint16_t)port_val;
            }
            host_rule_count++;
            const char *proto_str = r->proto == IPPROTO_TCP   ? "/tcp"
                                    : r->proto == IPPROTO_UDP ? "/udp"
                                                              : "";
            DBG("Host gateway: 127.0.0.%d:%d%s", r->last_octet, r->port, proto_str);
          } else {
            fprintf(stderr, "Invalid format: %s (expected 127.0.0.X:PORT)\n",
                    spec);
          }
        } else {
          fprintf(stderr, "Invalid IP: %s (must be 127.0.0.X)\n", spec);
        }
      }
      cmd_start = i + 1;
    } else {
      /* First non-flag argument is the command */
      cmd_start = i;
      break;
    }
  }
  if (cmd_start >= argc) {
    fprintf(stderr, "usage: %s [OPTIONS] <cmd> [args...]\n\n", argv[0]);
    fprintf(stderr, "Options:\n");
    fprintf(stderr, "  --socks <proxy>      SOCKS5 proxy\n");
    fprintf(stderr, "  --socks-auth-file=   File with proxy credentials\n");
    fprintf(stderr, "  --unsafe-share-cwd   Allow unsafe sandbox source paths\n");
    fprintf(stderr, "  --interactive        Attach child to a private PTY\n");
    fprintf(stderr, "  -v, --verbose        Print debug info\n");
    fprintf(stderr,
            "\nHost gateway (child accesses 10.0.1.x -> host 127.0.0.x):\n");
    fprintf(stderr, "  --allow-host=127.0.0.X:PORT/PROTO  Allow IP:PORT\n");
    fprintf(stderr, "\nExamples:\n");
    fprintf(stderr, "  %s --allow-host=127.0.0.1:8080/tcp curl 10.0.1.1:8080\n", argv[0]);
    return 1;
  }
  if (socks_proxy.enabled && resolve_socks_proxy(&socks_proxy) < 0) {
    fprintf(stderr, "Failed to resolve SOCKS proxy %s:%d\n", socks_proxy.host,
            socks_proxy.port);
    return 1;
  }
  /* Shift argv to command */
  argv = &argv[cmd_start];
  argc -= cmd_start;
  /* sp: fd passed into the sandbox; ctl: fd-passing control channel;
   * sync: uid/gid map handshake between parent and inner child. */
  int sp[2], ctl[2], sync[2];
  if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, sp) < 0)
    die("socketpair sp");
  if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ctl) < 0)
    die("socketpair ctl");
  if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, sync) < 0)
    die("socketpair sync");
  char overlay_base[] = "/tmp/.sockpuppet-overlay-XXXXXX";
  if (!mkdtemp(overlay_base))
    die("mkdtemp overlay base");
  uid_t uid = getuid();
  gid_t gid = getgid();
  /* Mark every flow slot unused before the event loop can see them. */
  for (int i = 0; i < MAX_TCP; i++)
    tcp_flows[i].sock = -1;
  for (int i = 0; i < MAX_UDP; i++) {
    udp_flows[i].udp_relay = -1;
    udp_flows[i].udp_staging = -1;
    udp_flows[i].tcp_ctrl = -1;
  }
  /* Create epoll instance for event loop */
  g_epfd = epoll_create1(EPOLL_CLOEXEC);
  if (g_epfd < 0)
    die("epoll_create1");
  if (interactive_stdio) {
    struct sigaction sa;
    if (interactive_parent_setup(&interactive_session) < 0) {
      fprintf(stderr, "Interactive mode requires a usable parent tty and PTY support (%s)\n",
              strerror(errno));
      interactive_close_session(&interactive_session);
      return 1;
    }
    /* Track terminal resizes so they can be forwarded to the child PTY. */
    memset(&sa, 0, sizeof(sa));
    sigemptyset(&sa.sa_mask);
    sa.sa_handler = interactive_handle_sigwinch;
    if (sigaction(SIGWINCH, &sa, NULL) < 0) {
      interactive_close_session(&interactive_session);
      die("sigaction SIGWINCH");
    }
  }
  int stdout_pipe[2] = {-1, -1};
  int stderr_pipe[2] = {-1, -1};
  if (!interactive_stdio) {
    /* Non-interactive: relay child stdout/stderr through non-blocking pipes. */
    if (pipe(stdout_pipe) < 0 || pipe(stderr_pipe) < 0) die("pipe");
    fcntl(stdout_pipe[0], F_SETFL, fcntl(stdout_pipe[0], F_GETFL) | O_NONBLOCK);
    fcntl(stderr_pipe[0], F_SETFL, fcntl(stderr_pipe[0], F_GETFL) | O_NONBLOCK);
  }
  apply_parent_rlimits();
  cgroup_setup_containment();
  pid_t pid = fork();
  if (pid < 0) {
    interactive_close_session(&interactive_session);
    die("fork");
  }
  if (pid == 0) {
    /* ---------- child ---------- */
    close(sp[0]);
    close(ctl[0]);
    close(sync[0]);
    close(g_epfd);
    g_epfd = -1;
    if (interactive_session.host_tty_fd >= 0)
      close(interactive_session.host_tty_fd);
    if (interactive_session.pty_master_fd >= 0)
      close(interactive_session.pty_master_fd);
    if (unshare(CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS | CLONE_NEWIPC |
                CLONE_NEWUTS | CLONE_NEWPID) < 0)
      die("unshare");
    DBG("Created namespaces: user, net, mnt, ipc, uts, pid");
    /* CLONE_NEWPID requires a second fork - the child becomes PID 1 */
    pid_t inner_pid = fork();
    if (inner_pid < 0)
      die("fork (inner)");
    if (inner_pid > 0) {
      /* Intermediate process: wait for inner child and exit with its status */
      int status;
      waitpid(inner_pid, &status, 0);
      _exit(WIFEXITED(status) ? WEXITSTATUS(status) : 1);
    }
    /* ---------- inner child (PID 1 in new namespace) ---------- */
    /* Signal the parent to write our uid/gid maps, then wait for the
     * mapped ids to come back before touching the filesystem. */
    IGNORE_RESULT(write(sync[1], "1", 1));
    IGNORE_RESULT(read(sync[1], &uid, sizeof(uid)));
    IGNORE_RESULT(read(sync[1], &gid, sizeof(gid)));
    close(sync[1]);
    char cwd[PATH_MAX];
    struct fs_sandbox fs_sandbox;
    if (!getcwd(cwd, sizeof(cwd)))
      die("getcwd");
    prepare_fs_sandbox(&fs_sandbox, cwd, overlay_base);
    int sockfd = recv_fd(ctl[1]);
    close(sp[1]);
    int tunfd = tun_create("tun0");
    /* network config inside child netns */
    if_up_netlink("lo");
    if_addr_ptp("tun0", "10.0.0.2", "10.0.0.1");
    if_up("tun0");
    add_default_route("tun0", "10.0.0.1");
    DBG("Network setup: tun0 (10.0.0.2 -> 10.0.0.1), lo up");
    fcntl(tunfd, F_SETFD, FD_CLOEXEC);
    /* Hand the TUN fd to the parent broker; the child keeps no copy. */
    send_fd(ctl[1], tunfd);
    close(tunfd);
    close(sockfd);
    close(ctl[1]);
    enter_fs_sandbox(&fs_sandbox);
    /* Privilege dropping is mandatory */
    drop_caps();
    DBG("Dropped capabilities");
    if (apply_landlock_policy(fs_sandbox.resolved_cwd) < 0)
      die("landlock_restrict_self");
    if (apply_child_seccomp() < 0)
      die("seccomp");
    char **envp = build_sanitized_envp(fs_sandbox.resolved_cwd);
    if (interactive_stdio) {
      interactive_child_setup(&interactive_session);
      if (interactive_session.pty_slave_fd > STDERR_FILENO)
        close(interactive_session.pty_slave_fd);
    } else {
      /* stdout/stderr go to the relay pipes; stdin reads EOF-ish /dev/null. */
      dup2(stdout_pipe[1], STDOUT_FILENO);
      dup2(stderr_pipe[1], STDERR_FILENO);
      int null_fd = open("/dev/null", O_RDONLY);
      if (null_fd >= 0) {
        dup2(null_fd, STDIN_FILENO);
        close(null_fd);
      }
      close(stdout_pipe[0]);
      close(stdout_pipe[1]);
      close(stderr_pipe[0]);
      close(stderr_pipe[1]);
    }
    close_extra_fds_for_exec();
    apply_child_rlimits();
    DBG("Executing: %s", argv[0]);
    execvpe(argv[0], argv, envp);
    die("exec");
  }
  /* ---------- parent ---------- */
  cgroup_move_child_to_payload(pid);
  close(ctl[1]);
  close(sync[1]);
  if (interactive_session.pty_slave_fd >= 0) {
    close(interactive_session.pty_slave_fd);
    interactive_session.pty_slave_fd = -1;
  }
  /* Wait until the inner child has entered its namespaces, then write
   * single-id uid/gid maps (setgroups must be denied first). */
  char tmp;
  IGNORE_RESULT(read(sync[0], &tmp, 1));
  char path[128], map[64];
  snprintf(path, sizeof(path), "/proc/%d/setgroups", pid);
  write_file(path, "deny");
  snprintf(path, sizeof(path), "/proc/%d/uid_map", pid);
  snprintf(map, sizeof(map), "%d %d 1\n", uid, uid);
  write_file(path, map);
  snprintf(path, sizeof(path), "/proc/%d/gid_map", pid);
  snprintf(map, sizeof(map), "%d %d 1\n", gid, gid);
  write_file(path, map);
  IGNORE_RESULT(write(sync[0], &uid, sizeof(uid)));
  IGNORE_RESULT(write(sync[0], &gid, sizeof(gid)));
  send_fd(ctl[0], sp[1]);
  close(sp[1]);
  if (!interactive_stdio) {
    close(stdout_pipe[1]);
    close(stderr_pipe[1]);
  }
  /* Receive the TUN fd the child created inside its netns and run the
   * broker until the child exits and all relays drain. */
  int tunfd = recv_fd(ctl[0]);
  int status =
      event_loop(tunfd, pid, stdout_pipe[0], stderr_pipe[0], &interactive_session);
  interactive_close_session(&interactive_session);
  if (status < 0 && waitpid(pid, &status, 0) < 0)
    status = 1;
  if (rmdir(overlay_base) < 0 && errno != ENOENT)
    perror("rmdir overlay base");
  if (WIFEXITED(status))
    return WEXITSTATUS(status);
  return 1;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment