From b686794a0b8d237501a21214b9aebcf7e90bb8b6 Mon Sep 17 00:00:00 2001 From: koki-develop Date: Thu, 30 Apr 2026 21:44:41 +0900 Subject: [PATCH] feat: whitelist socket families and block legacy AIO via seccomp Generalize the previous AF_ALG-specific block to a whitelist that allows only AF_UNIX, AF_INET, AF_INET6, and AF_NETLINK. This removes a long tail of niche kernel subsystems (AF_PACKET, AF_VSOCK, AF_BLUETOOTH, AF_TIPC, AF_KEY, AF_NFC, etc.) that have significant historical CVE counts and zero legitimate sandbox use. socketpair() is restricted to AF_UNIX, the only family the kernel itself accepts. Also block the legacy libaio family (io_setup, io_destroy, io_submit, io_getevents, io_cancel, io_pgetevents) with ENOSYS, mirroring the existing io_uring block. No runtime in the image links against libaio. io_pgetevents is referenced by name on x86_64 (kafel's amd64 syscall db has it as 333) and by raw syscall number 292 on arm64 (kafel's aarch64 db is missing this entry). Tests cover representative blocked families (AF_PACKET, AF_VSOCK, AF_TIPC), the AF_NETLINK regression, the socketpair restriction, each legacy AIO syscall, both architectures of io_pgetevents, and a size=4 regression that catches accidental migration to the inline declaration form (which would over-block legitimate calls with non-zero high bits in the family arg). --- e2e/tests/security/seccomp.yml | 494 ++++++++++++++++++++++++- internal/sandbox/configs/seccomp.kafel | 90 ++++- 2 files changed, 565 insertions(+), 19 deletions(-) diff --git a/e2e/tests/security/seccomp.yml b/e2e/tests/security/seccomp.yml index 6178ea0..b58029a 100644 --- a/e2e/tests/security/seccomp.yml +++ b/e2e/tests/security/seccomp.yml @@ -262,10 +262,12 @@ tests: ) func main() { - // family = AF_ALG (38) with non-zero high 32 bits. Linux truncates the - // family arg to int and still routes to AF_ALG, but seccomp sees the - // full 64-bit args[0]. 
The kafel rule must use size=4 (low-word-only) - // comparison so the high bits cannot be used to slip past the filter. + // family = AF_ALG (38) with non-zero high 32 bits. Linux truncates + // the family arg to int (still routes to AF_ALG), but seccomp sees + // the full 64-bit args[0]. Verifies that the whitelist rule blocks + // AF_ALG even when high bits are set — this is the historical bypass + // that motivated using kafel's built-in `family` (size=4) instead of + // the inline declaration form (which would default to size=8). family := uintptr(38) | (uintptr(1) << 32) fd, _, errno := syscall.RawSyscall(syscall.SYS_SOCKET, family, syscall.SOCK_SEQPACKET, 0) if errno == syscall.EPERM { @@ -298,3 +300,487 @@ tests: status: "OK" signal: null duration_ms: "/^[0-9]+$/" + + - name: "AF_UNIX socket with non-zero high 32 bits in family arg is allowed (size=4 regression)" + requests: + - input: + runtime: go + files: + - name: main.go + type: plain + content: | + package main + + import ( + "fmt" + "syscall" + ) + + func main() { + // family = AF_UNIX (1) with non-zero high 32 bits. Kernel truncates + // to int (treats as AF_UNIX) and seccomp must do the same: only the + // low 32 bits matter. With the correct size=4 rule, this call is + // allowed because LOW=1 ∈ {AF_UNIX, AF_INET, AF_INET6, AF_NETLINK}. + // If the rule were rewritten with the inline declaration form + // (size=8), the HIGH_WORD!=0 condition would over-trigger the + // whitelist's negation chain and incorrectly block this legitimate + // AF_UNIX call. This test catches that misconfiguration. 
+ family := uintptr(1) | (uintptr(1) << 32) + fd, _, errno := syscall.RawSyscall(syscall.SYS_SOCKET, family, syscall.SOCK_STREAM, 0) + if errno != 0 { + fmt.Printf("BLOCKED: errno=%d\n", int(errno)) + return + } + syscall.Close(int(fd)) + fmt.Println("OK") + } + output: + status: 200 + body: + compile: + stdout: "" + stderr: "" + output: "" + exit_code: 0 + status: "OK" + signal: null + duration_ms: "/^[0-9]+$/" + run: + stdout: "OK\n" + stderr: "" + output: "OK\n" + exit_code: 0 + status: "OK" + signal: null + duration_ms: "/^[0-9]+$/" + + - name: "AF_PACKET socket creation is blocked by seccomp (go)" + requests: + - input: + runtime: go + files: + - name: main.go + type: plain + content: | + package main + + import ( + "fmt" + "syscall" + ) + + func main() { + // AF_PACKET = 17 — raw packet sockets, kernel attack surface + // (CVE-2016-8655, CVE-2020-14386). + fd, _, errno := syscall.RawSyscall(syscall.SYS_SOCKET, 17, syscall.SOCK_RAW, 0) + if errno == syscall.EPERM { + fmt.Println("EPERM") + return + } + if errno != 0 { + fmt.Printf("UNEXPECTED: errno=%d\n", int(errno)) + return + } + syscall.Close(int(fd)) + fmt.Println("ALLOWED") + } + output: + status: 200 + body: + compile: + stdout: "" + stderr: "" + output: "" + exit_code: 0 + status: "OK" + signal: null + duration_ms: "/^[0-9]+$/" + run: + stdout: "EPERM\n" + stderr: "" + output: "EPERM\n" + exit_code: 0 + status: "OK" + signal: null + duration_ms: "/^[0-9]+$/" + + - name: "AF_VSOCK socket creation is blocked by seccomp (go)" + requests: + - input: + runtime: go + files: + - name: main.go + type: plain + content: | + package main + + import ( + "fmt" + "syscall" + ) + + func main() { + // AF_VSOCK = 40 — VM sockets, recent LPE history + // (CVE-2024-50264, CVE-2025-21756). 
+ fd, _, errno := syscall.RawSyscall(syscall.SYS_SOCKET, 40, syscall.SOCK_STREAM, 0) + if errno == syscall.EPERM { + fmt.Println("EPERM") + return + } + if errno != 0 { + fmt.Printf("UNEXPECTED: errno=%d\n", int(errno)) + return + } + syscall.Close(int(fd)) + fmt.Println("ALLOWED") + } + output: + status: 200 + body: + compile: + stdout: "" + stderr: "" + output: "" + exit_code: 0 + status: "OK" + signal: null + duration_ms: "/^[0-9]+$/" + run: + stdout: "EPERM\n" + stderr: "" + output: "EPERM\n" + exit_code: 0 + status: "OK" + signal: null + duration_ms: "/^[0-9]+$/" + + - name: "AF_TIPC socket creation is blocked by seccomp (go)" + requests: + - input: + runtime: go + files: + - name: main.go + type: plain + content: | + package main + + import ( + "fmt" + "syscall" + ) + + func main() { + // AF_TIPC = 30 — cluster IPC; CVE-2021-43267 was a remote heap + // overflow with CVSS 9.8. + fd, _, errno := syscall.RawSyscall(syscall.SYS_SOCKET, 30, syscall.SOCK_DGRAM, 0) + if errno == syscall.EPERM { + fmt.Println("EPERM") + return + } + if errno != 0 { + fmt.Printf("UNEXPECTED: errno=%d\n", int(errno)) + return + } + syscall.Close(int(fd)) + fmt.Println("ALLOWED") + } + output: + status: 200 + body: + compile: + stdout: "" + stderr: "" + output: "" + exit_code: 0 + status: "OK" + signal: null + duration_ms: "/^[0-9]+$/" + run: + stdout: "EPERM\n" + stderr: "" + output: "EPERM\n" + exit_code: 0 + status: "OK" + signal: null + duration_ms: "/^[0-9]+$/" + + - name: "AF_NETLINK socket creation remains allowed (regression)" + requests: + - input: + runtime: go + files: + - name: main.go + type: plain + content: | + package main + + import ( + "fmt" + "syscall" + ) + + func main() { + // AF_NETLINK = 16 — required by Go's net package for interface + // enumeration. Verify the whitelist still permits it. 
+ fd, _, errno := syscall.RawSyscall(syscall.SYS_SOCKET, 16, syscall.SOCK_RAW, 0) + if errno != 0 { + fmt.Printf("BLOCKED: errno=%d\n", int(errno)) + return + } + syscall.Close(int(fd)) + fmt.Println("OK") + } + output: + status: 200 + body: + compile: + stdout: "" + stderr: "" + output: "" + exit_code: 0 + status: "OK" + signal: null + duration_ms: "/^[0-9]+$/" + run: + stdout: "OK\n" + stderr: "" + output: "OK\n" + exit_code: 0 + status: "OK" + signal: null + duration_ms: "/^[0-9]+$/" + + - name: "socketpair with non-AF_UNIX family is blocked by seccomp (go)" + requests: + - input: + runtime: go + files: + - name: main.go + type: plain + content: | + package main + + import ( + "fmt" + "syscall" + "unsafe" + ) + + func main() { + // socketpair() is only legitimately used with AF_UNIX. The kernel + // itself returns EOPNOTSUPP for other families; seccomp enforces + // the same boundary as a defense-in-depth layer. + var fds [2]int32 + _, _, errno := syscall.RawSyscall6( + syscall.SYS_SOCKETPAIR, + uintptr(syscall.AF_INET), + uintptr(syscall.SOCK_STREAM), + 0, + uintptr(unsafe.Pointer(&fds)), + 0, 0, + ) + if errno == syscall.EPERM { + fmt.Println("EPERM") + return + } + fmt.Printf("UNEXPECTED: errno=%d\n", int(errno)) + } + output: + status: 200 + body: + compile: + stdout: "" + stderr: "" + output: "" + exit_code: 0 + status: "OK" + signal: null + duration_ms: "/^[0-9]+$/" + run: + stdout: "EPERM\n" + stderr: "" + output: "EPERM\n" + exit_code: 0 + status: "OK" + signal: null + duration_ms: "/^[0-9]+$/" + + - name: "io_setup is blocked by seccomp (go)" + requests: + - input: + runtime: go + files: + - name: main.go + type: plain + content: | + package main + + import ( + "fmt" + "syscall" + ) + + func main() { + // Legacy AIO io_setup → ENOSYS (mirrors io_uring_setup handling). 
+ _, _, errno := syscall.RawSyscall(syscall.SYS_IO_SETUP, 1, 0, 0) + if errno == syscall.ENOSYS { + fmt.Println("ENOSYS") + return + } + fmt.Printf("UNEXPECTED: errno=%d\n", int(errno)) + } + output: + status: 200 + body: + compile: + stdout: "" + stderr: "" + output: "" + exit_code: 0 + status: "OK" + signal: null + duration_ms: "/^[0-9]+$/" + run: + stdout: "ENOSYS\n" + stderr: "" + output: "ENOSYS\n" + exit_code: 0 + status: "OK" + signal: null + duration_ms: "/^[0-9]+$/" + + - name: "io_submit is blocked by seccomp (go)" + requests: + - input: + runtime: go + files: + - name: main.go + type: plain + content: | + package main + + import ( + "fmt" + "syscall" + ) + + func main() { + // io_submit dispatches opaque iocb structures into many + // filesystem/block-layer paths (long-tail attack surface). + _, _, errno := syscall.RawSyscall(syscall.SYS_IO_SUBMIT, 0, 0, 0) + if errno == syscall.ENOSYS { + fmt.Println("ENOSYS") + return + } + fmt.Printf("UNEXPECTED: errno=%d\n", int(errno)) + } + output: + status: 200 + body: + compile: + stdout: "" + stderr: "" + output: "" + exit_code: 0 + status: "OK" + signal: null + duration_ms: "/^[0-9]+$/" + run: + stdout: "ENOSYS\n" + stderr: "" + output: "ENOSYS\n" + exit_code: 0 + status: "OK" + signal: null + duration_ms: "/^[0-9]+$/" + + - name: "io_pgetevents is blocked by seccomp (go, x86_64)" + arch: [amd64] + requests: + - input: + runtime: go + files: + - name: main.go + type: plain + content: | + package main + + import ( + "fmt" + "syscall" + ) + + func main() { + // io_pgetevents on x86_64 = syscall 333. Bound by name in kafel. 
+ _, _, errno := syscall.Syscall6(333, 0, 0, 0, 0, 0, 0) + if errno == syscall.ENOSYS { + fmt.Println("ENOSYS") + return + } + fmt.Printf("UNEXPECTED: errno=%d\n", int(errno)) + } + output: + status: 200 + body: + compile: + stdout: "" + stderr: "" + output: "" + exit_code: 0 + status: "OK" + signal: null + duration_ms: "/^[0-9]+$/" + run: + stdout: "ENOSYS\n" + stderr: "" + output: "ENOSYS\n" + exit_code: 0 + status: "OK" + signal: null + duration_ms: "/^[0-9]+$/" + + - name: "io_pgetevents is blocked by seccomp (go, arm64)" + arch: [arm64] + requests: + - input: + runtime: go + files: + - name: main.go + type: plain + content: | + package main + + import ( + "fmt" + "syscall" + ) + + func main() { + // io_pgetevents on arm64 = syscall 292. Bound by raw number in + // kafel because the bundled aarch64 syscall db lacks the name. + _, _, errno := syscall.Syscall6(292, 0, 0, 0, 0, 0, 0) + if errno == syscall.ENOSYS { + fmt.Println("ENOSYS") + return + } + fmt.Printf("UNEXPECTED: errno=%d\n", int(errno)) + } + output: + status: 200 + body: + compile: + stdout: "" + stderr: "" + output: "" + exit_code: 0 + status: "OK" + signal: null + duration_ms: "/^[0-9]+$/" + run: + stdout: "ENOSYS\n" + stderr: "" + output: "ENOSYS\n" + exit_code: 0 + status: "OK" + signal: null + duration_ms: "/^[0-9]+$/" diff --git a/internal/sandbox/configs/seccomp.kafel b/internal/sandbox/configs/seccomp.kafel index 6033fca..3c8dd08 100644 --- a/internal/sandbox/configs/seccomp.kafel +++ b/internal/sandbox/configs/seccomp.kafel @@ -49,24 +49,84 @@ POLICY blocked { clone { (clone_flags & 0x7E020000) != 0 } }, - // -- AF_ALG (Linux Crypto API sockets) -- - // AF_ALG sockets expose the kernel crypto subsystem to userspace via a - // socket/bind/accept/splice chain. The crypto subsystem is global and - // not constrained by network namespaces, so it remains reachable even - // from inside an unprivileged user namespace. This makes AF_ALG a direct - // path to kernel crypto vulnerabilities (e.g. 
CVE-2026-31431) without - // any race condition or kernel offset requirement. - // None of the sandbox runtimes require AF_ALG (38 == AF_ALG). + // -- Socket address family whitelist -- + // Allow only the families our sandbox runtimes actually need: + // AF_UNIX (1) — local IPC + // AF_INET (2) — IPv4 + // AF_INET6 (10) — IPv6 + // AF_NETLINK (16) — kernel/userspace messaging (used by Go's net package + // for interface enumeration and by glibc's + // getaddrinfo() AI_ADDRCONFIG path via check_pf.c, + // which opens PF_NETLINK NETLINK_ROUTE) + // + // All other families expose niche kernel subsystems with significant + // historical CVE counts and zero legitimate sandbox use. Notable removed + // attack surface (a partial list — block applies to every family except + // the four allowed): + // - AF_ALG (38): kernel Crypto API → CVE-2026-31431 (Copy Fail) + // - AF_PACKET (17): raw packets → CVE-2016-8655, CVE-2020-14386 + // - AF_VSOCK (40): useless in container → CVE-2024-50264 (LPE) + // - AF_BLUETOOTH (31): BleedingTooth (CVE-2020-12351 RCE) + // - AF_TIPC (30): cluster IPC → CVE-2021-43267 (RCE, CVSS 9.8) + // - AF_KEY (15): IPsec key mgmt → CVE-2022-1353 + // - AF_NFC (39): NFC stack → CVE-2021-23134 (LPE) + // - AF_KCM (41), AF_RDS (21), AF_CAN (29), AF_MPLS (28), AF_LLC (26), + // AF_RXRPC (33), AF_XDP (44), AF_SMC (43), AF_MCTP (45), AF_PHONET, + // AF_QIPCRTR, AF_IEEE802154, AF_CAIF, AF_PPPOX, AF_IB, AF_IUCV, + // AF_ATMPVC/SVC, AF_AX25, AF_NETROM, AF_ROSE, AF_X25, AF_IPX, + // AF_APPLETALK, AF_BRIDGE, AF_SECURITY, AF_NETBEUI, AF_SNA, AF_ISDN, + // AF_UNSPEC, and any future family up to AF_MAX. + // + // For socketpair(), only AF_UNIX is needed: the kernel returns EOPNOTSUPP + // for every other family except AF_TIPC, which is blocked here as well; + // we enforce this at seccomp for defense-in-depth. + // + // kafel's built-in database names this argument "family" (not "domain"), + // which produces a 32-bit (size=4) comparison against the low word of + // args[0]. 
Using the inline declaration form would default to size=8 and - // generate a 64-bit comparison (HIGH_WORD==0 AND LOW_WORD==38), which can - // be bypassed by passing rdi = 38|(non_zero<<32) — the kernel truncates - // the arg to int and still routes to AF_ALG, but seccomp's HIGH_WORD check - // fails. The built-in "family" (size=4) checks only LOW_WORD and blocks - // any socket() call whose lower 32 bits equal 38, regardless of upper bits. + // args[0]. Using the inline declaration form `socket(family, type, + // protocol) { ... }` would default to size=8 and generate a 64-bit + // comparison (HIGH_WORD==0 AND LOW_WORD==N), which can be bypassed by + // passing rdi = N|(non_zero<<32) — the kernel truncates the arg to int + // and still routes to family N, but seccomp's HIGH_WORD check fails. + // The built-in "family" (size=4) checks only LOW_WORD, blocking any + // socket() call whose lower 32 bits do not match an allowed family, + // regardless of upper bits. ERRNO(1) { - socket { family == 38 } + socket { + family != 1 && family != 2 && family != 10 && family != 16 + }, + socketpair { + family != 1 + } + }, + + // -- Legacy AIO (libaio) syscalls -- + // The old in-kernel AIO interface (io_setup family) accepts opaque iocb + // structures that fs/aio.c::__io_submit_one dispatches into many + // filesystem and block-layer handlers (aio_read, aio_write, aio_fsync, + // aio_poll), forming a long-tail attack surface even though discrete CVE + // counts are low (the interface is rarely used by modern userspace). + // Sandbox runtimes (Node, Ruby, Go, Python, Rust, Bash) all use + // synchronous I/O or epoll; verified by `ldd` on the runtime binaries — + // none link against libaio, and libaio is not installed in the image. + // The newer io_uring family is already blocked above; blocking legacy AIO + // closes the parallel path. ENOSYS lets any optional probing in language + // runtimes fall back gracefully (mirroring the io_uring block strategy). 
+ // + // io_pgetevents is bound by name on x86_64 (kafel's amd64 syscall db has + // it as syscall 333) and by raw syscall number 292 on arm64 (kafel's + // aarch64 db is missing this entry; arm64 inherits asm-generic where + // io_pgetevents = 292). The 32-bit-compat companion io_pgetevents_time64 + // (416) is unreachable from native 64-bit arm64 userspace — the kernel + // returns ENOSYS for it on aarch64 — so no separate block is required. + ERRNO(38) { + io_setup, + io_destroy, + io_submit, + io_getevents, + io_cancel, + io_pgetevents ON x86_64, + SYSCALL[292] ON arm64 }, KILL_PROCESS {