Skip to content

Instantly share code, notes, and snippets.

@amodm
Created July 3, 2024 15:18
Show Gist options
  • Save amodm/a61e6d0c413e8cc9ac4c56a803150daf to your computer and use it in GitHub Desktop.
Save amodm/a61e6d0c413e8cc9ac4c56a803150daf to your computer and use it in GitHub Desktop.
/* See the corresponding blog post for details:
* https://amodm.com/blog/2024/07/03/running-a-linux-router-on-macos
*/
#pragma once
#include <net/if_var.h>
#pragma pack(4)
/*
 * Bridge member request. The Swift side passes a pointer to one of these as
 * ifdrv.ifd_data for the bridge driver-specific ioctls (cmd 0 to add a member,
 * cmd 2 to query a member's flags), with ifbr_ifsname naming the member.
 * NOTE(review): presumably a copy of the kernel's definition in xnu's
 * if_bridgevar.h; field order, sizes and the surrounding #pragma pack(4) are
 * ABI and must not be changed - confirm against the xnu version in use.
 */
struct ifbreq {
char ifbr_ifsname[IFNAMSIZ]; /* member if name */
uint32_t ifbr_ifsflags; /* member if flags */
uint32_t ifbr_stpflags; /* member if STP flags */
uint32_t ifbr_path_cost; /* member if STP cost */
uint8_t ifbr_portno; /* member if port number */
uint8_t ifbr_priority; /* member if STP priority */
uint8_t ifbr_proto; /* member if STP protocol */
uint8_t ifbr_role; /* member if STP role */
uint8_t ifbr_state; /* member if STP state */
uint32_t ifbr_addrcnt; /* member if addr number */
uint32_t ifbr_addrmax; /* member if addr max */
uint32_t ifbr_addrexceeded; /* member if addr violations */
uint8_t pad[32]; /* padding; zeroed by callers via memset of the whole struct */
};
/*
 * Bridge member list request: a caller-supplied buffer (viewable either as
 * raw bytes or as an array of struct ifbreq) plus its size in bytes.
 * The ifbic_buf / ifbic_req macros are shorthands for the union members.
 */
struct ifbifconf {
uint32_t ifbic_len; /* buffer size */
union {
caddr_t ifbicu_buf;
struct ifbreq *ifbicu_req;
#define ifbic_buf ifbic_ifbicu.ifbicu_buf
#define ifbic_req ifbic_ifbicu.ifbicu_req
} ifbic_ifbicu;
};
/*
 * Restore the default packing. Without this the pack(4) requested at the top
 * of this header stays in effect for every header included after it and can
 * silently change the layout of unrelated structs.
 */
#pragma pack()
/* See the corresponding blog post for details:
* https://amodm.com/blog/2024/07/03/running-a-linux-router-on-macos
*/
#pragma once
#include <net/if_var.h>
/* -----------------------------------------------------
* Fake ethernet related headers.
* https://opensource.apple.com/source/xnu/xnu-7195.81.3/bsd/net/if_fake_var.h.auto.html
* -----------------------------------------------------
*/
/*
 * Driver-specific "set" commands for feth interfaces. The value goes into
 * ifdrv.ifd_cmd of a SIOCSDRVSPEC ioctl, with a struct if_fake_request as
 * the ifd_data payload.
 */
enum {
IF_FAKE_S_CMD_NONE = 0,
IF_FAKE_S_CMD_SET_PEER = 1, /* payload: iffr_peer_name names the peer interface */
IF_FAKE_S_CMD_SET_MEDIA = 2, /* presumably uses iffr_media - confirm against xnu */
IF_FAKE_S_CMD_SET_DEQUEUE_STALL = 3, /* presumably uses iffr_dequeue_stall - confirm against xnu */
};
/*
 * Driver-specific "get" commands for feth interfaces, intended for
 * ifdrv.ifd_cmd of a SIOCGDRVSPEC ioctl.
 */
enum {
IF_FAKE_G_CMD_NONE = 0,
IF_FAKE_G_CMD_GET_PEER = 1, /* presumably returns the peer name in iffr_peer_name - confirm against xnu */
};
/* Capacity of if_fake_media.iffm_list. */
#define IF_FAKE_MEDIA_LIST_MAX 27
/*
 * Media description carried inside struct if_fake_request (iffr_media).
 * NOTE(review): field meanings below are inferred from the names; confirm
 * against xnu's if_fake_var.h.
 */
struct if_fake_media {
int32_t iffm_current; /* presumably the currently selected media word */
uint32_t iffm_count; /* presumably the number of valid entries in iffm_list */
uint32_t iffm_reserved[3];
int32_t iffm_list[IF_FAKE_MEDIA_LIST_MAX]; /* up to IF_FAKE_MEDIA_LIST_MAX entries */
};
/*
 * Payload for feth driver-specific ioctls: a reserved header followed by a
 * union whose active member depends on the command. iffru_buf pins the union
 * to 128 bytes so the overall request size does not depend on which member
 * is used. The iffr_* macros are shorthands for the union members.
 */
struct if_fake_request {
uint64_t iffr_reserved[4];
union {
char iffru_buf[128]; /* stable size */
struct if_fake_media iffru_media;
char iffru_peer_name[IFNAMSIZ]; /* if name, e.g. "en0" */
/*
* control dequeue stall. 0: disable dequeue stall, else
* enable dequeue stall.
*/
uint32_t iffru_dequeue_stall;
} iffr_u;
#define iffr_peer_name iffr_u.iffru_peer_name
#define iffr_media iffr_u.iffru_media
#define iffr_dequeue_stall iffr_u.iffru_dequeue_stall
};
// See the corresponding blog post for details:
// https://amodm.com/blog/2024/07/03/running-a-linux-router-on-macos
import Foundation
// xnu is a custom module that I created to expose the relevant C structs
// that the kernel expects, as those structs are not part of the userspace
// API. This module contains C-bridge headers if-fake.h and if-bridge.h
// which are also shown in this gist.
import xnu
struct NetworkInterface {
/// Interface name taken from the `getifaddrs()` record (`ifa_name`).
let name: String
/// Link-layer address; only filled in for `AF_LINK` records, all zeroes otherwise.
let mac: ether_addr_t
/// Textual IPv4/IPv6 address of this record; empty for `AF_LINK` records.
let ips: [String]
/// Interface type value returned by the `IfIoctl.SIOCFIFTYPE` ioctl.
let type: UInt32
/// Interface flags (`ifa_flags`) as reported by `getifaddrs()`.
let flags: UInt32
/// `true` when the interface type equals `IFT_BRIDGE`.
var isBridge: Bool {
    UInt(IFT_BRIDGE) == type
}
/// `true` when `IFF_LOOPBACK` is set in `flags`.
var isLoopback: Bool {
    (flags & UInt32(IFF_LOOPBACK)) != 0
}
/// `true` when the interface name begins with "feth".
var isFakeEth: Bool {
    name.starts(with: "feth") // TODO: figure out type?
}
/// `true` when `IFF_UP` is set in `flags`.
var up: Bool {
    (flags & UInt32(IFF_UP)) != 0
}
/// Brings this interface up or down by delegating to the static
/// `changeStatus(name:up:)` with this interface's name.
/// - Parameter up: `true` to bring the interface up, `false` to bring it down.
/// - Throws: whatever the static variant throws.
func changeStatus(up: Bool) throws {
    try NetworkInterface.changeStatus(name: self.name, up: up)
}
/// - Returns: all network interfaces currently configured on this system.
///
/// One element is produced per `getifaddrs()` record, so an interface appears once
/// for each of its link-layer, IPv4 and IPv6 addresses. Records of any other address
/// family, and records that carry no address at all, are skipped.
/// Terminates the process if `getifaddrs()`, the control socket or the type ioctl fails.
static var all: [NetworkInterface] {
    var ifap: UnsafeMutablePointer<ifaddrs>? = nil
    guard getifaddrs(&ifap) == 0 else {
        fatalError("getifaddrs() failed: \(String(cString: strerror(errno)))")
    }
    defer { freeifaddrs(ifap) }
    var interfaces = [NetworkInterface]()
    try! withControlSocket { ctl in
        for entry in sequence(first: ifap, next: { $0?.pointee.ifa_next }) {
            // ifa_addr may legitimately be NULL; such records have nothing to report
            guard let ifa = entry?.pointee, let sa = ifa.ifa_addr else { continue }
            let ifname = String(cString: ifa.ifa_name)
            let flags = ifa.ifa_flags
            var ips = [String]()
            var mac = ether_addr_t()
            switch Int32(sa.pointee.sa_family) {
            case AF_LINK:
                if let linkAddress = etherAddress(fromLinkSockaddr: sa) {
                    mac = linkAddress
                }
            case AF_INET:
                var addr = sa.withMemoryRebound(to: sockaddr_in.self, capacity: 1) { $0.pointee }
                var ip = [CChar](repeating: 0, count: Int(INET_ADDRSTRLEN))
                inet_ntop(AF_INET, &addr.sin_addr, &ip, socklen_t(INET_ADDRSTRLEN))
                ips.append(String(cString: ip))
            case AF_INET6:
                var addr = sa.withMemoryRebound(to: sockaddr_in6.self, capacity: 1) { $0.pointee }
                var ip = [CChar](repeating: 0, count: Int(INET6_ADDRSTRLEN))
                inet_ntop(AF_INET6, &addr.sin6_addr, &ip, socklen_t(INET6_ADDRSTRLEN))
                ips.append(String(cString: ip))
            default:
                continue
            }
            var ifr = ifreq()
            memset(&ifr, 0, MemoryLayout<ifreq>.size)
            ifname.copyTo(&ifr.ifr_name)
            guard ioctl(ctl, IfIoctl.SIOCFIFTYPE, &ifr) == 0 else {
                fatalError("\(ifname):ioctl(SIOCFIFTYPE): \(String(cString: strerror(errno)))")
            }
            let type = ifr.ifr_ifru.ifru_functional_type
            interfaces.append(NetworkInterface(name: ifname, mac: mac, ips: ips, type: type, flags: flags))
        }
    }
    return interfaces
}
/// Extracts the 6-byte hardware address from an `AF_LINK` sockaddr.
///
/// `sockaddr_dl` is variable-length: the address follows the interface name inside
/// `sdl_data` and may extend past the nominal struct size, so it is read from the
/// original allocation (bounded by `sdl_len`) rather than from a copy of the struct.
/// - Returns: the address, or `nil` if the record does not hold a 6-byte address.
private static func etherAddress(fromLinkSockaddr sa: UnsafeMutablePointer<sockaddr>) -> ether_addr_t? {
    let (nameLen, addrLen, totalLen) = sa.withMemoryRebound(to: sockaddr_dl.self, capacity: 1) {
        (Int($0.pointee.sdl_nlen), Int($0.pointee.sdl_alen), Int($0.pointee.sdl_len))
    }
    let macSize = MemoryLayout<ether_addr_t>.size
    guard addrLen == macSize,
          let dataOffset = MemoryLayout<sockaddr_dl>.offset(of: \.sdl_data),
          dataOffset + nameLen + addrLen <= totalLen else {
        return nil
    }
    var mac = ether_addr_t()
    let source = UnsafeRawPointer(sa).advanced(by: dataOffset + nameLen)
    withUnsafeMutableBytes(of: &mac) { dst in
        dst.copyMemory(from: UnsafeRawBufferPointer(start: source, count: macSize))
    }
    return mac
}
/// Opens a datagram socket to issue interface ioctls on, runs `body` with it and
/// closes the socket afterwards, whether `body` returns or throws.
/// - Parameters:
///   - family: address family of the socket to open (defaults to `AF_LOCAL`).
///   - body: work to perform with the open socket descriptor.
/// - Returns: whatever `body` returns.
/// - Throws: `RVMError.sycallError` if the socket cannot be created, or any error from `body`.
private static func withControlSocket<T>(_ family: Int32 = AF_LOCAL, _ body: (Int32) throws -> T) throws -> T {
    // honour the requested family; it used to be ignored in favour of AF_LOCAL
    let sock = socket(family, SOCK_DGRAM, 0)
    guard sock >= 0 else {
        throw RVMError.sycallError("control:socket()")
    }
    defer { close(sock) }
    return try body(sock)
}
/// Creates a fake eth interface, and peers with `peer` (if provided).
///
/// Picks the first unused name in `feth0 ... feth127`. If peering fails, the freshly
/// created interface is destroyed again (best effort) before the error is thrown.
/// - Parameter peer: the peer to connect to
/// - Returns: the name of the fake eth interface that was created.
/// - Throws: `RVMError.sycallError` if creation or peering fails,
///   `RVMError.illegalState` if every candidate name is already taken.
static func createFakeEth(peer: String? = nil) throws -> String {
    let allFakeEths = Set(all.filter { $0.isFakeEth }.map { $0.name })
    for i in 0..<128 {
        let name = "feth\(i)"
        if allFakeEths.contains(name) {
            continue
        }
        var ifr = ifreq()
        memset(&ifr, 0, MemoryLayout.size(ofValue: ifr))
        name.copyTo(&ifr.ifr_name)
        ifr.ifr_ifru.ifru_flags = Int16(IFF_UP | IFF_RUNNING)
        try withControlSocket { ctl in
            // create
            guard ioctl(ctl, IfIoctl.SIOCIFCREATE2, &ifr) == 0 else {
                throw RVMError.sycallError("feth:create()")
            }
            guard let peer = peer else {
                return
            }
            // from https://opensource.apple.com/source/network_cmds/network_cmds-606.40.2/ifconfig.tproj/iffake.c.auto.html
            var iffr = if_fake_request()
            memset(&iffr, 0, MemoryLayout.size(ofValue: iffr))
            peer.copyTo(&iffr.iffr_u.iffru_peer_name)
            var ifd = ifdrv()
            memset(&ifd, 0, MemoryLayout.size(ofValue: ifd))
            name.copyTo(&ifd.ifd_name)
            ifd.ifd_cmd = UInt(IF_FAKE_S_CMD_SET_PEER)
            ifd.ifd_len = MemoryLayout.size(ofValue: iffr)
            // the pointer to `iffr` is only valid inside this closure, so the ioctl
            // that dereferences it has to run in here as well
            let rc = withUnsafeMutablePointer(to: &iffr) { reqPtr -> Int32 in
                ifd.ifd_data = UnsafeMutableRawPointer(reqPtr)
                return ioctl(ctl, IfIoctl.SIOCSDRVSPEC, &ifd)
            }
            guard rc == 0 else {
                let failure = RVMError.sycallError("feth:ioctl(set-peer)")
                // do not leave an unpeered interface behind
                try? deleteInterface(name)
                throw failure
            }
        }
        return name
    }
    throw RVMError.illegalState("feth:create(): out of options")
}
/// Destroys the named network interface via the `SIOCIFDESTROY` ioctl.
/// - Parameter name: the name of the network interface to delete.
/// - Throws: `RVMError.sycallError` if the control socket or the ioctl fails.
static func deleteInterface(_ name: String) throws {
    var request = ifreq()
    memset(&request, 0, MemoryLayout.size(ofValue: request))
    name.copyTo(&request.ifr_name)
    try withControlSocket { sock in
        if ioctl(sock, IfIoctl.SIOCIFDESTROY, &request) != 0 {
            throw RVMError.sycallError("\(name):ioctl(SIOCIFDESTROY)")
        }
    }
}
/// Creates a pair of fake eth interfaces, peers them together and brings both up.
///
/// If any step after the first creation fails, the interfaces created so far are
/// destroyed again (best effort) so a failed call does not leak `feth` devices.
/// - Returns: the names of the two fake eth interfaces that were created.
/// - Throws: the error of the step that failed.
static func createFakeEthPair() throws -> (String, String) {
    let feth1 = try createFakeEth()
    var created = [feth1]
    do {
        let feth2 = try createFakeEth(peer: feth1)
        created.append(feth2)
        try changeStatus(name: feth1, up: true)
        try changeStatus(name: feth2, up: true)
        return (feth1, feth2)
    } catch {
        for name in created.reversed() {
            try? deleteInterface(name)
        }
        throw error
    }
}
/// Sets or clears `IFF_UP | IFF_RUNNING` on the named interface.
///
/// The current flags are fetched first; the set ioctl is only issued when the
/// requested state differs from the current one.
/// - Parameters:
///   - name: the name of the network interface
///   - up: whether to bring the interface up or down
/// - Throws: an error if the operation fails
static func changeStatus(name: String, up: Bool) throws {
    var request = ifreq()
    memset(&request, 0, MemoryLayout.size(ofValue: request))
    name.copyTo(&request.ifr_name)
    try NetworkInterface.withControlSocket(AF_INET) { sock in
        if ioctl(sock, IfIoctl.SIOCGIFFLAGS, &request) != 0 {
            throw RVMError.sycallError("\(name):ioctl(SIOCGIFFLAGS)")
        }
        let statusBits = Int32(IFF_UP | IFF_RUNNING)
        let current = Int32(request.ifr_ifru.ifru_flags) & 0xffff
        let desired = up ? (current | statusBits) : (current & ~statusBits)
        guard desired != current else {
            return
        }
        request.ifr_ifru.ifru_flags = Int16(bitPattern: UInt16(desired & 0xffff))
        if ioctl(sock, IfIoctl.SIOCSIFFLAGS, &request) < 0 {
            throw RVMError.sycallError("\(name):ioctl(SIOCSIFFLAGS)")
        }
    }
}
/// Adds `ifc` to the network bridge `bridge`.
/// - Parameters:
///   - ifc: the network interface to add to the bridge.
///   - bridge: the network bridge.
/// - Throws: `RVMError.sycallError` if the control socket or the ioctl fails.
static func addInterfaceToBridge(_ ifc: String, to bridge: String) throws {
    var req = ifbreq()
    memset(&req, 0, MemoryLayout.size(ofValue: req))
    ifc.copyTo(&req.ifbr_ifsname)
    var ifd = ifdrv()
    memset(&ifd, 0, MemoryLayout.size(ofValue: ifd))
    bridge.copyTo(&ifd.ifd_name)
    ifd.ifd_cmd = 0 // BRDGADD: https://opensource.apple.com/source/xnu/xnu-7195.81.3/bsd/net/if_bridgevar.h.auto.html
    ifd.ifd_len = MemoryLayout.size(ofValue: req)
    try withControlSocket { ctl in
        // the pointer to `req` must not outlive the closure that vends it, so the
        // ioctl is issued while the pointer is still guaranteed to be valid
        let rc = withUnsafeMutablePointer(to: &req) { reqPtr -> Int32 in
            ifd.ifd_data = UnsafeMutableRawPointer(reqPtr)
            return ioctl(ctl, IfIoctl.SIOCSDRVSPEC, &ifd)
        }
        guard rc == 0 else {
            throw RVMError.sycallError("bridge(\(bridge)):add-if(\(ifc))")
        }
    }
}
/// Ensures that `member` is a member of the `bridge` network interface.
///
/// Queries the member's bridge flags; if the bridge answers `ENOENT` the member is
/// added via `addInterfaceToBridge(_:to:)`.
/// - Parameters:
///   - bridge: the network bridge.
///   - member: the interface that should belong to the bridge.
/// - Returns: `true` if the member was added, `false` if it was already a member.
/// - Throws: `RVMError.sycallError` if the query fails for any reason other than
///   `ENOENT`, or any error from adding the member.
static func ensureBridgeMembership(bridge: String, member: String) throws -> Bool {
    var req = ifbreq()
    memset(&req, 0, MemoryLayout.size(ofValue: req))
    member.copyTo(&req.ifbr_ifsname)
    var ifd = ifdrv()
    memset(&ifd, 0, MemoryLayout.size(ofValue: ifd))
    bridge.copyTo(&ifd.ifd_name)
    ifd.ifd_cmd = 2 // BRDGGIFFLGS: https://opensource.apple.com/source/xnu/xnu-7195.81.3/bsd/net/if_bridgevar.h.auto.html
    ifd.ifd_len = MemoryLayout.size(ofValue: req)
    return try withControlSocket { ctl -> Bool in
        // issue the ioctl while the pointer to `req` is valid, and capture errno
        // immediately so later calls cannot overwrite it
        let (rc, err) = withUnsafeMutablePointer(to: &req) { reqPtr -> (Int32, Int32) in
            ifd.ifd_data = UnsafeMutableRawPointer(reqPtr)
            let rc = ioctl(ctl, IfIoctl.SIOCGDRVSPEC, &ifd)
            return (rc, errno)
        }
        guard rc < 0 else {
            return false
        }
        guard err == ENOENT else {
            throw RVMError.sycallError("bridge(\(bridge)):getifflags(\(member))")
        }
        try addInterfaceToBridge(member, to: bridge)
        return true
    }
}
}
/// Builds an ioctl request number the way the C `_IOC` macro does.
/// - Parameters:
///   - dir: direction bits (`IOC_VOID`, `IOC_IN`, `IOC_OUT` or `IOC_INOUT`).
///   - g: group character; a non-ASCII character contributes 0.
///   - n: command number within the group.
///   - l: parameter length in bytes, masked with `IOCPARM_MASK`.
/// - Returns: the combined request value.
func _IOC(_ dir: UInt32, _ g: Character, _ n: UInt, _ l: Int) -> UInt {
    let direction = UInt(dir)
    let length = (UInt(l) & UInt(IOCPARM_MASK)) << 16
    let group = UInt(g.asciiValue ?? 0) << 8
    return direction | length | group | n
}
/// Request number for an ioctl that carries no parameter (C `_IO`).
func _IO(_ g: Character, _ n: UInt) -> UInt {
    _IOC(IOC_VOID, g, n, 0)
}
/// Request number for an ioctl that copies a `T` in (C `_IOW`).
/// The last argument only selects `T`; its value is not used.
func _IOW<T>(_ group: Character, _ number: UInt, _ payload: T.Type) -> UInt {
    let size = MemoryLayout<T>.size
    return _IOC(IOC_IN, group, number, size)
}
/// Request number for an ioctl that copies a `T` out (C `_IOR`).
/// The last argument only selects `T`; its value is not used.
func _IOR<T>(_ group: Character, _ number: UInt, _ payload: T.Type) -> UInt {
    let size = MemoryLayout<T>.size
    return _IOC(IOC_OUT, group, number, size)
}
/// Request number for an ioctl that copies a `T` in and out (C `_IOWR`).
/// The last argument only selects `T`; its value is not used.
func _IOWR<T>(_ group: Character, _ number: UInt, _ payload: T.Type) -> UInt {
    let size = MemoryLayout<T>.size
    return _IOC(IOC_INOUT, group, number, size)
}
/// Interface ioctl request numbers (group "i"), rebuilt with the `_IOW`/`_IOWR`
/// helpers from the command number and payload type of each request.
/// Used as a namespace only; the enum has no cases.
enum IfIoctl {
// set / get interface flags
static let SIOCSIFFLAGS = _IOW("i", 16, ifreq.self)
static let SIOCGIFFLAGS = _IOWR("i", 17, ifreq.self)
static let SIOCGIFMEDIA = _IOWR("i", 56, ifmediareq.self)
// create / destroy cloned interfaces
static let SIOCIFCREATE = _IOWR("i", 120, ifreq.self)
static let SIOCIFDESTROY = _IOW("i", 121, ifreq.self)
static let SIOCIFCREATE2 = _IOWR("i", 122, ifreq.self)
// driver-specific set / get; both use command number 123 and differ only in direction
static let SIOCSDRVSPEC = _IOW("i", 123, ifdrv.self)
static let SIOCGDRVSPEC = _IOWR("i", 123, ifdrv.self)
// interface type query used by NetworkInterface.all
// NOTE(review): xnu appears to name request 159 SIOCGIFTYPE - confirm the intended request
static let SIOCFIFTYPE = _IOWR("i", 159, ifreq.self)
}
// See the corresponding blog post for details:
// https://amodm.com/blog/2024/07/03/running-a-linux-router-on-macos
import Darwin
import Foundation
import Virtualization
// we poll via kqueues in this thread
final class NetworkSwitch: Thread {
/// Process-wide switch instance.
static var shared = NetworkSwitch()
/// Log facility named "nwswitch", created from the shared file logger.
static var logger: VMLogFacility = {
VMFileLogger.shared.newFacility("nwswitch")
}()
/// Ports served by this switch; appended to by `newBridgePort(hostBridge:vMac:)`.
private var sockDevs: [VSockDev] = []
/// Creates a new port attached to `hostBridge` and returns the attachment to hand
/// to the virtual network device.
/// - Parameters:
///   - hostBridge: name of the host interface (or bridge) the port connects to.
///   - vMac: MAC address of the VM's network device.
/// - Returns: a file-handle attachment wrapping the port's remote socket.
/// - Throws: `RVMError.illegalState` once the switch thread is executing, or any
///   error from creating the port.
func newBridgePort(hostBridge: String, vMac: ether_addr_t) throws -> VZFileHandleNetworkDeviceAttachment {
    guard !isExecuting else {
        throw RVMError.illegalState("cannot add port after switch has started")
    }
    let port = try VSockDev(hostBridge: hostBridge, vMac: vMac)
    sockDevs.append(port)
    let handle = FileHandle(fileDescriptor: port.remoteSocket)
    return VZFileHandleNetworkDeviceAttachment(fileHandle: handle)
}
/// Walks every bridge-backed port and re-adds its bridge-side interface to the host
/// bridge if it is no longer a member. Failures are logged and do not stop the walk.
func ensureBridgeMembership() {
    for port in sockDevs where port.isBridge {
        do {
            let readded = try NetworkInterface.ensureBridgeMembership(bridge: port.hostInterface, member: port.fethBridgeSide)
            if readded {
                NetworkSwitch.logger.info("readded \(port.fethBridgeSide) to bridge \(port.hostInterface)")
            }
        } catch {
            NetworkSwitch.logger.error("\(error)")
        }
    }
}
/// Allocates `capacity` `kevent` records, each initialised to a default value.
/// The memory is not released here; the caller owns the returned pointer.
private static func kqChangeList(_ capacity: Int) -> UnsafeMutablePointer<kevent> {
    let changes = UnsafeMutablePointer<kevent>.allocate(capacity: capacity)
    let blank = kevent()
    changes.initialize(repeating: blank, count: capacity)
    return changes
}
/// Thread entry point: polls all ports through a kqueue until the thread is
/// cancelled, then closes every port exactly once.
///
/// Does nothing when no ports were registered. Terminates the process if the
/// kqueue cannot be created.
override func main() {
    guard !sockDevs.isEmpty else {
        return
    }
    // Single cleanup point. Closing the ports both here and again at the end of the
    // function closed every descriptor twice, which can hit an unrelated descriptor
    // that reused the number in the meantime.
    defer {
        for dev in sockDevs {
            dev.close()
        }
    }
    let kq = kqueue()
    if kq < 0 {
        fatalError("kqueue() failed: \(String(cString: strerror(errno)))")
    }
    defer { close(kq) }
    let kqs = KQSockets(sockDevs)
    while !isCancelled {
        if kqs.onEvent(kq) < 0 {
            // interrupted or nothing ready: just poll again
            if errno == EINTR || errno == EAGAIN {
                continue
            }
            NetworkSwitch.logger.error("onEvent() failed: \(String(cString: strerror(errno)))")
        }
    }
}
/// Cancels the switch thread and suspends until it reports `isFinished`.
/// - Parameter pollTimeNanos: how long to sleep between checks, in nanoseconds.
/// - Throws: whatever `Task.sleep` throws (e.g. when the calling task is cancelled).
func cancelAndJoin(_ pollTimeNanos: UInt64 = 100_000_000) async throws {
    cancel()
    while true {
        if isFinished {
            return
        }
        try await Task.sleep(nanoseconds: pollTimeNanos)
    }
}
}
private struct VSockDev {
/// Name of the host interface or bridge this port was created for.
let hostInterface: String
/// MAC address of the VM-side device; the BPF filter matches frames sent to it.
let vMac: ether_addr_t
/// Switch-side end of the datagram socket pair; VM frames are read from / written to it.
let vmSocket: Int32
/// Other end of the socket pair; wrapped in a `FileHandle` and handed to the VM attachment.
let remoteSocket: Int32
/// BPF descriptor bound to `fethVmSide`; source of host-to-VM frames.
let bpfSocket: Int32
/// `PF_NDRV` raw socket bound to `fethVmSide`; sink for VM-to-host frames.
let ndrvSocket: Int32
/// Size in bytes of the BPF buffer (`BPF_MAXBUFSIZE`).
let bpfBufferSize: Int
/// Scratch buffer of `bpfBufferSize` bytes used for reads in both directions.
let bpfReadBuffer: UnsafeMutableRawBufferPointer
/// BPF program installed on `bpfSocket`.
let bpfFilter: [bpf_insn]
/// Interface that is (re-)added to the host bridge; equals `hostInterface` when not bridged.
let fethBridgeSide: String
/// Interface the BPF and NDRV sockets bind to; equals `hostInterface` when not bridged.
let fethVmSide: String
/// Whether `hostInterface` was found to be a bridge when the port was created.
let isBridge: Bool
/// Current BPF receive/drop counters for this port, or zeroed counters if the
/// `BIOCGSTATS` ioctl fails.
var bpfStats: bpf_stat {
    var counters = bpf_stat()
    guard ioctl(bpfSocket, BpfIoctl.BIOCGSTATS, &counters) == 0 else {
        return bpf_stat(bs_recv: 0, bs_drop: 0)
    }
    return counters
}
/// Sets up one switch port: the socket pair shared with the VM, and the BPF / NDRV
/// sockets on the host side.
///
/// When `hostBridge` is a bridge, a peered pair of fake ethernet interfaces is
/// created and the sockets bind to the VM side of that pair; otherwise they bind to
/// `hostBridge` directly. Terminates the process if a socket cannot be set up.
/// - Parameters:
///   - hostBridge: host interface or bridge to attach to.
///   - vMac: MAC address of the VM's network device; frames for any other unicast
///     destination are filtered out.
/// - Throws: any error from creating the fake ethernet pair.
init(hostBridge: String, vMac: ether_addr_t) throws {
    self.hostInterface = hostBridge
    self.isBridge = NetworkInterface.all.first(where: { $0.name == hostBridge })?.isBridge ?? false
    self.vMac = vMac
    (fethBridgeSide, fethVmSide) = isBridge ? try NetworkInterface.createFakeEthPair() : (hostBridge, hostBridge)
    var fds: [Int32] = [-1, -1]
    guard socketpair(PF_LOCAL, SOCK_DGRAM, 0, &fds) == 0 else {
        fatalError("socketpair() failed: \(String(cString: strerror(errno)))")
    }
    (vmSocket, remoteSocket) = (fds[0], fds[1])
    // set buffer size; SO_SNDBUF / SO_RCVBUF take a C int. Failure is not fatal
    // (the sockets keep their default size) but is worth knowing about.
    var size = Int32(1024 * 1024 * 8)
    let sizeLen = socklen_t(MemoryLayout<Int32>.size)
    for fd in fds {
        for option in [SO_SNDBUF, SO_RCVBUF] {
            if setsockopt(fd, SOL_SOCKET, option, &size, sizeLen) != 0 {
                NetworkSwitch.logger.error("\(hostBridge): setsockopt(\(option)) failed: \(String(cString: strerror(errno)))")
            }
        }
    }
    let bufferSize = Int(BPF_MAXBUFSIZE)
    self.bpfBufferSize = bufferSize
    self.bpfReadBuffer = UnsafeMutableRawBufferPointer.allocate(byteCount: bufferSize, alignment: 16)
    let vmacTop2 = UInt32(vMac.octet.0) << 8 | UInt32(vMac.octet.1)
    let vmacBottom4 = UInt32(vMac.octet.2) << 24 | UInt32(vMac.octet.3) << 16 | UInt32(vMac.octet.4) << 8 | UInt32(vMac.octet.5)
    let filter: [bpf_insn] = [
        // [0] the following 4 statements do `ether dst host <vMac>`
        bpf_insn(code: CUnsignedShort(BPF_LD | BPF_W | BPF_ABS), jt: 0, jf: 0, k: 2), // ld dst[2..<6]
        bpf_insn(code: CUnsignedShort(BPF_JMP | BPF_JEQ | BPF_K), jt: 0, jf: 2, k: vmacBottom4), // == vMac[2..<6] ? next : goto [4]
        bpf_insn(code: CUnsignedShort(BPF_LD | BPF_H | BPF_ABS), jt: 0, jf: 0, k: 0), // ldh dst[0..<2] (msb 2 bytes)
        bpf_insn(code: CUnsignedShort(BPF_JMP | BPF_JEQ | BPF_K), jt: 3, jf: 4, k: vmacTop2), // == vMac[0..<2] ? goto [7] : goto [8]
        // [4] the following 3 statements do `ether dst broadcast`; the accumulator still holds dst[2..<6]
        bpf_insn(code: CUnsignedShort(BPF_JMP | BPF_JEQ | BPF_K), jt: 0, jf: 3, k: 0xffffffff), // == 0xffffffff ? next : goto [8]
        // load the two most significant bytes (offset 0): offset 2 was already checked
        // above, and reloading it let any xx:xx:ff:ff:ff:ff destination through
        bpf_insn(code: CUnsignedShort(BPF_LD | BPF_H | BPF_ABS), jt: 0, jf: 0, k: 0), // ldh dst[0..<2]
        bpf_insn(code: CUnsignedShort(BPF_JMP | BPF_JEQ | BPF_K), jt: 0, jf: 1, k: 0xffff), // == 0xffff ? goto [7] : goto [8]
        // [7] return true (capture max packet size)
        bpf_insn(code: CUnsignedShort(BPF_RET | BPF_K), jt: 0, jf: 0, k: UInt32(bufferSize)),
        // [8] return false
        bpf_insn(code: CUnsignedShort(BPF_RET | BPF_K), jt: 0, jf: 0, k: 0), // ret false
    ]
    self.bpfFilter = filter
    self.ndrvSocket = Self.ndrvSocket(fethVmSide)
    self.bpfSocket = Self.bpfSocket(fethVmSide, bufferSize, filter)
}
/// Dispatches `event` to the matching direction of this port.
/// - Parameter event: kqueue event whose `ident` is the ready descriptor.
/// - Returns: `true` if the descriptor belongs to this port and was handled,
///   `false` otherwise.
func routeTraffic(_ event: kevent64_s) -> Bool {
    switch Int32(event.ident) {
    case vmSocket:
        vmToHost(event)
        return true
    case bpfSocket:
        hostToVM(event)
        return true
    default:
        return false
    }
}
/// Route traffic from host to VM by reading from bpfSocket and writing to vmSocket.
///
/// A single read can return several captured frames, each prefixed by a `bpf_hdr`
/// and padded to `BPF_ALIGNMENT`; every frame is written to `vmSocket` as its own
/// datagram. Walking stops at the first record that does not fit inside the bytes
/// actually read, so a truncated or malformed record is never forwarded.
/// - Parameter event: the kqueue event that triggered the call (not inspected).
func hostToVM(_ event: kevent64_s) {
    let buffer = bpfReadBuffer.baseAddress!
    let len = read(bpfSocket, buffer, bpfBufferSize)
    guard len > 0 else {
        if len == 0 {
            NetworkSwitch.logger.error("\(hostInterface)-h2g: EOF", throttleKey: "h2g-eof")
        } else if errno != EAGAIN && errno != EINTR {
            NetworkSwitch.logger.error("\(hostInterface)-h2g: read() failed: \(String(cString: strerror(errno)))", throttleKey: "h2g-read-fail")
        }
        return
    }
    let endPtr = buffer.advanced(by: len)
    let headerSize = MemoryLayout<bpf_hdr>.size
    var pktPtr = buffer
    // only look at a record whose header lies completely inside the data read
    while pktPtr.advanced(by: headerSize) <= endPtr {
        let hdr = pktPtr.assumingMemoryBound(to: bpf_hdr.self).pointee
        let hdrLen = Int(hdr.bh_hdrlen)
        let capLen = Int(hdr.bh_caplen)
        guard hdrLen > 0 else {
            // a zero header length would never advance the cursor
            NetworkSwitch.logger.error("\(hostInterface)-h2g: malformed bpf header: hdr=\(hdr)", throttleKey: "h2g-bad-hdr")
            break
        }
        let nextPktPtr = pktPtr.advanced(by: hdrLen + capLen)
        if nextPktPtr > endPtr {
            NetworkSwitch.logger.error("\(hostInterface)-h2g: nextPktPtr out of bounds: \(nextPktPtr) > \(endPtr). current pktPtr=\(pktPtr) hdr=\(hdr)", throttleKey: "h2g-next-oob")
            break
        }
        if capLen > 0 {
            let writeLen = write(vmSocket, pktPtr.advanced(by: hdrLen), capLen)
            if writeLen < 0 {
                NetworkSwitch.logger.error("\(hostInterface)-h2g: write() failed: \(String(cString: strerror(errno)))", throttleKey: "h2g-writ-fail")
            } else if writeLen != capLen {
                NetworkSwitch.logger.error("\(hostInterface)-h2g: write() failed: partial write", throttleKey: "h2g-writ-partial")
            }
        }
        pktPtr = nextPktPtr.alignedUp(toMultipleOf: BPF_ALIGNMENT)
    }
}
/// Send traffic from VM to host by reading from vmSocket and writing to ndrv socket.
///
/// Reads at most `event.data` bytes (capped at the scratch buffer size) and forwards
/// each chunk read. Stops on the first read or write problem.
/// - Parameters:
///   - event: the kqueue event; `data` is used as the number of readable bytes.
///   - onlyOne: when `true` (the default) stop after one forwarded chunk.
func vmToHost(_ event: kevent64_s, onlyOne: Bool = true) {
    let budget = min(bpfReadBuffer.count, Int(event.data))
    let scratch = bpfReadBuffer.baseAddress!
    var consumed = 0
    while consumed < budget {
        let received = read(vmSocket, scratch, budget - consumed)
        guard received > 0 else {
            if received == 0 {
                NetworkSwitch.logger.error("\(hostInterface)-g2h: EOF", throttleKey: "g2h-eof")
            } else if errno != EAGAIN && errno != EINTR {
                NetworkSwitch.logger.error("\(hostInterface)-g2h: read() failed: \(String(cString: strerror(errno))): e=\(event)", throttleKey: "g2h-read-fail")
            }
            return
        }
        let sent = write(ndrvSocket, scratch, received)
        guard sent == received else {
            if sent < 0 {
                NetworkSwitch.logger.error("\(hostInterface)-g2h: write() failed: \(String(cString: strerror(errno)))", throttleKey: "g2h-writ-fail")
            } else if errno != EAGAIN && errno != EINTR {
                NetworkSwitch.logger.error("\(hostInterface)-g2h: write() failed: partial write", throttleKey: "g2h-writ-partial")
            }
            return
        }
        consumed += received
        if onlyOne {
            return
        }
    }
}
/// Opens the first available `/dev/bpfN` device (N in 1..<256), binds it to `ifc`
/// and installs `bpfFilter` on it.
///
/// The device is configured with the given buffer size, immediate mode, received
/// frames only, header-complete and promiscuous mode. Any failure terminates the process.
/// - Parameters:
///   - ifc: interface to capture on.
///   - buffSize: BPF buffer size in bytes.
///   - bpfFilter: filter program to install.
/// - Returns: the configured BPF descriptor.
static func bpfSocket(_ ifc: String, _ buffSize: Int, _ bpfFilter: [bpf_insn]) -> Int32 {
    // TODO: modify sysctl debug.bpf_maxbufsize and use that size
    for i in 1..<256 {
        let dev = "/dev/bpf\(i)"
        let fd = open(dev, O_RDONLY)
        guard fd >= 0 else {
            continue
        }
        // these requests are declared with an unsigned int payload, so pass exactly that width
        // set buffer size
        var arg = UInt32(buffSize)
        guard ioctl(fd, BpfIoctl.BIOCSBLEN, &arg) == 0 else {
            fatalError("bpf \(dev) ioctl(BIOCSBLEN) failed for \(ifc): \(String(cString: strerror(errno)))")
        }
        // set immediate mode to true
        arg = 1
        guard ioctl(fd, BpfIoctl.BIOCIMMEDIATE, &arg) == 0 else {
            fatalError("bpf ioctl(BIOCIMMEDIATE) failed for \(ifc): \(String(cString: strerror(errno)))")
        }
        // see only received packets, not generated locally
        arg = 0
        guard ioctl(fd, BpfIoctl.BIOCSSEESENT, &arg) == 0 else {
            fatalError("bpf ioctl(BIOCSSEESENT) failed for \(ifc): \(String(cString: strerror(errno)))")
        }
        // bind to interface
        var ifr = ifreq()
        memset(&ifr, 0, MemoryLayout<ifreq>.size)
        ifc.copyTo(&ifr.ifr_name)
        guard ioctl(fd, BpfIoctl.BIOCSETIF, &ifr) == 0 else {
            fatalError("bpf ioctl(BIOCSETIF) failed for \(ifc): \(String(cString: strerror(errno)))")
        }
        arg = 1
        guard ioctl(fd, BpfIoctl.BIOCSHDRCMPLT, &arg) == 0 else {
            fatalError("bpf ioctl(BIOCSHDRCMPLT) failed for \(ifc): \(String(cString: strerror(errno)))")
        }
        arg = 1
        guard ioctl(fd, BpfIoctl.BIOCPROMISC, &arg) == 0 else {
            fatalError("bpf ioctl(BIOCPROMISC) failed for \(ifc): \(String(cString: strerror(errno)))")
        }
        // set filter; the instructions are lent to the ioctl from a local array
        // instead of a separately allocated buffer that was never freed
        var insns = bpfFilter
        let rc = insns.withUnsafeMutableBufferPointer { buf -> Int32 in
            var program = bpf_program()
            program.bf_len = UInt32(buf.count)
            program.bf_insns = buf.baseAddress
            return ioctl(fd, BpfIoctl.BIOCSETFNR, &program)
        }
        guard rc == 0 else {
            fatalError("bpf ioctl(BIOCSETFNR) failed for \(ifc): \(String(cString: strerror(errno)))")
        }
        return fd
    }
    fatalError("bpf open() failed for \(ifc): \(String(cString: strerror(errno)))")
}
/// Creates a `PF_NDRV` raw socket and binds and connects it to interface `ifc`.
/// Any failure terminates the process.
/// - Parameter ifc: name of the interface to attach to.
/// - Returns: the connected socket descriptor.
static func ndrvSocket(_ ifc: String) -> Int32 {
    let sock = socket(PF_NDRV, SOCK_RAW, 0)
    if sock < 0 {
        fatalError("ndrv socket() failed for \(ifc): \(String(cString: strerror(errno)))")
    }
    // bind to interface
    let addressLen = MemoryLayout<sockaddr_ndrv>.size
    var address = sockaddr_ndrv()
    address.snd_len = UInt8(addressLen)
    address.snd_family = UInt8(AF_NDRV)
    ifc.copyTo(&address.snd_name)
    withUnsafePointer(to: &address) { typed in
        typed.withMemoryRebound(to: sockaddr.self, capacity: 1) { generic in
            guard Darwin.bind(sock, generic, socklen_t(addressLen)) == 0 else {
                fatalError("ndrv bind() failed for \(ifc): \(String(cString: strerror(errno)))")
            }
            guard Darwin.connect(sock, generic, socklen_t(addressLen)) == 0 else {
                fatalError("ndrv connect() failed for \(ifc): \(String(cString: strerror(errno)))")
            }
        }
    }
    return sock
}
/// Closes all four descriptors of this port and, for bridged ports, destroys both
/// fake ethernet interfaces (errors from the destroy calls are ignored).
func close() {
    for fd in [vmSocket, remoteSocket, bpfSocket, ndrvSocket] {
        Darwin.close(fd)
    }
    guard isBridge else {
        return
    }
    for name in [fethBridgeSide, fethVmSide] {
        try? NetworkInterface.deleteInterface(name)
    }
}
}
private struct KQSockets {
/// Change list: two read registrations per port (vmSocket, then bpfSocket).
private let ptr: UnsafeMutablePointer<kevent64_s>
/// Output buffer that `kevent64` fills with triggered events; same capacity as `ptr`.
private let eventsPtr: UnsafeMutablePointer<kevent64_s>
/// Ports whose descriptors are being watched.
private let sockDevs: [VSockDev]
/// Prepares the kqueue change list for `sockDevs`.
///
/// Switches each port's vmSocket and bpfSocket to non-blocking mode and records a
/// read registration for both. Terminates the process if `fcntl` fails.
/// - Parameter sockDevs: the ports to watch.
init(_ sockDevs: [VSockDev]) {
    // read-readiness registration for one descriptor
    func readRegistration(_ fd: Int32) -> kevent64_s {
        return kevent64_s(
            ident: UInt64(fd),
            filter: Int16(EVFILT_READ),
            flags: UInt16(EV_ADD | EV_ENABLE),
            fflags: 0,
            data: 0,
            udata: 0,
            ext: (0, 0)
        )
    }
    self.sockDevs = sockDevs
    let capacity = sockDevs.count * 2
    let changes = UnsafeMutablePointer<kevent64_s>.allocate(capacity: capacity)
    changes.initialize(repeating: kevent64_s(), count: capacity)
    let events = UnsafeMutablePointer<kevent64_s>.allocate(capacity: capacity)
    events.initialize(repeating: kevent64_s(), count: capacity)
    self.ptr = changes
    self.eventsPtr = events
    for (index, dev) in sockDevs.enumerated() {
        guard Foundation.fcntl(dev.vmSocket, F_SETFL, O_NONBLOCK) == 0 else {
            fatalError("fcntl() failed for \(dev.hostInterface) vmSocket: \(String(cString: strerror(errno)))")
        }
        guard Foundation.fcntl(dev.bpfSocket, F_SETFL, O_NONBLOCK) == 0 else {
            fatalError("fcntl() failed for \(dev.hostInterface) bpfSocket: \(String(cString: strerror(errno)))")
        }
        changes[2 * index] = readRegistration(dev.vmSocket)
        changes[2 * index + 1] = readRegistration(dev.bpfSocket)
    }
}
/// Waits up to one second for readable descriptors and forwards traffic for each
/// event that was actually returned.
///
/// Only the first `numEvents` entries of the event buffer are valid after a call;
/// the rest still hold data from earlier calls and must not be processed.
/// - Parameter kq: the kqueue descriptor to poll.
/// - Returns: the result of `kevent64`: the number of events handled, 0 on timeout,
///   or a negative value on failure (with `errno` set by `kevent64`).
func onEvent(_ kq: Int32) -> Int {
    let timeoutMillis: Int = 1000
    var timeout = timespec(tv_sec: timeoutMillis / 1000, tv_nsec: (timeoutMillis % 1000) * 1_000_000)
    let len = sockDevs.count * 2
    let numEvents = Int(kevent64(kq, ptr, Int32(len), eventsPtr, Int32(len), 0, &timeout))
    guard numEvents > 0 else {
        return numEvents
    }
    for i in 0..<numEvents {
        let evt = eventsPtr[i]
        if evt.flags & UInt16(EV_ERROR) != 0 {
            NetworkSwitch.logger.error("evt-error: \(String(cString: strerror(Int32(evt.data))))", throttleKey: "kq-evt-error")
            continue
        }
        guard evt.data > 0 else {
            continue
        }
        let fd = Int32(evt.ident)
        var routed = false
        for dev in sockDevs {
            if dev.vmSocket == fd {
                dev.vmToHost(evt)
                routed = true
                break
            }
            if dev.bpfSocket == fd {
                dev.hostToVM(evt)
                routed = true
                break
            }
        }
        if !routed {
            NetworkSwitch.logger.error("no route found for event: \(evt)", throttleKey: "kq-no-route")
        }
    }
    return numEvents
}
}
/// Alignment (size of an `Int32`) that the BPF read loop rounds each record's end up
/// to in order to find the next record.
private let BPF_ALIGNMENT = MemoryLayout<Int32>.size
/// BPF ioctl request numbers (group "B"), rebuilt with the `_IO*` helpers from the
/// command number and payload type of each request.
/// Used as a namespace only; the enum has no cases.
enum BpfIoctl {
static let BIOCSBLEN = _IOWR("B", 102, CUnsignedInt.self) // set buffer length
static let BIOCPROMISC = _IO("B", 105) // promiscuous mode; carries no payload
static let BIOCSETIF = _IOW("B", 108, ifreq.self) // bind to interface
static let BIOCGSTATS = _IOR("B", 111, bpf_stat.self) // read receive/drop counters
static let BIOCIMMEDIATE = _IOW("B", 112, CUnsignedInt.self) // immediate mode
static let BIOCSHDRCMPLT = _IOW("B", 117, CUnsignedInt.self) // header-complete flag
static let BIOCSSEESENT = _IOW("B", 119, CUnsignedInt.self) // whether locally sent packets are seen
static let BIOCSETFNR = _IOW("B", 126, bpf_program.self) // install filter program
}
@njhsi
Copy link

njhsi commented Dec 9, 2024

thanks for the creative solution! I got to run router things on vm from macos too. would it possible to 'steal' your codes on vm as well? or just make it public as you mentioned:;

@njhsi
Copy link

njhsi commented Dec 9, 2024

I've built your work into mine. not fully work, and I found it need 'sudo' permission to create feth otherwise failed with sycallError("feth:create()") .

my question is, is sudo permission by design? I don't like anything of sudo a must....

@amodm
Copy link
Author

amodm commented Dec 9, 2024

@njhsi glad that you found it useful. The sudo is a macOS requirement, and you're right to want to avoid it, but any manual network interface creation/destruction (in this case feth) will require root permission, and this is a natural consequence of it.

For me, the sudo is acceptable because in my case there's no way to interact with the VM by an untrusted user after it has started. And by design, the network traffic that flows doesn't interact with my code in any way. But you should make your own security assessments based on your individual context.

One way to avoid sudo is to rely on Virtualization.framework's VZBridgedNetworkDeviceAttachment, but that requires an entitlement which is very difficult to get in practice. That was the whole reason why I had to use their VZFileHandleNetworkDeviceAttachment instead.

@njhsi
Copy link

njhsi commented Dec 9, 2024

@amodm thanks for the info.

I could not anyway(yes sudo) to make feth0 as member into bridge1(manually created with ethernet en0 in advance). have to give up for now, maybe there's difference between apple silicon and Intel(I guess you're now on this), maybe check my faulty codes later.. thanks anyway!

@njhsi
Copy link

njhsi commented Dec 9, 2024

@amodm
Copy link
Author

amodm commented Dec 9, 2024

Can you tell me where and what error you're getting? I use both macOS on both Apple Silicon and Intel, so that's unlikely to be an issue. I'll also try to open up my code sooner than later, but I don't have an immediate timeline for it.

@njhsi
Copy link

njhsi commented Dec 10, 2024

( am not familiar with macOS swift coding, did all this by on-job experience and your great blog guiding)
on MBA m1, en0 is the wifi interface, I used en0 to call switch.newBridgePort(hostBridge: "en0", vMac: vmac) so that no feth should be involved(?), just expecting the vm able to access outside other than the MBA host. Built and ran with no error, able to log into vm console. in vm, the eth0( of VZNAT..) was up and working the eth1(I guess it's of VZFileNetw..) was existed but down. in vm I did 'ifconfig eth1 192.168.1.222 up' to bring eth1 up successfully. but not able to ping outside, oops.

would be appreciated if shed light on how to troubleshot in next step.

vm had also VZNATNetworkDeviceAttachment as eth0, without this, vm seemed not start to give me console. I debugged by pausing to find out it's running on switch.onEvent.

. . . //vm preparing ..
network.attachment = VZNATNetworkDeviceAttachment()

let network2 = VZVirtioNetworkDeviceConfiguration()
let bridger = NetworkSwitch()
let vmac = ether_addr_t(octet:(u_char(123),u_char(123),u_char(123),u_char(123),u_char(123),u_char(123)))
do {
    try network2.attachment = bridger.newBridgePort(hostBridge: "en0", vMac: vmac)        //<----------en0 the wifi IF
} catch {
    fatalError("Virtual Machine Config Bridger Error: \(error)")
}
config.networkDevices = [network,network2]                               // <--------------network
. .  . .
let delegate = VMDelegate()
vm.delegate = delegate
vm.start { result in
    switch result {
    case .success:
        NSLog("Virtual Machine Started")
        bridger.start()                                                             //<-----------------switch thread
        NSLog("bridger Started")
    case let .failure(error):
        fatalError("Virtual Machine Start Error: \(error)")
    }
}

dispatchMain()

@amodm
Copy link
Author

amodm commented Dec 10, 2024

You've gone quite deep for a person not familiar with Swift 👍

Given that you're passing en0 (the wifi interface) directly to the VM, I can take a guess at what might be happening, though it's only a guess, and you'll need to check for yourself.

Our VSockDev is doing two things:

  • eth1(linux) → vsockdev → en0(macOS): Packets sent by VM over eth1 are sent out to en0 via the NDRV mechanism.
  • en0(macOS) → vsockdev → eth1(linux): Packets received on en0 are read (via BPF mechanism) and sent out to eth1 if the ethernet address of the received packet matches that of eth1. The if is important!

I doubt there's anything going wrong on the sending path: eth1 → vsockdev → en0:

  1. eth1 → vsockdev: not possible
  2. vsockdev → en0: unlikely. You can try doing a tcpdump -nlei en0 ether host <your-vm-hardware-address> on your host macOS to validate.
  3. en0 → further: unlikely. Pick a machine on your network (not your host macOS), and while pinging it from inside your VM, do a tcpdump -nlei <interface> host <your-vm-ip> on that machine to check if you're receiving packets from the VM's IP. I suspect you would be. Pay attention to the hardware address of those packets. They'll contain either your VM's eth1 mac address, or en0's mac address.

I suspect the real issue is happening on the receiving path: en0 → vsockdev → eth1:

  1. WiFi access point → en0: This could be a potential issue, which we can dig deeper into based on your observations in step 3 previously, and the step 2 below.
  2. en0 → vsockdev: Run tcpdump -nlei en0 host <your-vm-ip> on your host macOS to see if you're receiving replies. If you are, what is the destination hardware address that you see on those packets? That of en0? Or eth1? Note that vsockdev uses BPF with a filter that picks only those packets which are destined for that mac address (see lines 135-149).
  3. vsockdev → eth1: unlikely to be an issue.

@njhsi
Copy link

njhsi commented Dec 11, 2024

@amodm THANK YOU SOOO MUCH!

I made it work by a long morning digging around. my problem, wrong vMac set to NetworkSwitch.shared.newBridgePort(hostBridge: "en0", vMac: vmac). It should be the same as to VZVirtioNetworkDeviceConfiguration.macAddress.

This careless problem was found by your thoughtful troubleshooting post above, using tcpdump to check the traffic to and from. Really appreciate for your creative codes and help!

Happy like a child ~

@amodm
Copy link
Author

amodm commented Dec 11, 2024

Awesome! 👍

@njhsi
Copy link

njhsi commented Dec 12, 2024

@amodm pls lend me a hand when you're not busy..

Another problem with error "en0-h2g: write(vmSocket=3) len=60 failed: Destination address required", what might be the cause?

I made a launchd helper that runs your NetworkSwitch.swift, to provide an UnixDomainSocket such as into a file /tmp/vm.socket. and this socket file would be given to a virtual machine tool such as vfkit which could take it as a virtio-net device. In this way, I could isolate the sudo network things from rootless virtual machine part.

So I created a unix socket by "unixSocket = Darwin.socket(PF_LOCAL, SOCK_DGRAM, 0)", bind it to /tmp/vm.socket, and give this socket to VSockDev.vmSocket. (I deleted VSockDev.remoteSocket as it's not needed now)

Then I got errors in log as below, seems vmToHost is ok, hostToVm is not, as I checked by tcpdump.
2024-12-12 23:22:32.965 virt[2706:284159] en0-g2h: read len=42
2024-12-12 23:22:32.966 virt[2706:284159] en0-h2g: write(vmSocket=3) len=60 failed: Destination address required
2024-12-12 23:22:34.392 virt[2706:284159] en0-g2h: read len=70
2024-12-12 23:22:34.908 virt[2706:284159] en0-g2h: read len=42
2024-12-12 23:22:34.908 virt[2706:284159] en0-h2g: write(vmSocket =3) len=60 failed: Destination address required

$ vfkit --cpus 2 --memory 1024 --bootloader linux,kernel=/tmp/virt/vmlinuz,initrd=/tmp/vmlinuz,cmdline=""console=hvc0 root=/dev/vda"" --device virtio-blk,path=/tmp/virt/vda.img --device virtio-blk,path=/tmp/virt/vdb.img --device virtio-net,unixSocketPath=/tmp/s.socket,mac=c2:6d:fd:60:10:2b --restful-uri tcp://localhost:5122 --device virtio-serial,stdio

Thanks again!

@amodm
Copy link
Author

amodm commented Dec 12, 2024

Instead of doing what you did, bind VSockDev.remoteSocket to unix socket, and let everything else be the same.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment