From 39d8d9fb29b7bb95a7f2e71c3a836556505ba6e2 Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Thu, 21 May 2026 12:22:00 +0200 Subject: [PATCH 1/5] devices/vsock: forward protocol field to make DGRAM+ICMP ping work MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Read the protocol field from TsiProxyCreate (backward compatible — old guests that don't send it get 0, treated as default/UDP). When protocol is IPPROTO_ICMP or IPPROTO_ICMPV6, create a ping socket (SOCK_DGRAM + IPPROTO_ICMP) on the host instead of a plain UDP socket. This enables rootless ping through TSI. Only works for ping implementations using SOCK_DGRAM ping sockets (iputils on Fedora, Ubuntu, etc.). SOCK_RAW-based ping (busybox, alpine) is not supported as TSI only hijacks SOCK_STREAM and SOCK_DGRAM. Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- src/devices/src/virtio/vsock/muxer.rs | 3 +- src/devices/src/virtio/vsock/packet.rs | 34 +++++++++++++++-------- src/devices/src/virtio/vsock/tsi_dgram.rs | 14 ++++++++-- 3 files changed, 36 insertions(+), 15 deletions(-) diff --git a/src/devices/src/virtio/vsock/muxer.rs b/src/devices/src/virtio/vsock/muxer.rs index f4c10247e..f12ce31c5 100644 --- a/src/devices/src/virtio/vsock/muxer.rs +++ b/src/devices/src/virtio/vsock/muxer.rs @@ -316,7 +316,7 @@ impl VsockMuxer { } } defs::SOCK_DGRAM => { - debug!("proxy create dgram"); + debug!("proxy create dgram (protocol={})", req.protocol); let id = ((req.peer_port as u64) << 32) | (defs::TSI_PROXY_PORT as u64); if req.family as i32 == libc::AF_UNIX && !self.tsi_flags.contains(TsiFlags::HIJACK_UNIX) @@ -335,6 +335,7 @@ impl VsockMuxer { self.cid, req.family, req.peer_port, + req.protocol, mem.clone(), queue.clone(), self.rxq.clone(), diff --git a/src/devices/src/virtio/vsock/packet.rs b/src/devices/src/virtio/vsock/packet.rs index 51b3cf1b2..a1fb866e5 100644 --- a/src/devices/src/virtio/vsock/packet.rs +++ b/src/devices/src/virtio/vsock/packet.rs @@ -103,6 +103,7 @@ pub struct TsiProxyCreate { pub peer_port: u32, pub family: u16, pub _type: u16, + pub protocol: u16, } #[repr(C)] @@ -625,19 +626,28 @@ impl VsockPacket { } pub fn read_proxy_create(&self) -> Option { - if self.buf_size >= 6 { - let peer_port: u32 = byte_order::read_le_u32(&self.buf().unwrap()[0..]); - let family: u16 = byte_order::read_le_u16(&self.buf().unwrap()[4..]); - let _type: u16 = byte_order::read_le_u16(&self.buf().unwrap()[6..]); - - Some(TsiProxyCreate { - peer_port, - family, - _type, - }) + let buf = self.buf()?; + if buf.len() < 8 { + return None; + } + + let peer_port: u32 = byte_order::read_le_u32(&buf[0..]); + let family: u16 = byte_order::read_le_u16(&buf[4..]); + let _type: u16 = byte_order::read_le_u16(&buf[6..]); + // Protocol field added for ICMP ping socket support. Old guests + // that don't send it get 0 (= default, same as before). + let protocol: u16 = if buf.len() >= 10 { + byte_order::read_le_u16(&buf[8..]) } else { - None - } + 0 + }; + + Some(TsiProxyCreate { + peer_port, + family, + _type, + protocol, + }) } pub fn read_connect_req(&self) -> Option { diff --git a/src/devices/src/virtio/vsock/tsi_dgram.rs b/src/devices/src/virtio/vsock/tsi_dgram.rs index de0850c37..d7d1b692a 100644 --- a/src/devices/src/virtio/vsock/tsi_dgram.rs +++ b/src/devices/src/virtio/vsock/tsi_dgram.rs @@ -10,7 +10,7 @@ use nix::fcntl::{fcntl, FcntlArg, OFlag}; use nix::sys::socket::UnixAddr; use nix::sys::socket::{ bind, connect, getpeername, recv, send, sendto, socket, AddressFamily, MsgFlags, SockFlag, - SockType, SockaddrIn, SockaddrLike, SockaddrStorage, + SockProtocol, SockType, SockaddrIn, SockaddrLike, SockaddrStorage, }; #[cfg(target_os = "macos")] @@ -48,11 +48,13 @@ pub struct TsiDgramProxy { } impl TsiDgramProxy { + #[allow(clippy::too_many_arguments)] pub fn new( id: u64, cid: u64, family: u16, peer_port: u32, + protocol: u16, mem: GuestMemoryMmap, queue: Arc>, rxq: Arc>, @@ -65,7 +67,15 @@ impl TsiDgramProxy { _ => return Err(ProxyError::InvalidFamily), }; - let fd = socket(family, SockType::Datagram, SockFlag::empty(), None) + // When the guest requests IPPROTO_ICMP (1) or IPPROTO_ICMPV6 (58), + // create a ping socket instead of a plain UDP socket. + let sock_protocol = match protocol as _ { + libc::IPPROTO_ICMP => Some(SockProtocol::Icmp), + libc::IPPROTO_ICMPV6 => Some(SockProtocol::IcmpV6), + _ => None, + }; + + let fd = socket(family, SockType::Datagram, SockFlag::empty(), sock_protocol) .map_err(ProxyError::CreatingSocket)?; // macOS forces us to do this here instead of just using SockFlag::SOCK_NONBLOCK above. From a5b72fb5ed88c94ff8e1ea53316232ade4304718 Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Thu, 21 May 2026 17:34:20 +0200 Subject: [PATCH 2/5] tests: add needs_host_network() to skip namespace isolation Tests that need real network connectivity (e.g. external ping) can return true from needs_host_network() to skip the unshare --net namespace isolation automatically. Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- tests/runner/src/main.rs | 1 + tests/test_cases/src/lib.rs | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/tests/runner/src/main.rs b/tests/runner/src/main.rs index c7b1bbc8d..0e5ace5be 100644 --- a/tests/runner/src/main.rs +++ b/tests/runner/src/main.rs @@ -142,6 +142,7 @@ fn run_single_test( }; let use_buildah_unshare = cfg!(target_os = "linux") && std::env::var_os("KRUN_NO_UNSHARE").is_none() + && !test_case.needs_host_network() && has_cmd("buildah") && has_cmd("unshare"); diff --git a/tests/test_cases/src/lib.rs b/tests/test_cases/src/lib.rs index 83f3b6b14..bac701ea4 100644 --- a/tests/test_cases/src/lib.rs +++ b/tests/test_cases/src/lib.rs @@ -19,6 +19,9 @@ use test_net_perf::TestNetPerf; mod test_multiport_console; use test_multiport_console::TestMultiportConsole; +mod test_tsi_ping; +use test_tsi_ping::TestTsiPing; + mod test_virtiofs_root_ro; use test_virtiofs_root_ro::TestVirtiofsRootRo; @@ -82,6 +85,7 @@ pub fn test_cases() -> Vec { TestCase::new("net-tap", Box::new(TestNet::new_tap())), TestCase::new("net-gvproxy", Box::new(TestNet::new_gvproxy())), TestCase::new("net-vmnet-helper", Box::new(TestNet::new_vmnet_helper())), + TestCase::new("tsi-ping", Box::new(TestTsiPing)), TestCase::new("multiport-console", Box::new(TestMultiportConsole)), TestCase::new("virtiofs-root-ro", Box::new(TestVirtiofsRootRo)), TestCase::new("virtiofs-misc", Box::new(TestVirtioFsMisc)), @@ -221,6 +225,11 @@ pub trait Test { fn timeout_secs(&self) -> u64 { 15 } + + /// Whether this test needs the host's real network (skips unshare --net). + fn needs_host_network(&self) -> bool { + false + } } #[guest] @@ -257,6 +266,11 @@ impl TestCase { self.test.timeout_secs() } + #[host] + pub fn needs_host_network(&self) -> bool { + self.test.needs_host_network() + } + #[allow(dead_code)] pub fn name(&self) -> &'static str { self.name From f60f1526a488922e49dfe3b1a44673460144dc0b Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Thu, 21 May 2026 17:34:26 +0200 Subject: [PATCH 3/5] tests: add tsi-ping test for ICMP through TSI Run Fedora's /usr/bin/ping against 8.8.8.8 from inside the guest to verify SOCK_DGRAM+IPPROTO_ICMP is properly proxied through TSI. Uses needs_host_network() since the test requires real connectivity. Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- tests/test_cases/src/test_tsi_ping.rs | 84 +++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 tests/test_cases/src/test_tsi_ping.rs diff --git a/tests/test_cases/src/test_tsi_ping.rs b/tests/test_cases/src/test_tsi_ping.rs new file mode 100644 index 000000000..76c1338c9 --- /dev/null +++ b/tests/test_cases/src/test_tsi_ping.rs @@ -0,0 +1,84 @@ +use macros::{guest, host}; + +pub struct TestTsiPing; + +#[host] +mod host { + use super::*; + use crate::common::setup_fs_and_enter; + use crate::{krun_call, krun_call_u32}; + use crate::{ShouldRun, Test, TestOutcome, TestSetup}; + use krun_sys::*; + + const CONTAINERFILE: &str = "\ +FROM fedora:44 +RUN dnf install -y iputils && dnf clean all +"; + + impl Test for TestTsiPing { + fn rootfs_image(&self) -> Option<&'static str> { + Some(CONTAINERFILE) + } + + fn should_run(&self) -> ShouldRun { + ShouldRun::Yes + } + + fn timeout_secs(&self) -> u64 { + 30 + } + + fn needs_host_network(&self) -> bool { + true + } + + fn start_vm(self: Box, test_setup: TestSetup) -> anyhow::Result<()> { + unsafe { + krun_call!(krun_set_log_level(KRUN_LOG_LEVEL_TRACE))?; + let ctx = krun_call_u32!(krun_create_ctx())?; + krun_call!(krun_set_vm_config(ctx, 1, 512))?; + setup_fs_and_enter(ctx, test_setup)?; + } + Ok(()) + } + + fn check(self: Box, stdout: Vec, _test_setup: TestSetup) -> TestOutcome { + let output = String::from_utf8(stdout).unwrap_or_default(); + if output == "OK\n" { + TestOutcome::Pass + } else { + TestOutcome::Fail(format!("expected {:?}, got {:?}", "OK\n", output)) + } + } + } +} + +#[guest] +mod guest { + use super::*; + use crate::Test; + use std::process::Command; + + impl Test for TestTsiPing { + fn in_guest(self: Box) { + // Ping an external address so the guest kernel can't satisfy it + // locally — forces the TSI vsock proxy path. Without the + // protocol fix, TSI creates a UDP socket and ping times out. + let output = Command::new("/usr/bin/ping") + .args(["-c", "3", "-W", "2", "8.8.8.8"]) + .output() + .expect("Failed to run ping"); + + if output.status.success() { + println!("OK"); + } else { + let stderr = String::from_utf8_lossy(&output.stderr); + let stdout = String::from_utf8_lossy(&output.stdout); + panic!( + "ping failed (exit={}):\nstdout: {}\nstderr: {}", + output.status, stdout, stderr + ); + } + } + } +} From a40c548c6c68a962e35ab5cefd796d12b93f6536 Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Thu, 21 May 2026 16:00:25 +0200 Subject: [PATCH 4/5] init: enable ping_group_range for ICMP ping sockets Set /proc/sys/net/ipv4/ping_group_range to allow all GIDs when TSI is enabled. Without this, the guest kernel rejects SOCK_DGRAM + IPPROTO_ICMP sockets with EACCES, preventing TSI from hijacking them. Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- init/init.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/init/init.c b/init/init.c index 28a42f881..866c4a899 100644 --- a/init/init.c +++ b/init/init.c @@ -1552,6 +1552,14 @@ int main(int argc, char **argv) if (enable_dummy_interface() < 0) { printf("Warning: Couldn't enable dummy interface\n"); } + + /* Allow unprivileged ICMP ping sockets (SOCK_DGRAM + IPPROTO_ICMP) + * for all GIDs so that TSI can hijack and proxy them to the host. */ + int ping_fd = open("/proc/sys/net/ipv4/ping_group_range", O_WRONLY); + if (ping_fd >= 0) { + write(ping_fd, "0 2147483647\n", 13); + close(ping_fd); + } } #endif From 360ebf2d8cf823cba3b0a03d23d6021bb90843c2 Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Thu, 21 May 2026 17:07:57 +0200 Subject: [PATCH 5/5] devices/vsock: strip IP header from macOS ICMP ping replies macOS DGRAM ICMP sockets include the IP header in recv, unlike Linux which strips it. Detect this and remove the IP header before forwarding to the guest so the guest sees the same format as a Linux ping socket. Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- src/devices/src/virtio/vsock/tsi_dgram.rs | 31 ++++++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/src/devices/src/virtio/vsock/tsi_dgram.rs b/src/devices/src/virtio/vsock/tsi_dgram.rs index d7d1b692a..7e0bfdfc8 100644 --- a/src/devices/src/virtio/vsock/tsi_dgram.rs +++ b/src/devices/src/virtio/vsock/tsi_dgram.rs @@ -38,6 +38,8 @@ pub struct TsiDgramProxy { sendto_addr: Option, listening: bool, family: AddressFamily, + #[cfg_attr(not(target_os = "macos"), allow(dead_code))] + protocol: u16, mem: GuestMemoryMmap, queue: Arc>, rxq: Arc>, @@ -116,6 +118,7 @@ impl TsiDgramProxy { sendto_addr: None, listening: false, family, + protocol, mem, queue, rxq, @@ -180,11 +183,31 @@ impl TsiDgramProxy { match recv(self.fd.as_raw_fd(), &mut buf[..max_len], MsgFlags::empty()) { Ok(cnt) => { debug!("recv cnt={cnt}"); - if cnt > 0 { - RecvPkt::Read(cnt) - } else { - RecvPkt::Close + if cnt == 0 { + return RecvPkt::Close; } + + // macOS DGRAM ICMP sockets include the IP header in + // recv, unlike Linux which strips it. Strip the IP + // header (variable length, from the IHL field) so the + // guest sees the same format as a Linux ping socket. + // buf is the guest's RX virtqueue descriptor — writable. + #[cfg(target_os = "macos")] + if matches!( + self.protocol as _, + libc::IPPROTO_ICMP | libc::IPPROTO_ICMPV6 + ) && cnt >= 20 + { + // IHL (Internet Header Length): low 4 bits of first + // byte, in 32-bit words. Typically 5 (= 20 bytes). + let ip_hdr_len = (buf[0] & 0x0F) as usize * 4; + if ip_hdr_len <= cnt { + buf.copy_within(ip_hdr_len..cnt, 0); + return RecvPkt::Read(cnt - ip_hdr_len); + } + } + + RecvPkt::Read(cnt) } Err(e) => { debug!("recv_pkt: recv error: {e:?}");