diff --git a/init/init.c b/init/init.c
index 28a42f881..866c4a899 100644
--- a/init/init.c
+++ b/init/init.c
@@ -1552,6 +1552,17 @@ int main(int argc, char **argv)
         if (enable_dummy_interface() < 0) {
             printf("Warning: Couldn't enable dummy interface\n");
         }
+
+        /* Allow unprivileged ICMP ping sockets (SOCK_DGRAM + IPPROTO_ICMP)
+         * for all GIDs so that TSI can hijack and proxy them to the host. */
+        int ping_fd = open("/proc/sys/net/ipv4/ping_group_range", O_WRONLY);
+        if (ping_fd >= 0) {
+            static const char ping_range[] = "0 2147483647\n";
+            if (write(ping_fd, ping_range, sizeof(ping_range) - 1) < 0) {
+                printf("Warning: Couldn't set ping_group_range\n");
+            }
+            close(ping_fd);
+        }
     }
 #endif
 
diff --git a/src/devices/src/virtio/vsock/muxer.rs b/src/devices/src/virtio/vsock/muxer.rs
index f4c10247e..f12ce31c5 100644
--- a/src/devices/src/virtio/vsock/muxer.rs
+++ b/src/devices/src/virtio/vsock/muxer.rs
@@ -316,7 +316,7 @@ impl VsockMuxer {
                     }
                 }
                 defs::SOCK_DGRAM => {
-                    debug!("proxy create dgram");
+                    debug!("proxy create dgram (protocol={})", req.protocol);
                     let id = ((req.peer_port as u64) << 32) | (defs::TSI_PROXY_PORT as u64);
                     if req.family as i32 == libc::AF_UNIX
                         && !self.tsi_flags.contains(TsiFlags::HIJACK_UNIX)
@@ -335,6 +335,7 @@ impl VsockMuxer {
                         self.cid,
                         req.family,
                         req.peer_port,
+                        req.protocol,
                         mem.clone(),
                         queue.clone(),
                         self.rxq.clone(),
diff --git a/src/devices/src/virtio/vsock/packet.rs b/src/devices/src/virtio/vsock/packet.rs
index 51b3cf1b2..a1fb866e5 100644
--- a/src/devices/src/virtio/vsock/packet.rs
+++ b/src/devices/src/virtio/vsock/packet.rs
@@ -103,6 +103,7 @@ pub struct TsiProxyCreate {
     pub peer_port: u32,
     pub family: u16,
     pub _type: u16,
+    pub protocol: u16,
 }
 
 #[repr(C)]
@@ -625,19 +626,28 @@ impl VsockPacket {
     }
 
     pub fn read_proxy_create(&self) -> Option {
-        if self.buf_size >= 6 {
-            let peer_port: u32 = byte_order::read_le_u32(&self.buf().unwrap()[0..]);
-            let family: u16 = byte_order::read_le_u16(&self.buf().unwrap()[4..]);
-            let _type: u16 = byte_order::read_le_u16(&self.buf().unwrap()[6..]);
-
-            Some(TsiProxyCreate {
-                peer_port,
-                family,
-                _type,
-            })
+        let buf = self.buf()?;
+        if buf.len() < 8 {
+            return None;
+        }
+
+        let peer_port: u32 = byte_order::read_le_u32(&buf[0..]);
+        let family: u16 = byte_order::read_le_u16(&buf[4..]);
+        let _type: u16 = byte_order::read_le_u16(&buf[6..]);
+        // Protocol field added for ICMP ping socket support. Old guests
+        // that don't send it get 0 (= default, same as before).
+        let protocol: u16 = if buf.len() >= 10 {
+            byte_order::read_le_u16(&buf[8..])
         } else {
-            None
-        }
+            0
+        };
+
+        Some(TsiProxyCreate {
+            peer_port,
+            family,
+            _type,
+            protocol,
+        })
     }
 
     pub fn read_connect_req(&self) -> Option {
diff --git a/src/devices/src/virtio/vsock/tsi_dgram.rs b/src/devices/src/virtio/vsock/tsi_dgram.rs
index de0850c37..7e0bfdfc8 100644
--- a/src/devices/src/virtio/vsock/tsi_dgram.rs
+++ b/src/devices/src/virtio/vsock/tsi_dgram.rs
@@ -10,7 +10,7 @@ use nix::fcntl::{fcntl, FcntlArg, OFlag};
 use nix::sys::socket::UnixAddr;
 use nix::sys::socket::{
     bind, connect, getpeername, recv, send, sendto, socket, AddressFamily, MsgFlags, SockFlag,
-    SockType, SockaddrIn, SockaddrLike, SockaddrStorage,
+    SockProtocol, SockType, SockaddrIn, SockaddrLike, SockaddrStorage,
 };
 
 #[cfg(target_os = "macos")]
@@ -38,6 +38,8 @@ pub struct TsiDgramProxy {
     sendto_addr: Option,
     listening: bool,
     family: AddressFamily,
+    #[cfg_attr(not(target_os = "macos"), allow(dead_code))]
+    protocol: u16,
     mem: GuestMemoryMmap,
     queue: Arc>,
     rxq: Arc>,
@@ -48,11 +50,13 @@ pub struct TsiDgramProxy {
 }
 
 impl TsiDgramProxy {
+    #[allow(clippy::too_many_arguments)]
     pub fn new(
         id: u64,
         cid: u64,
         family: u16,
         peer_port: u32,
+        protocol: u16,
         mem: GuestMemoryMmap,
         queue: Arc>,
         rxq: Arc>,
@@ -65,7 +69,15 @@ impl TsiDgramProxy {
             _ => return Err(ProxyError::InvalidFamily),
         };
 
-        let fd = socket(family, SockType::Datagram, SockFlag::empty(), None)
+        // When the guest requests IPPROTO_ICMP (1) or IPPROTO_ICMPV6 (58),
+        // create a ping socket instead of a plain UDP socket.
+        let sock_protocol = match protocol as _ {
+            libc::IPPROTO_ICMP => Some(SockProtocol::Icmp),
+            libc::IPPROTO_ICMPV6 => Some(SockProtocol::IcmpV6),
+            _ => None,
+        };
+
+        let fd = socket(family, SockType::Datagram, SockFlag::empty(), sock_protocol)
             .map_err(ProxyError::CreatingSocket)?;
 
         // macOS forces us to do this here instead of just using SockFlag::SOCK_NONBLOCK above.
@@ -106,6 +118,7 @@ impl TsiDgramProxy {
             sendto_addr: None,
             listening: false,
             family,
+            protocol,
             mem,
             queue,
             rxq,
@@ -170,11 +183,32 @@ impl TsiDgramProxy {
         match recv(self.fd.as_raw_fd(), &mut buf[..max_len], MsgFlags::empty()) {
             Ok(cnt) => {
                 debug!("recv cnt={cnt}");
-                if cnt > 0 {
-                    RecvPkt::Read(cnt)
-                } else {
-                    RecvPkt::Close
+                if cnt == 0 {
+                    return RecvPkt::Close;
                 }
+
+                // macOS DGRAM ICMP (IPv4) sockets include the IP header in
+                // recv, unlike Linux which strips it. Strip the IP header
+                // (variable length, from the IHL field) so the guest sees
+                // the same format as a Linux ping socket.
+                // ICMPv6 is deliberately left alone: IPv6 headers have no
+                // IHL field, so this arithmetic would cut arbitrary bytes
+                // off the ICMPv6 message.
+                // buf is the guest's RX virtqueue descriptor — writable.
+                #[cfg(target_os = "macos")]
+                if libc::c_int::from(self.protocol) == libc::IPPROTO_ICMP && cnt >= 20 {
+                    // Byte 0 of an IPv4 header: version in the high nibble,
+                    // IHL (Internet Header Length, in 32-bit words) in the
+                    // low nibble. A valid header is at least 5 words (20 bytes).
+                    let version = buf[0] >> 4;
+                    let ip_hdr_len = (buf[0] & 0x0F) as usize * 4;
+                    if version == 4 && (20..=cnt).contains(&ip_hdr_len) {
+                        buf.copy_within(ip_hdr_len..cnt, 0);
+                        return RecvPkt::Read(cnt - ip_hdr_len);
+                    }
+                }
+
+                RecvPkt::Read(cnt)
             }
             Err(e) => {
                 debug!("recv_pkt: recv error: {e:?}");
diff --git a/tests/runner/src/main.rs b/tests/runner/src/main.rs
index c7b1bbc8d..0e5ace5be 100644
--- a/tests/runner/src/main.rs
+++ b/tests/runner/src/main.rs
@@ -142,6 +142,7 @@ fn run_single_test(
     };
     let use_buildah_unshare = cfg!(target_os = "linux")
         && std::env::var_os("KRUN_NO_UNSHARE").is_none()
+        && !test_case.needs_host_network()
         && has_cmd("buildah")
         && has_cmd("unshare");
 
diff --git a/tests/test_cases/src/lib.rs b/tests/test_cases/src/lib.rs
index 83f3b6b14..bac701ea4 100644
--- a/tests/test_cases/src/lib.rs
+++ b/tests/test_cases/src/lib.rs
@@ -19,6 +19,9 @@ use test_net_perf::TestNetPerf;
 mod test_multiport_console;
 use test_multiport_console::TestMultiportConsole;
 
+mod test_tsi_ping;
+use test_tsi_ping::TestTsiPing;
+
 mod test_virtiofs_root_ro;
 use test_virtiofs_root_ro::TestVirtiofsRootRo;
 
@@ -82,6 +85,7 @@ pub fn test_cases() -> Vec {
         TestCase::new("net-tap", Box::new(TestNet::new_tap())),
         TestCase::new("net-gvproxy", Box::new(TestNet::new_gvproxy())),
         TestCase::new("net-vmnet-helper", Box::new(TestNet::new_vmnet_helper())),
+        TestCase::new("tsi-ping", Box::new(TestTsiPing)),
         TestCase::new("multiport-console", Box::new(TestMultiportConsole)),
         TestCase::new("virtiofs-root-ro", Box::new(TestVirtiofsRootRo)),
         TestCase::new("virtiofs-misc", Box::new(TestVirtioFsMisc)),
@@ -221,6 +225,11 @@ pub trait Test {
     fn timeout_secs(&self) -> u64 {
         15
     }
+
+    /// Whether this test needs the host's real network (skips unshare --net).
+    fn needs_host_network(&self) -> bool {
+        false
+    }
 }
 
 #[guest]
@@ -257,6 +266,11 @@ impl TestCase {
         self.test.timeout_secs()
     }
 
+    #[host]
+    pub fn needs_host_network(&self) -> bool {
+        self.test.needs_host_network()
+    }
+
     #[allow(dead_code)]
     pub fn name(&self) -> &'static str {
         self.name
diff --git a/tests/test_cases/src/test_tsi_ping.rs b/tests/test_cases/src/test_tsi_ping.rs
new file mode 100644
index 000000000..76c1338c9
--- /dev/null
+++ b/tests/test_cases/src/test_tsi_ping.rs
@@ -0,0 +1,84 @@
+use macros::{guest, host};
+
+pub struct TestTsiPing;
+
+#[host]
+mod host {
+    use super::*;
+    use crate::common::setup_fs_and_enter;
+    use crate::{krun_call, krun_call_u32};
+    use crate::{ShouldRun, Test, TestOutcome, TestSetup};
+    use krun_sys::*;
+
+    const CONTAINERFILE: &str = "\
+FROM fedora:44
+RUN dnf install -y iputils && dnf clean all
+";
+
+    impl Test for TestTsiPing {
+        fn rootfs_image(&self) -> Option<&'static str> {
+            Some(CONTAINERFILE)
+        }
+
+        fn should_run(&self) -> ShouldRun {
+            ShouldRun::Yes
+        }
+
+        fn timeout_secs(&self) -> u64 {
+            30
+        }
+
+        fn needs_host_network(&self) -> bool {
+            true
+        }
+
+        fn start_vm(self: Box<Self>, test_setup: TestSetup) -> anyhow::Result<()> {
+            unsafe {
+                krun_call!(krun_set_log_level(KRUN_LOG_LEVEL_TRACE))?;
+                let ctx = krun_call_u32!(krun_create_ctx())?;
+                krun_call!(krun_set_vm_config(ctx, 1, 512))?;
+                setup_fs_and_enter(ctx, test_setup)?;
+            }
+            Ok(())
+        }
+
+        fn check(self: Box<Self>, stdout: Vec<u8>, _test_setup: TestSetup) -> TestOutcome {
+            let output = String::from_utf8_lossy(&stdout);
+            if output == "OK\n" {
+                TestOutcome::Pass
+            } else {
+                TestOutcome::Fail(format!("expected {:?}, got {:?}", "OK\n", output))
+            }
+        }
+    }
+}
+
+#[guest]
+mod guest {
+    use super::*;
+    use crate::Test;
+    use std::process::Command;
+
+    impl Test for TestTsiPing {
+        fn in_guest(self: Box<Self>) {
+            // Ping an external address so the guest kernel can't satisfy it
+            // locally — forces the TSI vsock proxy path. Without the
+            // protocol fix, TSI creates a UDP socket and ping times out.
+            let output = Command::new("/usr/bin/ping")
+                .args(["-c", "3", "-W", "2", "8.8.8.8"])
+                .output()
+                .expect("Failed to run ping");
+
+            if output.status.success() {
+                println!("OK");
+            } else {
+                let stderr = String::from_utf8_lossy(&output.stderr);
+                let stdout = String::from_utf8_lossy(&output.stdout);
+                panic!(
+                    "ping failed (exit={}):\nstdout: {}\nstderr: {}",
+                    output.status, stdout, stderr
+                );
+            }
+        }
+    }
+}