From fdbbd47a6b442374a54cbd9f1ca12a5307561310 Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Tue, 12 May 2026 16:12:27 +0200 Subject: [PATCH 01/14] virtio/fs: extract init binary blob into its own crate Move the init binary build script and include_bytes!() from the devices crate into a new init-blob crate. The passthrough modules reference the binary as init_blob::INIT_BINARY instead of using include_bytes! directly. Inspired by https://github.com/containers/libkrun/pull/593. Suggested-by: Geoffrey Goodman Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- Cargo.lock | 5 +++++ Cargo.toml | 1 + src/devices/Cargo.toml | 3 ++- src/devices/src/virtio/fs/linux/passthrough.rs | 2 +- src/devices/src/virtio/fs/macos/passthrough.rs | 2 +- src/init-blob/Cargo.toml | 11 +++++++++++ src/{devices => init-blob}/build.rs | 0 src/init-blob/src/lib.rs | 1 + 8 files changed, 22 insertions(+), 3 deletions(-) create mode 100644 src/init-blob/Cargo.toml rename src/{devices => init-blob}/build.rs (100%) create mode 100644 src/init-blob/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index ecb90d195..41ed2cbcc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -527,6 +527,10 @@ dependencies = [ "serde_core", ] +[[package]] +name = "init-blob" +version = "0.1.0-1.18.0" + [[package]] name = "iocuddle" version = "0.1.1" @@ -663,6 +667,7 @@ dependencies = [ "caps", "crossbeam-channel", "imago", + "init-blob", "krun-arch", "krun-display", "krun-hvf", diff --git a/Cargo.toml b/Cargo.toml index 00b06aa00..35b1dbba4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,7 @@ [workspace] members = [ "src/libkrun", + "src/init-blob", "src/input", "src/display", "src/utils", diff --git a/src/devices/Cargo.toml b/src/devices/Cargo.toml index eacb6cc97..df5ec5a58 100644 --- a/src/devices/Cargo.toml +++ b/src/devices/Cargo.toml @@ -3,7 +3,7 @@ name = "krun-devices" version = "0.1.0-1.18.0" authors = ["The libkrun Authors"] edition = "2021" -build = "build.rs" + description = "Virtual device 
emulation for libkrun" license = "Apache-2.0" repository = "https://github.com/containers/libkrun" @@ -37,6 +37,7 @@ vm-memory = { version = "0.17", features = ["backend-mmap"] } zerocopy = { version = "0.8.26", optional = true, features = ["derive"] } krun_display = { package = "krun-display", version = "0.1.0", path = "../display", optional = true, features = ["bindgen_clang_runtime"] } krun_input = { package = "krun-input", version = "0.1.0", path = "../input", features = ["bindgen_clang_runtime"], optional = true } +init-blob = { path = "../init-blob" } arch = { package = "krun-arch", version = "=0.1.0-1.18.0", path = "../arch" } utils = { package = "krun-utils", version = "=0.1.0-1.18.0", path = "../utils" } diff --git a/src/devices/src/virtio/fs/linux/passthrough.rs b/src/devices/src/virtio/fs/linux/passthrough.rs index e5ca21a03..a0c1d6020 100644 --- a/src/devices/src/virtio/fs/linux/passthrough.rs +++ b/src/devices/src/virtio/fs/linux/passthrough.rs @@ -33,7 +33,7 @@ const EMPTY_CSTR: &[u8] = b"\0"; const PROC_CSTR: &[u8] = b"/proc/self/fd\0"; const INIT_CSTR: &[u8] = b"init.krun\0"; -static INIT_BINARY: &[u8] = include_bytes!(env!("KRUN_INIT_BINARY_PATH")); +static INIT_BINARY: &[u8] = init_blob::INIT_BINARY; type Inode = u64; type Handle = u64; diff --git a/src/devices/src/virtio/fs/macos/passthrough.rs b/src/devices/src/virtio/fs/macos/passthrough.rs index 53680bd92..419cd645b 100644 --- a/src/devices/src/virtio/fs/macos/passthrough.rs +++ b/src/devices/src/virtio/fs/macos/passthrough.rs @@ -37,7 +37,7 @@ const SECURITY_CAPABILITY: &[u8] = b"security.capability\0"; const UID_MAX: u32 = u32::MAX - 1; -static INIT_BINARY: &[u8] = include_bytes!(env!("KRUN_INIT_BINARY_PATH")); +static INIT_BINARY: &[u8] = init_blob::INIT_BINARY; type Inode = u64; type Handle = u64; diff --git a/src/init-blob/Cargo.toml b/src/init-blob/Cargo.toml new file mode 100644 index 000000000..7792e2042 --- /dev/null +++ b/src/init-blob/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = 
"init-blob" +version = "0.1.0-1.18.0" +edition = "2021" +description = "Default init binary blob for libkrun guests" +license = "Apache-2.0" +repository = "https://github.com/containers/libkrun" +build = "build.rs" + +[lib] +path = "src/lib.rs" diff --git a/src/devices/build.rs b/src/init-blob/build.rs similarity index 100% rename from src/devices/build.rs rename to src/init-blob/build.rs diff --git a/src/init-blob/src/lib.rs b/src/init-blob/src/lib.rs new file mode 100644 index 000000000..4397da679 --- /dev/null +++ b/src/init-blob/src/lib.rs @@ -0,0 +1 @@ +pub static INIT_BINARY: &[u8] = include_bytes!(env!("KRUN_INIT_BINARY_PATH")); From 66d7e99fd4b46c337ad06a614d026fbebacaedf9 Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Tue, 12 May 2026 16:13:31 +0200 Subject: [PATCH 02/14] virtio/fs: introduce InodeAllocator for shared inode numbering Replace the private next_inode AtomicU64 inside PassthroughFs with a shared InodeAllocator that is passed in at construction. This lets multiple layers (e.g. a future virtual-inode overlay) allocate from the same counter without implicit coordination via reserved ranges. The allocator starts at ROOT_ID + 2, reserving inode 2 for the existing init_inode in PassthroughFs. This reservation is removed in the next commit when init handling moves to AugmentFs. PassthroughFs::new() and PassthroughFsRo::new() now take an Arc<InodeAllocator> parameter. FsWorker::new() creates the allocator and passes it through. 
Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- src/devices/src/virtio/fs/inode_alloc.rs | 30 +++++++++++++++++++ .../src/virtio/fs/linux/passthrough.rs | 9 +++--- .../src/virtio/fs/macos/passthrough.rs | 9 +++--- src/devices/src/virtio/fs/mod.rs | 1 + src/devices/src/virtio/fs/read_only.rs | 5 ++-- src/devices/src/virtio/fs/worker.rs | 12 ++++++-- 6 files changed, 54 insertions(+), 12 deletions(-) create mode 100644 src/devices/src/virtio/fs/inode_alloc.rs diff --git a/src/devices/src/virtio/fs/inode_alloc.rs b/src/devices/src/virtio/fs/inode_alloc.rs new file mode 100644 index 000000000..63e570acd --- /dev/null +++ b/src/devices/src/virtio/fs/inode_alloc.rs @@ -0,0 +1,30 @@ +use std::sync::atomic::{AtomicU64, Ordering}; + +use super::fuse; + +/// Allocates unique FUSE inode numbers. +/// +/// FUSE inode numbers are opaque identifiers with two reserved values: +/// - `0` — invalid / negative-entry cache sentinel (never allocated) +/// - `1` (`ROOT_ID`) — the root directory of the filesystem +/// +/// All other numbers are allocated sequentially starting from `ROOT_ID + 2` +/// (inode 2 is reserved for the legacy init_inode in PassthroughFs until the +/// AugmentFs overlay takes over init handling). +/// The allocator is `Send + Sync` and safe to share across threads. +pub struct InodeAllocator { + next: AtomicU64, +} + +impl InodeAllocator { + pub fn new() -> Self { + Self { + next: AtomicU64::new(fuse::ROOT_ID + 2), + } + } + + /// Allocate the next inode number. Each call returns a unique value. 
+ pub fn next(&self) -> u64 { + self.next.fetch_add(1, Ordering::Relaxed) + } +} diff --git a/src/devices/src/virtio/fs/linux/passthrough.rs b/src/devices/src/virtio/fs/linux/passthrough.rs index a0c1d6020..abda1ce53 100644 --- a/src/devices/src/virtio/fs/linux/passthrough.rs +++ b/src/devices/src/virtio/fs/linux/passthrough.rs @@ -25,6 +25,7 @@ use super::super::filesystem::{ ListxattrReply, OpenOptions, SetattrValid, ZeroCopyReader, ZeroCopyWriter, }; use super::super::fuse; +use super::super::inode_alloc::InodeAllocator; use super::super::multikey::MultikeyBTreeMap; const CURRENT_DIR_CSTR: &[u8] = b".\0"; @@ -358,7 +359,7 @@ pub struct PassthroughFs { // documentation of the `O_PATH` flag in `open(2)` for more details on what one can and cannot // do with an fd opened with this flag. inodes: RwLock>>, - next_inode: AtomicU64, + inode_alloc: Arc, init_inode: u64, // File descriptors for open files and directories. Unlike the fds in `inodes`, these _can_ be @@ -392,7 +393,7 @@ enum FileOrLink { } impl PassthroughFs { - pub fn new(cfg: Config) -> io::Result { + pub fn new(cfg: Config, inode_alloc: Arc) -> io::Result { let fd = if let Some(fd) = cfg.proc_sfd_rawfd { fd } else { @@ -438,7 +439,7 @@ impl PassthroughFs { Ok(PassthroughFs { inodes: RwLock::new(MultikeyBTreeMap::new()), - next_inode: AtomicU64::new(fuse::ROOT_ID + 2), + inode_alloc, init_inode: fuse::ROOT_ID + 1, handles: RwLock::new(BTreeMap::new()), @@ -579,7 +580,7 @@ impl PassthroughFs { // There is a possible race here where 2 threads end up adding the same file // into the inode list. However, since each of those will get a unique Inode // value and unique file descriptors this shouldn't be that much of a problem. 
- let inode = self.next_inode.fetch_add(1, Ordering::Relaxed); + let inode = self.inode_alloc.next(); self.inodes.write().unwrap().insert( inode, InodeAltKey { diff --git a/src/devices/src/virtio/fs/macos/passthrough.rs b/src/devices/src/virtio/fs/macos/passthrough.rs index 419cd645b..3d27aec7f 100644 --- a/src/devices/src/virtio/fs/macos/passthrough.rs +++ b/src/devices/src/virtio/fs/macos/passthrough.rs @@ -29,6 +29,7 @@ use super::super::filesystem::{ ListxattrReply, OpenOptions, SetattrValid, ZeroCopyReader, ZeroCopyWriter, }; use super::super::fuse; +use super::super::inode_alloc::InodeAllocator; use super::super::multikey::MultikeyBTreeMap; const INIT_CSTR: &[u8] = b"init.krun\0"; @@ -543,7 +544,7 @@ impl Default for Config { /// combination of mount namespaces and the pivot_root system call. pub struct PassthroughFs { inodes: RwLock>>, - next_inode: AtomicU64, + inode_alloc: Arc, init_inode: u64, handles: RwLock>>, @@ -560,7 +561,7 @@ pub struct PassthroughFs { } impl PassthroughFs { - pub fn new(cfg: Config) -> io::Result { + pub fn new(cfg: Config, inode_alloc: Arc) -> io::Result { let root = CString::new(cfg.root_dir.as_str()).expect("CString::new failed"); // Safe because this doesn't modify any memory and we check the return value. @@ -579,7 +580,7 @@ impl PassthroughFs { Ok(PassthroughFs { inodes: RwLock::new(MultikeyBTreeMap::new()), - next_inode: AtomicU64::new(fuse::ROOT_ID + 2), + inode_alloc, init_inode: fuse::ROOT_ID + 1, handles: RwLock::new(BTreeMap::new()), @@ -723,7 +724,7 @@ impl PassthroughFs { // There is a possible race here where 2 threads end up adding the same file // into the inode list. However, since each of those will get a unique Inode // value and unique file descriptors this shouldn't be that much of a problem. 
- let inode = self.next_inode.fetch_add(1, Ordering::Relaxed); + let inode = self.inode_alloc.next(); self.inodes.write().unwrap().insert( inode, InodeAltKey { diff --git a/src/devices/src/virtio/fs/mod.rs b/src/devices/src/virtio/fs/mod.rs index 7ce9d48c2..179535131 100644 --- a/src/devices/src/virtio/fs/mod.rs +++ b/src/devices/src/virtio/fs/mod.rs @@ -2,6 +2,7 @@ mod device; #[allow(dead_code)] mod filesystem; pub mod fuse; +mod inode_alloc; #[allow(dead_code)] mod multikey; mod read_only; diff --git a/src/devices/src/virtio/fs/read_only.rs b/src/devices/src/virtio/fs/read_only.rs index e975f2dda..eb8aebef3 100644 --- a/src/devices/src/virtio/fs/read_only.rs +++ b/src/devices/src/virtio/fs/read_only.rs @@ -25,6 +25,7 @@ use super::filesystem::{ OpenOptions, SetattrValid, ZeroCopyReader, ZeroCopyWriter, }; use super::fuse; +use super::inode_alloc::InodeAllocator; use super::passthrough::{self, PassthroughFs}; use crate::virtio::bindings; @@ -60,9 +61,9 @@ pub struct PassthroughFsRo { } impl PassthroughFsRo { - pub fn new(cfg: passthrough::Config) -> io::Result { + pub fn new(cfg: passthrough::Config, inode_alloc: Arc) -> io::Result { Ok(Self { - inner: PassthroughFs::new(cfg)?, + inner: PassthroughFs::new(cfg, inode_alloc)?, }) } } diff --git a/src/devices/src/virtio/fs/worker.rs b/src/devices/src/virtio/fs/worker.rs index c612b3e9b..e554aa377 100644 --- a/src/devices/src/virtio/fs/worker.rs +++ b/src/devices/src/virtio/fs/worker.rs @@ -16,6 +16,7 @@ use vm_memory::GuestMemoryMmap; use super::super::{FsError, Queue}; use super::defs::{HPQ_INDEX, REQ_INDEX}; use super::descriptor_utils::{Reader, Writer}; +use super::inode_alloc::InodeAllocator; use super::passthrough::{self, PassthroughFs}; use super::read_only::PassthroughFsRo; use super::server::Server; @@ -83,10 +84,17 @@ impl FsWorker { exit_code: Arc, #[cfg(target_os = "macos")] map_sender: Option>, ) -> Result { + let inode_alloc = Arc::new(InodeAllocator::new()); let server = if read_only { - 
FsServer::ReadOnly(Server::new(PassthroughFsRo::new(passthrough_cfg)?)) + FsServer::ReadOnly(Server::new(PassthroughFsRo::new( + passthrough_cfg, + inode_alloc, + )?)) } else { - FsServer::ReadWrite(Server::new(PassthroughFs::new(passthrough_cfg)?)) + FsServer::ReadWrite(Server::new(PassthroughFs::new( + passthrough_cfg, + inode_alloc, + )?)) }; Ok(Self { queues, From 614a504f772b31286893947d7a31f8ce16b0effe Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Tue, 12 May 2026 16:15:57 +0200 Subject: [PATCH 03/14] virtio/fs: introduce generic AugmentFs overlay for files like init.krun MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce AugmentFs, a generic overlay that wraps any FileSystem implementation and intercepts FUSE operations for virtual inodes — synthetic read-only files and directories backed by static data. One-shot files can only be looked up once. Remove all init.krun special-case code (init_inode, init_handle, INIT_CSTR) from both the Linux and macOS passthrough implementations. The init.krun virtual file is now configured via VirtualDirEntry in the krun API layer and handled generically by the overlay. FsDeviceConfig carries a Vec<VirtualDirEntry> and FsWorker wraps AugmentFs<PassthroughFs> / AugmentFs<PassthroughFsRo>. The InodeAllocator now starts at ROOT_ID + 1 since the init_inode reservation is no longer needed. 
Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- Cargo.lock | 2 +- src/devices/Cargo.toml | 1 - src/devices/src/virtio/fs/augment_fs.rs | 737 ++++++++++++++++++ src/devices/src/virtio/fs/device.rs | 6 + src/devices/src/virtio/fs/inode_alloc.rs | 6 +- .../src/virtio/fs/linux/passthrough.rs | 77 +- .../src/virtio/fs/macos/passthrough.rs | 49 +- src/devices/src/virtio/fs/mod.rs | 2 + src/devices/src/virtio/fs/virtual_entry.rs | 56 ++ src/devices/src/virtio/fs/worker.rs | 27 +- src/devices/src/virtio/linux_errno.rs | 34 + src/libkrun/Cargo.toml | 1 + src/libkrun/src/lib.rs | 34 +- src/vmm/src/builder.rs | 1 + src/vmm/src/vmm_config/fs.rs | 5 + 15 files changed, 895 insertions(+), 143 deletions(-) create mode 100644 src/devices/src/virtio/fs/augment_fs.rs create mode 100644 src/devices/src/virtio/fs/virtual_entry.rs diff --git a/Cargo.lock b/Cargo.lock index 41ed2cbcc..c0c4dd9b0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -667,7 +667,6 @@ dependencies = [ "caps", "crossbeam-channel", "imago", - "init-blob", "krun-arch", "krun-display", "krun-hvf", @@ -861,6 +860,7 @@ version = "1.18.0" dependencies = [ "crossbeam-channel", "env_logger", + "init-blob", "krun-aws-nitro", "krun-devices", "krun-display", diff --git a/src/devices/Cargo.toml b/src/devices/Cargo.toml index df5ec5a58..1be66e164 100644 --- a/src/devices/Cargo.toml +++ b/src/devices/Cargo.toml @@ -37,7 +37,6 @@ vm-memory = { version = "0.17", features = ["backend-mmap"] } zerocopy = { version = "0.8.26", optional = true, features = ["derive"] } krun_display = { package = "krun-display", version = "0.1.0", path = "../display", optional = true, features = ["bindgen_clang_runtime"] } krun_input = { package = "krun-input", version = "0.1.0", path = "../input", features = ["bindgen_clang_runtime"], optional = true } -init-blob = { path = "../init-blob" } arch = { package = "krun-arch", version = "=0.1.0-1.18.0", path = "../arch" } utils = { package = "krun-utils", version = "=0.1.0-1.18.0", path = 
"../utils" } diff --git a/src/devices/src/virtio/fs/augment_fs.rs b/src/devices/src/virtio/fs/augment_fs.rs new file mode 100644 index 000000000..a694e5b96 --- /dev/null +++ b/src/devices/src/virtio/fs/augment_fs.rs @@ -0,0 +1,737 @@ +// Virtual inode overlay for virtiofs. +// +// `AugmentFs` wraps an inner `FileSystem` implementation and intercepts +// FUSE operations for virtual inodes — synthetic read-only files that exist +// only in memory. All other operations are delegated to the inner filesystem. +// +// Virtual inodes are injected into the root directory (parent = ROOT_ID) and +// are currently only accessible via lookup (they do not appear in readdir). +// +// One-shot files can only be looked up once — the name is removed from the +// directory on first lookup so subsequent lookups return ENOENT. + +#[cfg(target_os = "macos")] +use crossbeam_channel::Sender; +use std::collections::HashMap; +use std::ffi::CStr; +use std::ffi::CString; +use std::io; +use std::mem; +use std::sync::atomic::AtomicI32; +use std::sync::Arc; +use std::sync::RwLock; +use std::time::Duration; + +#[cfg(target_os = "macos")] +use utils::worker_message::WorkerMessage; + +use super::filesystem::{ + Context, DirEntry, Entry, Extensions, FileSystem, FsOptions, GetxattrReply, ListxattrReply, + OpenOptions, SetattrValid, ZeroCopyReader, ZeroCopyWriter, +}; +use super::fuse; +use super::inode_alloc::InodeAllocator; +use super::virtual_entry::{VirtualDirEntry, VirtualEntry, VirtualEntryContent, VIRTUAL_BLKSIZE}; +use crate::virtio::bindings; +use crate::virtio::linux_errno; + +type Inode = u64; +type Handle = u64; + +/// Sentinel handle returned for all virtual file opens. This works because +/// virtual file operations dispatch on inode, not handle — there is no +/// per-open state. If per-fd state is ever needed (e.g. writable virtual +/// files), this must be replaced with a real handle allocator. +const VIRTUAL_HANDLE: Handle = 0; + +/// Persistent virtual entries never change. 
+const VIRTUAL_TIMEOUT: Duration = Duration::MAX; + +/// Overlay that injects virtual inodes into an inner `FileSystem`. +pub struct AugmentFs { + inner: T, + /// Maps (parent_inode, name) → child inode number. One-shot entries + /// are removed on first lookup so the file can only be opened once. + name_to_inode: RwLock>, + /// Maps virtual inode number → (mode, inode data). One-shot entries are + /// removed from this map on release. + inodes: RwLock>, +} + +impl> AugmentFs { + /// Create a new overlay. + /// + /// `entries` are registered as virtual inodes in the root directory. + /// Inode numbers are obtained from `inode_alloc`, the same allocator + /// used by the inner filesystem. + pub fn new(inner: T, inode_alloc: &InodeAllocator, entries: Vec) -> Self { + let mut name_to_inode = HashMap::new(); + let mut inodes = HashMap::new(); + + Self::register_entries( + fuse::ROOT_ID, + entries, + inode_alloc, + &mut name_to_inode, + &mut inodes, + ); + + Self { + inner, + name_to_inode: RwLock::new(name_to_inode), + inodes: RwLock::new(inodes), + } + } + + fn register_entries( + parent: Inode, + entries: Vec, + inode_alloc: &InodeAllocator, + name_to_inode: &mut HashMap<(Inode, CString), Inode>, + inodes: &mut HashMap, + ) { + for entry in entries { + let ino = inode_alloc.next(); + name_to_inode.insert((parent, entry.name), ino); + + // Recurse into directory children before moving the node. 
+ if let VirtualEntryContent::Dir { children } = entry.entry.content { + Self::register_entries(ino, children, inode_alloc, name_to_inode, inodes); + inodes.insert( + ino, + VirtualEntry { + mode: entry.entry.mode, + one_shot: entry.entry.one_shot, + content: VirtualEntryContent::Dir { + children: Vec::new(), + }, + }, + ); + } else { + inodes.insert(ino, entry.entry); + } + } + } + + fn is_virtual(&self, inode: Inode) -> bool { + self.inodes.read().unwrap().contains_key(&inode) + } + + fn virtual_stat(ino: Inode, vnode: &VirtualEntry) -> (bindings::stat64, Duration) { + let mut st: bindings::stat64 = unsafe { mem::zeroed() }; + st.st_ino = ino; + st.st_mode = vnode.st_mode() as _; + st.st_blksize = VIRTUAL_BLKSIZE as _; + let timeout = if vnode.one_shot { + Duration::ZERO + } else { + VIRTUAL_TIMEOUT + }; + match &vnode.content { + VirtualEntryContent::File { data, .. } => { + st.st_size = data.len() as i64; + st.st_nlink = 1; + st.st_blocks = ((data.len() as i64) + 511) / 512; + } + VirtualEntryContent::Dir { .. 
} => { + st.st_nlink = 2; + } + } + (st, timeout) + } +} + +impl> FileSystem for AugmentFs { + type Inode = Inode; + type Handle = Handle; + + fn init(&self, capable: FsOptions) -> io::Result { + self.inner.init(capable) + } + + fn destroy(&self) { + self.inner.destroy() + } + + fn lookup(&self, ctx: Context, parent: Inode, name: &CStr) -> io::Result { + let key = (parent, CString::from(name)); + let inode = self.name_to_inode.read().unwrap().get(&key).copied(); + if let Some(inode) = inode { + let inodes = self.inodes.read().unwrap(); + if let Some(vnode) = inodes.get(&inode) { + let one_shot = vnode.one_shot; + let (st, timeout) = Self::virtual_stat(inode, vnode); + + if one_shot { + drop(inodes); + self.name_to_inode.write().unwrap().remove(&key); + } + + return Ok(Entry { + inode, + generation: 0, + attr: st, + attr_flags: 0, + attr_timeout: timeout, + entry_timeout: timeout, + }); + } + } + self.inner.lookup(ctx, parent, name) + } + + fn forget(&self, ctx: Context, inode: Inode, count: u64) { + if !self.is_virtual(inode) { + self.inner.forget(ctx, inode, count) + } + } + + fn batch_forget(&self, ctx: Context, mut requests: Vec<(Inode, u64)>) { + requests.retain(|(ino, _)| !self.is_virtual(*ino)); + self.inner.batch_forget(ctx, requests); + } + + fn getattr( + &self, + ctx: Context, + inode: Inode, + handle: Option, + ) -> io::Result<(bindings::stat64, Duration)> { + { + let inodes = self.inodes.read().unwrap(); + if let Some(vnode) = inodes.get(&inode) { + return Ok(Self::virtual_stat(inode, vnode)); + } + } + self.inner.getattr(ctx, inode, handle) + } + + fn setattr( + &self, + ctx: Context, + inode: Inode, + attr: bindings::stat64, + handle: Option, + valid: SetattrValid, + ) -> io::Result<(bindings::stat64, Duration)> { + if self.is_virtual(inode) { + return Err(linux_errno::eperm()); + } + self.inner.setattr(ctx, inode, attr, handle, valid) + } + + fn readlink(&self, ctx: Context, inode: Inode) -> io::Result> { + if self.is_virtual(inode) { + return 
Err(linux_errno::einval()); + } + self.inner.readlink(ctx, inode) + } + + fn symlink( + &self, + ctx: Context, + linkname: &CStr, + parent: Inode, + name: &CStr, + extensions: Extensions, + ) -> io::Result { + self.inner.symlink(ctx, linkname, parent, name, extensions) + } + + fn mknod( + &self, + ctx: Context, + inode: Inode, + name: &CStr, + mode: u32, + rdev: u32, + umask: u32, + extensions: Extensions, + ) -> io::Result { + self.inner + .mknod(ctx, inode, name, mode, rdev, umask, extensions) + } + + fn mkdir( + &self, + ctx: Context, + parent: Inode, + name: &CStr, + mode: u32, + umask: u32, + extensions: Extensions, + ) -> io::Result { + let key = (parent, CString::from(name)); + if self.name_to_inode.read().unwrap().contains_key(&key) { + return Err(linux_errno::eexist()); + } + self.inner.mkdir(ctx, parent, name, mode, umask, extensions) + } + + fn unlink(&self, ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> { + self.inner.unlink(ctx, parent, name) + } + + fn rmdir(&self, ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> { + self.inner.rmdir(ctx, parent, name) + } + + fn rename( + &self, + ctx: Context, + olddir: Inode, + oldname: &CStr, + newdir: Inode, + newname: &CStr, + flags: u32, + ) -> io::Result<()> { + self.inner + .rename(ctx, olddir, oldname, newdir, newname, flags) + } + + fn link( + &self, + ctx: Context, + inode: Inode, + newparent: Inode, + newname: &CStr, + ) -> io::Result { + if self.is_virtual(inode) { + return Err(linux_errno::eperm()); + } + self.inner.link(ctx, inode, newparent, newname) + } + + fn open( + &self, + ctx: Context, + inode: Inode, + kill_priv: bool, + flags: u32, + ) -> io::Result<(Option, OpenOptions)> { + { + let inodes = self.inodes.read().unwrap(); + if let Some(vnode) = inodes.get(&inode) { + if vnode.is_dir() { + return Err(linux_errno::eisdir()); + } + if (flags as i32 & libc::O_ACCMODE) != libc::O_RDONLY { + return Err(linux_errno::eacces()); + } + return Ok((Some(VIRTUAL_HANDLE), 
OpenOptions::empty())); + } + } + self.inner.open(ctx, inode, kill_priv, flags) + } + + fn create( + &self, + ctx: Context, + parent: Inode, + name: &CStr, + mode: u32, + kill_priv: bool, + flags: u32, + umask: u32, + extensions: Extensions, + ) -> io::Result<(Entry, Option, OpenOptions)> { + self.inner + .create(ctx, parent, name, mode, kill_priv, flags, umask, extensions) + } + + fn read( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + mut w: W, + size: u32, + offset: u64, + lock_owner: Option, + flags: u32, + ) -> io::Result { + { + let inodes = self.inodes.read().unwrap(); + if let Some(vnode) = inodes.get(&inode) { + let data = vnode.data().ok_or_else(linux_errno::eisdir)?; + let off: usize = offset.try_into().map_err(|_| linux_errno::einval())?; + if off >= data.len() { + return Ok(0); + } + let remaining = data.len() - off; + let len = remaining.min(size as usize); + return w.write(&data[off..(off + len)]); + } + } + self.inner + .read(ctx, inode, handle, w, size, offset, lock_owner, flags) + } + + fn write( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + r: R, + size: u32, + offset: u64, + lock_owner: Option, + delayed_write: bool, + kill_priv: bool, + flags: u32, + ) -> io::Result { + if self.is_virtual(inode) { + return Err(linux_errno::eperm()); + } + self.inner.write( + ctx, + inode, + handle, + r, + size, + offset, + lock_owner, + delayed_write, + kill_priv, + flags, + ) + } + + fn flush(&self, ctx: Context, inode: Inode, handle: Handle, lock_owner: u64) -> io::Result<()> { + if self.is_virtual(inode) { + return Ok(()); + } + self.inner.flush(ctx, inode, handle, lock_owner) + } + + fn fsync(&self, ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()> { + if self.is_virtual(inode) { + return Ok(()); + } + self.inner.fsync(ctx, inode, datasync, handle) + } + + fn fallocate( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + mode: u32, + offset: u64, + length: u64, + ) -> io::Result<()> { + if 
self.is_virtual(inode) { + return Err(linux_errno::eperm()); + } + self.inner + .fallocate(ctx, inode, handle, mode, offset, length) + } + + fn release( + &self, + ctx: Context, + inode: Inode, + flags: u32, + handle: Handle, + flush: bool, + flock_release: bool, + lock_owner: Option, + ) -> io::Result<()> { + { + let mut inodes = self.inodes.write().unwrap(); + if let Some(vnode) = inodes.get(&inode) { + if vnode.one_shot { + inodes.remove(&inode); + } + return Ok(()); + } + } + self.inner + .release(ctx, inode, flags, handle, flush, flock_release, lock_owner) + } + + fn statfs(&self, ctx: Context, inode: Inode) -> io::Result { + self.inner.statfs(ctx, inode) + } + + fn getxattr( + &self, + ctx: Context, + inode: Inode, + name: &CStr, + size: u32, + ) -> io::Result { + if self.is_virtual(inode) { + return Err(linux_errno::enodata()); + } + self.inner.getxattr(ctx, inode, name, size) + } + + fn listxattr(&self, ctx: Context, inode: Inode, size: u32) -> io::Result { + if self.is_virtual(inode) { + if size == 0 { + return Ok(ListxattrReply::Count(0)); + } + return Ok(ListxattrReply::Names(Vec::new())); + } + self.inner.listxattr(ctx, inode, size) + } + + fn setxattr( + &self, + ctx: Context, + inode: Inode, + name: &CStr, + value: &[u8], + flags: u32, + ) -> io::Result<()> { + if self.is_virtual(inode) { + return Err(linux_errno::eperm()); + } + self.inner.setxattr(ctx, inode, name, value, flags) + } + + fn removexattr(&self, ctx: Context, inode: Inode, name: &CStr) -> io::Result<()> { + if self.is_virtual(inode) { + return Err(linux_errno::eperm()); + } + self.inner.removexattr(ctx, inode, name) + } + + fn opendir( + &self, + ctx: Context, + inode: Inode, + flags: u32, + ) -> io::Result<(Option, OpenOptions)> { + self.inner.opendir(ctx, inode, flags) + } + + fn readdir( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + size: u32, + offset: u64, + add_entry: F, + ) -> io::Result<()> + where + F: FnMut(DirEntry) -> io::Result, + { + self.inner + 
.readdir(ctx, inode, handle, size, offset, add_entry) + } + + fn readdirplus( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + size: u32, + offset: u64, + add_entry: F, + ) -> io::Result<()> + where + F: FnMut(DirEntry, Entry) -> io::Result, + { + self.inner + .readdirplus(ctx, inode, handle, size, offset, add_entry) + } + + fn fsyncdir( + &self, + ctx: Context, + inode: Inode, + datasync: bool, + handle: Handle, + ) -> io::Result<()> { + self.inner.fsyncdir(ctx, inode, datasync, handle) + } + + fn releasedir(&self, ctx: Context, inode: Inode, flags: u32, handle: Handle) -> io::Result<()> { + self.inner.releasedir(ctx, inode, flags, handle) + } + + fn access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()> { + if self.is_virtual(inode) { + if mask & (libc::W_OK as u32) != 0 { + return Err(linux_errno::eacces()); + } + return Ok(()); + } + self.inner.access(ctx, inode, mask) + } + + fn lseek( + &self, + ctx: Context, + inode: Inode, + _handle: Handle, + offset: u64, + whence: u32, + ) -> io::Result { + { + let inodes = self.inodes.read().unwrap(); + if let Some(vnode) = inodes.get(&inode) { + let size = vnode.data().ok_or_else(linux_errno::eisdir)?.len() as u64; + // FUSE lseek is only called for SEEK_DATA/SEEK_HOLE. + return match whence as i32 { + libc::SEEK_DATA => { + if offset < size { + Ok(offset) + } else { + Err(linux_errno::enxio()) + } + } + libc::SEEK_HOLE => { + if offset < size { + Ok(size) + } else { + Err(linux_errno::enxio()) + } + } + _ => Err(linux_errno::einval()), + }; + } + } + self.inner.lseek(ctx, inode, _handle, offset, whence) + } + + fn copyfilerange( + &self, + ctx: Context, + inode_in: Inode, + handle_in: Handle, + offset_in: u64, + inode_out: Inode, + handle_out: Handle, + offset_out: u64, + len: u64, + flags: u64, + ) -> io::Result { + // Virtual inodes don't have real file descriptors, so copy_file_range + // cannot work. Return EXDEV to tell the kernel to fall back to + // read+write. 
+ if self.is_virtual(inode_in) || self.is_virtual(inode_out) { + return Err(linux_errno::exdev()); + } + self.inner.copyfilerange( + ctx, inode_in, handle_in, offset_in, inode_out, handle_out, offset_out, len, flags, + ) + } + + fn setupmapping( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + foffset: u64, + len: u64, + flags: u64, + moffset: u64, + host_shm_base: u64, + shm_size: u64, + #[cfg(target_os = "macos")] map_sender: &Option>, + ) -> io::Result<()> { + { + let inodes = self.inodes.read().unwrap(); + if let Some(vnode) = inodes.get(&inode) { + let data = vnode.data().ok_or_else(linux_errno::eisdir)?; + #[cfg(target_os = "linux")] + { + if (moffset + len) > shm_size { + return Err(linux_errno::einval()); + } + + let addr = host_shm_base + moffset; + let ret = unsafe { + libc::mmap( + addr as *mut libc::c_void, + len as usize, + libc::PROT_READ | libc::PROT_WRITE, + libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_FIXED, + -1, + 0, + ) + }; + if std::ptr::eq(ret, libc::MAP_FAILED) { + return Err(io::Error::last_os_error()); + } + + let foff = foffset as usize; + if foff < data.len() { + let available = data.len() - foff; + let to_copy = (len as usize).min(available); + unsafe { + libc::memcpy( + addr as *mut libc::c_void, + data.as_ptr().add(foff) as *const _, + to_copy, + ) + }; + } + + return Ok(()); + } + + // TODO: implement DAX for virtual files on macOS. + // Needs a shared memory region manager (see setupmapping + // in macos/passthrough.rs for the real-file DAX path). 
+ #[cfg(target_os = "macos")] + { + let _ = data; + return Err(linux_errno::enosys()); + } + } + } + self.inner.setupmapping( + ctx, + inode, + handle, + foffset, + len, + flags, + moffset, + host_shm_base, + shm_size, + #[cfg(target_os = "macos")] + map_sender, + ) + } + + fn removemapping( + &self, + ctx: Context, + requests: Vec, + host_shm_base: u64, + shm_size: u64, + #[cfg(target_os = "macos")] map_sender: &Option>, + ) -> io::Result<()> { + self.inner.removemapping( + ctx, + requests, + host_shm_base, + shm_size, + #[cfg(target_os = "macos")] + map_sender, + ) + } + + fn ioctl( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + flags: u32, + cmd: u32, + arg: u64, + in_size: u32, + out_size: u32, + exit_code: &Arc, + ) -> io::Result> { + self.inner.ioctl( + ctx, inode, handle, flags, cmd, arg, in_size, out_size, exit_code, + ) + } +} diff --git a/src/devices/src/virtio/fs/device.rs b/src/devices/src/virtio/fs/device.rs index bc877bc24..945f8393e 100644 --- a/src/devices/src/virtio/fs/device.rs +++ b/src/devices/src/virtio/fs/device.rs @@ -17,6 +17,7 @@ use super::super::{ VirtioShmRegion, }; use super::passthrough; +use super::virtual_entry::VirtualDirEntry; use super::worker::FsWorker; use super::ExportTable; use super::{defs, defs::uapi}; @@ -48,6 +49,7 @@ pub struct Fs { shm_region: Option, passthrough_cfg: passthrough::Config, read_only: bool, + virtual_entries: Vec, worker_thread: Option>, worker_stopfd: EventFd, exit_code: Arc, @@ -62,6 +64,7 @@ impl Fs { exit_code: Arc, allow_root_dir_delete: bool, read_only: bool, + virtual_entries: Vec, ) -> super::Result { let avail_features = (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_RING_F_EVENT_IDX); @@ -84,6 +87,7 @@ impl Fs { shm_region: None, passthrough_cfg: fs_cfg, read_only, + virtual_entries, worker_thread: None, worker_stopfd: EventFd::new(EFD_NONBLOCK).map_err(FsError::EventFd)?, exit_code, @@ -180,6 +184,7 @@ impl VirtioDevice for Fs { queue_evts.push(dq.event); } + let virtual_entries = 
self.virtual_entries.clone(); let worker = FsWorker::new( worker_queues, queue_evts, @@ -188,6 +193,7 @@ impl VirtioDevice for Fs { self.shm_region.clone(), self.passthrough_cfg.clone(), self.read_only, + virtual_entries, self.worker_stopfd.try_clone().unwrap(), self.exit_code.clone(), #[cfg(target_os = "macos")] diff --git a/src/devices/src/virtio/fs/inode_alloc.rs b/src/devices/src/virtio/fs/inode_alloc.rs index 63e570acd..1919b1406 100644 --- a/src/devices/src/virtio/fs/inode_alloc.rs +++ b/src/devices/src/virtio/fs/inode_alloc.rs @@ -8,9 +8,7 @@ use super::fuse; /// - `0` — invalid / negative-entry cache sentinel (never allocated) /// - `1` (`ROOT_ID`) — the root directory of the filesystem /// -/// All other numbers are allocated sequentially starting from `ROOT_ID + 2` -/// (inode 2 is reserved for the legacy init_inode in PassthroughFs until the -/// AugmentFs overlay takes over init handling). +/// All other numbers are allocated sequentially starting from `ROOT_ID + 1`. /// The allocator is `Send + Sync` and safe to share across threads. pub struct InodeAllocator { next: AtomicU64, @@ -19,7 +17,7 @@ pub struct InodeAllocator { impl InodeAllocator { pub fn new() -> Self { Self { - next: AtomicU64::new(fuse::ROOT_ID + 2), + next: AtomicU64::new(fuse::ROOT_ID + 1), } } diff --git a/src/devices/src/virtio/fs/linux/passthrough.rs b/src/devices/src/virtio/fs/linux/passthrough.rs index abda1ce53..08da133f0 100644 --- a/src/devices/src/virtio/fs/linux/passthrough.rs +++ b/src/devices/src/virtio/fs/linux/passthrough.rs @@ -32,9 +32,6 @@ const CURRENT_DIR_CSTR: &[u8] = b".\0"; const PARENT_DIR_CSTR: &[u8] = b"..\0"; const EMPTY_CSTR: &[u8] = b"\0"; const PROC_CSTR: &[u8] = b"/proc/self/fd\0"; -const INIT_CSTR: &[u8] = b"init.krun\0"; - -static INIT_BINARY: &[u8] = init_blob::INIT_BINARY; type Inode = u64; type Handle = u64; @@ -360,13 +357,11 @@ pub struct PassthroughFs { // do with an fd opened with this flag. 
inodes: RwLock>>, inode_alloc: Arc, - init_inode: u64, // File descriptors for open files and directories. Unlike the fds in `inodes`, these _can_ be // used for reading and writing data. handles: RwLock>>, next_handle: AtomicU64, - init_handle: u64, // File descriptor pointing to the `/proc/self/fd` directory. This is used to convert an fd from // `inodes` into one that can go into `handles`. This is accomplished by reading the @@ -440,11 +435,9 @@ impl PassthroughFs { Ok(PassthroughFs { inodes: RwLock::new(MultikeyBTreeMap::new()), inode_alloc, - init_inode: fuse::ROOT_ID + 1, handles: RwLock::new(BTreeMap::new()), next_handle: AtomicU64::new(1), - init_handle: 0, proc_self_fd, @@ -993,25 +986,7 @@ impl FileSystem for PassthroughFs { fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result { debug!("do_lookup: {name:?}"); - let init_name = unsafe { CStr::from_bytes_with_nul_unchecked(INIT_CSTR) }; - - if self.init_inode != 0 && name == init_name { - let mut st: libc::stat64 = unsafe { mem::zeroed() }; - st.st_size = INIT_BINARY.len() as i64; - st.st_ino = self.init_inode; - st.st_mode = 0o100_755; - - Ok(Entry { - inode: self.init_inode, - generation: 0, - attr: st, - attr_flags: 0, - attr_timeout: self.cfg.attr_timeout, - entry_timeout: self.cfg.entry_timeout, - }) - } else { - self.do_lookup(parent, name) - } + self.do_lookup(parent, name) } fn forget(&self, _ctx: Context, inode: Inode, count: u64) { @@ -1130,11 +1105,7 @@ impl FileSystem for PassthroughFs { kill_priv: bool, flags: u32, ) -> io::Result<(Option, OpenOptions)> { - if inode == self.init_inode { - Ok((Some(self.init_handle), OpenOptions::empty())) - } else { - self.do_open(inode, kill_priv, flags) - } + self.do_open(inode, kill_priv, flags) } fn release( @@ -1235,16 +1206,6 @@ impl FileSystem for PassthroughFs { _flags: u32, ) -> io::Result { debug!("read: {inode:?}"); - if inode == self.init_inode { - let off: usize = offset.try_into().map_err(|_| einval())?; - let len = if off + 
(size as usize) < INIT_BINARY.len() { - size as usize - } else { - INIT_BINARY.len() - off - }; - return w.write(&INIT_BINARY[off..(off + len)]); - } - let data = self .handles .read() @@ -1825,10 +1786,6 @@ impl FileSystem for PassthroughFs { return Err(io::Error::from_raw_os_error(libc::ENOSYS)); } - if inode == self.init_inode { - return Err(io::Error::from_raw_os_error(libc::ENODATA)); - } - let mut buf = vec![0; size as usize]; // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we @@ -2088,36 +2045,6 @@ impl FileSystem for PassthroughFs { debug!("setupmapping: ino {inode:?} addr={addr:x} len={len}"); - if inode == self.init_inode { - let ret = unsafe { - libc::mmap( - addr as *mut libc::c_void, - len as usize, - libc::PROT_READ | libc::PROT_WRITE, - libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_FIXED, - -1, - 0, - ) - }; - if std::ptr::eq(ret, libc::MAP_FAILED) { - return Err(io::Error::last_os_error()); - } - - let to_copy = if len as usize > INIT_BINARY.len() { - INIT_BINARY.len() - } else { - len as usize - }; - unsafe { - libc::memcpy( - addr as *mut libc::c_void, - INIT_BINARY.as_ptr() as *const _, - to_copy, - ) - }; - return Ok(()); - } - let file = self.open_inode(inode, open_flags)?; let fd = file.as_raw_fd(); diff --git a/src/devices/src/virtio/fs/macos/passthrough.rs b/src/devices/src/virtio/fs/macos/passthrough.rs index 3d27aec7f..d1a862d0c 100644 --- a/src/devices/src/virtio/fs/macos/passthrough.rs +++ b/src/devices/src/virtio/fs/macos/passthrough.rs @@ -32,14 +32,11 @@ use super::super::fuse; use super::super::inode_alloc::InodeAllocator; use super::super::multikey::MultikeyBTreeMap; -const INIT_CSTR: &[u8] = b"init.krun\0"; const XATTR_KEY: &[u8] = b"user.containers.override_stat\0"; const SECURITY_CAPABILITY: &[u8] = b"security.capability\0"; const UID_MAX: u32 = u32::MAX - 1; -static INIT_BINARY: &[u8] = init_blob::INIT_BINARY; - type Inode = u64; type Handle = u64; @@ -545,11 +542,9 @@ impl Default 
for Config { pub struct PassthroughFs { inodes: RwLock>>, inode_alloc: Arc, - init_inode: u64, handles: RwLock>>, next_handle: AtomicU64, - init_handle: u64, map_windows: Mutex>, @@ -581,11 +576,9 @@ impl PassthroughFs { Ok(PassthroughFs { inodes: RwLock::new(MultikeyBTreeMap::new()), inode_alloc, - init_inode: fuse::ROOT_ID + 1, handles: RwLock::new(BTreeMap::new()), next_handle: AtomicU64::new(1), - init_handle: 0, map_windows: Mutex::new(HashMap::new()), @@ -1202,25 +1195,7 @@ impl FileSystem for PassthroughFs { fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result { debug!("lookup: {name:?}"); - let _init_name = unsafe { CStr::from_bytes_with_nul_unchecked(INIT_CSTR) }; - - if self.init_inode != 0 && name == _init_name { - let mut st: bindings::stat64 = unsafe { mem::zeroed() }; - st.st_size = INIT_BINARY.len() as i64; - st.st_ino = self.init_inode; - st.st_mode = 0o100_755; - - Ok(Entry { - inode: self.init_inode, - generation: 0, - attr: st, - attr_flags: 0, - attr_timeout: self.cfg.attr_timeout, - entry_timeout: self.cfg.entry_timeout, - }) - } else { - self.do_lookup(parent, name) - } + self.do_lookup(parent, name) } fn forget(&self, _ctx: Context, inode: Inode, count: u64) { @@ -1340,11 +1315,7 @@ impl FileSystem for PassthroughFs { kill_priv: bool, flags: u32, ) -> io::Result<(Option, OpenOptions)> { - if inode == self.init_inode { - Ok((Some(self.init_handle), OpenOptions::empty())) - } else { - self.do_open(inode, kill_priv, flags) - } + self.do_open(inode, kill_priv, flags) } fn release( @@ -1457,18 +1428,6 @@ impl FileSystem for PassthroughFs { _flags: u32, ) -> io::Result { debug!("read: {inode:?}"); - if inode == self.init_inode { - let off: usize = offset - .try_into() - .map_err(|_| io::Error::from_raw_os_error(libc::EINVAL))?; - let len = if off + (size as usize) < INIT_BINARY.len() { - size as usize - } else { - INIT_BINARY.len() - off - }; - return w.write(&INIT_BINARY[off..(off + len)]); - } - let data = self .handles 
.read() @@ -2054,10 +2013,6 @@ impl FileSystem for PassthroughFs { return Err(linux_error(io::Error::from_raw_os_error(libc::ENOSYS))); } - if inode == self.init_inode { - return Err(linux_error(io::Error::from_raw_os_error(libc::ENODATA))); - } - if name.to_bytes() == XATTR_KEY { return Err(linux_error(io::Error::from_raw_os_error(libc::EACCES))); } diff --git a/src/devices/src/virtio/fs/mod.rs b/src/devices/src/virtio/fs/mod.rs index 179535131..ae5b7bbdc 100644 --- a/src/devices/src/virtio/fs/mod.rs +++ b/src/devices/src/virtio/fs/mod.rs @@ -1,3 +1,4 @@ +mod augment_fs; mod device; #[allow(dead_code)] mod filesystem; @@ -7,6 +8,7 @@ mod inode_alloc; mod multikey; mod read_only; mod server; +pub mod virtual_entry; mod worker; #[cfg(target_os = "linux")] diff --git a/src/devices/src/virtio/fs/virtual_entry.rs b/src/devices/src/virtio/fs/virtual_entry.rs new file mode 100644 index 000000000..06f6915b3 --- /dev/null +++ b/src/devices/src/virtio/fs/virtual_entry.rs @@ -0,0 +1,56 @@ +// Virtual entry types for the virtiofs overlay. + +use std::ffi::CString; + +/// Block size reported by virtual entries in st_blksize. +pub const VIRTUAL_BLKSIZE: i64 = 4096; + +/// A synthetic filesystem entry that exists only in memory. +#[derive(Clone, Debug)] +pub struct VirtualEntry { + /// Permission bits. File type bits (S_IFMT) are ignored — the type + /// is derived from the `content` variant. + pub mode: u32, + /// If true, the entry can only be looked up once. + pub one_shot: bool, + pub content: VirtualEntryContent, +} + +#[derive(Clone, Debug)] +pub enum VirtualEntryContent { + /// A read-only file backed by a static byte slice. + File { data: &'static [u8] }, + /// A directory containing other virtual entries. + Dir { children: Vec<VirtualDirEntry> }, +} + +impl VirtualEntry { + pub fn is_dir(&self) -> bool { + matches!(self.content, VirtualEntryContent::Dir { .. }) + } + + /// Returns the full st_mode: file type bits from the variant OR'd + /// with the permission bits from self.mode.
+ #[allow(clippy::unnecessary_cast)] // libc::S_IF* is u16 on macOS, u32 on Linux + pub fn st_mode(&self) -> u32 { + let file_type = match self.content { + VirtualEntryContent::File { .. } => libc::S_IFREG as u32, + VirtualEntryContent::Dir { .. } => libc::S_IFDIR as u32, + }; + file_type | (self.mode & !(libc::S_IFMT as u32)) + } + + pub fn data(&self) -> Option<&'static [u8]> { + match &self.content { + VirtualEntryContent::File { data } => Some(data), + VirtualEntryContent::Dir { .. } => None, + } + } +} + +/// A named entry in a virtual directory. +#[derive(Clone, Debug)] +pub struct VirtualDirEntry { + pub name: CString, + pub entry: VirtualEntry, +} diff --git a/src/devices/src/virtio/fs/worker.rs b/src/devices/src/virtio/fs/worker.rs index e554aa377..084a2aa85 100644 --- a/src/devices/src/virtio/fs/worker.rs +++ b/src/devices/src/virtio/fs/worker.rs @@ -14,17 +14,19 @@ use utils::eventfd::EventFd; use vm_memory::GuestMemoryMmap; use super::super::{FsError, Queue}; +use super::augment_fs::AugmentFs; use super::defs::{HPQ_INDEX, REQ_INDEX}; use super::descriptor_utils::{Reader, Writer}; use super::inode_alloc::InodeAllocator; use super::passthrough::{self, PassthroughFs}; use super::read_only::PassthroughFsRo; use super::server::Server; +use super::virtual_entry::VirtualDirEntry; use crate::virtio::{InterruptTransport, VirtioShmRegion}; enum FsServer { - ReadWrite(Server), - ReadOnly(Server), + ReadWrite(Server>), + ReadOnly(Server>), } impl FsServer { @@ -80,21 +82,26 @@ impl FsWorker { shm_region: Option, passthrough_cfg: passthrough::Config, read_only: bool, + virtual_entries: Vec, stop_fd: EventFd, exit_code: Arc, #[cfg(target_os = "macos")] map_sender: Option>, ) -> Result { let inode_alloc = Arc::new(InodeAllocator::new()); let server = if read_only { - FsServer::ReadOnly(Server::new(PassthroughFsRo::new( - passthrough_cfg, - inode_alloc, - )?)) + let inner = PassthroughFsRo::new(passthrough_cfg, inode_alloc.clone())?; + 
FsServer::ReadOnly(Server::new(AugmentFs::new( + inner, + &inode_alloc, + virtual_entries, + ))) } else { - FsServer::ReadWrite(Server::new(PassthroughFs::new( - passthrough_cfg, - inode_alloc, - )?)) + let inner = PassthroughFs::new(passthrough_cfg, inode_alloc.clone())?; + FsServer::ReadWrite(Server::new(AugmentFs::new( + inner, + &inode_alloc, + virtual_entries, + ))) }; Ok(Self { queues, diff --git a/src/devices/src/virtio/linux_errno.rs b/src/devices/src/virtio/linux_errno.rs index 59aca5789..105f977b5 100644 --- a/src/devices/src/virtio/linux_errno.rs +++ b/src/devices/src/virtio/linux_errno.rs @@ -183,3 +183,37 @@ pub fn linux_errno_raw(errno: i32) -> i32 { _ => LINUX_EIO, } } + +// Helper functions returning io::Error with Linux errno values. +use std::io; + +pub fn eperm() -> io::Error { + io::Error::from_raw_os_error(LINUX_EPERM) +} +pub fn enoent() -> io::Error { + io::Error::from_raw_os_error(LINUX_ENOENT) +} +pub fn eacces() -> io::Error { + io::Error::from_raw_os_error(LINUX_EACCES) +} +pub fn eexist() -> io::Error { + io::Error::from_raw_os_error(LINUX_EEXIST) +} +pub fn einval() -> io::Error { + io::Error::from_raw_os_error(LINUX_EINVAL) +} +pub fn eisdir() -> io::Error { + io::Error::from_raw_os_error(LINUX_EISDIR) +} +pub fn exdev() -> io::Error { + io::Error::from_raw_os_error(LINUX_EXDEV) +} +pub fn enosys() -> io::Error { + io::Error::from_raw_os_error(LINUX_ENOSYS) +} +pub fn enodata() -> io::Error { + io::Error::from_raw_os_error(LINUX_ENODATA) +} +pub fn enxio() -> io::Error { + io::Error::from_raw_os_error(LINUX_ENXIO) +} diff --git a/src/libkrun/Cargo.toml b/src/libkrun/Cargo.toml index 4e54bf99c..27525ea7e 100644 --- a/src/libkrun/Cargo.toml +++ b/src/libkrun/Cargo.toml @@ -31,6 +31,7 @@ krun_display = { package = "krun-display", version = "0.1.0", path = "../display krun_input = { package = "krun-input", version = "0.1.0", path = "../input", optional = true, features = ["bindgen_clang_runtime"] } devices = { package = "krun-devices", 
version = "=0.1.0-1.18.0", path = "../devices" } +init-blob = { path = "../init-blob" } polly = { package = "krun-polly", version = "=0.1.0-1.18.0", path = "../polly" } utils = { package = "krun-utils", version = "=0.1.0-1.18.0", path = "../utils" } vmm = { package = "krun-vmm", version = "=0.1.0-1.18.0", path = "../vmm" } diff --git a/src/libkrun/src/lib.rs b/src/libkrun/src/lib.rs index a7b7eee6a..9e2215a62 100644 --- a/src/libkrun/src/lib.rs +++ b/src/libkrun/src/lib.rs @@ -14,6 +14,8 @@ use env_logger::{Env, Target}; #[cfg(feature = "gpu")] use krun_display::DisplayBackend; +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +use devices::virtio::fs::virtual_entry::{VirtualDirEntry, VirtualEntry, VirtualEntryContent}; use libc::{c_char, c_int, size_t}; use once_cell::sync::Lazy; use polly::event_manager::EventManager; @@ -23,7 +25,6 @@ use std::collections::hash_map::Entry; use std::collections::HashMap; use std::convert::TryInto; use std::env; -#[cfg(target_os = "linux")] use std::ffi::CString; use std::ffi::{c_void, CStr}; use std::fs::File; @@ -90,6 +91,23 @@ static KRUN_NITRO_DEBUG: Mutex = Mutex::new(false); // Path to the init binary to be executed inside the VM. 
const INIT_PATH: &str = "/init.krun"; +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +const DEFAULT_INIT_PAYLOAD: &[u8] = init_blob::INIT_BINARY; + +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +fn init_virtual_entry() -> VirtualDirEntry { + VirtualDirEntry { + name: CString::new("init.krun").unwrap(), + entry: VirtualEntry { + mode: 0o755, + one_shot: true, + content: VirtualEntryContent::File { + data: DEFAULT_INIT_PAYLOAD, + }, + }, + } +} + static KRUNFW: LazyLock> = LazyLock::new(|| unsafe { libloading::Library::new(KRUNFW_NAME).ok() }); @@ -578,7 +596,7 @@ pub extern "C" fn krun_set_vm_config(ctx_id: u32, num_vcpus: u8, ram_mib: u32) - #[allow(clippy::missing_safety_doc)] #[no_mangle] -#[cfg(not(feature = "tee"))] +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] pub unsafe extern "C" fn krun_set_root(ctx_id: u32, c_root_path: *const c_char) -> i32 { let root_path = match CStr::from_ptr(c_root_path).to_str() { Ok(root) => root, @@ -598,6 +616,7 @@ pub unsafe extern "C" fn krun_set_root(ctx_id: u32, c_root_path: *const c_char) shm_size: Some(1 << 29), allow_root_dir_delete: false, read_only: false, + virtual_entries: vec![init_virtual_entry()], }); } Entry::Vacant(_) => return -libc::ENOENT, @@ -608,7 +627,7 @@ pub unsafe extern "C" fn krun_set_root(ctx_id: u32, c_root_path: *const c_char) #[allow(clippy::missing_safety_doc)] #[no_mangle] -#[cfg(not(feature = "tee"))] +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] pub unsafe extern "C" fn krun_add_virtiofs( ctx_id: u32, c_tag: *const c_char, @@ -619,7 +638,7 @@ pub unsafe extern "C" fn krun_add_virtiofs( #[allow(clippy::missing_safety_doc)] #[no_mangle] -#[cfg(not(feature = "tee"))] +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] pub unsafe extern "C" fn krun_add_virtiofs2( ctx_id: u32, c_tag: *const c_char, @@ -631,7 +650,7 @@ pub unsafe extern "C" fn krun_add_virtiofs2( #[allow(clippy::missing_safety_doc)] #[no_mangle] -#[cfg(not(feature = "tee"))] 
+#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] pub unsafe extern "C" fn krun_add_virtiofs3( ctx_id: u32, c_tag: *const c_char, @@ -664,12 +683,17 @@ pub unsafe extern "C" fn krun_add_virtiofs3( match CTX_MAP.lock().unwrap().entry(ctx_id) { Entry::Occupied(mut ctx_cfg) => { let cfg = ctx_cfg.get_mut(); + let mut virtual_entries = Vec::new(); + if tag == "/dev/root" { + virtual_entries.push(init_virtual_entry()); + } cfg.vmr.add_fs_device(FsDeviceConfig { fs_id: tag.to_string(), shared_dir: path.to_string(), shm_size: shm, allow_root_dir_delete: false, read_only, + virtual_entries, }); } Entry::Vacant(_) => return -libc::ENOENT, diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index b92b931d4..38072ce70 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -2042,6 +2042,7 @@ fn attach_fs_devices( exit_code.clone(), config.allow_root_dir_delete, config.read_only, + config.virtual_entries.clone(), ) .unwrap(), )); diff --git a/src/vmm/src/vmm_config/fs.rs b/src/vmm/src/vmm_config/fs.rs index ccf86f5cd..dc5906dab 100644 --- a/src/vmm/src/vmm_config/fs.rs +++ b/src/vmm/src/vmm_config/fs.rs @@ -1,3 +1,6 @@ +#[cfg(not(feature = "aws-nitro"))] +use devices::virtio::fs::virtual_entry::VirtualDirEntry; + #[derive(Clone, Debug)] pub struct FsDeviceConfig { pub fs_id: String, @@ -5,4 +8,6 @@ pub struct FsDeviceConfig { pub shm_size: Option, pub allow_root_dir_delete: bool, pub read_only: bool, + #[cfg(not(feature = "aws-nitro"))] + pub virtual_entries: Vec, } From 6d174938a1fedb20d434ab9bb2aa01ce16125380 Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Tue, 12 May 2026 16:16:53 +0200 Subject: [PATCH 04/14] lib: add krun_disable_implicit_init() Add API to prevent the default init binary (/init.krun) from being injected into the root filesystem. Follows the existing krun_disable_implicit_{console,vsock} pattern. Must be called before krun_set_root(). 
Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- include/libkrun.h | 13 +++++++++++ src/libkrun/src/lib.rs | 50 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/include/libkrun.h b/include/libkrun.h index 3004110f6..c6caba5fe 100644 --- a/include/libkrun.h +++ b/include/libkrun.h @@ -1153,6 +1153,7 @@ int32_t krun_get_max_vcpus(void); */ int32_t krun_split_irqchip(uint32_t ctx_id, bool enable); + /* * Do not create an implicit console device in the guest. By using this API, * libkrun will create zero console devices on behalf of the user. Any @@ -1167,6 +1168,18 @@ int32_t krun_split_irqchip(uint32_t ctx_id, bool enable); */ int32_t krun_disable_implicit_console(uint32_t ctx_id); +/** + * Do not inject the default init binary (/init.krun) into the root + * filesystem. Must be called before krun_set_root(). + * + * Arguments: + * "ctx_id" - the configuration context ID. + * + * Returns: + * Zero on success or a negative error number on failure. + */ +int32_t krun_disable_implicit_init(uint32_t ctx_id); + /** * Disable the implicit vsock device. 
* diff --git a/src/libkrun/src/lib.rs b/src/libkrun/src/lib.rs index 9e2215a62..4834f3d25 100644 --- a/src/libkrun/src/lib.rs +++ b/src/libkrun/src/lib.rs @@ -184,6 +184,8 @@ struct ContextConfig { console_output: Option, vmm_uid: Option, vmm_gid: Option, + #[cfg(not(any(feature = "tee", feature = "aws-nitro")))] + disable_implicit_init: bool, } impl ContextConfig { @@ -616,7 +618,13 @@ pub unsafe extern "C" fn krun_set_root(ctx_id: u32, c_root_path: *const c_char) shm_size: Some(1 << 29), allow_root_dir_delete: false, read_only: false, - virtual_entries: vec![init_virtual_entry()], + virtual_entries: { + let mut v = Vec::new(); + if !cfg.disable_implicit_init { + v.push(init_virtual_entry()); + } + v + }, }); } Entry::Vacant(_) => return -libc::ENOENT, @@ -684,7 +692,7 @@ pub unsafe extern "C" fn krun_add_virtiofs3( Entry::Occupied(mut ctx_cfg) => { let cfg = ctx_cfg.get_mut(); let mut virtual_entries = Vec::new(); - if tag == "/dev/root" { + if tag == "/dev/root" && !cfg.disable_implicit_init { virtual_entries.push(init_virtual_entry()); } cfg.vmr.add_fs_device(FsDeviceConfig { @@ -2432,6 +2440,19 @@ pub unsafe extern "C" fn krun_set_root_disk_remount( KRUN_SUCCESS } +#[no_mangle] +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +pub extern "C" fn krun_disable_implicit_init(ctx_id: u32) -> i32 { + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + ctx_cfg.get_mut().disable_implicit_init = true; + } + Entry::Vacant(_) => return -libc::ENOENT, + } + + KRUN_SUCCESS +} + #[no_mangle] pub extern "C" fn krun_disable_implicit_console(ctx_id: u32) -> i32 { match CTX_MAP.lock().unwrap().entry(ctx_id) { @@ -2878,3 +2899,28 @@ fn krun_start_enter_nitro(ctx_id: u32) -> i32 { } } } + +#[cfg(all(test, not(any(feature = "tee", feature = "aws-nitro"))))] +mod test_disable_implicit_init { + use super::*; + + #[test] + fn test_disable_implicit_init() { + let ctx = unsafe { krun_create_ctx() } as u32; + unsafe { + krun_disable_implicit_init(ctx); + krun_set_root(ctx,
c"/tmp".as_ptr()); + } + + let ctx_map = CTX_MAP.lock().unwrap(); + let cfg = ctx_map.get(&ctx).unwrap(); + assert_eq!(cfg.vmr.fs.len(), 1); + assert!( + cfg.vmr.fs[0].virtual_entries.is_empty(), + "root virtiofs should not inject init.krun after krun_disable_implicit_init()" + ); + drop(ctx_map); + + assert_eq!(krun_free_ctx(ctx), KRUN_SUCCESS); + } +} From 3ab23add4779db64be69ffb17f3ff977059d20b6 Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Tue, 12 May 2026 16:17:51 +0200 Subject: [PATCH 05/14] lib: add krun_fs_add_overlay_file(), krun_fs_add_overlay_dir() APIs Add C APIs to inject virtual files and directories into a virtiofs device. Entries are backed entirely by host memory (no host file). Files support one-shot semantics (disappear after the first lookup). Paths may contain '/' to nest entries inside existing virtual directories (e.g. krun_fs_add_overlay_dir for "etc", then krun_fs_add_overlay_file for "etc/hostname"). Intermediate directories must already exist; -ENOENT / -ENOTDIR is returned otherwise. Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- include/libkrun.h | 50 ++++++++++ src/libkrun/src/lib.rs | 207 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 235 insertions(+), 22 deletions(-) diff --git a/include/libkrun.h b/include/libkrun.h index c6caba5fe..37ce25e85 100644 --- a/include/libkrun.h +++ b/include/libkrun.h @@ -1180,6 +1180,56 @@ int32_t krun_disable_implicit_console(uint32_t ctx_id); */ int32_t krun_disable_implicit_init(uint32_t ctx_id); +/** + * Add a virtual overlay file to a virtiofs device. + * + * The file is backed entirely by host memory (no host file). The data + * pointer is NOT copied — the caller must keep the memory valid for the + * full VM lifetime. + * + * "path" may contain '/' to place the file inside a virtual directory + * previously created with krun_fs_add_overlay_dir (e.g. "etc/hostname"). 
+ * All intermediate directories must already exist; -ENOENT is returned + * if a component is missing, -ENOTDIR if a component is not a directory. + * + * Arguments: + * "ctx_id" - the configuration context ID. + * "fs_tag" - tag of the virtiofs device (e.g. "/dev/root"). + * "path" - path of the file (e.g. "init.krun" or "etc/hostname"). + * "data" - pointer to the file content. + * "data_len" - length of the file content in bytes. + * "mode" - file mode bits (e.g. 0100644 for a regular file). + * "one_shot" - if true, the file can only be looked up once. + * + * Returns: + * Zero on success or a negative error number on failure. + */ +int32_t krun_fs_add_overlay_file(uint32_t ctx_id, const char *fs_tag, + const char *path, const uint8_t *data, + size_t data_len, uint32_t mode, bool one_shot); + +/** + * Add a virtual overlay directory to a virtiofs device. + * + * The directory is empty and read-only, useful as a mount point. + * + * "path" may contain '/' to nest inside an existing virtual directory + * (e.g. "usr/lib"). All intermediate directories must already exist; + * -ENOENT is returned if a component is missing, -ENOTDIR if a component + * is not a directory. + * + * Arguments: + * "ctx_id" - the configuration context ID. + * "fs_tag" - tag of the virtiofs device (e.g. "/dev/root"). + * "path" - path of the directory (e.g. "dev" or "usr/lib"). + * "mode" - directory mode bits (e.g. 040755). + * + * Returns: + * Zero on success or a negative error number on failure. + */ +int32_t krun_fs_add_overlay_dir(uint32_t ctx_id, const char *fs_tag, + const char *path, uint32_t mode); + /** * Disable the implicit vsock device. 
* diff --git a/src/libkrun/src/lib.rs b/src/libkrun/src/lib.rs index 4834f3d25..283df1141 100644 --- a/src/libkrun/src/lib.rs +++ b/src/libkrun/src/lib.rs @@ -19,8 +19,6 @@ use devices::virtio::fs::virtual_entry::{VirtualDirEntry, VirtualEntry, VirtualE use libc::{c_char, c_int, size_t}; use once_cell::sync::Lazy; use polly::event_manager::EventManager; -#[cfg(all(feature = "blk", not(feature = "tee")))] -use rand::distr::{Alphanumeric, SampleString}; use std::collections::hash_map::Entry; use std::collections::HashMap; use std::convert::TryInto; @@ -613,10 +611,9 @@ pub unsafe extern "C" fn krun_set_root(ctx_id: u32, c_root_path: *const c_char) let cfg = ctx_cfg.get_mut(); cfg.vmr.add_fs_device(FsDeviceConfig { fs_id, - shared_dir, + shared_dir: Some(shared_dir), // Default to a conservative 512 MB window. shm_size: Some(1 << 29), - allow_root_dir_delete: false, read_only: false, virtual_entries: { let mut v = Vec::new(); @@ -666,7 +663,7 @@ pub unsafe extern "C" fn krun_add_virtiofs3( shm_size: u64, read_only: bool, ) -> i32 { - if c_tag.is_null() || c_path.is_null() { + if c_tag.is_null() { return -libc::EINVAL; } @@ -674,9 +671,15 @@ pub unsafe extern "C" fn krun_add_virtiofs3( Ok(tag) => tag, Err(_) => return -libc::EINVAL, }; - let path = match CStr::from_ptr(c_path).to_str() { - Ok(path) => path, - Err(_) => return -libc::EINVAL, + + // NULL path means NullFs (virtual-only filesystem, no host directory). 
+ let path = if c_path.is_null() { + None + } else { + match CStr::from_ptr(c_path).to_str() { + Ok(path) => Some(path), + Err(_) => return -libc::EINVAL, + } }; let shm = if shm_size > 0 { @@ -697,9 +700,8 @@ pub unsafe extern "C" fn krun_add_virtiofs3( } cfg.vmr.add_fs_device(FsDeviceConfig { fs_id: tag.to_string(), - shared_dir: path.to_string(), + shared_dir: path.map(|p| p.to_string()), shm_size: shm, - allow_root_dir_delete: false, read_only, virtual_entries, }); @@ -2411,25 +2413,35 @@ pub unsafe extern "C" fn krun_set_root_disk_remount( return -libc::EINVAL; } - // To boot from a filesystem other than virtiofs, - // we need to setup a temporary root from which init.krun can be executed. - // Otherwise, it would have to be copied to the target filesystem beforehand. - // Instead, init.krun will run from virtiofs and then switch to the real root. - let root_dir_suffix = Alphanumeric.sample_string(&mut rand::rng(), 6); - let empty_root = env::temp_dir().join(format!("krun-empty-root-{root_dir_suffix}")); - - if let Err(e) = std::fs::create_dir_all(&empty_root) { - error!("Failed to create empty root directory: {e:?}"); - return -libc::EINVAL; + // Boot from a block device: the virtiofs root only needs to + // serve init.krun and provide mount points for /dev, /proc, /sys. + // Use a NullFs (no host directory) with the inode overlay. + let mut virtual_entries = Vec::new(); + if !ctx_cfg.disable_implicit_init { + virtual_entries.push(init_virtual_entry()); + } + // init.c needs these directories as mount points before + // pivoting to the block device root. 
+ for name in ["dev", "proc", "sys", "newroot"] { + virtual_entries.push(VirtualDirEntry { + name: CString::new(name).unwrap(), + entry: VirtualEntry { + mode: 0o755, + one_shot: false, + content: VirtualEntryContent::Dir { + children: Vec::new(), + }, + }, + }); } ctx_cfg.vmr.add_fs_device(FsDeviceConfig { fs_id: "/dev/root".into(), - shared_dir: empty_root.to_string_lossy().into(), + shared_dir: None, // Default to a conservative 512 MB window. shm_size: Some(1 << 29), - allow_root_dir_delete: true, read_only: false, + virtual_entries, }); ctx_cfg.set_block_root(device, fstype, options); @@ -2453,6 +2465,157 @@ pub extern "C" fn krun_disable_implicit_init(ctx_id: u32) -> i32 { KRUN_SUCCESS } +/// Resolve a path like "a/b/c" into parent directory children + leaf name. +/// Errors with a libc errno if any intermediate component is missing or not a Dir. +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +fn resolve_overlay_path<'a>( + entries: &'a mut Vec<VirtualDirEntry>, + path: &str, +) -> Result<(&'a mut Vec<VirtualDirEntry>, CString), i32> { + let path = path.strip_prefix('/').unwrap_or(path); + let components: Vec<&str> = path.split('/').collect(); + let (leaf, parents) = components.split_last().ok_or(-libc::EINVAL)?; + if leaf.is_empty() { + return Err(-libc::EINVAL); + } + + let mut current = entries; + for component in parents { + let dir = current + .iter_mut() + .find(|e| e.name.as_c_str().to_bytes() == component.as_bytes()) + .ok_or(-libc::ENOENT)?; + match &mut dir.entry.content { + VirtualEntryContent::Dir { children } => current = children, + _ => return Err(-libc::ENOTDIR), + } + } + + let name = CString::new(*leaf).map_err(|_| -libc::EINVAL)?; + Ok((current, name)) +} + +/// Add a virtual overlay entry to a virtiofs device, resolving paths with `/`.
+#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +fn fs_add_overlay_entry(ctx_id: u32, fs_tag: &str, path: &str, entry: VirtualEntry) -> i32 { + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + let fs_cfg = match cfg.vmr.fs.iter_mut().find(|fs| fs.fs_id == fs_tag) { + Some(fs) => fs, + None => return -libc::ENOENT, + }; + let (parent_children, name) = + match resolve_overlay_path(&mut fs_cfg.virtual_entries, path) { + Ok(v) => v, + Err(e) => return e, + }; + parent_children.push(VirtualDirEntry { name, entry }); + } + Entry::Vacant(_) => return -libc::ENOENT, + } + KRUN_SUCCESS +} + +#[allow(clippy::missing_safety_doc)] +#[no_mangle] +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +pub unsafe extern "C" fn krun_get_default_init( + data_out: *mut *const u8, + len_out: *mut size_t, +) -> i32 { + if data_out.is_null() || len_out.is_null() { + return -libc::EINVAL; + } + *data_out = DEFAULT_INIT_PAYLOAD.as_ptr(); + *len_out = DEFAULT_INIT_PAYLOAD.len(); + KRUN_SUCCESS +} + +#[allow(clippy::missing_safety_doc)] +#[no_mangle] +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +pub unsafe extern "C" fn krun_fs_add_overlay_file( + ctx_id: u32, + c_fs_tag: *const c_char, + c_path: *const c_char, + data: *const u8, + data_len: size_t, + mode: u32, + one_shot: bool, +) -> i32 { + if c_fs_tag.is_null() || c_path.is_null() { + return -libc::EINVAL; + } + + let fs_tag = match CStr::from_ptr(c_fs_tag).to_str() { + Ok(s) => s, + Err(_) => return -libc::EINVAL, + }; + let path = match CStr::from_ptr(c_path).to_str() { + Ok(s) => s, + Err(_) => return -libc::EINVAL, + }; + + // SAFETY: The caller guarantees the memory remains valid for the VM + // lifetime (see the C header contract). 
+ let payload: &'static [u8] = if data_len == 0 { + &[] + } else { + if data.is_null() { + return -libc::EINVAL; + } + slice::from_raw_parts(data, data_len) + }; + + fs_add_overlay_entry( + ctx_id, + fs_tag, + path, + VirtualEntry { + mode, + one_shot, + content: VirtualEntryContent::File { data: payload }, + }, + ) +} + +#[allow(clippy::missing_safety_doc)] +#[no_mangle] +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +pub unsafe extern "C" fn krun_fs_add_overlay_dir( + ctx_id: u32, + c_fs_tag: *const c_char, + c_path: *const c_char, + mode: u32, +) -> i32 { + if c_fs_tag.is_null() || c_path.is_null() { + return -libc::EINVAL; + } + + let fs_tag = match CStr::from_ptr(c_fs_tag).to_str() { + Ok(s) => s, + Err(_) => return -libc::EINVAL, + }; + let path = match CStr::from_ptr(c_path).to_str() { + Ok(s) => s, + Err(_) => return -libc::EINVAL, + }; + + fs_add_overlay_entry( + ctx_id, + fs_tag, + path, + VirtualEntry { + mode, + one_shot: false, + content: VirtualEntryContent::Dir { + children: Vec::new(), + }, + }, + ) +} + #[no_mangle] pub extern "C" fn krun_disable_implicit_console(ctx_id: u32) -> i32 { match CTX_MAP.lock().unwrap().entry(ctx_id) { From 3f57196a3ca413d694ce89692e243f8d201787a6 Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Tue, 12 May 2026 16:18:48 +0200 Subject: [PATCH 06/14] lib: add krun_get_default_init() Add API to retrieve the built-in default init binary. Callers that use krun_disable_implicit_init() can use this to obtain the init binary and inject it themselves via krun_fs_add_overlay_file(). 
Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- include/libkrun.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/include/libkrun.h b/include/libkrun.h index 37ce25e85..7a12e91a4 100644 --- a/include/libkrun.h +++ b/include/libkrun.h @@ -1180,6 +1180,25 @@ int32_t krun_disable_implicit_console(uint32_t ctx_id); */ int32_t krun_disable_implicit_init(uint32_t ctx_id); +/** + * Get a pointer to the built-in default init binary. + * + * This is the same binary that libkrun injects as /init.krun by default. + * Callers that use krun_disable_implicit_init() can use this to inject the + * init binary themselves (e.g. via krun_fs_add_overlay_file with custom + * settings). + * + * The returned pointer is valid for the lifetime of the process (static data). + * + * Arguments: + * "data_out" - receives a pointer to the init binary bytes. + * "len_out" - receives the length in bytes. + * + * Returns: + * Zero on success or a negative error number on failure. + */ +int32_t krun_get_default_init(const uint8_t **data_out, size_t *len_out); + /** * Add a virtual overlay file to a virtiofs device. * From 3f85d9868c2abef41f5a9b6de0b43236b324a3b7 Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Tue, 12 May 2026 15:36:56 +0200 Subject: [PATCH 07/14] libkrun.h: document that implicit resource creation will become opt-in Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- include/libkrun.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/libkrun.h b/include/libkrun.h index 7a12e91a4..5a63be917 100644 --- a/include/libkrun.h +++ b/include/libkrun.h @@ -1153,6 +1153,13 @@ int32_t krun_get_max_vcpus(void); */ int32_t krun_split_irqchip(uint32_t ctx_id, bool enable); +/* + * NOTE: Implicit resource creation is a legacy convenience. The 2.0 API + * (see https://github.com/containers/libkrun/issues/634) will not create + * any implicit resources. 
Callers should start using the + * krun_disable_implicit_* functions now to ease migration. + */ + /* * Do not create an implicit console device in the guest. By using this API, From 60e878c2be5735b5ffe045f8f4629bf9e1b2ccfd Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Tue, 12 May 2026 16:21:18 +0200 Subject: [PATCH 08/14] virtio/fs: add NullFs, a minimal empty-root FileSystem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NullFs implements the FileSystem trait with just an empty root directory. It can be wrapped with AugmentFs to serve virtual files without any host directory involvement. Fs::new() now accepts Option for shared_dir — None selects NullFs. FsDeviceConfig and FsServer gain the corresponding variants. Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- src/devices/src/virtio/fs/device.rs | 22 +++++++----- src/devices/src/virtio/fs/mod.rs | 1 + src/devices/src/virtio/fs/null_fs.rs | 50 ++++++++++++++++++++++++++++ src/devices/src/virtio/fs/worker.rs | 44 ++++++++++++++++-------- src/vmm/src/vmm_config/fs.rs | 4 ++- 5 files changed, 99 insertions(+), 22 deletions(-) create mode 100644 src/devices/src/virtio/fs/null_fs.rs diff --git a/src/devices/src/virtio/fs/device.rs b/src/devices/src/virtio/fs/device.rs index 945f8393e..f0fb45401 100644 --- a/src/devices/src/virtio/fs/device.rs +++ b/src/devices/src/virtio/fs/device.rs @@ -47,7 +47,7 @@ pub struct Fs { device_state: DeviceState, config: VirtioFsConfig, shm_region: Option, - passthrough_cfg: passthrough::Config, + passthrough_cfg: Option, read_only: bool, virtual_entries: Vec, worker_thread: Option>, @@ -60,7 +60,7 @@ pub struct Fs { impl Fs { pub fn new( fs_id: String, - shared_dir: String, + shared_dir: Option, exit_code: Arc, allow_root_dir_delete: bool, read_only: bool, @@ -73,11 +73,11 @@ impl Fs { config.tag[..tag.len()].copy_from_slice(tag.as_slice()); config.num_request_queues = 1; - let fs_cfg = passthrough::Config { - root_dir: 
shared_dir, + let fs_cfg = shared_dir.map(|root_dir| passthrough::Config { + root_dir, allow_root_dir_delete, ..Default::default() - }; + }); Ok(Fs { avail_features, @@ -107,10 +107,16 @@ impl Fs { pub fn set_export_table(&mut self, export_table: ExportTable) -> u64 { static FS_UNIQUE_ID: AtomicU64 = AtomicU64::new(0); - self.passthrough_cfg.export_fsid = FS_UNIQUE_ID.fetch_add(1, Ordering::Relaxed); - self.passthrough_cfg.export_table = Some(export_table); + let Some(cfg) = self.passthrough_cfg.as_mut() else { + // NullFs-backed devices have no passthrough config and don't + // participate in cross-domain fd export. Consume (and waste) an + // fsid so numbering stays dense, but don't store the table. + return FS_UNIQUE_ID.fetch_add(1, Ordering::Relaxed); + }; + cfg.export_fsid = FS_UNIQUE_ID.fetch_add(1, Ordering::Relaxed); + cfg.export_table = Some(export_table); - self.passthrough_cfg.export_fsid + cfg.export_fsid } #[cfg(target_os = "macos")] diff --git a/src/devices/src/virtio/fs/mod.rs b/src/devices/src/virtio/fs/mod.rs index ae5b7bbdc..f8ef63295 100644 --- a/src/devices/src/virtio/fs/mod.rs +++ b/src/devices/src/virtio/fs/mod.rs @@ -6,6 +6,7 @@ pub mod fuse; mod inode_alloc; #[allow(dead_code)] mod multikey; +mod null_fs; mod read_only; mod server; pub mod virtual_entry; diff --git a/src/devices/src/virtio/fs/null_fs.rs b/src/devices/src/virtio/fs/null_fs.rs new file mode 100644 index 000000000..4bb4b6360 --- /dev/null +++ b/src/devices/src/virtio/fs/null_fs.rs @@ -0,0 +1,50 @@ +// A minimal filesystem that serves an empty root directory. +// +// Used with AugmentFs to provide a virtual-only filesystem (e.g. for +// booting from a block device where the virtiofs root only needs init.krun). 
+ +use std::ffi::CStr; +use std::io; +use std::mem; +use std::time::Duration; + +use super::filesystem::{Context, Entry, FileSystem, FsOptions}; +use super::fuse; +use super::virtual_entry::VIRTUAL_BLKSIZE; +use crate::virtio::bindings; + +/// An empty filesystem with just a root directory and nothing in it. +pub struct NullFs; + +type Inode = u64; +type Handle = u64; + +impl FileSystem for NullFs { + type Inode = Inode; + type Handle = Handle; + + fn init(&self, _capable: FsOptions) -> io::Result { + Ok(FsOptions::empty()) + } + + fn lookup(&self, _ctx: Context, _parent: Inode, _name: &CStr) -> io::Result { + Err(io::Error::from_raw_os_error(libc::ENOENT)) + } + + fn getattr( + &self, + _ctx: Context, + inode: Inode, + _handle: Option, + ) -> io::Result<(bindings::stat64, Duration)> { + if inode == fuse::ROOT_ID { + let mut st: bindings::stat64 = unsafe { mem::zeroed() }; + st.st_ino = fuse::ROOT_ID; + st.st_mode = libc::S_IFDIR | 0o755; + st.st_nlink = 2; + st.st_blksize = VIRTUAL_BLKSIZE as _; + return Ok((st, Duration::MAX)); + } + Err(io::Error::from_raw_os_error(libc::ENOENT)) + } +} diff --git a/src/devices/src/virtio/fs/worker.rs b/src/devices/src/virtio/fs/worker.rs index 084a2aa85..b8e722b5d 100644 --- a/src/devices/src/virtio/fs/worker.rs +++ b/src/devices/src/virtio/fs/worker.rs @@ -18,6 +18,7 @@ use super::augment_fs::AugmentFs; use super::defs::{HPQ_INDEX, REQ_INDEX}; use super::descriptor_utils::{Reader, Writer}; use super::inode_alloc::InodeAllocator; +use super::null_fs::NullFs; use super::passthrough::{self, PassthroughFs}; use super::read_only::PassthroughFsRo; use super::server::Server; @@ -27,6 +28,7 @@ use crate::virtio::{InterruptTransport, VirtioShmRegion}; enum FsServer { ReadWrite(Server>), ReadOnly(Server>), + Null(Server>), } impl FsServer { @@ -55,6 +57,14 @@ impl FsServer { #[cfg(target_os = "macos")] map_sender, ), + FsServer::Null(s) => s.handle_message( + r, + w, + shm_region, + exit_code, + #[cfg(target_os = "macos")] + map_sender, 
+ ), } } } @@ -80,7 +90,7 @@ impl FsWorker { interrupt: InterruptTransport, mem: GuestMemoryMmap, shm_region: Option, - passthrough_cfg: passthrough::Config, + passthrough_cfg: Option, read_only: bool, virtual_entries: Vec, stop_fd: EventFd, @@ -88,20 +98,28 @@ impl FsWorker { #[cfg(target_os = "macos")] map_sender: Option>, ) -> Result { let inode_alloc = Arc::new(InodeAllocator::new()); - let server = if read_only { - let inner = PassthroughFsRo::new(passthrough_cfg, inode_alloc.clone())?; - FsServer::ReadOnly(Server::new(AugmentFs::new( - inner, - &inode_alloc, - virtual_entries, - ))) - } else { - let inner = PassthroughFs::new(passthrough_cfg, inode_alloc.clone())?; - FsServer::ReadWrite(Server::new(AugmentFs::new( - inner, + let server = match passthrough_cfg { + Some(cfg) if read_only => { + let inner = PassthroughFsRo::new(cfg, inode_alloc.clone())?; + FsServer::ReadOnly(Server::new(AugmentFs::new( + inner, + &inode_alloc, + virtual_entries, + ))) + } + Some(cfg) => { + let inner = PassthroughFs::new(cfg, inode_alloc.clone())?; + FsServer::ReadWrite(Server::new(AugmentFs::new( + inner, + &inode_alloc, + virtual_entries, + ))) + } + None => FsServer::Null(Server::new(AugmentFs::new( + NullFs, &inode_alloc, virtual_entries, - ))) + ))), }; Ok(Self { queues, diff --git a/src/vmm/src/vmm_config/fs.rs b/src/vmm/src/vmm_config/fs.rs index dc5906dab..bd6633d32 100644 --- a/src/vmm/src/vmm_config/fs.rs +++ b/src/vmm/src/vmm_config/fs.rs @@ -4,7 +4,9 @@ use devices::virtio::fs::virtual_entry::VirtualDirEntry; #[derive(Clone, Debug)] pub struct FsDeviceConfig { pub fs_id: String, - pub shared_dir: String, + /// Host directory to pass through. None means a virtual-only filesystem + /// (NullFs + AugmentFs, no host directory). 
+ pub shared_dir: Option, pub shm_size: Option, pub allow_root_dir_delete: bool, pub read_only: bool, From 32c3d0c1105d59879b8f12ce42cfba4739a19f24 Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Tue, 12 May 2026 16:23:18 +0200 Subject: [PATCH 09/14] lib: rewrite krun_set_root_disk_remount to use NullFs krun_set_root_disk_remount no longer creates a temporary empty host directory. Instead it configures a NullFs-backed virtiofs device (shared_dir: None) with init.krun overlaid via AugmentFs. Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- src/libkrun/src/lib.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/libkrun/src/lib.rs b/src/libkrun/src/lib.rs index 283df1141..60dbfd32f 100644 --- a/src/libkrun/src/lib.rs +++ b/src/libkrun/src/lib.rs @@ -2352,7 +2352,7 @@ pub extern "C" fn krun_setgid(ctx_id: u32, gid: libc::gid_t) -> i32 { KRUN_SUCCESS } -#[cfg(all(feature = "blk", not(feature = "tee")))] +#[cfg(all(feature = "blk", not(any(feature = "tee", feature = "aws-nitro"))))] #[allow(clippy::missing_safety_doc)] #[no_mangle] pub unsafe extern "C" fn krun_set_root_disk_remount( @@ -2440,6 +2440,7 @@ pub unsafe extern "C" fn krun_set_root_disk_remount( shared_dir: None, // Default to a conservative 512 MB window. shm_size: Some(1 << 29), + allow_root_dir_delete: false, read_only: false, virtual_entries, }); @@ -2561,11 +2562,10 @@ pub unsafe extern "C" fn krun_fs_add_overlay_file( // lifetime (see the C header contract). 
let payload: &'static [u8] = if data_len == 0 { &[] - } else { - if data.is_null() { - return -libc::EINVAL; - } + } else if !data.is_null() { slice::from_raw_parts(data, data_len) + } else { + return -libc::EINVAL; }; fs_add_overlay_entry( From d0d27645c9cdb31352f28be8b65edabfc84f1775 Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Tue, 12 May 2026 16:26:18 +0200 Subject: [PATCH 10/14] virtio/fs: remove REMOVE_ROOT_DIR ioctl and allow_root_dir_delete The temporary root directory hack is gone (replaced by NullFs), so the ioctl that cleaned it up and the config flag that gated it are no longer needed. Remove allow_root_dir_delete from FsDeviceConfig, Fs::new(), passthrough Config, and all call sites. Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- init/init.c | 11 ----------- src/devices/src/virtio/fs/device.rs | 2 -- src/devices/src/virtio/fs/linux/passthrough.rs | 10 ---------- src/devices/src/virtio/fs/macos/passthrough.rs | 7 ------- src/devices/src/virtio/fs/read_only.rs | 8 -------- src/libkrun/src/lib.rs | 1 - src/vmm/src/builder.rs | 1 - src/vmm/src/vmm_config/fs.rs | 1 - 8 files changed, 41 deletions(-) diff --git a/init/init.c b/init/init.c index 59a5c3d94..2d2be2834 100644 --- a/init/init.c +++ b/init/init.c @@ -43,7 +43,6 @@ #endif #define KRUN_EXIT_CODE_IOCTL 0x7602 -#define KRUN_REMOVE_ROOT_DIR_IOCTL 0x7603 #define KRUN_MAGIC "KRUN" #define KRUN_FOOTER_LEN 12 @@ -1475,16 +1474,6 @@ int main(int argc, char **argv) chdir("/newroot"); - fd = open("/", O_RDONLY); - if (fd < 0) { - perror("Couldn't open temporary root directory for removing"); - exit(-1); - } - if (ioctl(fd, KRUN_REMOVE_ROOT_DIR_IOCTL) < 0) { - perror("Error removing temporary root directory"); - } - close(fd); - if (mount(".", "/", NULL, MS_MOVE, NULL) < 0) { perror("remount root"); exit(-1); diff --git a/src/devices/src/virtio/fs/device.rs b/src/devices/src/virtio/fs/device.rs index f0fb45401..c757c9d3d 100644 --- a/src/devices/src/virtio/fs/device.rs +++ 
b/src/devices/src/virtio/fs/device.rs @@ -62,7 +62,6 @@ impl Fs { fs_id: String, shared_dir: Option, exit_code: Arc, - allow_root_dir_delete: bool, read_only: bool, virtual_entries: Vec, ) -> super::Result { @@ -75,7 +74,6 @@ impl Fs { let fs_cfg = shared_dir.map(|root_dir| passthrough::Config { root_dir, - allow_root_dir_delete, ..Default::default() }); diff --git a/src/devices/src/virtio/fs/linux/passthrough.rs b/src/devices/src/virtio/fs/linux/passthrough.rs index 08da133f0..2bfa46349 100644 --- a/src/devices/src/virtio/fs/linux/passthrough.rs +++ b/src/devices/src/virtio/fs/linux/passthrough.rs @@ -325,7 +325,6 @@ pub struct Config { pub export_fsid: u64, /// Table of exported FDs to share with other subsystems. pub export_table: Option, - pub allow_root_dir_delete: bool, } impl Default for Config { @@ -340,7 +339,6 @@ impl Default for Config { proc_sfd_rawfd: None, export_fsid: 0, export_table: None, - allow_root_dir_delete: false, } } } @@ -2122,10 +2120,6 @@ impl FileSystem for PassthroughFs { const VIRTIO_IOC_EXIT_CODE_REQ: u32 = request_code_none!(VIRTIO_IOC_MAGIC, VIRTIO_IOC_TYPE_EXIT_CODE) as u32; - const VIRTIO_IOC_REMOVE_ROOT_DIR_CODE: u8 = 3; - const VIRTIO_IOC_REMOVE_ROOT_DIR_REQ: u32 = - request_code_none!(VIRTIO_IOC_MAGIC, VIRTIO_IOC_REMOVE_ROOT_DIR_CODE) as u32; - match cmd { VIRTIO_IOC_EXPORT_FD_REQ => { if out_size as usize != VIRTIO_IOC_EXPORT_FD_SIZE { @@ -2160,10 +2154,6 @@ impl FileSystem for PassthroughFs { exit_code.store(arg as i32, Ordering::SeqCst); Ok(Vec::new()) } - VIRTIO_IOC_REMOVE_ROOT_DIR_REQ if self.cfg.allow_root_dir_delete => { - std::fs::remove_dir_all(&self.cfg.root_dir)?; - Ok(Vec::new()) - } _ => Err(io::Error::from_raw_os_error(libc::EOPNOTSUPP)), } } diff --git a/src/devices/src/virtio/fs/macos/passthrough.rs b/src/devices/src/virtio/fs/macos/passthrough.rs index d1a862d0c..3a0500735 100644 --- a/src/devices/src/virtio/fs/macos/passthrough.rs +++ b/src/devices/src/virtio/fs/macos/passthrough.rs @@ -514,7 +514,6 @@ pub 
struct Config { pub export_fsid: u64, /// Table of exported FDs to share with other subsystems. Not supported for macos. pub export_table: Option, - pub allow_root_dir_delete: bool, } impl Default for Config { @@ -529,7 +528,6 @@ impl Default for Config { proc_sfd_rawfd: None, export_fsid: 0, export_table: None, - allow_root_dir_delete: false, } } } @@ -2441,17 +2439,12 @@ impl FileSystem for PassthroughFs { // We can't use nix::request_code_none here since it's system-dependent // and we need the value from Linux. const VIRTIO_IOC_EXIT_CODE_REQ: u32 = 0x7602; - const VIRTIO_IOC_REMOVE_ROOT_DIR_REQ: u32 = 0x7603; match cmd { VIRTIO_IOC_EXIT_CODE_REQ => { exit_code.store(arg as i32, Ordering::SeqCst); Ok(Vec::new()) } - VIRTIO_IOC_REMOVE_ROOT_DIR_REQ if self.cfg.allow_root_dir_delete => { - std::fs::remove_dir_all(&self.cfg.root_dir)?; - Ok(Vec::new()) - } _ => Err(io::Error::from_raw_os_error(libc::EOPNOTSUPP)), } } diff --git a/src/devices/src/virtio/fs/read_only.rs b/src/devices/src/virtio/fs/read_only.rs index eb8aebef3..5495db1ed 100644 --- a/src/devices/src/virtio/fs/read_only.rs +++ b/src/devices/src/virtio/fs/read_only.rs @@ -36,10 +36,6 @@ fn erofs() -> io::Error { io::Error::from_raw_os_error(libc::EROFS) } -// Keep the Linux ioctl number so read-only virtio-fs can still handle -// non-mutating control ioctls while rejecting host-side root deletion. 
-const VIRTIO_IOC_REMOVE_ROOT_DIR_REQ: u32 = 0x7603; - fn read_only_open_flags(flags: u32) -> io::Result { let f = flags as i32; if f & libc::O_ACCMODE != libc::O_RDONLY { @@ -319,10 +315,6 @@ impl FileSystem for PassthroughFsRo { out_size: u32, exit_code: &Arc, ) -> io::Result> { - if cmd == VIRTIO_IOC_REMOVE_ROOT_DIR_REQ { - return Err(erofs()); - } - self.inner.ioctl( ctx, inode, handle, flags, cmd, arg, in_size, out_size, exit_code, ) diff --git a/src/libkrun/src/lib.rs b/src/libkrun/src/lib.rs index 60dbfd32f..2c7976b3c 100644 --- a/src/libkrun/src/lib.rs +++ b/src/libkrun/src/lib.rs @@ -2440,7 +2440,6 @@ pub unsafe extern "C" fn krun_set_root_disk_remount( shared_dir: None, // Default to a conservative 512 MB window. shm_size: Some(1 << 29), - allow_root_dir_delete: false, read_only: false, virtual_entries, }); diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 38072ce70..8e8ca4e18 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -2040,7 +2040,6 @@ fn attach_fs_devices( config.fs_id.clone(), config.shared_dir.clone(), exit_code.clone(), - config.allow_root_dir_delete, config.read_only, config.virtual_entries.clone(), ) diff --git a/src/vmm/src/vmm_config/fs.rs b/src/vmm/src/vmm_config/fs.rs index bd6633d32..92927ec9a 100644 --- a/src/vmm/src/vmm_config/fs.rs +++ b/src/vmm/src/vmm_config/fs.rs @@ -8,7 +8,6 @@ pub struct FsDeviceConfig { /// (NullFs + AugmentFs, no host directory). pub shared_dir: Option, pub shm_size: Option, - pub allow_root_dir_delete: bool, pub read_only: bool, #[cfg(not(feature = "aws-nitro"))] pub virtual_entries: Vec, From fdeab1cb0138d24812b47fddb68278b1c1ee37b9 Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Tue, 12 May 2026 16:28:09 +0200 Subject: [PATCH 11/14] virtio/fs: move EXIT_CODE ioctl to AugmentFs The exit-code ioctl is a krun mechanism, not a filesystem operation. Move it to the AugmentFs overlay where it is handled before any delegation to the inner filesystem. 
The Linux passthrough retains only EXPORT_FD (which needs access to passthrough-internal handle and export tables). The macOS passthrough no longer implements ioctl at all. Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- src/devices/src/virtio/fs/augment_fs.rs | 18 +++++++++--- .../src/virtio/fs/linux/passthrough.rs | 14 ++------- .../src/virtio/fs/macos/passthrough.rs | 29 ++----------------- 3 files changed, 19 insertions(+), 42 deletions(-) diff --git a/src/devices/src/virtio/fs/augment_fs.rs b/src/devices/src/virtio/fs/augment_fs.rs index a694e5b96..ab7779508 100644 --- a/src/devices/src/virtio/fs/augment_fs.rs +++ b/src/devices/src/virtio/fs/augment_fs.rs @@ -17,7 +17,7 @@ use std::ffi::CStr; use std::ffi::CString; use std::io; use std::mem; -use std::sync::atomic::AtomicI32; +use std::sync::atomic::{AtomicI32, Ordering}; use std::sync::Arc; use std::sync::RwLock; use std::time::Duration; @@ -730,8 +730,18 @@ impl> FileSystem for AugmentFs out_size: u32, exit_code: &Arc, ) -> io::Result> { - self.inner.ioctl( - ctx, inode, handle, flags, cmd, arg, in_size, out_size, exit_code, - ) + // We can't use nix::request_code_none here since it's system-dependent + // and we need the value from Linux. 
+ const VIRTIO_IOC_EXIT_CODE_REQ: u32 = 0x7602; + + match cmd { + VIRTIO_IOC_EXIT_CODE_REQ => { + exit_code.store(arg as i32, Ordering::SeqCst); + Ok(Vec::new()) + } + _ => self.inner.ioctl( + ctx, inode, handle, flags, cmd, arg, in_size, out_size, exit_code, + ), + } } } diff --git a/src/devices/src/virtio/fs/linux/passthrough.rs b/src/devices/src/virtio/fs/linux/passthrough.rs index 2bfa46349..8272a7e01 100644 --- a/src/devices/src/virtio/fs/linux/passthrough.rs +++ b/src/devices/src/virtio/fs/linux/passthrough.rs @@ -16,7 +16,7 @@ use std::sync::{Arc, RwLock}; use std::time::Duration; use caps::{has_cap, CapSet, Capability}; -use nix::{request_code_none, request_code_read}; +use nix::request_code_read; use vm_memory::ByteValued; @@ -2101,10 +2101,10 @@ impl FileSystem for PassthroughFs { handle: Self::Handle, _flags: u32, cmd: u32, - arg: u64, + _arg: u64, _in_size: u32, out_size: u32, - exit_code: &Arc, + _exit_code: &Arc, ) -> io::Result> { const VIRTIO_IOC_MAGIC: u8 = b'v'; @@ -2116,10 +2116,6 @@ impl FileSystem for PassthroughFs { VIRTIO_IOC_EXPORT_FD_SIZE ) as u32; - const VIRTIO_IOC_TYPE_EXIT_CODE: u8 = 2; - const VIRTIO_IOC_EXIT_CODE_REQ: u32 = - request_code_none!(VIRTIO_IOC_MAGIC, VIRTIO_IOC_TYPE_EXIT_CODE) as u32; - match cmd { VIRTIO_IOC_EXPORT_FD_REQ => { if out_size as usize != VIRTIO_IOC_EXPORT_FD_SIZE { @@ -2150,10 +2146,6 @@ impl FileSystem for PassthroughFs { ret.extend_from_slice(&handle.to_ne_bytes()); Ok(ret) } - VIRTIO_IOC_EXIT_CODE_REQ => { - exit_code.store(arg as i32, Ordering::SeqCst); - Ok(Vec::new()) - } _ => Err(io::Error::from_raw_os_error(libc::EOPNOTSUPP)), } } diff --git a/src/devices/src/virtio/fs/macos/passthrough.rs b/src/devices/src/virtio/fs/macos/passthrough.rs index 3a0500735..cf43e0d0c 100644 --- a/src/devices/src/virtio/fs/macos/passthrough.rs +++ b/src/devices/src/virtio/fs/macos/passthrough.rs @@ -8,11 +8,11 @@ use std::collections::HashMap; use std::ffi::{CStr, CString}; use std::fs::File; use std::io; -use 
std::mem::{self, MaybeUninit}; +use std::mem::MaybeUninit; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; use std::ptr::null_mut; use std::str::FromStr; -use std::sync::atomic::{AtomicBool, AtomicI32, AtomicI64, AtomicU64, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicI64, AtomicU64, Ordering}; use std::sync::{Arc, Mutex, RwLock}; use std::time::Duration; @@ -2423,29 +2423,4 @@ impl FileSystem for PassthroughFs { Ok(()) } - - fn ioctl( - &self, - _ctx: Context, - _inode: Self::Inode, - _handle: Self::Handle, - _flags: u32, - cmd: u32, - arg: u64, - _in_size: u32, - _out_size: u32, - exit_code: &Arc, - ) -> io::Result> { - // We can't use nix::request_code_none here since it's system-dependent - // and we need the value from Linux. - const VIRTIO_IOC_EXIT_CODE_REQ: u32 = 0x7602; - - match cmd { - VIRTIO_IOC_EXIT_CODE_REQ => { - exit_code.store(arg as i32, Ordering::SeqCst); - Ok(Vec::new()) - } - _ => Err(io::Error::from_raw_os_error(libc::EOPNOTSUPP)), - } - } } From 00b5bddceb8443aeb6852a42461cb47577c9e26f Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Tue, 12 May 2026 15:32:19 +0200 Subject: [PATCH 12/14] tests: add augmentfs integration test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Boot a VM with a pure NullFs root — no host directory at all. Every file in the root (init.krun, guest-agent, .krun_config.json, test data) is injected as a virtual overlay, and /dev, /proc, /sys are virtual empty directories used as mount points. 
Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- tests/test_cases/src/lib.rs | 4 + tests/test_cases/src/test_augmentfs.rs | 304 +++++++++++++++++++++++++ 2 files changed, 308 insertions(+) create mode 100644 tests/test_cases/src/test_augmentfs.rs diff --git a/tests/test_cases/src/lib.rs b/tests/test_cases/src/lib.rs index 83f3b6b14..a8ad7eaa6 100644 --- a/tests/test_cases/src/lib.rs +++ b/tests/test_cases/src/lib.rs @@ -22,6 +22,9 @@ use test_multiport_console::TestMultiportConsole; mod test_virtiofs_root_ro; use test_virtiofs_root_ro::TestVirtiofsRootRo; +mod test_augmentfs; +use test_augmentfs::TestAugmentFs; + mod test_pjdfstest; use test_pjdfstest::TestPjdfstest; @@ -84,6 +87,7 @@ pub fn test_cases() -> Vec { TestCase::new("net-vmnet-helper", Box::new(TestNet::new_vmnet_helper())), TestCase::new("multiport-console", Box::new(TestMultiportConsole)), TestCase::new("virtiofs-root-ro", Box::new(TestVirtiofsRootRo)), + TestCase::new("augmentfs", Box::new(TestAugmentFs)), TestCase::new("virtiofs-misc", Box::new(TestVirtioFsMisc)), TestCase::new("pjdfstest", Box::new(TestPjdfstest)), TestCase::new("perf-net-passt-tx", Box::new(TestNetPerf::new_passt_tx())), diff --git a/tests/test_cases/src/test_augmentfs.rs b/tests/test_cases/src/test_augmentfs.rs new file mode 100644 index 000000000..34edce96b --- /dev/null +++ b/tests/test_cases/src/test_augmentfs.rs @@ -0,0 +1,304 @@ +// Test the AugmentFs overlay over a NullFs. +// +// Boots a VM with NO host filesystem — the root virtiofs is backed entirely +// by virtual inodes: init.krun (one-shot), the guest-agent binary (one-shot), +// a .krun_config.json (one-shot), persistent test files, and virtual +// directories as mount points for /dev, /proc, /sys. 
+ +use macros::{guest, host}; + +pub struct TestAugmentFs; + +fn make_test_payload() -> Vec { + (0..8192u32).map(|i| (i % 251) as u8).collect() +} + +#[host] +mod host { + use super::*; + + use crate::{krun_call, krun_call_u32}; + use crate::{Test, TestSetup}; + use krun_sys::*; + use std::ffi::CString; + use std::ptr::null_mut; + + impl Test for TestAugmentFs { + fn start_vm(self: Box, test_setup: TestSetup) -> anyhow::Result<()> { + let test_case = CString::new(test_setup.test_case)?; + + // Read the guest-agent binary into memory. Leaked because + // krun_start_enter never returns. + let guest_agent_path = std::env::var("KRUN_TEST_GUEST_AGENT_PATH") + .expect("KRUN_TEST_GUEST_AGENT_PATH not set"); + let guest_agent_bytes: &'static [u8] = + Vec::leak(std::fs::read(&guest_agent_path).expect("Failed to read guest-agent")); + + // Build JSON config: exec the guest-agent with our test name. + let json = format!( + r#"{{"args": ["/guest-agent", "{}"], "cwd": "/"}}"#, + test_case.to_str().unwrap() + ); + let json_bytes: &'static [u8] = Vec::leak(json.into_bytes()); + + // Deterministic test payload for range-read tests. + let payload: &'static [u8] = Vec::leak(make_test_payload()); + + // A small marker file to test persistent reads. + let marker: &'static [u8] = b"virtual-file-marker-content-12345"; + + unsafe { + krun_call!(krun_set_log_level(KRUN_LOG_LEVEL_TRACE))?; + let ctx = krun_call_u32!(krun_create_ctx())?; + krun_call!(krun_set_vm_config(ctx, 1, 512))?; + + // Disable the implicit init — we'll inject it ourselves. + krun_call!(krun_disable_implicit_init(ctx))?; + + // Get the default init binary. + let mut init_data: *const u8 = null_mut(); + let mut init_len: usize = 0; + krun_call!(krun_get_default_init(&mut init_data, &mut init_len))?; + + // Set up root with NO host directory (NullFs). 
+ krun_call!(krun_add_virtiofs3( + ctx, + c"/dev/root".as_ptr(), + std::ptr::null(), // NULL path → NullFs + 0, // no SHM window + false, // not read-only + ))?; + + // Virtual directories needed by init as mount points. + for dir in [c"dev", c"proc", c"sys"] { + krun_call!(krun_fs_add_overlay_dir( + ctx, + c"/dev/root".as_ptr(), + dir.as_ptr(), + 0o040_755, + ))?; + } + + // Overlay init.krun (one-shot, executable). + krun_call!(krun_fs_add_overlay_file( + ctx, + c"/dev/root".as_ptr(), + c"init.krun".as_ptr(), + init_data, + init_len, + 0o100_755, + true, + ))?; + + // Overlay guest-agent (one-shot, executable). After init + // execs it, the file should no longer be visible. + krun_call!(krun_fs_add_overlay_file( + ctx, + c"/dev/root".as_ptr(), + c"guest-agent".as_ptr(), + guest_agent_bytes.as_ptr(), + guest_agent_bytes.len(), + 0o100_755, + true, + ))?; + + // Overlay .krun_config.json (one-shot). + krun_call!(krun_fs_add_overlay_file( + ctx, + c"/dev/root".as_ptr(), + c".krun_config.json".as_ptr(), + json_bytes.as_ptr(), + json_bytes.len(), + 0o100_644, + true, + ))?; + + // Overlay a persistent marker file. + krun_call!(krun_fs_add_overlay_file( + ctx, + c"/dev/root".as_ptr(), + c"marker.txt".as_ptr(), + marker.as_ptr(), + marker.len(), + 0o100_644, + false, + ))?; + + // Overlay a deterministic 8 KiB payload for range-read tests. 
+ krun_call!(krun_fs_add_overlay_file( + ctx, + c"/dev/root".as_ptr(), + c"testdata.bin".as_ptr(), + payload.as_ptr(), + payload.len(), + 0o100_444, + false, + ))?; + + // --- Nested path test (2-level) --- + // etc/ -> etc/nested/ -> etc/nested/deep.txt + krun_call!(krun_fs_add_overlay_dir( + ctx, + c"/dev/root".as_ptr(), + c"etc".as_ptr(), + 0o040_755, + ))?; + krun_call!(krun_fs_add_overlay_dir( + ctx, + c"/dev/root".as_ptr(), + c"etc/nested".as_ptr(), + 0o040_755, + ))?; + let nested_content: &'static [u8] = b"deep-nested-content"; + krun_call!(krun_fs_add_overlay_file( + ctx, + c"/dev/root".as_ptr(), + c"etc/nested/deep.txt".as_ptr(), + nested_content.as_ptr(), + nested_content.len(), + 0o100_644, + false, + ))?; + + krun_call!(krun_set_workdir(ctx, c"/".as_ptr()))?; + krun_call!(krun_start_enter(ctx))?; + } + Ok(()) + } + } +} + +#[guest] +mod guest { + use super::*; + use crate::Test; + use std::fs; + use std::io::{ErrorKind, Read, Seek, SeekFrom}; + use std::path::Path; + + impl Test for TestAugmentFs { + fn in_guest(self: Box) { + // --- One-shot files should be gone --- + assert!( + !Path::new("/.krun_config.json").exists(), + ".krun_config.json should be gone (one-shot)" + ); + assert!( + !Path::new("/init.krun").exists(), + "init.krun should be gone (one-shot)" + ); + + // --- One-shot guest-agent can't see itself --- + assert!( + !Path::new("/guest-agent").exists(), + "guest-agent should be gone (one-shot)" + ); + + // --- Virtual directories should be accessible --- + // init already mounted over these, but let's verify they + // exist as directories (the mount points came from our + // virtual dir overlay). + for dir in ["/dev", "/proc", "/sys"] { + let meta = fs::metadata(dir).unwrap_or_else(|e| panic!("{dir} should exist: {e}")); + assert!(meta.is_dir(), "{dir} should be a directory"); + } + + // Verify the mounts actually worked by checking known entries. 
+ assert!( + Path::new("/dev/null").exists(), + "/dev/null should exist (devtmpfs)" + ); + assert!( + Path::new("/proc/self").exists(), + "/proc/self should exist (procfs)" + ); + assert!( + Path::new("/sys/kernel").exists(), + "/sys/kernel should exist (sysfs)" + ); + + // Verify directory listing works on each mounted fs. + let dev_entries: Vec<_> = fs::read_dir("/dev").expect("read_dir /dev").collect(); + assert!(!dev_entries.is_empty(), "/dev listing should not be empty"); + + let proc_entries: Vec<_> = fs::read_dir("/proc").expect("read_dir /proc").collect(); + assert!( + !proc_entries.is_empty(), + "/proc listing should not be empty" + ); + + let sys_entries: Vec<_> = fs::read_dir("/sys").expect("read_dir /sys").collect(); + assert!(!sys_entries.is_empty(), "/sys listing should not be empty"); + + // --- Persistent files should still exist --- + assert!(Path::new("/marker.txt").exists(), "marker.txt should exist"); + assert!( + Path::new("/testdata.bin").exists(), + "testdata.bin should exist" + ); + + // --- Read + verify marker content --- + let content = fs::read_to_string("/marker.txt").expect("read marker.txt"); + assert_eq!(content, "virtual-file-marker-content-12345"); + + // --- Repeated reads return the same data --- + let content2 = fs::read_to_string("/marker.txt").expect("re-read marker.txt"); + assert_eq!(content, content2, "repeated reads differ"); + + // --- Write should fail --- + let err = fs::OpenOptions::new() + .write(true) + .open("/marker.txt") + .expect_err("write-open should fail"); + assert_eq!(err.kind(), ErrorKind::PermissionDenied); + + // --- stat reports correct size --- + let meta = fs::metadata("/testdata.bin").expect("stat testdata.bin"); + assert_eq!(meta.len(), 8192, "testdata.bin size mismatch"); + + // --- Range reads on the 8 KiB payload --- + let expected = make_test_payload(); + let mut f = fs::File::open("/testdata.bin").expect("open testdata.bin"); + + // Full read. 
+ let got = fs::read("/testdata.bin").expect("full read"); + assert_eq!(got, expected, "full read mismatch"); + + // Read first 256 bytes. + let mut buf = vec![0u8; 256]; + f.read_exact(&mut buf).expect("read first 256"); + assert_eq!(buf, &expected[..256], "first 256 bytes mismatch"); + + // Seek to offset 4000, read 512 bytes. + f.seek(SeekFrom::Start(4000)).expect("seek to 4000"); + let mut buf = vec![0u8; 512]; + f.read_exact(&mut buf).expect("read at offset 4000"); + assert_eq!(buf, &expected[4000..4512], "range [4000..4512] mismatch"); + + // Seek to last 10 bytes. + f.seek(SeekFrom::End(-10)).expect("seek to end-10"); + let mut buf = vec![0u8; 10]; + f.read_exact(&mut buf).expect("read last 10"); + assert_eq!(buf, &expected[8182..8192], "last 10 bytes mismatch"); + + // Read past EOF should return 0 bytes. + f.seek(SeekFrom::Start(8192)).expect("seek to EOF"); + let mut buf = vec![0u8; 100]; + let n = f.read(&mut buf).expect("read past EOF"); + assert_eq!(n, 0, "read past EOF should return 0"); + + // Seek back to start, re-read, verify consistency. + f.seek(SeekFrom::Start(0)).expect("seek to start"); + let mut full = Vec::new(); + f.read_to_end(&mut full).expect("read_to_end"); + assert_eq!(full, expected, "read_to_end mismatch"); + + // --- Nested path test (2-level: etc/nested/deep.txt) --- + let deep = + fs::read_to_string("/etc/nested/deep.txt").expect("read /etc/nested/deep.txt"); + assert_eq!(deep, "deep-nested-content"); + + println!("OK"); + } + } +} From 2596f98ed1b91651179beb7013a8854311bce3e2 Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Tue, 12 May 2026 15:32:34 +0200 Subject: [PATCH 13/14] tests: add root-disk-remount integration test Boot from an ext4 block device via krun_set_root_disk_remount. The virtiofs root uses NullFs with init.krun and virtual mount-point directories overlaid. The guest verifies it pivoted to the block device root successfully. 
Uses dlsym for krun_add_disk/krun_set_root_disk_remount so the test compiles without BLK and skips gracefully at runtime. Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- tests/run.sh | 5 + tests/test_cases/src/lib.rs | 4 + .../test_cases/src/test_root_disk_remount.rs | 164 ++++++++++++++++++ 3 files changed, 173 insertions(+) create mode 100644 tests/test_cases/src/test_root_disk_remount.rs diff --git a/tests/run.sh b/tests/run.sh index 3d7b1e6ef..87bd65310 100755 --- a/tests/run.sh +++ b/tests/run.sh @@ -42,6 +42,11 @@ if [ "$OS" = "Darwin" ]; then export CARGO_TARGET_AARCH64_UNKNOWN_LINUX_MUSL_LINKER="clang" export CARGO_TARGET_AARCH64_UNKNOWN_LINUX_MUSL_RUSTFLAGS="-C link-arg=-target -C link-arg=aarch64-linux-gnu -C link-arg=-fuse-ld=lld -C link-arg=--sysroot=$SYSROOT -C link-arg=-static" echo "Cross-compiling guest-agent for $GUEST_TARGET" + + # e2fsprogs is keg-only on macOS; add it to PATH for mke2fs. + if [ -d "/opt/homebrew/opt/e2fsprogs/sbin" ]; then + export PATH="/opt/homebrew/opt/e2fsprogs/sbin:$PATH" + fi fi cargo build --target=$GUEST_TARGET -p guest-agent diff --git a/tests/test_cases/src/lib.rs b/tests/test_cases/src/lib.rs index a8ad7eaa6..0f0b88290 100644 --- a/tests/test_cases/src/lib.rs +++ b/tests/test_cases/src/lib.rs @@ -25,6 +25,9 @@ use test_virtiofs_root_ro::TestVirtiofsRootRo; mod test_augmentfs; use test_augmentfs::TestAugmentFs; +mod test_root_disk_remount; +use test_root_disk_remount::TestRootDiskRemount; + mod test_pjdfstest; use test_pjdfstest::TestPjdfstest; @@ -88,6 +91,7 @@ pub fn test_cases() -> Vec { TestCase::new("multiport-console", Box::new(TestMultiportConsole)), TestCase::new("virtiofs-root-ro", Box::new(TestVirtiofsRootRo)), TestCase::new("augmentfs", Box::new(TestAugmentFs)), + TestCase::new("root-disk-remount", Box::new(TestRootDiskRemount)), TestCase::new("virtiofs-misc", Box::new(TestVirtioFsMisc)), TestCase::new("pjdfstest", Box::new(TestPjdfstest)), TestCase::new("perf-net-passt-tx", 
Box::new(TestNetPerf::new_passt_tx())), diff --git a/tests/test_cases/src/test_root_disk_remount.rs b/tests/test_cases/src/test_root_disk_remount.rs new file mode 100644 index 000000000..56011698f --- /dev/null +++ b/tests/test_cases/src/test_root_disk_remount.rs @@ -0,0 +1,164 @@ +// Test that krun_set_root_disk_remount works with NullFs. +// +// Creates a tiny ext4 disk image containing only the guest-agent binary, +// boots from it via krun_set_root_disk_remount (which uses NullFs for the +// initial virtiofs root with init.krun overlaid), and verifies the guest +// successfully pivoted to the block device root. + +use macros::{guest, host}; + +pub struct TestRootDiskRemount; + +#[host] +mod host { + use super::*; + + use crate::{krun_call, krun_call_u32, ShouldRun}; + use crate::{Test, TestSetup}; + use krun_sys::*; + use nix::libc; + use std::ffi::CString; + use std::process::Command; + use std::ptr::null; + + type KrunAddDiskFn = unsafe extern "C" fn( + ctx_id: u32, + block_id: *const std::ffi::c_char, + disk_path: *const std::ffi::c_char, + read_only: bool, + ) -> i32; + + type KrunSetRootDiskRemountFn = unsafe extern "C" fn( + ctx_id: u32, + device: *const std::ffi::c_char, + fstype: *const std::ffi::c_char, + options: *const std::ffi::c_char, + ) -> i32; + + fn get_krun_add_disk() -> KrunAddDiskFn { + let symbol = CString::new("krun_add_disk").unwrap(); + let ptr = unsafe { libc::dlsym(libc::RTLD_DEFAULT, symbol.as_ptr()) }; + assert!(!ptr.is_null(), "krun_add_disk not found"); + unsafe { std::mem::transmute(ptr) } + } + + fn get_krun_set_root_disk_remount() -> KrunSetRootDiskRemountFn { + let symbol = CString::new("krun_set_root_disk_remount").unwrap(); + let ptr = unsafe { libc::dlsym(libc::RTLD_DEFAULT, symbol.as_ptr()) }; + assert!(!ptr.is_null(), "krun_set_root_disk_remount not found"); + unsafe { std::mem::transmute(ptr) } + } + + fn create_disk_image(guest_agent_path: &str, output_path: &str) { + // Populate from a staging directory using mke2fs -d 
(no root needed). + let staging = format!("{output_path}.staging"); + std::fs::create_dir_all(&staging).expect("mkdir staging"); + + std::fs::copy(guest_agent_path, format!("{staging}/guest-agent")) + .expect("copy guest-agent"); + + // Marker file to verify the guest booted from the block device. + std::fs::write( + format!("{staging}/block-marker"), + "booted-from-block-device", + ) + .expect("write marker"); + + let status = Command::new("mke2fs") + .args(["-q", "-t", "ext4", "-d", &staging, output_path, "32M"]) + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .status() + .expect("mke2fs failed"); + assert!(status.success(), "mke2fs failed"); + + std::fs::remove_dir_all(&staging).expect("cleanup staging"); + } + + impl Test for TestRootDiskRemount { + fn should_run(&self) -> ShouldRun { + if unsafe { krun_call_u32!(krun_has_feature(KRUN_FEATURE_BLK.into())) }.ok() != Some(1) + { + return ShouldRun::No("libkrun compiled without BLK"); + } + ShouldRun::Yes + } + + fn start_vm(self: Box, test_setup: TestSetup) -> anyhow::Result<()> { + let krun_add_disk = get_krun_add_disk(); + let krun_set_root_disk_remount = get_krun_set_root_disk_remount(); + + let guest_agent_path = std::env::var("KRUN_TEST_GUEST_AGENT_PATH") + .expect("KRUN_TEST_GUEST_AGENT_PATH not set"); + + let disk_path = format!("{}/rootfs.ext4", test_setup.tmp_dir.display()); + create_disk_image(&guest_agent_path, &disk_path); + + let c_disk_path = CString::new(disk_path)?; + let test_case = CString::new(test_setup.test_case)?; + + unsafe { + krun_call!(krun_set_log_level(KRUN_LOG_LEVEL_TRACE))?; + let ctx = krun_call_u32!(krun_create_ctx())?; + krun_call!(krun_set_vm_config(ctx, 1, 512))?; + + let argv = [test_case.as_ptr(), null()]; + let envp = [null()]; + krun_call!(krun_set_exec( + ctx, + c"/guest-agent".as_ptr(), + argv.as_ptr(), + envp.as_ptr(), + ))?; + + krun_call!(krun_set_workdir(ctx, c"/".as_ptr()))?; + + // Add a block device with the ext4 image. 
+ krun_call!(krun_add_disk( + ctx, + c"vda".as_ptr(), + c_disk_path.as_ptr(), + false, + ))?; + + // Configure block device as root, pivot from NullFs. + krun_call!(krun_set_root_disk_remount( + ctx, + c"/dev/vda".as_ptr(), + c"ext4".as_ptr(), + std::ptr::null(), + ))?; + + krun_call!(krun_start_enter(ctx))?; + } + Ok(()) + } + } +} + +#[guest] +mod guest { + use super::*; + use crate::Test; + use std::fs; + use std::path::Path; + + impl Test for TestRootDiskRemount { + fn in_guest(self: Box) { + // Verify we're running from the block device root. + let marker = fs::read_to_string("/block-marker") + .expect("Failed to read /block-marker — not on block device root?"); + assert_eq!(marker, "booted-from-block-device"); + + // The init.krun virtual file should be gone (one-shot, and we + // pivoted away from the NullFs root anyway). + assert!(!Path::new("/init.krun").exists()); + + // /proc and /dev should be mounted (init re-mounts after pivot). + assert!(Path::new("/proc/self").exists(), "/proc/self missing"); + assert!(Path::new("/dev/null").exists(), "/dev/null missing"); + + println!("OK"); + } + } +} From da074abbf2739d7276f86f3f80d95442c7aad878 Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Wed, 13 May 2026 13:09:22 +0200 Subject: [PATCH 14/14] CI: enable BLK=1 and install e2fsprogs in integration tests Build and test with the block device feature so the root-disk-remount test runs in CI. Install e2fsprogs (provides mke2fs) which the test needs to create the ext4 disk image. 
Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- .github/workflows/integration_tests.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 811ecf3fd..24a8ee15f 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -15,7 +15,7 @@ jobs: run: rustup target add x86_64-unknown-linux-musl - name: Build and install libkrun to test prefix - run: make test-prefix NET=1 + run: make test-prefix NET=1 BLK=1 - name: Clippy (test_cases guest) run: | @@ -45,7 +45,7 @@ jobs: sudo usermod -a -G kvm $USER - name: Install additional packages - run: sudo apt-get install -y --no-install-recommends build-essential patchelf pkg-config net-tools buildah dnsmasq iperf3 + run: sudo apt-get install -y --no-install-recommends build-essential patchelf pkg-config net-tools buildah dnsmasq iperf3 e2fsprogs - name: Install passt from source run: | @@ -58,7 +58,7 @@ jobs: run: TAG=`curl -sL https://api.github.com/repos/containers/libkrunfw/releases/latest |jq -r .tag_name` && curl -L -o /tmp/libkrunfw-x86_64.tgz https://github.com/containers/libkrunfw/releases/download/$TAG/libkrunfw-x86_64.tgz && mkdir tmp && tar xf /tmp/libkrunfw-x86_64.tgz -C tmp && sudo mv tmp/lib64/* /lib/x86_64-linux-gnu - name: Integration tests - run: KRUN_ENOMEM_WORKAROUND=1 KRUN_TEST_BASE_DIR=/tmp/libkrun-tests make test NET=1 IPERF_DURATION=3 TEST_FLAGS="--keep-all --github-summary" + run: KRUN_ENOMEM_WORKAROUND=1 KRUN_TEST_BASE_DIR=/tmp/libkrun-tests make test NET=1 BLK=1 IPERF_DURATION=3 TEST_FLAGS="--keep-all --github-summary" - name: Upload test logs if: always() @@ -84,7 +84,7 @@ jobs: run: rustup target add aarch64-unknown-linux-musl - name: Build and install libkrun to test prefix - run: make test-prefix NET=1 + run: make test-prefix NET=1 BLK=1 - name: Clippy (test_cases guest) run: | @@ -107,7 +107,7 @@ jobs: cargo clippy 
--locked --target aarch64-unknown-linux-musl -p guest-agent -- -D warnings - name: Install additional packages - run: sudo apt-get install -y --no-install-recommends build-essential patchelf pkg-config net-tools dnsmasq iperf3 git uidmap + run: sudo apt-get install -y --no-install-recommends build-essential patchelf pkg-config net-tools dnsmasq iperf3 git uidmap e2fsprogs - name: Install passt from source run: | @@ -123,7 +123,7 @@ jobs: run: rm -fr /tmp/libkrun-tests - name: Integration tests - run: KRUN_ENOMEM_WORKAROUND=1 KRUN_NO_UNSHARE=1 KRUN_TEST_BASE_DIR=/tmp/libkrun-tests make test NET=1 IPERF_DURATION=3 TEST_FLAGS="--keep-all --github-summary" + run: KRUN_ENOMEM_WORKAROUND=1 KRUN_NO_UNSHARE=1 KRUN_TEST_BASE_DIR=/tmp/libkrun-tests make test NET=1 BLK=1 IPERF_DURATION=3 TEST_FLAGS="--keep-all --github-summary" - name: Upload test logs if: always()