diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 811ecf3fd..24a8ee15f 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -15,7 +15,7 @@ jobs: run: rustup target add x86_64-unknown-linux-musl - name: Build and install libkrun to test prefix - run: make test-prefix NET=1 + run: make test-prefix NET=1 BLK=1 - name: Clippy (test_cases guest) run: | @@ -45,7 +45,7 @@ jobs: sudo usermod -a -G kvm $USER - name: Install additional packages - run: sudo apt-get install -y --no-install-recommends build-essential patchelf pkg-config net-tools buildah dnsmasq iperf3 + run: sudo apt-get install -y --no-install-recommends build-essential patchelf pkg-config net-tools buildah dnsmasq iperf3 e2fsprogs - name: Install passt from source run: | @@ -58,7 +58,7 @@ jobs: run: TAG=`curl -sL https://api.github.com/repos/containers/libkrunfw/releases/latest |jq -r .tag_name` && curl -L -o /tmp/libkrunfw-x86_64.tgz https://github.com/containers/libkrunfw/releases/download/$TAG/libkrunfw-x86_64.tgz && mkdir tmp && tar xf /tmp/libkrunfw-x86_64.tgz -C tmp && sudo mv tmp/lib64/* /lib/x86_64-linux-gnu - name: Integration tests - run: KRUN_ENOMEM_WORKAROUND=1 KRUN_TEST_BASE_DIR=/tmp/libkrun-tests make test NET=1 IPERF_DURATION=3 TEST_FLAGS="--keep-all --github-summary" + run: KRUN_ENOMEM_WORKAROUND=1 KRUN_TEST_BASE_DIR=/tmp/libkrun-tests make test NET=1 BLK=1 IPERF_DURATION=3 TEST_FLAGS="--keep-all --github-summary" - name: Upload test logs if: always() @@ -84,7 +84,7 @@ jobs: run: rustup target add aarch64-unknown-linux-musl - name: Build and install libkrun to test prefix - run: make test-prefix NET=1 + run: make test-prefix NET=1 BLK=1 - name: Clippy (test_cases guest) run: | @@ -107,7 +107,7 @@ jobs: cargo clippy --locked --target aarch64-unknown-linux-musl -p guest-agent -- -D warnings - name: Install additional packages - run: sudo apt-get install -y --no-install-recommends build-essential patchelf pkg-config net-tools dnsmasq iperf3 git uidmap + run: sudo apt-get install -y --no-install-recommends build-essential patchelf pkg-config net-tools dnsmasq iperf3 git uidmap e2fsprogs - name: Install passt from source run: | @@ -123,7 +123,7 @@ jobs: run: rm -fr /tmp/libkrun-tests - name: Integration tests - run: KRUN_ENOMEM_WORKAROUND=1 KRUN_NO_UNSHARE=1 KRUN_TEST_BASE_DIR=/tmp/libkrun-tests make test NET=1 IPERF_DURATION=3 TEST_FLAGS="--keep-all --github-summary" + run: KRUN_ENOMEM_WORKAROUND=1 KRUN_NO_UNSHARE=1 KRUN_TEST_BASE_DIR=/tmp/libkrun-tests make test NET=1 BLK=1 IPERF_DURATION=3 TEST_FLAGS="--keep-all --github-summary" - name: Upload test logs if: always() diff --git a/Cargo.lock b/Cargo.lock index ecb90d195..0b419f3b8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -527,6 +527,14 @@ dependencies = [ "serde_core", ] +[[package]] +name = "init-blob" +version = "0.1.0-1.18.0" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "iocuddle" version = "0.1.1" @@ -856,6 +864,7 @@ version = "1.18.0" dependencies = [ "crossbeam-channel", "env_logger", + "init-blob", "krun-aws-nitro", "krun-devices", "krun-display", diff --git a/Cargo.toml b/Cargo.toml index 00b06aa00..35b1dbba4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,7 @@ [workspace] members = [ "src/libkrun", + "src/init-blob", "src/input", "src/display", "src/utils", diff --git a/examples/boot_efi.c b/examples/boot_efi.c index 5105d46df..ccbe313b5 100644 --- a/examples/boot_efi.c +++ b/examples/boot_efi.c @@ -169,7 +169,7 @@ int main(int argc, char *const argv[]) } // Set the log level to "off". - err = krun_set_log_level(0); + err = krun_init_log(KRUN_LOG_TARGET_DEFAULT, KRUN_LOG_LEVEL_OFF, KRUN_LOG_STYLE_AUTO, 0); if (err) { errno = -err; perror("Error configuring log level"); @@ -191,13 +191,19 @@ int main(int argc, char *const argv[]) return -1; } + if (err = krun_add_virtio_console_default(ctx_id, STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO)) { + errno = -err; + perror("Error configuring console"); + return -1; + } + if (err = krun_set_firmware(ctx_id, cmdline.efi_fw)) { errno = -err; perror("Error configuring EFI FW path"); return -1; } - if (err = krun_set_root_disk(ctx_id, cmdline.disk_image)) { + if (err = krun_add_disk3(ctx_id, "root", cmdline.disk_image, KRUN_DISK_FORMAT_RAW, false, false, KRUN_SYNC_FULL)) { errno = -err; perror("Error configuring disk image"); return -1; diff --git a/examples/chroot_vm.c b/examples/chroot_vm.c index b0ab0a05e..03171ba7b 100644 --- a/examples/chroot_vm.c +++ b/examples/chroot_vm.c @@ -308,6 +308,12 @@ int main(int argc, char *const argv[]) return -1; } + if (err = krun_add_virtio_console_default(ctx_id, STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO)) { + errno = -err; + perror("Error configuring console"); + return -1; + } + // Configure vhost-user RNG if requested if (cmdline.vhost_user_rng_socket != NULL) { // Test sentinel-terminated array: auto-detect queue count, use custom size @@ -357,14 +363,8 @@ int main(int argc, char *const argv[]) printf("Using vhost-user sound backend at %s\n", cmdline.vhost_user_snd_socket); } - // Configure vhost-user vsock if requested + // Configure vsock: either vhost-user or built-in with TSI if (cmdline.vhost_user_vsock_socket != NULL) { - // Disable the implicit vsock device to avoid conflict - if (!check_krun_error(krun_disable_implicit_vsock(ctx_id), - "Error disabling implicit vsock")) { - return -1; - } - if (!check_krun_error(krun_add_vhost_user_device(ctx_id, KRUN_VIRTIO_DEVICE_VSOCK, cmdline.vhost_user_vsock_socket, NULL, KRUN_VHOST_USER_VSOCK_NUM_QUEUES, @@ -413,14 +413,22 @@ int main(int argc, char *const argv[]) uint32_t virgl_flags = VIRGLRENDERER_USE_EGL | VIRGLRENDERER_DRM | VIRGLRENDERER_THREAD_SYNC | VIRGLRENDERER_USE_ASYNC_FENCE_CB; - if (err = krun_set_gpu_options(ctx_id, virgl_flags)) { + if (err = krun_set_gpu_options2(ctx_id, virgl_flags, (uint64_t)1 << 33)) { errno = -err; perror("Error configuring gpu"); return -1; } + // Add built-in vsock with TSI when not using vhost-user-vsock + if (cmdline.vhost_user_vsock_socket == NULL) { + if (err = krun_add_vsock(ctx_id, KRUN_TSI_HIJACK_INET)) { + errno = -err; + perror("Error configuring vsock"); + return -1; + } + } + // Map port 18000 in the host to 8000 in the guest (if networking uses TSI) - // Skip port mapping when using vhost-user-vsock (TSI requires built-in vsock) if (cmdline.net_mode == NET_MODE_TSI && cmdline.vhost_user_vsock_socket == NULL) { if (err = krun_set_port_map(ctx_id, &port_map[0])) { errno = -err; diff --git a/examples/consoles.c b/examples/consoles.c index 30a17a492..23ee8226f 100644 --- a/examples/consoles.c +++ b/examples/consoles.c @@ -119,18 +119,12 @@ int main(int argc, char *const argv[]) const char *const *command_args = (argc > 3) ? (const char *const *)&argv[3] : NULL; const char *const envp[] = { 0 }; - krun_set_log_level(KRUN_LOG_LEVEL_WARN); + krun_init_log(KRUN_LOG_TARGET_DEFAULT, KRUN_LOG_LEVEL_WARN, KRUN_LOG_STYLE_AUTO, 0); int err; int ctx_id = krun_create_ctx(); if (ctx_id < 0) { errno = -ctx_id; perror("krun_create_ctx"); return 1; } - if ((err = krun_disable_implicit_console(ctx_id))) { - errno = -err; - perror("krun_disable_implicit_console"); - return 1; - } - int console_id = krun_add_virtio_console_multiport(ctx_id); if (console_id < 0) { errno = -console_id; diff --git a/examples/external_kernel.c b/examples/external_kernel.c index 14649881d..81d3f5c72 100644 --- a/examples/external_kernel.c +++ b/examples/external_kernel.c @@ -218,7 +218,7 @@ int main(int argc, char *const argv[]) } // Set the log level to "off". - err = krun_set_log_level(0); + err = krun_init_log(KRUN_LOG_TARGET_DEFAULT, KRUN_LOG_LEVEL_OFF, KRUN_LOG_STYLE_AUTO, 0); if (err) { errno = -err; @@ -243,9 +243,16 @@ int main(int argc, char *const argv[]) return -1; } + if (err = krun_add_virtio_console_default(ctx_id, STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO)) + { + errno = -err; + perror("Error configuring console"); + return -1; + } + if (cmdline.boot_disk) { - if (err = krun_add_disk(ctx_id, "boot", cmdline.boot_disk, 0)) + if (err = krun_add_disk3(ctx_id, "boot", cmdline.boot_disk, KRUN_DISK_FORMAT_RAW, 0, 0, KRUN_SYNC_FULL)) { errno = -err, perror("Error configuring boot disk"); @@ -254,7 +261,7 @@ int main(int argc, char *const argv[]) } if (cmdline.data_disk) { - if (err = krun_add_disk(ctx_id, "data", cmdline.data_disk, 0)) + if (err = krun_add_disk3(ctx_id, "data", cmdline.data_disk, KRUN_DISK_FORMAT_RAW, 0, 0, KRUN_SYNC_FULL)) { errno = -err, perror("Error configuring data disk"); diff --git a/examples/gui_vm/src/main.rs b/examples/gui_vm/src/main.rs index 660a2e21a..851b8e364 100644 --- a/examples/gui_vm/src/main.rs +++ b/examples/gui_vm/src/main.rs @@ -6,23 +6,24 @@ use gtk_display::{ }; use krun_sys::{ - KRUN_LOG_LEVEL_TRACE, KRUN_LOG_LEVEL_WARN, KRUN_LOG_STYLE_ALWAYS, KRUN_LOG_TARGET_DEFAULT, - VIRGLRENDERER_RENDER_SERVER, VIRGLRENDERER_THREAD_SYNC, VIRGLRENDERER_USE_ASYNC_FENCE_CB, - VIRGLRENDERER_USE_EGL, VIRGLRENDERER_VENUS, krun_add_display, krun_add_input_device, - krun_add_input_device_fd, krun_create_ctx, krun_display_set_dpi, + krun_add_display, krun_add_input_device, krun_add_input_device_fd, + krun_add_virtio_console_default, krun_create_ctx, krun_display_set_dpi, krun_display_set_physical_size, krun_display_set_refresh_rate, krun_init_log, krun_set_display_backend, krun_set_exec, krun_set_gpu_options2, krun_set_root, - krun_set_vm_config, krun_start_enter, + krun_set_vm_config, krun_start_enter, KRUN_LOG_LEVEL_TRACE, KRUN_LOG_LEVEL_WARN, + KRUN_LOG_STYLE_ALWAYS, KRUN_LOG_TARGET_DEFAULT, VIRGLRENDERER_RENDER_SERVER, + VIRGLRENDERER_THREAD_SYNC, VIRGLRENDERER_USE_ASYNC_FENCE_CB, VIRGLRENDERER_USE_EGL, + VIRGLRENDERER_VENUS, }; use log::LevelFilter; use regex::{Captures, Regex}; -use std::ffi::{CString, c_void}; +use std::ffi::{c_void, CString}; use std::fmt::Display; use std::fs::{File, OpenOptions}; use std::mem::size_of_val; use anyhow::Context; -use std::os::fd::IntoRawFd; +use std::os::fd::{AsRawFd, IntoRawFd}; use std::path::PathBuf; use std::process::exit; use std::ptr::null; @@ -150,6 +151,13 @@ fn krun_thread( krun_call!(krun_set_vm_config(ctx, 4, 4096))?; + krun_call!(krun_add_virtio_console_default( + ctx, + std::io::stdin().as_raw_fd(), + std::io::stdout().as_raw_fd(), + std::io::stderr().as_raw_fd(), + ))?; + krun_call!(krun_set_gpu_options2( ctx, VIRGLRENDERER_USE_EGL diff --git a/examples/launch-tee.c b/examples/launch-tee.c index 063cdd5f3..9da295ab6 100644 --- a/examples/launch-tee.c +++ b/examples/launch-tee.c @@ -45,7 +45,7 @@ int main(int argc, char *const argv[]) } // Set the log level to "error". - err = krun_set_log_level(1); + err = krun_init_log(KRUN_LOG_TARGET_DEFAULT, KRUN_LOG_LEVEL_ERROR, KRUN_LOG_STYLE_AUTO, 0); if (err) { errno = -err; perror("Error configuring log level"); @@ -67,8 +67,14 @@ int main(int argc, char *const argv[]) return -1; } + if (err = krun_add_virtio_console_default(ctx_id, STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO)) { + errno = -err; + perror("Error configuring console"); + return -1; + } + // Use the first command line argument as the disk image containing the root fs. - if (err = krun_set_root_disk(ctx_id, argv[1])) { + if (err = krun_add_disk3(ctx_id, "root", argv[1], KRUN_DISK_FORMAT_RAW, false, false, KRUN_SYNC_FULL)) { errno = -err; perror("Error configuring root disk image"); return -1; @@ -114,7 +120,7 @@ int main(int argc, char *const argv[]) return -1; } - if (err = krun_set_data_disk(ctx_id, argv[3])) { + if (err = krun_add_disk3(ctx_id, "data", argv[3], KRUN_DISK_FORMAT_RAW, false, false, KRUN_SYNC_FULL)) { errno = -err; perror("Error configuring the TEE config data disk"); return -1; diff --git a/examples/nitro.c b/examples/nitro.c index 2a80e02fd..379e28d89 100644 --- a/examples/nitro.c +++ b/examples/nitro.c @@ -180,7 +180,7 @@ int main(int argc, char *const argv[]) // Enable debug output if configured. log_level = (cmdline.debug) ? KRUN_LOG_LEVEL_DEBUG : KRUN_LOG_LEVEL_OFF; - err = krun_set_log_level(log_level); + err = krun_init_log(KRUN_LOG_TARGET_DEFAULT, log_level, KRUN_LOG_STYLE_AUTO, 0); if (err) { errno = -err; perror("Error configuring log level"); @@ -203,9 +203,9 @@ int main(int argc, char *const argv[]) return -1; } - if (err = krun_set_console_output(ctx_id, "/dev/stdout")) { + if (err = krun_add_virtio_console_default(ctx_id, -1, STDOUT_FILENO, -1)) { errno = -err; - perror("Error configuring the console output"); + perror("Error configuring the console"); return -1; } diff --git a/include/libkrun.h b/include/libkrun.h index 3004110f6..9356e0d96 100644 --- a/include/libkrun.h +++ b/include/libkrun.h @@ -10,24 +10,6 @@ extern "C" { #include #include -/** - * Sets the log level for the library. - * - * Arguments: - * "level" can be one of the following values: - * 0: Off - * 1: Error - * 2: Warn - * 3: Info - * 4: Debug - * 5: Trace - * - * Returns: - * Zero on success or a negative error number on failure. - */ -int32_t krun_set_log_level(uint32_t level); - - #define KRUN_LOG_TARGET_DEFAULT -1 #define KRUN_LOG_LEVEL_OFF 0 @@ -118,60 +100,7 @@ int32_t krun_set_vm_config(uint32_t ctx_id, uint8_t num_vcpus, uint32_t ram_mib) */ int32_t krun_set_root(uint32_t ctx_id, const char *root_path); -/** - * DEPRECATED. Use krun_add_disk instead. - * - * Sets the path to the disk image that contains the file-system to be used as root for the microVM. - * The only supported image format is "raw". - * - * Arguments: - * "ctx_id" - the configuration context ID. - * "disk_path" - a null-terminated string representing the path leading to the disk image that - * contains the root file-system. - * - * Returns: - * Zero on success or a negative error number on failure. - */ -int32_t krun_set_root_disk(uint32_t ctx_id, const char *disk_path); -/** - * DEPRECATED. Use krun_add_disk instead. - * - * Sets the path to the disk image that contains the file-system to be used as - * a data partition for the microVM. The only supported image format is "raw". - * - * Arguments: - * "ctx_id" - the configuration context ID. - * "disk_path" - a null-terminated string representing the path leading to the disk image that - * contains the root file-system. - * - * Returns: - * Zero on success or a negative error number on failure. - */ -int32_t krun_set_data_disk(uint32_t ctx_id, const char *disk_path); - -/** - * Adds a disk image to be used as a general partition for the microVM. The only supported image - * format is "raw". - * - * This API is mutually exclusive with the deprecated krun_set_root_disk and - * krun_set_data_disk methods and must not be used together. - * - * This function deliberately only handles images in the Raw format, because it doesn't allow - * specifying an image format, and probing an image's format is dangerous. For more information, - * see the security note on `krun_add_disk2`, which allows opening non-Raw images. - * - * Arguments: - * "ctx_id" - the configuration context ID. - * "block_id" - a null-terminated string representing the partition. - * "disk_path" - a null-terminated string representing the path leading to the disk image. - * "read_only" - whether the mount should be read-only. Required if the caller does not have - * write permissions (for disk images in /usr/share). - * - * Returns: - * Zero on success or a negative error number on failure. - */ -int32_t krun_add_disk(uint32_t ctx_id, const char *block_id, const char *disk_path, bool read_only); /* Supported disk image formats */ #define KRUN_DISK_FORMAT_RAW 0 @@ -179,57 +108,6 @@ int32_t krun_add_disk(uint32_t ctx_id, const char *block_id, const char *disk_pa /* Note: Only supports FLAT/ZERO formats without delta links */ #define KRUN_DISK_FORMAT_VMDK 2 -/** - * Adds a disk image to be used as a general partition for the microVM. The supported - * image formats are: "raw" and "qcow2". - * - * This API is mutually exclusive with the deprecated krun_set_root_disk and - * krun_set_data_disk methods and must not be used together. - * - * SECURITY NOTE: - * Non-Raw images can reference other files, which libkrun will automatically open, and to which the - * guest will have access. Libkrun should therefore never be asked to open an image in a non-Raw - * format when it doesn't come from a fully trustworthy source. - * - * Consequently, probing an image's format is quite dangerous and to be avoided if at all possible, - * which is why libkrun provides no facilities for doing so. If it's not clear what format an image - * has, it may also not be clear whether it can be trusted to not reference files to which the guest - * shouldn't have access. - * - * If probing absolutely can't be avoided, it must only be done on images that are fully trusted, i.e. - * before a potentially untrusted guest had write access to it. Specifically, consider that a guest has - * full access to all of a Raw image, and can therefore turn it into a file in an arbitrary format, for - * example, into a Qcow2 image, referencing and granting a malicious guest access to arbitrary files. - * To hand a Raw image to an untrusted and potentially malicious guest, and then to re-probe it after - * the guest was able to write to it (when it can no longer be trusted), would therefore be a severe - * security vulnerability. - * - * Therefore, after having probed a yet fully trusted image once, the result must be remembered so the - * image will from then on always be opened in the format that was detected originally. When adhering - * to this, a guest can write anything they want to a Raw image, it's always going to be opened as a - * Raw image, preventing the security vulnerability outlined above. - * - * However, if at all possible, the image format should be explicitly selected based on knowledge - * obtained separately from the pure image data, for example by the user. - * - * Arguments: - * "ctx_id" - the configuration context ID. - * "block_id" - a null-terminated string representing the partition. - * "disk_path" - a null-terminated string representing the path leading to the disk image. - * "disk_format" - the disk image format (i.e. KRUN_DISK_FORMAT_{RAW, QCOW2}) - * "read_only" - whether the mount should be read-only. Required if the caller does not have - * write permissions (for disk images in /usr/share). - * - * Returns: - * Zero on success or a negative error number on failure. - */ -int32_t krun_add_disk2(uint32_t ctx_id, - const char *block_id, - const char *disk_path, - uint32_t disk_format, - bool read_only); - - /* Supported sync modes */ /** @@ -254,11 +132,11 @@ int32_t krun_add_disk2(uint32_t ctx_id, /** * Adds a disk image to be used as a general partition for the microVM. * - * This API is mutually exclusive with the deprecated krun_set_root_disk and - * krun_set_data_disk methods and must not be used together. - * * SECURITY NOTE: - * See the security note for `krun_add_disk2`. + * Non-Raw images can reference other files, which libkrun will automatically + * open, and to which the guest will have access. Libkrun should therefore never + * be asked to open an image in a non-Raw format when it doesn't come from a + * fully trustworthy source. * * Arguments: * "ctx_id" - the configuration context ID. @@ -283,22 +161,6 @@ int32_t krun_add_disk2(uint32_t ctx_id, bool direct_io, uint32_t sync_mode); -/** - * NO LONGER SUPPORTED. DO NOT USE. - * - * Configures the mapped volumes for the microVM. Only supported on macOS, on Linux use - * user_namespaces and bind-mounts instead. Not available in libkrun-SEV. - * - * Arguments: - * "ctx_id" - the configuration context ID. - * "mapped_volumes" - an array of string pointers with format "host_path:guest_path" representing - * the volumes to be mapped inside the microVM - * - * Returns: - * Zero on success or a negative error number on failure. - */ -int32_t krun_set_mapped_volumes(uint32_t ctx_id, const char *const mapped_volumes[]); - /** * Adds an independent virtio-fs device pointing to a host's directory with a tag. * @@ -306,40 +168,6 @@ int32_t krun_set_mapped_volumes(uint32_t ctx_id, const char *const mapped_volume * "ctx_id" - the configuration context ID. * "c_tag" - tag to identify the filesystem in the guest. * "c_path" - full path to the directory in the host to be exposed to the guest. - * - * Returns: - * Zero on success or a negative error number on failure. - */ -int32_t krun_add_virtiofs(uint32_t ctx_id, - const char *c_tag, - const char *c_path); - -/** - * Adds an independent virtio-fs device pointing to a host's directory with a tag. This - * variant allows specifying the size of the DAX window. - * - * Arguments: - * "ctx_id" - the configuration context ID. - * "c_tag" - tag to identify the filesystem in the guest. - * "c_path" - full path to the directory in the host to be exposed to the guest. - * "shm_size" - size of the DAX SHM window in bytes. - * - * Returns: - * Zero on success or a negative error number on failure. - */ -int32_t krun_add_virtiofs2(uint32_t ctx_id, - const char *c_tag, - const char *c_path, - uint64_t shm_size); - -/** - * Adds an independent virtio-fs device pointing to a host's directory with a tag. This - * variant allows specifying the size of the DAX window and a read-only flag. - * - * Arguments: - * "ctx_id" - the configuration context ID. - * "c_tag" - tag to identify the filesystem in the guest. - * "c_path" - full path to the directory in the host to be exposed to the guest. * "shm_size" - size of the DAX SHM window in bytes. * "read_only" - if true, the filesystem will be exposed as read-only to the guest. * @@ -371,7 +199,7 @@ int32_t krun_add_virtiofs3(uint32_t ctx_id, #define NET_FEATURE_HOST_TSO6 1 << 12 #define NET_FEATURE_HOST_UFO 1 << 14 -/* These are the features enabled by krun_set_passt_fd and krun_set_gvproxy_path. */ +/* These are the default features used by krun_add_net_unixstream and krun_add_net_unixgram. */ #define COMPAT_NET_FEATURES NET_FEATURE_CSUM | NET_FEATURE_GUEST_CSUM | \ NET_FEATURE_GUEST_TSO4 | NET_FEATURE_GUEST_UFO | \ NET_FEATURE_HOST_TSO4 | NET_FEATURE_HOST_UFO @@ -492,57 +320,6 @@ int32_t krun_add_net_tap(uint32_t ctx_id, uint32_t features, uint32_t flags); -/** - * DEPRECATED. Use krun_add_net_unixstream instead. - * - * Configures the networking to use passt. - * Call to this function disables TSI backend to use passt instead. - * - * Arguments: - * "ctx_id" - the configuration context ID. - * "fd" - a file descriptor to communicate with passt - * - * Notes: - * If you never call this function, networking uses the TSI backend. - * This function should be called before krun_set_port_map. - * - * Returns: - * Zero on success or a negative error number on failure. - */ -int32_t krun_set_passt_fd(uint32_t ctx_id, int fd); - -/** - * DEPRECATED. Use krun_add_net_unixgram instead. - * - * Configures the networking to use gvproxy in vfkit mode. - * Call to this function disables TSI backend to use gvproxy instead. - * - * Arguments: - * "ctx_id" - the configuration context ID. - * "c_path" - a null-terminated string representing the path for - * gvproxy's listen-vfkit unixdgram socket. - * - * Notes: - * If you never call this function, networking uses the TSI backend. - * This function should be called before krun_set_port_map. - * - * Returns: - * Zero on success or a negative error number on failure. - */ -int32_t krun_set_gvproxy_path(uint32_t ctx_id, char *c_path); - -/** - * Sets the MAC address for the virtio-net device when using the passt backend. - * - * Arguments: - * "ctx_id" - the configuration context ID. - * "mac" - MAC address as an array of 6 uint8_t entries. - * - * Returns: - * Zero on success or a negative error number on failure. - */ -int32_t krun_set_net_mac(uint32_t ctx_id, uint8_t *const c_mac); - /** * Configures a map of host to guest TCP ports for the microVM. * @@ -565,8 +342,9 @@ int32_t krun_set_net_mac(uint32_t ctx_id, uint8_t *const c_mac); * means that for a map such as "8080:80", applications running inside the guest will also * need to access the service through the "8080" port. * - * If past networking mode is used (krun_set_passt_fd was called), port mapping is not supported - * as an API of libkrun (but you can still do port mapping using command line arguments of passt) + * If passt networking mode is used, port mapping is not supported as an API + * of libkrun (but you can still do port mapping using command line arguments + * of passt) */ int32_t krun_set_port_map(uint32_t ctx_id, const char *const port_map[]); @@ -588,19 +366,6 @@ int32_t krun_set_port_map(uint32_t ctx_id, const char *const port_map[]); * Arguments: * "ctx_id" - the configuration context ID. * "virgl_flags" - flags to pass to virglrenderer. - * - * Returns: - * Zero on success or a negative error number on failure. - */ -int32_t krun_set_gpu_options(uint32_t ctx_id, uint32_t virgl_flags); - -/** - * Enables and configures a virtio-gpu device. This variant allows specifying - * the size of the host window (acting as vRAM in the guest). - * - * Arguments: - * "ctx_id" - the configuration context ID. - * "virgl_flags" - flags to pass to virglrenderer. * "shm_size" - size of the SHM host window in bytes. * * Returns: @@ -974,19 +739,6 @@ int32_t krun_set_env(uint32_t ctx_id, const char *const envp[]); */ int32_t krun_set_tee_config_file(uint32_t ctx_id, const char *filepath); -/** - * Adds a port-path pairing for guest IPC with a process in the host. - * - * Arguments: - * "ctx_id" - the configuration context ID. - * "port" - a vsock port that the guest will connect to for IPC. - * "filepath" - a null-terminated string representing the path of the UNIX - * socket in the host. - */ -int32_t krun_add_vsock_port(uint32_t ctx_id, - uint32_t port, - const char *c_filepath); - /** * Adds a port-path pairing for guest IPC with a process in the host. * @@ -1005,10 +757,6 @@ int32_t krun_add_vsock_port2(uint32_t ctx_id, /** * Add a vsock device with specified TSI features. * - * By default, libkrun creates a vsock device implicitly with TSI hijacking - * enabled based on heuristics. To use this function, you must first call - * krun_disable_implicit_vsock() to disable the implicit vsock device. - * * Currently only one vsock device is supported. Calling this function * multiple times will return an error. * @@ -1034,22 +782,6 @@ int32_t krun_add_vsock(uint32_t ctx_id, uint32_t tsi_features); */ int32_t krun_get_shutdown_eventfd(uint32_t ctx_id); -/** - * Configures the console device to ignore stdin and write the output to "c_filepath". - * - * Arguments: - * "ctx_id" - the configuration context ID. - * "filepath" - a null-terminated string representing the path of the file to write the - * console output. - * - * Notes: - * This API only applies to the implicitly created console. If the implicit console is - * disabled via `krun_disable_implicit_console` the operation is a NOOP. Additionally, - * this API does not have any effect on consoles created via the `krun_add_*_console_default` - * APIs. - */ -int32_t krun_set_console_output(uint32_t ctx_id, const char *c_filepath); - /** * Configures uid which is set right before the microVM is started. * @@ -1153,33 +885,89 @@ int32_t krun_get_max_vcpus(void); */ int32_t krun_split_irqchip(uint32_t ctx_id, bool enable); -/* - * Do not create an implicit console device in the guest. By using this API, - * libkrun will create zero console devices on behalf of the user. Any - * console devices needed by the user must be added manually via other API - * calls. +/** + * Sets the init configuration from an OCI container-spec JSON string. + * + * The JSON should use OCI image config keys ("Entrypoint", "Cmd", "Env", + * "WorkingDir", "mounts"). This replaces any configuration previously set + * via krun_set_exec, krun_set_workdir, krun_set_env, etc. * * Arguments: * "ctx_id" - the configuration context ID. + * "json" - a null-terminated JSON string with the OCI config. * * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_disable_implicit_console(uint32_t ctx_id); +int32_t krun_set_oci_config_json(uint32_t ctx_id, const char *json); /** - * Disable the implicit vsock device. + * Injects the built-in init binary and config JSON into the specified + * virtiofs device. * - * By default, libkrun creates a vsock device automatically. This function - * disables that behavior entirely - no vsock device will be created. + * Call this after configuring the init (via krun_set_exec/krun_set_workdir/ + * krun_set_env or krun_set_oci_config_json), and after the target virtiofs + * device has been created (e.g. via krun_set_root or krun_add_virtiofs3). * * Arguments: - * "ctx_id" - the configuration context ID. + * "ctx_id" - the configuration context ID. + * "fs_tag" - the virtiofs tag identifying the target device + * (typically KRUN_FS_ROOT_TAG, i.e. "/dev/root"). + * + * Returns: + * Zero on success or a negative error number on failure. + */ +int32_t krun_inject_init(uint32_t ctx_id, const char *fs_tag); + +/** + * Add a virtual overlay file to a virtiofs device. + * + * The file is backed entirely by host memory (no host file). The data + * pointer is NOT copied — the caller must keep the memory valid for the + * full VM lifetime. + * + * "path" may contain '/' to place the file inside a virtual directory + * previously created with krun_fs_add_overlay_dir (e.g. "etc/hostname"). + * All intermediate directories must already exist; -ENOENT is returned + * if a component is missing, -ENOTDIR if a component is not a directory. + * + * Arguments: + * "ctx_id" - the configuration context ID. + * "fs_tag" - tag of the virtiofs device (e.g. "/dev/root"). + * "path" - path of the file (e.g. "init.krun" or "etc/hostname"). + * "data" - pointer to the file content. + * "data_len" - length of the file content in bytes. + * "mode" - file mode bits (e.g. 0100644 for a regular file). + * "one_shot" - if true, the file can only be looked up once. + * + * Returns: + * Zero on success or a negative error number on failure. + */ +int32_t krun_fs_add_overlay_file(uint32_t ctx_id, const char *fs_tag, + const char *path, const uint8_t *data, + size_t data_len, uint32_t mode, bool one_shot); + +/** + * Add a virtual overlay directory to a virtiofs device. + * + * The directory is empty and read-only, useful as a mount point. + * + * "path" may contain '/' to nest inside an existing virtual directory + * (e.g. "usr/lib"). All intermediate directories must already exist; + * -ENOENT is returned if a component is missing, -ENOTDIR if a component + * is not a directory. + * + * Arguments: + * "ctx_id" - the configuration context ID. + * "fs_tag" - tag of the virtiofs device (e.g. "/dev/root"). + * "path" - path of the directory (e.g. "dev" or "usr/lib"). + * "mode" - directory mode bits (e.g. 040755). * * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_disable_implicit_vsock(uint32_t ctx_id); +int32_t krun_fs_add_overlay_dir(uint32_t ctx_id, const char *fs_tag, + const char *path, uint32_t mode); /* * Specify the value of `console=` in the kernel commandline. @@ -1198,9 +986,7 @@ int32_t krun_set_kernel_console(uint32_t ctx_id, const char *console_id); * * The function can be called multiple times for adding multiple virtio-console devices. * In the guest, the consoles will appear in the same order as they are added (that is, - * the first added console will be "hvc0", the second "hvc1", ...). However, if the - * implicit console is not disabled via `krun_disable_implicit_console`, the first - * console created with the function will occupy the "hvc1" ID. + * the first added console will be "hvc0", the second "hvc1", ...). * * This function attaches a multi port virtio-console to the guest. If the input, output and error * file descriptors are TTYs, the device will be created with just a single console port (`err_fd` @@ -1228,9 +1014,7 @@ int32_t krun_add_virtio_console_default(uint32_t ctx_id, * * The function can be called multiple times for adding multiple serial devices. * In the guest, the consoles will appear in the same order as they are added (that is, - * the first added console will be "ttyS0", the second "ttyS1", ...). However, if the - * implicit console is not disabled via `krun_disable_implicit_console` on aarch64 or macOS, - * the first console created with the function will occupy the "ttyS1" ID. + * the first added console will be "ttyS0", the second "ttyS1", ...). * * Arguments: * "ctx_id" - the configuration context ID. @@ -1253,8 +1037,7 @@ int32_t krun_add_serial_console_default(uint32_t ctx_id, * * The function can be called multiple times for adding multiple virtio-console devices. * Each device appears in the guest with port 0 accessible as /dev/hvcN (hvc0, hvc1, etc.) in the order - * devices are added. If the implicit console is not disabled via `krun_disable_implicit_console`, - * the first explicitly added device will occupy the "hvc1" ID. Additional ports within each device + * devices are added. Additional ports within each device * (port 1, 2, ...) appear as /dev/vportNpM character devices. * * Arguments: diff --git a/init/init.c b/init/init.c index 59a5c3d94..1d937712f 100644 --- a/init/init.c +++ b/init/init.c @@ -43,7 +43,6 @@ #endif #define KRUN_EXIT_CODE_IOCTL 0x7602 -#define KRUN_REMOVE_ROOT_DIR_IOCTL 0x7603 #define KRUN_MAGIC "KRUN" #define KRUN_FOOTER_LEN 12 @@ -211,9 +210,8 @@ static char *get_luks_passphrase(int *pass_len) return_str = NULL; /* - * If a user registered the TEE config data disk with - * krun_set_data_disk(), it would appear as /dev/vdb in the guest. - * Mount this device and read the config. + * If a TEE config data disk was registered, it would appear as + * /dev/vdb in the guest. Mount this device and read the config. */ if (mkdir("/dev", 0755) < 0 && errno != EEXIST) { perror("mkdir(/dev)"); @@ -1475,16 +1473,6 @@ int main(int argc, char **argv) chdir("/newroot"); - fd = open("/", O_RDONLY); - if (fd < 0) { - perror("Couldn't open temporary root directory for removing"); - exit(-1); - } - if (ioctl(fd, KRUN_REMOVE_ROOT_DIR_IOCTL) < 0) { - perror("Error removing temporary root directory"); - } - close(fd); - if (mount(".", "/", NULL, MS_MOVE, NULL) < 0) { perror("remount root"); exit(-1); diff --git a/src/devices/Cargo.toml b/src/devices/Cargo.toml index eacb6cc97..1be66e164 100644 --- a/src/devices/Cargo.toml +++ b/src/devices/Cargo.toml @@ -3,7 +3,7 @@ name = "krun-devices" version = "0.1.0-1.18.0" authors = ["The libkrun Authors"] edition = "2021" -build = "build.rs" + description = "Virtual device emulation for libkrun" license = "Apache-2.0" repository = "https://github.com/containers/libkrun" diff --git a/src/devices/src/virtio/fs/augment_fs.rs b/src/devices/src/virtio/fs/augment_fs.rs new file mode 100644 index 000000000..ab7779508 --- /dev/null +++ b/src/devices/src/virtio/fs/augment_fs.rs @@ -0,0 +1,747 @@ +// Virtual inode overlay for virtiofs. +// +// `AugmentFs` wraps an inner `FileSystem` implementation and intercepts +// FUSE operations for virtual inodes — synthetic read-only files that exist +// only in memory. All other operations are delegated to the inner filesystem. +// +// Virtual inodes are injected into the root directory (parent = ROOT_ID) and +// are currently only accessible via lookup (they do not appear in readdir). +// +// One-shot files can only be looked up once — the name is removed from the +// directory on first lookup so subsequent lookups return ENOENT. + +#[cfg(target_os = "macos")] +use crossbeam_channel::Sender; +use std::collections::HashMap; +use std::ffi::CStr; +use std::ffi::CString; +use std::io; +use std::mem; +use std::sync::atomic::{AtomicI32, Ordering}; +use std::sync::Arc; +use std::sync::RwLock; +use std::time::Duration; + +#[cfg(target_os = "macos")] +use utils::worker_message::WorkerMessage; + +use super::filesystem::{ + Context, DirEntry, Entry, Extensions, FileSystem, FsOptions, GetxattrReply, ListxattrReply, + OpenOptions, SetattrValid, ZeroCopyReader, ZeroCopyWriter, +}; +use super::fuse; +use super::inode_alloc::InodeAllocator; +use super::virtual_entry::{VirtualDirEntry, VirtualEntry, VirtualEntryContent, VIRTUAL_BLKSIZE}; +use crate::virtio::bindings; +use crate::virtio::linux_errno; + +type Inode = u64; +type Handle = u64; + +/// Sentinel handle returned for all virtual file opens. This works because +/// virtual file operations dispatch on inode, not handle — there is no +/// per-open state. If per-fd state is ever needed (e.g. writable virtual +/// files), this must be replaced with a real handle allocator. +const VIRTUAL_HANDLE: Handle = 0; + +/// Persistent virtual entries never change. +const VIRTUAL_TIMEOUT: Duration = Duration::MAX; + +/// Overlay that injects virtual inodes into an inner `FileSystem`. +pub struct AugmentFs { + inner: T, + /// Maps (parent_inode, name) → child inode number. One-shot entries + /// are removed on first lookup so the file can only be opened once. + name_to_inode: RwLock>, + /// Maps virtual inode number → (mode, inode data). One-shot entries are + /// removed from this map on release. + inodes: RwLock>, +} + +impl> AugmentFs { + /// Create a new overlay. + /// + /// `entries` are registered as virtual inodes in the root directory. + /// Inode numbers are obtained from `inode_alloc`, the same allocator + /// used by the inner filesystem. + pub fn new(inner: T, inode_alloc: &InodeAllocator, entries: Vec) -> Self { + let mut name_to_inode = HashMap::new(); + let mut inodes = HashMap::new(); + + Self::register_entries( + fuse::ROOT_ID, + entries, + inode_alloc, + &mut name_to_inode, + &mut inodes, + ); + + Self { + inner, + name_to_inode: RwLock::new(name_to_inode), + inodes: RwLock::new(inodes), + } + } + + fn register_entries( + parent: Inode, + entries: Vec, + inode_alloc: &InodeAllocator, + name_to_inode: &mut HashMap<(Inode, CString), Inode>, + inodes: &mut HashMap, + ) { + for entry in entries { + let ino = inode_alloc.next(); + name_to_inode.insert((parent, entry.name), ino); + + // Recurse into directory children before moving the node. + if let VirtualEntryContent::Dir { children } = entry.entry.content { + Self::register_entries(ino, children, inode_alloc, name_to_inode, inodes); + inodes.insert( + ino, + VirtualEntry { + mode: entry.entry.mode, + one_shot: entry.entry.one_shot, + content: VirtualEntryContent::Dir { + children: Vec::new(), + }, + }, + ); + } else { + inodes.insert(ino, entry.entry); + } + } + } + + fn is_virtual(&self, inode: Inode) -> bool { + self.inodes.read().unwrap().contains_key(&inode) + } + + fn virtual_stat(ino: Inode, vnode: &VirtualEntry) -> (bindings::stat64, Duration) { + let mut st: bindings::stat64 = unsafe { mem::zeroed() }; + st.st_ino = ino; + st.st_mode = vnode.st_mode() as _; + st.st_blksize = VIRTUAL_BLKSIZE as _; + let timeout = if vnode.one_shot { + Duration::ZERO + } else { + VIRTUAL_TIMEOUT + }; + match &vnode.content { + VirtualEntryContent::File { data, .. } => { + st.st_size = data.len() as i64; + st.st_nlink = 1; + st.st_blocks = ((data.len() as i64) + 511) / 512; + } + VirtualEntryContent::Dir { .. } => { + st.st_nlink = 2; + } + } + (st, timeout) + } +} + +impl> FileSystem for AugmentFs { + type Inode = Inode; + type Handle = Handle; + + fn init(&self, capable: FsOptions) -> io::Result { + self.inner.init(capable) + } + + fn destroy(&self) { + self.inner.destroy() + } + + fn lookup(&self, ctx: Context, parent: Inode, name: &CStr) -> io::Result { + let key = (parent, CString::from(name)); + let inode = self.name_to_inode.read().unwrap().get(&key).copied(); + if let Some(inode) = inode { + let inodes = self.inodes.read().unwrap(); + if let Some(vnode) = inodes.get(&inode) { + let one_shot = vnode.one_shot; + let (st, timeout) = Self::virtual_stat(inode, vnode); + + if one_shot { + drop(inodes); + self.name_to_inode.write().unwrap().remove(&key); + } + + return Ok(Entry { + inode, + generation: 0, + attr: st, + attr_flags: 0, + attr_timeout: timeout, + entry_timeout: timeout, + }); + } + } + self.inner.lookup(ctx, parent, name) + } + + fn forget(&self, ctx: Context, inode: Inode, count: u64) { + if !self.is_virtual(inode) { + self.inner.forget(ctx, inode, count) + } + } + + fn batch_forget(&self, ctx: Context, mut requests: Vec<(Inode, u64)>) { + requests.retain(|(ino, _)| !self.is_virtual(*ino)); + self.inner.batch_forget(ctx, requests); + } + + fn getattr( + &self, + ctx: Context, + inode: Inode, + handle: Option, + ) -> io::Result<(bindings::stat64, Duration)> { + { + let inodes = self.inodes.read().unwrap(); + if let Some(vnode) = inodes.get(&inode) { + return Ok(Self::virtual_stat(inode, vnode)); + } + } + self.inner.getattr(ctx, inode, handle) + } + + fn setattr( + &self, + ctx: Context, + inode: Inode, + attr: bindings::stat64, + handle: Option, + valid: SetattrValid, + ) -> io::Result<(bindings::stat64, Duration)> { + if self.is_virtual(inode) { + return Err(linux_errno::eperm()); + } + self.inner.setattr(ctx, inode, attr, handle, valid) + } + + fn readlink(&self, ctx: Context, inode: Inode) -> io::Result> { + if self.is_virtual(inode) { + return Err(linux_errno::einval()); + } + self.inner.readlink(ctx, inode) + } + + fn symlink( + &self, + ctx: Context, + linkname: &CStr, + parent: Inode, + name: &CStr, + extensions: Extensions, + ) -> io::Result { + self.inner.symlink(ctx, linkname, parent, name, extensions) + } + + fn mknod( + &self, + ctx: Context, + inode: Inode, + name: &CStr, + mode: u32, + rdev: u32, + umask: u32, + extensions: Extensions, + ) -> io::Result { + self.inner + .mknod(ctx, inode, name, mode, rdev, umask, extensions) + } + + fn mkdir( + &self, + ctx: Context, + parent: Inode, + name: &CStr, + mode: u32, + umask: u32, + extensions: Extensions, + ) -> io::Result { + let key = (parent, CString::from(name)); + if self.name_to_inode.read().unwrap().contains_key(&key) { + return Err(linux_errno::eexist()); + } + self.inner.mkdir(ctx, parent, name, mode, umask, extensions) + } + + fn unlink(&self, ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> { + self.inner.unlink(ctx, parent, name) + } + + fn rmdir(&self, ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> { + self.inner.rmdir(ctx, parent, name) + } + + fn rename( + &self, + ctx: Context, + olddir: Inode, + oldname: &CStr, + newdir: Inode, + newname: &CStr, + flags: u32, + ) -> io::Result<()> { + self.inner + .rename(ctx, olddir, oldname, newdir, newname, flags) + } + + fn link( + &self, + ctx: Context, + inode: Inode, + newparent: Inode, + newname: &CStr, + ) -> io::Result { + if self.is_virtual(inode) { + return Err(linux_errno::eperm()); + } + self.inner.link(ctx, inode, newparent, newname) + } + + fn open( + &self, + ctx: Context, + inode: Inode, + kill_priv: bool, + flags: u32, + ) -> io::Result<(Option, OpenOptions)> { + { + let inodes = self.inodes.read().unwrap(); + if let Some(vnode) = inodes.get(&inode) { + if vnode.is_dir() { + return Err(linux_errno::eisdir()); + } + if (flags as i32 & libc::O_ACCMODE) != libc::O_RDONLY { + return Err(linux_errno::eacces()); + } + return Ok((Some(VIRTUAL_HANDLE), OpenOptions::empty())); + } + } + self.inner.open(ctx, inode, kill_priv, flags) + } + + fn create( + &self, + ctx: Context, + parent: Inode, + name: &CStr, + mode: u32, + kill_priv: bool, + flags: u32, + umask: u32, + extensions: Extensions, + ) -> io::Result<(Entry, Option, OpenOptions)> { + self.inner + .create(ctx, parent, name, mode, kill_priv, flags, umask, extensions) + } + + fn read( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + mut w: W, + size: u32, + offset: u64, + lock_owner: Option, + flags: u32, + ) -> io::Result { + { + let inodes = self.inodes.read().unwrap(); + if let Some(vnode) = inodes.get(&inode) { + let data = vnode.data().ok_or_else(linux_errno::eisdir)?; + let off: usize = offset.try_into().map_err(|_| linux_errno::einval())?; + if off >= data.len() { + return Ok(0); + } + let remaining = data.len() - off; + let len = remaining.min(size as usize); + return w.write(&data[off..(off + len)]); + } + } + self.inner + .read(ctx, inode, handle, w, size, offset, lock_owner, flags) + } + + fn write( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + r: R, + size: u32, + offset: u64, + lock_owner: Option, + delayed_write: bool, + kill_priv: bool, + flags: u32, + ) -> io::Result { + if self.is_virtual(inode) { + return Err(linux_errno::eperm()); + } + self.inner.write( + ctx, + inode, + handle, + r, + size, + offset, + lock_owner, + delayed_write, + kill_priv, + flags, + ) + } + + fn flush(&self, ctx: Context, inode: Inode, handle: Handle, lock_owner: u64) -> io::Result<()> { + if self.is_virtual(inode) { + return Ok(()); + } + self.inner.flush(ctx, inode, handle, lock_owner) + } + + fn fsync(&self, ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()> { + if self.is_virtual(inode) { + return Ok(()); + } + self.inner.fsync(ctx, inode, datasync, handle) + } + + fn fallocate( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + mode: u32, + offset: u64, + length: u64, + ) -> io::Result<()> { + if self.is_virtual(inode) { + return Err(linux_errno::eperm()); + } + self.inner + .fallocate(ctx, inode, handle, mode, offset, length) + } + + fn release( + &self, + ctx: Context, + inode: Inode, + flags: u32, + handle: Handle, + flush: bool, + flock_release: bool, + lock_owner: Option, + ) -> io::Result<()> { + { + let mut inodes = self.inodes.write().unwrap(); + if let Some(vnode) = inodes.get(&inode) { + if vnode.one_shot { + inodes.remove(&inode); + } + return Ok(()); + } + } + self.inner + .release(ctx, inode, flags, handle, flush, flock_release, lock_owner) + } + + fn statfs(&self, ctx: Context, inode: Inode) -> io::Result { + self.inner.statfs(ctx, inode) + } + + fn getxattr( + &self, + ctx: Context, + inode: Inode, + name: &CStr, + size: u32, + ) -> io::Result { + if self.is_virtual(inode) { + return Err(linux_errno::enodata()); + } + self.inner.getxattr(ctx, inode, name, size) + } + + fn listxattr(&self, ctx: Context, inode: Inode, size: u32) -> io::Result { + if self.is_virtual(inode) { + if size == 0 { + return Ok(ListxattrReply::Count(0)); + } + return Ok(ListxattrReply::Names(Vec::new())); + } + self.inner.listxattr(ctx, inode, size) + } + + fn setxattr( + &self, + ctx: Context, + inode: Inode, + name: &CStr, + value: &[u8], + flags: u32, + ) -> io::Result<()> { + if self.is_virtual(inode) { + return Err(linux_errno::eperm()); + } + self.inner.setxattr(ctx, inode, name, value, flags) + } + + fn removexattr(&self, ctx: Context, inode: Inode, name: &CStr) -> io::Result<()> { + if self.is_virtual(inode) { + return Err(linux_errno::eperm()); + } + self.inner.removexattr(ctx, inode, name) + } + + fn opendir( + &self, + ctx: Context, + inode: Inode, + flags: u32, + ) -> io::Result<(Option, OpenOptions)> { + self.inner.opendir(ctx, inode, flags) + } + + fn readdir( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + size: u32, + offset: u64, + add_entry: F, + ) -> io::Result<()> + where + F: FnMut(DirEntry) -> io::Result, + { + self.inner + .readdir(ctx, inode, handle, size, offset, add_entry) + } + + fn readdirplus( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + size: u32, + offset: u64, + add_entry: F, + ) -> io::Result<()> + where + F: FnMut(DirEntry, Entry) -> io::Result, + { + self.inner + .readdirplus(ctx, inode, handle, size, offset, add_entry) + } + + fn fsyncdir( + &self, + ctx: Context, + inode: Inode, + datasync: bool, + handle: Handle, + ) -> io::Result<()> { + self.inner.fsyncdir(ctx, inode, datasync, handle) + } + + fn releasedir(&self, ctx: Context, inode: Inode, flags: u32, handle: Handle) -> io::Result<()> { + self.inner.releasedir(ctx, inode, flags, handle) + } + + fn access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()> { + if self.is_virtual(inode) { + if mask & (libc::W_OK as u32) != 0 { + return Err(linux_errno::eacces()); + } + return Ok(()); + } + self.inner.access(ctx, inode, mask) + } + + fn lseek( + &self, + ctx: Context, + inode: Inode, + _handle: Handle, + offset: u64, + whence: u32, + ) -> io::Result { + { + let inodes = self.inodes.read().unwrap(); + if let Some(vnode) = inodes.get(&inode) { + let size = vnode.data().ok_or_else(linux_errno::eisdir)?.len() as u64; + // FUSE lseek is only called for SEEK_DATA/SEEK_HOLE. + return match whence as i32 { + libc::SEEK_DATA => { + if offset < size { + Ok(offset) + } else { + Err(linux_errno::enxio()) + } + } + libc::SEEK_HOLE => { + if offset < size { + Ok(size) + } else { + Err(linux_errno::enxio()) + } + } + _ => Err(linux_errno::einval()), + }; + } + } + self.inner.lseek(ctx, inode, _handle, offset, whence) + } + + fn copyfilerange( + &self, + ctx: Context, + inode_in: Inode, + handle_in: Handle, + offset_in: u64, + inode_out: Inode, + handle_out: Handle, + offset_out: u64, + len: u64, + flags: u64, + ) -> io::Result { + // Virtual inodes don't have real file descriptors, so copy_file_range + // cannot work. Return EXDEV to tell the kernel to fall back to + // read+write. + if self.is_virtual(inode_in) || self.is_virtual(inode_out) { + return Err(linux_errno::exdev()); + } + self.inner.copyfilerange( + ctx, inode_in, handle_in, offset_in, inode_out, handle_out, offset_out, len, flags, + ) + } + + fn setupmapping( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + foffset: u64, + len: u64, + flags: u64, + moffset: u64, + host_shm_base: u64, + shm_size: u64, + #[cfg(target_os = "macos")] map_sender: &Option>, + ) -> io::Result<()> { + { + let inodes = self.inodes.read().unwrap(); + if let Some(vnode) = inodes.get(&inode) { + let data = vnode.data().ok_or_else(linux_errno::eisdir)?; + #[cfg(target_os = "linux")] + { + if (moffset + len) > shm_size { + return Err(linux_errno::einval()); + } + + let addr = host_shm_base + moffset; + let ret = unsafe { + libc::mmap( + addr as *mut libc::c_void, + len as usize, + libc::PROT_READ | libc::PROT_WRITE, + libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_FIXED, + -1, + 0, + ) + }; + if std::ptr::eq(ret, libc::MAP_FAILED) { + return Err(io::Error::last_os_error()); + } + + let foff = foffset as usize; + if foff < data.len() { + let available = data.len() - foff; + let to_copy = (len as usize).min(available); + unsafe { + libc::memcpy( + addr as *mut libc::c_void, + data.as_ptr().add(foff) as *const _, + to_copy, + ) + }; + } + + return Ok(()); + } + + // TODO: implement DAX for virtual files on macOS. + // Needs a shared memory region manager (see setupmapping + // in macos/passthrough.rs for the real-file DAX path). + #[cfg(target_os = "macos")] + { + let _ = data; + return Err(linux_errno::enosys()); + } + } + } + self.inner.setupmapping( + ctx, + inode, + handle, + foffset, + len, + flags, + moffset, + host_shm_base, + shm_size, + #[cfg(target_os = "macos")] + map_sender, + ) + } + + fn removemapping( + &self, + ctx: Context, + requests: Vec, + host_shm_base: u64, + shm_size: u64, + #[cfg(target_os = "macos")] map_sender: &Option>, + ) -> io::Result<()> { + self.inner.removemapping( + ctx, + requests, + host_shm_base, + shm_size, + #[cfg(target_os = "macos")] + map_sender, + ) + } + + fn ioctl( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + flags: u32, + cmd: u32, + arg: u64, + in_size: u32, + out_size: u32, + exit_code: &Arc, + ) -> io::Result> { + // We can't use nix::request_code_none here since it's system-dependent + // and we need the value from Linux. + const VIRTIO_IOC_EXIT_CODE_REQ: u32 = 0x7602; + + match cmd { + VIRTIO_IOC_EXIT_CODE_REQ => { + exit_code.store(arg as i32, Ordering::SeqCst); + Ok(Vec::new()) + } + _ => self.inner.ioctl( + ctx, inode, handle, flags, cmd, arg, in_size, out_size, exit_code, + ), + } + } +} diff --git a/src/devices/src/virtio/fs/device.rs b/src/devices/src/virtio/fs/device.rs index bc877bc24..c757c9d3d 100644 --- a/src/devices/src/virtio/fs/device.rs +++ b/src/devices/src/virtio/fs/device.rs @@ -17,6 +17,7 @@ use super::super::{ VirtioShmRegion, }; use super::passthrough; +use super::virtual_entry::VirtualDirEntry; use super::worker::FsWorker; use super::ExportTable; use super::{defs, defs::uapi}; @@ -46,8 +47,9 @@ pub struct Fs { device_state: DeviceState, config: VirtioFsConfig, shm_region: Option, - passthrough_cfg: passthrough::Config, + passthrough_cfg: Option, read_only: bool, + virtual_entries: Vec, worker_thread: Option>, worker_stopfd: EventFd, exit_code: Arc, @@ -58,10 +60,10 @@ pub struct Fs { impl Fs { pub fn new( fs_id: String, - shared_dir: String, + shared_dir: Option, exit_code: Arc, - allow_root_dir_delete: bool, read_only: bool, + virtual_entries: Vec, ) -> super::Result { let avail_features = (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_RING_F_EVENT_IDX); @@ -70,11 +72,10 @@ impl Fs { config.tag[..tag.len()].copy_from_slice(tag.as_slice()); config.num_request_queues = 1; - let fs_cfg = passthrough::Config { - root_dir: shared_dir, - allow_root_dir_delete, + let fs_cfg = shared_dir.map(|root_dir| passthrough::Config { + root_dir, ..Default::default() - }; + }); Ok(Fs { avail_features, @@ -84,6 +85,7 @@ impl Fs { shm_region: None, passthrough_cfg: fs_cfg, read_only, + virtual_entries, worker_thread: None, worker_stopfd: EventFd::new(EFD_NONBLOCK).map_err(FsError::EventFd)?, exit_code, @@ -103,10 +105,16 @@ impl Fs { pub fn set_export_table(&mut self, export_table: ExportTable) -> u64 { static FS_UNIQUE_ID: AtomicU64 = AtomicU64::new(0); - self.passthrough_cfg.export_fsid = FS_UNIQUE_ID.fetch_add(1, Ordering::Relaxed); - self.passthrough_cfg.export_table = Some(export_table); + let Some(cfg) = self.passthrough_cfg.as_mut() else { + // NullFs-backed devices have no passthrough config and don't + // participate in cross-domain fd export. Consume (and waste) an + // fsid so numbering stays dense, but don't store the table. + return FS_UNIQUE_ID.fetch_add(1, Ordering::Relaxed); + }; + cfg.export_fsid = FS_UNIQUE_ID.fetch_add(1, Ordering::Relaxed); + cfg.export_table = Some(export_table); - self.passthrough_cfg.export_fsid + cfg.export_fsid } #[cfg(target_os = "macos")] @@ -180,6 +188,7 @@ impl VirtioDevice for Fs { queue_evts.push(dq.event); } + let virtual_entries = self.virtual_entries.clone(); let worker = FsWorker::new( worker_queues, queue_evts, @@ -188,6 +197,7 @@ impl VirtioDevice for Fs { self.shm_region.clone(), self.passthrough_cfg.clone(), self.read_only, + virtual_entries, self.worker_stopfd.try_clone().unwrap(), self.exit_code.clone(), #[cfg(target_os = "macos")] diff --git a/src/devices/src/virtio/fs/inode_alloc.rs b/src/devices/src/virtio/fs/inode_alloc.rs new file mode 100644 index 000000000..1919b1406 --- /dev/null +++ b/src/devices/src/virtio/fs/inode_alloc.rs @@ -0,0 +1,28 @@ +use std::sync::atomic::{AtomicU64, Ordering}; + +use super::fuse; + +/// Allocates unique FUSE inode numbers. +/// +/// FUSE inode numbers are opaque identifiers with two reserved values: +/// - `0` — invalid / negative-entry cache sentinel (never allocated) +/// - `1` (`ROOT_ID`) — the root directory of the filesystem +/// +/// All other numbers are allocated sequentially starting from `ROOT_ID + 1`. +/// The allocator is `Send + Sync` and safe to share across threads. +pub struct InodeAllocator { + next: AtomicU64, +} + +impl InodeAllocator { + pub fn new() -> Self { + Self { + next: AtomicU64::new(fuse::ROOT_ID + 1), + } + } + + /// Allocate the next inode number. Each call returns a unique value. + pub fn next(&self) -> u64 { + self.next.fetch_add(1, Ordering::Relaxed) + } +} diff --git a/src/devices/src/virtio/fs/linux/passthrough.rs b/src/devices/src/virtio/fs/linux/passthrough.rs index e5ca21a03..8272a7e01 100644 --- a/src/devices/src/virtio/fs/linux/passthrough.rs +++ b/src/devices/src/virtio/fs/linux/passthrough.rs @@ -16,7 +16,7 @@ use std::sync::{Arc, RwLock}; use std::time::Duration; use caps::{has_cap, CapSet, Capability}; -use nix::{request_code_none, request_code_read}; +use nix::request_code_read; use vm_memory::ByteValued; @@ -25,15 +25,13 @@ use super::super::filesystem::{ ListxattrReply, OpenOptions, SetattrValid, ZeroCopyReader, ZeroCopyWriter, }; use super::super::fuse; +use super::super::inode_alloc::InodeAllocator; use super::super::multikey::MultikeyBTreeMap; const CURRENT_DIR_CSTR: &[u8] = b".\0"; const PARENT_DIR_CSTR: &[u8] = b"..\0"; const EMPTY_CSTR: &[u8] = b"\0"; const PROC_CSTR: &[u8] = b"/proc/self/fd\0"; -const INIT_CSTR: &[u8] = b"init.krun\0"; - -static INIT_BINARY: &[u8] = include_bytes!(env!("KRUN_INIT_BINARY_PATH")); type Inode = u64; type Handle = u64; @@ -327,7 +325,6 @@ pub struct Config { pub export_fsid: u64, /// Table of exported FDs to share with other subsystems. pub export_table: Option, - pub allow_root_dir_delete: bool, } impl Default for Config { @@ -342,7 +339,6 @@ impl Default for Config { proc_sfd_rawfd: None, export_fsid: 0, export_table: None, - allow_root_dir_delete: false, } } } @@ -358,14 +354,12 @@ pub struct PassthroughFs { // documentation of the `O_PATH` flag in `open(2)` for more details on what one can and cannot // do with an fd opened with this flag. inodes: RwLock>>, - next_inode: AtomicU64, - init_inode: u64, + inode_alloc: Arc, // File descriptors for open files and directories. Unlike the fds in `inodes`, these _can_ be // used for reading and writing data. handles: RwLock>>, next_handle: AtomicU64, - init_handle: u64, // File descriptor pointing to the `/proc/self/fd` directory. This is used to convert an fd from // `inodes` into one that can go into `handles`. This is accomplished by reading the @@ -392,7 +386,7 @@ enum FileOrLink { } impl PassthroughFs { - pub fn new(cfg: Config) -> io::Result { + pub fn new(cfg: Config, inode_alloc: Arc) -> io::Result { let fd = if let Some(fd) = cfg.proc_sfd_rawfd { fd } else { @@ -438,12 +432,10 @@ impl PassthroughFs { Ok(PassthroughFs { inodes: RwLock::new(MultikeyBTreeMap::new()), - next_inode: AtomicU64::new(fuse::ROOT_ID + 2), - init_inode: fuse::ROOT_ID + 1, + inode_alloc, handles: RwLock::new(BTreeMap::new()), next_handle: AtomicU64::new(1), - init_handle: 0, proc_self_fd, @@ -579,7 +571,7 @@ impl PassthroughFs { // There is a possible race here where 2 threads end up adding the same file // into the inode list. However, since each of those will get a unique Inode // value and unique file descriptors this shouldn't be that much of a problem. - let inode = self.next_inode.fetch_add(1, Ordering::Relaxed); + let inode = self.inode_alloc.next(); self.inodes.write().unwrap().insert( inode, InodeAltKey { @@ -992,25 +984,7 @@ impl FileSystem for PassthroughFs { fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result { debug!("do_lookup: {name:?}"); - let init_name = unsafe { CStr::from_bytes_with_nul_unchecked(INIT_CSTR) }; - - if self.init_inode != 0 && name == init_name { - let mut st: libc::stat64 = unsafe { mem::zeroed() }; - st.st_size = INIT_BINARY.len() as i64; - st.st_ino = self.init_inode; - st.st_mode = 0o100_755; - - Ok(Entry { - inode: self.init_inode, - generation: 0, - attr: st, - attr_flags: 0, - attr_timeout: self.cfg.attr_timeout, - entry_timeout: self.cfg.entry_timeout, - }) - } else { - self.do_lookup(parent, name) - } + self.do_lookup(parent, name) } fn forget(&self, _ctx: Context, inode: Inode, count: u64) { @@ -1129,11 +1103,7 @@ impl FileSystem for PassthroughFs { kill_priv: bool, flags: u32, ) -> io::Result<(Option, OpenOptions)> { - if inode == self.init_inode { - Ok((Some(self.init_handle), OpenOptions::empty())) - } else { - self.do_open(inode, kill_priv, flags) - } + self.do_open(inode, kill_priv, flags) } fn release( @@ -1234,16 +1204,6 @@ impl FileSystem for PassthroughFs { _flags: u32, ) -> io::Result { debug!("read: {inode:?}"); - if inode == self.init_inode { - let off: usize = offset.try_into().map_err(|_| einval())?; - let len = if off + (size as usize) < INIT_BINARY.len() { - size as usize - } else { - INIT_BINARY.len() - off - }; - return w.write(&INIT_BINARY[off..(off + len)]); - } - let data = self .handles .read() @@ -1824,10 +1784,6 @@ impl FileSystem for PassthroughFs { return Err(io::Error::from_raw_os_error(libc::ENOSYS)); } - if inode == self.init_inode { - return Err(io::Error::from_raw_os_error(libc::ENODATA)); - } - let mut buf = vec![0; size as usize]; // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we @@ -2087,36 +2043,6 @@ impl FileSystem for PassthroughFs { debug!("setupmapping: ino {inode:?} addr={addr:x} len={len}"); - if inode == self.init_inode { - let ret = unsafe { - libc::mmap( - addr as *mut libc::c_void, - len as usize, - libc::PROT_READ | libc::PROT_WRITE, - libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_FIXED, - -1, - 0, - ) - }; - if std::ptr::eq(ret, libc::MAP_FAILED) { - return Err(io::Error::last_os_error()); - } - - let to_copy = if len as usize > INIT_BINARY.len() { - INIT_BINARY.len() - } else { - len as usize - }; - unsafe { - libc::memcpy( - addr as *mut libc::c_void, - INIT_BINARY.as_ptr() as *const _, - to_copy, - ) - }; - return Ok(()); - } - let file = self.open_inode(inode, open_flags)?; let fd = file.as_raw_fd(); @@ -2175,10 +2101,10 @@ impl FileSystem for PassthroughFs { handle: Self::Handle, _flags: u32, cmd: u32, - arg: u64, + _arg: u64, _in_size: u32, out_size: u32, - exit_code: &Arc, + _exit_code: &Arc, ) -> io::Result> { const VIRTIO_IOC_MAGIC: u8 = b'v'; @@ -2190,14 +2116,6 @@ impl FileSystem for PassthroughFs { VIRTIO_IOC_EXPORT_FD_SIZE ) as u32; - const VIRTIO_IOC_TYPE_EXIT_CODE: u8 = 2; - const VIRTIO_IOC_EXIT_CODE_REQ: u32 = - request_code_none!(VIRTIO_IOC_MAGIC, VIRTIO_IOC_TYPE_EXIT_CODE) as u32; - - const VIRTIO_IOC_REMOVE_ROOT_DIR_CODE: u8 = 3; - const VIRTIO_IOC_REMOVE_ROOT_DIR_REQ: u32 = - request_code_none!(VIRTIO_IOC_MAGIC, VIRTIO_IOC_REMOVE_ROOT_DIR_CODE) as u32; - match cmd { VIRTIO_IOC_EXPORT_FD_REQ => { if out_size as usize != VIRTIO_IOC_EXPORT_FD_SIZE { @@ -2228,14 +2146,6 @@ impl FileSystem for PassthroughFs { ret.extend_from_slice(&handle.to_ne_bytes()); Ok(ret) } - VIRTIO_IOC_EXIT_CODE_REQ => { - exit_code.store(arg as i32, Ordering::SeqCst); - Ok(Vec::new()) - } - VIRTIO_IOC_REMOVE_ROOT_DIR_REQ if self.cfg.allow_root_dir_delete => { - std::fs::remove_dir_all(&self.cfg.root_dir)?; - Ok(Vec::new()) - } _ => Err(io::Error::from_raw_os_error(libc::EOPNOTSUPP)), } } diff --git a/src/devices/src/virtio/fs/macos/passthrough.rs b/src/devices/src/virtio/fs/macos/passthrough.rs index 53680bd92..cf43e0d0c 100644 --- a/src/devices/src/virtio/fs/macos/passthrough.rs +++ b/src/devices/src/virtio/fs/macos/passthrough.rs @@ -8,11 +8,11 @@ use std::collections::HashMap; use std::ffi::{CStr, CString}; use std::fs::File; use std::io; -use std::mem::{self, MaybeUninit}; +use std::mem::MaybeUninit; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; use std::ptr::null_mut; use std::str::FromStr; -use std::sync::atomic::{AtomicBool, AtomicI32, AtomicI64, AtomicU64, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicI64, AtomicU64, Ordering}; use std::sync::{Arc, Mutex, RwLock}; use std::time::Duration; @@ -29,16 +29,14 @@ use super::super::filesystem::{ ListxattrReply, OpenOptions, SetattrValid, ZeroCopyReader, ZeroCopyWriter, }; use super::super::fuse; +use super::super::inode_alloc::InodeAllocator; use super::super::multikey::MultikeyBTreeMap; -const INIT_CSTR: &[u8] = b"init.krun\0"; const XATTR_KEY: &[u8] = b"user.containers.override_stat\0"; const SECURITY_CAPABILITY: &[u8] = b"security.capability\0"; const UID_MAX: u32 = u32::MAX - 1; -static INIT_BINARY: &[u8] = include_bytes!(env!("KRUN_INIT_BINARY_PATH")); - type Inode = u64; type Handle = u64; @@ -516,7 +514,6 @@ pub struct Config { pub export_fsid: u64, /// Table of exported FDs to share with other subsystems. Not supported for macos. pub export_table: Option, - pub allow_root_dir_delete: bool, } impl Default for Config { @@ -531,7 +528,6 @@ impl Default for Config { proc_sfd_rawfd: None, export_fsid: 0, export_table: None, - allow_root_dir_delete: false, } } } @@ -543,12 +539,10 @@ impl Default for Config { /// combination of mount namespaces and the pivot_root system call. pub struct PassthroughFs { inodes: RwLock>>, - next_inode: AtomicU64, - init_inode: u64, + inode_alloc: Arc, handles: RwLock>>, next_handle: AtomicU64, - init_handle: u64, map_windows: Mutex>, @@ -560,7 +554,7 @@ pub struct PassthroughFs { } impl PassthroughFs { - pub fn new(cfg: Config) -> io::Result { + pub fn new(cfg: Config, inode_alloc: Arc) -> io::Result { let root = CString::new(cfg.root_dir.as_str()).expect("CString::new failed"); // Safe because this doesn't modify any memory and we check the return value. @@ -579,12 +573,10 @@ impl PassthroughFs { Ok(PassthroughFs { inodes: RwLock::new(MultikeyBTreeMap::new()), - next_inode: AtomicU64::new(fuse::ROOT_ID + 2), - init_inode: fuse::ROOT_ID + 1, + inode_alloc, handles: RwLock::new(BTreeMap::new()), next_handle: AtomicU64::new(1), - init_handle: 0, map_windows: Mutex::new(HashMap::new()), @@ -723,7 +715,7 @@ impl PassthroughFs { // There is a possible race here where 2 threads end up adding the same file // into the inode list. However, since each of those will get a unique Inode // value and unique file descriptors this shouldn't be that much of a problem. - let inode = self.next_inode.fetch_add(1, Ordering::Relaxed); + let inode = self.inode_alloc.next(); self.inodes.write().unwrap().insert( inode, InodeAltKey { @@ -1201,25 +1193,7 @@ impl FileSystem for PassthroughFs { fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result { debug!("lookup: {name:?}"); - let _init_name = unsafe { CStr::from_bytes_with_nul_unchecked(INIT_CSTR) }; - - if self.init_inode != 0 && name == _init_name { - let mut st: bindings::stat64 = unsafe { mem::zeroed() }; - st.st_size = INIT_BINARY.len() as i64; - st.st_ino = self.init_inode; - st.st_mode = 0o100_755; - - Ok(Entry { - inode: self.init_inode, - generation: 0, - attr: st, - attr_flags: 0, - attr_timeout: self.cfg.attr_timeout, - entry_timeout: self.cfg.entry_timeout, - }) - } else { - self.do_lookup(parent, name) - } + self.do_lookup(parent, name) } fn forget(&self, _ctx: Context, inode: Inode, count: u64) { @@ -1339,11 +1313,7 @@ impl FileSystem for PassthroughFs { kill_priv: bool, flags: u32, ) -> io::Result<(Option, OpenOptions)> { - if inode == self.init_inode { - Ok((Some(self.init_handle), OpenOptions::empty())) - } else { - self.do_open(inode, kill_priv, flags) - } + self.do_open(inode, kill_priv, flags) } fn release( @@ -1456,18 +1426,6 @@ impl FileSystem for PassthroughFs { _flags: u32, ) -> io::Result { debug!("read: {inode:?}"); - if inode == self.init_inode { - let off: usize = offset - .try_into() - .map_err(|_| io::Error::from_raw_os_error(libc::EINVAL))?; - let len = if off + (size as usize) < INIT_BINARY.len() { - size as usize - } else { - INIT_BINARY.len() - off - }; - return w.write(&INIT_BINARY[off..(off + len)]); - } - let data = self .handles .read() @@ -2053,10 +2011,6 @@ impl FileSystem for PassthroughFs { return Err(linux_error(io::Error::from_raw_os_error(libc::ENOSYS))); } - if inode == self.init_inode { - return Err(linux_error(io::Error::from_raw_os_error(libc::ENODATA))); - } - if name.to_bytes() == XATTR_KEY { return Err(linux_error(io::Error::from_raw_os_error(libc::EACCES))); } @@ -2469,34 +2423,4 @@ impl FileSystem for PassthroughFs { Ok(()) } - - fn ioctl( - &self, - _ctx: Context, - _inode: Self::Inode, - _handle: Self::Handle, - _flags: u32, - cmd: u32, - arg: u64, - _in_size: u32, - _out_size: u32, - exit_code: &Arc, - ) -> io::Result> { - // We can't use nix::request_code_none here since it's system-dependent - // and we need the value from Linux. - const VIRTIO_IOC_EXIT_CODE_REQ: u32 = 0x7602; - const VIRTIO_IOC_REMOVE_ROOT_DIR_REQ: u32 = 0x7603; - - match cmd { - VIRTIO_IOC_EXIT_CODE_REQ => { - exit_code.store(arg as i32, Ordering::SeqCst); - Ok(Vec::new()) - } - VIRTIO_IOC_REMOVE_ROOT_DIR_REQ if self.cfg.allow_root_dir_delete => { - std::fs::remove_dir_all(&self.cfg.root_dir)?; - Ok(Vec::new()) - } - _ => Err(io::Error::from_raw_os_error(libc::EOPNOTSUPP)), - } - } } diff --git a/src/devices/src/virtio/fs/mod.rs b/src/devices/src/virtio/fs/mod.rs index 7ce9d48c2..f8ef63295 100644 --- a/src/devices/src/virtio/fs/mod.rs +++ b/src/devices/src/virtio/fs/mod.rs @@ -1,11 +1,15 @@ +mod augment_fs; mod device; #[allow(dead_code)] mod filesystem; pub mod fuse; +mod inode_alloc; #[allow(dead_code)] mod multikey; +mod null_fs; mod read_only; mod server; +pub mod virtual_entry; mod worker; #[cfg(target_os = "linux")] diff --git a/src/devices/src/virtio/fs/null_fs.rs b/src/devices/src/virtio/fs/null_fs.rs new file mode 100644 index 000000000..4bb4b6360 --- /dev/null +++ b/src/devices/src/virtio/fs/null_fs.rs @@ -0,0 +1,50 @@ +// A minimal filesystem that serves an empty root directory. +// +// Used with AugmentFs to provide a virtual-only filesystem (e.g. for +// booting from a block device where the virtiofs root only needs init.krun). + +use std::ffi::CStr; +use std::io; +use std::mem; +use std::time::Duration; + +use super::filesystem::{Context, Entry, FileSystem, FsOptions}; +use super::fuse; +use super::virtual_entry::VIRTUAL_BLKSIZE; +use crate::virtio::bindings; + +/// An empty filesystem with just a root directory and nothing in it. +pub struct NullFs; + +type Inode = u64; +type Handle = u64; + +impl FileSystem for NullFs { + type Inode = Inode; + type Handle = Handle; + + fn init(&self, _capable: FsOptions) -> io::Result { + Ok(FsOptions::empty()) + } + + fn lookup(&self, _ctx: Context, _parent: Inode, _name: &CStr) -> io::Result { + Err(io::Error::from_raw_os_error(libc::ENOENT)) + } + + fn getattr( + &self, + _ctx: Context, + inode: Inode, + _handle: Option, + ) -> io::Result<(bindings::stat64, Duration)> { + if inode == fuse::ROOT_ID { + let mut st: bindings::stat64 = unsafe { mem::zeroed() }; + st.st_ino = fuse::ROOT_ID; + st.st_mode = libc::S_IFDIR | 0o755; + st.st_nlink = 2; + st.st_blksize = VIRTUAL_BLKSIZE as _; + return Ok((st, Duration::MAX)); + } + Err(io::Error::from_raw_os_error(libc::ENOENT)) + } +} diff --git a/src/devices/src/virtio/fs/read_only.rs b/src/devices/src/virtio/fs/read_only.rs index e975f2dda..5495db1ed 100644 --- a/src/devices/src/virtio/fs/read_only.rs +++ b/src/devices/src/virtio/fs/read_only.rs @@ -25,6 +25,7 @@ use super::filesystem::{ OpenOptions, SetattrValid, ZeroCopyReader, ZeroCopyWriter, }; use super::fuse; +use super::inode_alloc::InodeAllocator; use super::passthrough::{self, PassthroughFs}; use crate::virtio::bindings; @@ -35,10 +36,6 @@ fn erofs() -> io::Error { io::Error::from_raw_os_error(libc::EROFS) } -// Keep the Linux ioctl number so read-only virtio-fs can still handle -// non-mutating control ioctls while rejecting host-side root deletion. -const VIRTIO_IOC_REMOVE_ROOT_DIR_REQ: u32 = 0x7603; - fn read_only_open_flags(flags: u32) -> io::Result { let f = flags as i32; if f & libc::O_ACCMODE != libc::O_RDONLY { @@ -60,9 +57,9 @@ pub struct PassthroughFsRo { } impl PassthroughFsRo { - pub fn new(cfg: passthrough::Config) -> io::Result { + pub fn new(cfg: passthrough::Config, inode_alloc: Arc) -> io::Result { Ok(Self { - inner: PassthroughFs::new(cfg)?, + inner: PassthroughFs::new(cfg, inode_alloc)?, }) } } @@ -318,10 +315,6 @@ impl FileSystem for PassthroughFsRo { out_size: u32, exit_code: &Arc, ) -> io::Result> { - if cmd == VIRTIO_IOC_REMOVE_ROOT_DIR_REQ { - return Err(erofs()); - } - self.inner.ioctl( ctx, inode, handle, flags, cmd, arg, in_size, out_size, exit_code, ) diff --git a/src/devices/src/virtio/fs/virtual_entry.rs b/src/devices/src/virtio/fs/virtual_entry.rs new file mode 100644 index 000000000..06f6915b3 --- /dev/null +++ b/src/devices/src/virtio/fs/virtual_entry.rs @@ -0,0 +1,56 @@ +// Virtual entry types for the virtiofs overlay. + +use std::ffi::CString; + +/// Block size reported by virtual entries in st_blksize. +pub const VIRTUAL_BLKSIZE: i64 = 4096; + +/// A synthetic filesystem entry that exists only in memory. +#[derive(Clone, Debug)] +pub struct VirtualEntry { + /// Permission bits. File type bits (S_IFMT) are ignored — the type + /// is derived from the `content` variant. + pub mode: u32, + /// If true, the entry can only be looked up once. + pub one_shot: bool, + pub content: VirtualEntryContent, +} + +#[derive(Clone, Debug)] +pub enum VirtualEntryContent { + /// A read-only file backed by a static byte slice. + File { data: &'static [u8] }, + /// A directory containing other virtual entries. + Dir { children: Vec }, +} + +impl VirtualEntry { + pub fn is_dir(&self) -> bool { + matches!(self.content, VirtualEntryContent::Dir { .. }) + } + + /// Returns the full st_mode: file type bits from the variant OR'd + /// with the permission bits from self.mode. + #[allow(clippy::unnecessary_cast)] // libc::S_IF* is u16 on macOS, u32 on Linux + pub fn st_mode(&self) -> u32 { + let file_type = match self.content { + VirtualEntryContent::File { .. } => libc::S_IFREG as u32, + VirtualEntryContent::Dir { .. } => libc::S_IFDIR as u32, + }; + file_type | (self.mode & !(libc::S_IFMT as u32)) + } + + pub fn data(&self) -> Option<&'static [u8]> { + match &self.content { + VirtualEntryContent::File { data } => Some(data), + VirtualEntryContent::Dir { .. } => None, + } + } +} + +/// A named entry in a virtual directory. +#[derive(Clone, Debug)] +pub struct VirtualDirEntry { + pub name: CString, + pub entry: VirtualEntry, +} diff --git a/src/devices/src/virtio/fs/worker.rs b/src/devices/src/virtio/fs/worker.rs index c612b3e9b..b8e722b5d 100644 --- a/src/devices/src/virtio/fs/worker.rs +++ b/src/devices/src/virtio/fs/worker.rs @@ -14,16 +14,21 @@ use utils::eventfd::EventFd; use vm_memory::GuestMemoryMmap; use super::super::{FsError, Queue}; +use super::augment_fs::AugmentFs; use super::defs::{HPQ_INDEX, REQ_INDEX}; use super::descriptor_utils::{Reader, Writer}; +use super::inode_alloc::InodeAllocator; +use super::null_fs::NullFs; use super::passthrough::{self, PassthroughFs}; use super::read_only::PassthroughFsRo; use super::server::Server; +use super::virtual_entry::VirtualDirEntry; use crate::virtio::{InterruptTransport, VirtioShmRegion}; enum FsServer { - ReadWrite(Server), - ReadOnly(Server), + ReadWrite(Server>), + ReadOnly(Server>), + Null(Server>), } impl FsServer { @@ -52,6 +57,14 @@ impl FsServer { #[cfg(target_os = "macos")] map_sender, ), + FsServer::Null(s) => s.handle_message( + r, + w, + shm_region, + exit_code, + #[cfg(target_os = "macos")] + map_sender, + ), } } } @@ -77,16 +90,36 @@ impl FsWorker { interrupt: InterruptTransport, mem: GuestMemoryMmap, shm_region: Option, - passthrough_cfg: passthrough::Config, + passthrough_cfg: Option, read_only: bool, + virtual_entries: Vec, stop_fd: EventFd, exit_code: Arc, #[cfg(target_os = "macos")] map_sender: Option>, ) -> Result { - let server = if read_only { - FsServer::ReadOnly(Server::new(PassthroughFsRo::new(passthrough_cfg)?)) - } else { - FsServer::ReadWrite(Server::new(PassthroughFs::new(passthrough_cfg)?)) + let inode_alloc = Arc::new(InodeAllocator::new()); + let server = match passthrough_cfg { + Some(cfg) if read_only => { + let inner = PassthroughFsRo::new(cfg, inode_alloc.clone())?; + FsServer::ReadOnly(Server::new(AugmentFs::new( + inner, + &inode_alloc, + virtual_entries, + ))) + } + Some(cfg) => { + let inner = PassthroughFs::new(cfg, inode_alloc.clone())?; + FsServer::ReadWrite(Server::new(AugmentFs::new( + inner, + &inode_alloc, + virtual_entries, + ))) + } + None => FsServer::Null(Server::new(AugmentFs::new( + NullFs, + &inode_alloc, + virtual_entries, + ))), }; Ok(Self { queues, diff --git a/src/devices/src/virtio/linux_errno.rs b/src/devices/src/virtio/linux_errno.rs index 59aca5789..105f977b5 100644 --- a/src/devices/src/virtio/linux_errno.rs +++ b/src/devices/src/virtio/linux_errno.rs @@ -183,3 +183,37 @@ pub fn linux_errno_raw(errno: i32) -> i32 { _ => LINUX_EIO, } } + +// Helper functions returning io::Error with Linux errno values. +use std::io; + +pub fn eperm() -> io::Error { + io::Error::from_raw_os_error(LINUX_EPERM) +} +pub fn enoent() -> io::Error { + io::Error::from_raw_os_error(LINUX_ENOENT) +} +pub fn eacces() -> io::Error { + io::Error::from_raw_os_error(LINUX_EACCES) +} +pub fn eexist() -> io::Error { + io::Error::from_raw_os_error(LINUX_EEXIST) +} +pub fn einval() -> io::Error { + io::Error::from_raw_os_error(LINUX_EINVAL) +} +pub fn eisdir() -> io::Error { + io::Error::from_raw_os_error(LINUX_EISDIR) +} +pub fn exdev() -> io::Error { + io::Error::from_raw_os_error(LINUX_EXDEV) +} +pub fn enosys() -> io::Error { + io::Error::from_raw_os_error(LINUX_ENOSYS) +} +pub fn enodata() -> io::Error { + io::Error::from_raw_os_error(LINUX_ENODATA) +} +pub fn enxio() -> io::Error { + io::Error::from_raw_os_error(LINUX_ENXIO) +} diff --git a/src/init-blob/Cargo.toml b/src/init-blob/Cargo.toml new file mode 100644 index 000000000..fd799901e --- /dev/null +++ b/src/init-blob/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "init-blob" +version = "0.1.0-1.18.0" +edition = "2021" +description = "Default init binary blob for libkrun guests" +license = "Apache-2.0" +repository = "https://github.com/containers/libkrun" +build = "build.rs" + +[dependencies] +serde = { version = "1", features = ["derive"] } +serde_json = "1" + +[lib] +path = "src/lib.rs" diff --git a/src/devices/build.rs b/src/init-blob/build.rs similarity index 100% rename from src/devices/build.rs rename to src/init-blob/build.rs diff --git a/src/init-blob/src/config.rs b/src/init-blob/src/config.rs new file mode 100644 index 000000000..77ab7e74d --- /dev/null +++ b/src/init-blob/src/config.rs @@ -0,0 +1,300 @@ +// SPDX-License-Identifier: Apache-2.0 +// +//! Builder for the `/.krun_config.json` file consumed by the in-guest init. +//! +//! The JSON schema matches what `init/init.c` (`config_parse_file()`) expects: +//! +//! ```json +//! { +//! "Entrypoint": ["/usr/bin/bash"], +//! "Cmd": ["--login"], +//! "Env": ["HOME=/root", "TERM=xterm-256color"], +//! "WorkingDir": "/home/user", +//! "mounts": [{"destination": "/tmp", "type": "tmpfs", "source": "tmpfs"}] +//! } +//! ``` +//! +//! # Example +//! +//! ``` +//! use init_blob::InitConfig; +//! +//! let json_bytes = InitConfig::builder() +//! .entrypoint(["/usr/bin/bash"]) +//! .args(["--login"]) +//! .env(["HOME=/root", "TERM=xterm-256color"]) +//! .workdir("/home/user") +//! .build() +//! .to_json(); +//! ``` + +use std::borrow::Cow; +use std::path::Path; + +use serde::{Deserialize, Serialize}; + +/// Kernel cmdline `init=` path (absolute, as seen by the guest). +pub const INIT_PATH: &str = "/init.krun"; + +/// A file that the init process expects to find on the guest root filesystem. +/// +/// The caller decides how to materialize these (virtiofs overlay, block +/// device, etc.) — init-blob only describes *what* init needs. +pub struct GuestFile { + /// Path on the guest root filesystem. + pub path: &'static Path, + /// File contents. + pub data: Cow<'static, [u8]>, + /// Permission bits (e.g. `0o755` for executables). + pub mode: u32, + /// If true, the file is only needed during early init and can be + /// removed after first use. + pub one_shot: bool, +} + +/// Init configuration for the in-guest init process. +/// +/// Constructed via [`InitConfigBuilder`] or [`InitConfig::from_oci_spec_json`]. +/// +/// This type is the abstraction boundary between the host API and the +/// guest init — callers describe *what* to run, and the init-blob crate +/// handles the serialization format and guest-side conventions. +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +#[serde(default)] +pub struct InitConfig { + /// The entrypoint executable (and its fixed prefix args). + /// Maps to the `"Entrypoint"` JSON key. + #[serde(rename = "Entrypoint", skip_serializing_if = "Vec::is_empty")] + pub entrypoint: Vec, + + /// Additional command arguments appended after the entrypoint. + /// Maps to the `"Cmd"` JSON key. + #[serde(rename = "Cmd", skip_serializing_if = "Vec::is_empty")] + pub args: Vec, + + /// Environment variables in `KEY=value` form. + /// Maps to the `"Env"` JSON key. + #[serde(rename = "Env", skip_serializing_if = "Vec::is_empty")] + pub env: Vec, + + /// Working directory inside the guest. + /// Maps to the `"WorkingDir"` JSON key. + #[serde(rename = "WorkingDir", skip_serializing_if = "Option::is_none")] + pub workdir: Option, + + /// Additional mounts to perform inside the guest. + #[serde(skip_serializing_if = "Vec::is_empty")] + pub mounts: Vec, + + /// Resource limits in `"id=cur:max"` form (e.g. `"7=0:0"`). + #[serde(skip)] + pub rlimits: Vec, +} + +/// A mount specification for the guest init. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct Mount { + pub destination: String, + #[serde(rename = "type")] + pub fs_type: String, + pub source: String, +} + +impl InitConfig { + /// Start building a new init configuration. + pub fn builder() -> InitConfigBuilder { + InitConfigBuilder::default() + } + + /// Construct from an OCI container-spec JSON string. + /// + /// The JSON is expected to use the same key names as the OCI image + /// config (`Entrypoint`, `Cmd`, `Env`, `WorkingDir`, `mounts`). + /// Currently this is a thin deserialization — the internal + /// representation happens to match the OCI schema, but callers + /// should not rely on that. + /// + /// # Errors + /// + /// Returns `Err` if the JSON is syntactically invalid or contains + /// unexpected types (e.g. `Entrypoint` is not an array of strings). + pub fn from_oci_spec_json(json: &str) -> Result { + // TODO: actually validate the config (e.g. entrypoint is non-empty, + // env entries contain '=', workdir is absolute, etc.) + serde_json::from_str(json) + } + + /// Serialize to a JSON string. + pub fn to_json(&self) -> Box { + serde_json::to_string(self) + .expect("InitConfig serialization cannot fail") + .into_boxed_str() + } + + /// Returns the files that the init process expects on the guest root + /// filesystem: the init binary itself and the config JSON. + pub fn guest_files(&self) -> [GuestFile; 2] { + let config_json = self.to_json(); + [ + GuestFile { + path: Path::new(INIT_PATH), + data: Cow::Borrowed(super::INIT_BINARY), + mode: 0o755, + one_shot: true, + }, + GuestFile { + path: Path::new("/.krun_config.json"), + data: Cow::Owned(config_json.into_string().into_bytes()), + mode: 0o644, + one_shot: true, + }, + ] + } +} + +/// Builder for [`InitConfig`]. +#[derive(Clone, Debug, Default)] +pub struct InitConfigBuilder { + config: InitConfig, +} + +impl InitConfigBuilder { + /// Set the entrypoint executable (and optional fixed prefix arguments). + /// + /// ``` + /// # use init_blob::InitConfig; + /// InitConfig::builder().entrypoint(["/usr/bin/bash", "-l"]); + /// ``` + pub fn entrypoint(mut self, argv: impl IntoIterator>) -> Self { + self.config.entrypoint = argv.into_iter().map(Into::into).collect(); + self + } + + /// Set additional command arguments (appended after entrypoint). + pub fn args(mut self, argv: impl IntoIterator>) -> Self { + self.config.args = argv.into_iter().map(Into::into).collect(); + self + } + + /// Set environment variables. Each entry should be `"KEY=value"`. + pub fn env(mut self, vars: impl IntoIterator>) -> Self { + self.config.env = vars.into_iter().map(Into::into).collect(); + self + } + + /// Set the guest working directory. + pub fn workdir(mut self, dir: impl Into) -> Self { + self.config.workdir = Some(dir.into()); + self + } + + /// Add a mount specification. + pub fn mount( + mut self, + destination: impl Into, + fs_type: impl Into, + source: impl Into, + ) -> Self { + self.config.mounts.push(Mount { + destination: destination.into(), + fs_type: fs_type.into(), + source: source.into(), + }); + self + } + + /// Consume the builder and return the finished [`InitConfig`]. + pub fn build(self) -> InitConfig { + self.config + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn empty_config_produces_empty_object() { + let cfg = InitConfig::builder().build(); + let json: serde_json::Value = serde_json::from_str(&cfg.to_json()).unwrap(); + assert_eq!(json, serde_json::json!({})); + } + + #[test] + fn full_config_round_trips() { + let cfg = InitConfig::builder() + .entrypoint(["/usr/bin/bash"]) + .args(["--login"]) + .env(["HOME=/root", "TERM=xterm-256color"]) + .workdir("/home/user") + .mount("/tmp", "tmpfs", "tmpfs") + .build(); + + let json: serde_json::Value = serde_json::from_str(&cfg.to_json()).unwrap(); + assert_eq!(json["Entrypoint"], serde_json::json!(["/usr/bin/bash"])); + assert_eq!(json["Cmd"], serde_json::json!(["--login"])); + assert_eq!( + json["Env"], + serde_json::json!(["HOME=/root", "TERM=xterm-256color"]) + ); + assert_eq!(json["WorkingDir"], serde_json::json!("/home/user")); + assert_eq!(json["mounts"][0]["destination"], "/tmp"); + assert_eq!(json["mounts"][0]["type"], "tmpfs"); + } + + #[test] + fn skip_serializing_empty_fields() { + let cfg = InitConfig::builder().workdir("/tmp").build(); + + let json: serde_json::Value = serde_json::from_str(&cfg.to_json()).unwrap(); + assert!(json.get("Entrypoint").is_none()); + assert!(json.get("Cmd").is_none()); + assert!(json.get("Env").is_none()); + assert!(json.get("mounts").is_none()); + assert_eq!(json["WorkingDir"], "/tmp"); + } + + #[test] + fn from_oci_spec_json_parses() { + let json = + r#"{"Entrypoint":["/bin/sh"],"Cmd":["-c","echo hi"],"Env":["A=1"],"WorkingDir":"/"}"#; + let cfg = InitConfig::from_oci_spec_json(json).unwrap(); + assert_eq!(cfg.entrypoint, ["/bin/sh"]); + assert_eq!(cfg.args, ["-c", "echo hi"]); + assert_eq!(cfg.env, ["A=1"]); + assert_eq!(cfg.workdir.as_deref(), Some("/")); + } + + #[test] + fn from_oci_spec_json_rejects_bad_types() { + // Entrypoint should be an array, not a string. + let json = r#"{"Entrypoint":"/bin/sh"}"#; + assert!(InitConfig::from_oci_spec_json(json).is_err()); + } + + #[test] + fn from_oci_spec_json_ignores_unknown_fields() { + let json = r#"{"Entrypoint":["/bin/sh"],"Labels":{"foo":"bar"}}"#; + let cfg = InitConfig::from_oci_spec_json(json).unwrap(); + assert_eq!(cfg.entrypoint, ["/bin/sh"]); + } + + #[test] + fn guest_files_returns_init_binary_and_config() { + let cfg = InitConfig::builder().entrypoint(["/bin/sh"]).build(); + let files = cfg.guest_files(); + + assert_eq!(files.len(), 2); + + assert_eq!(files[0].path, Path::new("/init.krun")); + assert_eq!(files[0].mode, 0o755); + assert!(files[0].one_shot); + assert!(!files[0].data.is_empty()); + + assert_eq!(files[1].path, Path::new("/.krun_config.json")); + assert_eq!(files[1].mode, 0o644); + assert!(files[1].one_shot); + let json: serde_json::Value = serde_json::from_slice(&files[1].data).unwrap(); + assert_eq!(json["Entrypoint"], serde_json::json!(["/bin/sh"])); + } +} diff --git a/src/init-blob/src/lib.rs b/src/init-blob/src/lib.rs new file mode 100644 index 000000000..5909392ff --- /dev/null +++ b/src/init-blob/src/lib.rs @@ -0,0 +1,4 @@ +pub static INIT_BINARY: &[u8] = include_bytes!(env!("KRUN_INIT_BINARY_PATH")); + +mod config; +pub use config::{GuestFile, InitConfig, InitConfigBuilder, Mount, INIT_PATH}; diff --git a/src/libkrun/Cargo.toml b/src/libkrun/Cargo.toml index 4e54bf99c..27525ea7e 100644 --- a/src/libkrun/Cargo.toml +++ b/src/libkrun/Cargo.toml @@ -31,6 +31,7 @@ krun_display = { package = "krun-display", version = "0.1.0", path = "../display krun_input = { package = "krun-input", version = "0.1.0", path = "../input", optional = true, features = ["bindgen_clang_runtime"] } devices = { package = "krun-devices", version = "=0.1.0-1.18.0", path = "../devices" } +init-blob = { path = "../init-blob" } polly = { package = "krun-polly", version = "=0.1.0-1.18.0", path = "../polly" } utils = { package = "krun-utils", version = "=0.1.0-1.18.0", path = "../utils" } vmm = { package = "krun-vmm", version = "=0.1.0-1.18.0", path = "../vmm" } diff --git a/src/libkrun/src/lib.rs b/src/libkrun/src/lib.rs index a7b7eee6a..6fb69bea3 100644 --- a/src/libkrun/src/lib.rs +++ b/src/libkrun/src/lib.rs @@ -14,16 +14,15 @@ use env_logger::{Env, Target}; #[cfg(feature = "gpu")] use krun_display::DisplayBackend; +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +use devices::virtio::fs::virtual_entry::{VirtualDirEntry, VirtualEntry, VirtualEntryContent}; use libc::{c_char, c_int, size_t}; use once_cell::sync::Lazy; use polly::event_manager::EventManager; -#[cfg(all(feature = "blk", not(feature = "tee")))] -use rand::distr::{Alphanumeric, SampleString}; use std::collections::hash_map::Entry; use std::collections::HashMap; use std::convert::TryInto; use std::env; -#[cfg(target_os = "linux")] use std::ffi::CString; use std::ffi::{c_void, CStr}; use std::fs::File; @@ -87,8 +86,32 @@ const KRUNFW_NAME: &str = "libkrunfw.5.dylib"; #[cfg(feature = "aws-nitro")] static KRUN_NITRO_DEBUG: Mutex = Mutex::new(false); -// Path to the init binary to be executed inside the VM. -const INIT_PATH: &str = "/init.krun"; +/// Convert an [`init_blob::GuestFile`] into a virtiofs [`VirtualDirEntry`]. +/// +/// Owned data is leaked to satisfy the `'static` lifetime required by +/// [`VirtualEntryContent::File`] — acceptable because the VM runs once +/// per process. +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +fn guest_file_to_virtual_entry(gf: init_blob::GuestFile) -> VirtualDirEntry { + use std::borrow::Cow; + + let file_name = gf + .path + .file_name() + .expect("GuestFile path must have a filename"); + let data: &'static [u8] = match gf.data { + Cow::Borrowed(b) => b, + Cow::Owned(v) => Box::leak(v.into_boxed_slice()), + }; + VirtualDirEntry { + name: CString::new(file_name.as_encoded_bytes()).unwrap(), + entry: VirtualEntry { + mode: gf.mode, + one_shot: gf.one_shot, + content: VirtualEntryContent::File { data }, + }, + } +} static KRUNFW: LazyLock> = LazyLock::new(|| unsafe { libloading::Library::new(KRUNFW_NAME).ok() }); @@ -126,36 +149,17 @@ impl KrunfwBindings { } } -#[derive(Clone)] -#[cfg(feature = "net")] -enum LegacyNetworkConfig { - VirtioNetPasst(RawFd), - VirtioNetGvproxy(PathBuf), -} - #[derive(Default)] struct ContextConfig { krunfw: Option, vmr: VmResources, - workdir: Option, - exec_path: Option, - env: Option, - args: Option, - rlimits: Option, - #[cfg(feature = "net")] - legacy_net_cfg: Option, - #[cfg(feature = "net")] - legacy_mac: Option<[u8; 6]>, + init_config: init_blob::InitConfig, net_index: u8, tsi_port_map: Option>, vsock_config: VsockConfig, #[cfg(feature = "blk")] block_cfgs: Vec, #[cfg(feature = "blk")] - root_block_cfg: Option, - #[cfg(feature = "blk")] - data_block_cfg: Option, - #[cfg(feature = "blk")] block_root: Option, #[cfg(feature = "tee")] tee_config_file: Option, @@ -163,34 +167,14 @@ struct ContextConfig { shutdown_efd: Option, gpu_virgl_flags: Option, gpu_shm_size: Option, + /// Console output path, only used by the aws-nitro TryFrom path. + #[cfg(feature = "aws-nitro")] console_output: Option, vmm_uid: Option, vmm_gid: Option, } impl ContextConfig { - fn set_workdir(&mut self, workdir: String) { - self.workdir = Some(workdir); - } - - fn get_workdir(&self) -> String { - match &self.workdir { - Some(workdir) => format!("KRUN_WORKDIR={workdir}"), - None => "".to_string(), - } - } - - fn set_exec_path(&mut self, exec_path: String) { - self.exec_path = Some(exec_path); - } - - fn get_exec_path(&self) -> String { - match &self.exec_path { - Some(exec_path) => format!("KRUN_INIT={exec_path}"), - None => "".to_string(), - } - } - #[cfg(all(feature = "blk", not(feature = "tee")))] fn set_block_root(&mut self, device: String, fstype: Option, options: Option) { self.block_root = Some(BlockRootConfig { @@ -219,74 +203,14 @@ impl ContextConfig { "".to_string() } - fn set_env(&mut self, env: String) { - self.env = Some(env); - } - - fn get_env(&self) -> String { - match &self.env { - Some(env) => env.clone(), - None => "".to_string(), - } - } - - fn set_args(&mut self, args: String) { - self.args = Some(args); - } - - fn get_args(&self) -> String { - match &self.args { - Some(args) => args.clone(), - None => "".to_string(), - } - } - - fn set_rlimits(&mut self, rlimits: String) { - self.rlimits = Some(rlimits); - } - - fn get_rlimits(&self) -> String { - match &self.rlimits { - Some(rlimits) => format!("KRUN_RLIMITS={rlimits}"), - None => "".to_string(), - } - } - #[cfg(feature = "blk")] fn add_block_cfg(&mut self, block_cfg: BlockDeviceConfig) { self.block_cfgs.push(block_cfg); } - #[cfg(feature = "blk")] - fn set_root_block_cfg(&mut self, block_cfg: BlockDeviceConfig) { - self.root_block_cfg = Some(block_cfg); - } - - #[cfg(feature = "blk")] - fn set_data_block_cfg(&mut self, block_cfg: BlockDeviceConfig) { - self.data_block_cfg = Some(block_cfg); - } - #[cfg(feature = "blk")] fn get_block_cfg(&self) -> Vec { - // For backwards compat, when cfgs is empty (the new API is not used), this needs to be - // root and then data, in that order. Also for backwards compat, root/data are setters and - // need to discard redundant calls. So we have simple setters above and fix up here. - // - // When the new API is used, this is simpler. - if self.block_cfgs.is_empty() { - [&self.root_block_cfg, &self.data_block_cfg] - .into_iter() - .filter_map(|cfg| cfg.clone()) - .collect() - } else { - self.block_cfgs.clone() - } - } - - #[cfg(feature = "net")] - fn set_net_mac(&mut self, mac: [u8; 6]) { - self.legacy_mac = Some(mac); + self.block_cfgs.clone() } fn set_port_map(&mut self, new_port_map: HashMap) -> Result<(), ()> { @@ -359,20 +283,25 @@ impl TryFrom for NitroEnclave { return Err(-libc::EINVAL); }; - let Some(exec_path) = ctx.exec_path else { - error!("exec path not specified"); - return Err(-libc::EINVAL); + let exec_path = match ctx.init_config.entrypoint.first() { + Some(p) => p.clone(), + None => { + error!("exec path not specified"); + return Err(-libc::EINVAL); + } }; - let Some(exec_env) = ctx.env else { + if ctx.init_config.env.is_empty() { error!("execution env not specified"); return Err(-libc::EINVAL); - }; + } + let exec_env = ctx.init_config.env.join(" "); - let Some(exec_args) = ctx.args else { + if ctx.init_config.args.is_empty() { error!("execution args not specified"); return Err(-libc::EINVAL); - }; + } + let exec_args = ctx.init_config.args.join(" "); let net_unixfd = { let mut list = ctx.vmr.net.list; @@ -443,26 +372,6 @@ fn log_level_to_filter_str(level: u32) -> &'static str { } } -#[no_mangle] -pub extern "C" fn krun_set_log_level(level: u32) -> i32 { - let filter = log_level_to_filter_str(level); - env_logger::Builder::from_env(Env::default().default_filter_or(filter)) - .format_timestamp_micros() - .init(); - - #[cfg(feature = "aws-nitro")] - { - // Notify krun-awsnitro to enable debug for log level. - if level == 4 { - let mut debug = KRUN_NITRO_DEBUG.lock().unwrap(); - - *debug = true; - } - } - - KRUN_SUCCESS -} - mod log_defs { pub const KRUN_LOG_STYLE_AUTO: u32 = 0; pub const KRUN_LOG_STYLE_ALWAYS: u32 = 1; @@ -510,6 +419,14 @@ pub unsafe extern "C" fn krun_init_log(target: RawFd, level: u32, style: u32, op }; builder.format_timestamp_micros().target(target).init(); + #[cfg(feature = "aws-nitro")] + { + // Notify krun-awsnitro to enable debug for log level. + if level >= 4 { + *KRUN_NITRO_DEBUG.lock().unwrap() = true; + } + } + KRUN_SUCCESS } @@ -578,7 +495,7 @@ pub extern "C" fn krun_set_vm_config(ctx_id: u32, num_vcpus: u8, ram_mib: u32) - #[allow(clippy::missing_safety_doc)] #[no_mangle] -#[cfg(not(feature = "tee"))] +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] pub unsafe extern "C" fn krun_set_root(ctx_id: u32, c_root_path: *const c_char) -> i32 { let root_path = match CStr::from_ptr(c_root_path).to_str() { Ok(root) => root, @@ -593,11 +510,11 @@ pub unsafe extern "C" fn krun_set_root(ctx_id: u32, c_root_path: *const c_char) let cfg = ctx_cfg.get_mut(); cfg.vmr.add_fs_device(FsDeviceConfig { fs_id, - shared_dir, + shared_dir: Some(shared_dir), // Default to a conservative 512 MB window. shm_size: Some(1 << 29), - allow_root_dir_delete: false, read_only: false, + virtual_entries: Vec::new(), }); } Entry::Vacant(_) => return -libc::ENOENT, @@ -608,30 +525,7 @@ pub unsafe extern "C" fn krun_set_root(ctx_id: u32, c_root_path: *const c_char) #[allow(clippy::missing_safety_doc)] #[no_mangle] -#[cfg(not(feature = "tee"))] -pub unsafe extern "C" fn krun_add_virtiofs( - ctx_id: u32, - c_tag: *const c_char, - c_path: *const c_char, -) -> i32 { - krun_add_virtiofs3(ctx_id, c_tag, c_path, 0, false) -} - -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(not(feature = "tee"))] -pub unsafe extern "C" fn krun_add_virtiofs2( - ctx_id: u32, - c_tag: *const c_char, - c_path: *const c_char, - shm_size: u64, -) -> i32 { - krun_add_virtiofs3(ctx_id, c_tag, c_path, shm_size, false) -} - -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(not(feature = "tee"))] +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] pub unsafe extern "C" fn krun_add_virtiofs3( ctx_id: u32, c_tag: *const c_char, @@ -639,7 +533,7 @@ pub unsafe extern "C" fn krun_add_virtiofs3( shm_size: u64, read_only: bool, ) -> i32 { - if c_tag.is_null() || c_path.is_null() { + if c_tag.is_null() { return -libc::EINVAL; } @@ -647,9 +541,15 @@ pub unsafe extern "C" fn krun_add_virtiofs3( Ok(tag) => tag, Err(_) => return -libc::EINVAL, }; - let path = match CStr::from_ptr(c_path).to_str() { - Ok(path) => path, - Err(_) => return -libc::EINVAL, + + // NULL path means NullFs (virtual-only filesystem, no host directory). + let path = if c_path.is_null() { + None + } else { + match CStr::from_ptr(c_path).to_str() { + Ok(path) => Some(path), + Err(_) => return -libc::EINVAL, + } }; let shm = if shm_size > 0 { @@ -666,10 +566,10 @@ pub unsafe extern "C" fn krun_add_virtiofs3( let cfg = ctx_cfg.get_mut(); cfg.vmr.add_fs_device(FsDeviceConfig { fs_id: tag.to_string(), - shared_dir: path.to_string(), + shared_dir: path.map(|p| p.to_string()), shm_size: shm, - allow_root_dir_delete: false, read_only, + virtual_entries: Vec::new(), }); } Entry::Vacant(_) => return -libc::ENOENT, @@ -679,105 +579,6 @@ pub unsafe extern "C" fn krun_add_virtiofs3( } #[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(not(feature = "tee"))] -pub unsafe extern "C" fn krun_set_mapped_volumes( - _ctx_id: u32, - _c_mapped_volumes: *const *const c_char, -) -> i32 { - -libc::EINVAL -} - -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(feature = "blk")] -pub unsafe extern "C" fn krun_add_disk( - ctx_id: u32, - c_block_id: *const c_char, - c_disk_path: *const c_char, - read_only: bool, -) -> i32 { - let disk_path = match CStr::from_ptr(c_disk_path).to_str() { - Ok(disk) => disk, - Err(_) => return -libc::EINVAL, - }; - - let block_id = match CStr::from_ptr(c_block_id).to_str() { - Ok(block_id) => block_id, - Err(_) => return -libc::EINVAL, - }; - - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - let block_device_config = BlockDeviceConfig { - block_id: block_id.to_string(), - cache_type: CacheType::auto(disk_path), - disk_image_path: disk_path.to_string(), - disk_image_format: ImageType::Raw, - is_disk_read_only: read_only, - direct_io: false, - #[cfg(not(target_os = "macos"))] - sync_mode: SyncMode::Full, - #[cfg(target_os = "macos")] - sync_mode: SyncMode::Relaxed, - }; - cfg.add_block_cfg(block_device_config); - } - Entry::Vacant(_) => return -libc::ENOENT, - } - - KRUN_SUCCESS -} - -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(feature = "blk")] -pub unsafe extern "C" fn krun_add_disk2( - ctx_id: u32, - c_block_id: *const c_char, - c_disk_path: *const c_char, - disk_format: u32, - read_only: bool, -) -> i32 { - let disk_path = match CStr::from_ptr(c_disk_path).to_str() { - Ok(disk) => disk, - Err(_) => return -libc::EINVAL, - }; - - let block_id = match CStr::from_ptr(c_block_id).to_str() { - Ok(block_id) => block_id, - Err(_) => return -libc::EINVAL, - }; - - let format = match ImageType::try_from(disk_format) { - Ok(format) => format, - Err(_) => return -libc::EINVAL, - }; - - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - let block_device_config = BlockDeviceConfig { - block_id: block_id.to_string(), - cache_type: CacheType::auto(disk_path), - disk_image_path: disk_path.to_string(), - disk_image_format: format, - is_disk_read_only: read_only, - direct_io: false, - #[cfg(not(target_os = "macos"))] - sync_mode: SyncMode::Full, - #[cfg(target_os = "macos")] - sync_mode: SyncMode::Relaxed, - }; - cfg.add_block_cfg(block_device_config); - } - Entry::Vacant(_) => return -libc::ENOENT, - } - - KRUN_SUCCESS -} - #[allow(clippy::missing_safety_doc)] #[no_mangle] #[cfg(feature = "blk")] @@ -830,70 +631,6 @@ pub unsafe extern "C" fn krun_add_disk3( KRUN_SUCCESS } -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(feature = "blk")] -pub unsafe extern "C" fn krun_set_root_disk(ctx_id: u32, c_disk_path: *const c_char) -> i32 { - let disk_path = match CStr::from_ptr(c_disk_path).to_str() { - Ok(disk) => disk, - Err(_) => return -libc::EINVAL, - }; - - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - let block_device_config = BlockDeviceConfig { - block_id: "root".to_string(), - cache_type: CacheType::auto(disk_path), - disk_image_path: disk_path.to_string(), - disk_image_format: ImageType::Raw, - is_disk_read_only: false, - direct_io: false, - #[cfg(not(target_os = "macos"))] - sync_mode: SyncMode::Full, - #[cfg(target_os = "macos")] - sync_mode: SyncMode::Relaxed, - }; - cfg.set_root_block_cfg(block_device_config); - } - Entry::Vacant(_) => return -libc::ENOENT, - } - - KRUN_SUCCESS -} - -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(feature = "blk")] -pub unsafe extern "C" fn krun_set_data_disk(ctx_id: u32, c_disk_path: *const c_char) -> i32 { - let disk_path = match CStr::from_ptr(c_disk_path).to_str() { - Ok(disk) => disk, - Err(_) => return -libc::EINVAL, - }; - - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - let block_device_config = BlockDeviceConfig { - block_id: "data".to_string(), - cache_type: CacheType::auto(disk_path), - disk_image_path: disk_path.to_string(), - disk_image_format: ImageType::Raw, - is_disk_read_only: false, - direct_io: false, - #[cfg(not(target_os = "macos"))] - sync_mode: SyncMode::Full, - #[cfg(target_os = "macos")] - sync_mode: SyncMode::Relaxed, - }; - cfg.set_data_block_cfg(block_device_config); - } - Entry::Vacant(_) => return -libc::ENOENT, - } - - KRUN_SUCCESS -} - /* * Send the VFKIT magic after establishing the connection, * as required by gvproxy in vfkit mode. @@ -922,19 +659,7 @@ const NET_FEATURE_HOST_TSO4: u32 = 1 << 11; const NET_FEATURE_HOST_TSO6: u32 = 1 << 12; #[cfg(feature = "net")] const NET_FEATURE_HOST_UFO: u32 = 1 << 14; -/* - * These are the flags enabled by default on each virtio-net instance - * before the introduction of "krun_add_net_*". They are now used in - * the legacy API ("krun_set_passt_fd" and "krun_set_gvproxy_path") - * for compatiblity reasons. - */ -#[cfg(feature = "net")] -const NET_COMPAT_FEATURES: u32 = NET_FEATURE_CSUM - | NET_FEATURE_GUEST_CSUM - | NET_FEATURE_GUEST_TSO4 - | NET_FEATURE_GUEST_UFO - | NET_FEATURE_HOST_TSO4 - | NET_FEATURE_HOST_UFO; + #[cfg(feature = "net")] const NET_ALL_FEATURES: u32 = NET_FEATURE_CSUM | NET_FEATURE_GUEST_CSUM @@ -1130,75 +855,6 @@ pub unsafe extern "C" fn krun_add_net_tap( -libc::EINVAL } -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(feature = "net")] -pub unsafe extern "C" fn krun_set_passt_fd(ctx_id: u32, fd: c_int) -> i32 { - if fd < 0 { - return -libc::EINVAL; - } - - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - // The legacy interface only supports a single network interface. - if cfg.net_index != 0 { - return -libc::EINVAL; - } - cfg.legacy_net_cfg = Some(LegacyNetworkConfig::VirtioNetPasst(fd)); - } - Entry::Vacant(_) => return -libc::ENOENT, - } - KRUN_SUCCESS -} - -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(feature = "net")] -pub unsafe extern "C" fn krun_set_gvproxy_path(ctx_id: u32, c_path: *const c_char) -> i32 { - let path_str = match CStr::from_ptr(c_path).to_str() { - Ok(path) => path, - Err(e) => { - debug!("Error parsing gvproxy_path: {e:?}"); - return -libc::EINVAL; - } - }; - - let path = PathBuf::from(path_str); - - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - // The legacy interface only supports a single network interface. - if cfg.net_index != 0 { - return -libc::EINVAL; - } - cfg.legacy_net_cfg = Some(LegacyNetworkConfig::VirtioNetGvproxy(path)); - } - Entry::Vacant(_) => return -libc::ENOENT, - } - KRUN_SUCCESS -} - -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(feature = "net")] -pub unsafe extern "C" fn krun_set_net_mac(ctx_id: u32, c_mac: *const u8) -> i32 { - let mac: [u8; 6] = match slice::from_raw_parts(c_mac, 6).try_into() { - Ok(m) => m, - Err(_) => return -libc::EINVAL, - }; - - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - cfg.set_net_mac(mac); - } - Entry::Vacant(_) => return -libc::ENOENT, - } - KRUN_SUCCESS -} - #[allow(clippy::missing_safety_doc)] #[no_mangle] pub unsafe extern "C" fn krun_set_port_map(ctx_id: u32, c_port_map: *const *const c_char) -> i32 { @@ -1256,30 +912,29 @@ pub unsafe extern "C" fn krun_set_port_map(ctx_id: u32, c_port_map: *const *cons #[allow(clippy::missing_safety_doc)] #[no_mangle] pub unsafe extern "C" fn krun_set_rlimits(ctx_id: u32, c_rlimits: *const *const c_char) -> i32 { - let rlimits = if c_rlimits.is_null() { + if c_rlimits.is_null() { return -libc::EINVAL; - } else { - let mut strvec = Vec::new(); - - let array: &[*const c_char] = slice::from_raw_parts(c_rlimits, MAX_ARGS); - for item in array.iter().take(MAX_ARGS) { - if item.is_null() { - break; - } else { - let s = match CStr::from_ptr(*item).to_str() { - Ok(s) => s, - Err(_) => return -libc::EINVAL, - }; - strvec.push(s); - } - } + } - format!("\"{}\"", strvec.join(",")) + let array: &[*const c_char] = slice::from_raw_parts(c_rlimits, MAX_ARGS); + let rlimit_strs = match collect_str_array(array) { + Ok(v) => v, + Err(_) => return -libc::EINVAL, }; + // FIXME: rlimits should be a proper field in the config JSON (the OCI + // runtime spec has `process.rlimits`), not smuggled as an env var. + // The current init reads them from `KRUN_RLIMITS` in the process + // environment, which conflates init-internal knobs with the payload's + // env vars — the payload can see and even override them. This needs + // a coordinated fix in both the init binary and the config schema. + let rlimits_value = rlimit_strs.join(","); + match CTX_MAP.lock().unwrap().entry(ctx_id) { Entry::Occupied(mut ctx_cfg) => { - ctx_cfg.get_mut().set_rlimits(rlimits); + let env = &mut ctx_cfg.get_mut().init_config.env; + env.retain(|e| !e.starts_with("KRUN_RLIMITS=")); + env.push(format!("KRUN_RLIMITS={rlimits_value}")); } Entry::Vacant(_) => return -libc::ENOENT, } @@ -1297,7 +952,7 @@ pub unsafe extern "C" fn krun_set_workdir(ctx_id: u32, c_workdir_path: *const c_ match CTX_MAP.lock().unwrap().entry(ctx_id) { Entry::Occupied(mut ctx_cfg) => { - ctx_cfg.get_mut().set_workdir(workdir_path.to_string()); + ctx_cfg.get_mut().init_config.workdir = Some(workdir_path.to_string()); } Entry::Vacant(_) => return -libc::ENOENT, } @@ -1305,22 +960,18 @@ pub unsafe extern "C" fn krun_set_workdir(ctx_id: u32, c_workdir_path: *const c_ KRUN_SUCCESS } -unsafe fn collapse_str_array(array: &[*const c_char]) -> Result { - let mut strvec = Vec::new(); - +/// Collect a null-terminated C string array into a `Vec`. +unsafe fn collect_str_array(array: &[*const c_char]) -> Result, std::str::Utf8Error> { + let mut out = Vec::new(); for item in array.iter().take(MAX_ARGS) { if item.is_null() { break; - } else { - let s = CStr::from_ptr(*item).to_str()?; - strvec.push(format!("\"{s}\"")); } + out.push(CStr::from_ptr(*item).to_str()?.to_owned()); } - - Ok(strvec.join(" ")) + Ok(out) } -#[allow(clippy::format_collect)] #[allow(clippy::missing_safety_doc)] #[no_mangle] pub unsafe extern "C" fn krun_set_exec( @@ -1339,38 +990,38 @@ pub unsafe extern "C" fn krun_set_exec( let args = if !c_argv.is_null() { let argv_array: &[*const c_char] = slice::from_raw_parts(c_argv, MAX_ARGS); - match collapse_str_array(argv_array) { - Ok(s) => s, + match collect_str_array(argv_array) { + Ok(v) => v, Err(e) => { debug!("Error parsing args: {e:?}"); return -libc::EINVAL; } } } else { - "".to_string() + Vec::new() }; - let env = if !c_envp.is_null() { + let env_vars = if !c_envp.is_null() { let envp_array: &[*const c_char] = slice::from_raw_parts(c_envp, MAX_ARGS); - match collapse_str_array(envp_array) { - Ok(s) => s, + match collect_str_array(envp_array) { + Ok(v) => v, Err(e) => { - debug!("Error parsing args: {e:?}"); + debug!("Error parsing env: {e:?}"); return -libc::EINVAL; } } } else { env::vars() - .map(|(key, value)| format!(" {key}=\"{value}\"")) + .map(|(key, value)| format!("{key}={value}")) .collect() }; match CTX_MAP.lock().unwrap().entry(ctx_id) { Entry::Occupied(mut ctx_cfg) => { let cfg = ctx_cfg.get_mut(); - cfg.set_exec_path(exec_path.to_string()); - cfg.set_env(env); - cfg.set_args(args); + cfg.init_config.entrypoint = vec![exec_path.to_string()]; + cfg.init_config.args = args; + cfg.init_config.env = env_vars; } Entry::Vacant(_) => return -libc::ENOENT, } @@ -1378,29 +1029,27 @@ pub unsafe extern "C" fn krun_set_exec( KRUN_SUCCESS } -#[allow(clippy::format_collect)] #[allow(clippy::missing_safety_doc)] #[no_mangle] pub unsafe extern "C" fn krun_set_env(ctx_id: u32, c_envp: *const *const c_char) -> i32 { - let env = if !c_envp.is_null() { + let env_vars = if !c_envp.is_null() { let envp_array: &[*const c_char] = slice::from_raw_parts(c_envp, MAX_ARGS); - match collapse_str_array(envp_array) { - Ok(s) => s, + match collect_str_array(envp_array) { + Ok(v) => v, Err(e) => { - debug!("Error parsing args: {e:?}"); + debug!("Error parsing env: {e:?}"); return -libc::EINVAL; } } } else { env::vars() - .map(|(key, value)| format!(" {key}=\"{value}\"")) + .map(|(key, value)| format!("{key}={value}")) .collect() }; match CTX_MAP.lock().unwrap().entry(ctx_id) { Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - cfg.set_env(env); + ctx_cfg.get_mut().init_config.env = env_vars; } Entry::Vacant(_) => return -libc::ENOENT, } @@ -1428,16 +1077,6 @@ pub unsafe extern "C" fn krun_set_tee_config_file(ctx_id: u32, c_filepath: *cons KRUN_SUCCESS } -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_add_vsock_port( - ctx_id: u32, - port: u32, - c_filepath: *const c_char, -) -> i32 { - krun_add_vsock_port2(ctx_id, port, c_filepath, false) -} - #[allow(clippy::missing_safety_doc)] #[no_mangle] pub unsafe extern "C" fn krun_add_vsock_port2( @@ -1478,20 +1117,6 @@ pub unsafe extern "C" fn krun_add_vsock_port2( KRUN_SUCCESS } -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_set_gpu_options(ctx_id: u32, virgl_flags: u32) -> i32 { - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - cfg.set_gpu_virgl_flags(virgl_flags); - } - Entry::Vacant(_) => return -libc::ENOENT, - } - - KRUN_SUCCESS -} - #[allow(clippy::missing_safety_doc)] #[no_mangle] pub unsafe extern "C" fn krun_set_gpu_options2( @@ -1871,27 +1496,12 @@ pub unsafe extern "C" fn krun_add_vhost_user_device( -libc::ENOTSUP } -#[allow(unused_assignments)] -#[no_mangle] -pub extern "C" fn krun_get_shutdown_eventfd(ctx_id: u32) -> i32 { - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - if let Some(efd) = cfg.shutdown_efd.as_ref() { - #[cfg(target_os = "macos")] - return efd.get_write_fd(); - #[cfg(target_os = "linux")] - return efd.as_raw_fd(); - } else { - -libc::EINVAL - } - } - Entry::Vacant(_) => -libc::ENOENT, - } -} - +// FIXME: aws-nitro builds its own NitroEnclave from ContextConfig and needs +// the console output path directly. This should be replaced with a proper +// console configuration in the nitro path. #[allow(clippy::missing_safety_doc)] #[no_mangle] +#[cfg(feature = "aws-nitro")] pub unsafe extern "C" fn krun_set_console_output(ctx_id: u32, c_filepath: *const c_char) -> i32 { let filepath = match CStr::from_ptr(c_filepath).to_str() { Ok(f) => f, @@ -1912,6 +1522,25 @@ pub unsafe extern "C" fn krun_set_console_output(ctx_id: u32, c_filepath: *const } } +#[allow(unused_assignments)] +#[no_mangle] +pub extern "C" fn krun_get_shutdown_eventfd(ctx_id: u32) -> i32 { + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + if let Some(efd) = cfg.shutdown_efd.as_ref() { + #[cfg(target_os = "macos")] + return efd.get_write_fd(); + #[cfg(target_os = "linux")] + return efd.as_raw_fd(); + } else { + -libc::EINVAL + } + } + Entry::Vacant(_) => -libc::ENOENT, + } +} + #[allow(clippy::missing_safety_doc)] #[no_mangle] pub unsafe extern "C" fn krun_set_nested_virt(ctx_id: u32, enabled: bool) -> i32 { @@ -2318,7 +1947,7 @@ pub extern "C" fn krun_setgid(ctx_id: u32, gid: libc::gid_t) -> i32 { KRUN_SUCCESS } -#[cfg(all(feature = "blk", not(feature = "tee")))] +#[cfg(all(feature = "blk", not(any(feature = "tee", feature = "aws-nitro"))))] #[allow(clippy::missing_safety_doc)] #[no_mangle] pub unsafe extern "C" fn krun_set_root_disk_remount( @@ -2379,25 +2008,33 @@ pub unsafe extern "C" fn krun_set_root_disk_remount( return -libc::EINVAL; } - // To boot from a filesystem other than virtiofs, - // we need to setup a temporary root from which init.krun can be executed. - // Otherwise, it would have to be copied to the target filesystem beforehand. - // Instead, init.krun will run from virtiofs and then switch to the real root. - let root_dir_suffix = Alphanumeric.sample_string(&mut rand::rng(), 6); - let empty_root = env::temp_dir().join(format!("krun-empty-root-{root_dir_suffix}")); - - if let Err(e) = std::fs::create_dir_all(&empty_root) { - error!("Failed to create empty root directory: {e:?}"); - return -libc::EINVAL; + // Boot from a block device: the virtiofs root only needs to + // provide mount points for /dev, /proc, /sys. The init binary + // and config JSON are injected at krun_start_enter time. + // Use a NullFs (no host directory) with the inode overlay. + let mut virtual_entries = Vec::new(); + // init.c needs these directories as mount points before + // pivoting to the block device root. + for name in ["dev", "proc", "sys", "newroot"] { + virtual_entries.push(VirtualDirEntry { + name: CString::new(name).unwrap(), + entry: VirtualEntry { + mode: 0o755, + one_shot: false, + content: VirtualEntryContent::Dir { + children: Vec::new(), + }, + }, + }); } ctx_cfg.vmr.add_fs_device(FsDeviceConfig { fs_id: "/dev/root".into(), - shared_dir: empty_root.to_string_lossy().into(), + shared_dir: None, // Default to a conservative 512 MB window. shm_size: Some(1 << 29), - allow_root_dir_delete: true, read_only: false, + virtual_entries, }); ctx_cfg.set_block_root(device, fstype, options); @@ -2408,12 +2045,35 @@ pub unsafe extern "C" fn krun_set_root_disk_remount( KRUN_SUCCESS } +/// Sets the init configuration from an OCI container-spec JSON string. +/// +/// The JSON should use OCI image config keys (`Entrypoint`, `Cmd`, `Env`, +/// `WorkingDir`, `mounts`). This replaces any configuration previously set +/// via `krun_set_exec`, `krun_set_workdir`, `krun_set_env`, etc. +#[allow(clippy::missing_safety_doc)] #[no_mangle] -pub extern "C" fn krun_disable_implicit_console(ctx_id: u32) -> i32 { +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +pub unsafe extern "C" fn krun_set_oci_config_json(ctx_id: u32, c_json: *const c_char) -> i32 { + if c_json.is_null() { + return -libc::EINVAL; + } + + let json = match CStr::from_ptr(c_json).to_str() { + Ok(s) => s, + Err(_) => return -libc::EINVAL, + }; + + let config = match init_blob::InitConfig::from_oci_spec_json(json) { + Ok(c) => c, + Err(e) => { + debug!("Error parsing OCI config JSON: {e}"); + return -libc::EINVAL; + } + }; + match CTX_MAP.lock().unwrap().entry(ctx_id) { Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - cfg.vmr.disable_implicit_console = true; + ctx_cfg.get_mut().init_config = config; } Entry::Vacant(_) => return -libc::ENOENT, } @@ -2421,12 +2081,37 @@ pub extern "C" fn krun_disable_implicit_console(ctx_id: u32) -> i32 { KRUN_SUCCESS } +/// Injects the init binary and config JSON into the specified virtiofs device. +/// +/// Call this after configuring the init via `krun_set_exec`/`krun_set_workdir`/ +/// `krun_set_env` or `krun_set_oci_config_json`, and after the target virtiofs +/// device has been created (e.g. via `krun_set_root` or `krun_add_virtiofs3`). +/// +/// `c_fs_tag` identifies which virtiofs device receives the init files +/// (typically `KRUN_FS_ROOT_TAG`, i.e. `"/dev/root"`). +#[allow(clippy::missing_safety_doc)] #[no_mangle] -pub extern "C" fn krun_disable_implicit_vsock(ctx_id: u32) -> i32 { +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +pub unsafe extern "C" fn krun_inject_init(ctx_id: u32, c_fs_tag: *const c_char) -> i32 { + if c_fs_tag.is_null() { + return -libc::EINVAL; + } + + let fs_tag = match CStr::from_ptr(c_fs_tag).to_str() { + Ok(s) => s, + Err(_) => return -libc::EINVAL, + }; + match CTX_MAP.lock().unwrap().entry(ctx_id) { Entry::Occupied(mut ctx_cfg) => { let cfg = ctx_cfg.get_mut(); - cfg.vsock_config = VsockConfig::Disabled; + let Some(fs) = cfg.vmr.fs.iter_mut().find(|f| f.fs_id == fs_tag) else { + debug!("krun_inject_init: virtiofs device '{fs_tag}' not found"); + return -libc::ENOENT; + }; + for gf in cfg.init_config.guest_files() { + fs.virtual_entries.push(guest_file_to_virtual_entry(gf)); + } } Entry::Vacant(_) => return -libc::ENOENT, } @@ -2434,6 +2119,141 @@ pub extern "C" fn krun_disable_implicit_vsock(ctx_id: u32) -> i32 { KRUN_SUCCESS } +/// Resolve a path like "a/b/c" into parent directory children + leaf name. +/// Errors with a libc errno if any intermediate component is missing or not a Dir. +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +fn resolve_overlay_path<'a>( + entries: &'a mut Vec, + path: &str, +) -> Result<(&'a mut Vec, CString), i32> { + let path = path.strip_prefix('/').unwrap_or(path); + let components: Vec<&str> = path.split('/').collect(); + let (leaf, parents) = components.split_last().ok_or(-libc::EINVAL)?; + if leaf.is_empty() { + return Err(-libc::EINVAL); + } + + let mut current = entries; + for component in parents { + let dir = current + .iter_mut() + .find(|e| e.name.as_c_str().to_bytes() == component.as_bytes()) + .ok_or(-libc::ENOENT)?; + match &mut dir.entry.content { + VirtualEntryContent::Dir { children } => current = children, + _ => return Err(-libc::ENOTDIR), + } + } + + let name = CString::new(*leaf).map_err(|_| -libc::EINVAL)?; + Ok((current, name)) +} + +/// Add a virtual overlay entry to a virtiofs device, resolving paths with `/`. +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +fn fs_add_overlay_entry(ctx_id: u32, fs_tag: &str, path: &str, entry: VirtualEntry) -> i32 { + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + let fs_cfg = match cfg.vmr.fs.iter_mut().find(|fs| fs.fs_id == fs_tag) { + Some(fs) => fs, + None => return -libc::ENOENT, + }; + let (parent_children, name) = + match resolve_overlay_path(&mut fs_cfg.virtual_entries, path) { + Ok(v) => v, + Err(e) => return e, + }; + parent_children.push(VirtualDirEntry { name, entry }); + } + Entry::Vacant(_) => return -libc::ENOENT, + } + KRUN_SUCCESS +} + +#[allow(clippy::missing_safety_doc)] +#[no_mangle] +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +pub unsafe extern "C" fn krun_fs_add_overlay_file( + ctx_id: u32, + c_fs_tag: *const c_char, + c_path: *const c_char, + data: *const u8, + data_len: size_t, + mode: u32, + one_shot: bool, +) -> i32 { + if c_fs_tag.is_null() || c_path.is_null() { + return -libc::EINVAL; + } + + let fs_tag = match CStr::from_ptr(c_fs_tag).to_str() { + Ok(s) => s, + Err(_) => return -libc::EINVAL, + }; + let path = match CStr::from_ptr(c_path).to_str() { + Ok(s) => s, + Err(_) => return -libc::EINVAL, + }; + + // SAFETY: The caller guarantees the memory remains valid for the VM + // lifetime (see the C header contract). + let payload: &'static [u8] = if data_len == 0 { + &[] + } else if !data.is_null() { + slice::from_raw_parts(data, data_len) + } else { + return -libc::EINVAL; + }; + + fs_add_overlay_entry( + ctx_id, + fs_tag, + path, + VirtualEntry { + mode, + one_shot, + content: VirtualEntryContent::File { data: payload }, + }, + ) +} + +#[allow(clippy::missing_safety_doc)] +#[no_mangle] +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +pub unsafe extern "C" fn krun_fs_add_overlay_dir( + ctx_id: u32, + c_fs_tag: *const c_char, + c_path: *const c_char, + mode: u32, +) -> i32 { + if c_fs_tag.is_null() || c_path.is_null() { + return -libc::EINVAL; + } + + let fs_tag = match CStr::from_ptr(c_fs_tag).to_str() { + Ok(s) => s, + Err(_) => return -libc::EINVAL, + }; + let path = match CStr::from_ptr(c_path).to_str() { + Ok(s) => s, + Err(_) => return -libc::EINVAL, + }; + + fs_add_overlay_entry( + ctx_id, + fs_tag, + path, + VirtualEntry { + mode, + one_shot: false, + content: VirtualEntryContent::Dir { + children: Vec::new(), + }, + }, + ) +} + #[no_mangle] pub extern "C" fn krun_add_vsock(ctx_id: u32, tsi_features: u32) -> i32 { let tsi_flags = match TsiFlags::from_bits(tsi_features) { @@ -2695,39 +2515,24 @@ pub extern "C" fn krun_start_enter(ctx_id: u32) -> i32 { return -libc::EINVAL; } + let block_root = ctx_cfg.get_block_root(); let kernel_cmdline = KernelCmdlineConfig { - prolog: Some(format!("{DEFAULT_KERNEL_CMDLINE} init={INIT_PATH}")), - krun_env: Some(format!( - " {} {} {} {} {}", - ctx_cfg.get_exec_path(), - ctx_cfg.get_workdir(), - ctx_cfg.get_block_root(), - ctx_cfg.get_rlimits(), - ctx_cfg.get_env(), + prolog: Some(format!( + "{DEFAULT_KERNEL_CMDLINE} init={}", + init_blob::INIT_PATH )), - epilog: Some(format!(" -- {}", ctx_cfg.get_args())), + krun_env: if block_root.is_empty() { + None + } else { + Some(format!(" {block_root}")) + }, + epilog: None, }; if ctx_cfg.vmr.set_kernel_cmdline(kernel_cmdline).is_err() { return -libc::EINVAL; } - #[cfg(feature = "net")] - { - if let Some(legacy_net_cfg) = ctx_cfg.legacy_net_cfg.clone() { - let backend = match legacy_net_cfg { - LegacyNetworkConfig::VirtioNetGvproxy(path) => { - VirtioNetBackend::UnixgramPath(path, true) - } - LegacyNetworkConfig::VirtioNetPasst(fd) => VirtioNetBackend::UnixstreamFd(fd), - }; - let mac = ctx_cfg - .legacy_mac - .unwrap_or([0x5a, 0x94, 0xef, 0xe4, 0x0c, 0xee]); - create_virtio_net(&mut ctx_cfg, backend, mac, NET_COMPAT_FEATURES); - } - } - match &ctx_cfg.vsock_config { VsockConfig::Disabled => (), VsockConfig::Explicit { tsi_flags } => { @@ -2740,33 +2545,6 @@ pub extern "C" fn krun_start_enter(ctx_id: u32) -> i32 { }; ctx_cfg.vmr.set_vsock_device(vsock_device_config).unwrap(); } - VsockConfig::Implicit => { - // Implicit vsock configuration - use heuristics - // Check if TSI should be enabled based on network configuration - #[cfg(feature = "net")] - let enable_tsi = ctx_cfg.vmr.net.list.is_empty() && ctx_cfg.legacy_net_cfg.is_none(); - #[cfg(not(feature = "net"))] - let enable_tsi = true; - - let has_ipc_map = ctx_cfg.unix_ipc_port_map.is_some(); - - if enable_tsi || has_ipc_map { - let (tsi_flags, host_port_map) = if enable_tsi { - (TsiFlags::HIJACK_INET, ctx_cfg.tsi_port_map) - } else { - (TsiFlags::empty(), None) - }; - - let vsock_device_config = VsockDeviceConfig { - vsock_id: "vsock0".to_string(), - guest_cid: 3, - host_port_map, - unix_ipc_port_map: ctx_cfg.unix_ipc_port_map.clone(), - tsi_flags, - }; - ctx_cfg.vmr.set_vsock_device(vsock_device_config).unwrap(); - } - } } if let Some(virgl_flags) = ctx_cfg.gpu_virgl_flags { @@ -2776,10 +2554,6 @@ pub extern "C" fn krun_start_enter(ctx_id: u32) -> i32 { ctx_cfg.vmr.set_gpu_shm_size(shm_size); } - if let Some(console_output) = ctx_cfg.console_output { - ctx_cfg.vmr.set_console_output(console_output); - } - if let Some(gid) = ctx_cfg.vmm_gid { if unsafe { libc::setgid(gid) } != 0 { error!("Failed to set gid {gid}"); diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index b92b931d4..23793390b 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -14,7 +14,6 @@ use std::fs::File; use std::io::{self, IsTerminal, Read}; use std::os::fd::AsRawFd; use std::os::fd::{BorrowedFd, FromRawFd}; -use std::path::PathBuf; use std::sync::atomic::AtomicI32; use std::sync::{Arc, Mutex}; @@ -728,16 +727,7 @@ pub fn build_microvm( let mut serial_devices = Vec::new(); - // Create the legacy serial device if we're booting from a firmware - if vm_resources.firmware_config.is_some() && !vm_resources.disable_implicit_console { - serial_devices.push(setup_serial_device( - event_manager, - None, - None, - // Uncomment this to get EFI output when debugging EDK2. - //Some(Box::new(io::stdout())), - )?); - }; + // We can't call to `setup_terminal_raw_mode` until `Vmm` is created, // so let's keep track of FDs connected to legacy serial devices here @@ -995,18 +985,6 @@ pub fn build_microvm( } } let mut console_id = 0; - if !vm_resources.disable_implicit_console { - attach_console_devices( - &mut vmm, - event_manager, - intc.clone(), - vm_resources, - None, - console_id, - )?; - console_id += 1; - } - for console_cfg in vm_resources.virtio_consoles.iter() { attach_console_devices( &mut vmm, @@ -2040,8 +2018,8 @@ fn attach_fs_devices( config.fs_id.clone(), config.shared_dir.clone(), exit_code.clone(), - config.allow_root_dir_delete, config.read_only, + config.virtual_entries.clone(), ) .unwrap(), )); @@ -2076,39 +2054,16 @@ fn attach_fs_devices( fn autoconfigure_console_ports( vmm: &mut Vmm, - vm_resources: &VmResources, + _vm_resources: &VmResources, cfg: Option<&DefaultVirtioConsoleConfig>, - creating_implicit_console: bool, ) -> std::result::Result, StartMicrovmError> { use self::StartMicrovmError::*; - let mut console_output_path: Option = None; - if let Some(path) = vm_resources.console_output.clone() { - if !vm_resources.disable_implicit_console && creating_implicit_console { - console_output_path = Some(path) - } - } - - if let Some(console_output_path) = console_output_path { - let file = File::create(console_output_path).map_err(OpenConsoleFile)?; - // Manually emulate our Legacy behavior: In the case of output_path we have always used the - // stdin to determine the console size - let stdin_fd = unsafe { BorrowedFd::borrow_raw(STDIN_FILENO) }; - let term_fd = if isatty(stdin_fd).is_ok_and(|v| v) { - port_io::term_fd(stdin_fd.as_raw_fd()).unwrap() - } else { - port_io::term_fixed_size(0, 0) - }; - Ok(vec![PortDescription::console( - Some(port_io::input_empty().unwrap()), - Some(port_io::output_file(file).unwrap()), - term_fd, - )]) - } else { - let (input_fd, output_fd, err_fd) = match cfg { - Some(c) => (c.input_fd, c.output_fd, c.err_fd), - None => (STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO), - }; + let (input_fd, output_fd, err_fd) = match cfg { + Some(c) => (c.input_fd, c.output_fd, c.err_fd), + None => (STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO), + }; + { let input_is_terminal = input_fd >= 0 && isatty(unsafe { BorrowedFd::borrow_raw(input_fd) }).unwrap_or(false); let output_is_terminal = @@ -2269,16 +2224,11 @@ fn attach_console_devices( ) -> std::result::Result<(), StartMicrovmError> { use self::StartMicrovmError::*; - let creating_implicit_console = cfg.is_none(); - let ports = match cfg { - None => autoconfigure_console_ports(vmm, vm_resources, None, creating_implicit_console)?, - Some(VirtioConsoleConfigMode::Autoconfigure(autocfg)) => autoconfigure_console_ports( - vmm, - vm_resources, - Some(autocfg), - creating_implicit_console, - )?, + None => autoconfigure_console_ports(vmm, vm_resources, None)?, + Some(VirtioConsoleConfigMode::Autoconfigure(autocfg)) => { + autoconfigure_console_ports(vmm, vm_resources, Some(autocfg))? + } Some(VirtioConsoleConfigMode::Explicit(ports)) => create_explicit_ports(vmm, ports)?, }; diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index ffa9e6eac..9b15bc66a 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -8,7 +8,6 @@ use std::fs::File; #[cfg(feature = "tee")] use std::io::BufReader; use std::os::fd::RawFd; -use std::path::PathBuf; #[cfg(feature = "tee")] use serde::{Deserialize, Serialize}; @@ -131,13 +130,11 @@ pub enum PortConfig { /// Configuration for the vsock device #[derive(Debug, Default, Clone, Eq, PartialEq)] pub enum VsockConfig { - /// Default behavior - vsock created implicitly with heuristics-based TSI + /// No vsock device #[default] - Implicit, + Disabled, /// Explicit configuration with specified TSI features Explicit { tsi_flags: TsiFlags }, - /// Vsock device disabled - Disabled, } /// A data structure that encapsulates the device configurations @@ -189,16 +186,13 @@ pub struct VmResources { #[cfg(feature = "vhost-user")] /// Vhost-user device configurations pub vhost_user_devices: Vec, - /// File to send console output. - pub console_output: Option, /// SMBIOS OEM Strings pub smbios_oem_strings: Option>, /// Whether to enable nested virtualization. pub nested_enabled: bool, /// Whether to enable split irqchip pub split_irqchip: bool, - /// Do not create an implicit console device in the guest - pub disable_implicit_console: bool, + /// The console id to use for console= in the kernel cmdline pub kernel_console: Option, /// Serial consoles to attach to the guest @@ -358,10 +352,6 @@ impl VmResources { self.gpu_shm_size = Some(shm_size); } - pub fn set_console_output(&mut self, console_output: PathBuf) { - self.console_output = Some(console_output); - } - /// Sets a network device to be attached when the VM starts. #[cfg(feature = "net")] pub fn add_network_interface( @@ -400,8 +390,6 @@ impl VmResources { #[cfg(test)] mod tests { - #[cfg(feature = "gpu")] - use crate::resources::DisplayBackendConfig; use crate::resources::VmResources; use crate::vmm_config::kernel_cmdline::KernelCmdlineConfig; use crate::vmm_config::machine_config::{CpuFeaturesTemplate, VmConfig, VmConfigError}; @@ -440,11 +428,10 @@ mod tests { input_backends: Vec::new(), #[cfg(feature = "vhost-user")] vhost_user_devices: Vec::new(), - console_output: None, smbios_oem_strings: None, nested_enabled: false, split_irqchip: false, - disable_implicit_console: false, + serial_consoles: Vec::new(), virtio_consoles: Vec::new(), kernel_console: None, diff --git a/src/vmm/src/vmm_config/fs.rs b/src/vmm/src/vmm_config/fs.rs index ccf86f5cd..92927ec9a 100644 --- a/src/vmm/src/vmm_config/fs.rs +++ b/src/vmm/src/vmm_config/fs.rs @@ -1,8 +1,14 @@ +#[cfg(not(feature = "aws-nitro"))] +use devices::virtio::fs::virtual_entry::VirtualDirEntry; + #[derive(Clone, Debug)] pub struct FsDeviceConfig { pub fs_id: String, - pub shared_dir: String, + /// Host directory to pass through. None means a virtual-only filesystem + /// (NullFs + AugmentFs, no host directory). + pub shared_dir: Option, pub shm_size: Option, - pub allow_root_dir_delete: bool, pub read_only: bool, + #[cfg(not(feature = "aws-nitro"))] + pub virtual_entries: Vec, } diff --git a/tests/run.sh b/tests/run.sh index 3d7b1e6ef..87bd65310 100755 --- a/tests/run.sh +++ b/tests/run.sh @@ -42,6 +42,11 @@ if [ "$OS" = "Darwin" ]; then export CARGO_TARGET_AARCH64_UNKNOWN_LINUX_MUSL_LINKER="clang" export CARGO_TARGET_AARCH64_UNKNOWN_LINUX_MUSL_RUSTFLAGS="-C link-arg=-target -C link-arg=aarch64-linux-gnu -C link-arg=-fuse-ld=lld -C link-arg=--sysroot=$SYSROOT -C link-arg=-static" echo "Cross-compiling guest-agent for $GUEST_TARGET" + + # e2fsprogs is keg-only on macOS; add it to PATH for mke2fs. + if [ -d "/opt/homebrew/opt/e2fsprogs/sbin" ]; then + export PATH="/opt/homebrew/opt/e2fsprogs/sbin:$PATH" + fi fi cargo build --target=$GUEST_TARGET -p guest-agent diff --git a/tests/test_cases/src/common.rs b/tests/test_cases/src/common.rs index 3d8881ff8..7e5a18eea 100644 --- a/tests/test_cases/src/common.rs +++ b/tests/test_cases/src/common.rs @@ -62,6 +62,7 @@ pub fn setup_fs_and_enter_with_env( argv.as_ptr(), envp.as_ptr(), ))?; + krun_call!(krun_inject_init(ctx, c"/dev/root".as_ptr()))?; krun_call!(krun_start_enter(ctx))?; } unreachable!() diff --git a/tests/test_cases/src/lib.rs b/tests/test_cases/src/lib.rs index 83f3b6b14..0f0b88290 100644 --- a/tests/test_cases/src/lib.rs +++ b/tests/test_cases/src/lib.rs @@ -22,6 +22,12 @@ use test_multiport_console::TestMultiportConsole; mod test_virtiofs_root_ro; use test_virtiofs_root_ro::TestVirtiofsRootRo; +mod test_augmentfs; +use test_augmentfs::TestAugmentFs; + +mod test_root_disk_remount; +use test_root_disk_remount::TestRootDiskRemount; + mod test_pjdfstest; use test_pjdfstest::TestPjdfstest; @@ -84,6 +90,8 @@ pub fn test_cases() -> Vec { TestCase::new("net-vmnet-helper", Box::new(TestNet::new_vmnet_helper())), TestCase::new("multiport-console", Box::new(TestMultiportConsole)), TestCase::new("virtiofs-root-ro", Box::new(TestVirtiofsRootRo)), + TestCase::new("augmentfs", Box::new(TestAugmentFs)), + TestCase::new("root-disk-remount", Box::new(TestRootDiskRemount)), TestCase::new("virtiofs-misc", Box::new(TestVirtioFsMisc)), TestCase::new("pjdfstest", Box::new(TestPjdfstest)), TestCase::new("perf-net-passt-tx", Box::new(TestNetPerf::new_passt_tx())), diff --git a/tests/test_cases/src/test_augmentfs.rs b/tests/test_cases/src/test_augmentfs.rs new file mode 100644 index 000000000..c205e2a43 --- /dev/null +++ b/tests/test_cases/src/test_augmentfs.rs @@ -0,0 +1,283 @@ +// Test the AugmentFs overlay over a NullFs. +// +// Boots a VM with NO host filesystem — the root virtiofs is backed entirely +// by virtual inodes: init.krun (one-shot), the guest-agent binary (one-shot), +// a .krun_config.json (one-shot), persistent test files, and virtual +// directories as mount points for /dev, /proc, /sys. + +use macros::{guest, host}; + +pub struct TestAugmentFs; + +fn make_test_payload() -> Vec { + (0..8192u32).map(|i| (i % 251) as u8).collect() +} + +#[host] +mod host { + use super::*; + + use crate::{krun_call, krun_call_u32}; + use crate::{Test, TestSetup}; + use krun_sys::*; + use std::ffi::CString; + use std::os::fd::AsRawFd; + + impl Test for TestAugmentFs { + fn start_vm(self: Box, test_setup: TestSetup) -> anyhow::Result<()> { + let test_case = CString::new(test_setup.test_case)?; + + // Read the guest-agent binary into memory. Leaked because + // krun_start_enter never returns. + let guest_agent_path = std::env::var("KRUN_TEST_GUEST_AGENT_PATH") + .expect("KRUN_TEST_GUEST_AGENT_PATH not set"); + let guest_agent_bytes: &'static [u8] = + Vec::leak(std::fs::read(&guest_agent_path).expect("Failed to read guest-agent")); + + // Build OCI config JSON: exec the guest-agent with our test name. + let json = CString::new(format!( + r#"{{"Entrypoint": ["/guest-agent"], "Cmd": ["{}"], "WorkingDir": "/"}}"#, + test_case.to_str().unwrap() + ))?; + + // Deterministic test payload for range-read tests. + let payload: &'static [u8] = Vec::leak(make_test_payload()); + + // A small marker file to test persistent reads. + let marker: &'static [u8] = b"virtual-file-marker-content-12345"; + + unsafe { + krun_call!(krun_init_log(KRUN_LOG_TARGET_DEFAULT, KRUN_LOG_LEVEL_TRACE, KRUN_LOG_STYLE_AUTO, 0))?; + let ctx = krun_call_u32!(krun_create_ctx())?; + krun_call!(krun_set_vm_config(ctx, 1, 512))?; + krun_call!(krun_add_virtio_console_default( + ctx, + std::io::stdin().as_raw_fd(), + std::io::stdout().as_raw_fd(), + std::io::stderr().as_raw_fd(), + ))?; + + // Set up root with NO host directory (NullFs). + krun_call!(krun_add_virtiofs3( + ctx, + c"/dev/root".as_ptr(), + std::ptr::null(), // NULL path → NullFs + 0, // no SHM window + false, // not read-only + ))?; + + // Virtual directories needed by init as mount points. + for dir in [c"dev", c"proc", c"sys"] { + krun_call!(krun_fs_add_overlay_dir( + ctx, + c"/dev/root".as_ptr(), + dir.as_ptr(), + 0o040_755, + ))?; + } + + // Configure init from OCI JSON and inject init binary + + // config into the root virtiofs. + krun_call!(krun_set_oci_config_json(ctx, json.as_ptr()))?; + krun_call!(krun_inject_init(ctx, c"/dev/root".as_ptr()))?; + + // Overlay guest-agent (one-shot, executable). After init + // execs it, the file should no longer be visible. + krun_call!(krun_fs_add_overlay_file( + ctx, + c"/dev/root".as_ptr(), + c"guest-agent".as_ptr(), + guest_agent_bytes.as_ptr(), + guest_agent_bytes.len(), + 0o100_755, + true, + ))?; + + // Overlay a persistent marker file. + krun_call!(krun_fs_add_overlay_file( + ctx, + c"/dev/root".as_ptr(), + c"marker.txt".as_ptr(), + marker.as_ptr(), + marker.len(), + 0o100_644, + false, + ))?; + + // Overlay a deterministic 8 KiB payload for range-read tests. + krun_call!(krun_fs_add_overlay_file( + ctx, + c"/dev/root".as_ptr(), + c"testdata.bin".as_ptr(), + payload.as_ptr(), + payload.len(), + 0o100_444, + false, + ))?; + + // --- Nested path test (2-level) --- + // etc/ -> etc/nested/ -> etc/nested/deep.txt + krun_call!(krun_fs_add_overlay_dir( + ctx, + c"/dev/root".as_ptr(), + c"etc".as_ptr(), + 0o040_755, + ))?; + krun_call!(krun_fs_add_overlay_dir( + ctx, + c"/dev/root".as_ptr(), + c"etc/nested".as_ptr(), + 0o040_755, + ))?; + let nested_content: &'static [u8] = b"deep-nested-content"; + krun_call!(krun_fs_add_overlay_file( + ctx, + c"/dev/root".as_ptr(), + c"etc/nested/deep.txt".as_ptr(), + nested_content.as_ptr(), + nested_content.len(), + 0o100_644, + false, + ))?; + + krun_call!(krun_start_enter(ctx))?; + } + Ok(()) + } + } +} + +#[guest] +mod guest { + use super::*; + use crate::Test; + use std::fs; + use std::io::{ErrorKind, Read, Seek, SeekFrom}; + use std::path::Path; + + impl Test for TestAugmentFs { + fn in_guest(self: Box) { + // --- One-shot files should be gone --- + assert!( + !Path::new("/.krun_config.json").exists(), + ".krun_config.json should be gone (one-shot)" + ); + assert!( + !Path::new("/init.krun").exists(), + "init.krun should be gone (one-shot)" + ); + + // --- One-shot guest-agent can't see itself --- + assert!( + !Path::new("/guest-agent").exists(), + "guest-agent should be gone (one-shot)" + ); + + // --- Virtual directories should be accessible --- + // init already mounted over these, but let's verify they + // exist as directories (the mount points came from our + // virtual dir overlay). + for dir in ["/dev", "/proc", "/sys"] { + let meta = fs::metadata(dir).unwrap_or_else(|e| panic!("{dir} should exist: {e}")); + assert!(meta.is_dir(), "{dir} should be a directory"); + } + + // Verify the mounts actually worked by checking known entries. + assert!( + Path::new("/dev/null").exists(), + "/dev/null should exist (devtmpfs)" + ); + assert!( + Path::new("/proc/self").exists(), + "/proc/self should exist (procfs)" + ); + assert!( + Path::new("/sys/kernel").exists(), + "/sys/kernel should exist (sysfs)" + ); + + // Verify directory listing works on each mounted fs. + let dev_entries: Vec<_> = fs::read_dir("/dev").expect("read_dir /dev").collect(); + assert!(!dev_entries.is_empty(), "/dev listing should not be empty"); + + let proc_entries: Vec<_> = fs::read_dir("/proc").expect("read_dir /proc").collect(); + assert!( + !proc_entries.is_empty(), + "/proc listing should not be empty" + ); + + let sys_entries: Vec<_> = fs::read_dir("/sys").expect("read_dir /sys").collect(); + assert!(!sys_entries.is_empty(), "/sys listing should not be empty"); + + // --- Persistent files should still exist --- + assert!(Path::new("/marker.txt").exists(), "marker.txt should exist"); + assert!( + Path::new("/testdata.bin").exists(), + "testdata.bin should exist" + ); + + // --- Read + verify marker content --- + let content = fs::read_to_string("/marker.txt").expect("read marker.txt"); + assert_eq!(content, "virtual-file-marker-content-12345"); + + // --- Repeated reads return the same data --- + let content2 = fs::read_to_string("/marker.txt").expect("re-read marker.txt"); + assert_eq!(content, content2, "repeated reads differ"); + + // --- Write should fail --- + let err = fs::OpenOptions::new() + .write(true) + .open("/marker.txt") + .expect_err("write-open should fail"); + assert_eq!(err.kind(), ErrorKind::PermissionDenied); + + // --- stat reports correct size --- + let meta = fs::metadata("/testdata.bin").expect("stat testdata.bin"); + assert_eq!(meta.len(), 8192, "testdata.bin size mismatch"); + + // --- Range reads on the 8 KiB payload --- + let expected = make_test_payload(); + let mut f = fs::File::open("/testdata.bin").expect("open testdata.bin"); + + // Full read. + let got = fs::read("/testdata.bin").expect("full read"); + assert_eq!(got, expected, "full read mismatch"); + + // Read first 256 bytes. + let mut buf = vec![0u8; 256]; + f.read_exact(&mut buf).expect("read first 256"); + assert_eq!(buf, &expected[..256], "first 256 bytes mismatch"); + + // Seek to offset 4000, read 512 bytes. + f.seek(SeekFrom::Start(4000)).expect("seek to 4000"); + let mut buf = vec![0u8; 512]; + f.read_exact(&mut buf).expect("read at offset 4000"); + assert_eq!(buf, &expected[4000..4512], "range [4000..4512] mismatch"); + + // Seek to last 10 bytes. + f.seek(SeekFrom::End(-10)).expect("seek to end-10"); + let mut buf = vec![0u8; 10]; + f.read_exact(&mut buf).expect("read last 10"); + assert_eq!(buf, &expected[8182..8192], "last 10 bytes mismatch"); + + // Read past EOF should return 0 bytes. + f.seek(SeekFrom::Start(8192)).expect("seek to EOF"); + let mut buf = vec![0u8; 100]; + let n = f.read(&mut buf).expect("read past EOF"); + assert_eq!(n, 0, "read past EOF should return 0"); + + // Seek back to start, re-read, verify consistency. + f.seek(SeekFrom::Start(0)).expect("seek to start"); + let mut full = Vec::new(); + f.read_to_end(&mut full).expect("read_to_end"); + assert_eq!(full, expected, "read_to_end mismatch"); + + // --- Nested path test (2-level: etc/nested/deep.txt) --- + let deep = + fs::read_to_string("/etc/nested/deep.txt").expect("read /etc/nested/deep.txt"); + assert_eq!(deep, "deep-nested-content"); + + println!("OK"); + } + } +} diff --git a/tests/test_cases/src/test_multiport_console.rs b/tests/test_cases/src/test_multiport_console.rs index b9c4c1fd6..7717d0610 100644 --- a/tests/test_cases/src/test_multiport_console.rs +++ b/tests/test_cases/src/test_multiport_console.rs @@ -50,11 +50,14 @@ mod host { impl Test for TestMultiportConsole { fn start_vm(self: Box, test_setup: TestSetup) -> anyhow::Result<()> { unsafe { - krun_call!(krun_set_log_level(KRUN_LOG_LEVEL_TRACE))?; + krun_call!(krun_init_log( + KRUN_LOG_TARGET_DEFAULT, + KRUN_LOG_LEVEL_TRACE, + KRUN_LOG_STYLE_AUTO, + 0 + ))?; let ctx = krun_call_u32!(krun_create_ctx())?; - krun_call!(krun_disable_implicit_console(ctx))?; - // Add a default console (as with other tests this uses stdout for writing "OK") krun_call!(krun_add_virtio_console_default( ctx, diff --git a/tests/test_cases/src/test_net/mod.rs b/tests/test_cases/src/test_net/mod.rs index 9eb973a81..acac92956 100644 --- a/tests/test_cases/src/test_net/mod.rs +++ b/tests/test_cases/src/test_net/mod.rs @@ -86,6 +86,7 @@ mod host { use crate::common::setup_fs_and_enter; use crate::{krun_call, krun_call_u32, Test, TestOutcome, TestSetup}; use krun_sys::*; + use std::os::fd::AsRawFd; use std::thread; impl Test for TestNet { @@ -116,13 +117,24 @@ mod host { thread::spawn(move || tcp_tester.run_server(listener)); unsafe { - krun_call!(krun_set_log_level(KRUN_LOG_LEVEL_TRACE))?; + krun_call!(krun_init_log( + KRUN_LOG_TARGET_DEFAULT, + KRUN_LOG_LEVEL_TRACE, + KRUN_LOG_STYLE_AUTO, + 0 + ))?; let ctx = krun_call_u32!(krun_create_ctx())?; krun_call!(krun_set_vm_config(ctx, 1, 512))?; // Backend-specific setup (self.setup_backend)(ctx, &test_setup)?; + krun_call!(krun_add_virtio_console_default( + ctx, + std::io::stdin().as_raw_fd(), + std::io::stdout().as_raw_fd(), + std::io::stderr().as_raw_fd(), + ))?; setup_fs_and_enter(ctx, test_setup)?; } Ok(()) diff --git a/tests/test_cases/src/test_net_perf.rs b/tests/test_cases/src/test_net_perf.rs index 4c9c7acbe..3e16d3bba 100644 --- a/tests/test_cases/src/test_net_perf.rs +++ b/tests/test_cases/src/test_net_perf.rs @@ -155,6 +155,7 @@ mod host { use crate::common::setup_fs_and_enter; use crate::{krun_call, krun_call_u32, Test, TestOutcome, TestSetup}; use krun_sys::*; + use std::os::fd::AsRawFd; use std::process::{Child, Command, Stdio}; const CONTAINERFILE: &str = "\ @@ -360,6 +361,12 @@ RUN dnf install -y iperf3 && dnf clean all // Backend-specific setup (self.setup_backend)(ctx, &test_setup)?; + krun_call!(krun_add_virtio_console_default( + ctx, + std::io::stdin().as_raw_fd(), + std::io::stdout().as_raw_fd(), + std::io::stderr().as_raw_fd(), + ))?; setup_fs_and_enter(ctx, test_setup)?; } Ok(()) diff --git a/tests/test_cases/src/test_pjdfstest.rs b/tests/test_cases/src/test_pjdfstest.rs index 741895cb5..ecfeb2006 100644 --- a/tests/test_cases/src/test_pjdfstest.rs +++ b/tests/test_cases/src/test_pjdfstest.rs @@ -9,6 +9,7 @@ mod host { use crate::{krun_call, krun_call_u32, ShouldRun, Test, TestOutcome, TestSetup}; use krun_sys::*; use std::ffi::CString; + use std::os::fd::AsRawFd; use macros::env_or_default; @@ -54,6 +55,12 @@ mod host { unsafe { let ctx = krun_call_u32!(krun_create_ctx())?; krun_call!(krun_set_vm_config(ctx, 2, 1024))?; + krun_call!(krun_add_virtio_console_default( + ctx, + std::io::stdin().as_raw_fd(), + std::io::stdout().as_raw_fd(), + std::io::stderr().as_raw_fd(), + ))?; setup_fs_and_enter_with_env(ctx, test_setup, &[host_os_env.as_c_str()])?; } Ok(()) diff --git a/tests/test_cases/src/test_root_disk_remount.rs b/tests/test_cases/src/test_root_disk_remount.rs new file mode 100644 index 000000000..c52cdc422 --- /dev/null +++ b/tests/test_cases/src/test_root_disk_remount.rs @@ -0,0 +1,183 @@ +// Test that krun_set_root_disk_remount works with NullFs. +// +// Creates a tiny ext4 disk image containing only the guest-agent binary, +// boots from it via krun_set_root_disk_remount (which uses NullFs for the +// initial virtiofs root with init.krun overlaid), and verifies the guest +// successfully pivoted to the block device root. + +use macros::{guest, host}; + +pub struct TestRootDiskRemount; + +#[host] +mod host { + use super::*; + + use crate::{krun_call, krun_call_u32, ShouldRun}; + use crate::{Test, TestSetup}; + use krun_sys::*; + use nix::libc; + use std::ffi::CString; + use std::os::fd::AsRawFd; + use std::process::Command; + use std::ptr::null; + + type KrunAddDisk3Fn = unsafe extern "C" fn( + ctx_id: u32, + block_id: *const std::ffi::c_char, + disk_path: *const std::ffi::c_char, + disk_format: u32, + read_only: bool, + direct_io: bool, + sync_mode: u32, + ) -> i32; + + type KrunSetRootDiskRemountFn = unsafe extern "C" fn( + ctx_id: u32, + device: *const std::ffi::c_char, + fstype: *const std::ffi::c_char, + options: *const std::ffi::c_char, + ) -> i32; + + fn get_krun_add_disk3() -> KrunAddDisk3Fn { + let symbol = CString::new("krun_add_disk3").unwrap(); + let ptr = unsafe { libc::dlsym(libc::RTLD_DEFAULT, symbol.as_ptr()) }; + assert!(!ptr.is_null(), "krun_add_disk3 not found"); + unsafe { std::mem::transmute(ptr) } + } + + fn get_krun_set_root_disk_remount() -> KrunSetRootDiskRemountFn { + let symbol = CString::new("krun_set_root_disk_remount").unwrap(); + let ptr = unsafe { libc::dlsym(libc::RTLD_DEFAULT, symbol.as_ptr()) }; + assert!(!ptr.is_null(), "krun_set_root_disk_remount not found"); + unsafe { std::mem::transmute(ptr) } + } + + fn create_disk_image(guest_agent_path: &str, output_path: &str) { + // Populate from a staging directory using mke2fs -d (no root needed). + let staging = format!("{output_path}.staging"); + std::fs::create_dir_all(&staging).expect("mkdir staging"); + + std::fs::copy(guest_agent_path, format!("{staging}/guest-agent")) + .expect("copy guest-agent"); + + // Marker file to verify the guest booted from the block device. + std::fs::write( + format!("{staging}/block-marker"), + "booted-from-block-device", + ) + .expect("write marker"); + + let status = Command::new("mke2fs") + .args(["-q", "-t", "ext4", "-d", &staging, output_path, "32M"]) + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .status() + .expect("mke2fs failed"); + assert!(status.success(), "mke2fs failed"); + + std::fs::remove_dir_all(&staging).expect("cleanup staging"); + } + + impl Test for TestRootDiskRemount { + fn should_run(&self) -> ShouldRun { + if unsafe { krun_call_u32!(krun_has_feature(KRUN_FEATURE_BLK.into())) }.ok() != Some(1) + { + return ShouldRun::No("libkrun compiled without BLK"); + } + ShouldRun::Yes + } + + fn start_vm(self: Box, test_setup: TestSetup) -> anyhow::Result<()> { + let krun_add_disk3 = get_krun_add_disk3(); + let krun_set_root_disk_remount = get_krun_set_root_disk_remount(); + + let guest_agent_path = std::env::var("KRUN_TEST_GUEST_AGENT_PATH") + .expect("KRUN_TEST_GUEST_AGENT_PATH not set"); + + let disk_path = format!("{}/rootfs.ext4", test_setup.tmp_dir.display()); + create_disk_image(&guest_agent_path, &disk_path); + + let c_disk_path = CString::new(disk_path)?; + let test_case = CString::new(test_setup.test_case)?; + + unsafe { + krun_call!(krun_init_log( + KRUN_LOG_TARGET_DEFAULT, + KRUN_LOG_LEVEL_TRACE, + KRUN_LOG_STYLE_AUTO, + 0 + ))?; + let ctx = krun_call_u32!(krun_create_ctx())?; + krun_call!(krun_set_vm_config(ctx, 1, 512))?; + krun_call!(krun_add_virtio_console_default( + ctx, + std::io::stdin().as_raw_fd(), + std::io::stdout().as_raw_fd(), + std::io::stderr().as_raw_fd(), + ))?; + + let argv = [test_case.as_ptr(), null()]; + let envp = [null()]; + krun_call!(krun_set_exec( + ctx, + c"/guest-agent".as_ptr(), + argv.as_ptr(), + envp.as_ptr(), + ))?; + + krun_call!(krun_set_workdir(ctx, c"/".as_ptr()))?; + + // Add a block device with the ext4 image. + krun_call!(krun_add_disk3( + ctx, + c"vda".as_ptr(), + c_disk_path.as_ptr(), + KRUN_DISK_FORMAT_RAW, + false, + false, + KRUN_SYNC_FULL, + ))?; + + // Configure block device as root, pivot from NullFs. + krun_call!(krun_set_root_disk_remount( + ctx, + c"/dev/vda".as_ptr(), + c"ext4".as_ptr(), + std::ptr::null(), + ))?; + + krun_call!(krun_inject_init(ctx, c"/dev/root".as_ptr()))?; + krun_call!(krun_start_enter(ctx))?; + } + Ok(()) + } + } +} + +#[guest] +mod guest { + use super::*; + use crate::Test; + use std::fs; + use std::path::Path; + + impl Test for TestRootDiskRemount { + fn in_guest(self: Box) { + // Verify we're running from the block device root. + let marker = fs::read_to_string("/block-marker") + .expect("Failed to read /block-marker — not on block device root?"); + assert_eq!(marker, "booted-from-block-device"); + + // The init.krun virtual file should be gone (one-shot, and we + // pivoted away from the NullFs root anyway). + assert!(!Path::new("/init.krun").exists()); + + // /proc and /dev should be mounted (init re-mounts after pivot). + assert!(Path::new("/proc/self").exists(), "/proc/self missing"); + assert!(Path::new("/dev/null").exists(), "/dev/null missing"); + + println!("OK"); + } + } +} diff --git a/tests/test_cases/src/test_tsi_tcp_guest_connect.rs b/tests/test_cases/src/test_tsi_tcp_guest_connect.rs index 426cdad05..f574eb3e1 100644 --- a/tests/test_cases/src/test_tsi_tcp_guest_connect.rs +++ b/tests/test_cases/src/test_tsi_tcp_guest_connect.rs @@ -24,6 +24,7 @@ mod host { use crate::{krun_call, krun_call_u32}; use crate::{Test, TestSetup}; use krun_sys::*; + use std::os::fd::AsRawFd; use std::thread; impl Test for TestTsiTcpGuestConnect { @@ -31,9 +32,21 @@ mod host { let listener = self.tcp_tester.create_server_socket(); thread::spawn(move || self.tcp_tester.run_server(listener)); unsafe { - krun_call!(krun_set_log_level(KRUN_LOG_LEVEL_TRACE))?; + krun_call!(krun_init_log( + KRUN_LOG_TARGET_DEFAULT, + KRUN_LOG_LEVEL_TRACE, + KRUN_LOG_STYLE_AUTO, + 0 + ))?; let ctx = krun_call_u32!(krun_create_ctx())?; + krun_call!(krun_add_vsock(ctx, KRUN_TSI_HIJACK_INET))?; krun_call!(krun_set_vm_config(ctx, 1, 512))?; + krun_call!(krun_add_virtio_console_default( + ctx, + std::io::stdin().as_raw_fd(), + std::io::stdout().as_raw_fd(), + std::io::stderr().as_raw_fd(), + ))?; setup_fs_and_enter(ctx, test_setup)?; } Ok(()) diff --git a/tests/test_cases/src/test_tsi_tcp_guest_listen.rs b/tests/test_cases/src/test_tsi_tcp_guest_listen.rs index 41e0ffc2d..8197cb855 100644 --- a/tests/test_cases/src/test_tsi_tcp_guest_listen.rs +++ b/tests/test_cases/src/test_tsi_tcp_guest_listen.rs @@ -23,6 +23,7 @@ mod host { use crate::{krun_call, krun_call_u32, Test, TestSetup}; use krun_sys::*; use std::ffi::CString; + use std::os::fd::AsRawFd; use std::ptr::null; use std::thread; use std::time::Duration; @@ -35,14 +36,26 @@ mod host { self.tcp_tester.run_client(); }); - krun_call!(krun_set_log_level(KRUN_LOG_LEVEL_TRACE))?; + krun_call!(krun_init_log( + KRUN_LOG_TARGET_DEFAULT, + KRUN_LOG_LEVEL_TRACE, + KRUN_LOG_STYLE_AUTO, + 0 + ))?; let ctx = krun_call_u32!(krun_create_ctx())?; let port_mapping = format!("{PORT}:{PORT}"); let port_mapping = CString::new(port_mapping).unwrap(); let port_map = [port_mapping.as_ptr(), null()]; + krun_call!(krun_add_vsock(ctx, KRUN_TSI_HIJACK_INET))?; krun_call!(krun_set_port_map(ctx, port_map.as_ptr()))?; krun_call!(krun_set_vm_config(ctx, 1, 512))?; + krun_call!(krun_add_virtio_console_default( + ctx, + std::io::stdin().as_raw_fd(), + std::io::stdout().as_raw_fd(), + std::io::stderr().as_raw_fd(), + ))?; setup_fs_and_enter(ctx, test_setup)?; println!("OK"); } diff --git a/tests/test_cases/src/test_virtiofs_misc.rs b/tests/test_cases/src/test_virtiofs_misc.rs index 2bd8b69cb..137d58fa9 100644 --- a/tests/test_cases/src/test_virtiofs_misc.rs +++ b/tests/test_cases/src/test_virtiofs_misc.rs @@ -11,13 +11,25 @@ mod host { use crate::{Test, TestOutcome, TestSetup}; use krun_sys::*; use std::io::Read; + use std::os::fd::AsRawFd; impl Test for TestVirtioFsMisc { fn start_vm(self: Box, test_setup: TestSetup) -> anyhow::Result<()> { unsafe { - krun_call!(krun_set_log_level(KRUN_LOG_LEVEL_TRACE))?; + krun_call!(krun_init_log( + KRUN_LOG_TARGET_DEFAULT, + KRUN_LOG_LEVEL_TRACE, + KRUN_LOG_STYLE_AUTO, + 0 + ))?; let ctx = krun_call_u32!(krun_create_ctx())?; krun_call!(krun_set_vm_config(ctx, 1, 1024))?; + krun_call!(krun_add_virtio_console_default( + ctx, + std::io::stdin().as_raw_fd(), + std::io::stdout().as_raw_fd(), + std::io::stderr().as_raw_fd(), + ))?; setup_fs_and_enter(ctx, test_setup)?; } Ok(()) diff --git a/tests/test_cases/src/test_virtiofs_root_ro.rs b/tests/test_cases/src/test_virtiofs_root_ro.rs index 1fff83ce2..26bce9f78 100644 --- a/tests/test_cases/src/test_virtiofs_root_ro.rs +++ b/tests/test_cases/src/test_virtiofs_root_ro.rs @@ -20,6 +20,7 @@ mod host { use krun_sys::*; use std::ffi::CString; use std::fs; + use std::os::fd::AsRawFd; use std::os::unix::ffi::OsStrExt; use std::ptr::null; @@ -40,9 +41,20 @@ mod host { let envp = [null()]; unsafe { - krun_call!(krun_set_log_level(KRUN_LOG_LEVEL_TRACE))?; + krun_call!(krun_init_log( + KRUN_LOG_TARGET_DEFAULT, + KRUN_LOG_LEVEL_TRACE, + KRUN_LOG_STYLE_AUTO, + 0 + ))?; let ctx = krun_call_u32!(krun_create_ctx())?; krun_call!(krun_set_vm_config(ctx, 1, 512))?; + krun_call!(krun_add_virtio_console_default( + ctx, + std::io::stdin().as_raw_fd(), + std::io::stdout().as_raw_fd(), + std::io::stderr().as_raw_fd(), + ))?; // Use "/dev/root" tag (KRUN_FS_ROOT_TAG) with read_only=true krun_call!(krun_add_virtiofs3( @@ -60,6 +72,7 @@ mod host { argv.as_ptr(), envp.as_ptr(), ))?; + krun_call!(krun_inject_init(ctx, c"/dev/root".as_ptr()))?; krun_call!(krun_start_enter(ctx))?; } Ok(()) diff --git a/tests/test_cases/src/test_vm_config.rs b/tests/test_cases/src/test_vm_config.rs index 9ccae5de1..604856912 100644 --- a/tests/test_cases/src/test_vm_config.rs +++ b/tests/test_cases/src/test_vm_config.rs @@ -13,13 +13,25 @@ mod host { use crate::{krun_call, krun_call_u32}; use crate::{Test, TestSetup}; use krun_sys::*; + use std::os::fd::AsRawFd; impl Test for TestVmConfig { fn start_vm(self: Box, test_setup: TestSetup) -> anyhow::Result<()> { unsafe { - krun_call!(krun_set_log_level(KRUN_LOG_LEVEL_TRACE))?; + krun_call!(krun_init_log( + KRUN_LOG_TARGET_DEFAULT, + KRUN_LOG_LEVEL_TRACE, + KRUN_LOG_STYLE_AUTO, + 0 + ))?; let ctx = krun_call_u32!(krun_create_ctx())?; krun_call!(krun_set_vm_config(ctx, self.num_cpus, self.ram_mib))?; + krun_call!(krun_add_virtio_console_default( + ctx, + std::io::stdin().as_raw_fd(), + std::io::stdout().as_raw_fd(), + std::io::stderr().as_raw_fd(), + ))?; setup_fs_and_enter(ctx, test_setup)?; } Ok(()) diff --git a/tests/test_cases/src/test_vsock_guest_connect.rs b/tests/test_cases/src/test_vsock_guest_connect.rs index bb0482f29..bc82b3800 100644 --- a/tests/test_cases/src/test_vsock_guest_connect.rs +++ b/tests/test_cases/src/test_vsock_guest_connect.rs @@ -39,6 +39,7 @@ mod host { use krun_sys::*; use std::ffi::CString; use std::io::Write; + use std::os::fd::AsRawFd; use std::os::unix::net::UnixListener; use std::os::unix::prelude::OsStrExt; use std::{mem, thread}; @@ -63,14 +64,27 @@ mod host { thread::spawn(move || server(listener)); unsafe { - krun_call!(krun_set_log_level(KRUN_LOG_LEVEL_TRACE))?; + krun_call!(krun_init_log( + KRUN_LOG_TARGET_DEFAULT, + KRUN_LOG_LEVEL_TRACE, + KRUN_LOG_STYLE_AUTO, + 0 + ))?; let ctx = krun_call_u32!(krun_create_ctx())?; - krun_call!(krun_add_vsock_port( + krun_call!(krun_add_vsock(ctx, 0))?; + krun_call!(krun_add_vsock_port2( ctx, VSOCK_PORT, - sock_path_cstr.as_ptr() + sock_path_cstr.as_ptr(), + false, ))?; krun_call!(krun_set_vm_config(ctx, 1, 1024))?; + krun_call!(krun_add_virtio_console_default( + ctx, + std::io::stdin().as_raw_fd(), + std::io::stdout().as_raw_fd(), + std::io::stderr().as_raw_fd(), + ))?; setup_fs_and_enter(ctx, test_setup)?; } Ok(())