From e037a409879e088dfe3475f7e00173a39ba14b18 Mon Sep 17 00:00:00 2001 From: Dmitry Porokh Date: Wed, 27 May 2026 14:03:26 -0700 Subject: [PATCH 1/2] refactor(api): move machine controller to separate crate Signed-off-by: Dmitry Porokh --- Cargo.lock | 221 +++++++++++++----- crates/api/Cargo.toml | 152 +++++------- crates/api/src/api.rs | 4 +- crates/api/src/attestation/measured_boot.rs | 2 +- crates/api/src/cfg/file.rs | 84 +------ crates/api/src/handlers/attestation.rs | 2 +- crates/api/src/handlers/instance.rs | 2 +- crates/api/src/handlers/machine_validation.rs | 4 +- .../dpu_nic_firmware.rs | 2 +- crates/api/src/machine_update_manager/mod.rs | 2 +- crates/api/src/machine_validation/mod.rs | 2 +- crates/api/src/run.rs | 2 +- crates/api/src/setup.rs | 102 ++++---- crates/api/src/state_controller/mod.rs | 1 - .../api/src/tests/common/api_fixtures/mod.rs | 28 +-- crates/api/src/tests/dpf/duplicate_events.rs | 2 +- crates/api/src/tests/dpf/happy_path.rs | 6 +- crates/api/src/tests/dpf/reprovisioning.rs | 2 +- crates/api/src/tests/dpf/stale_labels.rs | 2 +- crates/api/src/tests/dpf/waiting_for_ready.rs | 2 +- crates/api/src/tests/dpu_nic_firmware.rs | 2 +- crates/api/src/tests/dpu_reprovisioning.rs | 2 +- .../api/src/tests/host_bmc_firmware_test.rs | 6 +- .../src/tests/machine_admin_force_delete.rs | 2 +- crates/api/src/tests/machine_creator.rs | 2 +- crates/api/src/tests/machine_setup.rs | 82 +++++++ crates/api/src/tests/machine_states.rs | 8 +- .../api/src/tests/machine_update_manager.rs | 2 +- crates/api/src/tests/machine_validation.rs | 6 +- crates/api/src/tests/mod.rs | 1 + crates/machine-controller/Cargo.toml | 76 ++++++ .../src}/config/bom_validation.rs | 0 .../src}/config/controller.rs | 0 .../src}/config/firmware_global.rs | 4 +- .../src/config/machine_validation.rs | 84 +++++++ .../src}/config/mod.rs | 12 + .../src}/config/power_manager.rs | 0 .../src}/context.rs | 4 +- .../machine => machine-controller/src}/dpf.rs | 4 +- .../src}/handler.rs | 91 +------- 
.../src}/handler/attestation.rs | 2 +- .../src}/handler/bios_config.rs | 4 +- .../src}/handler/dpf.rs | 4 +- .../src}/handler/helpers.rs | 0 .../src}/handler/machine_validation.rs | 8 +- .../src}/handler/power.rs | 6 +- .../src}/handler/sku.rs | 6 +- .../src}/health_report.rs | 0 .../machine => machine-controller/src}/io.rs | 4 +- .../mod.rs => machine-controller/src/lib.rs} | 2 +- .../src}/metrics.rs | 2 +- .../src}/redfish.rs | 4 +- .../src}/write_ops.rs | 0 53 files changed, 606 insertions(+), 448 deletions(-) create mode 100644 crates/api/src/tests/machine_setup.rs create mode 100644 crates/machine-controller/Cargo.toml rename crates/{api/src/state_controller/machine => machine-controller/src}/config/bom_validation.rs (100%) rename crates/{api/src/state_controller/machine => machine-controller/src}/config/controller.rs (100%) rename crates/{api/src/state_controller/machine => machine-controller/src}/config/firmware_global.rs (98%) create mode 100644 crates/machine-controller/src/config/machine_validation.rs rename crates/{api/src/state_controller/machine => machine-controller/src}/config/mod.rs (78%) rename crates/{api/src/state_controller/machine => machine-controller/src}/config/power_manager.rs (100%) rename crates/{api/src/state_controller/machine => machine-controller/src}/context.rs (93%) rename crates/{api/src/state_controller/machine => machine-controller/src}/dpf.rs (99%) rename crates/{api/src/state_controller/machine => machine-controller/src}/handler.rs (99%) rename crates/{api/src/state_controller/machine => machine-controller/src}/handler/attestation.rs (99%) rename crates/{api/src/state_controller/machine => machine-controller/src}/handler/bios_config.rs (99%) rename crates/{api/src/state_controller/machine => machine-controller/src}/handler/dpf.rs (99%) rename crates/{api/src/state_controller/machine => machine-controller/src}/handler/helpers.rs (100%) rename crates/{api/src/state_controller/machine => 
machine-controller/src}/handler/machine_validation.rs (97%) rename crates/{api/src/state_controller/machine => machine-controller/src}/handler/power.rs (97%) rename crates/{api/src/state_controller/machine => machine-controller/src}/handler/sku.rs (99%) rename crates/{api/src/state_controller/machine => machine-controller/src}/health_report.rs (100%) rename crates/{api/src/state_controller/machine => machine-controller/src}/io.rs (99%) rename crates/{api/src/state_controller/machine/mod.rs => machine-controller/src/lib.rs} (99%) rename crates/{api/src/state_controller/machine => machine-controller/src}/metrics.rs (99%) rename crates/{api/src/state_controller/machine => machine-controller/src}/redfish.rs (97%) rename crates/{api/src/state_controller/machine => machine-controller/src}/write_ops.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index 5ff7ef2b6d..70f9aec032 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1164,9 +1164,7 @@ dependencies = [ "axum", "axum-extra", "base64", - "bmc-explorer", "bmc-vendor", - "bms-dsx-exchange", "byteorder", "bytes", "carbide-api-db", @@ -1175,7 +1173,6 @@ dependencies = [ "carbide-dpa-interface-controller", "carbide-dpf", "carbide-firmware", - "carbide-health-metrics", "carbide-health-report", "carbide-host-support", "carbide-ib-fabric", @@ -1183,14 +1180,19 @@ dependencies = [ "carbide-ipmi", "carbide-ipxe-renderer", "carbide-libmlx", + "carbide-machine-controller", "carbide-macros", "carbide-measured-boot", "carbide-metrics-utils", + "carbide-mqtt-common", "carbide-network", "carbide-network-segment-controller", "carbide-nvlink-manager", + "carbide-power-shelf-controller", "carbide-preingestion-manager", "carbide-prost-builder", + "carbide-rack", + "carbide-rack-controller", "carbide-redfish", "carbide-rpc", "carbide-rpc-utils", @@ -1198,9 +1200,8 @@ dependencies = [ "carbide-site-explorer", "carbide-spdm-controller", "carbide-sqlx-testing", - "carbide-ssh", + "carbide-state-controller-common", "carbide-switch-controller", - 
"carbide-tls", "carbide-utils", "carbide-uuid", "carbide-version", @@ -1210,7 +1211,6 @@ dependencies = [ "component-manager", "config-version", "const_format", - "crypto-bigint 0.7.0-rc.28", "ctor", "dashmap", "data-encoding", @@ -1226,29 +1226,21 @@ dependencies = [ "http", "http-body-util", "hyper", - "hyper-rustls", - "hyper-timeout", "hyper-util", "ipnetwork", "itertools 0.14.0", "jsonwebtoken", - "k8s-openapi", - "kube", "lazy_static", "libnmxc", - "libnmxm", "libredfish", "librms", "logfmt", "mac_address", - "mockall", "mockito", "mqttea", "nras", "num_cpus", "oauth2", - "oid-registry", - "once_cell", "opentelemetry", "opentelemetry-otlp", "opentelemetry-prometheus", @@ -1259,7 +1251,6 @@ dependencies = [ "prometheus", "prometheus-text-parser", "prost", - "prost-types", "rand 0.10.1", "rcgen", "regex", @@ -1280,7 +1271,6 @@ dependencies = [ "strum 0.28.0", "temp-dir", "tempfile", - "tera", "thiserror 2.0.18", "time", "tokio", @@ -1292,7 +1282,6 @@ dependencies = [ "tonic-reflection", "tower", "tower-http", - "tower-test", "tracing", "tracing-log", "tracing-opentelemetry", @@ -1301,7 +1290,6 @@ dependencies = [ "url", "urlencoding", "uuid", - "version-compare 0.2.1", "x509-parser", ] @@ -1331,7 +1319,7 @@ dependencies = [ "eyre", "futures", "futures-util", - "hickory-proto 0.26.1", + "hickory-proto", "ipnetwork", "itertools 0.14.0", "lazy_static", @@ -2185,6 +2173,50 @@ dependencies = [ "uuid", ] +[[package]] +name = "carbide-machine-controller" +version = "0.0.0" +dependencies = [ + "async-trait", + "bmc-vendor", + "carbide-api-db", + "carbide-api-model", + "carbide-dpf", + "carbide-firmware", + "carbide-health-metrics", + "carbide-health-report", + "carbide-ipmi", + "carbide-measured-boot", + "carbide-redfish", + "carbide-rpc", + "carbide-secrets", + "carbide-state-controller-common", + "carbide-utils", + "carbide-uuid", + "chrono", + "config-version", + "duration-str", + "eyre", + "figment", + "futures", + "futures-util", + "itertools 0.14.0", + 
"lazy_static", + "libredfish", + "mac_address", + "mockall", + "opentelemetry", + "regex", + "serde", + "serde_json", + "sqlx", + "state-controller", + "tokio", + "tracing", + "uuid", + "version-compare 0.2.1", +] + [[package]] name = "carbide-machine-validation" version = "0.0.0" @@ -2259,6 +2291,18 @@ dependencies = [ "tracing-subscriber", ] +[[package]] +name = "carbide-mqtt-common" +version = "0.1.0" +dependencies = [ + "async-trait", + "mqttea", + "opentelemetry", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "carbide-network" version = "0.0.0" @@ -2321,6 +2365,28 @@ dependencies = [ "uuid", ] +[[package]] +name = "carbide-power-shelf-controller" +version = "0.0.0" +dependencies = [ + "async-trait", + "carbide-api-db", + "carbide-api-model", + "carbide-health-metrics", + "carbide-rack", + "carbide-secrets", + "carbide-utils", + "carbide-uuid", + "config-version", + "eyre", + "librms", + "mac_address", + "opentelemetry", + "sqlx", + "state-controller", + "tracing", +] + [[package]] name = "carbide-preingestion-manager" version = "0.0.1" @@ -2389,6 +2455,60 @@ dependencies = [ "uuid", ] +[[package]] +name = "carbide-rack" +version = "0.0.0" +dependencies = [ + "async-trait", + "bms-dsx-exchange", + "carbide-api-db", + "carbide-api-model", + "carbide-health-report", + "carbide-mqtt-common", + "carbide-secrets", + "carbide-uuid", + "chrono", + "eyre", + "librms", + "mac_address", + "mqttea", + "opentelemetry", + "serde_json", + "sqlx", + "state-controller", + "tokio", + "tokio-util", + "tonic", + "tracing", +] + +[[package]] +name = "carbide-rack-controller" +version = "0.0.0" +dependencies = [ + "async-trait", + "carbide-api-db", + "carbide-api-model", + "carbide-health-metrics", + "carbide-rack", + "carbide-secrets", + "carbide-utils", + "carbide-uuid", + "chrono", + "config-version", + "duration-str", + "eyre", + "librms", + "mac_address", + "opentelemetry", + "serde", + "serde_json", + "sqlx", + "state-controller", + "tonic", + "tracing", +] 
+ [[package]] name = "carbide-redfish" version = "0.0.1" @@ -2744,6 +2864,16 @@ dependencies = [ "uuid", ] +[[package]] +name = "carbide-state-controller-common" +version = "0.0.0" +dependencies = [ + "carbide-utils", + "duration-str", + "serde", + "state-controller", +] + [[package]] name = "carbide-switch-controller" version = "0.0.0" @@ -3945,14 +4075,14 @@ checksum = "abd57806937c9cc163efc8ea3910e00a62e2aeb0b8119f1793a978088f8f6b04" [[package]] name = "dhcproto" -version = "0.14.0" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "425ab19f6a915beac79cac8ec2810c1311b502ae14d7f294682081cf5ae4c5bb" +checksum = "c278d2f17dbcb7332f3b31788be67f76017096c5eedc293e1259f2d48b0f891f" dependencies = [ "dhcproto-macros", - "hickory-proto 0.25.2", + "hickory-proto", "ipnet", - "rand 0.9.4", + "rand 0.10.1", "thiserror 2.0.18", ] @@ -5127,7 +5257,7 @@ dependencies = [ "futures-channel", "futures-io", "futures-util", - "hickory-proto 0.26.1", + "hickory-proto", "idna 1.1.0", "ipnet", "jni", @@ -5139,28 +5269,6 @@ dependencies = [ "url", ] -[[package]] -name = "hickory-proto" -version = "0.25.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8a6fe56c0038198998a6f217ca4e7ef3a5e51f46163bd6dd60b5c71ca6c6502" -dependencies = [ - "async-trait", - "cfg-if", - "data-encoding", - "enum-as-inner", - "futures-channel", - "futures-util", - "idna 1.1.0", - "ipnet", - "once_cell", - "rand 0.9.4", - "thiserror 2.0.18", - "tinyvec", - "tracing", - "url", -] - [[package]] name = "hickory-proto" version = "0.26.1" @@ -5190,7 +5298,7 @@ dependencies = [ "cfg-if", "futures-util", "hickory-net", - "hickory-proto 0.26.1", + "hickory-proto", "ipconfig", "ipnet", "jni", @@ -9111,12 +9219,8 @@ dependencies = [ "flume", "futures-util", "log", - "rustls-native-certs", - "rustls-pemfile", - "rustls-webpki 0.102.8", "thiserror 2.0.18", "tokio", - "tokio-rustls", "tokio-stream", "tokio-util", ] @@ -9353,7 +9457,7 @@ 
dependencies = [ "once_cell", "ring", "rustls-pki-types", - "rustls-webpki 0.103.13", + "rustls-webpki", "subtle", "zeroize", ] @@ -9403,7 +9507,7 @@ dependencies = [ "rustls", "rustls-native-certs", "rustls-platform-verifier-android", - "rustls-webpki 0.103.13", + "rustls-webpki", "security-framework", "security-framework-sys", "webpki-root-certs", @@ -9416,17 +9520,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f" -[[package]] -name = "rustls-webpki" -version = "0.102.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" -dependencies = [ - "ring", - "rustls-pki-types", - "untrusted 0.9.0", -] - [[package]] name = "rustls-webpki" version = "0.103.13" diff --git a/crates/api/Cargo.toml b/crates/api/Cargo.toml index 98cb44550e..60ca99b174 100644 --- a/crates/api/Cargo.toml +++ b/crates/api/Cargo.toml @@ -30,65 +30,58 @@ name = "carbide-api" path = "src/main.rs" [dependencies] -# [local-dependencies] -# DO NOT PUT DEPENDENCIES OTHER THAN LOCAL DEPS HERE, THEY SHOULD ALL HAVE 'path =' IN THEM. +# Local dependencies. PLEASE KEEP ALPHABETIZED ORDER.
bmc-vendor = { path = "../bmc-vendor" } -bmc-explorer = { path = "../bmc-explorer" } -bms-dsx-exchange = { path = "../bms-dsx-exchange" } -config-version = { path = "../config-version", features = ["sqlx"] } +carbide-api-model = { path = "../api-model", default-features = false } +carbide-api-db = { path = "../api-db", default-features = false } +carbide-authn = { path = "../authn" } carbide-dpa-interface-controller = { path = "../dpa-interface-controller" } -carbide-host-support = { path = "../host-support", default-features = false } -carbide-mqtt-common = { path = "../mqtt-common" } -carbide-network = { path = "../network", features = ["sqlx"] } -carbide-rack = { path = "../rack" } -carbide-rack-controller = { path = "../rack-controller" } -carbide-power-shelf-controller = { path = "../power-shelf-controller" } -carbide-secrets = { path = "../secrets" } -carbide-version = { path = "../version" } +carbide-dpf = { path = "../dpf" } carbide-firmware = { path = "../firmware" } carbide-health-report = { path = "../health-report" } -carbide-health-metrics = { path = "../health-metrics" } +carbide-host-support = { path = "../host-support", default-features = false } carbide-ib-fabric = { path = "../ib-fabric" } carbide-ib-partition-controller = { path = "../ib-partition-controller" } -carbide-ipxe-renderer = { path = "../ipxe-renderer" } carbide-ipmi = { path = "../ipmi" } -carbide-redfish = { path = "../redfish" } +carbide-ipxe-renderer = { path = "../ipxe-renderer" } +carbide-libmlx = { path = "../libmlx" } +carbide-machine-controller = { path = "../machine-controller" } +carbide-measured-boot = { path = "../measured-boot", features = ["sqlx"] } +carbide-metrics-utils = { path = "../metrics-utils" } +carbide-mqtt-common = { path = "../mqtt-common" } +carbide-network = { path = "../network", features = ["sqlx"] } carbide-network-segment-controller = { path = "../network-segment-controller" } +carbide-nvlink-manager = { path = "../nvlink-manager" } 
+carbide-power-shelf-controller = { path = "../power-shelf-controller" } +carbide-preingestion-manager = { path = "../preingestion-manager" } +carbide-rack = { path = "../rack" } +carbide-rack-controller = { path = "../rack-controller" } +carbide-redfish = { path = "../redfish" } +carbide-rpc = { path = "../rpc", features = ["sqlx", "model"] } +carbide-rpc-utils = { path = "../rpc-utils" } +carbide-secrets = { path = "../secrets" } carbide-site-explorer = { path = "../site-explorer" } carbide-spdm-controller = { path = "../spdm-controller" } carbide-state-controller-common = { path = "../state-controller-common" } carbide-switch-controller = { path = "../switch-controller" } -carbide-preingestion-manager = { path = "../preingestion-manager" } -carbide-nvlink-manager = { path = "../nvlink-manager" } +carbide-utils = { path = "../utils", features = ["sqlx"] } +carbide-uuid = { path = "../uuid", features = ["sqlx"] } +carbide-version = { path = "../version" } +component-manager = { path = "../component-manager" } +config-version = { path = "../config-version", features = ["sqlx"] } dns-record = { path = "../dns-record" } -libnmxm = { path = "../libnmxm" } libnmxc = { path = "../libnmxc" } logfmt = { path = "../logfmt" } mqttea = { path = "../mqttea" } -carbide-rpc = { path = "../rpc", features = ["sqlx", "model"] } -carbide-rpc-utils = { path = "../rpc-utils" } -carbide-utils = { path = "../utils", features = ["sqlx"] } -carbide-ssh = { path = "../ssh" } -carbide-tls = { path = "../tls" } -carbide-uuid = { path = "../uuid", features = ["sqlx"] } -carbide-measured-boot = { path = "../measured-boot", features = ["sqlx"] } -carbide-metrics-utils = { path = "../metrics-utils" } -carbide-libmlx = { path = "../libmlx" } -carbide-api-model = { path = "../api-model", default-features = false } -carbide-api-db = { path = "../api-db", default-features = false } -carbide-authn = { path = "../authn" } -spancounter = { path = "../spancounter" } nras = { path = "../nras" } 
-carbide-dpf = { path = "../dpf" } -component-manager = { path = "../component-manager" } -sqlx-query-tracing = { path = "../sqlx-query-tracing" } +spancounter = { path = "../spancounter" } state-controller = { path = "../state-controller" } -# DO NOT PUT DEPENDENCIES OTHER THAN LOCAL DEPS HERE, THEY SHOULD ALL HAVE 'path =' IN THEM. +sqlx-query-tracing = { path = "../sqlx-query-tracing" } -#these are alphabetized +# External dependencies. PLEASE KEEP ALPHABETIZED ORDER. ansi-to-html = { workspace = true } arc-swap = { workspace = true } -askama = { features = ["serde_json"], workspace = true } +askama = { workspace = true, features = ["serde_json"] } askama_escape = { workspace = true } asn1-rs = { workspace = true } async-trait = { workspace = true } @@ -97,15 +90,14 @@ axum-extra = { workspace = true, features = ["cookie", "cookie-private", "typed- base64 = { workspace = true } byteorder = { workspace = true } bytes = { workspace = true } -casbin = { features = ["glob"], workspace = true } +casbin = { workspace = true, features = ["glob"] } chrono = { workspace = true } clap = { workspace = true } -crypto-bigint = { workspace = true } dashmap = { workspace = true } data-encoding = { workspace = true } duration-str = { workspace = true } eyre = { workspace = true } -figment = { features = ["env", "toml"], workspace = true } +figment = { workspace = true, features = ["env", "toml"] } futures = { workspace = true } futures-util = { workspace = true } hex = { workspace = true } @@ -114,32 +106,20 @@ hostname = { workspace = true } http = { workspace = true } http-body-util = { workspace = true } hyper = { workspace = true, features = ["full"] } -hyper-rustls = { workspace = true } -hyper-timeout = { workspace = true } hyper-util = { workspace = true } ipnetwork = { workspace = true, features = ["serde"] } itertools = { workspace = true } -jsonwebtoken = { features = ["rust_crypto"], workspace = true } -k8s-openapi = { features = ["latest"], workspace = true } -kube 
= { default-features = false, features = [ - "runtime", - "derive", - "client", - "rustls-tls", -], workspace = true } +jsonwebtoken = { workspace = true, features = ["rust_crypto"] } lazy_static = { workspace = true } libredfish = { workspace = true } librms = { workspace = true } mac_address = { workspace = true } num_cpus = { workspace = true } -oauth2 = { default-features = false, workspace = true } -oid-registry = { workspace = true } +oauth2 = { workspace = true, default-features = false } opentelemetry = { workspace = true, features = ["logs"] } opentelemetry-otlp = { workspace = true, features = ["grpc-tonic"] } -opentelemetry-prometheus.workspace = true -opentelemetry-semantic-conventions = { features = [ - "semconv_experimental", -], workspace = true } +opentelemetry-prometheus = { workspace = true } +opentelemetry-semantic-conventions = { workspace = true, features = ["semconv_experimental"] } opentelemetry_sdk = { workspace = true, features = [ "logs", "rt-tokio", @@ -149,19 +129,15 @@ opentelemetry_sdk = { workspace = true, features = [ pkcs1 = { workspace = true } p256 = { workspace = true } prometheus = { workspace = true } -prost-types = { workspace = true } rand = { workspace = true } regex = { workspace = true } -reqwest = { default-features = false, features = [ - "rustls", - "stream", -], workspace = true } +reqwest = { workspace = true, default-features = false, features = ["rustls", "stream"] } rsa = { workspace = true } rumqttc = { workspace = true } rustls = { workspace = true } rustls-pemfile = { workspace = true } rustls-pki-types = { workspace = true } -serde = { features = ["derive"], workspace = true } +serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true } serde_yaml = { workspace = true } sha2 = { workspace = true } @@ -178,7 +154,6 @@ sqlx = { workspace = true, features = [ ] } strum = { workspace = true } temp-dir = { workspace = true } -tera = { workspace = true } thiserror = { workspace = true } time 
= { workspace = true } tokio = { workspace = true } @@ -189,25 +164,16 @@ toml = { workspace = true } tonic = { workspace = true } tonic-reflection = { workspace = true } tower = { workspace = true } -tower-http = { features = [ - "add-extension", - "auth", - "normalize-path", -], workspace = true } +tower-http = { workspace = true, features = ["add-extension", "auth", "normalize-path"] } tracing = { workspace = true } tracing-log = { workspace = true } tracing-opentelemetry = { workspace = true } -tracing-subscriber = { features = [ - "env-filter", - "local-time", -], workspace = true } -tss-esapi = { optional = true, workspace = true } -url = { features = ["serde"], workspace = true } +tracing-subscriber = { workspace = true, features = ["env-filter", "local-time"] } +tss-esapi = { workspace = true, optional = true } +url = { workspace = true, features = ["serde"] } urlencoding = { workspace = true } -uuid = { features = ["v4", "serde"], workspace = true } -version-compare = { workspace = true } -x509-parser = { features = ["verify"], workspace = true } -#these are alphabetized +uuid = { workspace = true, features = ["v4", "serde"] } +x509-parser = { workspace = true, features = ["verify"] } [features] default = ["linux-build"] @@ -217,28 +183,26 @@ linux-build = ["tss-esapi"] carbide-version = { path = "../version" } [dev-dependencies] -figment = { features = ["env", "test", "toml"], workspace = true } -ctor = { workspace = true } -lazy_static = { workspace = true } -const_format = { workspace = true } -mockall = { workspace = true } -rcgen = { workspace = true } +# Local dependencies. PLEASE KEEP ALPHABETIZED ORDER.
+carbide-ib-fabric = { path = "../ib-fabric", features = ["test-support"] } +carbide-machine-controller = { path = "../machine-controller", features = ["test-support"] } carbide-macros = { path = "../macros" } -carbide-sqlx-testing = { path = "../sqlx-testing", default-features = false } -carbide-prost-builder = { path = "../prost-builder" } carbide-nvlink-manager = { path = "../nvlink-manager", features = ["test-support"] } +carbide-prost-builder = { path = "../prost-builder" } carbide-rack = { path = "../rack", features = ["test-support"] } carbide-redfish = { path = "../redfish", features = ["test-support"] } +carbide-sqlx-testing = { path = "../sqlx-testing", default-features = false } carbide-utils = { path = "../utils", features = ["test-support"] } -state-controller = { path = "../state-controller", features = ["test-support"] } -carbide-ib-fabric = { path = "../ib-fabric", features = ["test-support"] } prometheus-text-parser = { path = "../prometheus-text-parser" } -prost = { workspace = true } -tower-test = { workspace = true } -hyper = { features = ["client", "http1"], workspace = true } -http = { workspace = true } +state-controller = { path = "../state-controller", features = ["test-support"] } + +# External dependencies. PLEASE KEEP ALPHABETIZED ORDER. 
+const_format = { workspace = true } +ctor = { workspace = true } +figment = { workspace = true, features = ["env", "test", "toml"] } mockito = { workspace = true } -once_cell = { workspace = true } +prost = { workspace = true } +rcgen = { workspace = true } tempfile = { workspace = true } [lints] diff --git a/crates/api/src/api.rs b/crates/api/src/api.rs index 3615362609..4b3b7de1df 100644 --- a/crates/api/src/api.rs +++ b/crates/api/src/api.rs @@ -31,6 +31,8 @@ use ::rpc::protos::dns::{ }; use ::rpc::protos::{measured_boot as measured_boot_pb, mlx_device as mlx_device_pb}; use carbide_ib_fabric::ib::IBFabricManager; +use carbide_machine_controller::dpf::DpfOperations; +use carbide_machine_controller::io::MachineStateControllerIO; use carbide_rack::bms_client::BmsDsxExchangeHandle; use carbide_redfish::libredfish::RedfishClientPool; use carbide_site_explorer::EndpointExplorer; @@ -57,8 +59,6 @@ use crate::dynamic_settings::DynamicSettings; use crate::ethernet_virtualization::EthVirtData; use crate::logging::log_limiter::LogLimiter; use crate::scout_stream::ConnectionRegistry; -use crate::state_controller::machine::dpf::DpfOperations; -use crate::state_controller::machine::io::MachineStateControllerIO; use crate::{CarbideError, CarbideResult}; pub struct Api { diff --git a/crates/api/src/attestation/measured_boot.rs b/crates/api/src/attestation/measured_boot.rs index 39e1fae2fe..0b8cf617fb 100644 --- a/crates/api/src/attestation/measured_boot.rs +++ b/crates/api/src/attestation/measured_boot.rs @@ -21,6 +21,7 @@ use std::io::Write; use std::process::Command; use byteorder::{BigEndian, ByteOrder}; +use carbide_machine_controller::{MeasuringOutcome, handle_measuring_state}; use carbide_uuid::machine::MachineId; use carbide_uuid::measured_boot::MeasurementReportId; use db::db_read::DbReader; @@ -31,7 +32,6 @@ use sqlx::PgConnection; use temp_dir::TempDir; use crate::attestation::get_ek_cert_by_machine_id; -use crate::state_controller::machine::{MeasuringOutcome, 
handle_measuring_state}; use crate::{CarbideError, CarbideResult}; /// VerifyQuoteState is a simple enum used to track diff --git a/crates/api/src/cfg/file.rs b/crates/api/src/cfg/file.rs index 97a774b56f..c08be945c5 100644 --- a/crates/api/src/cfg/file.rs +++ b/crates/api/src/cfg/file.rs @@ -24,6 +24,11 @@ use bmc_vendor::BMCVendor; use carbide_authn::config::{AllowedCertCriteria, TrustConfig}; use carbide_firmware::FirmwareConfig; use carbide_ib_fabric::config::{IBFabricConfig, IbFabricDefinition}; +use carbide_machine_controller::config::power_manager::default_power_options; +use carbide_machine_controller::config::{ + BomValidationConfig, FirmwareGlobal, MachineStateControllerConfig, + MachineStateHandlerSiteConfig, MachineValidationConfig, PowerManagerOptions, TimePeriod, +}; use carbide_nvlink_manager::config::NvLinkConfig; use carbide_preingestion_manager::PreingestionManagerConfig; use carbide_rack_controller::config::{RackValidationConfig, RmsConfig}; @@ -51,12 +56,6 @@ use model::tenant::identity_config::SigningAlgorithm; use regex::Regex; use serde::{Deserialize, Deserializer, Serialize}; -use crate::state_controller::machine::config::power_manager::default_power_options; -use crate::state_controller::machine::config::{ - BomValidationConfig, FirmwareGlobal, MachineStateControllerConfig, - MachineStateHandlerSiteConfig, PowerManagerOptions, -}; - static BF2_NIC: &str = "24.47.2682"; static BF2_BMC: &str = "BF-25.10-20"; static BF2_CEC: &str = "4-15"; @@ -1751,15 +1750,6 @@ pub struct MachineUpdater { pub max_concurrent_machine_updates_percent: Option, } -/// A UTC time window defined by a start and end timestamp. -#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] -pub struct TimePeriod { - /// Start of the time window (UTC). - pub start: chrono::DateTime, - /// End of the time window (UTC). 
- pub end: chrono::DateTime, -} - pub fn default_max_find_by_ids() -> u32 { 100 } @@ -1831,70 +1821,6 @@ impl MeasuredBootMetricsCollectorConfig { } } -/// Controls which machine validation tests are active. -#[derive(Default, Clone, Copy, Debug, Deserialize, Serialize)] -pub enum MachineValidationTestSelectionMode { - /// Only update tests in DB that are specified in the - /// `tests` config list. - #[default] - Default, - /// Enable all tests in DB, but allow per-test overrides - /// from the `tests` config list. - EnableAll, - /// Disable all tests in DB, but allow per-test overrides - /// from the `tests` config list. - DisableAll, -} - -/// Configuration for machine validation tests (memory -/// latency, SSD I/O, etc.) run after ingestion to verify -/// hardware health. -#[derive(Default, Clone, Debug, Deserialize, Serialize)] -pub struct MachineValidationConfig { - /// Enables machine validation testing. - #[serde(default)] - pub enabled: bool, - - /// Controls whether to run all tests, no tests, or use - /// per-test configuration. - #[serde(default)] - pub test_selection_mode: MachineValidationTestSelectionMode, - - #[serde( - default = "MachineValidationConfig::default_run_interval", - deserialize_with = "deserialize_duration", - serialize_with = "as_std_duration" - )] - pub run_interval: std::time::Duration, - - /// Per-test enable/disable overrides. - #[serde(default)] - pub tests: Vec, -} - -/// Per-test override for machine validation. -/// -/// Example: -/// ```toml -/// tests = [ -/// { id = "MmMemLatency", enable = true }, -/// { id = "FioSSD", enable = true } -/// ] -/// ``` -#[derive(Default, Clone, Debug, Deserialize, Serialize)] -pub struct MachineValidationTestConfig { - /// Unique test identifier (e.g., "MmMemLatency"). - pub id: String, - /// Whether this test is enabled. 
- pub enable: bool, -} - -impl MachineValidationConfig { - const fn default_run_interval() -> std::time::Duration { - std::time::Duration::from_secs(60) - } -} - /// The VPC isolation behavior enforced within a site. #[derive(Clone, Copy, Debug, Default, Deserialize, Serialize)] #[serde(rename_all = "snake_case")] diff --git a/crates/api/src/handlers/attestation.rs b/crates/api/src/handlers/attestation.rs index 5cb5b0e8d8..feed451e80 100644 --- a/crates/api/src/handlers/attestation.rs +++ b/crates/api/src/handlers/attestation.rs @@ -16,6 +16,7 @@ */ use ::rpc::common::MachineIdList; use ::rpc::forge::{self as rpc}; +use carbide_machine_controller::handler::attestation::trigger_attestation; use carbide_uuid::machine::MachineId; use db::ObjectFilter; use model::machine::machine_search_config::MachineSearchConfig; @@ -24,7 +25,6 @@ use tonic::{Request, Response, Status}; use crate::CarbideError; use crate::api::{Api, log_machine_id, log_request_data}; -use crate::state_controller::machine::handler::attestation::trigger_attestation; pub(crate) async fn trigger_machine_attestation( api: &Api, diff --git a/crates/api/src/handlers/instance.rs b/crates/api/src/handlers/instance.rs index 292dd91671..60913c3ab0 100644 --- a/crates/api/src/handlers/instance.rs +++ b/crates/api/src/handlers/instance.rs @@ -1637,7 +1637,7 @@ pub async fn force_delete_instance( id: instance.machine_id.to_string(), })?; - crate::state_controller::machine::handler::release_vpc_dpu_loopback( + carbide_machine_controller::handler::release_vpc_dpu_loopback( &snapshot, Some(api.common_pools.as_ref()), &mut txn, diff --git a/crates/api/src/handlers/machine_validation.rs b/crates/api/src/handlers/machine_validation.rs index a946b7d610..783bea2f74 100644 --- a/crates/api/src/handlers/machine_validation.rs +++ b/crates/api/src/handlers/machine_validation.rs @@ -15,6 +15,9 @@ * limitations under the License. 
*/ use ::rpc::forge::{self as rpc, GetMachineValidationExternalConfigResponse}; +use carbide_machine_controller::config::machine_validation::{ + MachineValidationConfig, MachineValidationTestSelectionMode, +}; use config_version::ConfigVersion; use db::{self, machine_validation_suites}; use model::machine::machine_search_config::MachineSearchConfig; @@ -32,7 +35,6 @@ use tonic::{Request, Response, Status}; use crate::CarbideError; use crate::api::{Api, log_request_data}; -use crate::cfg::file::{MachineValidationConfig, MachineValidationTestSelectionMode}; use crate::handlers::utils::convert_and_log_machine_id; /// Temporary: when `true`, MV mutation handlers return `FailedPrecondition` and do not write to the DB. diff --git a/crates/api/src/machine_update_manager/dpu_nic_firmware.rs b/crates/api/src/machine_update_manager/dpu_nic_firmware.rs index 5060bf9da8..80a79619b1 100644 --- a/crates/api/src/machine_update_manager/dpu_nic_firmware.rs +++ b/crates/api/src/machine_update_manager/dpu_nic_firmware.rs @@ -20,6 +20,7 @@ use std::sync::Arc; use std::sync::atomic::Ordering; use async_trait::async_trait; +use carbide_machine_controller::dpf::DpfOperations; use carbide_uuid::machine::MachineId; use db::dpu_machine_update; use model::dpu_machine_update::{DpuMachineUpdate, OutdatedDpfDpu}; @@ -30,7 +31,6 @@ use super::dpu_nic_firmware_metrics::DpuNicFirmwareUpdateMetrics; use super::machine_update_module::MachineUpdateModule; use crate::cfg::file::CarbideConfig; use crate::machine_update_manager::MachineUpdateManager; -use crate::state_controller::machine::dpf::DpfOperations; use crate::{CarbideResult, DatabaseError}; /// DpuNicFirmwareUpdate is a module used [MachineUpdateManager](crate::machine_update_manager::MachineUpdateManager) diff --git a/crates/api/src/machine_update_manager/mod.rs b/crates/api/src/machine_update_manager/mod.rs index 1920f68dc6..d0406cd788 100644 --- a/crates/api/src/machine_update_manager/mod.rs +++ 
b/crates/api/src/machine_update_manager/mod.rs @@ -26,6 +26,7 @@ use std::sync::Arc; use std::sync::atomic::Ordering; use std::time::Duration; +use carbide_machine_controller::dpf::DpfOperations; use carbide_utils::periodic_timer::PeriodicTimer; use carbide_uuid::machine::MachineId; use db::work_lock_manager::WorkLockManagerHandle; @@ -44,7 +45,6 @@ use self::dpu_nic_firmware::DpuNicFirmwareUpdate; use self::metrics::MachineUpdateManagerMetrics; use crate::CarbideResult; use crate::cfg::file::{CarbideConfig, MaxConcurrentUpdates}; -use crate::state_controller::machine::dpf::DpfOperations; /// The MachineUpdateManager periodically runs [modules](machine_update_module::MachineUpdateModule) to initiate upgrades of machine components. /// On each iteration the MachineUpdateManager will: diff --git a/crates/api/src/machine_validation/mod.rs b/crates/api/src/machine_validation/mod.rs index 6e4c8290a7..de47adfd3a 100644 --- a/crates/api/src/machine_validation/mod.rs +++ b/crates/api/src/machine_validation/mod.rs @@ -21,6 +21,7 @@ use std::default::Default; use std::io; use std::sync::Arc; +use carbide_machine_controller::config::machine_validation::MachineValidationConfig; use carbide_utils::periodic_timer::PeriodicTimer; use db::ObjectFilter; use tokio::task::JoinSet; @@ -28,7 +29,6 @@ use tokio_util::sync::CancellationToken; use self::metrics::MachineValidationMetrics; use crate::CarbideResult; -use crate::cfg::file::MachineValidationConfig; pub struct MachineValidationManager { database_connection: sqlx::PgPool, diff --git a/crates/api/src/run.rs b/crates/api/src/run.rs index be384b5905..078d0e71fe 100644 --- a/crates/api/src/run.rs +++ b/crates/api/src/run.rs @@ -75,7 +75,7 @@ pub async fn run( } else { setup_logging( debug, - crate::state_controller::machine::extra_logfmt_logging_fields(), + carbide_machine_controller::extra_logfmt_logging_fields(), None::, ) .wrap_err("setup_telemetry")? 
diff --git a/crates/api/src/setup.rs b/crates/api/src/setup.rs index e7cb404a53..2f4100aa34 100644 --- a/crates/api/src/setup.rs +++ b/crates/api/src/setup.rs @@ -33,6 +33,12 @@ use carbide_ib_partition_controller::context::IBPartitionStateHandlerServices; use carbide_ib_partition_controller::handler::IBPartitionStateHandler; use carbide_ib_partition_controller::io::IBPartitionStateControllerIO; use carbide_ipmi::IPMITool; +use carbide_machine_controller::context::MachineStateHandlerServices; +use carbide_machine_controller::dpf::{ + CarbideBmcPasswordProvider, CarbideDPFLabeler, DpfOperations, DpfSdkOps, +}; +use carbide_machine_controller::handler::MachineStateHandlerBuilder; +use carbide_machine_controller::io::MachineStateControllerIO; use carbide_network_segment_controller::context::NetworkSegmentStateHandlerServices; use carbide_network_segment_controller::handler::NetworkSegmentStateHandler; use carbide_network_segment_controller::io::NetworkSegmentStateControllerIO; @@ -102,9 +108,6 @@ use crate::measured_boot::metrics_collector::MeasuredBootMetricsCollector; use crate::mqtt_state_change_hook::hook::MqttStateChangeHook; use crate::scout_stream::ConnectionRegistry; use crate::state_controller::common_services::CommonStateHandlerServices; -use crate::state_controller::machine::context::MachineStateHandlerServices; -use crate::state_controller::machine::handler::MachineStateHandlerBuilder; -use crate::state_controller::machine::io::MachineStateControllerIO; use crate::{attestation, db_init, ethernet_virtualization, listener}; /// The resolved set of network declarations passed from `start_api` into @@ -644,58 +647,51 @@ pub async fn start_api( // Create DPF SDK and initialize CRs if enabled // If we end up having static DPUDeployments, we could move the static CRs outside of the API. 
- let dpf_sdk: Option> = - if carbide_config.dpf.enabled { - tracing::info!("Initializing DPF SDK"); - let repo = carbide_dpf::KubeRepository::new() - .await - .map_err(|e| eyre::eyre!("Failed to create DPF repository: {e}"))?; - - let provider = crate::state_controller::machine::dpf::CarbideBmcPasswordProvider::new( - credential_manager.clone(), - ); - - let mandatory_services = carbide_config.dpf.services.clone(); - let dpf_mandatory_services = vec![ - crate::dpf_services::dts_service(&mandatory_services.dts), - crate::dpf_services::doca_hbn_service(&mandatory_services.doca_hbn), - crate::dpf_services::dhcp_server_service(&mandatory_services.dhcp_server), - crate::dpf_services::dpu_agent_service(&mandatory_services.dpu_agent), - crate::dpf_services::fmds_service(&mandatory_services.fmds), - crate::dpf_services::otelcol_service(&mandatory_services.otel), - ]; - - // This is just temparary code until we make v2 only option. (just 2 weeks) - // Soon v2 flag will be removed and will become only mode for dpf handling. 
- let init_config = carbide_dpf::InitDpfResourcesConfig { - bfb_url: carbide_config.dpf.bfb_url.clone(), - flavor_name: carbide_config.dpf.flavor_name.clone(), - deployment_name: carbide_config.dpf.deployment_name.clone(), - services: dpf_mandatory_services, - }; + let dpf_sdk: Option> = if carbide_config.dpf.enabled { + tracing::info!("Initializing DPF SDK"); + let repo = carbide_dpf::KubeRepository::new() + .await + .map_err(|e| eyre::eyre!("Failed to create DPF repository: {e}"))?; + + let provider = CarbideBmcPasswordProvider::new(credential_manager.clone()); + + let mandatory_services = carbide_config.dpf.services.clone(); + let dpf_mandatory_services = vec![ + crate::dpf_services::dts_service(&mandatory_services.dts), + crate::dpf_services::doca_hbn_service(&mandatory_services.doca_hbn), + crate::dpf_services::dhcp_server_service(&mandatory_services.dhcp_server), + crate::dpf_services::dpu_agent_service(&mandatory_services.dpu_agent), + crate::dpf_services::fmds_service(&mandatory_services.fmds), + crate::dpf_services::otelcol_service(&mandatory_services.otel), + ]; + + // This is just temporary code until we make v2 the only option. (just 2 weeks) + // Soon the v2 flag will be removed and v2 will become the only mode for dpf handling.
+ let init_config = carbide_dpf::InitDpfResourcesConfig { + bfb_url: carbide_config.dpf.bfb_url.clone(), + flavor_name: carbide_config.dpf.flavor_name.clone(), + deployment_name: carbide_config.dpf.deployment_name.clone(), + services: dpf_mandatory_services, + }; - let sdk = carbide_dpf::DpfSdkBuilder::new(repo, carbide_dpf::NAMESPACE, provider) - .with_labeler( - crate::state_controller::machine::dpf::CarbideDPFLabeler::new( - carbide_config.dpf.node_label_key.clone(), - ), - ) - .with_bmc_password_refresh_interval(std::time::Duration::from_secs(60)) - .with_join_set(join_set) - .initialize(&init_config) - .await - .map_err(|err| eyre::eyre!("Failed to initialize DPF SDK: {err}"))?; - - Some(Arc::new( - crate::state_controller::machine::dpf::DpfSdkOps::new( - Arc::new(sdk), - db_pool.clone(), - join_set, - )?, + let sdk = carbide_dpf::DpfSdkBuilder::new(repo, carbide_dpf::NAMESPACE, provider) + .with_labeler(CarbideDPFLabeler::new( + carbide_config.dpf.node_label_key.clone(), )) - } else { - None - }; + .with_bmc_password_refresh_interval(std::time::Duration::from_secs(60)) + .with_join_set(join_set) + .initialize(&init_config) + .await + .map_err(|err| eyre::eyre!("Failed to initialize DPF SDK: {err}"))?; + + Some(Arc::new(DpfSdkOps::new( + Arc::new(sdk), + db_pool.clone(), + join_set, + )?)) + } else { + None + }; let component_manager = if let Some(cd_config) = &carbide_config.component_manager { match component_manager::component_manager::build_component_manager( diff --git a/crates/api/src/state_controller/mod.rs b/crates/api/src/state_controller/mod.rs index 2d086487fa..cb3ca789f1 100644 --- a/crates/api/src/state_controller/mod.rs +++ b/crates/api/src/state_controller/mod.rs @@ -16,4 +16,3 @@ */ pub mod common_services; -pub mod machine; diff --git a/crates/api/src/tests/common/api_fixtures/mod.rs b/crates/api/src/tests/common/api_fixtures/mod.rs index 6484de7311..8f99a96e94 100644 --- a/crates/api/src/tests/common/api_fixtures/mod.rs +++ 
b/crates/api/src/tests/common/api_fixtures/mod.rs @@ -33,6 +33,16 @@ use carbide_ib_partition_controller::context::IBPartitionStateHandlerServices; use carbide_ib_partition_controller::handler::IBPartitionStateHandler; use carbide_ib_partition_controller::io::IBPartitionStateControllerIO; use carbide_ipmi::IPMITool; +use carbide_machine_controller::config::{ + BomValidationConfig, FirmwareGlobal, MachineStateControllerConfig, MachineValidationConfig, + PowerManagerOptions, +}; +use carbide_machine_controller::context::MachineStateHandlerServices; +use carbide_machine_controller::dpf::DpfOperations; +use carbide_machine_controller::handler::{ + MachineStateHandler, MachineStateHandlerBuilder, PowerOptionConfig, ReachabilityParams, +}; +use carbide_machine_controller::io::MachineStateControllerIO; use carbide_network_segment_controller::context::NetworkSegmentStateHandlerServices; use carbide_network_segment_controller::handler::NetworkSegmentStateHandler; use carbide_network_segment_controller::io::NetworkSegmentStateControllerIO; @@ -124,11 +134,10 @@ use crate::api::metrics::ApiMetricsEmitter; use crate::cfg::file::{ CarbideConfig, ComputeAllocationEnforcement, DpaConfig, DpaInterfaceStateControllerConfig, DpuConfig as InitialDpuConfig, FnnConfig, IbPartitionStateControllerConfig, ListenMode, - MachineUpdater, MachineValidationConfig, MeasuredBootMetricsCollectorConfig, MqttAuthConfig, - NetworkSecurityGroupConfig, NetworkSegmentStateControllerConfig, - PowerShelfStateControllerConfig, RackStateControllerConfig, SpdmConfig, - SpdmStateControllerConfig, SwitchStateControllerConfig, VmaasConfig, VpcPeeringPolicy, - default_max_find_by_ids, + MachineUpdater, MeasuredBootMetricsCollectorConfig, MqttAuthConfig, NetworkSecurityGroupConfig, + NetworkSegmentStateControllerConfig, PowerShelfStateControllerConfig, + RackStateControllerConfig, SpdmConfig, SpdmStateControllerConfig, SwitchStateControllerConfig, + VmaasConfig, VpcPeeringPolicy, default_max_find_by_ids, }; use 
crate::ethernet_virtualization::{EthVirtData, SiteFabricPrefixList}; use crate::logging::level_filter::ActiveLevel; @@ -136,15 +145,6 @@ use crate::logging::log_limiter::LogLimiter; use crate::measured_boot::convert_vec; use crate::scout_stream; use crate::state_controller::common_services::CommonStateHandlerServices; -use crate::state_controller::machine::config::{ - BomValidationConfig, FirmwareGlobal, MachineStateControllerConfig, PowerManagerOptions, -}; -use crate::state_controller::machine::context::MachineStateHandlerServices; -use crate::state_controller::machine::dpf::DpfOperations; -use crate::state_controller::machine::handler::{ - MachineStateHandler, MachineStateHandlerBuilder, PowerOptionConfig, ReachabilityParams, -}; -use crate::state_controller::machine::io::MachineStateControllerIO; use crate::tests::common::api_fixtures::endpoint_explorer::MockEndpointExplorer; use crate::tests::common::api_fixtures::managed_host::ManagedHostConfig; use crate::tests::common::api_fixtures::network_segment::{ diff --git a/crates/api/src/tests/dpf/duplicate_events.rs b/crates/api/src/tests/dpf/duplicate_events.rs index c9713a0d52..8add7fc6e0 100644 --- a/crates/api/src/tests/dpf/duplicate_events.rs +++ b/crates/api/src/tests/dpf/duplicate_events.rs @@ -29,13 +29,13 @@ use std::sync::atomic::{AtomicBool, Ordering}; use std::time::Duration; use carbide_dpf::DpuPhase; +use carbide_machine_controller::dpf::{DpfOperations, MockDpfOperations}; use carbide_redfish::libredfish::test_support::RedfishSimAction; use carbide_uuid::machine::MachineId; use libredfish::SystemPowerControl; use model::machine::{DpfState, DpuInitState, ManagedHostState}; use tokio::time::timeout; -use crate::state_controller::machine::dpf::{DpfOperations, MockDpfOperations}; use crate::tests::common::api_fixtures::{ TestEnvOverrides, TestManagedHost, create_managed_host_with_dpf, create_test_env_with_overrides, get_config, reboot_completed, diff --git a/crates/api/src/tests/dpf/happy_path.rs 
b/crates/api/src/tests/dpf/happy_path.rs index 76f3bd4c74..b8f59de18e 100644 --- a/crates/api/src/tests/dpf/happy_path.rs +++ b/crates/api/src/tests/dpf/happy_path.rs @@ -21,14 +21,14 @@ use std::sync::Arc; use std::time::Duration; use carbide_dpf::DpuPhase; +use carbide_machine_controller::dpf::DpfOperations; use model::machine::ManagedHostState; use tokio::time::timeout; -use crate::state_controller::machine::dpf::DpfOperations; - const TEST_TIMEOUT: Duration = Duration::from_secs(30); -use crate::state_controller::machine::dpf::MockDpfOperations; +use carbide_machine_controller::dpf::MockDpfOperations; + use crate::tests::common::api_fixtures::{ TestEnvOverrides, create_managed_host_with_dpf, create_test_env_with_overrides, get_config, }; diff --git a/crates/api/src/tests/dpf/reprovisioning.rs b/crates/api/src/tests/dpf/reprovisioning.rs index 1db6388f5b..4bfdcf63e4 100644 --- a/crates/api/src/tests/dpf/reprovisioning.rs +++ b/crates/api/src/tests/dpf/reprovisioning.rs @@ -26,13 +26,13 @@ use std::sync::{Arc, Mutex}; use std::time::Duration; use carbide_dpf::DpuPhase; +use carbide_machine_controller::dpf::{DpfOperations, MockDpfOperations}; use carbide_uuid::machine::MachineId; use model::machine::{ DpfState, DpuReprovisionStates, InstanceState, ManagedHostState, ReprovisionState, }; use tokio::time::timeout; -use crate::state_controller::machine::dpf::{DpfOperations, MockDpfOperations}; use crate::tests::common::api_fixtures::{ TestEnvOverrides, TestManagedHost, create_managed_host_with_dpf, create_managed_host_with_dpf_multi, create_test_env_with_overrides, get_config, diff --git a/crates/api/src/tests/dpf/stale_labels.rs b/crates/api/src/tests/dpf/stale_labels.rs index e38442c4f3..fa0c166366 100644 --- a/crates/api/src/tests/dpf/stale_labels.rs +++ b/crates/api/src/tests/dpf/stale_labels.rs @@ -28,11 +28,11 @@ use std::sync::atomic::{AtomicBool, Ordering}; use std::time::Duration; use carbide_dpf::DpuPhase; +use 
carbide_machine_controller::dpf::{DpfOperations, MockDpfOperations}; use carbide_uuid::machine::MachineId; use model::machine::{DpfState, DpuInitState, FailureCause, FailureDetails, ManagedHostState}; use tokio::time::timeout; -use crate::state_controller::machine::dpf::{DpfOperations, MockDpfOperations}; use crate::tests::common::api_fixtures::{ TestEnvOverrides, TestManagedHost, create_managed_host_with_dpf, create_test_env_with_overrides, get_config, diff --git a/crates/api/src/tests/dpf/waiting_for_ready.rs b/crates/api/src/tests/dpf/waiting_for_ready.rs index 3916e0b2b6..6fd1d9b994 100644 --- a/crates/api/src/tests/dpf/waiting_for_ready.rs +++ b/crates/api/src/tests/dpf/waiting_for_ready.rs @@ -23,6 +23,7 @@ use std::sync::atomic::{AtomicBool, Ordering}; use std::time::Duration; use carbide_dpf::DpuPhase; +use carbide_machine_controller::dpf::{DpfOperations, MockDpfOperations}; use carbide_redfish::libredfish::RedfishClientPool; use carbide_redfish::libredfish::test_support::RedfishSimAction; use carbide_uuid::machine::MachineId; @@ -30,7 +31,6 @@ use libredfish::SystemPowerControl; use model::machine::{DpfState, DpuInitState, ManagedHostState}; use tokio::time::timeout; -use crate::state_controller::machine::dpf::{DpfOperations, MockDpfOperations}; use crate::tests::common::api_fixtures::{ TestEnvOverrides, TestManagedHost, create_managed_host_with_dpf, create_test_env_with_overrides, get_config, reboot_completed, diff --git a/crates/api/src/tests/dpu_nic_firmware.rs b/crates/api/src/tests/dpu_nic_firmware.rs index 1502d14249..43eb90493d 100644 --- a/crates/api/src/tests/dpu_nic_firmware.rs +++ b/crates/api/src/tests/dpu_nic_firmware.rs @@ -17,6 +17,7 @@ use std::collections::HashSet; use std::string::ToString; +use carbide_machine_controller::health_report::create_host_update_health_report_dpufw; use common::api_fixtures::{create_managed_host, create_managed_host_multi_dpu, create_test_env}; use model::machine::LoadSnapshotOptions; use 
model::machine_update_module::{ @@ -26,7 +27,6 @@ use model::machine_update_module::{ use crate::CarbideResult; use crate::machine_update_manager::dpu_nic_firmware::DpuNicFirmwareUpdate; use crate::machine_update_manager::machine_update_module::MachineUpdateModule; -use crate::state_controller::machine::health_report::create_host_update_health_report_dpufw; use crate::tests::common; use crate::tests::common::api_fixtures::TestManagedHost; use crate::tests::common::api_fixtures::test_managed_host::TestManagedHostSnapshots; diff --git a/crates/api/src/tests/dpu_reprovisioning.rs b/crates/api/src/tests/dpu_reprovisioning.rs index ff74bb57d1..af890c20de 100644 --- a/crates/api/src/tests/dpu_reprovisioning.rs +++ b/crates/api/src/tests/dpu_reprovisioning.rs @@ -17,6 +17,7 @@ use std::collections::HashMap; +use carbide_machine_controller::handler::MachineStateHandlerBuilder; use carbide_redfish::libredfish::test_support::RedfishSimAction; use chrono::Utc; use common::api_fixtures::{create_managed_host_multi_dpu, create_test_env, reboot_completed}; @@ -32,7 +33,6 @@ use rpc::forge::forge_server::Forge; use rpc::forge_agent_control_response::Action; use rpc::model::instance::snapshot::instance_snapshot_derive_status; -use crate::state_controller::machine::handler::MachineStateHandlerBuilder; use crate::tests::common; use crate::tests::common::api_fixtures::dpu::create_dpu_machine_in_waiting_for_network_install; use crate::tests::common::api_fixtures::instance::TestInstance; diff --git a/crates/api/src/tests/host_bmc_firmware_test.rs b/crates/api/src/tests/host_bmc_firmware_test.rs index 2204264477..084a8e59c3 100644 --- a/crates/api/src/tests/host_bmc_firmware_test.rs +++ b/crates/api/src/tests/host_bmc_firmware_test.rs @@ -22,6 +22,8 @@ use std::os::unix::fs::PermissionsExt; use std::str::FromStr; use std::time::Duration; +use carbide_machine_controller::config::{FirmwareGlobal, TimePeriod}; +use carbide_machine_controller::handler::MAX_FIRMWARE_UPGRADE_RETRIES; use 
carbide_preingestion_manager::PreingestionManager; use carbide_redfish::libredfish::test_support::RedfishSimAction; use carbide_uuid::machine::MachineId; @@ -49,10 +51,8 @@ use tokio::time::sleep; use tonic::Request; use crate::CarbideResult; -use crate::cfg::file::{CarbideConfig, TimePeriod}; +use crate::cfg::file::CarbideConfig; use crate::machine_update_manager::MachineUpdateManager; -use crate::state_controller::machine::config::FirmwareGlobal; -use crate::state_controller::machine::handler::MAX_FIRMWARE_UPGRADE_RETRIES; use crate::tests::common; use crate::tests::common::api_fixtures::managed_host::HardwareInfoTemplate; use crate::tests::common::api_fixtures::{ diff --git a/crates/api/src/tests/machine_admin_force_delete.rs b/crates/api/src/tests/machine_admin_force_delete.rs index 22f2799465..e354ab89fb 100644 --- a/crates/api/src/tests/machine_admin_force_delete.rs +++ b/crates/api/src/tests/machine_admin_force_delete.rs @@ -26,6 +26,7 @@ use ::rpc::forge::{ }; use carbide_ib_fabric::config::IBFabricConfig; use carbide_ib_fabric::ib::{self, IBFabricManager}; +use carbide_machine_controller::dpf::{DpfOperations, MockDpfOperations}; use carbide_uuid::infiniband::IBPartitionId; use carbide_uuid::machine::{MachineId, MachineType}; use common::api_fixtures::dpu::create_dpu_machine; @@ -47,7 +48,6 @@ use tonic::Request; use crate::api::Api; use crate::attestation as attest; -use crate::state_controller::machine::dpf::{DpfOperations, MockDpfOperations}; use crate::tests::common; async fn get_partition_status(api: &Api, ib_partition_id: IBPartitionId) -> IbPartitionStatus { diff --git a/crates/api/src/tests/machine_creator.rs b/crates/api/src/tests/machine_creator.rs index a385eb4f8c..a2cef5e5c5 100644 --- a/crates/api/src/tests/machine_creator.rs +++ b/crates/api/src/tests/machine_creator.rs @@ -20,6 +20,7 @@ use std::net::IpAddr; use std::str::FromStr; use std::sync::Arc; +use carbide_machine_controller::handler::MachineStateHandlerBuilder; use 
carbide_site_explorer::MachineCreator; use carbide_site_explorer::config::SiteExplorerConfig; use carbide_site_explorer::errors::SiteExplorerError; @@ -39,7 +40,6 @@ use rpc::{BlockDevice, DiscoveryData, DiscoveryInfo, MachineDiscoveryInfo}; use tonic::Request; use crate::cfg::file::DpuConfig as InitialDpuConfig; -use crate::state_controller::machine::handler::MachineStateHandlerBuilder; use crate::tests::common; use crate::tests::common::api_fixtures::TestEnvOverrides; use crate::tests::common::api_fixtures::dpu::DpuConfig; diff --git a/crates/api/src/tests/machine_setup.rs b/crates/api/src/tests/machine_setup.rs new file mode 100644 index 0000000000..a96b32c9ed --- /dev/null +++ b/crates/api/src/tests/machine_setup.rs @@ -0,0 +1,82 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use std::collections::HashMap; + +/// Verify that `oem_manager_profiles` from the site config is forwarded to `machine_setup`. +/// +/// This test catches regressions where the argument gets dropped or replaced with an empty map. 
+#[tokio::test] +async fn test_oem_manager_profiles_passed_to_machine_setup() { + use carbide_redfish::libredfish::RedfishClientPool; + use carbide_redfish::libredfish::test_support::{RedfishSim, RedfishSimAction}; + use libredfish::BiosProfileType; + use libredfish::model::service_root::RedfishVendor; + + let mut config = crate::tests::common::api_fixtures::get_config(); + // Build an oem_manager_profiles map with a Dell R760 PSU Hot Spare setting. + // This mirrors the fix for the Dell R760 PSU fan issue (nvbugs-5834644). + config.oem_manager_profiles = HashMap::from([( + RedfishVendor::Dell, + HashMap::from([( + "r760".to_string(), + HashMap::from([( + BiosProfileType::Performance, + HashMap::from([( + "ServerPwr.1.PSRapidOn".to_string(), + serde_json::Value::String("Disabled".to_string()), + )]), + )]), + )]), + )]); + + use carbide_redfish::libredfish::RedfishAuth; + use forge_secrets::credentials::{CredentialKey, CredentialType}; + + let sim = RedfishSim::default(); + let timepoint = sim.timepoint(); + let client = sim + .create_client( + "test-host", + None, + RedfishAuth::Key(CredentialKey::HostRedfish { + credential_type: CredentialType::SiteDefault, + }), + None, + ) + .await + .unwrap(); + + let result = carbide_machine_controller::handler::call_machine_setup_and_handle_no_dpu_error( + client.as_ref(), + None, + 1, + &config.machine_state_handler_site_config(), + ) + .await; + + assert!(result.is_ok()); + + let actions = sim.actions_since(&timepoint).all_hosts(); + assert_eq!(actions.len(), 1); + assert_eq!( + actions[0], + RedfishSimAction::MachineSetup { + oem_manager_profiles: config.oem_manager_profiles, + } + ); +} diff --git a/crates/api/src/tests/machine_states.rs b/crates/api/src/tests/machine_states.rs index 27b1a70206..096c07ec9c 100644 --- a/crates/api/src/tests/machine_states.rs +++ b/crates/api/src/tests/machine_states.rs @@ -20,6 +20,9 @@ use std::sync::atomic::AtomicBool; use ::rpc::measured_boot::FromGrpc; use base64::prelude::*; +use 
carbide_machine_controller::context::MachineStateHandlerContextObjects; +use carbide_machine_controller::handler::{MachineStateHandlerBuilder, handler_host_power_control}; +use carbide_machine_controller::metrics::MachineMetrics; use carbide_redfish::libredfish::test_support::RedfishSimAction; use carbide_uuid::machine::MachineId; use carbide_uuid::machine_validation::MachineValidationId; @@ -65,11 +68,6 @@ use tonic::{Code, Request}; use crate::handlers::measured_boot::rpc_forge::MachineDiscoveryInfo; use crate::measured_boot::convert_vec; -use crate::state_controller::machine::context::MachineStateHandlerContextObjects; -use crate::state_controller::machine::handler::{ - MachineStateHandlerBuilder, handler_host_power_control, -}; -use crate::state_controller::machine::metrics::MachineMetrics; use crate::tests::common; use crate::tests::common::api_fixtures::dpu::{ TEST_DOCA_HBN_VERSION, TEST_DOCA_TELEMETRY_VERSION, TEST_DPU_AGENT_VERSION, diff --git a/crates/api/src/tests/machine_update_manager.rs b/crates/api/src/tests/machine_update_manager.rs index 1f44d3927b..afff7b2f62 100644 --- a/crates/api/src/tests/machine_update_manager.rs +++ b/crates/api/src/tests/machine_update_manager.rs @@ -20,6 +20,7 @@ use std::sync::{Arc, Mutex}; use std::time::Duration; use async_trait::async_trait; +use carbide_machine_controller::health_report::create_host_update_health_report; use carbide_uuid::machine::MachineId; use common::api_fixtures::create_test_env; use figment::Figment; @@ -37,7 +38,6 @@ use crate::CarbideResult; use crate::cfg::file::CarbideConfig; use crate::machine_update_manager::MachineUpdateManager; use crate::machine_update_manager::machine_update_module::MachineUpdateModule; -use crate::state_controller::machine::health_report::create_host_update_health_report; use crate::tests::common; use crate::tests::common::api_fixtures::create_managed_host; diff --git a/crates/api/src/tests/machine_validation.rs b/crates/api/src/tests/machine_validation.rs index 
ead690b6a9..434f366034 100644 --- a/crates/api/src/tests/machine_validation.rs +++ b/crates/api/src/tests/machine_validation.rs @@ -18,6 +18,9 @@ use std::str::FromStr; use std::time::SystemTime; +use carbide_machine_controller::config::machine_validation::{ + MachineValidationConfig, MachineValidationTestConfig, MachineValidationTestSelectionMode, +}; use carbide_uuid::machine_validation::MachineValidationId; use common::api_fixtures::{ TestEnvOverrides, create_host_with_machine_validation, create_test_env, @@ -33,9 +36,6 @@ use rpc::Timestamp; use rpc::forge::forge_server::Forge; use rpc::forge::{MachineValidationTestNextVersionRequest, MachineValidationTestVerfiedRequest}; -use crate::cfg::file::{ - MachineValidationConfig, MachineValidationTestConfig, MachineValidationTestSelectionMode, -}; use crate::handlers::machine_validation::apply_config_on_startup; use crate::tests::common; diff --git a/crates/api/src/tests/mod.rs b/crates/api/src/tests/mod.rs index af8893c89d..6fa4eb0dd4 100644 --- a/crates/api/src/tests/mod.rs +++ b/crates/api/src/tests/mod.rs @@ -75,6 +75,7 @@ mod machine_interfaces; mod machine_metadata; mod machine_network; mod machine_power; +mod machine_setup; mod machine_states; mod machine_topology; pub mod machine_update_manager; diff --git a/crates/machine-controller/Cargo.toml b/crates/machine-controller/Cargo.toml new file mode 100644 index 0000000000..12e5766fbf --- /dev/null +++ b/crates/machine-controller/Cargo.toml @@ -0,0 +1,76 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +[package] +name = "carbide-machine-controller" +version = "0.0.0" +edition.workspace = true +license.workspace = true +authors.workspace = true + +[features] +default = [] +test-support = ["dep:mockall"] + +[dependencies] +bmc-vendor = { path = "../bmc-vendor" } +carbide-api-db = { path = "../api-db", default-features = false } +carbide-api-model = { path = "../api-model", default-features = false } +carbide-dpf = { path = "../dpf", default-features = false } +carbide-health-report = { path = "../health-report", default-features = false } +carbide-health-metrics = { path = "../health-metrics" } +carbide-utils = { path = "../utils", default-features = false } +carbide-firmware = { path = "../firmware", default-features = false } +carbide-ipmi = { path = "../ipmi", default-features = false } +carbide-measured-boot = { path = "../measured-boot", default-features = false } +carbide-redfish = { path = "../redfish", default-features = false } +# TODO: RPC is only used for serialization / deserialization of +# ScoutFirmwareUpgradeTask. If we can find a way to do it differently, +# we can improve build performance significantly.
+carbide-rpc = { path = "../rpc", default-features = false } +carbide-secrets = { path = "../secrets" } +carbide-state-controller-common = { path = "../state-controller-common", default-features = false } +carbide-uuid = { path = "../uuid", default-features = false } +config-version = { path = "../config-version", default-features = false } +state-controller = { path = "../state-controller" } + +async-trait = { workspace = true } +chrono = { workspace = true } +duration-str = { workspace = true } +eyre = { workspace = true } +futures = { workspace = true } +futures-util = { workspace = true } +itertools = { workspace = true } +lazy_static = { workspace = true } +libredfish = { workspace = true } +mac_address = { workspace = true } +mockall = { workspace = true, optional = true } +opentelemetry = { workspace = true } +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true } +sqlx = { workspace = true } +tracing = { workspace = true } +tokio = { workspace = true } +uuid = { workspace = true, features = ["v4", "serde"] } +version-compare = { workspace = true } + +[dev-dependencies] +figment = { workspace = true, features = ["env", "test", "toml"] } +regex = { workspace = true } +lazy_static = { workspace = true } + +[lints] +workspace = true diff --git a/crates/api/src/state_controller/machine/config/bom_validation.rs b/crates/machine-controller/src/config/bom_validation.rs similarity index 100% rename from crates/api/src/state_controller/machine/config/bom_validation.rs rename to crates/machine-controller/src/config/bom_validation.rs diff --git a/crates/api/src/state_controller/machine/config/controller.rs b/crates/machine-controller/src/config/controller.rs similarity index 100% rename from crates/api/src/state_controller/machine/config/controller.rs rename to crates/machine-controller/src/config/controller.rs diff --git a/crates/api/src/state_controller/machine/config/firmware_global.rs 
b/crates/machine-controller/src/config/firmware_global.rs similarity index 98% rename from crates/api/src/state_controller/machine/config/firmware_global.rs rename to crates/machine-controller/src/config/firmware_global.rs index 78f59faa54..7929af7757 100644 --- a/crates/api/src/state_controller/machine/config/firmware_global.rs +++ b/crates/machine-controller/src/config/firmware_global.rs @@ -95,7 +95,7 @@ pub struct FirmwareGlobal { } impl FirmwareGlobal { - #[cfg(test)] + #[cfg(feature = "test-support")] pub fn test_default() -> Self { FirmwareGlobal { autoupdate: true, @@ -114,7 +114,7 @@ impl FirmwareGlobal { } } - #[cfg(test)] + #[cfg(feature = "test-support")] pub fn get_retry_interval() -> Duration { Duration::seconds(1) } diff --git a/crates/machine-controller/src/config/machine_validation.rs b/crates/machine-controller/src/config/machine_validation.rs new file mode 100644 index 0000000000..3bc8b4ecbb --- /dev/null +++ b/crates/machine-controller/src/config/machine_validation.rs @@ -0,0 +1,84 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use carbide_utils::config::as_std_duration; +use duration_str::deserialize_duration; +use serde::{Deserialize, Serialize}; + +/// Controls which machine validation tests are active. 
+#[derive(Default, Clone, Copy, Debug, Deserialize, Serialize)] +pub enum MachineValidationTestSelectionMode { + /// Only update tests in DB that are specified in the + /// `tests` config list. + #[default] + Default, + /// Enable all tests in DB, but allow per-test overrides + /// from the `tests` config list. + EnableAll, + /// Disable all tests in DB, but allow per-test overrides + /// from the `tests` config list. + DisableAll, +} + +/// Configuration for machine validation tests (memory +/// latency, SSD I/O, etc.) run after ingestion to verify +/// hardware health. +#[derive(Default, Clone, Debug, Deserialize, Serialize)] +pub struct MachineValidationConfig { + /// Enables machine validation testing. + #[serde(default)] + pub enabled: bool, + + /// Controls whether to run all tests, no tests, or use + /// per-test configuration. + #[serde(default)] + pub test_selection_mode: MachineValidationTestSelectionMode, + + #[serde( + default = "MachineValidationConfig::default_run_interval", + deserialize_with = "deserialize_duration", + serialize_with = "as_std_duration" + )] + pub run_interval: std::time::Duration, + + /// Per-test enable/disable overrides. + #[serde(default)] + pub tests: Vec<MachineValidationTestConfig>, +} + +/// Per-test override for machine validation. +/// +/// Example: +/// ```toml +/// tests = [ +/// { id = "MmMemLatency", enable = true }, +/// { id = "FioSSD", enable = true } +/// ] +/// ``` +#[derive(Default, Clone, Debug, Deserialize, Serialize)] +pub struct MachineValidationTestConfig { + /// Unique test identifier (e.g., "MmMemLatency"). + pub id: String, + /// Whether this test is enabled. 
+ pub enable: bool, +} + +impl MachineValidationConfig { + const fn default_run_interval() -> std::time::Duration { + std::time::Duration::from_secs(60) + } +} diff --git a/crates/api/src/state_controller/machine/config/mod.rs b/crates/machine-controller/src/config/mod.rs similarity index 78% rename from crates/api/src/state_controller/machine/config/mod.rs rename to crates/machine-controller/src/config/mod.rs index 99947e1d0a..504e7da0f8 100644 --- a/crates/api/src/state_controller/machine/config/mod.rs +++ b/crates/machine-controller/src/config/mod.rs @@ -16,15 +16,18 @@ */ use model::machine::HostHealthConfig; +use serde::{Deserialize, Serialize}; pub mod bom_validation; pub mod controller; pub mod firmware_global; +pub mod machine_validation; pub mod power_manager; pub use bom_validation::BomValidationConfig; pub use controller::MachineStateControllerConfig; pub use firmware_global::FirmwareGlobal; +pub use machine_validation::MachineValidationConfig; pub use power_manager::PowerManagerOptions; pub struct MachineStateHandlerSiteConfig { @@ -43,3 +46,12 @@ pub struct MachineStateHandlerSiteConfig { pub dpu_enable_secure_boot: bool, pub allow_zero_dpu_hosts: bool, } + +/// A UTC time window defined by a start and end timestamp. +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] +pub struct TimePeriod { + /// Start of the time window (UTC). + pub start: chrono::DateTime<chrono::Utc>, + /// End of the time window (UTC). 
+ pub end: chrono::DateTime<chrono::Utc>, +} diff --git a/crates/api/src/state_controller/machine/config/power_manager.rs b/crates/machine-controller/src/config/power_manager.rs similarity index 100% rename from crates/api/src/state_controller/machine/config/power_manager.rs rename to crates/machine-controller/src/config/power_manager.rs diff --git a/crates/api/src/state_controller/machine/context.rs b/crates/machine-controller/src/context.rs similarity index 93% rename from crates/api/src/state_controller/machine/context.rs rename to crates/machine-controller/src/context.rs index df6bc3ae26..b39636be78 100644 --- a/crates/api/src/state_controller/machine/context.rs +++ b/crates/machine-controller/src/context.rs @@ -25,8 +25,8 @@ use model::machine::Machine; use sqlx::PgPool; use state_controller::state_handler::{StateHandlerContextObjects, StateHandlerError}; -use crate::state_controller::machine::config::MachineStateHandlerSiteConfig; -use crate::state_controller::machine::metrics::MachineMetrics; +use crate::config::MachineStateHandlerSiteConfig; +use crate::metrics::MachineMetrics; pub struct MachineStateHandlerContextObjects {} diff --git a/crates/api/src/state_controller/machine/dpf.rs b/crates/machine-controller/src/dpf.rs similarity index 99% rename from crates/api/src/state_controller/machine/dpf.rs rename to crates/machine-controller/src/dpf.rs index 8713a9df34..93b955c2be 100644 --- a/crates/api/src/state_controller/machine/dpf.rs +++ b/crates/machine-controller/src/dpf.rs @@ -32,7 +32,7 @@ use sqlx::PgPool; use state_controller::controller::Enqueuer; use tokio::task::JoinSet; -use crate::state_controller::machine::io::MachineStateControllerIO; +use crate::io::MachineStateControllerIO; /// Label key used by [`CarbideDPFLabeler`] to stamp the carbide `MachineId` of /// the DPU onto its DPUDevice. Propagates to the DPU CR via DPF. 
@@ -48,7 +48,7 @@ const CONTROLLED_DEVICE_LABEL: &str = "carbide.nvidia.com/controlled.device"; /// reacts to watcher callbacks, and performs reprovision/force-delete. /// /// Reboot handling is managed via the watcher's `on_reboot_required` callback. -#[cfg_attr(test, mockall::automock)] +#[cfg_attr(feature = "test-support", mockall::automock)] #[async_trait] pub trait DpfOperations: Send + Sync + std::fmt::Debug { /// Register a DPU device. diff --git a/crates/api/src/state_controller/machine/handler.rs b/crates/machine-controller/src/handler.rs similarity index 99% rename from crates/api/src/state_controller/machine/handler.rs rename to crates/machine-controller/src/handler.rs index f6bde21687..f164ddd078 100644 --- a/crates/api/src/state_controller/machine/handler.rs +++ b/crates/machine-controller/src/handler.rs @@ -94,21 +94,18 @@ use tokio::sync::Semaphore; use tracing::instrument; use version_compare::Cmp; -use crate::cfg::file::{MachineValidationConfig, TimePeriod}; -use crate::state_controller::machine::config::{FirmwareGlobal, MachineStateHandlerSiteConfig}; -use crate::state_controller::machine::context::{ - MachineStateHandlerContextObjects, MachineStateHandlerServices, +use crate::config::{ + FirmwareGlobal, MachineStateHandlerSiteConfig, MachineValidationConfig, TimePeriod, }; -use crate::state_controller::machine::dpf::DpfOperations; -use crate::state_controller::machine::health_report::{ +use crate::context::{MachineStateHandlerContextObjects, MachineStateHandlerServices}; +use crate::dpf::DpfOperations; +use crate::health_report::{ create_host_update_health_report_dpufw, create_host_update_health_report_hostfw, }; -use crate::state_controller::machine::redfish::{ +use crate::redfish::{ did_dpu_finish_booting, host_power_control, host_power_control_with_location, }; -use crate::state_controller::machine::{ - MeasuringOutcome, get_measuring_prerequisites, handle_measuring_state, -}; +use crate::{MeasuringOutcome, get_measuring_prerequisites, 
handle_measuring_state}; pub mod attestation; mod bios_config; @@ -129,8 +126,8 @@ use helpers::{ use rpc::forge_agent_control_response::FileArtifact; use state_controller::db_write_batch::DbWriteBatch; -use crate::state_controller::machine::config::{BomValidationConfig, PowerManagerOptions}; -use crate::state_controller::machine::write_ops::MachineWriteOp; +use crate::config::{BomValidationConfig, PowerManagerOptions}; +use crate::write_ops::MachineWriteOp; // We can't use http::StatusCode because libredfish has a newer version const NOT_FOUND: u16 = 404; @@ -295,7 +292,7 @@ impl MachineStateHandlerBuilder { self } - #[cfg(test)] // currently only used in tests + #[cfg(feature = "test-support")] pub fn dpu_nic_firmware_initial_update_enabled( mut self, dpu_nic_firmware_initial_update_enabled: bool, @@ -313,7 +310,7 @@ impl MachineStateHandlerBuilder { self } - #[cfg(test)] // currently only used in tests + #[cfg(feature = "test-support")] pub fn reachability_params(mut self, reachability_params: ReachabilityParams) -> Self { self.reachability_params = reachability_params; self @@ -9531,7 +9528,7 @@ fn can_restart_reprovision(dpu_snapshots: &[Machine], version: ConfigVersion) -> /// TODO(ken): This is a temporary workaround for work-in-progress on zero-DPU support (August 2024) /// The way we should do this going forward is to plumb the actual non-DPU MAC address we want to /// boot from, instead of special-casing NoDpu errors. -pub(super) async fn call_machine_setup_and_handle_no_dpu_error( +pub async fn call_machine_setup_and_handle_no_dpu_error( redfish_client: &dyn Redfish, boot_interface_mac: Option<&str>, expected_dpu_count: usize, @@ -10656,70 +10653,6 @@ mod tests { assert_eq!(to_install.version, target_version); } - /// Verify that `oem_manager_profiles` from the site config is forwarded to `machine_setup`. - /// - /// This test catches regressions where the argument gets dropped or replaced with an empty map. 
- #[tokio::test] - async fn test_oem_manager_profiles_passed_to_machine_setup() { - use carbide_redfish::libredfish::RedfishClientPool; - use carbide_redfish::libredfish::test_support::{RedfishSim, RedfishSimAction}; - use libredfish::BiosProfileType; - use libredfish::model::service_root::RedfishVendor; - - let mut config = crate::tests::common::api_fixtures::get_config(); - // Build an oem_manager_profiles map with a Dell R760 PSU Hot Spare setting. - // This mirrors the fix for the Dell R760 PSU fan issue (nvbugs-5834644). - config.oem_manager_profiles = HashMap::from([( - RedfishVendor::Dell, - HashMap::from([( - "r760".to_string(), - HashMap::from([( - BiosProfileType::Performance, - HashMap::from([( - "ServerPwr.1.PSRapidOn".to_string(), - serde_json::Value::String("Disabled".to_string()), - )]), - )]), - )]), - )]); - - use carbide_redfish::libredfish::RedfishAuth; - use forge_secrets::credentials::{CredentialKey, CredentialType}; - - let sim = RedfishSim::default(); - let timepoint = sim.timepoint(); - let client = sim - .create_client( - "test-host", - None, - RedfishAuth::Key(CredentialKey::HostRedfish { - credential_type: CredentialType::SiteDefault, - }), - None, - ) - .await - .unwrap(); - - let result = call_machine_setup_and_handle_no_dpu_error( - client.as_ref(), - None, - 1, - &config.machine_state_handler_site_config(), - ) - .await; - - assert!(result.is_ok()); - - let actions = sim.actions_since(&timepoint).all_hosts(); - assert_eq!(actions.len(), 1); - assert_eq!( - actions[0], - RedfishSimAction::MachineSetup { - oem_manager_profiles: config.oem_manager_profiles, - } - ); - } - #[test] fn test_cycle_1() { let state_change_time = diff --git a/crates/api/src/state_controller/machine/handler/attestation.rs b/crates/machine-controller/src/handler/attestation.rs similarity index 99% rename from crates/api/src/state_controller/machine/handler/attestation.rs rename to crates/machine-controller/src/handler/attestation.rs index d1db5d597b..7400292be5 
100644 --- a/crates/api/src/state_controller/machine/handler/attestation.rs +++ b/crates/machine-controller/src/handler/attestation.rs @@ -38,7 +38,7 @@ use state_controller::state_handler::{ StateHandlerContext, StateHandlerError, StateHandlerOutcome, }; -use crate::state_controller::machine::context::MachineStateHandlerContextObjects; +use crate::context::MachineStateHandlerContextObjects; pub async fn trigger_attestation( db_pool: &PgPool, diff --git a/crates/api/src/state_controller/machine/handler/bios_config.rs b/crates/machine-controller/src/handler/bios_config.rs similarity index 99% rename from crates/api/src/state_controller/machine/handler/bios_config.rs rename to crates/machine-controller/src/handler/bios_config.rs index 143fbf7fc9..d4640824f8 100644 --- a/crates/api/src/state_controller/machine/handler/bios_config.rs +++ b/crates/machine-controller/src/handler/bios_config.rs @@ -32,8 +32,8 @@ use super::{ ReachabilityParams, RebootStatus, call_machine_setup_and_handle_no_dpu_error, handler_host_power_control, trigger_reboot_if_needed, }; -use crate::state_controller::machine::config::MachineStateControllerConfig; -use crate::state_controller::machine::context::MachineStateHandlerContextObjects; +use crate::config::MachineStateControllerConfig; +use crate::context::MachineStateHandlerContextObjects; /// Outcome of configure_host_bios function. 
pub(super) enum BiosConfigOutcome { diff --git a/crates/api/src/state_controller/machine/handler/dpf.rs b/crates/machine-controller/src/handler/dpf.rs similarity index 99% rename from crates/api/src/state_controller/machine/handler/dpf.rs rename to crates/machine-controller/src/handler/dpf.rs index 21d24e0cb0..c16afbd4a4 100644 --- a/crates/api/src/state_controller/machine/handler/dpf.rs +++ b/crates/machine-controller/src/handler/dpf.rs @@ -33,8 +33,8 @@ use state_controller::state_handler::{ use super::helpers::{DpuInitStateHelper, ManagedHostStateHelper, ReprovisionStateHelper}; use super::{handler_host_power_control, host_power_state}; -use crate::state_controller::machine::context::MachineStateHandlerContextObjects; -use crate::state_controller::machine::dpf::DpfOperations; +use crate::context::MachineStateHandlerContextObjects; +use crate::dpf::DpfOperations; fn dpf_error(error: DpfError) -> StateHandlerError { ExternalServiceError::with_source("dpf", "", error.to_string(), "dpf_error", error).into() diff --git a/crates/api/src/state_controller/machine/handler/helpers.rs b/crates/machine-controller/src/handler/helpers.rs similarity index 100% rename from crates/api/src/state_controller/machine/handler/helpers.rs rename to crates/machine-controller/src/handler/helpers.rs diff --git a/crates/api/src/state_controller/machine/handler/machine_validation.rs b/crates/machine-controller/src/handler/machine_validation.rs similarity index 97% rename from crates/api/src/state_controller/machine/handler/machine_validation.rs rename to crates/machine-controller/src/handler/machine_validation.rs index 3a113553b5..bfdd0026ac 100644 --- a/crates/api/src/state_controller/machine/handler/machine_validation.rs +++ b/crates/machine-controller/src/handler/machine_validation.rs @@ -25,12 +25,8 @@ use state_controller::state_handler::{ }; use super::{HostHandlerParams, is_machine_validation_requested, machine_validation_completed}; -use crate::state_controller::machine::context::{ 
- MachineStateHandlerContextObjects, MachineStateHandlerServices, -}; -use crate::state_controller::machine::handler::{ - handler_host_power_control, rebooted, trigger_reboot_if_needed, -}; +use crate::context::{MachineStateHandlerContextObjects, MachineStateHandlerServices}; +use crate::handler::{handler_host_power_control, rebooted, trigger_reboot_if_needed}; pub(crate) async fn handle_machine_validation_state( ctx: &mut StateHandlerContext<'_, MachineStateHandlerContextObjects>, diff --git a/crates/api/src/state_controller/machine/handler/power.rs b/crates/machine-controller/src/handler/power.rs similarity index 97% rename from crates/api/src/state_controller/machine/handler/power.rs rename to crates/machine-controller/src/handler/power.rs index 3990b870bb..d66493f240 100644 --- a/crates/api/src/state_controller/machine/handler/power.rs +++ b/crates/machine-controller/src/handler/power.rs @@ -25,10 +25,8 @@ use model::power_manager::{ }; use state_controller::state_handler::{StateHandlerContext, StateHandlerError}; -use crate::state_controller::machine::context::MachineStateHandlerContextObjects; -use crate::state_controller::machine::handler::{ - PowerOptionConfig, handler_host_power_control, host_power_state, -}; +use crate::context::MachineStateHandlerContextObjects; +use crate::handler::{PowerOptionConfig, handler_host_power_control, host_power_state}; // If power state is Paused and Reset, state machine can't take any decision on it. // Ignore power manager with a log and moved to state machine. 
diff --git a/crates/api/src/state_controller/machine/handler/sku.rs b/crates/machine-controller/src/handler/sku.rs similarity index 99% rename from crates/api/src/state_controller/machine/handler/sku.rs rename to crates/machine-controller/src/handler/sku.rs index 11c422abd2..c5f6f28cb9 100644 --- a/crates/api/src/state_controller/machine/handler/sku.rs +++ b/crates/machine-controller/src/handler/sku.rs @@ -27,10 +27,8 @@ use state_controller::state_handler::{ StateHandlerContext, StateHandlerError, StateHandlerOutcome, }; -use crate::state_controller::machine::context::{ - MachineStateHandlerContextObjects, MachineStateHandlerServices, -}; -use crate::state_controller::machine::handler::{ +use crate::context::{MachineStateHandlerContextObjects, MachineStateHandlerServices}; +use crate::handler::{ HostHandlerParams, discovered_after_state_transition, trigger_reboot_if_needed, }; diff --git a/crates/api/src/state_controller/machine/health_report.rs b/crates/machine-controller/src/health_report.rs similarity index 100% rename from crates/api/src/state_controller/machine/health_report.rs rename to crates/machine-controller/src/health_report.rs diff --git a/crates/api/src/state_controller/machine/io.rs b/crates/machine-controller/src/io.rs similarity index 99% rename from crates/api/src/state_controller/machine/io.rs rename to crates/machine-controller/src/io.rs index 046df2dab4..693457511a 100644 --- a/crates/api/src/state_controller/machine/io.rs +++ b/crates/machine-controller/src/io.rs @@ -32,8 +32,8 @@ use model::machine::{ use sqlx::PgConnection; use state_controller::io::StateControllerIO; -use crate::state_controller::machine::context::MachineStateHandlerContextObjects; -use crate::state_controller::machine::metrics::MachineMetricsEmitter; +use crate::context::MachineStateHandlerContextObjects; +use crate::metrics::MachineMetricsEmitter; /// State Controller IO implementation for Machines #[derive(Default, Debug)] diff --git 
a/crates/api/src/state_controller/machine/mod.rs b/crates/machine-controller/src/lib.rs similarity index 99% rename from crates/api/src/state_controller/machine/mod.rs rename to crates/machine-controller/src/lib.rs index 993be6bcff..fae9742bd1 100644 --- a/crates/api/src/state_controller/machine/mod.rs +++ b/crates/machine-controller/src/lib.rs @@ -139,7 +139,7 @@ where Ok((machine_state, ek_cert_verification_status)) } -pub(crate) async fn handle_measuring_state( +pub async fn handle_measuring_state( measuring_state: &MeasuringState, machine_id: &MachineId, db: &mut DB, diff --git a/crates/api/src/state_controller/machine/metrics.rs b/crates/machine-controller/src/metrics.rs similarity index 99% rename from crates/api/src/state_controller/machine/metrics.rs rename to crates/machine-controller/src/metrics.rs index c9908ee538..79e9e486ec 100644 --- a/crates/api/src/state_controller/machine/metrics.rs +++ b/crates/machine-controller/src/metrics.rs @@ -19,11 +19,11 @@ use std::collections::{HashMap, HashSet}; -use ::carbide_utils::metrics::SharedMetricsHolder; use carbide_health_metrics::{ HealthIterationMetrics, HealthMetricDimension, HealthObjectMetrics, register_alerts_suppressed_gauge, register_health_gauges, }; +use carbide_utils::metrics::SharedMetricsHolder; use model::hardware_info::MachineInventorySoftwareComponent; use model::tenant::TenantOrganizationId; use opentelemetry::KeyValue; diff --git a/crates/api/src/state_controller/machine/redfish.rs b/crates/machine-controller/src/redfish.rs similarity index 97% rename from crates/api/src/state_controller/machine/redfish.rs rename to crates/machine-controller/src/redfish.rs index 0d766e5dbd..4f17185e04 100644 --- a/crates/api/src/state_controller/machine/redfish.rs +++ b/crates/machine-controller/src/redfish.rs @@ -22,8 +22,8 @@ use libredfish::{PowerState, Redfish, RedfishError, SystemPowerControl}; use model::machine::Machine; use state_controller::state_handler::StateHandlerContext; -use 
crate::state_controller::machine::context::MachineStateHandlerContextObjects; -use crate::state_controller::machine::write_ops::MachineWriteOp; +use crate::context::MachineStateHandlerContextObjects; +use crate::write_ops::MachineWriteOp; #[track_caller] pub fn host_power_control( diff --git a/crates/api/src/state_controller/machine/write_ops.rs b/crates/machine-controller/src/write_ops.rs similarity index 100% rename from crates/api/src/state_controller/machine/write_ops.rs rename to crates/machine-controller/src/write_ops.rs From f8670d86214f02d684fa0e8df83ddc85f91a187f Mon Sep 17 00:00:00 2001 From: Dmitry Porokh Date: Wed, 27 May 2026 17:35:46 -0700 Subject: [PATCH 2/2] refactor(machine-controller): isolate scout firmware upgrade proto MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move Scout firmware upgrade payload messages into a small dedicated proto and use that generated code from the machine controller instead of depending on the full RPC crate. This keeps the machine controller’s RPC usage limited to JSON serialization of firmware upgrade tasks and reduces coupling while preserving the existing ForgeAgentControl wire shape. 
Signed-off-by: Dmitry Porokh --- Cargo.lock | 4 ++- crates/api/src/compat.rs | 14 ++++---- crates/api/src/handlers/machine_scout.rs | 4 +-- crates/machine-controller/Cargo.toml | 9 ++--- crates/machine-controller/build.rs | 34 +++++++++++++++++++ crates/machine-controller/src/handler.rs | 4 +-- crates/machine-controller/src/lib.rs | 1 + crates/machine-controller/src/rpc.rs | 21 ++++++++++++ crates/rpc/build.rs | 11 ++++-- crates/rpc/proto/forge.proto | 17 ++-------- crates/rpc/proto/scout_firmware_upgrade.proto | 18 ++++++++++ crates/rpc/src/lib.rs | 2 +- crates/rpc/src/protos/mod.rs | 6 ++++ crates/scout/src/firmware_upgrade.rs | 4 +-- crates/scout/src/main.rs | 7 ++-- crates/ssh-console-mock-api-server/build.rs | 1 + .../src/generated/mod.rs | 3 ++ 17 files changed, 122 insertions(+), 38 deletions(-) create mode 100644 crates/machine-controller/build.rs create mode 100644 crates/machine-controller/src/rpc.rs create mode 100644 crates/rpc/proto/scout_firmware_upgrade.proto diff --git a/Cargo.lock b/Cargo.lock index 70f9aec032..ce6e26b34f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2188,7 +2188,6 @@ dependencies = [ "carbide-ipmi", "carbide-measured-boot", "carbide-redfish", - "carbide-rpc", "carbide-secrets", "carbide-state-controller-common", "carbide-utils", @@ -2206,12 +2205,15 @@ dependencies = [ "mac_address", "mockall", "opentelemetry", + "prost", + "prost-types", "regex", "serde", "serde_json", "sqlx", "state-controller", "tokio", + "tonic-prost-build", "tracing", "uuid", "version-compare 0.2.1", diff --git a/crates/api/src/compat.rs b/crates/api/src/compat.rs index e3d249b610..bc27cdcec5 100644 --- a/crates/api/src/compat.rs +++ b/crates/api/src/compat.rs @@ -130,8 +130,8 @@ impl BuildAndFillLegacyFields for ForgeAgentControlResponse { #[cfg(test)] mod tests { - use ::rpc::common; use ::rpc::protos::mlx_device; + use ::rpc::{common, scout_firmware_upgrade as sfu}; use carbide_uuid::machine_validation::MachineValidationId; use super::*; @@ -339,17 +339,17 
@@ mod tests { fn firmware_upgrade_converts_to_legacy_task_json() { let upgrade_task_id = uuid::Uuid::new_v4().to_string(); let action = fac::Action::FirmwareUpgrade(fac::FirmwareUpgrade { - task: Some(fac::ScoutFirmwareUpgradeTask { + task: Some(sfu::ScoutFirmwareUpgradeTask { upgrade_task_id: upgrade_task_id.clone(), component_type: "cpld".to_string(), target_version: "1.2.3".to_string(), - script: Some(fac::FileArtifact { + script: Some(sfu::FileArtifact { url: "http://pxe/script.sh".to_string(), sha256: "abc".to_string(), }), execution_timeout_seconds: 30, artifact_download_timeout_seconds: 10, - file_artifacts: vec![fac::FileArtifact { + file_artifacts: vec![sfu::FileArtifact { url: "http://pxe/fw.bin".to_string(), sha256: "def".to_string(), }], @@ -385,17 +385,17 @@ mod tests { fn response_from_firmware_upgrade_sets_typed_payload_and_legacy_pairs() { let response = ForgeAgentControlResponse::build_and_fill_legacy_fields( fac::Action::FirmwareUpgrade(fac::FirmwareUpgrade { - task: Some(fac::ScoutFirmwareUpgradeTask { + task: Some(sfu::ScoutFirmwareUpgradeTask { upgrade_task_id: uuid::Uuid::new_v4().to_string(), component_type: "cpld".to_string(), target_version: "1.2.3".to_string(), - script: Some(fac::FileArtifact { + script: Some(sfu::FileArtifact { url: "http://pxe/script.sh".to_string(), sha256: "abc".to_string(), }), execution_timeout_seconds: 30, artifact_download_timeout_seconds: 10, - file_artifacts: vec![fac::FileArtifact { + file_artifacts: vec![sfu::FileArtifact { url: "http://pxe/fw.bin".to_string(), sha256: "def".to_string(), }], diff --git a/crates/api/src/handlers/machine_scout.rs b/crates/api/src/handlers/machine_scout.rs index dde0dd3147..4a297d0cc2 100644 --- a/crates/api/src/handlers/machine_scout.rs +++ b/crates/api/src/handlers/machine_scout.rs @@ -16,7 +16,7 @@ */ use ::rpc::forge::ForgeAgentControlResponse; use ::rpc::model::machine::get_action_for_dpu_state; -use ::rpc::{forge as rpc, forge_agent_control_response as fac}; +use 
::rpc::{forge as rpc, forge_agent_control_response as fac, scout_firmware_upgrade as sfu}; use model::machine::machine_search_config::MachineSearchConfig; use model::machine::{ BomValidating, CleanupContext, CleanupState, FailureCause, FailureDetails, FailureSource, @@ -334,7 +334,7 @@ pub(crate) async fn forge_agent_control( machine_id = %machine.id, "Sending firmware upgrade task to scout", ); - let action = match serde_json::from_str::<fac::ScoutFirmwareUpgradeTask>(task_json) + let action = match serde_json::from_str::<sfu::ScoutFirmwareUpgradeTask>(task_json) { Ok(task) => Action::FirmwareUpgrade(fac::FirmwareUpgrade { task: Some(task) }), Err(e) => { diff --git a/crates/machine-controller/Cargo.toml b/crates/machine-controller/Cargo.toml index 12e5766fbf..cdbdcd7620 100644 --- a/crates/machine-controller/Cargo.toml +++ b/crates/machine-controller/Cargo.toml @@ -37,10 +37,6 @@ carbide-firmware = { path = "../firmware", default-features = false } carbide-ipmi = { path = "../ipmi", default-features = false } carbide-measured-boot = { path = "../measured-boot", default-features = false } carbide-redfish = { path = "../redfish", default-features = false } -# TODO: RPC is only used to serialize / deserialize of -# ScoutFirmwareUpgradeTask. If we can find a way to do it differently -# we can improve build performance significantly. 
-carbide-rpc = { path = "../rpc", default-features = false } carbide-secrets = { path = "../secrets" } carbide-state-controller-common = { path = "../state-controller-common", default-features = false } carbide-uuid = { path = "../uuid", default-features = false } @@ -59,6 +55,8 @@ libredfish = { workspace = true } mac_address = { workspace = true } mockall = { workspace = true, optional = true } opentelemetry = { workspace = true } +prost = { workspace = true } +prost-types = { workspace = true } serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true } sqlx = { workspace = true } @@ -72,5 +70,8 @@ figment = { workspace = true, features = ["env", "test", "toml"] } regex = { workspace = true } lazy_static = { workspace = true } +[build-dependencies] +tonic-prost-build = "0.14" + [lints] workspace = true diff --git a/crates/machine-controller/build.rs b/crates/machine-controller/build.rs new file mode 100644 index 0000000000..d52f264928 --- /dev/null +++ b/crates/machine-controller/build.rs @@ -0,0 +1,34 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +use std::path::PathBuf; + +fn main() -> Result<(), Box<dyn std::error::Error>> { + let out_dir = PathBuf::from(std::env::var_os("OUT_DIR").unwrap()); + tonic_prost_build::configure() + .out_dir(out_dir) + .type_attribute( + "scout_firmware_upgrade.ScoutFirmwareUpgradeTask", + "#[derive(serde::Serialize, serde::Deserialize)]", + ) + .type_attribute( + "scout_firmware_upgrade.FileArtifact", + "#[derive(serde::Serialize, serde::Deserialize)]", + ) + .compile_protos(&["scout_firmware_upgrade.proto"], &["../rpc/proto"])?; + + Ok(()) +} diff --git a/crates/machine-controller/src/handler.rs b/crates/machine-controller/src/handler.rs index f164ddd078..c50c840892 100644 --- a/crates/machine-controller/src/handler.rs +++ b/crates/machine-controller/src/handler.rs @@ -123,10 +123,10 @@ use helpers::{ DpuDiscoveringStateHelper, DpuInitStateHelper, ManagedHostStateHelper, NextState, ReprovisionStateHelper, all_equal, }; -use rpc::forge_agent_control_response::FileArtifact; use state_controller::db_write_batch::DbWriteBatch; use crate::config::{BomValidationConfig, PowerManagerOptions}; +use crate::rpc::scout_firmware_upgrade::{FileArtifact, ScoutFirmwareUpgradeTask}; use crate::write_ops::MachineWriteOp; // We can't use http::StatusCode because libredfish has a newer version @@ -7427,7 +7427,7 @@ impl HostUpgradeState { let upgrade_task_id = uuid::Uuid::new_v4().to_string(); let file_artifact_count = to_install.files.len(); - let task = rpc::forge_agent_control_response::ScoutFirmwareUpgradeTask { + let task = ScoutFirmwareUpgradeTask { upgrade_task_id: upgrade_task_id.clone(), component_type: firmware_type.to_string(), target_version: to_install.version.clone(), diff --git a/crates/machine-controller/src/lib.rs b/crates/machine-controller/src/lib.rs index fae9742bd1..d8b598b16d 100644 --- a/crates/machine-controller/src/lib.rs +++ b/crates/machine-controller/src/lib.rs @@ -37,6 +37,7 @@ pub mod health_report; pub mod io; pub mod metrics; pub mod redfish; +pub(crate) mod rpc; pub mod write_ops; 
/// Fields of span that should be logged for each message. diff --git a/crates/machine-controller/src/rpc.rs b/crates/machine-controller/src/rpc.rs new file mode 100644 index 0000000000..3fa0e6dbe1 --- /dev/null +++ b/crates/machine-controller/src/rpc.rs @@ -0,0 +1,21 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#[allow(dead_code)] +pub(crate) mod scout_firmware_upgrade { + include!(concat!(env!("OUT_DIR"), "/scout_firmware_upgrade.rs")); +} diff --git a/crates/rpc/build.rs b/crates/rpc/build.rs index 5e57bad3b3..82b6dfaa62 100644 --- a/crates/rpc/build.rs +++ b/crates/rpc/build.rs @@ -859,8 +859,14 @@ fn main() -> Result<(), Box> { "forge.SpdmAttestationDetails", "#[derive(serde::Serialize)]", ) - .type_attribute("forge.ForgeAgentControlResponse.ScoutFirmwareUpgradeTask", "#[derive(serde::Serialize, serde::Deserialize)]") - .type_attribute("forge.ForgeAgentControlResponse.FileArtifact", "#[derive(serde::Serialize, serde::Deserialize)]") + .type_attribute( + "scout_firmware_upgrade.ScoutFirmwareUpgradeTask", + "#[derive(serde::Serialize, serde::Deserialize)]", + ) + .type_attribute( + "scout_firmware_upgrade.FileArtifact", + "#[derive(serde::Serialize, serde::Deserialize)]", + ) .build_server(true) .build_client(true) .protoc_arg("--experimental_allow_proto3_optional") @@ -868,6 +874,7 @@ fn main() -> 
Result<(), Box> { .compile_protos( &[ "proto/common.proto", + "proto/scout_firmware_upgrade.proto", "proto/forge.proto", "proto/machine_discovery.proto", "proto/mlx_device.proto", diff --git a/crates/rpc/proto/forge.proto b/crates/rpc/proto/forge.proto index 28098e8414..7fc8d360e8 100644 --- a/crates/rpc/proto/forge.proto +++ b/crates/rpc/proto/forge.proto @@ -15,6 +15,7 @@ import "health.proto"; import "machine_discovery.proto"; import "measured_boot.proto"; import "mlx_device.proto"; +import "scout_firmware_upgrade.proto"; import "site_explorer.proto"; service Forge { @@ -4565,23 +4566,9 @@ message ForgeAgentControlResponse { } message FirmwareUpgrade { - ScoutFirmwareUpgradeTask task = 1; + scout_firmware_upgrade.ScoutFirmwareUpgradeTask task = 1; } - message ScoutFirmwareUpgradeTask { - string upgrade_task_id = 1; - string component_type = 2; - string target_version = 3; - FileArtifact script = 4; - uint32 execution_timeout_seconds = 5; - uint32 artifact_download_timeout_seconds = 6; - repeated FileArtifact file_artifacts = 7; - } - - message FileArtifact { - string url = 1; - string sha256 = 2; - } oneof action { Noop noop = 3; diff --git a/crates/rpc/proto/scout_firmware_upgrade.proto b/crates/rpc/proto/scout_firmware_upgrade.proto new file mode 100644 index 0000000000..b3f1a60867 --- /dev/null +++ b/crates/rpc/proto/scout_firmware_upgrade.proto @@ -0,0 +1,18 @@ +syntax = "proto3"; + +package scout_firmware_upgrade; + +message ScoutFirmwareUpgradeTask { + string upgrade_task_id = 1; + string component_type = 2; + string target_version = 3; + FileArtifact script = 4; + uint32 execution_timeout_seconds = 5; + uint32 artifact_download_timeout_seconds = 6; + repeated FileArtifact file_artifacts = 7; +} + +message FileArtifact { + string url = 1; + string sha256 = 2; +} diff --git a/crates/rpc/src/lib.rs b/crates/rpc/src/lib.rs index 0d311abd38..6a4ed0f954 100644 --- a/crates/rpc/src/lib.rs +++ b/crates/rpc/src/lib.rs @@ -58,7 +58,7 @@ pub use 
crate::protos::machine_discovery::{ self, BlockDevice, Cpu, DiscoveryInfo, DmiData, NetworkInterface, NvmeDevice, PciDeviceProperties, }; -pub use crate::protos::{fmds, health, site_explorer}; +pub use crate::protos::{fmds, health, scout_firmware_upgrade, site_explorer}; pub mod errors; pub mod forge_tls_client; diff --git a/crates/rpc/src/protos/mod.rs b/crates/rpc/src/protos/mod.rs index 5c8b26297c..01f0490d54 100644 --- a/crates/rpc/src/protos/mod.rs +++ b/crates/rpc/src/protos/mod.rs @@ -25,6 +25,12 @@ pub mod common { include!(concat!(env!("OUT_DIR"), "/common.rs")); } +#[allow(non_snake_case, unknown_lints, clippy::all)] +#[rustfmt::skip] +pub mod scout_firmware_upgrade { + include!(concat!(env!("OUT_DIR"), "/scout_firmware_upgrade.rs")); +} + #[allow(non_snake_case, unknown_lints, clippy::all)] #[rustfmt::skip] pub mod forge { diff --git a/crates/scout/src/firmware_upgrade.rs b/crates/scout/src/firmware_upgrade.rs index 89b352e871..828fcc6d88 100644 --- a/crates/scout/src/firmware_upgrade.rs +++ b/crates/scout/src/firmware_upgrade.rs @@ -19,7 +19,7 @@ use std::path::{Path, PathBuf}; use std::time::Duration; use futures_util::TryStreamExt; -use rpc::forge_agent_control_response::ScoutFirmwareUpgradeTask as FirmwareUpgradeTask; +use rpc::scout_firmware_upgrade::ScoutFirmwareUpgradeTask as FirmwareUpgradeTask; use sha2::{Digest, Sha256}; use tokio::io::{AsyncReadExt, AsyncWriteExt}; @@ -264,7 +264,7 @@ async fn sha256_file(path: &Path) -> Result> mod tests { use axum::Router; use axum::routing::get; - use rpc::forge_agent_control_response::FileArtifact; + use rpc::scout_firmware_upgrade::FileArtifact; use tokio::net::TcpListener; use super::*; diff --git a/crates/scout/src/main.rs b/crates/scout/src/main.rs index 90f0959faa..d38a12f0f4 100644 --- a/crates/scout/src/main.rs +++ b/crates/scout/src/main.rs @@ -37,7 +37,10 @@ use rpc::protos::mlx_device::{ FirmwareFlashReport as FirmwareFlashReportPb, LockStatus, MlxObservation, MlxObservationReport, 
PublishMlxObservationReportRequest, }; -use rpc::{ForgeScoutErrorReport, forge as rpc_forge, forge_agent_control_response as fac}; +use rpc::{ + ForgeScoutErrorReport, forge as rpc_forge, forge_agent_control_response as fac, + scout_firmware_upgrade as sfu, +}; pub use scout::{CarbideClientError, CarbideClientResult}; use tokio::sync::RwLock; use tryhard::{RetryFutureConfig, RetryPolicy}; @@ -407,7 +410,7 @@ async fn handle_action( async fn handle_firmware_upgrade_action( config: &Options, machine_id: &MachineId, - task: Option, + task: Option, ) -> Result<(), CarbideClientError> { let task = task.ok_or_else(|| { CarbideClientError::GenericError("firmware upgrade action missing task".to_string()) diff --git a/crates/ssh-console-mock-api-server/build.rs b/crates/ssh-console-mock-api-server/build.rs index 66fadb9ea1..36d2f36e05 100644 --- a/crates/ssh-console-mock-api-server/build.rs +++ b/crates/ssh-console-mock-api-server/build.rs @@ -51,6 +51,7 @@ fn main() -> Result<(), Box> { .compile_protos( &[ "proto/common.proto", + "proto/scout_firmware_upgrade.proto", "proto/dns.proto", "proto/forge.proto", "proto/machine_discovery.proto", diff --git a/crates/ssh-console-mock-api-server/src/generated/mod.rs b/crates/ssh-console-mock-api-server/src/generated/mod.rs index 91def1d871..d96d3149e2 100644 --- a/crates/ssh-console-mock-api-server/src/generated/mod.rs +++ b/crates/ssh-console-mock-api-server/src/generated/mod.rs @@ -37,4 +37,7 @@ pub mod measured_boot; pub mod mlx_device; #[allow(non_snake_case, unknown_lints, clippy::all)] #[rustfmt::skip] +pub mod scout_firmware_upgrade; +#[allow(non_snake_case, unknown_lints, clippy::all)] +#[rustfmt::skip] pub mod site_explorer;