From 95cf3b319677daba6a0bb4256f639fdc51ffd8a5 Mon Sep 17 00:00:00 2001 From: Chet Nichols III Date: Wed, 27 May 2026 11:56:55 -0700 Subject: [PATCH] chore: add integration tests for expected power shelf and switch discovery These land as two new subtests of `test_integration`, running alongside the existing machine tests against the shared `carbide-api` + `site-explorer`. Each one registers an expected entity, simulates the BMC showing up via DHCP, stands up a mock BMC at the assigned IP, then waits for `site-explorer` to link it to a real managed `PowerShelf` / `Switch` (and confirms we can fetch it back by ID). The power shelf test includes exercising the work from #842, ensuring "*service root vendor not populated*" happens (and is logged) right before it falls back and links successfully. Notable changes in here: - Flipped on `create_power_shelves` / `create_switches` in the test `site-explorer` config -- they default off, so the explorer wasn't creating either of them in tests (took me a sec to be like uhhh...). - New `test_support::host_bmc_router` in `bmc-mock` so a test can hand a mock BMC router to the shared registry, without making the internal `NoopCallbacks` public. 
- New `power_shelf`, `switch`, and `dhcp` helpers in `api-test-helper`, which are built on the existing `grpcurl` helper just like `tenant`/`vpc`/`subnet` Signed-off-by: Chet Nichols III --- crates/api-integration-tests/tests/lib.rs | 87 ++++++++++++++++++++- crates/api-test-helper/src/api_server.rs | 5 ++ crates/api-test-helper/src/dhcp.rs | 44 +++++++++++ crates/api-test-helper/src/lib.rs | 3 + crates/api-test-helper/src/power_shelf.rs | 92 +++++++++++++++++++++++ crates/api-test-helper/src/switch.rs | 92 +++++++++++++++++++++++ crates/bmc-mock/src/test_support/mod.rs | 20 +++++ 7 files changed, 341 insertions(+), 2 deletions(-) create mode 100644 crates/api-test-helper/src/dhcp.rs create mode 100644 crates/api-test-helper/src/power_shelf.rs create mode 100644 crates/api-test-helper/src/switch.rs diff --git a/crates/api-integration-tests/tests/lib.rs b/crates/api-integration-tests/tests/lib.rs index 26b2ed668d..20581f676e 100644 --- a/crates/api-integration-tests/tests/lib.rs +++ b/crates/api-integration-tests/tests/lib.rs @@ -24,8 +24,8 @@ use std::time::{self, Duration}; use ::carbide_utils::HostPortPair; use ::machine_a_tron::{BmcMockRegistry, HostMachineHandle, MachineATronConfig, MachineConfig}; use api_test_helper::{ - IntegrationTestEnvironment, domain, instance, machine, metrics, subnet, tenant, utils, vpc, - vpc_prefix, + IntegrationTestEnvironment, dhcp, domain, instance, machine, metrics, power_shelf, subnet, + switch, tenant, utils, vpc, vpc_prefix, }; use bmc_mock::{HostHardwareType, ListenerOrAddress}; use eyre::ContextCompat; @@ -192,6 +192,8 @@ async fn test_integration() -> eyre::Result<()> { Ipv4Addr::new(172, 20, 0, 2), ) .boxed(), + test_power_shelf_discovery(&test_env, &bmc_address_registry).boxed(), + test_switch_discovery(&test_env, &bmc_address_registry).boxed(), ]); tokio::select! { @@ -934,6 +936,87 @@ where results.into_iter().try_collect() } +/// Discover a LITE-ON power shelf BMC through the full site-explorer flow. 
+/// +/// LITE-ON power shelves don't expose vendor details in their Redfish service root, +/// so the explorer has to fall back to an authenticated Chassis probe to identify +/// them. On a brand-new shelf there are no credentials in Vault yet, so it falls back +/// to the expected power shelf's credentials (the fix from PR #842). We assert the +/// expected power shelf ends up linked to a managed PowerShelf that we can fetch back. +async fn test_power_shelf_discovery( + test_env: &IntegrationTestEnvironment, + bmc_mock_registry: &BmcMockRegistry, +) -> eyre::Result<()> { + let addrs = &test_env.carbide_api_addrs; + + // HostMachineInfo::new assigns this shelf a unique BMC MAC and serial number. + let host_info = bmc_mock::HostMachineInfo::new(HostHardwareType::LiteOnPowerShelf, vec![]); + let bmc_mac = host_info.bmc_mac_address.to_string(); + let serial = host_info.serial.clone(); + let (router, _) = + bmc_mock::test_support::host_bmc_router(host_info, format!("ps-test-{bmc_mac}")); + + // The explorer matches a discovered BMC to this expected entry by MAC address. + power_shelf::add_expected(addrs, &bmc_mac, "root", "password", &serial).await?; + tracing::info!(%bmc_mac, "Registered expected power shelf"); + + // Announce the BMC via DHCP on the underlay relay, then publish its mock at the + // assigned IP so the shared bmc-mock server routes the explorer's probes to it. + let assigned_ip = dhcp::discover(addrs, &bmc_mac, "172.20.1.1").await?; + tracing::info!(%bmc_mac, %assigned_ip, "Power shelf BMC announced; registering mock"); + bmc_mock_registry.write().await.insert(assigned_ip, router); + + // Wait for the explorer to link the expected shelf to a managed PowerShelf, then + // confirm that PowerShelf is retrievable by its id. 
+ let power_shelf_id = power_shelf::wait_for_linked(addrs, &bmc_mac).await?; + let found_id = power_shelf::find_by_id(addrs, &power_shelf_id).await?; + assert_eq!( + found_id, power_shelf_id, + "FindPowerShelvesByIds should return the linked power shelf" + ); + + tracing::info!(%bmc_mac, %power_shelf_id, "Power shelf discovery test passed"); + Ok(()) +} + +/// Discover an NVIDIA switch BMC (ND5200_LD) through the full site-explorer flow, +/// asserting the expected switch ends up linked to a managed Switch we can fetch back. +async fn test_switch_discovery( + test_env: &IntegrationTestEnvironment, + bmc_mock_registry: &BmcMockRegistry, +) -> eyre::Result<()> { + let addrs = &test_env.carbide_api_addrs; + + // HostMachineInfo::new assigns this switch a unique BMC MAC and serial number. + let host_info = bmc_mock::HostMachineInfo::new(HostHardwareType::NvidiaSwitchNd5200Ld, vec![]); + let bmc_mac = host_info.bmc_mac_address.to_string(); + let serial = host_info.serial.clone(); + let (router, _) = + bmc_mock::test_support::host_bmc_router(host_info, format!("sw-test-{bmc_mac}")); + + // The explorer matches a discovered BMC to this expected entry by MAC address. + switch::add_expected(addrs, &bmc_mac, "root", "password", &serial).await?; + tracing::info!(%bmc_mac, "Registered expected switch"); + + // Announce the BMC via DHCP on the underlay relay, then publish its mock at the + // assigned IP so the shared bmc-mock server routes the explorer's probes to it. + let assigned_ip = dhcp::discover(addrs, &bmc_mac, "172.20.1.1").await?; + tracing::info!(%bmc_mac, %assigned_ip, "Switch BMC announced; registering mock"); + bmc_mock_registry.write().await.insert(assigned_ip, router); + + // Wait for the explorer to link the expected switch to a managed Switch, then + // confirm that Switch is retrievable by its id. 
+ let switch_id = switch::wait_for_linked(addrs, &bmc_mac).await?; + let found_id = switch::find_by_id(addrs, &switch_id).await?; + assert_eq!( + found_id, switch_id, + "FindSwitchesByIds should return the linked switch" + ); + + tracing::info!(%bmc_mac, %switch_id, "Switch discovery test passed"); + Ok(()) +} + // Get the current number of rows in the dns_records view, // which is expected to start at 0, and then progress, as // the test continues. diff --git a/crates/api-test-helper/src/api_server.rs b/crates/api-test-helper/src/api_server.rs index 8229112fe2..07899ff215 100644 --- a/crates/api-test-helper/src/api_server.rs +++ b/crates/api-test-helper/src/api_server.rs @@ -199,6 +199,11 @@ pub async fn start( explorations_per_run = 90 create_machines = true machines_created_per_run = 30 + create_power_shelves = true + power_shelves_created_per_run = 5 + explore_power_shelves_from_static_ip = false + create_switches = true + switches_created_per_run = 5 allow_zero_dpu_hosts = true allow_proxy_to_unknown_host = false {bmc_proxy_cfg} diff --git a/crates/api-test-helper/src/dhcp.rs b/crates/api-test-helper/src/dhcp.rs new file mode 100644 index 0000000000..0108b31bc8 --- /dev/null +++ b/crates/api-test-helper/src/dhcp.rs @@ -0,0 +1,44 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +use std::net::SocketAddr; + +use eyre::ContextCompat; + +use crate::grpcurl::grpcurl; + +/// Simulate a DHCP discovery for `mac_address` arriving on `relay_address`, and return +/// the IP address NICo assigned to it. This is how a BMC that isn't driven by +/// machine-a-tron (e.g. a power shelf or switch) gets an underlay address that the +/// site explorer can then probe -- the same way a real DHCP relay announces a freshly +/// cabled BMC. +pub async fn discover( + addrs: &[SocketAddr], + mac_address: &str, + relay_address: &str, +) -> eyre::Result<String> { + let data = serde_json::json!({ + "mac_address": mac_address, + "relay_address": relay_address, + }); + let response = grpcurl(addrs, "DiscoverDhcp", Some(&data)).await?; + let record: serde_json::Value = serde_json::from_str(&response)?; + let address = record["address"] + .as_str() + .with_context(|| format!("DiscoverDhcp returned no address: {response}"))?; + Ok(address.to_string()) +} diff --git a/crates/api-test-helper/src/lib.rs b/crates/api-test-helper/src/lib.rs index d1d1270fae..7c049a1be7 100644 --- a/crates/api-test-helper/src/lib.rs +++ b/crates/api-test-helper/src/lib.rs @@ -16,6 +16,7 @@ */ pub mod api_server; +pub mod dhcp; pub mod domain; pub mod grpcurl; pub mod instance; @@ -23,7 +24,9 @@ pub mod machine; pub mod machine_a_tron; pub mod metrics; pub mod mock_rms; +pub mod power_shelf; pub mod subnet; +pub mod switch; pub mod tenant; pub mod utils; pub mod vault; diff --git a/crates/api-test-helper/src/power_shelf.rs b/crates/api-test-helper/src/power_shelf.rs new file mode 100644 index 0000000000..93cd136ea3 --- /dev/null +++ b/crates/api-test-helper/src/power_shelf.rs @@ -0,0 +1,92 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use std::net::SocketAddr; +use std::time::{Duration, Instant}; + +use eyre::ContextCompat; +use tokio::time::sleep; + +use crate::grpcurl::grpcurl; + +/// Upper bound on waiting for the site explorer to discover a BMC and link its +/// expected entity to a managed one. The happy path is a few seconds (the explorer +/// runs on a one-second loop), but giving it some extra time, since we give 90 +/// seconds for equivalent machine runs. +const LINK_TIMEOUT: Duration = Duration::from_secs(60); + +/// Register an expected power shelf. The site explorer matches a discovered BMC to +/// this entry by `bmc_mac_address`, then creates and links a managed PowerShelf. +pub async fn add_expected( + addrs: &[SocketAddr], + bmc_mac_address: &str, + bmc_username: &str, + bmc_password: &str, + serial_number: &str, +) -> eyre::Result<()> { + let data = serde_json::json!({ + "bmc_mac_address": bmc_mac_address, + "bmc_username": bmc_username, + "bmc_password": bmc_password, + "shelf_serial_number": serial_number, + "metadata": { "name": serial_number }, + }); + grpcurl(addrs, "AddExpectedPowerShelf", Some(&data)).await?; + Ok(()) +} + +/// Poll until the expected power shelf identified by `bmc_mac_address` is linked to a +/// managed PowerShelf, returning that PowerShelf's id. Fails if it is not linked +/// within [`LINK_TIMEOUT`]. 
+pub async fn wait_for_linked(addrs: &[SocketAddr], bmc_mac_address: &str) -> eyre::Result<String> { + let start = Instant::now(); + loop { + let response = grpcurl(addrs, "GetAllExpectedPowerShelvesLinked", Some("{}")).await?; + let linked: serde_json::Value = serde_json::from_str(&response)?; + if let Some(entries) = linked["expectedPowerShelves"].as_array() { + for entry in entries { + let matches_mac = entry["bmcMacAddress"] + .as_str() + .is_some_and(|m| m.eq_ignore_ascii_case(bmc_mac_address)); + if matches_mac && let Some(id) = entry["powerShelfId"]["id"].as_str() { + return Ok(id.to_string()); + } + } + } + if start.elapsed() > LINK_TIMEOUT { + eyre::bail!( + "expected power shelf {bmc_mac_address} was not linked to a managed \ + power shelf within {LINK_TIMEOUT:?}" + ); + } + sleep(Duration::from_secs(2)).await; + } +} + +/// Fetch a managed power shelf by id via FindPowerShelvesByIds, returning the id the +/// API echoes back. Used to confirm a linked PowerShelf is actually retrievable. +pub async fn find_by_id(addrs: &[SocketAddr], power_shelf_id: &str) -> eyre::Result<String> { + let data = serde_json::json!({ + "power_shelf_ids": [{ "id": power_shelf_id }], + }); + let response = grpcurl(addrs, "FindPowerShelvesByIds", Some(&data)).await?; + let list: serde_json::Value = serde_json::from_str(&response)?; + let id = list["powerShelves"][0]["id"]["id"] + .as_str() + .with_context(|| format!("FindPowerShelvesByIds returned no power shelf: {response}"))?; + Ok(id.to_string()) +} diff --git a/crates/api-test-helper/src/switch.rs b/crates/api-test-helper/src/switch.rs new file mode 100644 index 0000000000..c5f306ee61 --- /dev/null +++ b/crates/api-test-helper/src/switch.rs @@ -0,0 +1,92 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use std::net::SocketAddr; +use std::time::{Duration, Instant}; + +use eyre::ContextCompat; +use tokio::time::sleep; + +use crate::grpcurl::grpcurl; + +/// Upper bound on waiting for the site explorer to discover a BMC and link its +/// expected entity to a managed one. The happy path is a few seconds (the explorer +/// runs on a one-second loop), but giving it some extra time, since we give 90 +/// seconds for equivalent machine runs. +const LINK_TIMEOUT: Duration = Duration::from_secs(60); + +/// Register an expected switch. The site explorer matches a discovered BMC to this +/// entry by `bmc_mac_address`, then creates and links a managed Switch. +pub async fn add_expected( + addrs: &[SocketAddr], + bmc_mac_address: &str, + bmc_username: &str, + bmc_password: &str, + serial_number: &str, +) -> eyre::Result<()> { + let data = serde_json::json!({ + "bmc_mac_address": bmc_mac_address, + "bmc_username": bmc_username, + "bmc_password": bmc_password, + "switch_serial_number": serial_number, + "metadata": { "name": serial_number }, + }); + grpcurl(addrs, "AddExpectedSwitch", Some(&data)).await?; + Ok(()) +} + +/// Poll until the expected switch identified by `bmc_mac_address` is linked to a +/// managed Switch, returning that Switch's id. Fails if it is not linked within +/// [`LINK_TIMEOUT`]. 
+pub async fn wait_for_linked(addrs: &[SocketAddr], bmc_mac_address: &str) -> eyre::Result<String> { + let start = Instant::now(); + loop { + let response = grpcurl(addrs, "GetAllExpectedSwitchesLinked", Some("{}")).await?; + let linked: serde_json::Value = serde_json::from_str(&response)?; + if let Some(entries) = linked["expectedSwitches"].as_array() { + for entry in entries { + let matches_mac = entry["bmcMacAddress"] + .as_str() + .is_some_and(|m| m.eq_ignore_ascii_case(bmc_mac_address)); + if matches_mac && let Some(id) = entry["switchId"]["id"].as_str() { + return Ok(id.to_string()); + } + } + } + if start.elapsed() > LINK_TIMEOUT { + eyre::bail!( + "expected switch {bmc_mac_address} was not linked to a managed \ + switch within {LINK_TIMEOUT:?}" + ); + } + sleep(Duration::from_secs(2)).await; + } +} + +/// Fetch a managed switch by id via FindSwitchesByIds, returning the id the API +/// echoes back. Used to confirm a linked Switch is actually retrievable. +pub async fn find_by_id(addrs: &[SocketAddr], switch_id: &str) -> eyre::Result<String> { + let data = serde_json::json!({ + "switch_ids": [{ "id": switch_id }], + }); + let response = grpcurl(addrs, "FindSwitchesByIds", Some(&data)).await?; + let list: serde_json::Value = serde_json::from_str(&response)?; + let id = list["switches"][0]["id"]["id"] + .as_str() + .with_context(|| format!("FindSwitchesByIds returned no switch: {response}"))?; + Ok(id.to_string()) +} diff --git a/crates/bmc-mock/src/test_support/mod.rs b/crates/bmc-mock/src/test_support/mod.rs index c73fa26f61..9701308d5c 100644 --- a/crates/bmc-mock/src/test_support/mod.rs +++ b/crates/bmc-mock/src/test_support/mod.rs @@ -71,6 +71,26 @@ async fn test_bmc((router, state): (axum::Router, BmcState)) -> TestBmcHandle { } } +/// Build a BMC mock router (and its state) for a host-type machine, ready to be +/// registered in a shared [`CombinedServer`](crate::CombinedServer) registry under +/// the BMC's IP address. 
Uses no-op power callbacks and leaves Redfish auth disabled, +/// matching the `*_bmc` helpers below. +/// +/// Integration tests use this to drive discovery of non-machine BMCs -- such as +/// LITE-ON power shelves or NVIDIA switches -- through the real site explorer, where +/// the test owns the shared registry and inserts the returned router itself. +pub fn host_bmc_router( + host_info: HostMachineInfo, + mat_host_id: String, +) -> (axum::Router, BmcState) { + machine_router( + MachineInfo::Host(host_info), + Arc::new(NoopCallbacks), + mat_host_id, + false, + ) +} + pub async fn wiwynn_gb200_bmc() -> TestBmcHandle { test_bmc(machine_router( MachineInfo::Host(HostMachineInfo::new(