Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 85 additions & 2 deletions crates/api-integration-tests/tests/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ use std::time::{self, Duration};
use ::carbide_utils::HostPortPair;
use ::machine_a_tron::{BmcMockRegistry, HostMachineHandle, MachineATronConfig, MachineConfig};
use api_test_helper::{
IntegrationTestEnvironment, domain, instance, machine, metrics, subnet, tenant, utils, vpc,
vpc_prefix,
IntegrationTestEnvironment, dhcp, domain, instance, machine, metrics, power_shelf, subnet,
switch, tenant, utils, vpc, vpc_prefix,
};
use bmc_mock::{HostHardwareType, ListenerOrAddress};
use eyre::ContextCompat;
Expand Down Expand Up @@ -192,6 +192,8 @@ async fn test_integration() -> eyre::Result<()> {
Ipv4Addr::new(172, 20, 0, 2),
)
.boxed(),
test_power_shelf_discovery(&test_env, &bmc_address_registry).boxed(),
test_switch_discovery(&test_env, &bmc_address_registry).boxed(),
]);

tokio::select! {
Expand Down Expand Up @@ -934,6 +936,87 @@ where
results.into_iter().try_collect()
}

/// End-to-end check that a LITE-ON power shelf BMC is discovered through the full
/// site-explorer flow.
///
/// LITE-ON power shelves do not expose vendor details in their Redfish service root,
/// so the explorer identifies them with an authenticated Chassis probe instead. A
/// brand-new shelf has no credentials in Vault yet, so the explorer falls back to the
/// credentials of the expected power shelf (the fix from PR #842). The test passes
/// once the expected power shelf is linked to a managed PowerShelf that can be
/// fetched back by id.
async fn test_power_shelf_discovery(
    test_env: &IntegrationTestEnvironment,
    bmc_mock_registry: &BmcMockRegistry,
) -> eyre::Result<()> {
    // Relay address the simulated DHCP discovery is reported on (the underlay relay).
    const UNDERLAY_RELAY: &str = "172.20.1.1";

    let api_addrs = &test_env.carbide_api_addrs;

    // `HostMachineInfo::new` hands this shelf a unique BMC MAC and serial number; copy
    // both out before the info is moved into the mock router.
    let shelf_info = bmc_mock::HostMachineInfo::new(HostHardwareType::LiteOnPowerShelf, vec![]);
    let bmc_mac = shelf_info.bmc_mac_address.to_string();
    let shelf_serial = shelf_info.serial.clone();
    let mock_host_id = format!("ps-test-{bmc_mac}");
    let (mock_router, _) = bmc_mock::test_support::host_bmc_router(shelf_info, mock_host_id);

    // The expected entry is what the explorer matches a discovered BMC against, keyed
    // by MAC address.
    power_shelf::add_expected(api_addrs, &bmc_mac, "root", "password", &shelf_serial).await?;
    tracing::info!(%bmc_mac, "Registered expected power shelf");

    // First announce the BMC via DHCP, then publish its mock under the assigned IP so
    // the shared bmc-mock server routes the explorer's probes to it.
    let assigned_ip = dhcp::discover(api_addrs, &bmc_mac, UNDERLAY_RELAY).await?;
    tracing::info!(%bmc_mac, %assigned_ip, "Power shelf BMC announced; registering mock");
    bmc_mock_registry.write().await.insert(assigned_ip, mock_router);

    // Block until the explorer has linked the expected shelf to a managed PowerShelf,
    // then make sure that PowerShelf can be looked up by its id.
    let power_shelf_id = power_shelf::wait_for_linked(api_addrs, &bmc_mac).await?;
    let fetched_id = power_shelf::find_by_id(api_addrs, &power_shelf_id).await?;
    assert_eq!(
        fetched_id, power_shelf_id,
        "FindPowerShelvesByIds should return the linked power shelf"
    );

    tracing::info!(%bmc_mac, %power_shelf_id, "Power shelf discovery test passed");
    Ok(())
}

/// End-to-end check that an NVIDIA switch BMC (ND5200_LD) is discovered through the
/// full site-explorer flow.
///
/// The test passes once the expected switch is linked to a managed Switch that can be
/// fetched back by id.
async fn test_switch_discovery(
    test_env: &IntegrationTestEnvironment,
    bmc_mock_registry: &BmcMockRegistry,
) -> eyre::Result<()> {
    // Relay address the simulated DHCP discovery is reported on (the underlay relay).
    const UNDERLAY_RELAY: &str = "172.20.1.1";

    let api_addrs = &test_env.carbide_api_addrs;

    // `HostMachineInfo::new` hands this switch a unique BMC MAC and serial number; copy
    // both out before the info is moved into the mock router.
    let switch_info =
        bmc_mock::HostMachineInfo::new(HostHardwareType::NvidiaSwitchNd5200Ld, vec![]);
    let bmc_mac = switch_info.bmc_mac_address.to_string();
    let switch_serial = switch_info.serial.clone();
    let mock_host_id = format!("sw-test-{bmc_mac}");
    let (mock_router, _) = bmc_mock::test_support::host_bmc_router(switch_info, mock_host_id);

    // The expected entry is what the explorer matches a discovered BMC against, keyed
    // by MAC address.
    switch::add_expected(api_addrs, &bmc_mac, "root", "password", &switch_serial).await?;
    tracing::info!(%bmc_mac, "Registered expected switch");

    // First announce the BMC via DHCP, then publish its mock under the assigned IP so
    // the shared bmc-mock server routes the explorer's probes to it.
    let assigned_ip = dhcp::discover(api_addrs, &bmc_mac, UNDERLAY_RELAY).await?;
    tracing::info!(%bmc_mac, %assigned_ip, "Switch BMC announced; registering mock");
    bmc_mock_registry.write().await.insert(assigned_ip, mock_router);

    // Block until the explorer has linked the expected switch to a managed Switch,
    // then make sure that Switch can be looked up by its id.
    let switch_id = switch::wait_for_linked(api_addrs, &bmc_mac).await?;
    let fetched_id = switch::find_by_id(api_addrs, &switch_id).await?;
    assert_eq!(
        fetched_id, switch_id,
        "FindSwitchesByIds should return the linked switch"
    );

    tracing::info!(%bmc_mac, %switch_id, "Switch discovery test passed");
    Ok(())
}

// Get the current number of rows in the dns_records view,
// which is expected to start at 0, and then progress, as
// the test continues.
Expand Down
5 changes: 5 additions & 0 deletions crates/api-test-helper/src/api_server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,11 @@ pub async fn start(
explorations_per_run = 90
create_machines = true
machines_created_per_run = 30
create_power_shelves = true
power_shelves_created_per_run = 5
explore_power_shelves_from_static_ip = false
create_switches = true
switches_created_per_run = 5
allow_zero_dpu_hosts = true
allow_proxy_to_unknown_host = false
{bmc_proxy_cfg}
Expand Down
44 changes: 44 additions & 0 deletions crates/api-test-helper/src/dhcp.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

use std::net::SocketAddr;

use eyre::ContextCompat;

use crate::grpcurl::grpcurl;

/// Simulate a DHCP discovery for `mac_address` arriving via `relay_address`.
///
/// Issues a `DiscoverDhcp` call and returns the `address` field of the reply as a
/// string. BMCs that are not driven by machine-a-tron (for example a power shelf or a
/// switch) use this to obtain an underlay address the site explorer can then probe,
/// mirroring how a real DHCP relay announces a freshly cabled BMC.
///
/// # Errors
///
/// Fails if the `DiscoverDhcp` call fails, if the reply is not valid JSON, or if the
/// reply carries no string `address` field (the raw reply is included in the error).
pub async fn discover(
    addrs: &[SocketAddr],
    mac_address: &str,
    relay_address: &str,
) -> eyre::Result<String> {
    let request = serde_json::json!({
        "mac_address": mac_address,
        "relay_address": relay_address,
    });
    let response = grpcurl(addrs, "DiscoverDhcp", Some(&request)).await?;
    let reply: serde_json::Value = serde_json::from_str(&response)?;

    // Indexing a missing key yields JSON null, so an absent address surfaces here as
    // `None` rather than as a panic.
    reply["address"]
        .as_str()
        .map(str::to_owned)
        .with_context(|| format!("DiscoverDhcp returned no address: {response}"))
}
3 changes: 3 additions & 0 deletions crates/api-test-helper/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,17 @@
*/

pub mod api_server;
pub mod dhcp;
pub mod domain;
pub mod grpcurl;
pub mod instance;
pub mod machine;
pub mod machine_a_tron;
pub mod metrics;
pub mod mock_rms;
pub mod power_shelf;
pub mod subnet;
pub mod switch;
pub mod tenant;
pub mod utils;
pub mod vault;
Expand Down
92 changes: 92 additions & 0 deletions crates/api-test-helper/src/power_shelf.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

use std::net::SocketAddr;
use std::time::{Duration, Instant};

use eyre::ContextCompat;
use tokio::time::sleep;

use crate::grpcurl::grpcurl;

/// Upper bound on how long to wait for the site explorer to discover a BMC and link
/// its expected entity to a managed one. The happy path takes only a few seconds (the
/// explorer runs on a one-second loop); the extra headroom is deliberate, given that
/// the equivalent machine runs are allowed 90 seconds.
const LINK_TIMEOUT: Duration = Duration::from_secs(60);

/// Register an expected power shelf via `AddExpectedPowerShelf`.
///
/// The site explorer matches a discovered BMC against this entry by
/// `bmc_mac_address`, then creates and links a managed PowerShelf. `serial_number` is
/// sent both as the shelf serial number and as the metadata name.
///
/// # Errors
///
/// Fails if the `AddExpectedPowerShelf` call fails.
pub async fn add_expected(
    addrs: &[SocketAddr],
    bmc_mac_address: &str,
    bmc_username: &str,
    bmc_password: &str,
    serial_number: &str,
) -> eyre::Result<()> {
    let request = serde_json::json!({
        "bmc_mac_address": bmc_mac_address,
        "bmc_username": bmc_username,
        "bmc_password": bmc_password,
        "shelf_serial_number": serial_number,
        "metadata": { "name": serial_number },
    });

    // Only success or failure matters here; the reply body is discarded.
    let _reply = grpcurl(addrs, "AddExpectedPowerShelf", Some(&request)).await?;
    Ok(())
}

/// Wait for the expected power shelf with `bmc_mac_address` to be linked to a managed
/// PowerShelf and return that PowerShelf's id.
///
/// Polls `GetAllExpectedPowerShelvesLinked` every two seconds. The MAC comparison is
/// ASCII case-insensitive.
///
/// # Errors
///
/// Fails if a poll fails or returns invalid JSON, or if no link has appeared once
/// [`LINK_TIMEOUT`] has elapsed.
pub async fn wait_for_linked(addrs: &[SocketAddr], bmc_mac_address: &str) -> eyre::Result<String> {
    let started = Instant::now();
    loop {
        let response = grpcurl(addrs, "GetAllExpectedPowerShelvesLinked", Some("{}")).await?;
        let listing: serde_json::Value = serde_json::from_str(&response)?;

        // First entry for our MAC that already carries a PowerShelf id; a missing or
        // non-array list simply yields nothing.
        let linked_id = listing["expectedPowerShelves"]
            .as_array()
            .into_iter()
            .flatten()
            .filter(|entry| {
                entry["bmcMacAddress"]
                    .as_str()
                    .is_some_and(|mac| mac.eq_ignore_ascii_case(bmc_mac_address))
            })
            .find_map(|entry| entry["powerShelfId"]["id"].as_str());

        if let Some(id) = linked_id {
            return Ok(id.to_string());
        }

        // The deadline is checked after polling, so at least one poll always happens.
        if started.elapsed() > LINK_TIMEOUT {
            eyre::bail!(
                "expected power shelf {bmc_mac_address} was not linked to a managed \
                 power shelf within {LINK_TIMEOUT:?}"
            );
        }
        sleep(Duration::from_secs(2)).await;
    }
}

/// Look up a managed power shelf by id via `FindPowerShelvesByIds` and return the id
/// of the first power shelf in the reply. Callers use this to confirm that a linked
/// PowerShelf is actually retrievable.
///
/// # Errors
///
/// Fails if the call fails, if the reply is not valid JSON, or if the reply contains
/// no power shelf with a string id (the raw reply is included in the error).
pub async fn find_by_id(addrs: &[SocketAddr], power_shelf_id: &str) -> eyre::Result<String> {
    let request = serde_json::json!({
        "power_shelf_ids": [{ "id": power_shelf_id }],
    });
    let response = grpcurl(addrs, "FindPowerShelvesByIds", Some(&request)).await?;
    let reply: serde_json::Value = serde_json::from_str(&response)?;

    // Indexing a missing key or element yields JSON null, so an empty result surfaces
    // here as `None` rather than as a panic.
    reply["powerShelves"][0]["id"]["id"]
        .as_str()
        .map(str::to_owned)
        .with_context(|| format!("FindPowerShelvesByIds returned no power shelf: {response}"))
}
92 changes: 92 additions & 0 deletions crates/api-test-helper/src/switch.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

use std::net::SocketAddr;
use std::time::{Duration, Instant};

use eyre::ContextCompat;
use tokio::time::sleep;

use crate::grpcurl::grpcurl;

/// Upper bound on how long to wait for the site explorer to discover a BMC and link
/// its expected entity to a managed one. The happy path takes only a few seconds (the
/// explorer runs on a one-second loop); the extra headroom is deliberate, given that
/// the equivalent machine runs are allowed 90 seconds.
const LINK_TIMEOUT: Duration = Duration::from_secs(60);

/// Register an expected switch via `AddExpectedSwitch`.
///
/// The site explorer matches a discovered BMC against this entry by
/// `bmc_mac_address`, then creates and links a managed Switch. `serial_number` is sent
/// both as the switch serial number and as the metadata name.
///
/// # Errors
///
/// Fails if the `AddExpectedSwitch` call fails.
pub async fn add_expected(
    addrs: &[SocketAddr],
    bmc_mac_address: &str,
    bmc_username: &str,
    bmc_password: &str,
    serial_number: &str,
) -> eyre::Result<()> {
    let request = serde_json::json!({
        "bmc_mac_address": bmc_mac_address,
        "bmc_username": bmc_username,
        "bmc_password": bmc_password,
        "switch_serial_number": serial_number,
        "metadata": { "name": serial_number },
    });

    // Only success or failure matters here; the reply body is discarded.
    let _reply = grpcurl(addrs, "AddExpectedSwitch", Some(&request)).await?;
    Ok(())
}

/// Wait for the expected switch with `bmc_mac_address` to be linked to a managed
/// Switch and return that Switch's id.
///
/// Polls `GetAllExpectedSwitchesLinked` every two seconds. The MAC comparison is ASCII
/// case-insensitive.
///
/// # Errors
///
/// Fails if a poll fails or returns invalid JSON, or if no link has appeared once
/// [`LINK_TIMEOUT`] has elapsed.
pub async fn wait_for_linked(addrs: &[SocketAddr], bmc_mac_address: &str) -> eyre::Result<String> {
    let started = Instant::now();
    loop {
        let response = grpcurl(addrs, "GetAllExpectedSwitchesLinked", Some("{}")).await?;
        let listing: serde_json::Value = serde_json::from_str(&response)?;

        // First entry for our MAC that already carries a Switch id; a missing or
        // non-array list simply yields nothing.
        let linked_id = listing["expectedSwitches"]
            .as_array()
            .into_iter()
            .flatten()
            .filter(|entry| {
                entry["bmcMacAddress"]
                    .as_str()
                    .is_some_and(|mac| mac.eq_ignore_ascii_case(bmc_mac_address))
            })
            .find_map(|entry| entry["switchId"]["id"].as_str());

        if let Some(id) = linked_id {
            return Ok(id.to_string());
        }

        // The deadline is checked after polling, so at least one poll always happens.
        if started.elapsed() > LINK_TIMEOUT {
            eyre::bail!(
                "expected switch {bmc_mac_address} was not linked to a managed \
                 switch within {LINK_TIMEOUT:?}"
            );
        }
        sleep(Duration::from_secs(2)).await;
    }
}

/// Look up a managed switch by id via `FindSwitchesByIds` and return the id of the
/// first switch in the reply. Callers use this to confirm that a linked Switch is
/// actually retrievable.
///
/// # Errors
///
/// Fails if the call fails, if the reply is not valid JSON, or if the reply contains
/// no switch with a string id (the raw reply is included in the error).
pub async fn find_by_id(addrs: &[SocketAddr], switch_id: &str) -> eyre::Result<String> {
    let request = serde_json::json!({
        "switch_ids": [{ "id": switch_id }],
    });
    let response = grpcurl(addrs, "FindSwitchesByIds", Some(&request)).await?;
    let reply: serde_json::Value = serde_json::from_str(&response)?;

    // Indexing a missing key or element yields JSON null, so an empty result surfaces
    // here as `None` rather than as a panic.
    reply["switches"][0]["id"]["id"]
        .as_str()
        .map(str::to_owned)
        .with_context(|| format!("FindSwitchesByIds returned no switch: {response}"))
}
20 changes: 20 additions & 0 deletions crates/bmc-mock/src/test_support/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,26 @@ async fn test_bmc((router, state): (axum::Router, BmcState)) -> TestBmcHandle {
}
}

/// Create the BMC mock router, together with its state, for a host-type machine.
///
/// The result is meant to be registered in a shared
/// [`CombinedServer`](crate::CombinedServer) registry under the BMC's IP address. As
/// with the `*_bmc` helpers below, the router uses no-op power callbacks and leaves
/// Redfish auth disabled.
///
/// Integration tests rely on this to drive discovery of non-machine BMCs -- such as
/// LITE-ON power shelves or NVIDIA switches -- through the real site explorer; in that
/// setup the test owns the shared registry and inserts the returned router itself.
pub fn host_bmc_router(
    host_info: HostMachineInfo,
    mat_host_id: String,
) -> (axum::Router, BmcState) {
    let machine = MachineInfo::Host(host_info);
    let power_callbacks = Arc::new(NoopCallbacks);
    // Final `machine_router` argument; `false` is what leaves Redfish auth disabled.
    let redfish_auth_enabled = false;

    machine_router(machine, power_callbacks, mat_host_id, redfish_auth_enabled)
}

pub async fn wiwynn_gb200_bmc() -> TestBmcHandle {
test_bmc(machine_router(
MachineInfo::Host(HostMachineInfo::new(
Expand Down
Loading