From 295dd773560e4dc35a1d0c618a4e902ff3138cda Mon Sep 17 00:00:00 2001 From: Krish Dandiwala Date: Tue, 26 May 2026 14:05:51 +0000 Subject: [PATCH 1/7] feat: initial work Signed-off-by: Krish Dandiwala --- crates/api-model/src/machine/mod.rs | 47 +- .../src/state_controller/machine/handler.rs | 452 +++------------ .../machine/handler/bios_config.rs | 536 ++++++++++++++++++ crates/api/src/state_controller/machine/io.rs | 2 +- .../api/src/tests/common/api_fixtures/mod.rs | 2 +- crates/api/src/tests/machine_history.rs | 2 +- crates/api/src/tests/machine_states.rs | 233 +++++++- crates/redfish/src/libredfish/test_support.rs | 7 +- .../state_machines/managedhost.md | 21 +- 9 files changed, 905 insertions(+), 397 deletions(-) create mode 100644 crates/api/src/state_controller/machine/handler/bios_config.rs diff --git a/crates/api-model/src/machine/mod.rs b/crates/api-model/src/machine/mod.rs index 1442cbbfa0..bb67568271 100644 --- a/crates/api-model/src/machine/mod.rs +++ b/crates/api-model/src/machine/mod.rs @@ -1660,7 +1660,10 @@ pub enum MachineState { WaitingForBiosJob { bios_config_info: BiosConfigInfo, }, - PollingBiosSetup, + PollingBiosSetup { + #[serde(default)] + retry_count: u32, + }, SetBootOrder { set_boot_order_info: Option, }, @@ -1716,6 +1719,10 @@ pub enum UefiSetupState { /// Tracks progress waiting for the Dell BIOS config job (from machine_setup PATCH) to complete /// before configuring boot order. Same pattern as SetBootOrderInfo / SetBootOrderState. +/// +/// `bios_job_id` is `Some` while polling a vendor BIOS job (e.g. Dell). `None` only during +/// `HandleBiosJobFailure` recovery from stuck PollingBiosSetup; non-Dell hosts reboot in +/// `configure_host_bios` and never enter job-polling substates. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] #[serde(rename_all = "lowercase")] pub struct BiosConfigInfo { @@ -1929,7 +1936,10 @@ pub enum HostPlatformConfigurationState { WaitingForBiosJob { bios_config_info: BiosConfigInfo, }, - PollingBiosSetup, + PollingBiosSetup { + #[serde(default)] + retry_count: u32, + }, SetBootOrder { set_boot_order_info: SetBootOrderInfo, }, @@ -2867,7 +2877,38 @@ mod tests { assert_eq!( deserialized, ManagedHostState::HostInit { - machine_state: MachineState::PollingBiosSetup, + machine_state: MachineState::PollingBiosSetup { retry_count: 0 }, + } + ); + } + + #[test] + fn test_json_deserialize_polling_bios_setup_with_retry_count() { + let serialized = + r#"{"state":"hostinit","machine_state":{"state":"pollingbiossetup","retry_count":2}}"#; + let deserialized: ManagedHostState = serde_json::from_str(serialized).unwrap(); + + assert_eq!( + deserialized, + ManagedHostState::HostInit { + machine_state: MachineState::PollingBiosSetup { retry_count: 2 }, + } + ); + } + + #[test] + fn test_json_deserialize_host_platform_configuration_polling_bios_setup_legacy() { + let serialized = r#"{"state":"assigned","instance_state":{"state":"hostplatformconfiguration","platform_config_state":{"state":"pollingbiossetup"}}}"#; + let deserialized: ManagedHostState = serde_json::from_str(serialized).unwrap(); + + assert_eq!( + deserialized, + ManagedHostState::Assigned { + instance_state: InstanceState::HostPlatformConfiguration { + platform_config_state: HostPlatformConfigurationState::PollingBiosSetup { + retry_count: 0, + }, + }, } ); } diff --git a/crates/api/src/state_controller/machine/handler.rs b/crates/api/src/state_controller/machine/handler.rs index 91ac95eb93..da6e6547da 100644 --- a/crates/api/src/state_controller/machine/handler.rs +++ b/crates/api/src/state_controller/machine/handler.rs @@ -68,18 +68,17 @@ use model::machine::LockdownMode::{self, Enable}; use model::machine::infiniband::{IbConfigNotSyncedReason, ib_config_synced}; use model::machine::nvlink::nvlink_config_synced; use model::machine::{ - AttestationMode, BiosConfigInfo, BiosConfigState, BomValidating, BomValidatingContext, - CleanupContext, CleanupState, CreateBossVolumeContext, CreateBossVolumeState, - DpuDiscoveringState, DpuInitNextStateResolver, DpuInitState, FailureCause, FailureDetails, - FailureSource, HostPlatformConfigurationState, HostReprovisionState, InitialResetPhase, - InstallDpuOsState, InstanceNextStateResolver, InstanceState, LockdownInfo, LockdownState, - Machine, MachineLastRebootRequested, MachineLastRebootRequestedMode, MachineNextStateResolver, - MachineState, ManagedHostState, ManagedHostStateSnapshot, MeasuringState, - NetworkConfigUpdateState, NextStateBFBSupport, PerformPowerOperation, PowerDrainState, - PowerState, ReprovisionState, RetryInfo, SecureEraseBossContext, SecureEraseBossState, - SetBootOrderInfo, SetBootOrderState, SetSecureBootState, SpdmMeasuringState, StateMachineArea, - UefiSetupInfo, UefiSetupState, UnlockHostState, ValidationState, - dpf_based_dpu_provisioning_possible, get_display_ids, + AttestationMode, BomValidating, BomValidatingContext, CleanupContext, CleanupState, + CreateBossVolumeContext, CreateBossVolumeState, DpuDiscoveringState, DpuInitNextStateResolver, + DpuInitState, FailureCause, FailureDetails, FailureSource, HostPlatformConfigurationState, + HostReprovisionState, InitialResetPhase, InstallDpuOsState, InstanceNextStateResolver, + InstanceState, LockdownInfo, LockdownState, Machine, MachineLastRebootRequested, + MachineLastRebootRequestedMode, MachineNextStateResolver, MachineState, ManagedHostState, + ManagedHostStateSnapshot, MeasuringState, NetworkConfigUpdateState, NextStateBFBSupport, + PerformPowerOperation, PowerDrainState, PowerState, ReprovisionState, RetryInfo, + SecureEraseBossContext, SecureEraseBossState, SetBootOrderInfo, SetBootOrderState, + SetSecureBootState, SpdmMeasuringState, StateMachineArea, UefiSetupInfo, UefiSetupState, + UnlockHostState, ValidationState, dpf_based_dpu_provisioning_possible, get_display_ids, }; use model::power_manager::PowerHandlingOutcome; use model::resource_pool::common::CommonPools; @@ -111,11 +110,16 @@ use crate::state_controller::state_handler::{ }; mod attestation; +mod bios_config; mod dpf; mod helpers; mod machine_validation; mod power; mod sku; +use bios_config::{ + BiosConfigJobAdvanceOutcome, BiosConfigOutcome, PollingBiosSetupOutcome, + advance_bios_config_job, advance_polling_bios_setup, configure_host_bios, +}; use helpers::{ DpuDiscoveringStateHelper, DpuInitStateHelper, ManagedHostStateHelper, NextState, ReprovisionStateHelper, all_equal, @@ -4178,17 +4182,6 @@ pub struct RebootStatus { status: String, // what we did or are waiting for } -/// Outcome of configure_host_bios function. -enum BiosConfigOutcome { - Done, - WaitingForReboot(String), - /// Dell BIOS PATCH returned a job ID; wait for it to complete before boot order. - WaitingForBiosJob(BiosConfigInfo), -} - -/// Max configure_host_bios retry cycles through HandleBiosJobFailure recovery (matches boot-order retry budget). -const MAX_BIOS_CONFIG_RETRIES: u32 = 3; - /// Outcome of set_host_boot_order function. enum SetBootOrderOutcome { Continue(SetBootOrderInfo), @@ -4877,7 +4870,9 @@ impl StateHandler for HostMachineStateHandler { { BiosConfigOutcome::Done => Ok(StateHandlerOutcome::transition( ManagedHostState::HostInit { - machine_state: MachineState::PollingBiosSetup, + machine_state: MachineState::PollingBiosSetup { + retry_count: *retry_count, + }, }, )), BiosConfigOutcome::WaitingForBiosJob(bios_config_info) => Ok( @@ -4910,11 +4905,14 @@ impl StateHandler for HostMachineStateHandler { }, }), ), - BiosConfigJobAdvanceOutcome::Done => Ok(StateHandlerOutcome::transition( - ManagedHostState::HostInit { - machine_state: MachineState::PollingBiosSetup, - }, - )), + BiosConfigJobAdvanceOutcome::Done + | BiosConfigJobAdvanceOutcome::DeferToPollingBiosSetup => Ok( + StateHandlerOutcome::transition(ManagedHostState::HostInit { + machine_state: MachineState::PollingBiosSetup { + retry_count: bios_config_info.retry_count, + }, + }), + ), BiosConfigJobAdvanceOutcome::RetryPlatformConfiguration { retry_count } => { Ok(StateHandlerOutcome::transition( ManagedHostState::HostInit { @@ -4929,7 +4927,7 @@ impl StateHandler for HostMachineStateHandler { } } } - MachineState::PollingBiosSetup => { + MachineState::PollingBiosSetup { retry_count } => { let next_state = ManagedHostState::HostInit { machine_state: MachineState::SetBootOrder { set_boot_order_info: Some(SetBootOrderInfo { @@ -4945,34 +4943,23 @@ impl StateHandler for HostMachineStateHandler { .create_redfish_client_from_machine(&mh_snapshot.host_snapshot) .await?; - let boot_interface_mac = - mh_snapshot.boot_interface_mac().map(|m| m.to_string()); - - match redfish_client - .is_bios_setup(boot_interface_mac.as_deref()) - .await + match advance_polling_bios_setup( + redfish_client.as_ref(), + mh_snapshot, + *retry_count, + ) + .await? { - Ok(true) => { - tracing::info!( - machine_id = %mh_snapshot.host_snapshot.id, - "BIOS setup verified successfully" - ); + PollingBiosSetupOutcome::Verified => { Ok(StateHandlerOutcome::transition(next_state)) } - Ok(false) => Ok(StateHandlerOutcome::wait( - "Polling BIOS setup status, waiting for settings to be applied" - .to_string(), - )), - Err(e) => { - tracing::warn!( - machine_id = %mh_snapshot.host_snapshot.id, - error = %e, - "Failed to check BIOS setup status, will retry" - ); - Ok(StateHandlerOutcome::wait(format!( - "Failed to check BIOS setup status: {}. Will retry.", - e - ))) + PollingBiosSetupOutcome::EnterRecovery(bios_config_info) => Ok( + StateHandlerOutcome::transition(ManagedHostState::HostInit { + machine_state: MachineState::WaitingForBiosJob { bios_config_info }, + }), + ), + PollingBiosSetupOutcome::Wait(reason) => { + Ok(StateHandlerOutcome::wait(reason)) } } } @@ -9437,7 +9424,7 @@ fn can_restart_reprovision(dpu_snapshots: &[Machine], version: ConfigVersion) -> /// TODO(ken): This is a temporary workaround for work-in-progress on zero-DPU support (August 2024) /// The way we should do this going forward is to plumb the actual non-DPU MAC address we want to /// boot from, instead of special-casing NoDpu errors. -async fn call_machine_setup_and_handle_no_dpu_error( +pub(super) async fn call_machine_setup_and_handle_no_dpu_error( redfish_client: &dyn Redfish, boot_interface_mac: Option<&str>, expected_dpu_count: usize, @@ -9913,7 +9900,9 @@ async fn handle_instance_host_platform_config( ) .await? { - BiosConfigOutcome::Done => HostPlatformConfigurationState::PollingBiosSetup, + BiosConfigOutcome::Done => { + HostPlatformConfigurationState::PollingBiosSetup { retry_count } + } BiosConfigOutcome::WaitingForBiosJob(bios_config_info) => { HostPlatformConfigurationState::WaitingForBiosJob { bios_config_info } } @@ -9943,8 +9932,11 @@ async fn handle_instance_host_platform_config( bios_config_info: updated, } } - BiosConfigJobAdvanceOutcome::Done => { - HostPlatformConfigurationState::PollingBiosSetup + BiosConfigJobAdvanceOutcome::Done + | BiosConfigJobAdvanceOutcome::DeferToPollingBiosSetup => { + HostPlatformConfigurationState::PollingBiosSetup { + retry_count: bios_config_info.retry_count, + } } BiosConfigJobAdvanceOutcome::RetryPlatformConfiguration { retry_count: next_count, @@ -9964,7 +9956,7 @@ async fn handle_instance_host_platform_config( }, )); } - HostPlatformConfigurationState::PollingBiosSetup => { + HostPlatformConfigurationState::PollingBiosSetup { retry_count } => { let next_instance_state = InstanceState::HostPlatformConfiguration { platform_config_state: HostPlatformConfigurationState::SetBootOrder { set_boot_order_info: SetBootOrderInfo { @@ -9975,34 +9967,24 @@ async fn handle_instance_host_platform_config( }, }; - let boot_interface_mac = mh_snapshot.boot_interface_mac().map(|m| m.to_string()); - - match redfish_client - .is_bios_setup(boot_interface_mac.as_deref()) - .await + match advance_polling_bios_setup(redfish_client.as_ref(), mh_snapshot, retry_count) + .await? { - Ok(true) => { - tracing::info!( - machine_id = %mh_snapshot.host_snapshot.id, - "BIOS setup verified successfully" - ); - next_instance_state - } - Ok(false) => { - return Ok(StateHandlerOutcome::wait( - "Polling BIOS setup status, waiting for settings to be applied".to_string(), + PollingBiosSetupOutcome::Verified => next_instance_state, + PollingBiosSetupOutcome::EnterRecovery(bios_config_info) => { + return Ok(StateHandlerOutcome::transition( + ManagedHostState::Assigned { + instance_state: InstanceState::HostPlatformConfiguration { + platform_config_state: + HostPlatformConfigurationState::WaitingForBiosJob { + bios_config_info, + }, + }, + }, )); } - Err(e) => { - tracing::warn!( - machine_id = %mh_snapshot.host_snapshot.id, - error = %e, - "Failed to check BIOS setup status, will retry" - ); - return Ok(StateHandlerOutcome::wait(format!( - "Failed to check BIOS setup status: {}. Will retry.", - e - ))); + PollingBiosSetupOutcome::Wait(reason) => { + return Ok(StateHandlerOutcome::wait(reason)); } } } @@ -10055,306 +10037,6 @@ async fn handle_instance_host_platform_config( Ok(StateHandlerOutcome::transition(next_state)) } -async fn configure_host_bios( - ctx: &mut StateHandlerContext<'_, MachineStateHandlerContextObjects>, - reachability_params: &ReachabilityParams, - redfish_client: &dyn Redfish, - mh_snapshot: &ManagedHostStateSnapshot, - retry_count: u32, -) -> Result { - let boot_interface_mac = mh_snapshot.boot_interface_mac().map(|m| m.to_string()); - - let bios_job_id = match call_machine_setup_and_handle_no_dpu_error( - redfish_client, - boot_interface_mac.as_deref(), - mh_snapshot.host_snapshot.associated_dpu_machine_ids().len(), - &ctx.services.site_config, - ) - .await - { - Err(e) => { - tracing::warn!( - "redfish machine_setup failed for {}, potentially due to known race condition between UEFI POST and BMC. triggering force-restart if needed. err: {}", - mh_snapshot.host_snapshot.id, - e - ); - - // if machine_setup failed, reboot to potentially work around - // a known race between the DPU UEFI and the BMC, where if - // the BMC is not up when DPU UEFI runs, then Attributes might - // not come through. The fix is to force-restart the DPU to - // re-POST. - // - // As of July 2024, Josh Price said there's an NBU FR to fix - // this, but it wasn't target to a release yet. - let reboot_status = if mh_snapshot.host_snapshot.last_reboot_requested.is_none() { - handler_host_power_control(mh_snapshot, ctx, SystemPowerControl::ForceRestart) - .await?; - - RebootStatus { - increase_retry_count: true, - status: "Restarted host".to_string(), - } - } else { - trigger_reboot_if_needed( - &mh_snapshot.host_snapshot, - mh_snapshot, - None, - reachability_params, - ctx, - ) - .await? - }; - return Ok(BiosConfigOutcome::WaitingForReboot(format!( - "redfish machine_setup failed: {e}; triggered host reboot: {reboot_status:#?}" - ))); - } - Ok(jid) => jid, - }; - - if let Some(job_id) = &bios_job_id { - return Ok(BiosConfigOutcome::WaitingForBiosJob(BiosConfigInfo { - bios_job_id: Some(job_id.clone()), - bios_config_state: BiosConfigState::WaitForBiosJobScheduled, - retry_count, - })); - } - - // No job to wait for (non-Dell or vendor that doesn't return job); reboot to apply and continue. - handler_host_power_control(mh_snapshot, ctx, SystemPowerControl::ForceRestart).await?; - Ok(BiosConfigOutcome::Done) -} - -/// Outcome of advancing the BIOS config job state machine (Dell: wait for BIOS PATCH job before boot order). -enum BiosConfigJobAdvanceOutcome { - Continue(BiosConfigInfo), - Done, - /// Same state, but wait (e.g. waiting for power down or BMC to come back). - Wait(String), - /// After successful power/BMC recovery from a failed BIOS job: re-run machine_setup (not PollingBiosSetup). - RetryPlatformConfiguration { - retry_count: u32, - }, -} - -fn bios_config_enter_handle_failure( - info: &BiosConfigInfo, - failure: String, - host_id: &MachineId, -) -> Result { - if info.retry_count >= MAX_BIOS_CONFIG_RETRIES { - return Err(StateHandlerError::GenericError(eyre::eyre!( - "BIOS config job failure remediation exceeded max retries ({MAX_BIOS_CONFIG_RETRIES}) for host {host_id}: {failure}" - ))); - } - Ok(BiosConfigInfo { - bios_job_id: info.bios_job_id.clone(), - bios_config_state: BiosConfigState::HandleBiosJobFailure { - failure, - power_state: PowerState::Off, - }, - retry_count: info.retry_count + 1, - }) -} - -/// Advance one step of the BIOS config job wait state machine. Same pattern as set_host_boot_order. -async fn advance_bios_config_job( - ctx: &mut StateHandlerContext<'_, MachineStateHandlerContextObjects>, - redfish_client: &dyn Redfish, - mh_snapshot: &ManagedHostStateSnapshot, - info: BiosConfigInfo, -) -> Result { - match info.bios_config_state { - BiosConfigState::WaitForBiosJobScheduled => { - if let Some(job_id) = &info.bios_job_id { - let job_state = redfish_client - .get_job_state(job_id) - .await - .map_err(|e| redfish_error("get_job_state", e))?; - if matches!( - job_state, - libredfish::JobState::ScheduledWithErrors - | libredfish::JobState::CompletedWithErrors - ) { - let failure = format!("BIOS job {} failed with state {job_state:#?}", job_id); - tracing::warn!( - "{} for {}, transitioning to HandleBiosJobFailure (power cycle + BMC reset)", - failure, - mh_snapshot.host_snapshot.id - ); - return Ok(BiosConfigJobAdvanceOutcome::Continue( - bios_config_enter_handle_failure( - &info, - failure, - &mh_snapshot.host_snapshot.id, - )?, - )); - } - if !matches!(job_state, libredfish::JobState::Scheduled) { - return Err(StateHandlerError::GenericError(eyre::eyre!( - "waiting for BIOS job {:#?} to be scheduled; current state: {job_state:#?}", - job_id - ))); - } - } - Ok(BiosConfigJobAdvanceOutcome::Continue(BiosConfigInfo { - bios_job_id: info.bios_job_id.clone(), - bios_config_state: BiosConfigState::RebootHost, - retry_count: info.retry_count, - })) - } - BiosConfigState::RebootHost => { - handler_host_power_control(mh_snapshot, ctx, SystemPowerControl::ForceRestart).await?; - Ok(BiosConfigJobAdvanceOutcome::Continue(BiosConfigInfo { - bios_job_id: info.bios_job_id.clone(), - bios_config_state: BiosConfigState::WaitForBiosJobCompletion, - retry_count: info.retry_count, - })) - } - BiosConfigState::WaitForBiosJobCompletion => { - const JOB_QUERY_WAIT_MINUTES: i64 = 5; - if let Some(job_id) = &info.bios_job_id { - let job_state = match redfish_client.get_job_state(job_id).await { - Ok(s) => s, - Err(e) => { - let minutes_since_state_change = mh_snapshot - .host_snapshot - .state - .version - .since_state_change() - .num_minutes(); - if minutes_since_state_change < JOB_QUERY_WAIT_MINUTES { - return Err(redfish_error("get_job_state", e)); - } - let failure = format!( - "BIOS config job {} lookup failed after {} min: {}", - job_id, minutes_since_state_change, e - ); - tracing::warn!( - "{} for {}, transitioning to HandleBiosJobFailure (power cycle + BMC reset)", - failure, - mh_snapshot.host_snapshot.id - ); - return Ok(BiosConfigJobAdvanceOutcome::Continue( - bios_config_enter_handle_failure( - &info, - failure, - &mh_snapshot.host_snapshot.id, - )?, - )); - } - }; - match job_state { - libredfish::JobState::Completed => Ok(BiosConfigJobAdvanceOutcome::Done), - libredfish::JobState::ScheduledWithErrors - | libredfish::JobState::CompletedWithErrors => { - let failure = format!( - "BIOS config job {} failed with state {job_state:#?}", - job_id - ); - tracing::warn!( - "{} for {}, transitioning to HandleBiosJobFailure (power cycle + BMC reset)", - failure, - mh_snapshot.host_snapshot.id, - ); - Ok(BiosConfigJobAdvanceOutcome::Continue( - bios_config_enter_handle_failure( - &info, - failure, - &mh_snapshot.host_snapshot.id, - )?, - )) - } - _ => Err(StateHandlerError::GenericError(eyre::eyre!( - "waiting for BIOS job {:#?} to complete; current state: {job_state:#?}", - job_id - ))), - } - } else { - Ok(BiosConfigJobAdvanceOutcome::Done) - } - } - BiosConfigState::HandleBiosJobFailure { - failure, - power_state, - } => { - let current_power_state = redfish_client - .get_power_state() - .await - .map_err(|e| redfish_error("get_power_state", e))?; - - match power_state { - PowerState::Off => { - if current_power_state != libredfish::PowerState::Off { - handler_host_power_control(mh_snapshot, ctx, SystemPowerControl::ForceOff) - .await?; - return Ok(BiosConfigJobAdvanceOutcome::Wait(format!( - "HandleBiosJobFailure: waiting for {} to power down; current power state: {current_power_state}; failure: {}", - mh_snapshot.host_snapshot.id, failure - ))); - } - tracing::info!( - "HandleBiosJobFailure: Resetting BMC for {} after BIOS job failure: {}", - mh_snapshot.host_snapshot.id, - failure - ); - redfish_client - .bmc_reset() - .await - .map_err(|e| redfish_error("bmc_reset", e))?; - Ok(BiosConfigJobAdvanceOutcome::Continue(BiosConfigInfo { - bios_job_id: info.bios_job_id.clone(), - bios_config_state: BiosConfigState::HandleBiosJobFailure { - failure: failure.clone(), - power_state: PowerState::On, - }, - retry_count: info.retry_count, - })) - } - PowerState::On => { - if current_power_state != libredfish::PowerState::On { - let basetime = mh_snapshot - .host_snapshot - .last_reboot_requested - .as_ref() - .map(|x| x.time) - .unwrap_or(mh_snapshot.host_snapshot.state.version.timestamp()); - let power_down_wait = ctx - .services - .site_config - .machine_state_controller - .power_down_wait; - if Utc::now().signed_duration_since(basetime) < power_down_wait { - return Ok(BiosConfigJobAdvanceOutcome::Wait(format!( - "HandleBiosJobFailure: waiting for BMC to come back online for {}; failure: {}", - mh_snapshot.host_snapshot.id, failure - ))); - } - handler_host_power_control(mh_snapshot, ctx, SystemPowerControl::On) - .await?; - return Ok(BiosConfigJobAdvanceOutcome::Wait(format!( - "HandleBiosJobFailure: powering on {} after BMC reset; failure: {}", - mh_snapshot.host_snapshot.id, failure - ))); - } - tracing::info!( - machine_id = %mh_snapshot.host_snapshot.id, - retry_count = info.retry_count, - "HandleBiosJobFailure: BMC reset complete; re-running platform configuration (machine_setup) — power cycle does not apply BIOS attributes", - ); - Ok(BiosConfigJobAdvanceOutcome::RetryPlatformConfiguration { - retry_count: info.retry_count, - }) - } - _ => Err(StateHandlerError::GenericError(eyre::eyre!( - "HandleBiosJobFailure: unexpected power state {power_state:#?} for {}", - mh_snapshot.host_snapshot.id - ))), - } - } - } -} - async fn set_host_boot_order( ctx: &mut StateHandlerContext<'_, MachineStateHandlerContextObjects>, reachability_params: &ReachabilityParams, diff --git a/crates/api/src/state_controller/machine/handler/bios_config.rs b/crates/api/src/state_controller/machine/handler/bios_config.rs new file mode 100644 index 0000000000..c3b7aa32ba --- /dev/null +++ b/crates/api/src/state_controller/machine/handler/bios_config.rs @@ -0,0 +1,536 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//! BIOS configuration: machine_setup, Dell job wait/recovery, and PollingBiosSetup escalation. + +use carbide_uuid::machine::MachineId; +use chrono::Utc; +use eyre::eyre; +use libredfish::{Redfish, SystemPowerControl}; +use model::machine::{BiosConfigInfo, BiosConfigState, ManagedHostStateSnapshot, PowerState}; + +use super::{ + ReachabilityParams, RebootStatus, call_machine_setup_and_handle_no_dpu_error, + handler_host_power_control, trigger_reboot_if_needed, +}; +use crate::state_controller::external_service_error::redfish_error; +use crate::state_controller::machine::context::MachineStateHandlerContextObjects; +use crate::state_controller::state_handler::{StateHandlerContext, StateHandlerError}; + +/// Outcome of configure_host_bios function. +pub(super) enum BiosConfigOutcome { + Done, + WaitingForReboot(String), + /// Dell BIOS PATCH returned a job ID; wait for it to complete before boot order. + WaitingForBiosJob(BiosConfigInfo), +} + +/// Outcome of advancing the BIOS config job state machine (Dell: wait for BIOS PATCH job before boot order). +pub(super) enum BiosConfigJobAdvanceOutcome { + Continue(BiosConfigInfo), + /// Dell BIOS job completed; proceed to verify settings via PollingBiosSetup. + Done, + /// Automated recovery budget exhausted; poll is_bios_setup for manual remediation. + DeferToPollingBiosSetup, + /// Same state, but wait (e.g. waiting for power down or BMC to come back). + Wait(String), + /// After successful power/BMC recovery from a failed BIOS job: re-run machine_setup (not PollingBiosSetup). + RetryPlatformConfiguration { + retry_count: u32, + }, +} + +#[derive(Debug)] +pub(super) enum PollingBiosSetupOutcome { + Verified, + Wait(String), + EnterRecovery(BiosConfigInfo), +} + +/// Max configure_host_bios retry cycles through HandleBiosJobFailure recovery (matches boot-order retry budget). +const MAX_BIOS_CONFIG_RETRIES: u32 = 3; + +/// How long PollingBiosSetup may sit on Ok(false) before escalating into HandleBiosJobFailure recovery. +/// +/// From `machine_state_history` (4 sites, ~4500 samples): HostInit/PollingBiosSetup usually +/// finishes within ~11 min p95; wedged hosts sit 90+ min. 15 min keeps the first recovery attempt +/// inside the 30-min HOST_INIT SLA. +const POLLING_BIOS_SETUP_STUCK_THRESHOLD: chrono::Duration = chrono::Duration::minutes(15); + +pub(super) async fn configure_host_bios( + ctx: &mut StateHandlerContext<'_, MachineStateHandlerContextObjects>, + reachability_params: &ReachabilityParams, + redfish_client: &dyn Redfish, + mh_snapshot: &ManagedHostStateSnapshot, + retry_count: u32, +) -> Result { + let boot_interface_mac = mh_snapshot.boot_interface_mac().map(|m| m.to_string()); + + let bios_job_id = match call_machine_setup_and_handle_no_dpu_error( + redfish_client, + boot_interface_mac.as_deref(), + mh_snapshot.host_snapshot.associated_dpu_machine_ids().len(), + &ctx.services.site_config, + ) + .await + { + Err(e) => { + tracing::warn!( + "redfish machine_setup failed for {}, potentially due to known race condition between UEFI POST and BMC. triggering force-restart if needed. err: {}", + mh_snapshot.host_snapshot.id, + e + ); + + // if machine_setup failed, reboot to potentially work around + // a known race between the DPU UEFI and the BMC, where if + // the BMC is not up when DPU UEFI runs, then Attributes might + // not come through. The fix is to force-restart the DPU to + // re-POST. + // + // As of July 2024, Josh Price said there's an NBU FR to fix + // this, but it wasn't target to a release yet. + let reboot_status = if mh_snapshot.host_snapshot.last_reboot_requested.is_none() { + handler_host_power_control(mh_snapshot, ctx, SystemPowerControl::ForceRestart) + .await?; + + RebootStatus { + increase_retry_count: true, + status: "Restarted host".to_string(), + } + } else { + trigger_reboot_if_needed( + &mh_snapshot.host_snapshot, + mh_snapshot, + None, + reachability_params, + ctx, + ) + .await? + }; + return Ok(BiosConfigOutcome::WaitingForReboot(format!( + "redfish machine_setup failed: {e}; triggered host reboot: {reboot_status:#?}" + ))); + } + Ok(jid) => jid, + }; + + if let Some(job_id) = &bios_job_id { + return Ok(BiosConfigOutcome::WaitingForBiosJob(BiosConfigInfo { + bios_job_id: Some(job_id.clone()), + bios_config_state: BiosConfigState::WaitForBiosJobScheduled, + retry_count, + })); + } + + // No job to wait for (non-Dell or vendor that doesn't return job); reboot to apply and continue. + handler_host_power_control(mh_snapshot, ctx, SystemPowerControl::ForceRestart).await?; + Ok(BiosConfigOutcome::Done) +} + +/// Advance one step of the BIOS config job wait state machine. +pub(super) async fn advance_bios_config_job( + ctx: &mut StateHandlerContext<'_, MachineStateHandlerContextObjects>, + redfish_client: &dyn Redfish, + mh_snapshot: &ManagedHostStateSnapshot, + info: BiosConfigInfo, +) -> Result { + match info.bios_config_state { + BiosConfigState::WaitForBiosJobScheduled => { + let job_id = info.bios_job_id.as_ref().ok_or_else(|| { + StateHandlerError::GenericError(eyre!( + "WaitForBiosJobScheduled requires bios_job_id for host {}", + mh_snapshot.host_snapshot.id + )) + })?; + let job_state = redfish_client + .get_job_state(job_id) + .await + .map_err(|e| redfish_error("get_job_state", e))?; + if matches!( + job_state, + libredfish::JobState::ScheduledWithErrors + | libredfish::JobState::CompletedWithErrors + ) { + let failure = format!("BIOS job {} failed with state {job_state:#?}", job_id); + tracing::warn!( + "{} for {}, transitioning to HandleBiosJobFailure (power cycle + BMC reset)", + failure, + mh_snapshot.host_snapshot.id + ); + return try_bios_recovery_attempt( + info.retry_count, + info.bios_job_id.clone(), + failure, + &mh_snapshot.host_snapshot.id, + ); + } + if !matches!(job_state, libredfish::JobState::Scheduled) { + return Err(StateHandlerError::GenericError(eyre!( + "waiting for BIOS job {:#?} to be scheduled; current state: {job_state:#?}", + job_id + ))); + } + Ok(BiosConfigJobAdvanceOutcome::Continue(BiosConfigInfo { + bios_job_id: info.bios_job_id.clone(), + bios_config_state: BiosConfigState::RebootHost, + retry_count: info.retry_count, + })) + } + BiosConfigState::RebootHost => { + handler_host_power_control(mh_snapshot, ctx, SystemPowerControl::ForceRestart).await?; + Ok(BiosConfigJobAdvanceOutcome::Continue(BiosConfigInfo { + bios_job_id: info.bios_job_id.clone(), + bios_config_state: BiosConfigState::WaitForBiosJobCompletion, + retry_count: info.retry_count, + })) + } + BiosConfigState::WaitForBiosJobCompletion => { + const JOB_QUERY_WAIT_MINUTES: i64 = 5; + let job_id = info.bios_job_id.as_ref().ok_or_else(|| { + StateHandlerError::GenericError(eyre!( + "WaitForBiosJobCompletion requires bios_job_id for host {}", + mh_snapshot.host_snapshot.id + )) + })?; + let job_state = match redfish_client.get_job_state(job_id).await { + Ok(s) => s, + Err(e) => { + let minutes_since_state_change = mh_snapshot + .host_snapshot + .state + .version + .since_state_change() + .num_minutes(); + if minutes_since_state_change < JOB_QUERY_WAIT_MINUTES { + return Err(redfish_error("get_job_state", e)); + } + let failure = format!( + "BIOS config job {} lookup failed after {} min: {}", + job_id, minutes_since_state_change, e + ); + tracing::warn!( + "{} for {}, transitioning to HandleBiosJobFailure (power cycle + BMC reset)", + failure, + mh_snapshot.host_snapshot.id + ); + return try_bios_recovery_attempt( + info.retry_count, + info.bios_job_id.clone(), + failure, + &mh_snapshot.host_snapshot.id, + ); + } + }; + match job_state { + libredfish::JobState::Completed => Ok(BiosConfigJobAdvanceOutcome::Done), + libredfish::JobState::ScheduledWithErrors + | libredfish::JobState::CompletedWithErrors => { + let failure = format!( + "BIOS config job {} failed with state {job_state:#?}", + job_id + ); + tracing::warn!( + "{} for {}, transitioning to HandleBiosJobFailure (power cycle + BMC reset)", + failure, + mh_snapshot.host_snapshot.id, + ); + try_bios_recovery_attempt( + info.retry_count, + info.bios_job_id.clone(), + failure, + &mh_snapshot.host_snapshot.id, + ) + } + _ => Err(StateHandlerError::GenericError(eyre!( + "waiting for BIOS job {:#?} to complete; current state: {job_state:#?}", + job_id + ))), + } + } + BiosConfigState::HandleBiosJobFailure { + failure, + power_state, + } => { + let current_power_state = redfish_client + .get_power_state() + .await + .map_err(|e| redfish_error("get_power_state", e))?; + + match power_state { + PowerState::Off => { + if current_power_state != libredfish::PowerState::Off { + handler_host_power_control(mh_snapshot, ctx, SystemPowerControl::ForceOff) + .await?; + return Ok(BiosConfigJobAdvanceOutcome::Wait(format!( + "HandleBiosJobFailure: waiting for {} to power down; current power state: {current_power_state}; failure: {}", + mh_snapshot.host_snapshot.id, failure + ))); + } + tracing::info!( + "HandleBiosJobFailure: Resetting BMC for {} after BIOS job failure: {}", + mh_snapshot.host_snapshot.id, + failure + ); + redfish_client + .bmc_reset() + .await + .map_err(|e| redfish_error("bmc_reset", e))?; + Ok(BiosConfigJobAdvanceOutcome::Continue(BiosConfigInfo { + bios_job_id: info.bios_job_id.clone(), + bios_config_state: BiosConfigState::HandleBiosJobFailure { + failure: failure.clone(), + power_state: PowerState::On, + }, + retry_count: info.retry_count, + })) + } + PowerState::On => { + if current_power_state != libredfish::PowerState::On { + let basetime = mh_snapshot + .host_snapshot + .last_reboot_requested + .as_ref() + .map(|x| x.time) + .unwrap_or(mh_snapshot.host_snapshot.state.version.timestamp()); + let power_down_wait = ctx + .services + .site_config + .machine_state_controller + .power_down_wait; + if Utc::now().signed_duration_since(basetime) < power_down_wait { + return Ok(BiosConfigJobAdvanceOutcome::Wait(format!( + "HandleBiosJobFailure: waiting for BMC to come back online for {}; failure: {}", + mh_snapshot.host_snapshot.id, failure + ))); + } + handler_host_power_control(mh_snapshot, ctx, SystemPowerControl::On) + .await?; + return Ok(BiosConfigJobAdvanceOutcome::Wait(format!( + "HandleBiosJobFailure: powering on {} after BMC reset; failure: {}", + mh_snapshot.host_snapshot.id, failure + ))); + } + tracing::info!( + machine_id = %mh_snapshot.host_snapshot.id, + retry_count = info.retry_count, + "HandleBiosJobFailure: BMC reset complete; re-running platform configuration (machine_setup) — power cycle does not apply BIOS attributes", + ); + Ok(BiosConfigJobAdvanceOutcome::RetryPlatformConfiguration { + retry_count: info.retry_count, + }) + } + _ => Err(StateHandlerError::GenericError(eyre!( + "HandleBiosJobFailure: unexpected power state {power_state:#?} for {}", + mh_snapshot.host_snapshot.id + ))), + } + } + } +} + +/// Enter HandleBiosJobFailure recovery, or defer to PollingBiosSetup when budget is exhausted. +fn try_bios_recovery_attempt( + retry_count: u32, + bios_job_id: Option, + failure: String, + host_id: &MachineId, +) -> Result { + if retry_count >= MAX_BIOS_CONFIG_RETRIES { + tracing::warn!( + machine_id = %host_id, + retry_count, + max_retries = MAX_BIOS_CONFIG_RETRIES, + %failure, + "BIOS recovery budget exhausted; staying in PollingBiosSetup for manual remediation" + ); + return Ok(BiosConfigJobAdvanceOutcome::DeferToPollingBiosSetup); + } + Ok(BiosConfigJobAdvanceOutcome::Continue(BiosConfigInfo { + bios_job_id, + bios_config_state: BiosConfigState::HandleBiosJobFailure { + failure, + power_state: PowerState::Off, + }, + retry_count: retry_count + 1, + })) +} + +pub(super) async fn advance_polling_bios_setup( + redfish_client: &dyn Redfish, + mh_snapshot: &ManagedHostStateSnapshot, + retry_count: u32, +) -> Result { + let boot_interface_mac = mh_snapshot.boot_interface_mac().map(|m| m.to_string()); + let stuck_for = mh_snapshot.host_snapshot.state.version.since_state_change(); + let host_id = &mh_snapshot.host_snapshot.id; + + match redfish_client + .is_bios_setup(boot_interface_mac.as_deref()) + .await + { + Ok(true) => { + tracing::info!( + machine_id = %host_id, + "BIOS setup verified successfully" + ); + Ok(PollingBiosSetupOutcome::Verified) + } + Ok(false) => { + if let Some(bios_config_info) = + escalate_stuck_polling_bios_setup(retry_count, stuck_for, host_id)? + { + return Ok(PollingBiosSetupOutcome::EnterRecovery(bios_config_info)); + } + Ok(PollingBiosSetupOutcome::Wait(format!( + "Polling BIOS setup status, waiting for settings to be applied (retry_count={retry_count})" + ))) + } + Err(e) => { + tracing::warn!( + machine_id = %host_id, + error = %e, + "Failed to check BIOS setup status, will retry" + ); + Ok(PollingBiosSetupOutcome::Wait(format!( + "Failed to check BIOS setup status: {e}. Will retry." + ))) + } + } +} + +fn escalate_stuck_polling_bios_setup( + retry_count: u32, + stuck_for: chrono::Duration, + host_id: &MachineId, +) -> Result, StateHandlerError> { + if stuck_for <= POLLING_BIOS_SETUP_STUCK_THRESHOLD { + return Ok(None); + } + + tracing::warn!( + machine_id = %host_id, + ?stuck_for, + retry_count, + "PollingBiosSetup stuck; entering HandleBiosJobFailure recovery (power-off + BMC reset + power-on + re-run machine_setup)" + ); + + let failure = format!( + "PollingBiosSetup stuck for {} minutes (is_bios_setup returned false)", + stuck_for.num_minutes() + ); + + Ok( + match try_bios_recovery_attempt(retry_count, None, failure, host_id)? { + BiosConfigJobAdvanceOutcome::Continue(info) => Some(info), + BiosConfigJobAdvanceOutcome::DeferToPollingBiosSetup => None, + _ => unreachable!("recovery attempt only returns Continue or DeferToPollingBiosSetup"), + }, + ) +} + +#[cfg(test)] +mod tests { + use std::str::FromStr; + + use super::*; + + #[test] + fn escalate_stuck_polling_bios_setup_not_triggered_before_threshold() { + let host_id = + MachineId::from_str("fm100ds7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng") + .unwrap(); + + let result = + escalate_stuck_polling_bios_setup(0, chrono::Duration::minutes(10), &host_id).unwrap(); + + assert!(result.is_none()); + } + + #[test] + fn escalate_stuck_polling_bios_setup_enters_handle_bios_job_failure_when_stuck() { + let host_id = + MachineId::from_str("fm100ds7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng") + .unwrap(); + + let result = escalate_stuck_polling_bios_setup(0, chrono::Duration::minutes(16), &host_id) + .unwrap() + .expect("recovery should be triggered"); + + assert_eq!(result.bios_job_id, None); + assert_eq!(result.retry_count, 1); + assert!(matches!( + result.bios_config_state, + BiosConfigState::HandleBiosJobFailure { + power_state: PowerState::Off, + .. + } + )); + } + + #[test] + fn escalate_stuck_polling_bios_setup_respects_shared_retry_budget() { + let host_id = + MachineId::from_str("fm100ds7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng") + .unwrap(); + + let result = escalate_stuck_polling_bios_setup( + MAX_BIOS_CONFIG_RETRIES, + chrono::Duration::minutes(20), + &host_id, + ) + .unwrap(); + + assert!(result.is_none()); + } + + #[test] + fn try_bios_recovery_attempt_defers_when_budget_exhausted() { + let host_id = + MachineId::from_str("fm100ds7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng") + .unwrap(); + + let result = try_bios_recovery_attempt( + MAX_BIOS_CONFIG_RETRIES, + Some("job-1".to_string()), + "job failed".to_string(), + &host_id, + ) + .unwrap(); + + assert!(matches!( + result, + BiosConfigJobAdvanceOutcome::DeferToPollingBiosSetup + )); + } + + #[test] + fn escalate_stuck_polling_bios_setup_allows_last_budgeted_attempt() { + let host_id = + MachineId::from_str("fm100ds7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng") + .unwrap(); + + let result = escalate_stuck_polling_bios_setup( + MAX_BIOS_CONFIG_RETRIES - 1, + chrono::Duration::minutes(20), + &host_id, + ) + .unwrap() + .expect("last budgeted recovery should be allowed"); + + assert_eq!(result.retry_count, MAX_BIOS_CONFIG_RETRIES); + } +} diff --git a/crates/api/src/state_controller/machine/io.rs b/crates/api/src/state_controller/machine/io.rs index 5f75335ff4..729aecc5c3 100644 --- a/crates/api/src/state_controller/machine/io.rs +++ b/crates/api/src/state_controller/machine/io.rs @@ -176,7 +176,7 @@ impl StateControllerIO for MachineStateControllerIO { "waitingforplatformconfiguration" } MachineState::WaitingForBiosJob { .. } => "waitingforbiosjob", - MachineState::PollingBiosSetup => "pollingbiossetup", + MachineState::PollingBiosSetup { .. } => "pollingbiossetup", MachineState::SetBootOrder { .. } => "setbootorder", MachineState::UefiSetup { .. } => "uefisetup", MachineState::WaitingForDiscovery => "waitingfordiscovery", diff --git a/crates/api/src/tests/common/api_fixtures/mod.rs b/crates/api/src/tests/common/api_fixtures/mod.rs index c2a2ef60e7..b6f8c19794 100644 --- a/crates/api/src/tests/common/api_fixtures/mod.rs +++ b/crates/api/src/tests/common/api_fixtures/mod.rs @@ -445,7 +445,7 @@ impl TestEnv { model::machine::MachineState::WaitingForPlatformConfiguration { .. } => { machine_state } - model::machine::MachineState::PollingBiosSetup => machine_state, + model::machine::MachineState::PollingBiosSetup { .. } => machine_state, model::machine::MachineState::SetBootOrder { .. } => machine_state, model::machine::MachineState::UefiSetup { .. } => machine_state, model::machine::MachineState::WaitingForDiscovery => machine_state, diff --git a/crates/api/src/tests/machine_history.rs b/crates/api/src/tests/machine_history.rs index 4294d1e78c..40b1db800b 100644 --- a/crates/api/src/tests/machine_history.rs +++ b/crates/api/src/tests/machine_history.rs @@ -50,7 +50,7 @@ async fn test_machine_state_history(pool: sqlx::PgPool) -> Result<(), Box, + is_bios_setup: Option, job_state_sequence: VecDeque, /// Records every call to `RedfishClientPool::create_client` so tests can /// assert what vendor was passed at each call site. @@ -151,6 +152,10 @@ impl RedfishSim { self.state.lock().unwrap().job_state_sequence = VecDeque::from(states); } + pub fn set_is_bios_setup(&self, ready: bool) { + self.state.lock().unwrap().is_bios_setup = Some(ready); + } + /// Returns a snapshot of every `create_client` call made through this sim, /// in the order they happened. Useful for asserting which vendor was /// passed at a given call site. @@ -1347,7 +1352,7 @@ impl Redfish for RedfishSimClient { &'a self, _: Option<&'a str>, ) -> libredfish::RedfishFuture<'a, Result> { - Box::pin(async move { Ok(true) }) + Box::pin(async move { Ok(self.state.lock().unwrap().is_bios_setup.unwrap_or(true)) }) } fn get_secure_boot_certificate<'a>( diff --git a/docs/architecture/state_machines/managedhost.md b/docs/architecture/state_machines/managedhost.md index 43a2e55dda..cd7b3c77e2 100644 --- a/docs/architecture/state_machines/managedhost.md +++ b/docs/architecture/state_machines/managedhost.md @@ -240,6 +240,10 @@ stateDiagram-v2 state "EnableIpmiOverLan" as HI_EnableIpmiOverLan state "WaitingForPlatformConfiguration" as HI_WaitingForPlatformConfiguration + state "WaitingForBiosJob" as HI_WaitingForBiosJob { + state "HandleBiosJobFailure" as HI_WBJ_HandleBiosJobFailure + } + state "PollingBiosSetup" as HI_PollingBiosSetup state "WaitingForDiscovery" as HI_WaitingForDiscovery state "Discovered" as HI_Discovered state "BomValidating/MatchingSku" as BomValidating_BV_MatchingSku @@ -253,7 +257,14 @@ stateDiagram-v2 Failed --> HI_M_WaitingForMeasurements HI_EnableIpmiOverLan --> HI_WaitingForPlatformConfiguration : Enable IPMI over LAN access - HI_WaitingForPlatformConfiguration --> HI_SBO_SetBootOrder : Call machine setup/Restart Host + HI_WaitingForPlatformConfiguration --> HI_PollingBiosSetup : Call machine setup/Restart Host + HI_WaitingForPlatformConfiguration --> HI_WaitingForBiosJob : Dell BIOS job scheduled + HI_WaitingForBiosJob --> HI_PollingBiosSetup : BIOS job completed + HI_WaitingForBiosJob --> HI_WBJ_HandleBiosJobFailure : BIOS job failed + HI_WBJ_HandleBiosJobFailure --> HI_WaitingForPlatformConfiguration : Power off + BMC reset + power on + HI_PollingBiosSetup --> HI_PollingBiosSetup : Wait for BIOS setup + HI_PollingBiosSetup --> HI_WBJ_HandleBiosJobFailure : Stuck > 15 min, retry budget left + HI_PollingBiosSetup --> HI_SBO_SetBootOrder : BIOS is setup HI_SBO_SetBootOrder --> hi_sbo_if_zero_dpu hi_sbo_if_zero_dpu --> HI_SBO_WaitForSetBootOrderJobCompletion : No DPU @@ -446,6 +457,9 @@ stateDiagram-v2 } state "CheckHostConfig" as A_HPC_CheckHostConfig state "ConfigureBios" as A_HPC_ConfigureBios + state "WaitingForBiosJob" as A_HPC_WaitingForBiosJob { + state "HandleBiosJobFailure" as A_HPC_WBJ_HandleBiosJobFailure + } state "PollingBiosSetup" as A_HPC_PollingBiosSetup state "SetBootOrder" as A_HPC_SetBootOrder { state "SetBootOrder" as A_HPC_SBO_SetBootOrder @@ -513,7 +527,12 @@ stateDiagram-v2 A_HPC_CheckHostConfig --> A_HPC_ConfigureBios : Need config host boot order A_HPC_CheckHostConfig --> A_HPC_LockHost : No need config host boot order A_HPC_ConfigureBios --> A_HPC_PollingBiosSetup : Config BIOS + A_HPC_ConfigureBios --> A_HPC_WaitingForBiosJob : Dell BIOS job scheduled + A_HPC_WaitingForBiosJob --> A_HPC_PollingBiosSetup : BIOS job completed + A_HPC_WaitingForBiosJob --> A_HPC_WBJ_HandleBiosJobFailure : BIOS job failed + A_HPC_WBJ_HandleBiosJobFailure --> A_HPC_ConfigureBios : Power off + BMC reset + power on A_HPC_PollingBiosSetup --> A_HPC_PollingBiosSetup : Wait for BIOS setup + A_HPC_PollingBiosSetup --> A_HPC_WBJ_HandleBiosJobFailure : Stuck > 15 min, retry budget left A_HPC_PollingBiosSetup --> A_HPC_SBO_SetBootOrder : BIOS is setup A_HPC_SBO_SetBootOrder --> A_HPC_SBO_WaitForSetBootOrderJobScheduled : Set boot order job scheduled A_HPC_SBO_WaitForSetBootOrderJobScheduled --> A_HPC_SBO_RebootHost : Job scheduled From f5868b8d67cfdfebefad3e7d40b13be41f4834f7 Mon Sep 17 00:00:00 2001 From: Krish Dandiwala Date: Tue, 26 May 2026 15:44:03 +0000 Subject: [PATCH 2/7] feat: route to the Failed state Signed-off-by: Krish Dandiwala --- crates/api-model/src/machine/mod.rs | 3 + .../src/state_controller/machine/handler.rs | 130 ++++++++++++++--- .../machine/handler/bios_config.rs | 117 ++++++++++----- crates/api/src/tests/machine_states.rs | 138 ++++++++++++++++++ .../state_machines/managedhost.md | 10 ++ 5 files changed, 342 insertions(+), 56 deletions(-) diff --git a/crates/api-model/src/machine/mod.rs b/crates/api-model/src/machine/mod.rs index bb67568271..d87d80cfea 100644 --- a/crates/api-model/src/machine/mod.rs +++ b/crates/api-model/src/machine/mod.rs @@ -1509,6 +1509,8 @@ pub enum FailureCause { DpfProvisioning { err: String }, SpdmAttestationFailed { err: String }, + + BiosSetupFailed { err: String }, } #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] @@ -2053,6 +2055,7 @@ impl Display for FailureCause { FailureCause::SpdmAttestationFailed { .. } => { write!(f, "SpdmAttestationFailed") } + FailureCause::BiosSetupFailed { .. } => write!(f, "BiosSetupFailed"), } } } diff --git a/crates/api/src/state_controller/machine/handler.rs b/crates/api/src/state_controller/machine/handler.rs index da6e6547da..a99b19ecc9 100644 --- a/crates/api/src/state_controller/machine/handler.rs +++ b/crates/api/src/state_controller/machine/handler.rs @@ -119,6 +119,7 @@ mod sku; use bios_config::{ BiosConfigJobAdvanceOutcome, BiosConfigOutcome, PollingBiosSetupOutcome, advance_bios_config_job, advance_polling_bios_setup, configure_host_bios, + handle_bios_setup_failed_recovery, }; use helpers::{ DpuDiscoveringStateHelper, DpuInitStateHelper, ManagedHostStateHelper, NextState, @@ -1431,6 +1432,19 @@ impl MachineStateHandler { None => Ok(StateHandlerOutcome::do_nothing()), } } + FailureCause::BiosSetupFailed { .. } if machine_id.machine_type().is_host() => { + let recovered = ManagedHostState::HostInit { + machine_state: MachineState::SetBootOrder { + set_boot_order_info: Some(SetBootOrderInfo { + set_boot_order_jid: None, + set_boot_order_state: SetBootOrderState::SetBootOrder, + retry_count: 0, + }), + }, + }; + handle_bios_setup_failed_recovery(ctx, mh_snapshot, machine_id, recovered) + .await + } _ => { // Do nothing. // Handle error cause and decide how to recover if possible. @@ -4905,14 +4919,26 @@ impl StateHandler for HostMachineStateHandler { }, }), ), - BiosConfigJobAdvanceOutcome::Done - | BiosConfigJobAdvanceOutcome::DeferToPollingBiosSetup => Ok( - StateHandlerOutcome::transition(ManagedHostState::HostInit { + BiosConfigJobAdvanceOutcome::Done => Ok(StateHandlerOutcome::transition( + ManagedHostState::HostInit { machine_state: MachineState::PollingBiosSetup { retry_count: bios_config_info.retry_count, }, - }), - ), + }, + )), + BiosConfigJobAdvanceOutcome::Failed { failure } => { + Ok(StateHandlerOutcome::transition(ManagedHostState::Failed { + details: FailureDetails { + cause: FailureCause::BiosSetupFailed { err: failure }, + failed_at: Utc::now(), + source: FailureSource::StateMachineArea( + StateMachineArea::HostInit, + ), + }, + machine_id: mh_snapshot.host_snapshot.id, + retry_count: 0, + })) + } BiosConfigJobAdvanceOutcome::RetryPlatformConfiguration { retry_count } => { Ok(StateHandlerOutcome::transition( ManagedHostState::HostInit { @@ -4958,6 +4984,19 @@ impl StateHandler for HostMachineStateHandler { machine_state: MachineState::WaitingForBiosJob { bios_config_info }, }), ), + PollingBiosSetupOutcome::Failed { failure } => { + Ok(StateHandlerOutcome::transition(ManagedHostState::Failed { + details: FailureDetails { + cause: FailureCause::BiosSetupFailed { err: failure }, + failed_at: Utc::now(), + source: FailureSource::StateMachineArea( + StateMachineArea::HostInit, + ), + }, + machine_id: mh_snapshot.host_snapshot.id, + retry_count: 0, + })) + } PollingBiosSetupOutcome::Wait(reason) => { Ok(StateHandlerOutcome::wait(reason)) } @@ -6197,20 +6236,38 @@ impl StateHandler for InstanceStateHandler { InstanceState::Failed { details, machine_id, - } => { - // Only way to proceed is to - // 1. Force-delete the machine. - // 2. If failed during reprovision, fix the config/hw issue and - // retrigger DPU reprovision. - tracing::warn!( - "Instance id {}/machine: {} stuck in failed state. details: {:?}, failed machine: {}", - instance.id, - host_machine_id, - details, - machine_id - ); - Ok(StateHandlerOutcome::do_nothing()) - } + } => match details.cause { + FailureCause::BiosSetupFailed { .. } if machine_id.machine_type().is_host() => { + let recovered = ManagedHostState::Assigned { + instance_state: InstanceState::HostPlatformConfiguration { + platform_config_state: + HostPlatformConfigurationState::SetBootOrder { + set_boot_order_info: SetBootOrderInfo { + set_boot_order_jid: None, + set_boot_order_state: SetBootOrderState::SetBootOrder, + retry_count: 0, + }, + }, + }, + }; + handle_bios_setup_failed_recovery(ctx, mh_snapshot, machine_id, recovered) + .await + } + _ => { + // Only way to proceed for other causes is to + // 1. Force-delete the machine. + // 2. If failed during reprovision, fix the config/hw issue and + // retrigger DPU reprovision. + tracing::warn!( + "Instance id {}/machine: {} stuck in failed state. details: {:?}, failed machine: {}", + instance.id, + host_machine_id, + details, + machine_id + ); + Ok(StateHandlerOutcome::do_nothing()) + } + }, InstanceState::HostReprovision { .. } => { self.host_upgrade .handle_host_reprovision( @@ -9932,12 +9989,27 @@ async fn handle_instance_host_platform_config( bios_config_info: updated, } } - BiosConfigJobAdvanceOutcome::Done - | BiosConfigJobAdvanceOutcome::DeferToPollingBiosSetup => { + BiosConfigJobAdvanceOutcome::Done => { HostPlatformConfigurationState::PollingBiosSetup { retry_count: bios_config_info.retry_count, } } + BiosConfigJobAdvanceOutcome::Failed { failure } => { + return Ok(StateHandlerOutcome::transition( + ManagedHostState::Assigned { + instance_state: InstanceState::Failed { + details: FailureDetails { + cause: FailureCause::BiosSetupFailed { err: failure }, + failed_at: Utc::now(), + source: FailureSource::StateMachineArea( + StateMachineArea::AssignedInstance, + ), + }, + machine_id: mh_snapshot.host_snapshot.id, + }, + }, + )); + } BiosConfigJobAdvanceOutcome::RetryPlatformConfiguration { retry_count: next_count, } => HostPlatformConfigurationState::ConfigureBios { @@ -9983,6 +10055,22 @@ async fn handle_instance_host_platform_config( }, )); } + PollingBiosSetupOutcome::Failed { failure } => { + return Ok(StateHandlerOutcome::transition( + ManagedHostState::Assigned { + instance_state: InstanceState::Failed { + details: FailureDetails { + cause: FailureCause::BiosSetupFailed { err: failure }, + failed_at: Utc::now(), + source: FailureSource::StateMachineArea( + StateMachineArea::AssignedInstance, + ), + }, + machine_id: mh_snapshot.host_snapshot.id, + }, + }, + )); + } PollingBiosSetupOutcome::Wait(reason) => { return Ok(StateHandlerOutcome::wait(reason)); } diff --git a/crates/api/src/state_controller/machine/handler/bios_config.rs b/crates/api/src/state_controller/machine/handler/bios_config.rs index c3b7aa32ba..40fe2e94f8 100644 --- a/crates/api/src/state_controller/machine/handler/bios_config.rs +++ b/crates/api/src/state_controller/machine/handler/bios_config.rs @@ -21,7 +21,9 @@ use carbide_uuid::machine::MachineId; use chrono::Utc; use eyre::eyre; use libredfish::{Redfish, SystemPowerControl}; -use model::machine::{BiosConfigInfo, BiosConfigState, ManagedHostStateSnapshot, PowerState}; +use model::machine::{ + BiosConfigInfo, BiosConfigState, ManagedHostState, ManagedHostStateSnapshot, PowerState, +}; use super::{ ReachabilityParams, RebootStatus, call_machine_setup_and_handle_no_dpu_error, @@ -29,7 +31,9 @@ use super::{ }; use crate::state_controller::external_service_error::redfish_error; use crate::state_controller::machine::context::MachineStateHandlerContextObjects; -use crate::state_controller::state_handler::{StateHandlerContext, StateHandlerError}; +use crate::state_controller::state_handler::{ + StateHandlerContext, StateHandlerError, StateHandlerOutcome, +}; /// Outcome of configure_host_bios function. pub(super) enum BiosConfigOutcome { @@ -44,8 +48,9 @@ pub(super) enum BiosConfigJobAdvanceOutcome { Continue(BiosConfigInfo), /// Dell BIOS job completed; proceed to verify settings via PollingBiosSetup. Done, - /// Automated recovery budget exhausted; poll is_bios_setup for manual remediation. - DeferToPollingBiosSetup, + Failed { + failure: String, + }, /// Same state, but wait (e.g. waiting for power down or BMC to come back). Wait(String), /// After successful power/BMC recovery from a failed BIOS job: re-run machine_setup (not PollingBiosSetup). @@ -59,6 +64,7 @@ pub(super) enum PollingBiosSetupOutcome { Verified, Wait(String), EnterRecovery(BiosConfigInfo), + Failed { failure: String }, } /// Max configure_host_bios retry cycles through HandleBiosJobFailure recovery (matches boot-order retry budget). @@ -342,7 +348,7 @@ pub(super) async fn advance_bios_config_job( } } -/// Enter HandleBiosJobFailure recovery, or defer to PollingBiosSetup when budget is exhausted. +/// Enter HandleBiosJobFailure recovery, or move to Failed when budget is exhausted. fn try_bios_recovery_attempt( retry_count: u32, bios_job_id: Option, @@ -355,9 +361,9 @@ fn try_bios_recovery_attempt( retry_count, max_retries = MAX_BIOS_CONFIG_RETRIES, %failure, - "BIOS recovery budget exhausted; staying in PollingBiosSetup for manual remediation" + "BIOS recovery budget exhausted; moving host to Failed for manual remediation" ); - return Ok(BiosConfigJobAdvanceOutcome::DeferToPollingBiosSetup); + return Ok(BiosConfigJobAdvanceOutcome::Failed { failure }); } Ok(BiosConfigJobAdvanceOutcome::Continue(BiosConfigInfo { bios_job_id, @@ -390,10 +396,10 @@ pub(super) async fn advance_polling_bios_setup( Ok(PollingBiosSetupOutcome::Verified) } Ok(false) => { - if let Some(bios_config_info) = + if let Some(outcome) = escalate_stuck_polling_bios_setup(retry_count, stuck_for, host_id)? { - return Ok(PollingBiosSetupOutcome::EnterRecovery(bios_config_info)); + return Ok(outcome); } Ok(PollingBiosSetupOutcome::Wait(format!( "Polling BIOS setup status, waiting for settings to be applied (retry_count={retry_count})" @@ -416,7 +422,7 @@ fn escalate_stuck_polling_bios_setup( retry_count: u32, stuck_for: chrono::Duration, host_id: &MachineId, -) -> Result, StateHandlerError> { +) -> Result, StateHandlerError> { if stuck_for <= POLLING_BIOS_SETUP_STUCK_THRESHOLD { return Ok(None); } @@ -425,7 +431,7 @@ fn escalate_stuck_polling_bios_setup( machine_id = %host_id, ?stuck_for, retry_count, - "PollingBiosSetup stuck; entering HandleBiosJobFailure recovery (power-off + BMC reset + power-on + re-run machine_setup)" + "PollingBiosSetup stuck; attempting HandleBiosJobFailure recovery (power-off + BMC reset + power-on + re-run machine_setup)" ); let failure = format!( @@ -433,13 +439,51 @@ fn escalate_stuck_polling_bios_setup( stuck_for.num_minutes() ); - Ok( + Ok(Some( match try_bios_recovery_attempt(retry_count, None, failure, host_id)? { - BiosConfigJobAdvanceOutcome::Continue(info) => Some(info), - BiosConfigJobAdvanceOutcome::DeferToPollingBiosSetup => None, - _ => unreachable!("recovery attempt only returns Continue or DeferToPollingBiosSetup"), + BiosConfigJobAdvanceOutcome::Continue(info) => { + PollingBiosSetupOutcome::EnterRecovery(info) + } + BiosConfigJobAdvanceOutcome::Failed { failure } => { + PollingBiosSetupOutcome::Failed { failure } + } + _ => unreachable!("recovery attempt only returns Continue or Failed"), }, - ) + )) +} + +pub(super) async fn handle_bios_setup_failed_recovery( + ctx: &mut StateHandlerContext<'_, MachineStateHandlerContextObjects>, + mh_snapshot: &ManagedHostStateSnapshot, + machine_id: &MachineId, + recovered_state: ManagedHostState, +) -> Result, StateHandlerError> { + let boot_interface_mac = mh_snapshot.boot_interface_mac().map(|m| m.to_string()); + let redfish_client = ctx + .services + .create_redfish_client_from_machine(&mh_snapshot.host_snapshot) + .await?; + match redfish_client + .is_bios_setup(boot_interface_mac.as_deref()) + .await + { + Ok(true) => { + tracing::info!( + machine_id = %machine_id, + "BIOS setup verified after manual remediation; resuming state machine" + ); + Ok(StateHandlerOutcome::transition(recovered_state)) + } + Ok(false) => Ok(StateHandlerOutcome::do_nothing()), + Err(e) => { + tracing::warn!( + machine_id = %machine_id, + error = %e, + "Failed to check BIOS setup status, will retry" + ); + Ok(StateHandlerOutcome::do_nothing()) + } + } } #[cfg(test)] @@ -451,7 +495,7 @@ mod tests { #[test] fn escalate_stuck_polling_bios_setup_not_triggered_before_threshold() { let host_id = - MachineId::from_str("fm100ds7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng") + MachineId::from_str("fm100ht7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng") .unwrap(); let result = @@ -463,17 +507,19 @@ mod tests { #[test] fn escalate_stuck_polling_bios_setup_enters_handle_bios_job_failure_when_stuck() { let host_id = - MachineId::from_str("fm100ds7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng") + MachineId::from_str("fm100ht7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng") .unwrap(); - let result = escalate_stuck_polling_bios_setup(0, chrono::Duration::minutes(16), &host_id) + let info = escalate_stuck_polling_bios_setup(0, chrono::Duration::minutes(16), &host_id) .unwrap() .expect("recovery should be triggered"); - - assert_eq!(result.bios_job_id, None); - assert_eq!(result.retry_count, 1); + let PollingBiosSetupOutcome::EnterRecovery(info) = info else { + panic!("expected EnterRecovery"); + }; + assert_eq!(info.bios_job_id, None); + assert_eq!(info.retry_count, 1); assert!(matches!( - result.bios_config_state, + info.bios_config_state, BiosConfigState::HandleBiosJobFailure { power_state: PowerState::Off, .. @@ -484,7 +530,7 @@ mod tests { #[test] fn escalate_stuck_polling_bios_setup_respects_shared_retry_budget() { let host_id = - MachineId::from_str("fm100ds7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng") + MachineId::from_str("fm100ht7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng") .unwrap(); let result = escalate_stuck_polling_bios_setup( @@ -492,15 +538,16 @@ mod tests { chrono::Duration::minutes(20), &host_id, ) - .unwrap(); + .unwrap() + .expect("expected Failed outcome"); - assert!(result.is_none()); + assert!(matches!(result, PollingBiosSetupOutcome::Failed { .. })); } #[test] - fn try_bios_recovery_attempt_defers_when_budget_exhausted() { + fn try_bios_recovery_attempt_fails_when_budget_exhausted() { let host_id = - MachineId::from_str("fm100ds7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng") + MachineId::from_str("fm100ht7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng") .unwrap(); let result = try_bios_recovery_attempt( @@ -511,19 +558,16 @@ mod tests { ) .unwrap(); - assert!(matches!( - result, - BiosConfigJobAdvanceOutcome::DeferToPollingBiosSetup - )); + assert!(matches!(result, BiosConfigJobAdvanceOutcome::Failed { .. })); } #[test] fn escalate_stuck_polling_bios_setup_allows_last_budgeted_attempt() { let host_id = - MachineId::from_str("fm100ds7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng") + MachineId::from_str("fm100ht7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng") .unwrap(); - let result = escalate_stuck_polling_bios_setup( + let outcome = escalate_stuck_polling_bios_setup( MAX_BIOS_CONFIG_RETRIES - 1, chrono::Duration::minutes(20), &host_id, @@ -531,6 +575,9 @@ mod tests { .unwrap() .expect("last budgeted recovery should be allowed"); - assert_eq!(result.retry_count, MAX_BIOS_CONFIG_RETRIES); + let PollingBiosSetupOutcome::EnterRecovery(info) = outcome else { + panic!("expected EnterRecovery"); + }; + assert_eq!(info.retry_count, MAX_BIOS_CONFIG_RETRIES); } } diff --git a/crates/api/src/tests/machine_states.rs b/crates/api/src/tests/machine_states.rs index d491438de7..e89e6dd8b9 100644 --- a/crates/api/src/tests/machine_states.rs +++ b/crates/api/src/tests/machine_states.rs @@ -2380,6 +2380,144 @@ async fn test_polling_bios_setup_full_recovery_reruns_machine_setup_and_succeeds ); } +/// When HostInit/PollingBiosSetup retry budget is exhausted, enter Failed and recover via is_bios_setup. +#[crate::sqlx_test] +async fn test_polling_bios_setup_exhausted_enters_failed_and_recovers_when_bios_setup_true( + pool: sqlx::PgPool, +) { + let env = create_test_env(pool).await; + + let mh = common::api_fixtures::create_managed_host(&env).await; + let host_id = mh.host().id; + + env.redfish_sim.set_is_bios_setup(false); + + set_host_controller_state_stuck_in( + &env, + host_id, + &ManagedHostState::HostInit { + machine_state: MachineState::PollingBiosSetup { retry_count: 3 }, + }, + 16, + ) + .await; + + env.run_machine_state_controller_iteration().await; + + { + let mut txn = env.db_txn().await; + let host = mh.host().db_machine(&mut txn).await; + assert!( + matches!( + host.current_state(), + ManagedHostState::Failed { + details: FailureDetails { + cause: FailureCause::BiosSetupFailed { .. }, + source: FailureSource::StateMachineArea(StateMachineArea::HostInit), + .. + }, + .. + } + ), + "expected ManagedHostState::Failed with BiosSetupFailed/HostInit, got: {:?}", + host.current_state() + ); + } + + env.redfish_sim.set_is_bios_setup(true); + env.run_machine_state_controller_iteration().await; + + { + let mut txn = env.db_txn().await; + let host = mh.host().db_machine(&mut txn).await; + assert!( + matches!( + host.current_state(), + ManagedHostState::HostInit { + machine_state: MachineState::SetBootOrder { .. }, + } + ), + "expected recovery to reach HostInit/SetBootOrder, got: {:?}", + host.current_state() + ); + } +} + +/// Assigned/HostPlatformConfiguration/PollingBiosSetup retry exhaustion enters InstanceState::Failed. +#[crate::sqlx_test] +async fn test_hpc_polling_bios_setup_exhausted_enters_failed_and_recovers_when_bios_setup_true( + pool: sqlx::PgPool, +) { + let env = create_test_env(pool).await; + + let mh = common::api_fixtures::create_managed_host(&env).await; + let segment_id = env.create_vpc_and_tenant_segment().await; + create_instance(&env, &mh, false, segment_id).await; + let host_id = mh.host().id; + + env.redfish_sim.set_is_bios_setup(false); + + set_host_controller_state_stuck_in( + &env, + host_id, + &ManagedHostState::Assigned { + instance_state: InstanceState::HostPlatformConfiguration { + platform_config_state: HostPlatformConfigurationState::PollingBiosSetup { + retry_count: 3, + }, + }, + }, + 16, + ) + .await; + + env.run_machine_state_controller_iteration().await; + + { + let mut txn = env.db_txn().await; + let host = mh.host().db_machine(&mut txn).await; + assert!( + matches!( + host.current_state(), + ManagedHostState::Assigned { + instance_state: InstanceState::Failed { + details: FailureDetails { + cause: FailureCause::BiosSetupFailed { .. }, + source: FailureSource::StateMachineArea( + StateMachineArea::AssignedInstance + ), + .. + }, + .. + }, + } + ), + "expected Assigned/InstanceState::Failed with BiosSetupFailed/AssignedInstance, got: {:?}", + host.current_state() + ); + } + + env.redfish_sim.set_is_bios_setup(true); + env.run_machine_state_controller_iteration().await; + + { + let mut txn = env.db_txn().await; + let host = mh.host().db_machine(&mut txn).await; + assert!( + matches!( + host.current_state(), + ManagedHostState::Assigned { + instance_state: InstanceState::HostPlatformConfiguration { + platform_config_state: HostPlatformConfigurationState::SetBootOrder { .. }, + }, + } + ), + "expected recovery to reach HostPlatformConfiguration/SetBootOrder, got: {:?}", + host.current_state() + ); + } +} + async fn set_host_controller_state_stuck_in( env: &TestEnv, host_id: MachineId, diff --git a/docs/architecture/state_machines/managedhost.md b/docs/architecture/state_machines/managedhost.md index cd7b3c77e2..3486a74ff1 100644 --- a/docs/architecture/state_machines/managedhost.md +++ b/docs/architecture/state_machines/managedhost.md @@ -76,6 +76,10 @@ stateDiagram-v2 ForceDeletion --> [*] : Force deletion complete ``` +### `Failed` recovery semantics + +`FailureCause::BiosSetupFailed` is set when BIOS setup retries are exhausted during ingestion or instance deprovisioning. The Failed handler polls `is_bios_setup` and auto-recovers into `SetBootOrder` once the BMC reports success. + ## DPU Discovery State Details (DpuDiscoveringState) Shows the complete DPU discovery and configuration process: @@ -255,15 +259,18 @@ stateDiagram-v2 DpuInitState_DI_WaitingForNetworkConfig --> HI_EnableIpmiOverLan Failed --> HI_WFL_TimeWaitForDPUDown Failed --> HI_M_WaitingForMeasurements + Failed --> HI_SBO_SetBootOrder : BiosSetupFailed AND is_bios_setup ok HI_EnableIpmiOverLan --> HI_WaitingForPlatformConfiguration : Enable IPMI over LAN access HI_WaitingForPlatformConfiguration --> HI_PollingBiosSetup : Call machine setup/Restart Host HI_WaitingForPlatformConfiguration --> HI_WaitingForBiosJob : Dell BIOS job scheduled HI_WaitingForBiosJob --> HI_PollingBiosSetup : BIOS job completed HI_WaitingForBiosJob --> HI_WBJ_HandleBiosJobFailure : BIOS job failed + HI_WaitingForBiosJob --> Failed : Retry budget exhausted (BiosSetupFailed) HI_WBJ_HandleBiosJobFailure --> HI_WaitingForPlatformConfiguration : Power off + BMC reset + power on HI_PollingBiosSetup --> HI_PollingBiosSetup : Wait for BIOS setup HI_PollingBiosSetup --> HI_WBJ_HandleBiosJobFailure : Stuck > 15 min, retry budget left + HI_PollingBiosSetup --> Failed : Stuck > 15 min, retry budget exhausted (BiosSetupFailed) HI_PollingBiosSetup --> HI_SBO_SetBootOrder : BIOS is setup HI_SBO_SetBootOrder --> hi_sbo_if_zero_dpu @@ -530,9 +537,11 @@ stateDiagram-v2 A_HPC_ConfigureBios --> A_HPC_WaitingForBiosJob : Dell BIOS job scheduled A_HPC_WaitingForBiosJob --> A_HPC_PollingBiosSetup : BIOS job completed A_HPC_WaitingForBiosJob --> A_HPC_WBJ_HandleBiosJobFailure : BIOS job failed + A_HPC_WaitingForBiosJob --> A_Failed : Retry budget exhausted (BiosSetupFailed) A_HPC_WBJ_HandleBiosJobFailure --> A_HPC_ConfigureBios : Power off + BMC reset + power on A_HPC_PollingBiosSetup --> A_HPC_PollingBiosSetup : Wait for BIOS setup A_HPC_PollingBiosSetup --> A_HPC_WBJ_HandleBiosJobFailure : Stuck > 15 min, retry budget left + A_HPC_PollingBiosSetup --> A_Failed : Stuck > 15 min, retry budget exhausted (BiosSetupFailed) A_HPC_PollingBiosSetup --> A_HPC_SBO_SetBootOrder : BIOS is setup A_HPC_SBO_SetBootOrder --> A_HPC_SBO_WaitForSetBootOrderJobScheduled : Set boot order job scheduled A_HPC_SBO_WaitForSetBootOrderJobScheduled --> A_HPC_SBO_RebootHost : Job scheduled @@ -543,6 +552,7 @@ stateDiagram-v2 state AnyState AnyState --> A_Failed : Any failure condition A_Failed --> A_Failed : Wait (stuck, manual action needed) + A_Failed --> A_HPC_SBO_SetBootOrder : BiosSetupFailed AND is_bios_setup ok ``` From c5102d891c9a3d56e5023442ffbdeca30d52826d Mon Sep 17 00:00:00 2001 From: Krish Dandiwala Date: Tue, 26 May 2026 15:53:52 +0000 Subject: [PATCH 3/7] chore: better error message Signed-off-by: Krish Dandiwala --- .../api/src/state_controller/machine/handler/bios_config.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/crates/api/src/state_controller/machine/handler/bios_config.rs b/crates/api/src/state_controller/machine/handler/bios_config.rs index 40fe2e94f8..082a63ebf6 100644 --- a/crates/api/src/state_controller/machine/handler/bios_config.rs +++ b/crates/api/src/state_controller/machine/handler/bios_config.rs @@ -363,7 +363,11 @@ fn try_bios_recovery_attempt( %failure, "BIOS recovery budget exhausted; moving host to Failed for manual remediation" ); - return Ok(BiosConfigJobAdvanceOutcome::Failed { failure }); + return Ok(BiosConfigJobAdvanceOutcome::Failed { + failure: format!( + "{failure} (automated BIOS recovery exhausted after {MAX_BIOS_CONFIG_RETRIES} attempts)" + ), + }); } Ok(BiosConfigJobAdvanceOutcome::Continue(BiosConfigInfo { bios_job_id, From f2e3aeaf957100e113dcd81288492a1113c15399 Mon Sep 17 00:00:00 2001 From: Krish Dandiwala Date: Wed, 27 May 2026 13:29:49 +0000 Subject: [PATCH 4/7] fix: address comments Signed-off-by: Krish Dandiwala --- .../src/state_controller/machine/handler.rs | 6 +- .../machine/handler/bios_config.rs | 148 +++++++----------- 2 files changed, 55 insertions(+), 99 deletions(-) diff --git a/crates/api/src/state_controller/machine/handler.rs b/crates/api/src/state_controller/machine/handler.rs index a99b19ecc9..9a58daa38e 100644 --- a/crates/api/src/state_controller/machine/handler.rs +++ b/crates/api/src/state_controller/machine/handler.rs @@ -1442,8 +1442,7 @@ impl MachineStateHandler { }), }, }; - handle_bios_setup_failed_recovery(ctx, mh_snapshot, machine_id, recovered) - .await + handle_bios_setup_failed_recovery(ctx, mh_snapshot, recovered).await } _ => { // Do nothing. @@ -6250,8 +6249,7 @@ impl StateHandler for InstanceStateHandler { }, }, }; - handle_bios_setup_failed_recovery(ctx, mh_snapshot, machine_id, recovered) - .await + handle_bios_setup_failed_recovery(ctx, mh_snapshot, recovered).await } _ => { // Only way to proceed for other causes is to diff --git a/crates/api/src/state_controller/machine/handler/bios_config.rs b/crates/api/src/state_controller/machine/handler/bios_config.rs index 082a63ebf6..bfb2b52586 100644 --- a/crates/api/src/state_controller/machine/handler/bios_config.rs +++ b/crates/api/src/state_controller/machine/handler/bios_config.rs @@ -17,7 +17,6 @@ //! BIOS configuration: machine_setup, Dell job wait/recovery, and PollingBiosSetup escalation. -use carbide_uuid::machine::MachineId; use chrono::Utc; use eyre::eyre; use libredfish::{Redfish, SystemPowerControl}; @@ -67,6 +66,21 @@ pub(super) enum PollingBiosSetupOutcome { Failed { failure: String }, } +/// Outcome of entering HandleBiosJobFailure recovery, or failing once the budget is exhausted. +enum BiosRecoveryAttemptOutcome { + Continue(BiosConfigInfo), + Failed { failure: String }, +} + +impl From for BiosConfigJobAdvanceOutcome { + fn from(outcome: BiosRecoveryAttemptOutcome) -> Self { + match outcome { + BiosRecoveryAttemptOutcome::Continue(info) => Self::Continue(info), + BiosRecoveryAttemptOutcome::Failed { failure } => Self::Failed { failure }, + } + } +} + /// Max configure_host_bios retry cycles through HandleBiosJobFailure recovery (matches boot-order retry budget). const MAX_BIOS_CONFIG_RETRIES: u32 = 3; @@ -134,9 +148,9 @@ pub(super) async fn configure_host_bios( Ok(jid) => jid, }; - if let Some(job_id) = &bios_job_id { + if let Some(job_id) = bios_job_id { return Ok(BiosConfigOutcome::WaitingForBiosJob(BiosConfigInfo { - bios_job_id: Some(job_id.clone()), + bios_job_id: Some(job_id), bios_config_state: BiosConfigState::WaitForBiosJobScheduled, retry_count, })); @@ -173,15 +187,11 @@ pub(super) async fn advance_bios_config_job( ) { let failure = format!("BIOS job {} failed with state {job_state:#?}", job_id); tracing::warn!( - "{} for {}, transitioning to HandleBiosJobFailure (power cycle + BMC reset)", - failure, - mh_snapshot.host_snapshot.id + %failure, + "transitioning to HandleBiosJobFailure (power cycle + BMC reset)" ); - return try_bios_recovery_attempt( - info.retry_count, - info.bios_job_id.clone(), - failure, - &mh_snapshot.host_snapshot.id, + return Ok( + try_bios_recovery_attempt(info.retry_count, info.bios_job_id, failure)?.into(), ); } if !matches!(job_state, libredfish::JobState::Scheduled) { @@ -191,7 +201,7 @@ pub(super) async fn advance_bios_config_job( ))); } Ok(BiosConfigJobAdvanceOutcome::Continue(BiosConfigInfo { - bios_job_id: info.bios_job_id.clone(), + bios_job_id: info.bios_job_id, bios_config_state: BiosConfigState::RebootHost, retry_count: info.retry_count, })) @@ -199,7 +209,7 @@ pub(super) async fn advance_bios_config_job( BiosConfigState::RebootHost => { handler_host_power_control(mh_snapshot, ctx, SystemPowerControl::ForceRestart).await?; Ok(BiosConfigJobAdvanceOutcome::Continue(BiosConfigInfo { - bios_job_id: info.bios_job_id.clone(), + bios_job_id: info.bios_job_id, bios_config_state: BiosConfigState::WaitForBiosJobCompletion, retry_count: info.retry_count, })) @@ -229,16 +239,15 @@ pub(super) async fn advance_bios_config_job( job_id, minutes_since_state_change, e ); tracing::warn!( - "{} for {}, transitioning to HandleBiosJobFailure (power cycle + BMC reset)", - failure, - mh_snapshot.host_snapshot.id + %failure, + "transitioning to HandleBiosJobFailure (power cycle + BMC reset)" ); - return try_bios_recovery_attempt( + return Ok(try_bios_recovery_attempt( info.retry_count, - info.bios_job_id.clone(), + info.bios_job_id, failure, - &mh_snapshot.host_snapshot.id, - ); + )? + .into()); } }; match job_state { @@ -250,15 +259,12 @@ pub(super) async fn advance_bios_config_job( job_id ); tracing::warn!( - "{} for {}, transitioning to HandleBiosJobFailure (power cycle + BMC reset)", - failure, - mh_snapshot.host_snapshot.id, + %failure, + "transitioning to HandleBiosJobFailure (power cycle + BMC reset)" ); - try_bios_recovery_attempt( - info.retry_count, - info.bios_job_id.clone(), - failure, - &mh_snapshot.host_snapshot.id, + Ok( + try_bios_recovery_attempt(info.retry_count, info.bios_job_id, failure)? + .into(), ) } _ => Err(StateHandlerError::GenericError(eyre!( @@ -282,23 +288,21 @@ pub(super) async fn advance_bios_config_job( handler_host_power_control(mh_snapshot, ctx, SystemPowerControl::ForceOff) .await?; return Ok(BiosConfigJobAdvanceOutcome::Wait(format!( - "HandleBiosJobFailure: waiting for {} to power down; current power state: {current_power_state}; failure: {}", - mh_snapshot.host_snapshot.id, failure + "HandleBiosJobFailure: waiting for power down; current power state: {current_power_state}; failure: {failure}" ))); } tracing::info!( - "HandleBiosJobFailure: Resetting BMC for {} after BIOS job failure: {}", - mh_snapshot.host_snapshot.id, - failure + %failure, + "HandleBiosJobFailure: resetting BMC after BIOS job failure" ); redfish_client .bmc_reset() .await .map_err(|e| redfish_error("bmc_reset", e))?; Ok(BiosConfigJobAdvanceOutcome::Continue(BiosConfigInfo { - bios_job_id: info.bios_job_id.clone(), + bios_job_id: info.bios_job_id, bios_config_state: BiosConfigState::HandleBiosJobFailure { - failure: failure.clone(), + failure, power_state: PowerState::On, }, retry_count: info.retry_count, @@ -319,19 +323,16 @@ pub(super) async fn advance_bios_config_job( .power_down_wait; if Utc::now().signed_duration_since(basetime) < power_down_wait { return Ok(BiosConfigJobAdvanceOutcome::Wait(format!( - "HandleBiosJobFailure: waiting for BMC to come back online for {}; failure: {}", - mh_snapshot.host_snapshot.id, failure + "HandleBiosJobFailure: waiting for BMC to come back online; failure: {failure}" ))); } handler_host_power_control(mh_snapshot, ctx, SystemPowerControl::On) .await?; return Ok(BiosConfigJobAdvanceOutcome::Wait(format!( - "HandleBiosJobFailure: powering on {} after BMC reset; failure: {}", - mh_snapshot.host_snapshot.id, failure + "HandleBiosJobFailure: powering on after BMC reset; failure: {failure}" ))); } tracing::info!( - machine_id = %mh_snapshot.host_snapshot.id, retry_count = info.retry_count, "HandleBiosJobFailure: BMC reset complete; re-running platform configuration (machine_setup) — power cycle does not apply BIOS attributes", ); @@ -353,23 +354,21 @@ fn try_bios_recovery_attempt( retry_count: u32, bios_job_id: Option, failure: String, - host_id: &MachineId, -) -> Result { +) -> Result { if retry_count >= MAX_BIOS_CONFIG_RETRIES { tracing::warn!( - machine_id = %host_id, retry_count, max_retries = MAX_BIOS_CONFIG_RETRIES, %failure, "BIOS recovery budget exhausted; moving host to Failed for manual remediation" ); - return Ok(BiosConfigJobAdvanceOutcome::Failed { + return Ok(BiosRecoveryAttemptOutcome::Failed { failure: format!( "{failure} (automated BIOS recovery exhausted after {MAX_BIOS_CONFIG_RETRIES} attempts)" ), }); } - Ok(BiosConfigJobAdvanceOutcome::Continue(BiosConfigInfo { + Ok(BiosRecoveryAttemptOutcome::Continue(BiosConfigInfo { bios_job_id, bios_config_state: BiosConfigState::HandleBiosJobFailure { failure, @@ -386,23 +385,17 @@ pub(super) async fn advance_polling_bios_setup( ) -> Result { let boot_interface_mac = mh_snapshot.boot_interface_mac().map(|m| m.to_string()); let stuck_for = mh_snapshot.host_snapshot.state.version.since_state_change(); - let host_id = &mh_snapshot.host_snapshot.id; match redfish_client .is_bios_setup(boot_interface_mac.as_deref()) .await { Ok(true) => { - tracing::info!( - machine_id = %host_id, - "BIOS setup verified successfully" - ); + tracing::info!("BIOS setup verified successfully"); Ok(PollingBiosSetupOutcome::Verified) } Ok(false) => { - if let Some(outcome) = - escalate_stuck_polling_bios_setup(retry_count, stuck_for, host_id)? - { + if let Some(outcome) = escalate_stuck_polling_bios_setup(retry_count, stuck_for)? { return Ok(outcome); } Ok(PollingBiosSetupOutcome::Wait(format!( @@ -411,7 +404,6 @@ pub(super) async fn advance_polling_bios_setup( } Err(e) => { tracing::warn!( - machine_id = %host_id, error = %e, "Failed to check BIOS setup status, will retry" ); @@ -425,14 +417,12 @@ pub(super) async fn advance_polling_bios_setup( fn escalate_stuck_polling_bios_setup( retry_count: u32, stuck_for: chrono::Duration, - host_id: &MachineId, ) -> Result, StateHandlerError> { if stuck_for <= POLLING_BIOS_SETUP_STUCK_THRESHOLD { return Ok(None); } tracing::warn!( - machine_id = %host_id, ?stuck_for, retry_count, "PollingBiosSetup stuck; attempting HandleBiosJobFailure recovery (power-off + BMC reset + power-on + re-run machine_setup)" @@ -444,14 +434,13 @@ fn escalate_stuck_polling_bios_setup( ); Ok(Some( - match try_bios_recovery_attempt(retry_count, None, failure, host_id)? { - BiosConfigJobAdvanceOutcome::Continue(info) => { + match try_bios_recovery_attempt(retry_count, None, failure)? { + BiosRecoveryAttemptOutcome::Continue(info) => { PollingBiosSetupOutcome::EnterRecovery(info) } - BiosConfigJobAdvanceOutcome::Failed { failure } => { + BiosRecoveryAttemptOutcome::Failed { failure } => { PollingBiosSetupOutcome::Failed { failure } } - _ => unreachable!("recovery attempt only returns Continue or Failed"), }, )) } @@ -459,7 +448,6 @@ fn escalate_stuck_polling_bios_setup( pub(super) async fn handle_bios_setup_failed_recovery( ctx: &mut StateHandlerContext<'_, MachineStateHandlerContextObjects>, mh_snapshot: &ManagedHostStateSnapshot, - machine_id: &MachineId, recovered_state: ManagedHostState, ) -> Result, StateHandlerError> { let boot_interface_mac = mh_snapshot.boot_interface_mac().map(|m| m.to_string()); @@ -472,16 +460,12 @@ pub(super) async fn handle_bios_setup_failed_recovery( .await { Ok(true) => { - tracing::info!( - machine_id = %machine_id, - "BIOS setup verified after manual remediation; resuming state machine" - ); + tracing::info!("BIOS setup verified after manual remediation; resuming state machine"); Ok(StateHandlerOutcome::transition(recovered_state)) } Ok(false) => Ok(StateHandlerOutcome::do_nothing()), Err(e) => { tracing::warn!( - machine_id = %machine_id, error = %e, "Failed to check BIOS setup status, will retry" ); @@ -492,29 +476,18 @@ pub(super) async fn handle_bios_setup_failed_recovery( #[cfg(test)] mod tests { - use std::str::FromStr; - use super::*; #[test] fn escalate_stuck_polling_bios_setup_not_triggered_before_threshold() { - let host_id = - MachineId::from_str("fm100ht7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng") - .unwrap(); - - let result = - escalate_stuck_polling_bios_setup(0, chrono::Duration::minutes(10), &host_id).unwrap(); + let result = escalate_stuck_polling_bios_setup(0, chrono::Duration::minutes(10)).unwrap(); assert!(result.is_none()); } #[test] fn escalate_stuck_polling_bios_setup_enters_handle_bios_job_failure_when_stuck() { - let host_id = - MachineId::from_str("fm100ht7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng") - .unwrap(); - - let info = escalate_stuck_polling_bios_setup(0, chrono::Duration::minutes(16), &host_id) + let info = escalate_stuck_polling_bios_setup(0, chrono::Duration::minutes(16)) .unwrap() .expect("recovery should be triggered"); let PollingBiosSetupOutcome::EnterRecovery(info) = info else { @@ -533,14 +506,9 @@ mod tests { #[test] fn escalate_stuck_polling_bios_setup_respects_shared_retry_budget() { - let host_id = - MachineId::from_str("fm100ht7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng") - .unwrap(); - let result = escalate_stuck_polling_bios_setup( MAX_BIOS_CONFIG_RETRIES, chrono::Duration::minutes(20), - &host_id, ) .unwrap() .expect("expected Failed outcome"); @@ -550,31 +518,21 @@ mod tests { #[test] fn try_bios_recovery_attempt_fails_when_budget_exhausted() { - let host_id = - MachineId::from_str("fm100ht7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng") - .unwrap(); - let result = try_bios_recovery_attempt( MAX_BIOS_CONFIG_RETRIES, Some("job-1".to_string()), "job failed".to_string(), - &host_id, ) .unwrap(); - assert!(matches!(result, BiosConfigJobAdvanceOutcome::Failed { .. })); + assert!(matches!(result, BiosRecoveryAttemptOutcome::Failed { .. })); } #[test] fn escalate_stuck_polling_bios_setup_allows_last_budgeted_attempt() { - let host_id = - MachineId::from_str("fm100ht7blqjsadm2uuh3qqbf1h7k8pmf47um6v9uckrg7l03po8mhqgvng") - .unwrap(); - let outcome = escalate_stuck_polling_bios_setup( MAX_BIOS_CONFIG_RETRIES - 1, chrono::Duration::minutes(20), - &host_id, ) .unwrap() .expect("last budgeted recovery should be allowed"); From 12f1fbafb11a594b982daf4c657a95b3cb4bd32d Mon Sep 17 00:00:00 2001 From: Krish Dandiwala Date: Wed, 27 May 2026 21:32:32 +0000 Subject: [PATCH 5/7] feat: make bios config retry and polling timeout configurable Signed-off-by: Krish Dandiwala --- crates/api/src/cfg/README.md | 2 + crates/api/src/cfg/file.rs | 12 +++ .../machine/config/controller.rs | 23 +++++ .../src/state_controller/machine/handler.rs | 11 ++- .../machine/handler/bios_config.rs | 92 ++++++++++++------- .../api/src/tests/common/api_fixtures/mod.rs | 4 + 6 files changed, 109 insertions(+), 35 deletions(-) diff --git a/crates/api/src/cfg/README.md b/crates/api/src/cfg/README.md index defc651db8..0c2f2fa387 100644 --- a/crates/api/src/cfg/README.md +++ b/crates/api/src/cfg/README.md @@ -192,6 +192,8 @@ Extends `StateControllerConfig` with: | `dpu_up_threshold` | `Duration` | `5m` | Max time without DPU health report before assuming it's down. | | `scout_reporting_timeout` | `Duration` | `5m` | Duration without scout report before host is unhealthy. | | `uefi_boot_wait` | `Duration` | `5m` | Wait time for UEFI boot completion after host reboot. | +| `max_bios_config_retries` | `u32` | `3` | Max HandleBiosJobFailure recovery cycles during BIOS configuration. | +| `polling_bios_setup_stuck_threshold` | `Duration` | `15m` | Time in PollingBiosSetup with `is_bios_setup == false` before recovery escalation. | ### `NetworkSegmentStateControllerConfig` diff --git a/crates/api/src/cfg/file.rs b/crates/api/src/cfg/file.rs index c41626dd1e..97a774b56f 100644 --- a/crates/api/src/cfg/file.rs +++ b/crates/api/src/cfg/file.rs @@ -2352,6 +2352,8 @@ mod tests { dpu_up_threshold: Duration::weeks(1), scout_reporting_timeout: Duration::minutes(5), uefi_boot_wait: Duration::minutes(5), + max_bios_config_retries: 3, + polling_bios_setup_stuck_threshold: Duration::minutes(15), }; let config_str = serde_json::to_string(&input).unwrap(); @@ -2395,6 +2397,8 @@ mod tests { dpu_up_threshold: Duration::weeks(1), scout_reporting_timeout: Duration::minutes(5), uefi_boot_wait: Duration::minutes(5), + max_bios_config_retries: 3, + polling_bios_setup_stuck_threshold: Duration::minutes(15), } ); } @@ -2415,6 +2419,8 @@ mod tests { dpu_up_threshold: Duration::weeks(1), scout_reporting_timeout: Duration::minutes(5), uefi_boot_wait: Duration::minutes(5), + max_bios_config_retries: 3, + polling_bios_setup_stuck_threshold: Duration::minutes(15), } ); } @@ -2707,6 +2713,8 @@ mod tests { dpu_up_threshold: Duration::minutes(77), scout_reporting_timeout: Duration::minutes(5), uefi_boot_wait: Duration::minutes(5), + max_bios_config_retries: 3, + polling_bios_setup_stuck_threshold: Duration::minutes(15), } ); assert_eq!( @@ -2892,6 +2900,8 @@ mod tests { dpu_up_threshold: Duration::minutes(33), scout_reporting_timeout: Duration::minutes(20), uefi_boot_wait: Duration::minutes(5), + max_bios_config_retries: 3, + polling_bios_setup_stuck_threshold: Duration::minutes(15), } ); assert_eq!( @@ -3201,6 +3211,8 @@ mod tests { dpu_up_threshold: Duration::minutes(77), scout_reporting_timeout: Duration::minutes(20), uefi_boot_wait: Duration::minutes(5), + max_bios_config_retries: 3, + polling_bios_setup_stuck_threshold: Duration::minutes(15), } ); assert_eq!( diff --git a/crates/api/src/state_controller/machine/config/controller.rs b/crates/api/src/state_controller/machine/config/controller.rs index d5acae4fe3..8af5c04e28 100644 --- a/crates/api/src/state_controller/machine/config/controller.rs +++ b/crates/api/src/state_controller/machine/config/controller.rs @@ -70,6 +70,17 @@ pub struct MachineStateControllerConfig { serialize_with = "as_duration" )] pub uefi_boot_wait: Duration, + /// Max configure_host_bios retry cycles through HandleBiosJobFailure recovery. + #[serde(default = "MachineStateControllerConfig::max_bios_config_retries_default")] + pub max_bios_config_retries: u32, + /// How long PollingBiosSetup may sit on Ok(false) before escalating into + /// HandleBiosJobFailure recovery. + #[serde( + default = "MachineStateControllerConfig::polling_bios_setup_stuck_threshold_default", + deserialize_with = "deserialize_duration_chrono", + serialize_with = "as_duration" + )] + pub polling_bios_setup_stuck_threshold: Duration, } impl MachineStateControllerConfig { @@ -96,6 +107,14 @@ impl MachineStateControllerConfig { pub fn uefi_boot_wait_default() -> Duration { Duration::minutes(5) } + + pub fn max_bios_config_retries_default() -> u32 { + 3 + } + + pub fn polling_bios_setup_stuck_threshold_default() -> Duration { + Duration::minutes(15) + } } impl Default for MachineStateControllerConfig { @@ -109,6 +128,10 @@ impl Default for MachineStateControllerConfig { scout_reporting_timeout: MachineStateControllerConfig::scout_reporting_timeout_default( ), uefi_boot_wait: MachineStateControllerConfig::uefi_boot_wait_default(), + max_bios_config_retries: MachineStateControllerConfig::max_bios_config_retries_default( + ), + polling_bios_setup_stuck_threshold: + MachineStateControllerConfig::polling_bios_setup_stuck_threshold_default(), } } } diff --git a/crates/api/src/state_controller/machine/handler.rs b/crates/api/src/state_controller/machine/handler.rs index 030bbe22ae..f9586ac44a 100644 --- a/crates/api/src/state_controller/machine/handler.rs +++ b/crates/api/src/state_controller/machine/handler.rs @@ -4952,11 +4952,11 @@ impl StateHandler for HostMachineStateHandler { .services .create_redfish_client_from_machine(&mh_snapshot.host_snapshot) .await?; - match advance_polling_bios_setup( redfish_client.as_ref(), mh_snapshot, *retry_count, + &ctx.services.site_config.machine_state_controller, ) .await? { @@ -10085,8 +10085,13 @@ async fn handle_instance_host_platform_config( }, }; - match advance_polling_bios_setup(redfish_client.as_ref(), mh_snapshot, retry_count) - .await? + match advance_polling_bios_setup( + redfish_client.as_ref(), + mh_snapshot, + retry_count, + &ctx.services.site_config.machine_state_controller, + ) + .await? { PollingBiosSetupOutcome::Verified => next_instance_state, PollingBiosSetupOutcome::EnterRecovery(bios_config_info) => { diff --git a/crates/api/src/state_controller/machine/handler/bios_config.rs b/crates/api/src/state_controller/machine/handler/bios_config.rs index bfb2b52586..40f437e590 100644 --- a/crates/api/src/state_controller/machine/handler/bios_config.rs +++ b/crates/api/src/state_controller/machine/handler/bios_config.rs @@ -17,6 +17,7 @@ //! BIOS configuration: machine_setup, Dell job wait/recovery, and PollingBiosSetup escalation. +use carbide_redfish::libredfish::error::state_handler_redfish_error as redfish_error; use chrono::Utc; use eyre::eyre; use libredfish::{Redfish, SystemPowerControl}; @@ -28,7 +29,7 @@ use super::{ ReachabilityParams, RebootStatus, call_machine_setup_and_handle_no_dpu_error, handler_host_power_control, trigger_reboot_if_needed, }; -use crate::state_controller::external_service_error::redfish_error; +use crate::state_controller::machine::config::MachineStateControllerConfig; use crate::state_controller::machine::context::MachineStateHandlerContextObjects; use crate::state_controller::state_handler::{ StateHandlerContext, StateHandlerError, StateHandlerOutcome, @@ -81,16 +82,6 @@ impl From for BiosConfigJobAdvanceOutcome { } } -/// Max configure_host_bios retry cycles through HandleBiosJobFailure recovery (matches boot-order retry budget). -const MAX_BIOS_CONFIG_RETRIES: u32 = 3; - -/// How long PollingBiosSetup may sit on Ok(false) before escalating into HandleBiosJobFailure recovery. -/// -/// From `machine_state_history` (4 sites, ~4500 samples): HostInit/PollingBiosSetup usually -/// finishes within ~11 min p95; wedged hosts sit 90+ min. 15 min keeps the first recovery attempt -/// inside the 30-min HOST_INIT SLA. -const POLLING_BIOS_SETUP_STUCK_THRESHOLD: chrono::Duration = chrono::Duration::minutes(15); - pub(super) async fn configure_host_bios( ctx: &mut StateHandlerContext<'_, MachineStateHandlerContextObjects>, reachability_params: &ReachabilityParams, @@ -168,6 +159,7 @@ pub(super) async fn advance_bios_config_job( mh_snapshot: &ManagedHostStateSnapshot, info: BiosConfigInfo, ) -> Result { + let machine_controller_config = &ctx.services.site_config.machine_state_controller; match info.bios_config_state { BiosConfigState::WaitForBiosJobScheduled => { let job_id = info.bios_job_id.as_ref().ok_or_else(|| { @@ -190,9 +182,13 @@ pub(super) async fn advance_bios_config_job( %failure, "transitioning to HandleBiosJobFailure (power cycle + BMC reset)" ); - return Ok( - try_bios_recovery_attempt(info.retry_count, info.bios_job_id, failure)?.into(), - ); + return Ok(try_bios_recovery_attempt( + machine_controller_config, + info.retry_count, + info.bios_job_id, + failure, + )? + .into()); } if !matches!(job_state, libredfish::JobState::Scheduled) { return Err(StateHandlerError::GenericError(eyre!( @@ -243,6 +239,7 @@ pub(super) async fn advance_bios_config_job( "transitioning to HandleBiosJobFailure (power cycle + BMC reset)" ); return Ok(try_bios_recovery_attempt( + machine_controller_config, info.retry_count, info.bios_job_id, failure, @@ -262,10 +259,13 @@ pub(super) async fn advance_bios_config_job( %failure, "transitioning to HandleBiosJobFailure (power cycle + BMC reset)" ); - Ok( - try_bios_recovery_attempt(info.retry_count, info.bios_job_id, failure)? - .into(), - ) + Ok(try_bios_recovery_attempt( + machine_controller_config, + info.retry_count, + info.bios_job_id, + failure, + )? + .into()) } _ => Err(StateHandlerError::GenericError(eyre!( "waiting for BIOS job {:#?} to complete; current state: {job_state:#?}", @@ -351,20 +351,22 @@ pub(super) async fn advance_bios_config_job( /// Enter HandleBiosJobFailure recovery, or move to Failed when budget is exhausted. fn try_bios_recovery_attempt( + machine_controller_config: &MachineStateControllerConfig, retry_count: u32, bios_job_id: Option, failure: String, ) -> Result { - if retry_count >= MAX_BIOS_CONFIG_RETRIES { + if retry_count >= machine_controller_config.max_bios_config_retries { tracing::warn!( retry_count, - max_retries = MAX_BIOS_CONFIG_RETRIES, + max_retries = machine_controller_config.max_bios_config_retries, %failure, "BIOS recovery budget exhausted; moving host to Failed for manual remediation" ); return Ok(BiosRecoveryAttemptOutcome::Failed { failure: format!( - "{failure} (automated BIOS recovery exhausted after {MAX_BIOS_CONFIG_RETRIES} attempts)" + "{failure} (automated BIOS recovery exhausted after {} attempts)", + machine_controller_config.max_bios_config_retries ), }); } @@ -382,6 +384,7 @@ pub(super) async fn advance_polling_bios_setup( redfish_client: &dyn Redfish, mh_snapshot: &ManagedHostStateSnapshot, retry_count: u32, + machine_controller_config: &MachineStateControllerConfig, ) -> Result { let boot_interface_mac = mh_snapshot.boot_interface_mac().map(|m| m.to_string()); let stuck_for = mh_snapshot.host_snapshot.state.version.since_state_change(); @@ -395,7 +398,11 @@ pub(super) async fn advance_polling_bios_setup( Ok(PollingBiosSetupOutcome::Verified) } Ok(false) => { - if let Some(outcome) = escalate_stuck_polling_bios_setup(retry_count, stuck_for)? { + if let Some(outcome) = escalate_stuck_polling_bios_setup( + machine_controller_config, + retry_count, + stuck_for, + )? { return Ok(outcome); } Ok(PollingBiosSetupOutcome::Wait(format!( @@ -415,10 +422,11 @@ pub(super) async fn advance_polling_bios_setup( } fn escalate_stuck_polling_bios_setup( + machine_controller_config: &MachineStateControllerConfig, retry_count: u32, stuck_for: chrono::Duration, ) -> Result, StateHandlerError> { - if stuck_for <= POLLING_BIOS_SETUP_STUCK_THRESHOLD { + if stuck_for <= machine_controller_config.polling_bios_setup_stuck_threshold { return Ok(None); } @@ -434,7 +442,7 @@ fn escalate_stuck_polling_bios_setup( ); Ok(Some( - match try_bios_recovery_attempt(retry_count, None, failure)? { + match try_bios_recovery_attempt(machine_controller_config, retry_count, None, failure)? { BiosRecoveryAttemptOutcome::Continue(info) => { PollingBiosSetupOutcome::EnterRecovery(info) } @@ -480,16 +488,27 @@ mod tests { #[test] fn escalate_stuck_polling_bios_setup_not_triggered_before_threshold() { - let result = escalate_stuck_polling_bios_setup(0, chrono::Duration::minutes(10)).unwrap(); + let machine_controller_config = MachineStateControllerConfig::default(); + let result = escalate_stuck_polling_bios_setup( + &machine_controller_config, + 0, + chrono::Duration::minutes(10), + ) + .unwrap(); assert!(result.is_none()); } #[test] fn escalate_stuck_polling_bios_setup_enters_handle_bios_job_failure_when_stuck() { - let info = escalate_stuck_polling_bios_setup(0, chrono::Duration::minutes(16)) - .unwrap() - .expect("recovery should be triggered"); + let machine_controller_config = MachineStateControllerConfig::default(); + let info = escalate_stuck_polling_bios_setup( + &machine_controller_config, + 0, + chrono::Duration::minutes(16), + ) + .unwrap() + .expect("recovery should be triggered"); let PollingBiosSetupOutcome::EnterRecovery(info) = info else { panic!("expected EnterRecovery"); }; @@ -506,8 +525,10 @@ mod tests { #[test] fn escalate_stuck_polling_bios_setup_respects_shared_retry_budget() { + let machine_controller_config = MachineStateControllerConfig::default(); let result = escalate_stuck_polling_bios_setup( - MAX_BIOS_CONFIG_RETRIES, + &machine_controller_config, + machine_controller_config.max_bios_config_retries, chrono::Duration::minutes(20), ) .unwrap() @@ -518,8 +539,10 @@ mod tests { #[test] fn try_bios_recovery_attempt_fails_when_budget_exhausted() { + let machine_controller_config = MachineStateControllerConfig::default(); let result = try_bios_recovery_attempt( - MAX_BIOS_CONFIG_RETRIES, + &machine_controller_config, + machine_controller_config.max_bios_config_retries, Some("job-1".to_string()), "job failed".to_string(), ) @@ -530,8 +553,10 @@ mod tests { #[test] fn escalate_stuck_polling_bios_setup_allows_last_budgeted_attempt() { + let machine_controller_config = MachineStateControllerConfig::default(); let outcome = escalate_stuck_polling_bios_setup( - MAX_BIOS_CONFIG_RETRIES - 1, + &machine_controller_config, + machine_controller_config.max_bios_config_retries - 1, chrono::Duration::minutes(20), ) .unwrap() @@ -540,6 +565,9 @@ mod tests { let PollingBiosSetupOutcome::EnterRecovery(info) = outcome else { panic!("expected EnterRecovery"); }; - assert_eq!(info.retry_count, MAX_BIOS_CONFIG_RETRIES); + assert_eq!( + info.retry_count, + machine_controller_config.max_bios_config_retries + ); } } diff --git a/crates/api/src/tests/common/api_fixtures/mod.rs b/crates/api/src/tests/common/api_fixtures/mod.rs index 3b5a7b6654..9453de56e6 100644 --- a/crates/api/src/tests/common/api_fixtures/mod.rs +++ b/crates/api/src/tests/common/api_fixtures/mod.rs @@ -1241,6 +1241,10 @@ pub fn get_config() -> CarbideConfig { controller: StateControllerConfig::default(), scout_reporting_timeout: Duration::weeks(52), uefi_boot_wait: Duration::seconds(0), + max_bios_config_retries: + MachineStateControllerConfig::max_bios_config_retries_default(), + polling_bios_setup_stuck_threshold: + MachineStateControllerConfig::polling_bios_setup_stuck_threshold_default(), }, network_segment_state_controller: NetworkSegmentStateControllerConfig { network_segment_drain_time: Duration::seconds(2), From 1849ca087bb53adf018cc15a6cfbfc1ebd4a0fa2 Mon Sep 17 00:00:00 2001 From: Krish Dandiwala Date: Wed, 27 May 2026 21:45:23 +0000 Subject: [PATCH 6/7] chore: fmt Signed-off-by: Krish Dandiwala --- crates/api/src/tests/common/api_fixtures/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/api/src/tests/common/api_fixtures/mod.rs b/crates/api/src/tests/common/api_fixtures/mod.rs index 14fd22a2d4..6484de7311 100644 --- a/crates/api/src/tests/common/api_fixtures/mod.rs +++ b/crates/api/src/tests/common/api_fixtures/mod.rs @@ -1241,8 +1241,8 @@ pub fn get_config() -> CarbideConfig { controller: StateControllerConfig::default(), scout_reporting_timeout: Duration::weeks(52), uefi_boot_wait: Duration::seconds(0), - max_bios_config_retries: - MachineStateControllerConfig::max_bios_config_retries_default(), + max_bios_config_retries: MachineStateControllerConfig::max_bios_config_retries_default( + ), polling_bios_setup_stuck_threshold: MachineStateControllerConfig::polling_bios_setup_stuck_threshold_default(), }, From 9fb9b61deac4ec41b9b9f417cad0d0e1374c67e9 Mon Sep 17 00:00:00 2001 From: Krish Dandiwala Date: Wed, 27 May 2026 22:08:16 +0000 Subject: [PATCH 7/7] chore: more cleanup :( Signed-off-by: Krish Dandiwala --- crates/api/src/state_controller/machine/handler.rs | 2 +- .../api/src/state_controller/machine/handler/bios_config.rs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/api/src/state_controller/machine/handler.rs b/crates/api/src/state_controller/machine/handler.rs index a0934b25c2..f6bde21687 100644 --- a/crates/api/src/state_controller/machine/handler.rs +++ b/crates/api/src/state_controller/machine/handler.rs @@ -110,7 +110,7 @@ use crate::state_controller::machine::{ MeasuringOutcome, get_measuring_prerequisites, handle_measuring_state, }; -mod attestation; +pub mod attestation; mod bios_config; mod dpf; mod helpers; diff --git a/crates/api/src/state_controller/machine/handler/bios_config.rs b/crates/api/src/state_controller/machine/handler/bios_config.rs index 40f437e590..143fbf7fc9 100644 --- a/crates/api/src/state_controller/machine/handler/bios_config.rs +++ b/crates/api/src/state_controller/machine/handler/bios_config.rs @@ -24,6 +24,9 @@ use libredfish::{Redfish, SystemPowerControl}; use model::machine::{ BiosConfigInfo, BiosConfigState, ManagedHostState, ManagedHostStateSnapshot, PowerState, }; +use state_controller::state_handler::{ + StateHandlerContext, StateHandlerError, StateHandlerOutcome, +}; use super::{ ReachabilityParams, RebootStatus, call_machine_setup_and_handle_no_dpu_error, @@ -31,9 +34,6 @@ use super::{ }; use crate::state_controller::machine::config::MachineStateControllerConfig; use crate::state_controller::machine::context::MachineStateHandlerContextObjects; -use crate::state_controller::state_handler::{ - StateHandlerContext, StateHandlerError, StateHandlerOutcome, -}; /// Outcome of configure_host_bios function. pub(super) enum BiosConfigOutcome {