diff --git a/crates/api-model/src/machine/mod.rs b/crates/api-model/src/machine/mod.rs index 1442cbbfa0..d87d80cfea 100644 --- a/crates/api-model/src/machine/mod.rs +++ b/crates/api-model/src/machine/mod.rs @@ -1509,6 +1509,8 @@ pub enum FailureCause { DpfProvisioning { err: String }, SpdmAttestationFailed { err: String }, + + BiosSetupFailed { err: String }, } #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] @@ -1660,7 +1662,10 @@ pub enum MachineState { WaitingForBiosJob { bios_config_info: BiosConfigInfo, }, - PollingBiosSetup, + PollingBiosSetup { + #[serde(default)] + retry_count: u32, + }, SetBootOrder { set_boot_order_info: Option, }, @@ -1716,6 +1721,10 @@ pub enum UefiSetupState { /// Tracks progress waiting for the Dell BIOS config job (from machine_setup PATCH) to complete /// before configuring boot order. Same pattern as SetBootOrderInfo / SetBootOrderState. +/// +/// `bios_job_id` is `Some` while polling a vendor BIOS job (e.g. Dell). `None` only during +/// `HandleBiosJobFailure` recovery from stuck PollingBiosSetup; non-Dell hosts reboot in +/// `configure_host_bios` and never enter job-polling substates. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] #[serde(rename_all = "lowercase")] pub struct BiosConfigInfo { @@ -1929,7 +1938,10 @@ pub enum HostPlatformConfigurationState { WaitingForBiosJob { bios_config_info: BiosConfigInfo, }, - PollingBiosSetup, + PollingBiosSetup { + #[serde(default)] + retry_count: u32, + }, SetBootOrder { set_boot_order_info: SetBootOrderInfo, }, @@ -2043,6 +2055,7 @@ impl Display for FailureCause { FailureCause::SpdmAttestationFailed { .. } => { write!(f, "SpdmAttestationFailed") } + FailureCause::BiosSetupFailed { .. 
} => write!(f, "BiosSetupFailed"), } } } @@ -2867,7 +2880,38 @@ mod tests { assert_eq!( deserialized, ManagedHostState::HostInit { - machine_state: MachineState::PollingBiosSetup, + machine_state: MachineState::PollingBiosSetup { retry_count: 0 }, + } + ); + } + + #[test] + fn test_json_deserialize_polling_bios_setup_with_retry_count() { + let serialized = + r#"{"state":"hostinit","machine_state":{"state":"pollingbiossetup","retry_count":2}}"#; + let deserialized: ManagedHostState = serde_json::from_str(serialized).unwrap(); + + assert_eq!( + deserialized, + ManagedHostState::HostInit { + machine_state: MachineState::PollingBiosSetup { retry_count: 2 }, + } + ); + } + + #[test] + fn test_json_deserialize_host_platform_configuration_polling_bios_setup_legacy() { + let serialized = r#"{"state":"assigned","instance_state":{"state":"hostplatformconfiguration","platform_config_state":{"state":"pollingbiossetup"}}}"#; + let deserialized: ManagedHostState = serde_json::from_str(serialized).unwrap(); + + assert_eq!( + deserialized, + ManagedHostState::Assigned { + instance_state: InstanceState::HostPlatformConfiguration { + platform_config_state: HostPlatformConfigurationState::PollingBiosSetup { + retry_count: 0, + }, + }, } ); } diff --git a/crates/api/src/cfg/README.md b/crates/api/src/cfg/README.md index defc651db8..0c2f2fa387 100644 --- a/crates/api/src/cfg/README.md +++ b/crates/api/src/cfg/README.md @@ -192,6 +192,8 @@ Extends `StateControllerConfig` with: | `dpu_up_threshold` | `Duration` | `5m` | Max time without DPU health report before assuming it's down. | | `scout_reporting_timeout` | `Duration` | `5m` | Duration without scout report before host is unhealthy. | | `uefi_boot_wait` | `Duration` | `5m` | Wait time for UEFI boot completion after host reboot. | +| `max_bios_config_retries` | `u32` | `3` | Max HandleBiosJobFailure recovery cycles during BIOS configuration. 
| +| `polling_bios_setup_stuck_threshold` | `Duration` | `15m` | Time in PollingBiosSetup with `is_bios_setup == false` before recovery escalation. | ### `NetworkSegmentStateControllerConfig` diff --git a/crates/api/src/cfg/file.rs b/crates/api/src/cfg/file.rs index c41626dd1e..97a774b56f 100644 --- a/crates/api/src/cfg/file.rs +++ b/crates/api/src/cfg/file.rs @@ -2352,6 +2352,8 @@ mod tests { dpu_up_threshold: Duration::weeks(1), scout_reporting_timeout: Duration::minutes(5), uefi_boot_wait: Duration::minutes(5), + max_bios_config_retries: 3, + polling_bios_setup_stuck_threshold: Duration::minutes(15), }; let config_str = serde_json::to_string(&input).unwrap(); @@ -2395,6 +2397,8 @@ mod tests { dpu_up_threshold: Duration::weeks(1), scout_reporting_timeout: Duration::minutes(5), uefi_boot_wait: Duration::minutes(5), + max_bios_config_retries: 3, + polling_bios_setup_stuck_threshold: Duration::minutes(15), } ); } @@ -2415,6 +2419,8 @@ mod tests { dpu_up_threshold: Duration::weeks(1), scout_reporting_timeout: Duration::minutes(5), uefi_boot_wait: Duration::minutes(5), + max_bios_config_retries: 3, + polling_bios_setup_stuck_threshold: Duration::minutes(15), } ); } @@ -2707,6 +2713,8 @@ mod tests { dpu_up_threshold: Duration::minutes(77), scout_reporting_timeout: Duration::minutes(5), uefi_boot_wait: Duration::minutes(5), + max_bios_config_retries: 3, + polling_bios_setup_stuck_threshold: Duration::minutes(15), } ); assert_eq!( @@ -2892,6 +2900,8 @@ mod tests { dpu_up_threshold: Duration::minutes(33), scout_reporting_timeout: Duration::minutes(20), uefi_boot_wait: Duration::minutes(5), + max_bios_config_retries: 3, + polling_bios_setup_stuck_threshold: Duration::minutes(15), } ); assert_eq!( @@ -3201,6 +3211,8 @@ mod tests { dpu_up_threshold: Duration::minutes(77), scout_reporting_timeout: Duration::minutes(20), uefi_boot_wait: Duration::minutes(5), + max_bios_config_retries: 3, + polling_bios_setup_stuck_threshold: Duration::minutes(15), } ); assert_eq!( diff --git 
a/crates/api/src/state_controller/machine/config/controller.rs b/crates/api/src/state_controller/machine/config/controller.rs index d5acae4fe3..8af5c04e28 100644 --- a/crates/api/src/state_controller/machine/config/controller.rs +++ b/crates/api/src/state_controller/machine/config/controller.rs @@ -70,6 +70,17 @@ pub struct MachineStateControllerConfig { serialize_with = "as_duration" )] pub uefi_boot_wait: Duration, + /// Max configure_host_bios retry cycles through HandleBiosJobFailure recovery. + #[serde(default = "MachineStateControllerConfig::max_bios_config_retries_default")] + pub max_bios_config_retries: u32, + /// How long PollingBiosSetup may sit on Ok(false) before escalating into + /// HandleBiosJobFailure recovery. + #[serde( + default = "MachineStateControllerConfig::polling_bios_setup_stuck_threshold_default", + deserialize_with = "deserialize_duration_chrono", + serialize_with = "as_duration" + )] + pub polling_bios_setup_stuck_threshold: Duration, } impl MachineStateControllerConfig { @@ -96,6 +107,14 @@ impl MachineStateControllerConfig { pub fn uefi_boot_wait_default() -> Duration { Duration::minutes(5) } + + pub fn max_bios_config_retries_default() -> u32 { + 3 + } + + pub fn polling_bios_setup_stuck_threshold_default() -> Duration { + Duration::minutes(15) + } } impl Default for MachineStateControllerConfig { @@ -109,6 +128,10 @@ impl Default for MachineStateControllerConfig { scout_reporting_timeout: MachineStateControllerConfig::scout_reporting_timeout_default( ), uefi_boot_wait: MachineStateControllerConfig::uefi_boot_wait_default(), + max_bios_config_retries: MachineStateControllerConfig::max_bios_config_retries_default( + ), + polling_bios_setup_stuck_threshold: + MachineStateControllerConfig::polling_bios_setup_stuck_threshold_default(), } } } diff --git a/crates/api/src/state_controller/machine/handler.rs b/crates/api/src/state_controller/machine/handler.rs index d850443fd8..f6bde21687 100644 --- 
a/crates/api/src/state_controller/machine/handler.rs +++ b/crates/api/src/state_controller/machine/handler.rs @@ -68,18 +68,17 @@ use model::machine::LockdownMode::{self, Enable}; use model::machine::infiniband::{IbConfigNotSyncedReason, ib_config_synced}; use model::machine::nvlink::nvlink_config_synced; use model::machine::{ - AttestationMode, BiosConfigInfo, BiosConfigState, BomValidating, BomValidatingContext, - CleanupContext, CleanupState, CreateBossVolumeContext, CreateBossVolumeState, - DpuDiscoveringState, DpuInitNextStateResolver, DpuInitState, FailureCause, FailureDetails, - FailureSource, HostPlatformConfigurationState, HostReprovisionState, InitialResetPhase, - InstallDpuOsState, InstanceNextStateResolver, InstanceState, LockdownInfo, LockdownState, - Machine, MachineLastRebootRequested, MachineLastRebootRequestedMode, MachineNextStateResolver, - MachineState, ManagedHostState, ManagedHostStateSnapshot, MeasuringState, - NetworkConfigUpdateState, NextStateBFBSupport, PerformPowerOperation, PowerDrainState, - PowerState, ReprovisionState, RetryInfo, SecureEraseBossContext, SecureEraseBossState, - SetBootOrderInfo, SetBootOrderState, SetSecureBootState, SpdmMeasuringState, StateMachineArea, - UefiSetupInfo, UefiSetupState, UnlockHostState, ValidationState, - dpf_based_dpu_provisioning_possible, get_display_ids, + AttestationMode, BomValidating, BomValidatingContext, CleanupContext, CleanupState, + CreateBossVolumeContext, CreateBossVolumeState, DpuDiscoveringState, DpuInitNextStateResolver, + DpuInitState, FailureCause, FailureDetails, FailureSource, HostPlatformConfigurationState, + HostReprovisionState, InitialResetPhase, InstallDpuOsState, InstanceNextStateResolver, + InstanceState, LockdownInfo, LockdownState, Machine, MachineLastRebootRequested, + MachineLastRebootRequestedMode, MachineNextStateResolver, MachineState, ManagedHostState, + ManagedHostStateSnapshot, MeasuringState, NetworkConfigUpdateState, NextStateBFBSupport, + PerformPowerOperation, 
PowerDrainState, PowerState, ReprovisionState, RetryInfo, + SecureEraseBossContext, SecureEraseBossState, SetBootOrderInfo, SetBootOrderState, + SetSecureBootState, SpdmMeasuringState, StateMachineArea, UefiSetupInfo, UefiSetupState, + UnlockHostState, ValidationState, dpf_based_dpu_provisioning_possible, get_display_ids, }; use model::power_manager::PowerHandlingOutcome; use model::resource_pool::common::CommonPools; @@ -112,11 +111,17 @@ use crate::state_controller::machine::{ }; pub mod attestation; +mod bios_config; mod dpf; mod helpers; mod machine_validation; mod power; mod sku; +use bios_config::{ + BiosConfigJobAdvanceOutcome, BiosConfigOutcome, PollingBiosSetupOutcome, + advance_bios_config_job, advance_polling_bios_setup, configure_host_bios, + handle_bios_setup_failed_recovery, +}; use helpers::{ DpuDiscoveringStateHelper, DpuInitStateHelper, ManagedHostStateHelper, NextState, ReprovisionStateHelper, all_equal, @@ -1413,6 +1418,18 @@ impl MachineStateHandler { None => Ok(StateHandlerOutcome::do_nothing()), } } + FailureCause::BiosSetupFailed { .. } if machine_id.machine_type().is_host() => { + let recovered = ManagedHostState::HostInit { + machine_state: MachineState::SetBootOrder { + set_boot_order_info: Some(SetBootOrderInfo { + set_boot_order_jid: None, + set_boot_order_state: SetBootOrderState::SetBootOrder, + retry_count: 0, + }), + }, + }; + handle_bios_setup_failed_recovery(ctx, mh_snapshot, recovered).await + } _ => { // Do nothing. // Handle error cause and decide how to recover if possible. @@ -4164,17 +4181,6 @@ pub struct RebootStatus { status: String, // what we did or are waiting for } -/// Outcome of configure_host_bios function. -enum BiosConfigOutcome { - Done, - WaitingForReboot(String), - /// Dell BIOS PATCH returned a job ID; wait for it to complete before boot order. - WaitingForBiosJob(BiosConfigInfo), -} - -/// Max configure_host_bios retry cycles through HandleBiosJobFailure recovery (matches boot-order retry budget). 
-const MAX_BIOS_CONFIG_RETRIES: u32 = 3; - /// Outcome of set_host_boot_order function. enum SetBootOrderOutcome { Continue(SetBootOrderInfo), @@ -4863,7 +4869,9 @@ impl StateHandler for HostMachineStateHandler { { BiosConfigOutcome::Done => Ok(StateHandlerOutcome::transition( ManagedHostState::HostInit { - machine_state: MachineState::PollingBiosSetup, + machine_state: MachineState::PollingBiosSetup { + retry_count: *retry_count, + }, }, )), BiosConfigOutcome::WaitingForBiosJob(bios_config_info) => Ok( @@ -4898,9 +4906,24 @@ impl StateHandler for HostMachineStateHandler { ), BiosConfigJobAdvanceOutcome::Done => Ok(StateHandlerOutcome::transition( ManagedHostState::HostInit { - machine_state: MachineState::PollingBiosSetup, + machine_state: MachineState::PollingBiosSetup { + retry_count: bios_config_info.retry_count, + }, }, )), + BiosConfigJobAdvanceOutcome::Failed { failure } => { + Ok(StateHandlerOutcome::transition(ManagedHostState::Failed { + details: FailureDetails { + cause: FailureCause::BiosSetupFailed { err: failure }, + failed_at: Utc::now(), + source: FailureSource::StateMachineArea( + StateMachineArea::HostInit, + ), + }, + machine_id: mh_snapshot.host_snapshot.id, + retry_count: 0, + })) + } BiosConfigJobAdvanceOutcome::RetryPlatformConfiguration { retry_count } => { Ok(StateHandlerOutcome::transition( ManagedHostState::HostInit { @@ -4915,7 +4938,7 @@ impl StateHandler for HostMachineStateHandler { } } } - MachineState::PollingBiosSetup => { + MachineState::PollingBiosSetup { retry_count } => { let next_state = ManagedHostState::HostInit { machine_state: MachineState::SetBootOrder { set_boot_order_info: Some(SetBootOrderInfo { @@ -4930,35 +4953,37 @@ impl StateHandler for HostMachineStateHandler { .services .create_redfish_client_from_machine(&mh_snapshot.host_snapshot) .await?; - - let boot_interface_mac = - mh_snapshot.boot_interface_mac().map(|m| m.to_string()); - - match redfish_client - .is_bios_setup(boot_interface_mac.as_deref()) - .await + 
match advance_polling_bios_setup( + redfish_client.as_ref(), + mh_snapshot, + *retry_count, + &ctx.services.site_config.machine_state_controller, + ) + .await? { - Ok(true) => { - tracing::info!( - machine_id = %mh_snapshot.host_snapshot.id, - "BIOS setup verified successfully" - ); + PollingBiosSetupOutcome::Verified => { Ok(StateHandlerOutcome::transition(next_state)) } - Ok(false) => Ok(StateHandlerOutcome::wait( - "Polling BIOS setup status, waiting for settings to be applied" - .to_string(), - )), - Err(e) => { - tracing::warn!( - machine_id = %mh_snapshot.host_snapshot.id, - error = %e, - "Failed to check BIOS setup status, will retry" - ); - Ok(StateHandlerOutcome::wait(format!( - "Failed to check BIOS setup status: {}. Will retry.", - e - ))) + PollingBiosSetupOutcome::EnterRecovery(bios_config_info) => Ok( + StateHandlerOutcome::transition(ManagedHostState::HostInit { + machine_state: MachineState::WaitingForBiosJob { bios_config_info }, + }), + ), + PollingBiosSetupOutcome::Failed { failure } => { + Ok(StateHandlerOutcome::transition(ManagedHostState::Failed { + details: FailureDetails { + cause: FailureCause::BiosSetupFailed { err: failure }, + failed_at: Utc::now(), + source: FailureSource::StateMachineArea( + StateMachineArea::HostInit, + ), + }, + machine_id: mh_snapshot.host_snapshot.id, + retry_count: 0, + })) + } + PollingBiosSetupOutcome::Wait(reason) => { + Ok(StateHandlerOutcome::wait(reason)) } } } @@ -6194,20 +6219,37 @@ impl StateHandler for InstanceStateHandler { InstanceState::Failed { details, machine_id, - } => { - // Only way to proceed is to - // 1. Force-delete the machine. - // 2. If failed during reprovision, fix the config/hw issue and - // retrigger DPU reprovision. - tracing::warn!( - "Instance id {}/machine: {} stuck in failed state. 
details: {:?}, failed machine: {}", - instance.id, - host_machine_id, - details, - machine_id - ); - Ok(StateHandlerOutcome::do_nothing()) - } + } => match details.cause { + FailureCause::BiosSetupFailed { .. } if machine_id.machine_type().is_host() => { + let recovered = ManagedHostState::Assigned { + instance_state: InstanceState::HostPlatformConfiguration { + platform_config_state: + HostPlatformConfigurationState::SetBootOrder { + set_boot_order_info: SetBootOrderInfo { + set_boot_order_jid: None, + set_boot_order_state: SetBootOrderState::SetBootOrder, + retry_count: 0, + }, + }, + }, + }; + handle_bios_setup_failed_recovery(ctx, mh_snapshot, recovered).await + } + _ => { + // Only way to proceed for other causes is to + // 1. Force-delete the machine. + // 2. If failed during reprovision, fix the config/hw issue and + // retrigger DPU reprovision. + tracing::warn!( + "Instance id {}/machine: {} stuck in failed state. details: {:?}, failed machine: {}", + instance.id, + host_machine_id, + details, + machine_id + ); + Ok(StateHandlerOutcome::do_nothing()) + } + }, InstanceState::HostReprovision { .. } => { self.host_upgrade .handle_host_reprovision( @@ -9489,7 +9531,7 @@ fn can_restart_reprovision(dpu_snapshots: &[Machine], version: ConfigVersion) -> /// TODO(ken): This is a temporary workaround for work-in-progress on zero-DPU support (August 2024) /// The way we should do this going forward is to plumb the actual non-DPU MAC address we want to /// boot from, instead of special-casing NoDpu errors. -async fn call_machine_setup_and_handle_no_dpu_error( +pub(super) async fn call_machine_setup_and_handle_no_dpu_error( redfish_client: &dyn Redfish, boot_interface_mac: Option<&str>, expected_dpu_count: usize, @@ -9961,7 +10003,9 @@ async fn handle_instance_host_platform_config( ) .await? 
{ - BiosConfigOutcome::Done => HostPlatformConfigurationState::PollingBiosSetup, + BiosConfigOutcome::Done => { + HostPlatformConfigurationState::PollingBiosSetup { retry_count } + } BiosConfigOutcome::WaitingForBiosJob(bios_config_info) => { HostPlatformConfigurationState::WaitingForBiosJob { bios_config_info } } @@ -9992,7 +10036,25 @@ async fn handle_instance_host_platform_config( } } BiosConfigJobAdvanceOutcome::Done => { - HostPlatformConfigurationState::PollingBiosSetup + HostPlatformConfigurationState::PollingBiosSetup { + retry_count: bios_config_info.retry_count, + } + } + BiosConfigJobAdvanceOutcome::Failed { failure } => { + return Ok(StateHandlerOutcome::transition( + ManagedHostState::Assigned { + instance_state: InstanceState::Failed { + details: FailureDetails { + cause: FailureCause::BiosSetupFailed { err: failure }, + failed_at: Utc::now(), + source: FailureSource::StateMachineArea( + StateMachineArea::AssignedInstance, + ), + }, + machine_id: mh_snapshot.host_snapshot.id, + }, + }, + )); } BiosConfigJobAdvanceOutcome::RetryPlatformConfiguration { retry_count: next_count, @@ -10012,7 +10074,7 @@ async fn handle_instance_host_platform_config( }, )); } - HostPlatformConfigurationState::PollingBiosSetup => { + HostPlatformConfigurationState::PollingBiosSetup { retry_count } => { let next_instance_state = InstanceState::HostPlatformConfiguration { platform_config_state: HostPlatformConfigurationState::SetBootOrder { set_boot_order_info: SetBootOrderInfo { @@ -10023,34 +10085,45 @@ async fn handle_instance_host_platform_config( }, }; - let boot_interface_mac = mh_snapshot.boot_interface_mac().map(|m| m.to_string()); - - match redfish_client - .is_bios_setup(boot_interface_mac.as_deref()) - .await + match advance_polling_bios_setup( + redfish_client.as_ref(), + mh_snapshot, + retry_count, + &ctx.services.site_config.machine_state_controller, + ) + .await? 
{ - Ok(true) => { - tracing::info!( - machine_id = %mh_snapshot.host_snapshot.id, - "BIOS setup verified successfully" - ); - next_instance_state + PollingBiosSetupOutcome::Verified => next_instance_state, + PollingBiosSetupOutcome::EnterRecovery(bios_config_info) => { + return Ok(StateHandlerOutcome::transition( + ManagedHostState::Assigned { + instance_state: InstanceState::HostPlatformConfiguration { + platform_config_state: + HostPlatformConfigurationState::WaitingForBiosJob { + bios_config_info, + }, + }, + }, + )); } - Ok(false) => { - return Ok(StateHandlerOutcome::wait( - "Polling BIOS setup status, waiting for settings to be applied".to_string(), + PollingBiosSetupOutcome::Failed { failure } => { + return Ok(StateHandlerOutcome::transition( + ManagedHostState::Assigned { + instance_state: InstanceState::Failed { + details: FailureDetails { + cause: FailureCause::BiosSetupFailed { err: failure }, + failed_at: Utc::now(), + source: FailureSource::StateMachineArea( + StateMachineArea::AssignedInstance, + ), + }, + machine_id: mh_snapshot.host_snapshot.id, + }, + }, )); } - Err(e) => { - tracing::warn!( - machine_id = %mh_snapshot.host_snapshot.id, - error = %e, - "Failed to check BIOS setup status, will retry" - ); - return Ok(StateHandlerOutcome::wait(format!( - "Failed to check BIOS setup status: {}. 
Will retry.", - e - ))); + PollingBiosSetupOutcome::Wait(reason) => { + return Ok(StateHandlerOutcome::wait(reason)); } } } @@ -10103,306 +10176,6 @@ async fn handle_instance_host_platform_config( Ok(StateHandlerOutcome::transition(next_state)) } -async fn configure_host_bios( - ctx: &mut StateHandlerContext<'_, MachineStateHandlerContextObjects>, - reachability_params: &ReachabilityParams, - redfish_client: &dyn Redfish, - mh_snapshot: &ManagedHostStateSnapshot, - retry_count: u32, -) -> Result { - let boot_interface_mac = mh_snapshot.boot_interface_mac().map(|m| m.to_string()); - - let bios_job_id = match call_machine_setup_and_handle_no_dpu_error( - redfish_client, - boot_interface_mac.as_deref(), - mh_snapshot.host_snapshot.associated_dpu_machine_ids().len(), - &ctx.services.site_config, - ) - .await - { - Err(e) => { - tracing::warn!( - "redfish machine_setup failed for {}, potentially due to known race condition between UEFI POST and BMC. triggering force-restart if needed. err: {}", - mh_snapshot.host_snapshot.id, - e - ); - - // if machine_setup failed, reboot to potentially work around - // a known race between the DPU UEFI and the BMC, where if - // the BMC is not up when DPU UEFI runs, then Attributes might - // not come through. The fix is to force-restart the DPU to - // re-POST. - // - // As of July 2024, Josh Price said there's an NBU FR to fix - // this, but it wasn't target to a release yet. - let reboot_status = if mh_snapshot.host_snapshot.last_reboot_requested.is_none() { - handler_host_power_control(mh_snapshot, ctx, SystemPowerControl::ForceRestart) - .await?; - - RebootStatus { - increase_retry_count: true, - status: "Restarted host".to_string(), - } - } else { - trigger_reboot_if_needed( - &mh_snapshot.host_snapshot, - mh_snapshot, - None, - reachability_params, - ctx, - ) - .await? 
- }; - return Ok(BiosConfigOutcome::WaitingForReboot(format!( - "redfish machine_setup failed: {e}; triggered host reboot: {reboot_status:#?}" - ))); - } - Ok(jid) => jid, - }; - - if let Some(job_id) = &bios_job_id { - return Ok(BiosConfigOutcome::WaitingForBiosJob(BiosConfigInfo { - bios_job_id: Some(job_id.clone()), - bios_config_state: BiosConfigState::WaitForBiosJobScheduled, - retry_count, - })); - } - - // No job to wait for (non-Dell or vendor that doesn't return job); reboot to apply and continue. - handler_host_power_control(mh_snapshot, ctx, SystemPowerControl::ForceRestart).await?; - Ok(BiosConfigOutcome::Done) -} - -/// Outcome of advancing the BIOS config job state machine (Dell: wait for BIOS PATCH job before boot order). -enum BiosConfigJobAdvanceOutcome { - Continue(BiosConfigInfo), - Done, - /// Same state, but wait (e.g. waiting for power down or BMC to come back). - Wait(String), - /// After successful power/BMC recovery from a failed BIOS job: re-run machine_setup (not PollingBiosSetup). - RetryPlatformConfiguration { - retry_count: u32, - }, -} - -fn bios_config_enter_handle_failure( - info: &BiosConfigInfo, - failure: String, - host_id: &MachineId, -) -> Result { - if info.retry_count >= MAX_BIOS_CONFIG_RETRIES { - return Err(StateHandlerError::GenericError(eyre::eyre!( - "BIOS config job failure remediation exceeded max retries ({MAX_BIOS_CONFIG_RETRIES}) for host {host_id}: {failure}" - ))); - } - Ok(BiosConfigInfo { - bios_job_id: info.bios_job_id.clone(), - bios_config_state: BiosConfigState::HandleBiosJobFailure { - failure, - power_state: PowerState::Off, - }, - retry_count: info.retry_count + 1, - }) -} - -/// Advance one step of the BIOS config job wait state machine. Same pattern as set_host_boot_order. 
-async fn advance_bios_config_job( - ctx: &mut StateHandlerContext<'_, MachineStateHandlerContextObjects>, - redfish_client: &dyn Redfish, - mh_snapshot: &ManagedHostStateSnapshot, - info: BiosConfigInfo, -) -> Result { - match info.bios_config_state { - BiosConfigState::WaitForBiosJobScheduled => { - if let Some(job_id) = &info.bios_job_id { - let job_state = redfish_client - .get_job_state(job_id) - .await - .map_err(|e| redfish_error("get_job_state", e))?; - if matches!( - job_state, - libredfish::JobState::ScheduledWithErrors - | libredfish::JobState::CompletedWithErrors - ) { - let failure = format!("BIOS job {} failed with state {job_state:#?}", job_id); - tracing::warn!( - "{} for {}, transitioning to HandleBiosJobFailure (power cycle + BMC reset)", - failure, - mh_snapshot.host_snapshot.id - ); - return Ok(BiosConfigJobAdvanceOutcome::Continue( - bios_config_enter_handle_failure( - &info, - failure, - &mh_snapshot.host_snapshot.id, - )?, - )); - } - if !matches!(job_state, libredfish::JobState::Scheduled) { - return Err(StateHandlerError::GenericError(eyre::eyre!( - "waiting for BIOS job {:#?} to be scheduled; current state: {job_state:#?}", - job_id - ))); - } - } - Ok(BiosConfigJobAdvanceOutcome::Continue(BiosConfigInfo { - bios_job_id: info.bios_job_id.clone(), - bios_config_state: BiosConfigState::RebootHost, - retry_count: info.retry_count, - })) - } - BiosConfigState::RebootHost => { - handler_host_power_control(mh_snapshot, ctx, SystemPowerControl::ForceRestart).await?; - Ok(BiosConfigJobAdvanceOutcome::Continue(BiosConfigInfo { - bios_job_id: info.bios_job_id.clone(), - bios_config_state: BiosConfigState::WaitForBiosJobCompletion, - retry_count: info.retry_count, - })) - } - BiosConfigState::WaitForBiosJobCompletion => { - const JOB_QUERY_WAIT_MINUTES: i64 = 5; - if let Some(job_id) = &info.bios_job_id { - let job_state = match redfish_client.get_job_state(job_id).await { - Ok(s) => s, - Err(e) => { - let minutes_since_state_change = mh_snapshot - 
.host_snapshot - .state - .version - .since_state_change() - .num_minutes(); - if minutes_since_state_change < JOB_QUERY_WAIT_MINUTES { - return Err(redfish_error("get_job_state", e)); - } - let failure = format!( - "BIOS config job {} lookup failed after {} min: {}", - job_id, minutes_since_state_change, e - ); - tracing::warn!( - "{} for {}, transitioning to HandleBiosJobFailure (power cycle + BMC reset)", - failure, - mh_snapshot.host_snapshot.id - ); - return Ok(BiosConfigJobAdvanceOutcome::Continue( - bios_config_enter_handle_failure( - &info, - failure, - &mh_snapshot.host_snapshot.id, - )?, - )); - } - }; - match job_state { - libredfish::JobState::Completed => Ok(BiosConfigJobAdvanceOutcome::Done), - libredfish::JobState::ScheduledWithErrors - | libredfish::JobState::CompletedWithErrors => { - let failure = format!( - "BIOS config job {} failed with state {job_state:#?}", - job_id - ); - tracing::warn!( - "{} for {}, transitioning to HandleBiosJobFailure (power cycle + BMC reset)", - failure, - mh_snapshot.host_snapshot.id, - ); - Ok(BiosConfigJobAdvanceOutcome::Continue( - bios_config_enter_handle_failure( - &info, - failure, - &mh_snapshot.host_snapshot.id, - )?, - )) - } - _ => Err(StateHandlerError::GenericError(eyre::eyre!( - "waiting for BIOS job {:#?} to complete; current state: {job_state:#?}", - job_id - ))), - } - } else { - Ok(BiosConfigJobAdvanceOutcome::Done) - } - } - BiosConfigState::HandleBiosJobFailure { - failure, - power_state, - } => { - let current_power_state = redfish_client - .get_power_state() - .await - .map_err(|e| redfish_error("get_power_state", e))?; - - match power_state { - PowerState::Off => { - if current_power_state != libredfish::PowerState::Off { - handler_host_power_control(mh_snapshot, ctx, SystemPowerControl::ForceOff) - .await?; - return Ok(BiosConfigJobAdvanceOutcome::Wait(format!( - "HandleBiosJobFailure: waiting for {} to power down; current power state: {current_power_state}; failure: {}", - 
mh_snapshot.host_snapshot.id, failure - ))); - } - tracing::info!( - "HandleBiosJobFailure: Resetting BMC for {} after BIOS job failure: {}", - mh_snapshot.host_snapshot.id, - failure - ); - redfish_client - .bmc_reset() - .await - .map_err(|e| redfish_error("bmc_reset", e))?; - Ok(BiosConfigJobAdvanceOutcome::Continue(BiosConfigInfo { - bios_job_id: info.bios_job_id.clone(), - bios_config_state: BiosConfigState::HandleBiosJobFailure { - failure: failure.clone(), - power_state: PowerState::On, - }, - retry_count: info.retry_count, - })) - } - PowerState::On => { - if current_power_state != libredfish::PowerState::On { - let basetime = mh_snapshot - .host_snapshot - .last_reboot_requested - .as_ref() - .map(|x| x.time) - .unwrap_or(mh_snapshot.host_snapshot.state.version.timestamp()); - let power_down_wait = ctx - .services - .site_config - .machine_state_controller - .power_down_wait; - if Utc::now().signed_duration_since(basetime) < power_down_wait { - return Ok(BiosConfigJobAdvanceOutcome::Wait(format!( - "HandleBiosJobFailure: waiting for BMC to come back online for {}; failure: {}", - mh_snapshot.host_snapshot.id, failure - ))); - } - handler_host_power_control(mh_snapshot, ctx, SystemPowerControl::On) - .await?; - return Ok(BiosConfigJobAdvanceOutcome::Wait(format!( - "HandleBiosJobFailure: powering on {} after BMC reset; failure: {}", - mh_snapshot.host_snapshot.id, failure - ))); - } - tracing::info!( - machine_id = %mh_snapshot.host_snapshot.id, - retry_count = info.retry_count, - "HandleBiosJobFailure: BMC reset complete; re-running platform configuration (machine_setup) — power cycle does not apply BIOS attributes", - ); - Ok(BiosConfigJobAdvanceOutcome::RetryPlatformConfiguration { - retry_count: info.retry_count, - }) - } - _ => Err(StateHandlerError::GenericError(eyre::eyre!( - "HandleBiosJobFailure: unexpected power state {power_state:#?} for {}", - mh_snapshot.host_snapshot.id - ))), - } - } - } -} - async fn set_host_boot_order( ctx: &mut 
StateHandlerContext<'_, MachineStateHandlerContextObjects>, reachability_params: &ReachabilityParams, diff --git a/crates/api/src/state_controller/machine/handler/bios_config.rs b/crates/api/src/state_controller/machine/handler/bios_config.rs new file mode 100644 index 0000000000..143fbf7fc9 --- /dev/null +++ b/crates/api/src/state_controller/machine/handler/bios_config.rs @@ -0,0 +1,573 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//! BIOS configuration: machine_setup, Dell job wait/recovery, and PollingBiosSetup escalation. + +use carbide_redfish::libredfish::error::state_handler_redfish_error as redfish_error; +use chrono::Utc; +use eyre::eyre; +use libredfish::{Redfish, SystemPowerControl}; +use model::machine::{ + BiosConfigInfo, BiosConfigState, ManagedHostState, ManagedHostStateSnapshot, PowerState, +}; +use state_controller::state_handler::{ + StateHandlerContext, StateHandlerError, StateHandlerOutcome, +}; + +use super::{ + ReachabilityParams, RebootStatus, call_machine_setup_and_handle_no_dpu_error, + handler_host_power_control, trigger_reboot_if_needed, +}; +use crate::state_controller::machine::config::MachineStateControllerConfig; +use crate::state_controller::machine::context::MachineStateHandlerContextObjects; + +/// Outcome of configure_host_bios function. 
+pub(super) enum BiosConfigOutcome { + Done, + WaitingForReboot(String), + /// Dell BIOS PATCH returned a job ID; wait for it to complete before boot order. + WaitingForBiosJob(BiosConfigInfo), +} + +/// Outcome of advancing the BIOS config job state machine (Dell: wait for BIOS PATCH job before boot order). +pub(super) enum BiosConfigJobAdvanceOutcome { + Continue(BiosConfigInfo), + /// Dell BIOS job completed; proceed to verify settings via PollingBiosSetup. + Done, + Failed { + failure: String, + }, + /// Same state, but wait (e.g. waiting for power down or BMC to come back). + Wait(String), + /// After successful power/BMC recovery from a failed BIOS job: re-run machine_setup (not PollingBiosSetup). + RetryPlatformConfiguration { + retry_count: u32, + }, +} + +#[derive(Debug)] +pub(super) enum PollingBiosSetupOutcome { + Verified, + Wait(String), + EnterRecovery(BiosConfigInfo), + Failed { failure: String }, +} + +/// Outcome of entering HandleBiosJobFailure recovery, or failing once the budget is exhausted. 
+enum BiosRecoveryAttemptOutcome { + Continue(BiosConfigInfo), + Failed { failure: String }, +} + +impl From<BiosRecoveryAttemptOutcome> for BiosConfigJobAdvanceOutcome { + fn from(outcome: BiosRecoveryAttemptOutcome) -> Self { + match outcome { + BiosRecoveryAttemptOutcome::Continue(info) => Self::Continue(info), + BiosRecoveryAttemptOutcome::Failed { failure } => Self::Failed { failure }, + } + } +} + +pub(super) async fn configure_host_bios( + ctx: &mut StateHandlerContext<'_, MachineStateHandlerContextObjects>, + reachability_params: &ReachabilityParams, + redfish_client: &dyn Redfish, + mh_snapshot: &ManagedHostStateSnapshot, + retry_count: u32, +) -> Result<BiosConfigOutcome, StateHandlerError> { + let boot_interface_mac = mh_snapshot.boot_interface_mac().map(|m| m.to_string()); + + let bios_job_id = match call_machine_setup_and_handle_no_dpu_error( + redfish_client, + boot_interface_mac.as_deref(), + mh_snapshot.host_snapshot.associated_dpu_machine_ids().len(), + &ctx.services.site_config, + ) + .await + { + Err(e) => { + tracing::warn!( + "redfish machine_setup failed for {}, potentially due to known race condition between UEFI POST and BMC. triggering force-restart if needed. err: {}", + mh_snapshot.host_snapshot.id, + e + ); + + // if machine_setup failed, reboot to potentially work around + // a known race between the DPU UEFI and the BMC, where if + // the BMC is not up when DPU UEFI runs, then Attributes might + // not come through. The fix is to force-restart the DPU to + // re-POST. + // + // As of July 2024, Josh Price said there's an NBU FR to fix + // this, but it wasn't targeted to a release yet. + let reboot_status = if mh_snapshot.host_snapshot.last_reboot_requested.is_none() { + handler_host_power_control(mh_snapshot, ctx, SystemPowerControl::ForceRestart) + .await?; + + RebootStatus { + increase_retry_count: true, + status: "Restarted host".to_string(), + } + } else { + trigger_reboot_if_needed( + &mh_snapshot.host_snapshot, + mh_snapshot, + None, + reachability_params, + ctx, + ) + .await?
+ }; + return Ok(BiosConfigOutcome::WaitingForReboot(format!( + "redfish machine_setup failed: {e}; triggered host reboot: {reboot_status:#?}" + ))); + } + Ok(jid) => jid, + }; + + if let Some(job_id) = bios_job_id { + return Ok(BiosConfigOutcome::WaitingForBiosJob(BiosConfigInfo { + bios_job_id: Some(job_id), + bios_config_state: BiosConfigState::WaitForBiosJobScheduled, + retry_count, + })); + } + + // No job to wait for (non-Dell or vendor that doesn't return job); reboot to apply and continue. + handler_host_power_control(mh_snapshot, ctx, SystemPowerControl::ForceRestart).await?; + Ok(BiosConfigOutcome::Done) +} + +/// Advance one step of the BIOS config job wait state machine. +pub(super) async fn advance_bios_config_job( + ctx: &mut StateHandlerContext<'_, MachineStateHandlerContextObjects>, + redfish_client: &dyn Redfish, + mh_snapshot: &ManagedHostStateSnapshot, + info: BiosConfigInfo, +) -> Result<BiosConfigJobAdvanceOutcome, StateHandlerError> { + let machine_controller_config = &ctx.services.site_config.machine_state_controller; + match info.bios_config_state { + BiosConfigState::WaitForBiosJobScheduled => { + let job_id = info.bios_job_id.as_ref().ok_or_else(|| { + StateHandlerError::GenericError(eyre!( + "WaitForBiosJobScheduled requires bios_job_id for host {}", + mh_snapshot.host_snapshot.id + )) + })?; + let job_state = redfish_client + .get_job_state(job_id) + .await + .map_err(|e| redfish_error("get_job_state", e))?; + if matches!( + job_state, + libredfish::JobState::ScheduledWithErrors + | libredfish::JobState::CompletedWithErrors + ) { + let failure = format!("BIOS job {} failed with state {job_state:#?}", job_id); + tracing::warn!( + %failure, + "transitioning to HandleBiosJobFailure (power cycle + BMC reset)" + ); + return Ok(try_bios_recovery_attempt( + machine_controller_config, + info.retry_count, + info.bios_job_id, + failure, + )?
+ .into()); + } + if !matches!(job_state, libredfish::JobState::Scheduled) { + return Err(StateHandlerError::GenericError(eyre!( + "waiting for BIOS job {:#?} to be scheduled; current state: {job_state:#?}", + job_id + ))); + } + Ok(BiosConfigJobAdvanceOutcome::Continue(BiosConfigInfo { + bios_job_id: info.bios_job_id, + bios_config_state: BiosConfigState::RebootHost, + retry_count: info.retry_count, + })) + } + BiosConfigState::RebootHost => { + handler_host_power_control(mh_snapshot, ctx, SystemPowerControl::ForceRestart).await?; + Ok(BiosConfigJobAdvanceOutcome::Continue(BiosConfigInfo { + bios_job_id: info.bios_job_id, + bios_config_state: BiosConfigState::WaitForBiosJobCompletion, + retry_count: info.retry_count, + })) + } + BiosConfigState::WaitForBiosJobCompletion => { + const JOB_QUERY_WAIT_MINUTES: i64 = 5; + let job_id = info.bios_job_id.as_ref().ok_or_else(|| { + StateHandlerError::GenericError(eyre!( + "WaitForBiosJobCompletion requires bios_job_id for host {}", + mh_snapshot.host_snapshot.id + )) + })?; + let job_state = match redfish_client.get_job_state(job_id).await { + Ok(s) => s, + Err(e) => { + let minutes_since_state_change = mh_snapshot + .host_snapshot + .state + .version + .since_state_change() + .num_minutes(); + if minutes_since_state_change < JOB_QUERY_WAIT_MINUTES { + return Err(redfish_error("get_job_state", e)); + } + let failure = format!( + "BIOS config job {} lookup failed after {} min: {}", + job_id, minutes_since_state_change, e + ); + tracing::warn!( + %failure, + "transitioning to HandleBiosJobFailure (power cycle + BMC reset)" + ); + return Ok(try_bios_recovery_attempt( + machine_controller_config, + info.retry_count, + info.bios_job_id, + failure, + )? 
+ .into()); + } + }; + match job_state { + libredfish::JobState::Completed => Ok(BiosConfigJobAdvanceOutcome::Done), + libredfish::JobState::ScheduledWithErrors + | libredfish::JobState::CompletedWithErrors => { + let failure = format!( + "BIOS config job {} failed with state {job_state:#?}", + job_id + ); + tracing::warn!( + %failure, + "transitioning to HandleBiosJobFailure (power cycle + BMC reset)" + ); + Ok(try_bios_recovery_attempt( + machine_controller_config, + info.retry_count, + info.bios_job_id, + failure, + )? + .into()) + } + _ => Err(StateHandlerError::GenericError(eyre!( + "waiting for BIOS job {:#?} to complete; current state: {job_state:#?}", + job_id + ))), + } + } + BiosConfigState::HandleBiosJobFailure { + failure, + power_state, + } => { + let current_power_state = redfish_client + .get_power_state() + .await + .map_err(|e| redfish_error("get_power_state", e))?; + + match power_state { + PowerState::Off => { + if current_power_state != libredfish::PowerState::Off { + handler_host_power_control(mh_snapshot, ctx, SystemPowerControl::ForceOff) + .await?; + return Ok(BiosConfigJobAdvanceOutcome::Wait(format!( + "HandleBiosJobFailure: waiting for power down; current power state: {current_power_state}; failure: {failure}" + ))); + } + tracing::info!( + %failure, + "HandleBiosJobFailure: resetting BMC after BIOS job failure" + ); + redfish_client + .bmc_reset() + .await + .map_err(|e| redfish_error("bmc_reset", e))?; + Ok(BiosConfigJobAdvanceOutcome::Continue(BiosConfigInfo { + bios_job_id: info.bios_job_id, + bios_config_state: BiosConfigState::HandleBiosJobFailure { + failure, + power_state: PowerState::On, + }, + retry_count: info.retry_count, + })) + } + PowerState::On => { + if current_power_state != libredfish::PowerState::On { + let basetime = mh_snapshot + .host_snapshot + .last_reboot_requested + .as_ref() + .map(|x| x.time) + .unwrap_or(mh_snapshot.host_snapshot.state.version.timestamp()); + let power_down_wait = ctx + .services + 
.site_config + .machine_state_controller + .power_down_wait; + if Utc::now().signed_duration_since(basetime) < power_down_wait { + return Ok(BiosConfigJobAdvanceOutcome::Wait(format!( + "HandleBiosJobFailure: waiting for BMC to come back online; failure: {failure}" + ))); + } + handler_host_power_control(mh_snapshot, ctx, SystemPowerControl::On) + .await?; + return Ok(BiosConfigJobAdvanceOutcome::Wait(format!( + "HandleBiosJobFailure: powering on after BMC reset; failure: {failure}" + ))); + } + tracing::info!( + retry_count = info.retry_count, + "HandleBiosJobFailure: BMC reset complete; re-running platform configuration (machine_setup) — power cycle does not apply BIOS attributes", + ); + Ok(BiosConfigJobAdvanceOutcome::RetryPlatformConfiguration { + retry_count: info.retry_count, + }) + } + _ => Err(StateHandlerError::GenericError(eyre!( + "HandleBiosJobFailure: unexpected power state {power_state:#?} for {}", + mh_snapshot.host_snapshot.id + ))), + } + } + } +} + +/// Enter HandleBiosJobFailure recovery, or move to Failed when budget is exhausted. 
+fn try_bios_recovery_attempt( + machine_controller_config: &MachineStateControllerConfig, + retry_count: u32, + bios_job_id: Option<String>, + failure: String, +) -> Result<BiosRecoveryAttemptOutcome, StateHandlerError> { + if retry_count >= machine_controller_config.max_bios_config_retries { + tracing::warn!( + retry_count, + max_retries = machine_controller_config.max_bios_config_retries, + %failure, + "BIOS recovery budget exhausted; moving host to Failed for manual remediation" + ); + return Ok(BiosRecoveryAttemptOutcome::Failed { + failure: format!( + "{failure} (automated BIOS recovery exhausted after {} attempts)", + machine_controller_config.max_bios_config_retries + ), + }); + } + Ok(BiosRecoveryAttemptOutcome::Continue(BiosConfigInfo { + bios_job_id, + bios_config_state: BiosConfigState::HandleBiosJobFailure { + failure, + power_state: PowerState::Off, + }, + retry_count: retry_count + 1, + })) +} + +pub(super) async fn advance_polling_bios_setup( + redfish_client: &dyn Redfish, + mh_snapshot: &ManagedHostStateSnapshot, + retry_count: u32, + machine_controller_config: &MachineStateControllerConfig, +) -> Result<PollingBiosSetupOutcome, StateHandlerError> { + let boot_interface_mac = mh_snapshot.boot_interface_mac().map(|m| m.to_string()); + let stuck_for = mh_snapshot.host_snapshot.state.version.since_state_change(); + + match redfish_client + .is_bios_setup(boot_interface_mac.as_deref()) + .await + { + Ok(true) => { + tracing::info!("BIOS setup verified successfully"); + Ok(PollingBiosSetupOutcome::Verified) + } + Ok(false) => { + if let Some(outcome) = escalate_stuck_polling_bios_setup( + machine_controller_config, + retry_count, + stuck_for, + )? { + return Ok(outcome); + } + Ok(PollingBiosSetupOutcome::Wait(format!( + "Polling BIOS setup status, waiting for settings to be applied (retry_count={retry_count})" + ))) + } + Err(e) => { + tracing::warn!( + error = %e, + "Failed to check BIOS setup status, will retry" + ); + Ok(PollingBiosSetupOutcome::Wait(format!( + "Failed to check BIOS setup status: {e}. Will retry."
+ ))) + } + } +} + +fn escalate_stuck_polling_bios_setup( + machine_controller_config: &MachineStateControllerConfig, + retry_count: u32, + stuck_for: chrono::Duration, +) -> Result<Option<PollingBiosSetupOutcome>, StateHandlerError> { + if stuck_for <= machine_controller_config.polling_bios_setup_stuck_threshold { + return Ok(None); + } + + tracing::warn!( + ?stuck_for, + retry_count, + "PollingBiosSetup stuck; attempting HandleBiosJobFailure recovery (power-off + BMC reset + power-on + re-run machine_setup)" + ); + + let failure = format!( + "PollingBiosSetup stuck for {} minutes (is_bios_setup returned false)", + stuck_for.num_minutes() + ); + + Ok(Some( + match try_bios_recovery_attempt(machine_controller_config, retry_count, None, failure)? { + BiosRecoveryAttemptOutcome::Continue(info) => { + PollingBiosSetupOutcome::EnterRecovery(info) + } + BiosRecoveryAttemptOutcome::Failed { failure } => { + PollingBiosSetupOutcome::Failed { failure } + } + }, + )) +} + +pub(super) async fn handle_bios_setup_failed_recovery( + ctx: &mut StateHandlerContext<'_, MachineStateHandlerContextObjects>, + mh_snapshot: &ManagedHostStateSnapshot, + recovered_state: ManagedHostState, +) -> Result<StateHandlerOutcome<ManagedHostState>, StateHandlerError> { + let boot_interface_mac = mh_snapshot.boot_interface_mac().map(|m| m.to_string()); + let redfish_client = ctx + .services + .create_redfish_client_from_machine(&mh_snapshot.host_snapshot) + .await?; + match redfish_client + .is_bios_setup(boot_interface_mac.as_deref()) + .await + { + Ok(true) => { + tracing::info!("BIOS setup verified after manual remediation; resuming state machine"); + Ok(StateHandlerOutcome::transition(recovered_state)) + } + Ok(false) => Ok(StateHandlerOutcome::do_nothing()), + Err(e) => { + tracing::warn!( + error = %e, + "Failed to check BIOS setup status, will retry" + ); + Ok(StateHandlerOutcome::do_nothing()) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn escalate_stuck_polling_bios_setup_not_triggered_before_threshold() { + let
machine_controller_config = MachineStateControllerConfig::default(); + let result = escalate_stuck_polling_bios_setup( + &machine_controller_config, + 0, + chrono::Duration::minutes(10), + ) + .unwrap(); + + assert!(result.is_none()); + } + + #[test] + fn escalate_stuck_polling_bios_setup_enters_handle_bios_job_failure_when_stuck() { + let machine_controller_config = MachineStateControllerConfig::default(); + let info = escalate_stuck_polling_bios_setup( + &machine_controller_config, + 0, + chrono::Duration::minutes(16), + ) + .unwrap() + .expect("recovery should be triggered"); + let PollingBiosSetupOutcome::EnterRecovery(info) = info else { + panic!("expected EnterRecovery"); + }; + assert_eq!(info.bios_job_id, None); + assert_eq!(info.retry_count, 1); + assert!(matches!( + info.bios_config_state, + BiosConfigState::HandleBiosJobFailure { + power_state: PowerState::Off, + .. + } + )); + } + + #[test] + fn escalate_stuck_polling_bios_setup_respects_shared_retry_budget() { + let machine_controller_config = MachineStateControllerConfig::default(); + let result = escalate_stuck_polling_bios_setup( + &machine_controller_config, + machine_controller_config.max_bios_config_retries, + chrono::Duration::minutes(20), + ) + .unwrap() + .expect("expected Failed outcome"); + + assert!(matches!(result, PollingBiosSetupOutcome::Failed { .. })); + } + + #[test] + fn try_bios_recovery_attempt_fails_when_budget_exhausted() { + let machine_controller_config = MachineStateControllerConfig::default(); + let result = try_bios_recovery_attempt( + &machine_controller_config, + machine_controller_config.max_bios_config_retries, + Some("job-1".to_string()), + "job failed".to_string(), + ) + .unwrap(); + + assert!(matches!(result, BiosRecoveryAttemptOutcome::Failed { .. 
})); + } + + #[test] + fn escalate_stuck_polling_bios_setup_allows_last_budgeted_attempt() { + let machine_controller_config = MachineStateControllerConfig::default(); + let outcome = escalate_stuck_polling_bios_setup( + &machine_controller_config, + machine_controller_config.max_bios_config_retries - 1, + chrono::Duration::minutes(20), + ) + .unwrap() + .expect("last budgeted recovery should be allowed"); + + let PollingBiosSetupOutcome::EnterRecovery(info) = outcome else { + panic!("expected EnterRecovery"); + }; + assert_eq!( + info.retry_count, + machine_controller_config.max_bios_config_retries + ); + } +} diff --git a/crates/api/src/state_controller/machine/io.rs b/crates/api/src/state_controller/machine/io.rs index 78b36191f6..046df2dab4 100644 --- a/crates/api/src/state_controller/machine/io.rs +++ b/crates/api/src/state_controller/machine/io.rs @@ -176,7 +176,7 @@ impl StateControllerIO for MachineStateControllerIO { "waitingforplatformconfiguration" } MachineState::WaitingForBiosJob { .. } => "waitingforbiosjob", - MachineState::PollingBiosSetup => "pollingbiossetup", + MachineState::PollingBiosSetup { .. } => "pollingbiossetup", MachineState::SetBootOrder { .. } => "setbootorder", MachineState::UefiSetup { .. } => "uefisetup", MachineState::WaitingForDiscovery => "waitingfordiscovery", diff --git a/crates/api/src/tests/common/api_fixtures/mod.rs b/crates/api/src/tests/common/api_fixtures/mod.rs index c751327de2..6484de7311 100644 --- a/crates/api/src/tests/common/api_fixtures/mod.rs +++ b/crates/api/src/tests/common/api_fixtures/mod.rs @@ -485,7 +485,7 @@ impl TestEnv { model::machine::MachineState::WaitingForPlatformConfiguration { .. } => { machine_state } - model::machine::MachineState::PollingBiosSetup => machine_state, + model::machine::MachineState::PollingBiosSetup { .. } => machine_state, model::machine::MachineState::SetBootOrder { .. } => machine_state, model::machine::MachineState::UefiSetup { .. 
} => machine_state, model::machine::MachineState::WaitingForDiscovery => machine_state, @@ -1241,6 +1241,10 @@ pub fn get_config() -> CarbideConfig { controller: StateControllerConfig::default(), scout_reporting_timeout: Duration::weeks(52), uefi_boot_wait: Duration::seconds(0), + max_bios_config_retries: MachineStateControllerConfig::max_bios_config_retries_default( + ), + polling_bios_setup_stuck_threshold: + MachineStateControllerConfig::polling_bios_setup_stuck_threshold_default(), }, network_segment_state_controller: NetworkSegmentStateControllerConfig { network_segment_drain_time: Duration::seconds(2), diff --git a/crates/api/src/tests/machine_history.rs b/crates/api/src/tests/machine_history.rs index 4294d1e78c..40b1db800b 100644 --- a/crates/api/src/tests/machine_history.rs +++ b/crates/api/src/tests/machine_history.rs @@ -50,7 +50,7 @@ async fn test_machine_state_history(pool: sqlx::PgPool) -> Result<(), Box, + is_bios_setup: Option, job_state_sequence: VecDeque, /// Records every call to `RedfishClientPool::create_client` so tests can /// assert what vendor was passed at each call site. @@ -151,6 +152,10 @@ impl RedfishSim { self.state.lock().unwrap().job_state_sequence = VecDeque::from(states); } + pub fn set_is_bios_setup(&self, ready: bool) { + self.state.lock().unwrap().is_bios_setup = Some(ready); + } + /// Returns a snapshot of every `create_client` call made through this sim, /// in the order they happened. Useful for asserting which vendor was /// passed at a given call site. 
@@ -1354,7 +1359,7 @@ impl Redfish for RedfishSimClient { &'a self, _: Option<&'a str>, ) -> libredfish::RedfishFuture<'a, Result> { - Box::pin(async move { Ok(true) }) + Box::pin(async move { Ok(self.state.lock().unwrap().is_bios_setup.unwrap_or(true)) }) } fn get_secure_boot_certificate<'a>( diff --git a/docs/architecture/state_machines/managedhost.md b/docs/architecture/state_machines/managedhost.md index 43a2e55dda..3486a74ff1 100644 --- a/docs/architecture/state_machines/managedhost.md +++ b/docs/architecture/state_machines/managedhost.md @@ -76,6 +76,10 @@ stateDiagram-v2 ForceDeletion --> [*] : Force deletion complete ``` +### `Failed` recovery semantics + +`FailureCause::BiosSetupFailed` is set when BIOS setup retries are exhausted during ingestion or instance deprovisioning. The Failed handler polls `is_bios_setup` and auto-recovers into `SetBootOrder` once the BMC reports success. + ## DPU Discovery State Details (DpuDiscoveringState) Shows the complete DPU discovery and configuration process: @@ -240,6 +244,10 @@ stateDiagram-v2 state "EnableIpmiOverLan" as HI_EnableIpmiOverLan state "WaitingForPlatformConfiguration" as HI_WaitingForPlatformConfiguration + state "WaitingForBiosJob" as HI_WaitingForBiosJob { + state "HandleBiosJobFailure" as HI_WBJ_HandleBiosJobFailure + } + state "PollingBiosSetup" as HI_PollingBiosSetup state "WaitingForDiscovery" as HI_WaitingForDiscovery state "Discovered" as HI_Discovered state "BomValidating/MatchingSku" as BomValidating_BV_MatchingSku @@ -251,9 +259,19 @@ stateDiagram-v2 DpuInitState_DI_WaitingForNetworkConfig --> HI_EnableIpmiOverLan Failed --> HI_WFL_TimeWaitForDPUDown Failed --> HI_M_WaitingForMeasurements + Failed --> HI_SBO_SetBootOrder : BiosSetupFailed AND is_bios_setup ok HI_EnableIpmiOverLan --> HI_WaitingForPlatformConfiguration : Enable IPMI over LAN access - HI_WaitingForPlatformConfiguration --> HI_SBO_SetBootOrder : Call machine setup/Restart Host + HI_WaitingForPlatformConfiguration --> 
HI_PollingBiosSetup : Call machine setup/Restart Host + HI_WaitingForPlatformConfiguration --> HI_WaitingForBiosJob : Dell BIOS job scheduled + HI_WaitingForBiosJob --> HI_PollingBiosSetup : BIOS job completed + HI_WaitingForBiosJob --> HI_WBJ_HandleBiosJobFailure : BIOS job failed + HI_WaitingForBiosJob --> Failed : Retry budget exhausted (BiosSetupFailed) + HI_WBJ_HandleBiosJobFailure --> HI_WaitingForPlatformConfiguration : Power off + BMC reset + power on + HI_PollingBiosSetup --> HI_PollingBiosSetup : Wait for BIOS setup + HI_PollingBiosSetup --> HI_WBJ_HandleBiosJobFailure : Stuck > 15 min, retry budget left + HI_PollingBiosSetup --> Failed : Stuck > 15 min, retry budget exhausted (BiosSetupFailed) + HI_PollingBiosSetup --> HI_SBO_SetBootOrder : BIOS is setup HI_SBO_SetBootOrder --> hi_sbo_if_zero_dpu hi_sbo_if_zero_dpu --> HI_SBO_WaitForSetBootOrderJobCompletion : No DPU @@ -446,6 +464,9 @@ stateDiagram-v2 } state "CheckHostConfig" as A_HPC_CheckHostConfig state "ConfigureBios" as A_HPC_ConfigureBios + state "WaitingForBiosJob" as A_HPC_WaitingForBiosJob { + state "HandleBiosJobFailure" as A_HPC_WBJ_HandleBiosJobFailure + } state "PollingBiosSetup" as A_HPC_PollingBiosSetup state "SetBootOrder" as A_HPC_SetBootOrder { state "SetBootOrder" as A_HPC_SBO_SetBootOrder @@ -513,7 +534,14 @@ stateDiagram-v2 A_HPC_CheckHostConfig --> A_HPC_ConfigureBios : Need config host boot order A_HPC_CheckHostConfig --> A_HPC_LockHost : No need config host boot order A_HPC_ConfigureBios --> A_HPC_PollingBiosSetup : Config BIOS + A_HPC_ConfigureBios --> A_HPC_WaitingForBiosJob : Dell BIOS job scheduled + A_HPC_WaitingForBiosJob --> A_HPC_PollingBiosSetup : BIOS job completed + A_HPC_WaitingForBiosJob --> A_HPC_WBJ_HandleBiosJobFailure : BIOS job failed + A_HPC_WaitingForBiosJob --> A_Failed : Retry budget exhausted (BiosSetupFailed) + A_HPC_WBJ_HandleBiosJobFailure --> A_HPC_ConfigureBios : Power off + BMC reset + power on A_HPC_PollingBiosSetup --> A_HPC_PollingBiosSetup : Wait 
for BIOS setup + A_HPC_PollingBiosSetup --> A_HPC_WBJ_HandleBiosJobFailure : Stuck > 15 min, retry budget left + A_HPC_PollingBiosSetup --> A_Failed : Stuck > 15 min, retry budget exhausted (BiosSetupFailed) A_HPC_PollingBiosSetup --> A_HPC_SBO_SetBootOrder : BIOS is setup A_HPC_SBO_SetBootOrder --> A_HPC_SBO_WaitForSetBootOrderJobScheduled : Set boot order job scheduled A_HPC_SBO_WaitForSetBootOrderJobScheduled --> A_HPC_SBO_RebootHost : Job scheduled @@ -524,6 +552,7 @@ stateDiagram-v2 state AnyState AnyState --> A_Failed : Any failure condition A_Failed --> A_Failed : Wait (stuck, manual action needed) + A_Failed --> A_HPC_SBO_SetBootOrder : BiosSetupFailed AND is_bios_setup ok ```