Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 47 additions & 3 deletions crates/api-model/src/machine/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1509,6 +1509,8 @@ pub enum FailureCause {
DpfProvisioning { err: String },

SpdmAttestationFailed { err: String },

BiosSetupFailed { err: String },
}

#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
Expand Down Expand Up @@ -1660,7 +1662,10 @@ pub enum MachineState {
WaitingForBiosJob {
bios_config_info: BiosConfigInfo,
},
PollingBiosSetup,
PollingBiosSetup {
#[serde(default)]
retry_count: u32,
},
SetBootOrder {
set_boot_order_info: Option<SetBootOrderInfo>,
},
Expand Down Expand Up @@ -1716,6 +1721,10 @@ pub enum UefiSetupState {

/// Tracks progress waiting for the Dell BIOS config job (from machine_setup PATCH) to complete
/// before configuring boot order. Same pattern as SetBootOrderInfo / SetBootOrderState.
///
/// `bios_job_id` is `Some` while polling a vendor BIOS job (e.g. Dell). `None` only during
/// `HandleBiosJobFailure` recovery from stuck PollingBiosSetup; non-Dell hosts reboot in
/// `configure_host_bios` and never enter job-polling substates.
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
#[serde(rename_all = "lowercase")]
pub struct BiosConfigInfo {
Expand Down Expand Up @@ -1929,7 +1938,10 @@ pub enum HostPlatformConfigurationState {
WaitingForBiosJob {
bios_config_info: BiosConfigInfo,
},
PollingBiosSetup,
PollingBiosSetup {
#[serde(default)]
retry_count: u32,
},
SetBootOrder {
set_boot_order_info: SetBootOrderInfo,
},
Expand Down Expand Up @@ -2043,6 +2055,7 @@ impl Display for FailureCause {
FailureCause::SpdmAttestationFailed { .. } => {
write!(f, "SpdmAttestationFailed")
}
FailureCause::BiosSetupFailed { .. } => write!(f, "BiosSetupFailed"),
}
}
}
Expand Down Expand Up @@ -2867,7 +2880,38 @@ mod tests {
assert_eq!(
deserialized,
ManagedHostState::HostInit {
machine_state: MachineState::PollingBiosSetup,
machine_state: MachineState::PollingBiosSetup { retry_count: 0 },
}
);
}

/// Checks that an explicit `retry_count` present in the serialized JSON is carried
/// into `MachineState::PollingBiosSetup` when a `HostInit` state is deserialized.
#[test]
fn test_json_deserialize_polling_bios_setup_with_retry_count() {
    // The value the JSON below is expected to decode into.
    let expected = ManagedHostState::HostInit {
        machine_state: MachineState::PollingBiosSetup { retry_count: 2 },
    };

    let json =
        r#"{"state":"hostinit","machine_state":{"state":"pollingbiossetup","retry_count":2}}"#;
    let deserialized: ManagedHostState = serde_json::from_str(json).unwrap();

    assert_eq!(deserialized, expected);
}

/// Checks that a `pollingbiossetup` platform-configuration state serialized without a
/// `retry_count` key still deserializes, yielding `retry_count: 0`.
#[test]
fn test_json_deserialize_host_platform_configuration_polling_bios_setup_legacy() {
    // Expected result: the missing counter is filled in as zero.
    let expected = ManagedHostState::Assigned {
        instance_state: InstanceState::HostPlatformConfiguration {
            platform_config_state: HostPlatformConfigurationState::PollingBiosSetup {
                retry_count: 0,
            },
        },
    };

    // Payload shape without the `retry_count` field.
    let json = r#"{"state":"assigned","instance_state":{"state":"hostplatformconfiguration","platform_config_state":{"state":"pollingbiossetup"}}}"#;
    let deserialized: ManagedHostState = serde_json::from_str(json).unwrap();

    assert_eq!(deserialized, expected);
}
Expand Down
2 changes: 2 additions & 0 deletions crates/api/src/cfg/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,8 @@ Extends `StateControllerConfig` with:
| `dpu_up_threshold` | `Duration` | `5m` | Max time without DPU health report before assuming it's down. |
| `scout_reporting_timeout` | `Duration` | `5m` | Duration without scout report before host is unhealthy. |
| `uefi_boot_wait` | `Duration` | `5m` | Wait time for UEFI boot completion after host reboot. |
| `max_bios_config_retries` | `u32` | `3` | Max HandleBiosJobFailure recovery cycles during BIOS configuration. |
| `polling_bios_setup_stuck_threshold` | `Duration` | `15m` | Time in PollingBiosSetup with `is_bios_setup == false` before recovery escalation. |

### `NetworkSegmentStateControllerConfig`

Expand Down
12 changes: 12 additions & 0 deletions crates/api/src/cfg/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2352,6 +2352,8 @@ mod tests {
dpu_up_threshold: Duration::weeks(1),
scout_reporting_timeout: Duration::minutes(5),
uefi_boot_wait: Duration::minutes(5),
max_bios_config_retries: 3,
polling_bios_setup_stuck_threshold: Duration::minutes(15),
};

let config_str = serde_json::to_string(&input).unwrap();
Expand Down Expand Up @@ -2395,6 +2397,8 @@ mod tests {
dpu_up_threshold: Duration::weeks(1),
scout_reporting_timeout: Duration::minutes(5),
uefi_boot_wait: Duration::minutes(5),
max_bios_config_retries: 3,
polling_bios_setup_stuck_threshold: Duration::minutes(15),
}
);
}
Expand All @@ -2415,6 +2419,8 @@ mod tests {
dpu_up_threshold: Duration::weeks(1),
scout_reporting_timeout: Duration::minutes(5),
uefi_boot_wait: Duration::minutes(5),
max_bios_config_retries: 3,
polling_bios_setup_stuck_threshold: Duration::minutes(15),
}
);
}
Expand Down Expand Up @@ -2707,6 +2713,8 @@ mod tests {
dpu_up_threshold: Duration::minutes(77),
scout_reporting_timeout: Duration::minutes(5),
uefi_boot_wait: Duration::minutes(5),
max_bios_config_retries: 3,
polling_bios_setup_stuck_threshold: Duration::minutes(15),
}
);
assert_eq!(
Expand Down Expand Up @@ -2892,6 +2900,8 @@ mod tests {
dpu_up_threshold: Duration::minutes(33),
scout_reporting_timeout: Duration::minutes(20),
uefi_boot_wait: Duration::minutes(5),
max_bios_config_retries: 3,
polling_bios_setup_stuck_threshold: Duration::minutes(15),
}
);
assert_eq!(
Expand Down Expand Up @@ -3201,6 +3211,8 @@ mod tests {
dpu_up_threshold: Duration::minutes(77),
scout_reporting_timeout: Duration::minutes(20),
uefi_boot_wait: Duration::minutes(5),
max_bios_config_retries: 3,
polling_bios_setup_stuck_threshold: Duration::minutes(15),
}
);
assert_eq!(
Expand Down
23 changes: 23 additions & 0 deletions crates/api/src/state_controller/machine/config/controller.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,17 @@ pub struct MachineStateControllerConfig {
serialize_with = "as_duration"
)]
pub uefi_boot_wait: Duration,
/// Max configure_host_bios retry cycles through HandleBiosJobFailure recovery.
#[serde(default = "MachineStateControllerConfig::max_bios_config_retries_default")]
pub max_bios_config_retries: u32,
/// How long PollingBiosSetup may sit on Ok(false) before escalating into
/// HandleBiosJobFailure recovery.
#[serde(
default = "MachineStateControllerConfig::polling_bios_setup_stuck_threshold_default",
deserialize_with = "deserialize_duration_chrono",
serialize_with = "as_duration"
)]
pub polling_bios_setup_stuck_threshold: Duration,
}

impl MachineStateControllerConfig {
Expand All @@ -96,6 +107,14 @@ impl MachineStateControllerConfig {
/// Returns the default value for `uefi_boot_wait`: five minutes.
///
/// The `Default` impl of `MachineStateControllerConfig` uses this for the
/// `uefi_boot_wait` field.
pub fn uefi_boot_wait_default() -> Duration {
Duration::minutes(5)
}

/// Returns the default value for `max_bios_config_retries`: `3`.
///
/// Referenced by the `#[serde(default = ...)]` attribute on the
/// `max_bios_config_retries` field and by the `Default` impl.
pub fn max_bios_config_retries_default() -> u32 {
    // Named so the meaning of the literal is visible at the return site.
    const DEFAULT_MAX_BIOS_CONFIG_RETRIES: u32 = 3;
    DEFAULT_MAX_BIOS_CONFIG_RETRIES
}

/// Returns the default value for `polling_bios_setup_stuck_threshold`: fifteen minutes.
///
/// Referenced by the `#[serde(default = ...)]` attribute on the
/// `polling_bios_setup_stuck_threshold` field and by the `Default` impl.
pub fn polling_bios_setup_stuck_threshold_default() -> Duration {
Duration::minutes(15)
}
}

impl Default for MachineStateControllerConfig {
Expand All @@ -109,6 +128,10 @@ impl Default for MachineStateControllerConfig {
scout_reporting_timeout: MachineStateControllerConfig::scout_reporting_timeout_default(
),
uefi_boot_wait: MachineStateControllerConfig::uefi_boot_wait_default(),
max_bios_config_retries: MachineStateControllerConfig::max_bios_config_retries_default(
),
polling_bios_setup_stuck_threshold:
MachineStateControllerConfig::polling_bios_setup_stuck_threshold_default(),
}
}
}
Loading
Loading