From 0e1d965a06442108318f7d3d1d0b23c8b8559906 Mon Sep 17 00:00:00 2001 From: mkoci Date: Tue, 12 May 2026 18:01:05 -0400 Subject: [PATCH 01/30] feat(health): add machine placement metadata to events Signed-off-by: mkoci --- crates/health/benches/collector_pipeline.rs | 3 ++ crates/health/benches/processor_pipeline.rs | 6 +++ crates/health/benches/sink_pipeline.rs | 3 ++ crates/health/example/config.bmc-mock.toml | 2 +- crates/health/example/config.example.toml | 2 +- crates/health/src/api_client.rs | 3 ++ crates/health/src/config.rs | 47 +++++++++++++++++ crates/health/src/endpoint/mod.rs | 53 +++++++++++++++++++- crates/health/src/endpoint/model.rs | 4 ++ crates/health/src/endpoint/sources.rs | 18 +++++++ crates/health/src/processor/health_report.rs | 3 ++ crates/health/src/sink/events.rs | 22 ++++++++ crates/health/src/sink/mod.rs | 9 ++++ 13 files changed, 172 insertions(+), 3 deletions(-) diff --git a/crates/health/benches/collector_pipeline.rs b/crates/health/benches/collector_pipeline.rs index e8e1821da0..5d28892a7d 100644 --- a/crates/health/benches/collector_pipeline.rs +++ b/crates/health/benches/collector_pipeline.rs @@ -57,6 +57,9 @@ fn event_context() -> EventContext { metadata: Some(EndpointMetadata::Machine(MachineData { machine_id: MACHINE_ID.parse().expect("valid machine id"), machine_serial: None, + slot_number: None, + tray_index: None, + nvlink_domain_uuid: None, })), rack_id: None, } diff --git a/crates/health/benches/processor_pipeline.rs b/crates/health/benches/processor_pipeline.rs index 009431fdf8..dc0f34a72f 100644 --- a/crates/health/benches/processor_pipeline.rs +++ b/crates/health/benches/processor_pipeline.rs @@ -94,6 +94,9 @@ fn event_context() -> EventContext { metadata: Some(EndpointMetadata::Machine(MachineData { machine_id: MACHINE_ID.parse().expect("valid machine id"), machine_serial: None, + slot_number: None, + tray_index: None, + nvlink_domain_uuid: None, })), rack_id: None, } @@ -268,6 +271,9 @@ fn rack_event_contexts(rack_id: 
&str, tray_count: usize) -> Vec { metadata: Some(EndpointMetadata::Machine(MachineData { machine_id: MACHINE_ID.parse().expect("valid machine id"), machine_serial: None, + slot_number: None, + tray_index: None, + nvlink_domain_uuid: None, })), rack_id: Some(RackId::new(rack_id)), } diff --git a/crates/health/benches/sink_pipeline.rs b/crates/health/benches/sink_pipeline.rs index 6e03a74ef8..8e6b949aa6 100644 --- a/crates/health/benches/sink_pipeline.rs +++ b/crates/health/benches/sink_pipeline.rs @@ -67,6 +67,9 @@ fn event_context_for_machine(machine_id: &str) -> EventContext { metadata: Some(EndpointMetadata::Machine(MachineData { machine_id: machine_id.parse().expect("valid machine id"), machine_serial: None, + slot_number: None, + tray_index: None, + nvlink_domain_uuid: None, })), rack_id: None, } diff --git a/crates/health/example/config.bmc-mock.toml b/crates/health/example/config.bmc-mock.toml index ac1b338490..c39610b1e0 100644 --- a/crates/health/example/config.bmc-mock.toml +++ b/crates/health/example/config.bmc-mock.toml @@ -23,7 +23,7 @@ mac = "aa:bb:cc:dd:ee:ff" username = "admin" password = "secret" rack_id = "RACK_1" -machine = { id = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "MN-001" } +machine = { id = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "MN-001", slot_number = 15, tray_index = 5, nvlink_domain_uuid = "00000000-0000-0000-0000-000000000000" } # switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SWITCH-001" } # power_shelf = { id = "fps100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-POWER-SHELF-001" } diff --git a/crates/health/example/config.example.toml b/crates/health/example/config.example.toml index 49b9f0f114..8fb39db821 100644 --- a/crates/health/example/config.example.toml +++ b/crates/health/example/config.example.toml @@ -38,7 +38,7 @@ port = 443 mac = "aa:bb:cc:dd:ee:ff" username = "admin" password = "secret" 
-machine = { id = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "MN-001" } +machine = { id = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "MN-001", slot_number = 15, tray_index = 5, nvlink_domain_uuid = "00000000-0000-0000-0000-000000000000" } [[endpoint_sources.static_bmc_endpoints]] ip = "10.0.1.1" diff --git a/crates/health/src/api_client.rs b/crates/health/src/api_client.rs index cf018f46bb..102332e4b0 100644 --- a/crates/health/src/api_client.rs +++ b/crates/health/src/api_client.rs @@ -217,6 +217,9 @@ impl ApiClientWrapper { EndpointMetadata::Machine(MachineData { machine_id, machine_serial: info.dmi_data.map(|dmi| dmi.chassis_serial), + slot_number: None, + tray_index: None, + nvlink_domain_uuid: None, }) }); diff --git a/crates/health/src/config.rs b/crates/health/src/config.rs index 1b62f9c6d0..2a3f3fe36c 100644 --- a/crates/health/src/config.rs +++ b/crates/health/src/config.rs @@ -111,6 +111,9 @@ pub struct StaticBmcEndpoint { pub struct StaticMachineEndpoint { pub id: String, pub serial: Option, + pub slot_number: Option, + pub tray_index: Option, + pub nvlink_domain_uuid: Option, } #[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] @@ -1388,6 +1391,39 @@ power_shelf = { id = "fps100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1 ); } + #[test] + fn test_static_machine_endpoint_accepts_placement_and_nvlink_metadata() { + let toml_content = r#" +[endpoint_sources.carbide_api] +enabled = false + +[[endpoint_sources.static_bmc_endpoints]] +ip = "10.0.1.2" +mac = "11:22:33:44:55:11" +username = "admin" +password = "pass" +machine = { id = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "MN-001", slot_number = 15, tray_index = 5, nvlink_domain_uuid = "00000000-0000-0000-0000-000000000000" } +"#; + + let config: Config = Figment::new() + .merge(Serialized::defaults(Config::default())) + .merge(Toml::string(toml_content)) + .extract() + .expect("failed to parse static 
machine endpoint config"); + + let machine = config.endpoint_sources.static_bmc_endpoints[0] + .machine + .as_ref() + .expect("machine metadata"); + + assert_eq!(machine.slot_number, Some(15)); + assert_eq!(machine.tray_index, Some(5)); + assert_eq!( + machine.nvlink_domain_uuid.as_deref(), + Some("00000000-0000-0000-0000-000000000000") + ); + } + #[test] fn test_static_endpoint_rejects_multiple_identity_types() { let toml_content = r#" @@ -1429,6 +1465,17 @@ switch = { serial = "SN-SW-001" } .switch .is_none() ); + let machine = config.endpoint_sources.static_bmc_endpoints[0] + .machine + .as_ref() + .expect("machine metadata"); + assert_eq!(machine.serial.as_deref(), Some("MN-001")); + assert_eq!(machine.slot_number, Some(15)); + assert_eq!(machine.tray_index, Some(5)); + assert_eq!( + machine.nvlink_domain_uuid.as_deref(), + Some("00000000-0000-0000-0000-000000000000") + ); assert_eq!( config.endpoint_sources.static_bmc_endpoints[1] .switch diff --git a/crates/health/src/endpoint/mod.rs b/crates/health/src/endpoint/mod.rs index 5b33ce2be1..04c98ec4e7 100644 --- a/crates/health/src/endpoint/mod.rs +++ b/crates/health/src/endpoint/mod.rs @@ -35,7 +35,9 @@ mod tests { use super::*; use crate::HealthError; - use crate::config::{StaticBmcEndpoint, StaticPowerShelfEndpoint, StaticSwitchEndpoint}; + use crate::config::{ + StaticBmcEndpoint, StaticMachineEndpoint, StaticPowerShelfEndpoint, StaticSwitchEndpoint, + }; fn make_test_endpoint(mac: MacAddress) -> BmcEndpoint { BmcEndpoint::with_fixed_credentials( @@ -230,6 +232,55 @@ mod tests { } } + #[tokio::test] + async fn test_static_machine_endpoint_sets_placement_and_nvlink_metadata() { + let machine_id = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0" + .parse() + .expect("valid machine id"); + let domain_uuid = "00000000-0000-0000-0000-000000000000" + .parse() + .expect("valid NVLink domain UUID"); + let configs = vec![StaticBmcEndpoint { + ip: "10.0.1.2".to_string(), + port: Some(443), + mac: 
"11:22:33:44:55:11".to_string(), + username: "admin".to_string(), + password: Some("pass".to_string()), + machine: Some(StaticMachineEndpoint { + id: "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0".to_string(), + serial: Some("MN-001".to_string()), + slot_number: Some(15), + tray_index: Some(5), + nvlink_domain_uuid: Some("00000000-0000-0000-0000-000000000000".to_string()), + }), + power_shelf: None, + switch: None, + rack_id: Some("RACK_1".to_string()), + }]; + + let source = StaticEndpointSource::from_config(&configs); + let endpoints = source.fetch_bmc_hosts().await.unwrap(); + + assert_eq!(endpoints.len(), 1); + assert_eq!( + endpoints[0] + .rack_id + .as_ref() + .map(|rack_id| rack_id.as_str()), + Some("RACK_1") + ); + match &endpoints[0].metadata { + Some(EndpointMetadata::Machine(machine)) => { + assert_eq!(machine.machine_id, machine_id); + assert_eq!(machine.machine_serial.as_deref(), Some("MN-001")); + assert_eq!(machine.slot_number, Some(15)); + assert_eq!(machine.tray_index, Some(5)); + assert_eq!(machine.nvlink_domain_uuid, Some(domain_uuid)); + } + other => panic!("expected Machine metadata, got {other:?}"), + } + } + #[tokio::test] async fn test_static_endpoint_without_switch_serial_has_no_metadata() { let configs = vec![StaticBmcEndpoint { diff --git a/crates/health/src/endpoint/model.rs b/crates/health/src/endpoint/model.rs index 78301cafca..e0e2ff7146 100644 --- a/crates/health/src/endpoint/model.rs +++ b/crates/health/src/endpoint/model.rs @@ -22,6 +22,7 @@ use std::pin::Pin; use std::sync::{Arc, RwLock}; use carbide_uuid::machine::MachineId; +use carbide_uuid::nvlink::NvLinkDomainId; use carbide_uuid::power_shelf::PowerShelfId; use carbide_uuid::rack::RackId; use carbide_uuid::switch::SwitchId; @@ -140,6 +141,9 @@ impl EndpointMetadata { pub struct MachineData { pub machine_id: MachineId, pub machine_serial: Option, + pub slot_number: Option, + pub tray_index: Option, + pub nvlink_domain_uuid: Option, } #[derive(Clone, Debug)] diff 
--git a/crates/health/src/endpoint/sources.rs b/crates/health/src/endpoint/sources.rs index 09a02e5468..8fde411024 100644 --- a/crates/health/src/endpoint/sources.rs +++ b/crates/health/src/endpoint/sources.rs @@ -18,6 +18,7 @@ use std::str::FromStr; use std::sync::Arc; +use carbide_uuid::nvlink::NvLinkDomainId; use carbide_uuid::rack::RackId; use mac_address::MacAddress; @@ -99,10 +100,27 @@ impl StaticEndpointSource { })) } else if let Some(machine) = &cfg.machine { let machine_id = &machine.id; + let nvlink_domain_uuid = machine.nvlink_domain_uuid.as_ref().and_then(|id| { + match NvLinkDomainId::from_str(id) { + Ok(id) => Some(id), + Err(error) => { + tracing::warn!( + ?error, + nvlink_domain_uuid = ?id, + "Invalid machine.nvlink_domain_uuid in static endpoint config" + ); + None + } + } + }); + match machine_id.parse() { Ok(machine_id) => Some(EndpointMetadata::Machine(MachineData { machine_id, machine_serial: machine.serial.clone(), + slot_number: machine.slot_number, + tray_index: machine.tray_index, + nvlink_domain_uuid, })), Err(error) => { tracing::warn!( diff --git a/crates/health/src/processor/health_report.rs b/crates/health/src/processor/health_report.rs index f351743875..ea0f9e40c9 100644 --- a/crates/health/src/processor/health_report.rs +++ b/crates/health/src/processor/health_report.rs @@ -278,6 +278,9 @@ mod tests { .parse() .expect("valid machine id"), machine_serial: None, + slot_number: None, + tray_index: None, + nvlink_domain_uuid: None, })), rack_id: None, } diff --git a/crates/health/src/sink/events.rs b/crates/health/src/sink/events.rs index 45a60beb0f..3bab5aff43 100644 --- a/crates/health/src/sink/events.rs +++ b/crates/health/src/sink/events.rs @@ -18,6 +18,7 @@ use std::sync::Arc; use carbide_uuid::machine::MachineId; +use carbide_uuid::nvlink::NvLinkDomainId; use carbide_uuid::power_shelf::PowerShelfId; use carbide_uuid::rack::RackId; use carbide_uuid::switch::SwitchId; @@ -69,6 +70,27 @@ impl EventContext { } } + pub fn 
slot_number(&self) -> Option { + match &self.metadata { + Some(EndpointMetadata::Machine(machine)) => machine.slot_number, + _ => None, + } + } + + pub fn tray_index(&self) -> Option { + match &self.metadata { + Some(EndpointMetadata::Machine(machine)) => machine.tray_index, + _ => None, + } + } + + pub fn nvlink_domain_uuid(&self) -> Option { + match &self.metadata { + Some(EndpointMetadata::Machine(machine)) => machine.nvlink_domain_uuid, + _ => None, + } + } + pub fn switch_id(&self) -> Option { match &self.metadata { Some(EndpointMetadata::Switch(switch)) => switch.id, diff --git a/crates/health/src/sink/mod.rs b/crates/health/src/sink/mod.rs index 040af177e3..f88318b06c 100644 --- a/crates/health/src/sink/mod.rs +++ b/crates/health/src/sink/mod.rs @@ -161,6 +161,9 @@ mod tests { .parse() .expect("valid machine id"), machine_serial: None, + slot_number: None, + tray_index: None, + nvlink_domain_uuid: None, })), rack_id: None, }; @@ -221,6 +224,9 @@ mod tests { .parse() .expect("valid machine id"), machine_serial: None, + slot_number: None, + tray_index: None, + nvlink_domain_uuid: None, })), rack_id: None, }; @@ -273,6 +279,9 @@ mod tests { .parse() .expect("valid machine id"), machine_serial: None, + slot_number: None, + tray_index: None, + nvlink_domain_uuid: None, })), rack_id: None, }; From 0053968e4d4167e8742a5476c5ea7e045f12f506 Mon Sep 17 00:00:00 2001 From: mkoci Date: Tue, 12 May 2026 18:01:44 -0400 Subject: [PATCH 02/30] feat(health): forward machine metadata from API Signed-off-by: mkoci --- crates/health/src/api_client.rs | 34 +++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/crates/health/src/api_client.rs b/crates/health/src/api_client.rs index 102332e4b0..6aece448d5 100644 --- a/crates/health/src/api_client.rs +++ b/crates/health/src/api_client.rs @@ -210,18 +210,28 @@ impl ApiClientWrapper { )); }; let addr = BmcAddr::try_from(bmc_info)?; - let metadata = machine - .id - 
.zip(machine.discovery_info.clone()) - .map(|(machine_id, info)| { - EndpointMetadata::Machine(MachineData { - machine_id, - machine_serial: info.dmi_data.map(|dmi| dmi.chassis_serial), - slot_number: None, - tray_index: None, - nvlink_domain_uuid: None, - }) - }); + let metadata = machine.id.map(|machine_id| { + EndpointMetadata::Machine(MachineData { + machine_id, + machine_serial: machine + .discovery_info + .as_ref() + .and_then(|info| info.dmi_data.as_ref()) + .map(|dmi| dmi.chassis_serial.clone()), + slot_number: machine + .placement_in_rack + .as_ref() + .and_then(|placement| placement.slot_number), + tray_index: machine + .placement_in_rack + .as_ref() + .and_then(|placement| placement.tray_index), + nvlink_domain_uuid: machine + .nvlink_info + .as_ref() + .and_then(|info| info.domain_uuid), + }) + }); self.endpoint_with_auth(addr, metadata, machine.rack_id.clone()) .await From 037dd502b09ec7159e2b1acaf364d750665a3a48 Mon Sep 17 00:00:00 2001 From: mkoci Date: Tue, 12 May 2026 18:02:30 -0400 Subject: [PATCH 03/30] feat(health): expose machine metadata as Prometheus labels Signed-off-by: mkoci --- crates/health/src/sink/prometheus.rs | 62 ++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/crates/health/src/sink/prometheus.rs b/crates/health/src/sink/prometheus.rs index fd368d7449..344d4b2ad7 100644 --- a/crates/health/src/sink/prometheus.rs +++ b/crates/health/src/sink/prometheus.rs @@ -100,6 +100,15 @@ impl PrometheusSink { if let Some(serial) = context.serial_number() { labels.push((Cow::Borrowed("serial_number"), serial.to_string())); } + if let Some(slot) = context.slot_number() { + labels.push((Cow::Borrowed("machine_slot_number"), slot.to_string())); + } + if let Some(tray) = context.tray_index() { + labels.push((Cow::Borrowed("machine_tray_index"), tray.to_string())); + } + if let Some(domain) = context.nvlink_domain_uuid() { + labels.push((Cow::Borrowed("nvlink_domain_uuid"), domain.to_string())); + } labels } @@ -213,3 
+222,56 @@ impl DataSink for PrometheusSink { } } } + +#[cfg(test)] +mod tests { + use std::str::FromStr; + + use carbide_uuid::nvlink::NvLinkDomainId; + use mac_address::MacAddress; + + use super::*; + use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData}; + + #[test] + fn test_stream_static_labels_includes_machine_metadata() { + let context = EventContext { + endpoint_key: "42:9e:b1:bd:9d:dd".to_string(), + addr: BmcAddr { + ip: "10.0.0.1".parse().expect("valid ip"), + port: Some(443), + mac: MacAddress::from_str("42:9e:b1:bd:9d:dd").unwrap(), + }, + collector_type: "sensor_collector", + metadata: Some(EndpointMetadata::Machine(MachineData { + machine_id: "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0" + .parse() + .expect("valid machine id"), + machine_serial: Some("MN-001".to_string()), + slot_number: Some(15), + tray_index: Some(5), + nvlink_domain_uuid: Some(NvLinkDomainId::nil()), + })), + rack_id: None, + }; + + let labels = PrometheusSink::stream_static_labels(&context); + let label_value = |key: &str| { + labels + .iter() + .find_map(|(label, value)| (label.as_ref() == key).then_some(value.as_str())) + }; + + assert_eq!( + label_value("machine_id"), + Some("fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0") + ); + assert_eq!(label_value("serial_number"), Some("MN-001")); + assert_eq!(label_value("machine_slot_number"), Some("15")); + assert_eq!(label_value("machine_tray_index"), Some("5")); + assert_eq!( + label_value("nvlink_domain_uuid"), + Some("00000000-0000-0000-0000-000000000000") + ); + } +} From 40c3d49791a62b6a0c054b3d5e44b9a4ad38c2e1 Mon Sep 17 00:00:00 2001 From: mkoci Date: Tue, 12 May 2026 18:03:09 -0400 Subject: [PATCH 04/30] feat(health): emit machine metadata as OTLP attributes Signed-off-by: mkoci --- crates/health/src/otlp/convert.rs | 80 ++++++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 1 deletion(-) diff --git a/crates/health/src/otlp/convert.rs b/crates/health/src/otlp/convert.rs index 
75126abe29..aa58c7f7be 100644 --- a/crates/health/src/otlp/convert.rs +++ b/crates/health/src/otlp/convert.rs @@ -42,6 +42,12 @@ fn string_value(s: String) -> Option { }) } +fn int_value(value: i64) -> Option { + Some(AnyValue { + value: Some(any_value::Value::IntValue(value)), + }) +} + fn kv(key: &str, val: String) -> KeyValue { KeyValue { key: key.to_string(), @@ -49,6 +55,13 @@ fn kv(key: &str, val: String) -> KeyValue { } } +fn int_kv(key: &str, value: i64) -> KeyValue { + KeyValue { + key: key.to_string(), + value: int_value(value), + } +} + fn resource_attributes(context: &EventContext) -> Vec { let mut attrs = vec![ kv("bmc.endpoint", context.endpoint_key.clone()), @@ -58,6 +71,15 @@ fn resource_attributes(context: &EventContext) -> Vec { if let Some(machine_id) = context.machine_id() { attrs.push(kv("machine.id", machine_id.to_string())); } + if let Some(slot) = context.slot_number() { + attrs.push(int_kv("machine.slot_number", i64::from(slot))); + } + if let Some(tray) = context.tray_index() { + attrs.push(int_kv("machine.tray_index", i64::from(tray))); + } + if let Some(domain) = context.nvlink_domain_uuid() { + attrs.push(kv("nvlink.domain.uuid", domain.to_string())); + } attrs } @@ -168,10 +190,11 @@ mod tests { use std::net::{IpAddr, Ipv4Addr}; use std::str::FromStr; + use carbide_uuid::nvlink::NvLinkDomainId; use mac_address::MacAddress; use super::*; - use crate::endpoint::BmcAddr; + use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData}; use crate::sink::{ Classification, HealthReport, HealthReportAlert, LogRecord, Probe, ReportSource, }; @@ -190,6 +213,61 @@ mod tests { } } + fn attr_value<'a>(attrs: &'a [KeyValue], key: &str) -> Option<&'a str> { + attrs + .iter() + .find(|attr| attr.key == key) + .and_then(|attr| attr.value.as_ref()) + .and_then(|value| match value.value.as_ref()? 
{ + any_value::Value::StringValue(value) => Some(value.as_str()), + _ => None, + }) + } + + fn attr_int_value(attrs: &[KeyValue], key: &str) -> Option { + attrs + .iter() + .find(|attr| attr.key == key) + .and_then(|attr| attr.value.as_ref()) + .and_then(|value| match value.value.as_ref()? { + any_value::Value::IntValue(value) => Some(*value), + _ => None, + }) + } + + #[test] + fn resource_attributes_include_machine_metadata_when_present() { + let domain_uuid = NvLinkDomainId::nil(); + let context = EventContext { + endpoint_key: "42:9e:b1:bd:9d:dd".to_string(), + addr: BmcAddr { + ip: IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1)), + port: Some(443), + mac: MacAddress::from_str("42:9e:b1:bd:9d:dd").expect("valid mac"), + }, + collector_type: "test", + metadata: Some(EndpointMetadata::Machine(MachineData { + machine_id: "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0" + .parse() + .expect("valid machine id"), + machine_serial: None, + slot_number: Some(15), + tray_index: Some(5), + nvlink_domain_uuid: Some(domain_uuid), + })), + rack_id: None, + }; + + let attrs = resource_attributes(&context); + + assert_eq!(attr_int_value(&attrs, "machine.slot_number"), Some(15)); + assert_eq!(attr_int_value(&attrs, "machine.tray_index"), Some(5)); + assert_eq!( + attr_value(&attrs, "nvlink.domain.uuid"), + Some("00000000-0000-0000-0000-000000000000") + ); + } + #[test] fn log_event_converts_to_otlp_record() { let ctx = test_context(); From f22528b14b36dfef49097d5aeb887de1f2957595 Mon Sep 17 00:00:00 2001 From: mkoci Date: Wed, 13 May 2026 21:12:28 -0400 Subject: [PATCH 05/30] docs(health): document hardware health metadata surfaces Signed-off-by: mkoci --- docs/architecture/health_aggregation.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/architecture/health_aggregation.md b/docs/architecture/health_aggregation.md index 9b7a01042d..8adbea27ba 100644 --- a/docs/architecture/health_aggregation.md +++ b/docs/architecture/health_aggregation.md @@ -265,6 
+265,13 @@ Finally, `carbide-hw-health` also emits a health-rollup in `HealthReport` format This assessed health status is built by comparing the metrics that are emitted from BMCs against well-defined ranges or by interpreting the `health_ok` values provided by BMCs. +For production deployments, `carbide-hw-health` discovers machine, switch, and power-shelf BMC endpoints from Carbide API via `[endpoint_sources.carbide_api]`. Machine endpoints carry the inventory metadata needed to interpret hardware health in fleet context, including machine ID, serial number, rack ID, rack placement, and NVLink domain UUID when present. Local and test deployments can instead configure explicit machine, switch, or power-shelf identity with `[[endpoint_sources.static_bmc_endpoints]]`; static machine endpoints can include the same serial number, rack placement, and NVLink domain UUID metadata, and all static endpoints can provide `rack_id` when rack-level rollups are needed. + +The publishing sinks expose that inventory context using the conventions of the target backend: +- `[sinks.prometheus]` adds machine metadata as metric labels named `machine_id`, `serial_number`, `machine_slot_number`, `machine_tray_index`, and `nvlink_domain_uuid`. +- `[sinks.otlp]` adds machine metadata as OTLP resource attributes named `machine.id`, integer `machine.slot_number`, integer `machine.tray_index`, and `nvlink.domain.uuid`. +- `[sinks.health_report]`, `[sinks.rack_health_report]`, `[sinks.switch_health_report]`, and `[sinks.power_shelf_health_report]` use the same event context when submitting assessed health reports back to Carbide API. The persisted `HealthReport` and `HealthProbeAlert` schemas remain the probe success/alert model described above. + ### BMC inventory monitoring The Site Explorer process within NICo Core periodically queries all Host and DPU BMCs in order to record certain BMC properties (e.g. components within a host and firmware versions). 
From 83149ec50fb2c6896dc66cf203f5056c5a42d25b Mon Sep 17 00:00:00 2001 From: mkoci Date: Thu, 14 May 2026 12:18:11 -0400 Subject: [PATCH 06/30] feat(health): expose switch placement metadata Signed-off-by: mkoci --- crates/health/example/config.bmc-mock.toml | 2 +- crates/health/example/config.example.toml | 2 +- crates/health/src/api_client.rs | 8 ++++ crates/health/src/config.rs | 18 +++++++- crates/health/src/discovery/iteration.rs | 2 + crates/health/src/discovery/spawn.rs | 2 + crates/health/src/endpoint/mod.rs | 4 ++ crates/health/src/endpoint/model.rs | 2 + crates/health/src/endpoint/sources.rs | 2 + crates/health/src/otlp/convert.rs | 50 +++++++++++++++++++- crates/health/src/sink/events.rs | 14 ++++++ crates/health/src/sink/prometheus.rs | 53 +++++++++++++++++++++- docs/architecture/health_aggregation.md | 6 +-- 13 files changed, 157 insertions(+), 8 deletions(-) diff --git a/crates/health/example/config.bmc-mock.toml b/crates/health/example/config.bmc-mock.toml index c39610b1e0..6f2e5638bc 100644 --- a/crates/health/example/config.bmc-mock.toml +++ b/crates/health/example/config.bmc-mock.toml @@ -24,7 +24,7 @@ username = "admin" password = "secret" rack_id = "RACK_1" machine = { id = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "MN-001", slot_number = 15, tray_index = 5, nvlink_domain_uuid = "00000000-0000-0000-0000-000000000000" } -# switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SWITCH-001" } +# switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SWITCH-001", slot_number = 7, tray_index = 3 } # power_shelf = { id = "fps100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-POWER-SHELF-001" } [endpoint_sources.carbide_api] diff --git a/crates/health/example/config.example.toml b/crates/health/example/config.example.toml index 8fb39db821..12067a6f3f 100644 --- a/crates/health/example/config.example.toml +++ 
b/crates/health/example/config.example.toml @@ -46,7 +46,7 @@ port = 443 mac = "11:22:33:44:55:66" username = "admin" password = "secret" -switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SWITCH-001" } +switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SWITCH-001", slot_number = 7, tray_index = 3 } [[endpoint_sources.static_bmc_endpoints]] ip = "10.0.2.1" diff --git a/crates/health/src/api_client.rs b/crates/health/src/api_client.rs index 6aece448d5..b49c78310b 100644 --- a/crates/health/src/api_client.rs +++ b/crates/health/src/api_client.rs @@ -260,6 +260,14 @@ impl ApiClientWrapper { Some(EndpointMetadata::Switch(SwitchData { id: switch.id, serial, + slot_number: switch + .placement_in_rack + .as_ref() + .and_then(|placement| placement.slot_number), + tray_index: switch + .placement_in_rack + .as_ref() + .and_then(|placement| placement.tray_index), })), None, ) diff --git a/crates/health/src/config.rs b/crates/health/src/config.rs index 2a3f3fe36c..c94d99ce6b 100644 --- a/crates/health/src/config.rs +++ b/crates/health/src/config.rs @@ -128,6 +128,8 @@ pub struct StaticPowerShelfEndpoint { pub struct StaticSwitchEndpoint { pub id: Option, pub serial: Option, + pub slot_number: Option, + pub tray_index: Option, } impl Debug for StaticBmcEndpoint { @@ -1319,7 +1321,7 @@ ip = "10.0.1.1" mac = "11:22:33:44:55:66" username = "cumulus" password = "pass" -switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SW-001" } +switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SW-001", slot_number = 7, tray_index = 3 } [[endpoint_sources.static_bmc_endpoints]] ip = "10.0.2.1" @@ -1375,6 +1377,20 @@ power_shelf = { id = "fps100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1 .and_then(|switch| switch.serial.as_deref()), Some("SN-SW-001") ); + assert_eq!( + config.endpoint_sources.static_bmc_endpoints[2] + 
.switch + .as_ref() + .and_then(|switch| switch.slot_number), + Some(7) + ); + assert_eq!( + config.endpoint_sources.static_bmc_endpoints[2] + .switch + .as_ref() + .and_then(|switch| switch.tray_index), + Some(3) + ); assert_eq!( config.endpoint_sources.static_bmc_endpoints[3] .power_shelf diff --git a/crates/health/src/discovery/iteration.rs b/crates/health/src/discovery/iteration.rs index 38ec370292..12da6ebaff 100644 --- a/crates/health/src/discovery/iteration.rs +++ b/crates/health/src/discovery/iteration.rs @@ -113,6 +113,8 @@ mod tests { Some(EndpointMetadata::Switch(SwitchData { id: None, serial: format!("serial-{mac}"), + slot_number: None, + tray_index: None, })) } else { None diff --git a/crates/health/src/discovery/spawn.rs b/crates/health/src/discovery/spawn.rs index 74c3cce86a..2bfbc901e8 100644 --- a/crates/health/src/discovery/spawn.rs +++ b/crates/health/src/discovery/spawn.rs @@ -391,6 +391,8 @@ mod tests { Some(EndpointMetadata::Switch(SwitchData { id: None, serial: "switch-serial-1".to_string(), + slot_number: None, + tray_index: None, })), None, ); diff --git a/crates/health/src/endpoint/mod.rs b/crates/health/src/endpoint/mod.rs index 04c98ec4e7..191679ad79 100644 --- a/crates/health/src/endpoint/mod.rs +++ b/crates/health/src/endpoint/mod.rs @@ -184,6 +184,8 @@ mod tests { switch: Some(StaticSwitchEndpoint { id: Some(switch_id.to_string()), serial: Some("SN-001".to_string()), + slot_number: Some(7), + tray_index: Some(3), }), rack_id: None, }]; @@ -196,6 +198,8 @@ mod tests { Some(EndpointMetadata::Switch(s)) => { assert_eq!(s.id, Some(switch_id)); assert_eq!(s.serial, "SN-001"); + assert_eq!(s.slot_number, Some(7)); + assert_eq!(s.tray_index, Some(3)); } other => panic!("expected Switch metadata, got {other:?}"), } diff --git a/crates/health/src/endpoint/model.rs b/crates/health/src/endpoint/model.rs index e0e2ff7146..06b2cf2a3a 100644 --- a/crates/health/src/endpoint/model.rs +++ b/crates/health/src/endpoint/model.rs @@ -156,6 +156,8 @@ pub 
struct PowerShelfData { pub struct SwitchData { pub id: Option, pub serial: String, + pub slot_number: Option, + pub tray_index: Option, } #[derive(Clone)] diff --git a/crates/health/src/endpoint/sources.rs b/crates/health/src/endpoint/sources.rs index 8fde411024..e2bc6bf8a0 100644 --- a/crates/health/src/endpoint/sources.rs +++ b/crates/health/src/endpoint/sources.rs @@ -97,6 +97,8 @@ impl StaticEndpointSource { Some(EndpointMetadata::Switch(SwitchData { id, serial, + slot_number: switch.slot_number, + tray_index: switch.tray_index, })) } else if let Some(machine) = &cfg.machine { let machine_id = &machine.id; diff --git a/crates/health/src/otlp/convert.rs b/crates/health/src/otlp/convert.rs index aa58c7f7be..fab1b521a6 100644 --- a/crates/health/src/otlp/convert.rs +++ b/crates/health/src/otlp/convert.rs @@ -71,6 +71,9 @@ fn resource_attributes(context: &EventContext) -> Vec { if let Some(machine_id) = context.machine_id() { attrs.push(kv("machine.id", machine_id.to_string())); } + if let Some(switch_id) = context.switch_id() { + attrs.push(kv("switch.id", switch_id.to_string())); + } if let Some(slot) = context.slot_number() { attrs.push(int_kv("machine.slot_number", i64::from(slot))); } @@ -80,6 +83,12 @@ fn resource_attributes(context: &EventContext) -> Vec { if let Some(domain) = context.nvlink_domain_uuid() { attrs.push(kv("nvlink.domain.uuid", domain.to_string())); } + if let Some(slot) = context.switch_slot_number() { + attrs.push(int_kv("switch.slot_number", i64::from(slot))); + } + if let Some(tray) = context.switch_tray_index() { + attrs.push(int_kv("switch.tray_index", i64::from(tray))); + } attrs } @@ -191,10 +200,11 @@ mod tests { use std::str::FromStr; use carbide_uuid::nvlink::NvLinkDomainId; + use carbide_uuid::switch::{SwitchId, SwitchIdSource, SwitchType}; use mac_address::MacAddress; use super::*; - use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData}; + use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData, SwitchData}; use 
crate::sink::{ Classification, HealthReport, HealthReportAlert, LogRecord, Probe, ReportSource, }; @@ -213,6 +223,13 @@ mod tests { } } + fn test_switch_id(label: &str) -> SwitchId { + let mut hash = [0u8; 32]; + let bytes = label.as_bytes(); + hash[..bytes.len().min(32)].copy_from_slice(&bytes[..bytes.len().min(32)]); + SwitchId::new(SwitchIdSource::Tpm, hash, SwitchType::NvLink) + } + fn attr_value<'a>(attrs: &'a [KeyValue], key: &str) -> Option<&'a str> { attrs .iter() @@ -268,6 +285,37 @@ mod tests { ); } + #[test] + fn resource_attributes_include_switch_placement_metadata_when_present() { + let switch_id = test_switch_id("switch-a"); + let switch_id_attr = switch_id.to_string(); + let context = EventContext { + endpoint_key: "11:22:33:44:55:66".to_string(), + addr: BmcAddr { + ip: IpAddr::V4(Ipv4Addr::new(10, 0, 1, 1)), + port: Some(443), + mac: MacAddress::from_str("11:22:33:44:55:66").expect("valid mac"), + }, + collector_type: "test", + metadata: Some(EndpointMetadata::Switch(SwitchData { + id: Some(switch_id), + serial: "SN-SWITCH-001".to_string(), + slot_number: Some(7), + tray_index: Some(3), + })), + rack_id: None, + }; + + let attrs = resource_attributes(&context); + + assert_eq!( + attr_value(&attrs, "switch.id"), + Some(switch_id_attr.as_str()) + ); + assert_eq!(attr_int_value(&attrs, "switch.slot_number"), Some(7)); + assert_eq!(attr_int_value(&attrs, "switch.tray_index"), Some(3)); + } + #[test] fn log_event_converts_to_otlp_record() { let ctx = test_context(); diff --git a/crates/health/src/sink/events.rs b/crates/health/src/sink/events.rs index 3bab5aff43..73fd66d7b1 100644 --- a/crates/health/src/sink/events.rs +++ b/crates/health/src/sink/events.rs @@ -98,6 +98,20 @@ impl EventContext { } } + pub fn switch_slot_number(&self) -> Option { + match &self.metadata { + Some(EndpointMetadata::Switch(switch)) => switch.slot_number, + _ => None, + } + } + + pub fn switch_tray_index(&self) -> Option { + match &self.metadata { + 
Some(EndpointMetadata::Switch(switch)) => switch.tray_index, + _ => None, + } + } + pub fn power_shelf_id(&self) -> Option { match &self.metadata { Some(EndpointMetadata::PowerShelf(power_shelf)) => power_shelf.id, diff --git a/crates/health/src/sink/prometheus.rs b/crates/health/src/sink/prometheus.rs index 344d4b2ad7..e1c70aae6f 100644 --- a/crates/health/src/sink/prometheus.rs +++ b/crates/health/src/sink/prometheus.rs @@ -97,6 +97,9 @@ impl PrometheusSink { if let Some(machine_id) = context.machine_id() { labels.push((Cow::Borrowed("machine_id"), machine_id.to_string())); } + if let Some(switch_id) = context.switch_id() { + labels.push((Cow::Borrowed("switch_id"), switch_id.to_string())); + } if let Some(serial) = context.serial_number() { labels.push((Cow::Borrowed("serial_number"), serial.to_string())); } @@ -109,6 +112,12 @@ impl PrometheusSink { if let Some(domain) = context.nvlink_domain_uuid() { labels.push((Cow::Borrowed("nvlink_domain_uuid"), domain.to_string())); } + if let Some(slot) = context.switch_slot_number() { + labels.push((Cow::Borrowed("switch_slot_number"), slot.to_string())); + } + if let Some(tray) = context.switch_tray_index() { + labels.push((Cow::Borrowed("switch_tray_index"), tray.to_string())); + } labels } @@ -228,10 +237,18 @@ mod tests { use std::str::FromStr; use carbide_uuid::nvlink::NvLinkDomainId; + use carbide_uuid::switch::{SwitchId, SwitchIdSource, SwitchType}; use mac_address::MacAddress; use super::*; - use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData}; + use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData, SwitchData}; + + fn test_switch_id(label: &str) -> SwitchId { + let mut hash = [0u8; 32]; + let bytes = label.as_bytes(); + hash[..bytes.len().min(32)].copy_from_slice(&bytes[..bytes.len().min(32)]); + SwitchId::new(SwitchIdSource::Tpm, hash, SwitchType::NvLink) + } #[test] fn test_stream_static_labels_includes_machine_metadata() { @@ -274,4 +291,38 @@ mod tests { 
Some("00000000-0000-0000-0000-000000000000") ); } + + #[test] + fn test_stream_static_labels_includes_switch_placement_metadata() { + let switch_id = test_switch_id("switch-a"); + let switch_id_label = switch_id.to_string(); + let context = EventContext { + endpoint_key: "11:22:33:44:55:66".to_string(), + addr: BmcAddr { + ip: "10.0.1.1".parse().expect("valid ip"), + port: Some(443), + mac: MacAddress::from_str("11:22:33:44:55:66").unwrap(), + }, + collector_type: "switch_collector", + metadata: Some(EndpointMetadata::Switch(SwitchData { + id: Some(switch_id), + serial: "SN-SWITCH-001".to_string(), + slot_number: Some(7), + tray_index: Some(3), + })), + rack_id: None, + }; + + let labels = PrometheusSink::stream_static_labels(&context); + let label_value = |key: &str| { + labels + .iter() + .find_map(|(label, value)| (label.as_ref() == key).then_some(value.as_str())) + }; + + assert_eq!(label_value("switch_id"), Some(switch_id_label.as_str())); + assert_eq!(label_value("serial_number"), Some("SN-SWITCH-001")); + assert_eq!(label_value("switch_slot_number"), Some("7")); + assert_eq!(label_value("switch_tray_index"), Some("3")); + } } diff --git a/docs/architecture/health_aggregation.md b/docs/architecture/health_aggregation.md index 8adbea27ba..c5100b2dca 100644 --- a/docs/architecture/health_aggregation.md +++ b/docs/architecture/health_aggregation.md @@ -265,11 +265,11 @@ Finally, `carbide-hw-health` also emits a health-rollup in `HealthReport` format This assessed health status is built by comparing the metrics that are emitted from BMCs against well-defined ranges or by interpreting the `health_ok` values provided by BMCs. -For production deployments, `carbide-hw-health` discovers machine, switch, and power-shelf BMC endpoints from Carbide API via `[endpoint_sources.carbide_api]`. 
Machine endpoints carry the inventory metadata needed to interpret hardware health in fleet context, including machine ID, serial number, rack ID, rack placement, and NVLink domain UUID when present. Local and test deployments can instead configure explicit machine, switch, or power-shelf identity with `[[endpoint_sources.static_bmc_endpoints]]`; static machine endpoints can include the same serial number, rack placement, and NVLink domain UUID metadata, and all static endpoints can provide `rack_id` when rack-level rollups are needed. +For production deployments, `carbide-hw-health` discovers machine, switch, and power-shelf BMC endpoints from Carbide API via `[endpoint_sources.carbide_api]`. Machine endpoints carry the inventory metadata needed to interpret hardware health in fleet context, including machine ID, serial number, rack ID, rack placement, and NVLink domain UUID when present. Switch endpoints carry switch ID, serial number, and rack placement when present. Local and test deployments can instead configure explicit machine, switch, or power-shelf identity with `[[endpoint_sources.static_bmc_endpoints]]`; static machine endpoints can include the same serial number, rack placement, and NVLink domain UUID metadata, static switch endpoints can include serial number and rack placement metadata, and all static endpoints can provide `rack_id` when rack-level rollups are needed. The publishing sinks expose that inventory context using the conventions of the target backend: -- `[sinks.prometheus]` adds machine metadata as metric labels named `machine_id`, `serial_number`, `machine_slot_number`, `machine_tray_index`, and `nvlink_domain_uuid`. -- `[sinks.otlp]` adds machine metadata as OTLP resource attributes named `machine.id`, integer `machine.slot_number`, integer `machine.tray_index`, and `nvlink.domain.uuid`. 
+- `[sinks.prometheus]` adds machine metadata as metric labels named `machine_id`, `serial_number`, `machine_slot_number`, `machine_tray_index`, and `nvlink_domain_uuid`; switch metadata uses `switch_id`, `serial_number`, `switch_slot_number`, and `switch_tray_index`. +- `[sinks.otlp]` adds machine metadata as OTLP resource attributes named `machine.id`, integer `machine.slot_number`, integer `machine.tray_index`, and `nvlink.domain.uuid`; switch metadata uses `switch.id`, integer `switch.slot_number`, and integer `switch.tray_index`. - `[sinks.health_report]`, `[sinks.rack_health_report]`, `[sinks.switch_health_report]`, and `[sinks.power_shelf_health_report]` use the same event context when submitting assessed health reports back to Carbide API. The persisted `HealthReport` and `HealthProbeAlert` schemas remain the probe success/alert model described above. ### BMC inventory monitoring From 6331ef87babe51f87638eb17d5a330e1a054634e Mon Sep 17 00:00:00 2001 From: mkoci Date: Thu, 23 Apr 2026 13:24:16 +0200 Subject: [PATCH 07/30] refactor(health): rename SseConnectionGuard to StreamingConnectionGuard Signed-off-by: mkoci --- crates/health/src/collectors/runtime.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/crates/health/src/collectors/runtime.rs b/crates/health/src/collectors/runtime.rs index 94203d659a..5b9afc41e7 100644 --- a/crates/health/src/collectors/runtime.rs +++ b/crates/health/src/collectors/runtime.rs @@ -235,18 +235,18 @@ impl StreamMetrics { } } -/// RAII guard: increments `active_sse_connections` on construction, decrements on drop. +/// RAII guard: increments the passed IntGauge on construction, decrements on drop. /// Ensures every exit path from a connected stream (cancel, error, end, reconnect) dec's. 
-struct SseConnectionGuard(IntGauge); +pub(crate) struct StreamingConnectionGuard(IntGauge); -impl SseConnectionGuard { - fn inc(gauge: IntGauge) -> Self { +impl StreamingConnectionGuard { + pub(crate) fn inc(gauge: IntGauge) -> Self { gauge.inc(); Self(gauge) } } -impl Drop for SseConnectionGuard { +impl Drop for StreamingConnectionGuard { fn drop(&mut self) { self.0.dec(); } @@ -516,7 +516,7 @@ impl Collector { Ok(mut stream) => { // the guard lives exactly as long as we hold an open stream; Drop // handles dec for every exit path (shutdown, error, stream end). - let _conn_guard = SseConnectionGuard::inc(metrics.connected.clone()); + let _conn_guard = StreamingConnectionGuard::inc(metrics.connected.clone()); backoff.reset(); tracing::info!( collector_type, From 89acbe5512aeb981fe99b99d1eab6addc76898e2 Mon Sep 17 00:00:00 2001 From: mkoci Date: Thu, 23 Apr 2026 13:24:17 +0200 Subject: [PATCH 08/30] feat(health): vendor openconfig/gnmi protos for reproducible builds Signed-off-by: mkoci --- crates/health/build.rs | 26 +- .../openconfig/gnmi/proto/gnmi/gnmi.proto | 467 ++++++++++++++++++ .../gnmi/proto/gnmi_ext/gnmi_ext.proto | 161 ++++++ 3 files changed, 652 insertions(+), 2 deletions(-) create mode 100644 crates/health/proto/github.com/openconfig/gnmi/proto/gnmi/gnmi.proto create mode 100644 crates/health/proto/github.com/openconfig/gnmi/proto/gnmi_ext/gnmi_ext.proto diff --git a/crates/health/build.rs b/crates/health/build.rs index dbc1d6f838..9120b06a10 100644 --- a/crates/health/build.rs +++ b/crates/health/build.rs @@ -20,17 +20,39 @@ use std::path::PathBuf; fn main() -> Result<(), Box> { carbide_version::build(); - // vendored from opentelemetry-proto v1.5.0 let proto_dir = PathBuf::from("proto"); println!("cargo:rerun-if-changed=proto/"); + // vendored from opentelemetry-proto v1.5.0 tonic_prost_build::configure() .build_server(false) .build_client(true) .compile_protos( &[proto_dir.join("opentelemetry/proto/collector/logs/v1/logs_service.proto")], - 
&[proto_dir], + std::slice::from_ref(&proto_dir), + )?; + + // vendored from openconfig/gnmi v0.11.0 + // gnmi_ext compiled separately so gnmi.proto can extern_path it and reuse the types + tonic_prost_build::configure() + .build_client(true) + .build_server(false) + .compile_protos( + &[proto_dir.join("github.com/openconfig/gnmi/proto/gnmi_ext/gnmi_ext.proto")], + std::slice::from_ref(&proto_dir), + )?; + + tonic_prost_build::configure() + .build_client(true) + .build_server(false) + .extern_path( + ".gnmi_ext", + "crate::collectors::nvue::gnmi::proto::gnmi_ext", + ) + .compile_protos( + &[proto_dir.join("github.com/openconfig/gnmi/proto/gnmi/gnmi.proto")], + std::slice::from_ref(&proto_dir), )?; Ok(()) diff --git a/crates/health/proto/github.com/openconfig/gnmi/proto/gnmi/gnmi.proto b/crates/health/proto/github.com/openconfig/gnmi/proto/gnmi/gnmi.proto new file mode 100644 index 0000000000..5738aedd2b --- /dev/null +++ b/crates/health/proto/github.com/openconfig/gnmi/proto/gnmi/gnmi.proto @@ -0,0 +1,467 @@ +// +// Copyright 2016 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +syntax = "proto3"; + +import "google/protobuf/any.proto"; +import "google/protobuf/descriptor.proto"; +import "github.com/openconfig/gnmi/proto/gnmi_ext/gnmi_ext.proto"; + +// Package gNMI defines a service specification for the gRPC Network Management +// Interface. 
This interface is defined to be a standard interface via which +// a network management system ("client") can subscribe to state values, +// retrieve snapshots of state information, and manipulate the state of a data +// tree supported by a device ("target"). +// +// This document references the gNMI Specification which can be found at +// http://github.com/openconfig/reference/blob/master/rpc/gnmi +package gnmi; + +// Define a protobuf FileOption that defines the gNMI service version. +extend google.protobuf.FileOptions { + // The gNMI service semantic version. + string gnmi_service = 1001; +} + +// gNMI_service is the current version of the gNMI service, returned through +// the Capabilities RPC. +option (gnmi_service) = "0.10.0"; + +option go_package = "github.com/openconfig/gnmi/proto/gnmi"; +option java_multiple_files = true; +option java_outer_classname = "GnmiProto"; +option java_package = "com.github.gnmi.proto"; + + +service gNMI { + // Capabilities allows the client to retrieve the set of capabilities that + // is supported by the target. This allows the target to validate the + // service version that is implemented and retrieve the set of models that + // the target supports. The models can then be specified in subsequent RPCs + // to restrict the set of data that is utilized. + // Reference: gNMI Specification Section 3.2 + rpc Capabilities(CapabilityRequest) returns (CapabilityResponse); + // Retrieve a snapshot of data from the target. A Get RPC requests that the + // target snapshots a subset of the data tree as specified by the paths + // included in the message and serializes this to be returned to the + // client using the specified encoding. + // Reference: gNMI Specification Section 3.3 + rpc Get(GetRequest) returns (GetResponse); + // Set allows the client to modify the state of data on the target. The + // paths to modified along with the new values that the client wishes + // to set the value to. 
+ // Reference: gNMI Specification Section 3.4 + rpc Set(SetRequest) returns (SetResponse); + // Subscribe allows a client to request the target to send it values + // of particular paths within the data tree. These values may be streamed + // at a particular cadence (STREAM), sent one off on a long-lived channel + // (POLL), or sent as a one-off retrieval (ONCE). + // Reference: gNMI Specification Section 3.5 + rpc Subscribe(stream SubscribeRequest) returns (stream SubscribeResponse); +} + +// Notification is a re-usable message that is used to encode data from the +// target to the client. A Notification carries two types of changes to the data +// tree: +// - Deleted values (delete) - a set of paths that have been removed from the +// data tree. +// - Updated values (update) - a set of path-value pairs indicating the path +// whose value has changed in the data tree. +// Reference: gNMI Specification Section 2.1 +message Notification { + int64 timestamp = 1; // Timestamp in nanoseconds since Epoch. + Path prefix = 2; // Prefix used for paths in the message. + repeated Update update = 4; // Data elements that have changed values. + repeated Path delete = 5; // Data elements that have been deleted. + // This notification contains a set of paths that are always updated together + // referenced by a globally unique prefix. + bool atomic = 6; + // Reserved field numbers and identifiers. + reserved "alias"; + reserved 3; +} + +// Update is a re-usable message that is used to store a particular Path, +// Value pair. +// Reference: gNMI Specification Section 2.1 +message Update { + Path path = 1; // The path (key) for the update. + Value value = 2 [deprecated = true]; // The value (value) for the update. + TypedValue val = 3; // The explicitly typed update value. + uint32 duplicates = 4; // Number of coalesced duplicates. +} + +// TypedValue is used to encode a value being sent between the client and +// target (originated by either entity). 
+message TypedValue { + // One of the fields within the val oneof is populated with the value + // of the update. The type of the value being included in the Update + // determines which field should be populated. In the case that the + // encoding is a particular form of the base protobuf type, a specific + // field is used to store the value (e.g., json_val). + oneof value { + string string_val = 1; // String value. + int64 int_val = 2; // Integer value. + uint64 uint_val = 3; // Unsigned integer value. + bool bool_val = 4; // Bool value. + bytes bytes_val = 5; // Arbitrary byte sequence value. + float float_val = 6 [deprecated = true]; // Deprecated - use double_val. + double double_val = 14; // Floating point value. + Decimal64 decimal_val = 7 + [deprecated = true]; // Deprecated - use double_val. + ScalarArray leaflist_val = 8; // Mixed type scalar array value. + google.protobuf.Any any_val = 9; // protobuf.Any encoded bytes. + bytes json_val = 10; // JSON-encoded text. + bytes json_ietf_val = 11; // JSON-encoded text per RFC7951. + string ascii_val = 12; // Arbitrary ASCII text. + // Protobuf binary encoded bytes. The message type is not included. + // See the specification at + // github.com/openconfig/reference/blob/master/rpc/gnmi/protobuf-vals.md + // for a complete specification. [Experimental] + bytes proto_bytes = 13; + } +} + +// Path encodes a data tree path as a series of repeated strings, with +// each element of the path representing a data tree node name and the +// associated attributes. +// Reference: gNMI Specification Section 2.2.2. +message Path { + // Elements of the path are no longer encoded as a string, but rather within + // the elem field as a PathElem message. + repeated string element = 1 [deprecated = true]; + string origin = 2; // Label to disambiguate path. + repeated PathElem elem = 3; // Elements of the path. + string target = 4; // The name of the target + // (Sec. 
2.2.2.1) +} + +// PathElem encodes an element of a gNMI path, along with any attributes (keys) +// that may be associated with it. +// Reference: gNMI Specification Section 2.2.2. +message PathElem { + string name = 1; // The name of the element in the path. + map key = 2; // Map of key (attribute) name to value. +} + +// Value encodes a data tree node's value - along with the way in which +// the value is encoded. This message is deprecated by gNMI 0.3.0. +// Reference: gNMI Specification Section 2.2.3. +message Value { + option deprecated = true; + + bytes value = 1; // Value of the variable being transmitted. + Encoding type = 2; // Encoding used for the value field. +} + +// Encoding defines the value encoding formats that are supported by the gNMI +// protocol. These encodings are used by both the client (when sending Set +// messages to modify the state of the target) and the target when serializing +// data to be returned to the client (in both Subscribe and Get RPCs). +// Reference: gNMI Specification Section 2.3 +enum Encoding { + JSON = 0; // JSON encoded text. + BYTES = 1; // Arbitrarily encoded bytes. + PROTO = 2; // Encoded according to scalar values of TypedValue. + ASCII = 3; // ASCII text of an out-of-band agreed format. + JSON_IETF = 4; // JSON encoded text as per RFC7951. +} + +// Error message previously utilised to return errors to the client. Deprecated +// in favour of using the google.golang.org/genproto/googleapis/rpc/status +// message in the RPC response. +// Reference: gNMI Specification Section 2.5 +message Error { + option deprecated = true; + + uint32 code = 1; // Canonical gRPC error code. + string message = 2; // Human readable error. + google.protobuf.Any data = 3; // Optional additional information. +} + +// Decimal64 is used to encode a fixed precision decimal number. The value +// is expressed as a set of digits with the precision specifying the +// number of digits following the decimal point in the digit set. 
+// This message is deprecated in favor of encoding all floating point types +// as double precision. +message Decimal64 { + option deprecated = true; + + int64 digits = 1; // Set of digits. + uint32 precision = 2; // Number of digits following the decimal point. +} + +// ScalarArray is used to encode a mixed-type array of values. +message ScalarArray { + // The set of elements within the array. Each TypedValue message should + // specify only elements that have a field identifier of 1-7 (i.e., the + // values are scalar values). + repeated TypedValue element = 1; +} + +// SubscribeRequest is the message sent by the client to the target when +// initiating a subscription to a set of paths within the data tree. The +// request field must be populated and the initial message must specify a +// SubscriptionList to initiate a subscription. +// Reference: gNMI Specification Section 3.5.1.1 +message SubscribeRequest { + oneof request { + SubscriptionList subscribe = 1; // Specify the paths within a subscription. + Poll poll = 3; // Trigger a polled update. + } + // Extension messages associated with the SubscribeRequest. See the + // gNMI extension specification for further definition. + repeated gnmi_ext.Extension extension = 5; + // Reserved field numbers and identifiers. + reserved 4; + reserved "aliases"; +} + +// Poll is sent within a SubscribeRequest to trigger the device to +// send telemetry updates for the paths that are associated with the +// subscription. +// Reference: gNMI Specification Section Section 3.5.1.4 +message Poll {} + +// SubscribeResponse is the message used by the target within a Subscribe RPC. +// The target includes a Notification message which is used to transmit values +// of the path(s) that are associated with the subscription. The same message +// is to indicate that the target has sent all data values once (is +// synchronized). 
+// Reference: gNMI Specification Section 3.5.1.4 +message SubscribeResponse { + oneof response { + Notification update = 1; // Changed or sampled value for a path. + // Indicate target has sent all values associated with the subscription + // at least once. + bool sync_response = 3; + // Deprecated in favour of google.golang.org/genproto/googleapis/rpc/status + Error error = 4 [deprecated = true]; + } + // Extension messages associated with the SubscribeResponse. See the + // gNMI extension specification for further definition. + repeated gnmi_ext.Extension extension = 5; +} + +// SubscriptionList is used within a Subscribe message to specify the list of +// paths that the client wishes to subscribe to. The message consists of a +// list of (possibly prefixed) paths, and options that relate to the +// subscription. +// Reference: gNMI Specification Section 3.5.1.2 +message SubscriptionList { + Path prefix = 1; // Prefix used for paths. + repeated Subscription subscription = 2; // Set of subscriptions to create. + QOSMarking qos = 4; // DSCP marking to be used. + // Mode of the subscription. + enum Mode { + STREAM = 0; // Values streamed by the target (Sec. 3.5.1.5.2). + ONCE = 1; // Values sent once-off by the target (Sec. 3.5.1.5.1). + POLL = 2; // Values sent in response to a poll request (Sec. 3.5.1.5.3). + } + Mode mode = 5; + // Whether elements of the schema that are marked as eligible for aggregation + // should be aggregated or not. + bool allow_aggregation = 6; + // The set of schemas that define the elements of the data tree that should + // be sent by the target. + repeated ModelData use_models = 7; + // The encoding that the target should use within the Notifications generated + // corresponding to the SubscriptionList. + Encoding encoding = 8; + // An optional field to specify that only updates to current state should be + // sent to a client. 
If set, the initial state is not sent to the client but + // rather only the sync message followed by any subsequent updates to the + // current state. For ONCE and POLL modes, this causes the server to send only + // the sync message (Sec. 3.5.2.3). + bool updates_only = 9; + // Reserved field numbers and identifiers. + reserved 3; + reserved "use_aliases"; +} + +// Subscription is a single request within a SubscriptionList. The path +// specified is interpreted (along with the prefix) as the elements of the data +// tree that the client is subscribing to. The mode determines how the target +// should trigger updates to be sent. +// Reference: gNMI Specification Section 3.5.1.3 +message Subscription { + Path path = 1; // The data tree path. + SubscriptionMode mode = 2; // Subscription mode to be used. + uint64 sample_interval = 3; // ns between samples in SAMPLE mode. + // Indicates whether values that have not changed should be sent in a SAMPLE + // subscription. + bool suppress_redundant = 4; + // 1. A heartbeat interval MAY be specified along with an “on change” + // subscription - in this case, the value of the data item(s) MUST be re-sent + // once per heartbeat interval regardless of whether the value has changed or + // not. + // 2. A heartbeat_interval MAY be specified to modify the behavior of + // suppress_redundant in a sampled subscription. In this case, the + // target MUST generate one telemetry update per heartbeat interval, + // regardless of whether the suppress_redundant flag is set to true. + // This value is specified as an unsigned 64-bit integer in nanoseconds + uint64 heartbeat_interval = 5; +} + +// SubscriptionMode is the mode of the subscription, specifying how the +// target must return values in a subscription. +// Reference: gNMI Specification Section 3.5.1.3 +enum SubscriptionMode { + TARGET_DEFINED = 0; // The target selects the relevant mode for each element. + ON_CHANGE = 1; // The target sends an update on element value change. 
+ SAMPLE = 2; // The target samples values according to the interval. +} + +// QOSMarking specifies the DSCP value to be set on transmitted telemetry +// updates from the target. +// Reference: gNMI Specification Section 3.5.1.2 +message QOSMarking { + uint32 marking = 1; +} + +// SetRequest is sent from a client to the target to update values in the data +// tree. Paths are either deleted by the client, or modified by means of being +// updated, or replaced. Where a replace is used, unspecified values are +// considered to be replaced, whereas when update is used the changes are +// considered to be incremental. The set of changes that are specified within +// a single SetRequest are considered to be a transaction. +// Reference: gNMI Specification Section 3.4.1 +message SetRequest { + Path prefix = 1; // Prefix used for paths in the message. + repeated Path delete = 2; // Paths to be deleted from the data tree. + repeated Update replace = 3; // Updates specifying elements to be replaced. + repeated Update update = 4; // Updates specifying elements to updated. + // Updates specifying elements to union and then replace the data tree. + // See the gNMI specification at + // https://github.com/openconfig/reference/blob/master/rpc/gnmi/gnmi-specification.md + // for details. + repeated Update union_replace = 6; + // Extension messages associated with the SetRequest. See the + // gNMI extension specification for further definition. + repeated gnmi_ext.Extension extension = 5; +} + +// SetResponse is the response to a SetRequest, sent from the target to the +// client. It reports the result of the modifications to the data tree that were +// specified by the client. Errors for this RPC should be reported using the +// https://github.com/googleapis/googleapis/blob/master/google/rpc/status.proto +// message in the RPC return. The gnmi.Error message can be used to add +// additional details where required. 
Reference: gNMI Specification +// Section 3.4.2 +message SetResponse { + Path prefix = 1; // Prefix used for paths. + // A set of responses specifying the result of the operations specified in + // the SetRequest. + repeated UpdateResult response = 2; + Error message = 3 + [deprecated = true]; // The overall status of the transaction. + int64 timestamp = 4; // Timestamp of transaction (ns since epoch). + // Extension messages associated with the SetResponse. See the + // gNMI extension specification for further definition. + repeated gnmi_ext.Extension extension = 5; +} + +// UpdateResult is used within the SetResponse message to communicate the +// result of an operation specified within a SetRequest message. +// Reference: gNMI Specification Section 3.4.2 +message UpdateResult { + // The operation that was associated with the Path specified. + enum Operation { + INVALID = 0; + DELETE = 1; // The result relates to a delete of Path. + REPLACE = 2; // The result relates to a replace of Path. + UPDATE = 3; // The result relates to an update of Path. + UNION_REPLACE = 4; // The result of a union_replace of Path or CLI origin. + } + // Deprecated timestamp for the UpdateResult, this field has been + // replaced by the timestamp within the SetResponse message, since + // all mutations effected by a set should be applied as a single + // transaction. + int64 timestamp = 1 [deprecated = true]; + Path path = 2; // Path associated with the update. + Error message = 3 [deprecated = true]; // Status of the update operation. + Operation op = 4; // Update operation type. +} + +// GetRequest is sent when a client initiates a Get RPC. It is used to specify +// the set of data elements for which the target should return a snapshot of +// data. The use_models field specifies the set of schema modules that are to +// be used by the target - where use_models is not specified then the target +// must use all schema models that it has. 
+// Reference: gNMI Specification Section 3.3.1 +message GetRequest { + Path prefix = 1; // Prefix used for paths. + repeated Path path = 2; // Paths requested by the client. + // Type of elements within the data tree. + enum DataType { + ALL = 0; // All data elements. + CONFIG = 1; // Config (rw) only elements. + STATE = 2; // State (ro) only elements. + // Data elements marked in the schema as operational. This refers to data + // elements whose value relates to the state of processes or interactions + // running on the device. + OPERATIONAL = 3; + } + DataType type = 3; // The type of data being requested. + Encoding encoding = 5; // Encoding to be used. + repeated ModelData use_models = 6; // The schema models to be used. + // Extension messages associated with the GetRequest. See the + // gNMI extension specification for further definition. + repeated gnmi_ext.Extension extension = 7; +} + +// GetResponse is used by the target to respond to a GetRequest from a client. +// The set of Notifications corresponds to the data values that are requested +// by the client in the GetRequest. +// Reference: gNMI Specification Section 3.3.2 +message GetResponse { + repeated Notification notification = 1; // Data values. + Error error = 2 [deprecated = true]; // Errors that occurred in the Get. + // Extension messages associated with the GetResponse. See the + // gNMI extension specification for further definition. + repeated gnmi_ext.Extension extension = 3; +} + +// CapabilityRequest is sent by the client in the Capabilities RPC to request +// that the target reports its capabilities. +// Reference: gNMI Specification Section 3.2.1 +message CapabilityRequest { + // Extension messages associated with the CapabilityRequest. See the + // gNMI extension specification for further definition. + repeated gnmi_ext.Extension extension = 1; +} + +// CapabilityResponse is used by the target to report its capabilities to the +// client within the Capabilities RPC. 
+// Reference: gNMI Specification Section 3.2.2 +message CapabilityResponse { + repeated ModelData supported_models = 1; // Supported schema models. + repeated Encoding supported_encodings = 2; // Supported encodings. + string gNMI_version = 3; // Supported gNMI version. + // Extension messages associated with the CapabilityResponse. See the + // gNMI extension specification for further definition. + repeated gnmi_ext.Extension extension = 4; +} + +// ModelData is used to describe a set of schema modules. It can be used in a +// CapabilityResponse where a target reports the set of modules that it +// supports, and within the SubscribeRequest and GetRequest messages to specify +// the set of models from which data tree elements should be reported. +// Reference: gNMI Specification Section 3.2.3 +message ModelData { + string name = 1; // Name of the model. + string organization = 2; // Organization publishing the model. + string version = 3; // Semantic version of the model. +} diff --git a/crates/health/proto/github.com/openconfig/gnmi/proto/gnmi_ext/gnmi_ext.proto b/crates/health/proto/github.com/openconfig/gnmi/proto/gnmi_ext/gnmi_ext.proto new file mode 100644 index 0000000000..ada5e39a5d --- /dev/null +++ b/crates/health/proto/github.com/openconfig/gnmi/proto/gnmi_ext/gnmi_ext.proto @@ -0,0 +1,161 @@ +// +// Copyright 2018 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +syntax = "proto3"; + +import "google/protobuf/duration.proto"; + +// Package gnmi_ext defines a set of extensions messages which can be optionally +// included with the request and response messages of gNMI RPCs. A set of +// well-known extensions are defined within this file, along with a registry for +// extensions defined outside of this package. +package gnmi_ext; + +option go_package = "github.com/openconfig/gnmi/proto/gnmi_ext"; + +// The Extension message contains a single gNMI extension. +message Extension { + oneof ext { + RegisteredExtension registered_ext = 1; // A registered extension. + // Well known extensions. + MasterArbitration master_arbitration = 2; // Master arbitration extension. + History history = 3; // History extension. + Commit commit = 4; // Commit confirmed extension. + Depth depth = 5; // Depth extension. + } +} + +// The RegisteredExtension message defines an extension which is defined outside +// of this file. +message RegisteredExtension { + ExtensionID id = 1; // The unique ID assigned to this extension. + bytes msg = 2; // The binary-marshalled protobuf extension payload. +} + +// RegisteredExtension is an enumeration acting as a registry for extensions +// defined by external sources. +enum ExtensionID { + EID_UNSET = 0; + // New extensions are to be defined within this enumeration - their definition + // MUST link to a reference describing their implementation. + + // An experimental extension that may be used during prototyping of a new + // extension. + EID_EXPERIMENTAL = 999; +} + +// MasterArbitration is used to select the master among multiple gNMI clients +// with the same Roles. The client with the largest election_id is honored as +// the master. 
+// The document about gNMI master arbitration can be found at +// https://github.com/openconfig/reference/blob/master/rpc/gnmi/gnmi-master-arbitration.md +message MasterArbitration { + Role role = 1; + Uint128 election_id = 2; +} + +// Representation of unsigned 128-bit integer. +message Uint128 { + uint64 high = 1; + uint64 low = 2; +} + +// There can be one master for each role. The role is identified by its id. +message Role { + string id = 1; + // More fields can be added if needed, for example, to specify what paths the + // role can read/write. +} + +// The History extension allows clients to request historical data. Its +// spec can be found at +// https://github.com/openconfig/reference/blob/master/rpc/gnmi/gnmi-history.md +message History { + oneof request { + int64 snapshot_time = 1; // Nanoseconds since the epoch + TimeRange range = 2; + } +} + +message TimeRange { + int64 start = 1; // Nanoseconds since the epoch + int64 end = 2; // Nanoseconds since the epoch +} + +// Commit confirmed extension allows automated revert of the configuration after +// certain duration if an explicit confirmation is not issued. It allows +// explicit cancellation of the commit during the rollback window. There cannot +// be more than one commit active at a given time. The document about gNMI +// commit confirmed can be found at +// https://github.com/openconfig/reference/blob/master/rpc/gnmi/gnmi-commit-confirmed.md +message Commit { + // ID is provided by the client during the commit request. During confirm and + // cancel actions the provided ID should match the ID provided during commit. + // If ID is not passed in any actions server shall return error. + // Required. + string id = 1; + oneof action { + // commit action creates a new commit. If a commit is on-going, server + // returns error. + CommitRequest commit = 2; + // confirm action will confirm an on-going commit, the ID provided during + // confirm should match the on-going commit ID. 
+ CommitConfirm confirm = 3; + // cancel action will cancel an on-going commit, the ID provided during + // cancel should match the on-going commit ID. + CommitCancel cancel = 4; + // set rollback duration action sets the rollback duration of an on-going commit + // to a new value. + // The ID provided with the Commit message should match the on-going commit ID. + CommitSetRollbackDuration set_rollback_duration = 5; + } +} + +// CommitRequest is used to create a new confirmed commit. It holds additional +// parameters required for commit action. +message CommitRequest { + // Maximum duration to wait for a confirmation before reverting the commit. + google.protobuf.Duration rollback_duration = 1; +} + +// CommitConfirm is used to confirm an on-going commit. It holds additional +// parameters required for confirm action. +message CommitConfirm {} + +// CommitCancel is used to cancel an on-going commit. It holds additional +// parameters required for cancel action. +message CommitCancel {} + +// CommitSetRollbackDuration is used to set the existing rollback duration value +// of an on-going commit to a new desired value. +message CommitSetRollbackDuration { + // Maximum duration to wait for a confirmation before reverting the commit. + google.protobuf.Duration rollback_duration = 1; +} + +// Depth allows clients to specify the depth of the subtree to be returned in +// the response. The depth is specified as the number of levels below the +// specified path. +// The depth is applied to all paths in the Get or Subscribe request. +// The document about gNMI depth can be found at +// https://github.com/openconfig/reference/tree/master/rpc/gnmi/gnmi-depth.md +message Depth { + // The level of the subtree to be returned in the response. + // Value of 0 means no depth limit and behaves the same as if the extension + // was not specified. + // Value of 1 means only the specified path and its direct children will be + // returned.
+ uint32 level = 1; +} From 57f9b0b5e5ddec4116d53aa10735d665f048673e Mon Sep 17 00:00:00 2001 From: mkoci Date: Thu, 23 Apr 2026 13:24:17 +0200 Subject: [PATCH 09/30] feat(health): add NVUE gNMI streaming collector Signed-off-by: mkoci --- Cargo.lock | 4 + crates/health/Cargo.toml | 4 + crates/health/example/config.example.toml | 14 + crates/health/src/collectors/mod.rs | 1 + .../health/src/collectors/nvue/gnmi/client.rs | 468 +++++++++ crates/health/src/collectors/nvue/gnmi/mod.rs | 32 + .../collectors/nvue/gnmi/sample_processor.rs | 898 ++++++++++++++++++ .../src/collectors/nvue/gnmi/subscriber.rs | 408 ++++++++ crates/health/src/collectors/nvue/mod.rs | 2 + crates/health/src/collectors/nvue/tls.rs | 74 ++ crates/health/src/collectors/runtime.rs | 17 + crates/health/src/config.rs | 49 + crates/health/src/discovery/context.rs | 9 + crates/health/src/discovery/spawn.rs | 31 +- crates/health/src/lib.rs | 3 + 15 files changed, 2013 insertions(+), 1 deletion(-) create mode 100644 crates/health/src/collectors/nvue/gnmi/client.rs create mode 100644 crates/health/src/collectors/nvue/gnmi/mod.rs create mode 100644 crates/health/src/collectors/nvue/gnmi/sample_processor.rs create mode 100644 crates/health/src/collectors/nvue/gnmi/subscriber.rs create mode 100644 crates/health/src/collectors/nvue/tls.rs diff --git a/Cargo.lock b/Cargo.lock index 4a3f63c631..e68d725fe6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1860,6 +1860,7 @@ dependencies = [ "humantime", "humantime-serde", "hyper", + "hyper-rustls", "hyper-util", "logfmt", "mac_address", @@ -1869,12 +1870,15 @@ dependencies = [ "prost-types", "rand 0.10.1", "reqwest 0.13.3", + "rustls", + "rustls-pki-types", "serde", "serde_json", "serde_with", "tempfile", "thiserror 2.0.18", "tokio", + "tokio-stream", "tokio-util", "tonic", "tonic-prost", diff --git a/crates/health/Cargo.toml b/crates/health/Cargo.toml index f645e06221..6ce872a46a 100644 --- a/crates/health/Cargo.toml +++ b/crates/health/Cargo.toml @@ -37,13 +37,17 @@ 
http = { workspace = true } humantime = { workspace = true } humantime-serde = { workspace = true } hyper = { workspace = true } +hyper-rustls = { workspace = true, features = ["http2"] } hyper-util = { workspace = true } mac_address = { workspace = true } prometheus = { workspace = true } reqwest = { workspace = true, features = ["query", "json"] } +rustls = { workspace = true } +rustls-pki-types = { workspace = true } serde = { features = ["derive"], workspace = true } serde_json = { workspace = true } serde_with = { workspace = true } +tokio-stream = { workspace = true } tokio-util = { workspace = true } tracing = { workspace = true } tracing-subscriber = { features = [ diff --git a/crates/health/example/config.example.toml b/crates/health/example/config.example.toml index 12067a6f3f..d57a5e1b3a 100644 --- a/crates/health/example/config.example.toml +++ b/crates/health/example/config.example.toml @@ -156,6 +156,20 @@ cluster_apps_enabled = true sdn_partitions_enabled = true interfaces_enabled = true +# NVUE gNMI streaming collector (switches only, disabled by default). +# Subscribes to gNMI SAMPLE paths and pushes metrics through the DataSink +# pipeline. PrometheusSink serves the /metrics endpoint; OtlpSink (when +# configured separately) pushes to an OTel Collector. 
+[collectors.nvue.gnmi] +gnmi_port = 9339 +sample_interval = "5m" +request_timeout = "30s" + +[collectors.nvue.gnmi.paths] +components_enabled = true +interfaces_enabled = true +leak_sensors_enabled = true + # ============================================================================== # Processors # ============================================================================== diff --git a/crates/health/src/collectors/mod.rs b/crates/health/src/collectors/mod.rs index 5b281ff687..bd1de1750b 100644 --- a/crates/health/src/collectors/mod.rs +++ b/crates/health/src/collectors/mod.rs @@ -27,6 +27,7 @@ pub use firmware::{FirmwareCollector, FirmwareCollectorConfig}; pub use leak_detector::{LeakDetectorCollector, LeakDetectorCollectorConfig}; pub use logs::{LogsCollector, LogsCollectorConfig, SseLogCollector, SseLogCollectorConfig}; pub use nmxt::{NmxtCollector, NmxtCollectorConfig}; +pub use nvue::gnmi::subscriber::spawn_gnmi_collector; pub use nvue::rest::collector::{NvueRestCollector, NvueRestCollectorConfig}; pub use runtime::{ BackoffConfig, Collector, CollectorStartContext, EventStream, ExponentialBackoff, diff --git a/crates/health/src/collectors/nvue/gnmi/client.rs b/crates/health/src/collectors/nvue/gnmi/client.rs new file mode 100644 index 0000000000..1c0537098f --- /dev/null +++ b/crates/health/src/collectors/nvue/gnmi/client.rs @@ -0,0 +1,468 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use std::time::Duration; + +use tonic::metadata::MetadataMap; +use tonic::transport::{Channel, Endpoint}; +use tonic::{Extensions, Request}; + +use super::proto::g_nmi_client::GNmiClient as TonicGnmiClient; +use super::proto::subscription_list::Mode as SubscriptionListMode; +use super::proto::{ + self, Encoding, Path, PathElem, SubscribeRequest, Subscription, SubscriptionList, + SubscriptionMode, +}; +use crate::HealthError; +use crate::config::NvueGnmiPaths; + +pub fn nvue_subscribe_paths(paths_config: &NvueGnmiPaths) -> Vec { + let mut paths = Vec::with_capacity(3); + if paths_config.components_enabled { + paths.push(Path { + elem: vec![ + PathElem { + name: "components".into(), + key: Default::default(), + }, + PathElem { + name: "component".into(), + key: Default::default(), + }, + ], + ..Default::default() + }); + } + if paths_config.interfaces_enabled { + paths.push(Path { + elem: vec![ + PathElem { + name: "interfaces".into(), + key: Default::default(), + }, + PathElem { + name: "interface".into(), + key: Default::default(), + }, + ], + ..Default::default() + }); + } + if paths_config.leak_sensors_enabled { + paths.push(Path { + elem: vec![ + PathElem { + name: "platform-general".into(), + key: Default::default(), + }, + PathElem { + name: "leak-sensors".into(), + key: Default::default(), + }, + ], + ..Default::default() + }); + } + paths +} + +#[derive(Clone)] +pub struct GnmiClient { + switch_id: String, + host: String, + port: u16, + username: Option, + password: Option, + request_timeout: Duration, +} + +impl GnmiClient { + pub fn new( + switch_id: String, + host: &str, + port: u16, + username: Option, + password: Option, + request_timeout: Duration, + ) -> Self { + Self { + switch_id, + host: host.to_string(), + port, + username, + password, + request_timeout, + } + } + + async fn connect(&self) -> Result, HealthError> { + let target = 
format!("{}:{}", self.host, self.port); + + let uri = http::Uri::builder() + .scheme("https") + .authority(target.as_str()) + .path_and_query("/") + .build() + .map_err(|e| { + HealthError::GnmiError(format!( + "switch {}: invalid endpoint URI: {e}", + self.switch_id + )) + })?; + + let endpoint = Endpoint::from(uri) + .connect_timeout(self.request_timeout) + .timeout(self.request_timeout); + + let tls_config = crate::collectors::nvue::tls::self_signed_tls_config(); + let connector = hyper_rustls::HttpsConnectorBuilder::new() + .with_tls_config(tls_config) + .https_only() + .enable_http2() + .build(); + + let channel = endpoint + .connect_with_connector(connector) + .await + .map_err(|e| { + HealthError::GnmiError(format!( + "switch {}: connection failed to {target}: {e}", + self.switch_id + )) + })?; + + tracing::debug!( + switch_id = %self.switch_id, + target = %target, + "gNMI TLS channel established (skip-verify)" + ); + + Ok(TonicGnmiClient::new(channel)) + } + + /// open a gNMI SAMPLE streaming subscription + pub async fn subscribe_sample( + &self, + paths: &[Path], + sample_interval_nanos: u64, + ) -> Result, HealthError> { + let mut client = self.connect().await?; + + let subscribe_request = build_sample_subscribe_request(paths, sample_interval_nanos); + + let auth = build_auth_metadata(&self.username, &self.password)?; + let stream = tokio_stream::once(subscribe_request); + let request = Request::from_parts(auth, Extensions::default(), stream); + + let response = client.subscribe(request).await.map_err(|e| { + HealthError::GnmiError(format!( + "switch {}: subscribe_sample RPC failed: {e}", + self.switch_id + )) + })?; + + tracing::debug!( + switch_id = %self.switch_id, + sample_interval_nanos, + "gNMI SAMPLE stream opened" + ); + + Ok(response.into_inner()) + } +} + +fn build_sample_subscribe_request(paths: &[Path], sample_interval_nanos: u64) -> SubscribeRequest { + let subscription_list = SubscriptionList { + prefix: Some(Path { + target: 
"nvos".to_string(), + ..Default::default() + }), + subscription: paths + .iter() + .map(|path| Subscription { + path: Some(path.clone()), + mode: SubscriptionMode::Sample.into(), + sample_interval: sample_interval_nanos, + ..Default::default() + }) + .collect(), + mode: SubscriptionListMode::Stream.into(), + encoding: Encoding::Json.into(), + ..Default::default() + }; + + SubscribeRequest { + request: Some(proto::subscribe_request::Request::Subscribe( + subscription_list, + )), + extension: vec![], + } +} + +fn build_auth_metadata( + username: &Option, + password: &Option, +) -> Result { + let mut meta = MetadataMap::new(); + if let Some(username) = username { + let value = username.parse().map_err(|e| { + HealthError::GnmiError(format!("invalid username for gRPC metadata: {e}")) + })?; + meta.insert("username", value); + } + if let Some(password) = password { + let value = password + .parse() + .map_err(|_e| HealthError::GnmiError("invalid password for gRPC metadata".into()))?; + meta.insert("password", value); + } + Ok(meta) +} + +/// Extract a string from a `TypedValue`, handling JSON-encoded bytes as well +/// as native string values. +pub fn typed_value_to_string(val: &proto::TypedValue) -> Option { + use proto::typed_value::Value; + match &val.value { + Some(Value::StringVal(s)) => Some(s.clone()), + Some(Value::JsonVal(bytes)) | Some(Value::JsonIetfVal(bytes)) => { + let s = String::from_utf8_lossy(bytes); + let trimmed = s.trim().trim_matches('"'); + Some(trimmed.to_string()) + } + Some(Value::AsciiVal(s)) => Some(s.clone()), + Some(Value::IntVal(v)) => Some(v.to_string()), + Some(Value::UintVal(v)) => Some(v.to_string()), + Some(Value::BoolVal(v)) => Some(v.to_string()), + Some(Value::FloatVal(v)) => Some(v.to_string()), + Some(Value::DoubleVal(v)) => Some(v.to_string()), + _ => None, + } +} + +/// Extract a float from a `TypedValue`, handling JSON-encoded bytes, native +/// numeric values, and string representations. 
+pub fn typed_value_to_f64(val: &proto::TypedValue) -> Option<f64> { + use proto::typed_value::Value; + match &val.value { + Some(Value::DoubleVal(v)) => Some(*v), + Some(Value::FloatVal(v)) => Some(*v as f64), + Some(Value::IntVal(v)) => Some(*v as f64), + Some(Value::UintVal(v)) => Some(*v as f64), + Some(Value::StringVal(s)) => s.parse().ok(), + Some(Value::JsonVal(bytes)) | Some(Value::JsonIetfVal(bytes)) => { + let s = String::from_utf8_lossy(bytes); + s.trim().trim_matches('"').parse().ok() + } + _ => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_typed_value_to_string_string_val() { + let val = proto::TypedValue { + value: Some(proto::typed_value::Value::StringVal("healthy".to_string())), + }; + assert_eq!(typed_value_to_string(&val), Some("healthy".to_string())); + } + + #[test] + fn test_typed_value_to_string_json_val() { + let val = proto::TypedValue { + value: Some(proto::typed_value::Value::JsonVal(b"\"degraded\"".to_vec())), + }; + assert_eq!(typed_value_to_string(&val), Some("degraded".to_string())); + } + + #[test] + fn test_typed_value_to_string_json_unquoted() { + let val = proto::TypedValue { + value: Some(proto::typed_value::Value::JsonVal(b"42".to_vec())), + }; + assert_eq!(typed_value_to_string(&val), Some("42".to_string())); + } + + #[test] + fn test_typed_value_to_string_int() { + let val = proto::TypedValue { + value: Some(proto::typed_value::Value::IntVal(-5)), + }; + assert_eq!(typed_value_to_string(&val), Some("-5".to_string())); + } + + #[test] + fn test_typed_value_to_string_uint() { + let val = proto::TypedValue { + value: Some(proto::typed_value::Value::UintVal(100)), + }; + assert_eq!(typed_value_to_string(&val), Some("100".to_string())); + } + + #[test] + fn test_typed_value_to_string_bool() { + let val = proto::TypedValue { + value: Some(proto::typed_value::Value::BoolVal(true)), + }; + assert_eq!(typed_value_to_string(&val), Some("true".to_string())); + } + + #[test] + fn
test_typed_value_to_string_none() { + let val = proto::TypedValue { value: None }; + assert_eq!(typed_value_to_string(&val), None); + } + + #[test] + fn test_typed_value_to_f64_double() { + let val = proto::TypedValue { + value: Some(proto::typed_value::Value::DoubleVal(42.5)), + }; + assert_eq!(typed_value_to_f64(&val), Some(42.5)); + } + + #[test] + fn test_typed_value_to_f64_int() { + let val = proto::TypedValue { + value: Some(proto::typed_value::Value::IntVal(42)), + }; + assert_eq!(typed_value_to_f64(&val), Some(42.0)); + } + + #[test] + fn test_typed_value_to_f64_json_string() { + let val = proto::TypedValue { + value: Some(proto::typed_value::Value::JsonVal(b"\"1.5e-3\"".to_vec())), + }; + assert_eq!(typed_value_to_f64(&val), Some(0.0015)); + } + + #[test] + fn test_typed_value_to_f64_json_number() { + let val = proto::TypedValue { + value: Some(proto::typed_value::Value::JsonVal(b"99.9".to_vec())), + }; + assert_eq!(typed_value_to_f64(&val), Some(99.9)); + } + + #[test] + fn test_typed_value_to_f64_string() { + let val = proto::TypedValue { + value: Some(proto::typed_value::Value::StringVal("1.23".to_string())), + }; + assert_eq!(typed_value_to_f64(&val), Some(1.23)); + } + + #[test] + fn test_typed_value_to_f64_non_numeric_string() { + let val = proto::TypedValue { + value: Some(proto::typed_value::Value::StringVal("hello".to_string())), + }; + assert_eq!(typed_value_to_f64(&val), None); + } + + #[test] + fn test_typed_value_to_f64_none() { + let val = proto::TypedValue { value: None }; + assert_eq!(typed_value_to_f64(&val), None); + } + + #[test] + fn test_nvue_subscribe_paths_all_enabled() { + let paths = nvue_subscribe_paths(&NvueGnmiPaths::default()); + assert_eq!(paths.len(), 3); + + assert_eq!(paths[0].elem.len(), 2); + assert_eq!(paths[0].elem[0].name, "components"); + assert_eq!(paths[0].elem[1].name, "component"); + + assert_eq!(paths[1].elem.len(), 2); + assert_eq!(paths[1].elem[0].name, "interfaces"); + assert_eq!(paths[1].elem[1].name, 
"interface"); + + assert_eq!(paths[2].elem.len(), 2); + assert_eq!(paths[2].elem[0].name, "platform-general"); + assert_eq!(paths[2].elem[1].name, "leak-sensors"); + } + + #[test] + fn test_nvue_subscribe_paths_selective() { + let paths = nvue_subscribe_paths(&NvueGnmiPaths { + components_enabled: false, + interfaces_enabled: true, + leak_sensors_enabled: false, + }); + assert_eq!(paths.len(), 1); + assert_eq!(paths[0].elem.len(), 2); + assert_eq!(paths[0].elem[0].name, "interfaces"); + assert_eq!(paths[0].elem[1].name, "interface"); + } + + #[test] + fn test_nvue_subscribe_paths_none_enabled() { + let paths = nvue_subscribe_paths(&NvueGnmiPaths { + components_enabled: false, + interfaces_enabled: false, + leak_sensors_enabled: false, + }); + assert!(paths.is_empty()); + } + + #[test] + fn test_build_sample_subscribe_request() { + let paths = nvue_subscribe_paths(&NvueGnmiPaths::default()); + let interval_nanos = 300_000_000_000u64; + + let req = build_sample_subscribe_request(&paths, interval_nanos); + + let sub_list = match req.request { + Some(proto::subscribe_request::Request::Subscribe(sl)) => sl, + _ => panic!("expected Subscribe variant"), + }; + + assert_eq!( + sub_list.mode, + i32::from(SubscriptionListMode::Stream), + "must use Stream mode for SAMPLE subscriptions" + ); + assert_eq!( + sub_list.encoding, + i32::from(Encoding::Json), + "encoding must be JSON" + ); + + let prefix = sub_list.prefix.expect("prefix must be set"); + assert_eq!(prefix.target, "nvos", "target must be nvos"); + + assert_eq!(sub_list.subscription.len(), 3); + for sub in &sub_list.subscription { + assert_eq!( + sub.mode, + i32::from(SubscriptionMode::Sample), + "each subscription must use Sample mode" + ); + assert_eq!( + sub.sample_interval, interval_nanos, + "sample_interval must match the requested interval" + ); + assert!(sub.path.is_some(), "each subscription must have a path"); + } + } +} diff --git a/crates/health/src/collectors/nvue/gnmi/mod.rs 
b/crates/health/src/collectors/nvue/gnmi/mod.rs new file mode 100644 index 0000000000..7a7b5b7338 --- /dev/null +++ b/crates/health/src/collectors/nvue/gnmi/mod.rs @@ -0,0 +1,32 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +pub(crate) mod client; +pub(crate) mod sample_processor; +pub(crate) mod subscriber; + +// prost generates ExtensionId::EidUnset / EidExperimental from gnmi_ext.proto, +// where the proto convention prefixes every value with the enum abbreviation. +// clippy flags the shared "Eid" prefix but we can't control generated code. +#[allow(clippy::enum_variant_names)] +pub mod proto { + #[allow(clippy::enum_variant_names)] + pub mod gnmi_ext { + tonic::include_proto!("gnmi_ext"); + } + tonic::include_proto!("gnmi"); +} diff --git a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs new file mode 100644 index 0000000000..5d54f3a7c0 --- /dev/null +++ b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs @@ -0,0 +1,898 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use std::borrow::Cow; +use std::collections::HashSet; +use std::sync::Arc; +use std::time::Instant; + +use super::client::{typed_value_to_f64, typed_value_to_string}; +use super::proto::{self, PathElem}; +use super::subscriber::GnmiStreamMetrics; +use crate::sink::{CollectorEvent, DataSink, EventContext, SensorHealthData}; + +pub(crate) const NVUE_GNMI_SAMPLE_STREAM_ID: &str = "nvue_gnmi"; + +/// process NVUE gNMI SAMPLE notifications and emit them as `CollectorEvent::Metric` +pub(crate) struct GnmiSampleProcessor { + pub(crate) data_sink: Option>, + pub(crate) event_context: EventContext, + pub(crate) switch_id: String, +} + +impl GnmiSampleProcessor { + pub(crate) fn process_subscribe_response( + &self, + resp: &proto::SubscribeResponse, + stream_metrics: &GnmiStreamMetrics, + ) { + let notification = match &resp.response { + Some(proto::subscribe_response::Response::Update(n)) => n, + Some(proto::subscribe_response::Response::SyncResponse(_)) => return, + Some(proto::subscribe_response::Response::Error(e)) => { + stream_metrics.stream_errors_total.inc(); + tracing::warn!( + code = e.code, + message = %e.message, + "nvue_gnmi SAMPLE: server error in stream" + ); + return; + } + None => return, + }; + + stream_metrics.notifications_received_total.inc(); + stream_metrics + .last_notification_timestamp + .set(now_unix_secs()); + + let start = Instant::now(); + let entity_count = self.process_notification(notification); + stream_metrics + .notification_processing_seconds + .observe(start.elapsed().as_secs_f64()); + 
stream_metrics.monitored_entities.set(entity_count as f64); + } + + fn process_notification(&self, notification: &proto::Notification) -> usize { + let prefix_elems: &[PathElem] = notification + .prefix + .as_ref() + .map(|p| p.elem.as_slice()) + .unwrap_or_default(); + + let mut entities: HashSet<(&str, &str)> = HashSet::new(); + + for update in &notification.update { + let val = match update.val.as_ref() { + Some(v) => v, + None => continue, + }; + + let update_elems: &[PathElem] = update + .path + .as_ref() + .map(|p| p.elem.as_slice()) + .unwrap_or_default(); + + let combined: Vec<&PathElem> = prefix_elems.iter().chain(update_elems.iter()).collect(); + + if let Some(iface) = find_elem_key_ref(&combined, "interface", "name") { + entities.insert(("interface", iface)); + self.process_interface_metric(&combined, iface, val); + } else if let Some(comp) = find_elem_key_ref(&combined, "component", "name") { + entities.insert(("component", comp)); + self.process_component_metric(&combined, comp, val); + } else if let Some(sensor_id) = find_elem_key_ref(&combined, "leak-sensor", "id") + && leaf_matches(&combined, &["state", "state"]) + { + entities.insert(("sensor", sensor_id)); + self.process_leak_sensor_metric(val, sensor_id); + } + } + + entities.len() + } + + fn process_interface_metric( + &self, + elems: &[&PathElem], + iface_name: &str, + val: &proto::TypedValue, + ) { + if leaf_matches(elems, &["state", "oper-status"]) { + let v = oper_status_to_f64(typed_value_to_string(val).as_deref()); + self.emit_data_metric( + "interface_oper_status", + iface_name, + v, + "state", + "interface_name", + iface_name, + ); + } else if leaf_matches(elems, &["state", "counters", "in-errors"]) + && let Some(v) = typed_value_to_f64(val) + { + self.emit_data_metric( + "interface_in_errors", + iface_name, + v, + "count", + "interface_name", + iface_name, + ); + } else if leaf_matches(elems, &["state", "counters", "out-errors"]) + && let Some(v) = typed_value_to_f64(val) + { +
self.emit_data_metric( + "interface_out_errors", + iface_name, + v, + "count", + "interface_name", + iface_name, + ); + } else if leaf_matches(elems, &["phy-diag", "state", "effective-ber"]) + && let Some(v) = typed_value_to_f64(val) + { + self.emit_data_metric( + "interface_effective_ber", + iface_name, + v, + "ratio", + "interface_name", + iface_name, + ); + } else if leaf_matches(elems, &["phy-diag", "state", "symbol-ber"]) + && let Some(v) = typed_value_to_f64(val) + { + self.emit_data_metric( + "interface_symbol_ber", + iface_name, + v, + "ratio", + "interface_name", + iface_name, + ); + } else if leaf_matches( + elems, + &["phy-diag", "state", "unintentional-link-down-events"], + ) && let Some(v) = typed_value_to_f64(val) + { + self.emit_data_metric( + "interface_link_down_events", + iface_name, + v, + "count", + "interface_name", + iface_name, + ); + } + } + + fn process_component_metric( + &self, + elems: &[&PathElem], + comp_name: &str, + val: &proto::TypedValue, + ) { + if leaf_matches(elems, &["healthz", "state", "status"]) { + let v = component_health_to_f64(typed_value_to_string(val).as_deref()); + self.emit_data_metric( + "component_health_status", + comp_name, + v, + "state", + "component_name", + comp_name, + ); + } else if leaf_matches(elems, &["state", "temperature", "instant"]) + && let Some(v) = typed_value_to_f64(val) + { + self.emit_data_metric( + "component_temperature_celsius", + comp_name, + v, + "celsius", + "component_name", + comp_name, + ); + } + } + + fn process_leak_sensor_metric(&self, val: &proto::TypedValue, sensor_id: &str) { + let v = leak_sensor_to_f64(typed_value_to_string(val).as_deref()); + self.emit_data_metric( + "leak_sensor_state", + sensor_id, + v, + "state", + "sensor_id", + sensor_id, + ); + } + + fn emit_data_metric( + &self, + metric_type: &str, + entity_id: &str, + value: f64, + unit: &str, + entity_label_name: &'static str, + entity_label_value: &str, + ) { + let Some(sink) = &self.data_sink else { return }; + + 
let mut key = String::with_capacity(metric_type.len() + 1 + entity_id.len()); + key.push_str(metric_type); + key.push(':'); + key.push_str(entity_id); + + // only the domain-specific entity label; endpoint identity (ip, mac, + // serial_number, collector_type) is added by PrometheusSink from EventContext + let labels = vec![( + Cow::Borrowed(entity_label_name), + entity_label_value.to_string(), + )]; + + sink.handle_event( + &self.event_context, + &CollectorEvent::Metric(Box::new(SensorHealthData { + key, + name: NVUE_GNMI_SAMPLE_STREAM_ID.to_string(), + metric_type: metric_type.to_string(), + unit: unit.to_string(), + value, + labels, + context: None, + })), + ); + } +} + +fn find_elem_key_ref<'a>( + elems: &[&'a PathElem], + elem_name: &str, + key_name: &str, +) -> Option<&'a str> { + elems + .iter() + .find(|e| e.name == elem_name) + .and_then(|e| e.key.get(key_name).map(String::as_str)) +} + +fn leaf_matches(elems: &[&PathElem], expected: &[&str]) -> bool { + if elems.len() < expected.len() { + return false; + } + let start = elems.len() - expected.len(); + elems[start..] 
+ .iter() + .zip(expected) + .all(|(elem, name)| elem.name == *name) +} + +fn oper_status_to_f64(status: Option<&str>) -> f64 { + match status { + Some(s) if s.eq_ignore_ascii_case("up") => 1.0, + _ => 0.0, + } +} + +fn component_health_to_f64(status: Option<&str>) -> f64 { + match status { + Some(s) if s.eq_ignore_ascii_case("healthy") => 1.0, + Some(s) if s.eq_ignore_ascii_case("unhealthy") => 2.0, + _ => 0.0, + } +} + +// /platform-general/leak-sensors/leak-sensor[id=X]/state/state +// NVOS values from nvidia-platform-general-ext LeakSensors type: +// "OK" -> 0.0 (no leak) +// "LEAK" -> 1.0 (leak detected) +// "UNSET" -> 0.0 (default / unmapped internal value) +fn leak_sensor_to_f64(status: Option<&str>) -> f64 { + match status { + Some(s) if s.eq_ignore_ascii_case("LEAK") => 1.0, + _ => 0.0, + } +} + +pub(crate) fn now_unix_secs() -> f64 { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_secs_f64()) + .unwrap_or(0.0) +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use super::*; + + #[test] + fn test_leaf_matches() { + let elems: Vec = ["interfaces", "interface", "state", "oper-status"] + .iter() + .map(|n| PathElem { + name: n.to_string(), + key: Default::default(), + }) + .collect(); + let refs: Vec<&PathElem> = elems.iter().collect(); + + assert!(leaf_matches(&refs, &["state", "oper-status"])); + assert!(leaf_matches(&refs, &["oper-status"])); + assert!(!leaf_matches(&refs, &["counters", "oper-status"])); + assert!(!leaf_matches(&refs, &["a", "b", "c", "d", "e"])); + } + + #[test] + fn test_find_elem_key_ref() { + let mut key_map = HashMap::new(); + key_map.insert("name".to_string(), "nvl0".to_string()); + let elems = [ + PathElem { + name: "interfaces".to_string(), + key: Default::default(), + }, + PathElem { + name: "interface".to_string(), + key: key_map, + }, + ]; + let refs: Vec<&PathElem> = elems.iter().collect(); + + assert_eq!(find_elem_key_ref(&refs, "interface", "name"), Some("nvl0")); + 
assert_eq!(find_elem_key_ref(&refs, "interface", "id"), None); + assert_eq!(find_elem_key_ref(&refs, "component", "name"), None); + } + + #[test] + fn test_oper_status_mapping() { + assert_eq!(oper_status_to_f64(Some("UP")), 1.0); + assert_eq!(oper_status_to_f64(Some("up")), 1.0); + assert_eq!(oper_status_to_f64(Some("DOWN")), 0.0); + assert_eq!(oper_status_to_f64(None), 0.0); + } + + #[test] + fn test_component_health_mapping() { + assert_eq!(component_health_to_f64(Some("healthy")), 1.0); + assert_eq!(component_health_to_f64(Some("HEALTHY")), 1.0); + assert_eq!(component_health_to_f64(Some("unhealthy")), 2.0); + assert_eq!(component_health_to_f64(None), 0.0); + } + + #[test] + fn test_leak_sensor_mapping() { + assert_eq!(leak_sensor_to_f64(Some("OK")), 0.0); + assert_eq!(leak_sensor_to_f64(Some("ok")), 0.0); + assert_eq!(leak_sensor_to_f64(Some("LEAK")), 1.0); + assert_eq!(leak_sensor_to_f64(Some("leak")), 1.0); + assert_eq!(leak_sensor_to_f64(Some("Leak")), 1.0); + assert_eq!(leak_sensor_to_f64(Some("UNSET")), 0.0); + assert_eq!(leak_sensor_to_f64(None), 0.0); + } + + fn make_path_elem(name: &str, keys: &[(&str, &str)]) -> PathElem { + PathElem { + name: name.to_string(), + key: keys + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(), + } + } + + fn make_typed_value_string(s: &str) -> proto::TypedValue { + proto::TypedValue { + value: Some(proto::typed_value::Value::StringVal(s.to_string())), + } + } + + fn make_typed_value_uint(v: u64) -> proto::TypedValue { + proto::TypedValue { + value: Some(proto::typed_value::Value::UintVal(v)), + } + } + + fn test_processor() -> GnmiSampleProcessor { + use std::str::FromStr; + + use mac_address::MacAddress; + + use crate::endpoint::BmcAddr; + + let addr = BmcAddr { + ip: "10.0.0.1".parse().unwrap(), + port: None, + mac: MacAddress::from_str("AA:BB:CC:DD:EE:FF").unwrap(), + }; + let event_context = EventContext { + endpoint_key: "aa:bb:cc:dd:ee:ff".to_string(), + addr, + collector_type: 
NVUE_GNMI_SAMPLE_STREAM_ID, + metadata: None, + rack_id: None, + }; + GnmiSampleProcessor { + data_sink: None, + event_context, + switch_id: "serial-abc".to_string(), + } + } + + #[test] + fn test_process_notification_interface_oper_status() { + let proc = test_processor(); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "nvl4")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("state", &[]), + make_path_elem("oper-status", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("UP")), + ..Default::default() + }], + ..Default::default() + }; + + let count = proc.process_notification(¬ification); + assert_eq!(count, 1); + } + + #[test] + fn test_process_notification_component_temperature() { + let proc = test_processor(); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("components", &[]), + make_path_elem("component", &[("name", "PSU-1")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("state", &[]), + make_path_elem("temperature", &[]), + make_path_elem("instant", &[]), + ], + ..Default::default() + }), + val: Some(proto::TypedValue { + value: Some(proto::typed_value::Value::DoubleVal(42.5)), + }), + ..Default::default() + }], + ..Default::default() + }; + + let count = proc.process_notification(¬ification); + assert_eq!(count, 1); + } + + #[test] + fn test_process_notification_multiple_updates() { + let proc = test_processor(); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "nvl0")]), + ], + ..Default::default() + }), + update: vec![ + proto::Update { + path: 
Some(proto::Path { + elem: vec![ + make_path_elem("state", &[]), + make_path_elem("oper-status", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("UP")), + ..Default::default() + }, + proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("state", &[]), + make_path_elem("counters", &[]), + make_path_elem("in-errors", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_uint(42)), + ..Default::default() + }, + ], + ..Default::default() + }; + + // same interface, so entity count is 1 even with multiple updates + let count = proc.process_notification(¬ification); + assert_eq!(count, 1); + } + + #[test] + fn test_process_notification_mixed_entities() { + let proc = test_processor(); + + let iface_update = proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "nvl0")]), + make_path_elem("state", &[]), + make_path_elem("oper-status", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("DOWN")), + ..Default::default() + }; + + let comp_update = proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("components", &[]), + make_path_elem("component", &[("name", "FAN-1")]), + make_path_elem("healthz", &[]), + make_path_elem("state", &[]), + make_path_elem("status", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("healthy")), + ..Default::default() + }; + + let notification = proto::Notification { + timestamp: 0, + prefix: None, + update: vec![iface_update, comp_update], + ..Default::default() + }; + + let count = proc.process_notification(¬ification); + assert_eq!(count, 2); + } + + #[test] + fn test_process_notification_leak_sensor() { + let proc = test_processor(); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("platform-general", &[]), + make_path_elem("leak-sensors", &[]), + 
make_path_elem("leak-sensor", &[("id", "1")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![make_path_elem("state", &[]), make_path_elem("state", &[])], + ..Default::default() + }), + val: Some(make_typed_value_string("LEAK")), + ..Default::default() + }], + ..Default::default() + }; + + let count = proc.process_notification(¬ification); + assert_eq!(count, 1); + } + + #[test] + fn test_process_notification_leak_sensor_ok() { + let proc = test_processor(); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("platform-general", &[]), + make_path_elem("leak-sensors", &[]), + make_path_elem("leak-sensor", &[("id", "2")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![make_path_elem("state", &[]), make_path_elem("state", &[])], + ..Default::default() + }), + val: Some(make_typed_value_string("OK")), + ..Default::default() + }], + ..Default::default() + }; + + let count = proc.process_notification(¬ification); + assert_eq!(count, 1); + } + + #[test] + fn test_process_notification_update_without_val_is_skipped() { + let proc = test_processor(); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "nvl0")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("state", &[]), + make_path_elem("oper-status", &[]), + ], + ..Default::default() + }), + val: None, + ..Default::default() + }], + ..Default::default() + }; + + let count = proc.process_notification(¬ification); + assert_eq!(count, 0); + } + + #[test] + fn test_process_notification_effective_ber() { + let proc = test_processor(); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + 
make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "nvl1")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("phy-diag", &[]), + make_path_elem("state", &[]), + make_path_elem("effective-ber", &[]), + ], + ..Default::default() + }), + val: Some(proto::TypedValue { + value: Some(proto::typed_value::Value::DoubleVal(1.5e-12)), + }), + ..Default::default() + }], + ..Default::default() + }; + + let count = proc.process_notification(¬ification); + assert_eq!(count, 1); + } + + #[test] + fn test_process_notification_symbol_ber_and_link_down_events() { + let proc = test_processor(); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "nvl2")]), + ], + ..Default::default() + }), + update: vec![ + proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("phy-diag", &[]), + make_path_elem("state", &[]), + make_path_elem("symbol-ber", &[]), + ], + ..Default::default() + }), + val: Some(proto::TypedValue { + value: Some(proto::typed_value::Value::DoubleVal(3.2e-10)), + }), + ..Default::default() + }, + proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("phy-diag", &[]), + make_path_elem("state", &[]), + make_path_elem("unintentional-link-down-events", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_uint(7)), + ..Default::default() + }, + ], + ..Default::default() + }; + + let count = proc.process_notification(¬ification); + assert_eq!(count, 1); + } + + #[test] + fn test_process_notification_out_errors() { + let proc = test_processor(); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "nvl3")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + 
path: Some(proto::Path { + elem: vec![ + make_path_elem("state", &[]), + make_path_elem("counters", &[]), + make_path_elem("out-errors", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_uint(99)), + ..Default::default() + }], + ..Default::default() + }; + + let count = proc.process_notification(¬ification); + assert_eq!(count, 1); + } + + fn test_stream_metrics() -> super::super::subscriber::GnmiStreamMetrics { + use prometheus::{Counter, Gauge, Histogram, HistogramOpts, IntGauge}; + super::super::subscriber::GnmiStreamMetrics { + connection_state: IntGauge::new("test_conn_state", "test").unwrap(), + connected: IntGauge::new("test_connected", "test").unwrap(), + reconnections_total: Counter::new("test_reconn", "test").unwrap(), + server_initiated_closures_total: Counter::new("test_closures", "test").unwrap(), + connection_established_timestamp: Gauge::new("test_conn_ts", "test").unwrap(), + notifications_received_total: Counter::new("test_notif_total", "test").unwrap(), + last_notification_timestamp: Gauge::new("test_last_notif_ts", "test").unwrap(), + notification_processing_seconds: Histogram::with_opts(HistogramOpts::new( + "test_proc_secs", + "test", + )) + .unwrap(), + stream_errors_total: Counter::new("test_errors", "test").unwrap(), + monitored_entities: Gauge::new("test_entities", "test").unwrap(), + } + } + + #[test] + fn test_process_subscribe_response_sync_response_is_noop() { + let proc = test_processor(); + let metrics = test_stream_metrics(); + let resp = proto::SubscribeResponse { + response: Some(proto::subscribe_response::Response::SyncResponse(true)), + ..Default::default() + }; + + proc.process_subscribe_response(&resp, &metrics); + + assert_eq!(metrics.notifications_received_total.get(), 0.0); + assert_eq!(metrics.stream_errors_total.get(), 0.0); + } + + #[test] + fn test_process_subscribe_response_error_increments_counter() { + let proc = test_processor(); + let metrics = test_stream_metrics(); + let resp = 
proto::SubscribeResponse { + response: Some(proto::subscribe_response::Response::Error(proto::Error { + code: 13, + message: "internal server error".into(), + ..Default::default() + })), + ..Default::default() + }; + + proc.process_subscribe_response(&resp, &metrics); + + assert_eq!(metrics.stream_errors_total.get(), 1.0); + assert_eq!(metrics.notifications_received_total.get(), 0.0); + } + + #[test] + fn test_process_subscribe_response_none_is_noop() { + let proc = test_processor(); + let metrics = test_stream_metrics(); + let resp = proto::SubscribeResponse { + response: None, + ..Default::default() + }; + + proc.process_subscribe_response(&resp, &metrics); + + assert_eq!(metrics.notifications_received_total.get(), 0.0); + assert_eq!(metrics.stream_errors_total.get(), 0.0); + } + + #[test] + fn test_process_subscribe_response_update_increments_notification_counter() { + let proc = test_processor(); + let metrics = test_stream_metrics(); + let resp = proto::SubscribeResponse { + response: Some(proto::subscribe_response::Response::Update( + proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "nvl0")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("state", &[]), + make_path_elem("oper-status", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("UP")), + ..Default::default() + }], + ..Default::default() + }, + )), + ..Default::default() + }; + + proc.process_subscribe_response(&resp, &metrics); + + assert_eq!(metrics.notifications_received_total.get(), 1.0); + assert_eq!(metrics.monitored_entities.get(), 1.0); + assert_eq!(metrics.stream_errors_total.get(), 0.0); + } +} diff --git a/crates/health/src/collectors/nvue/gnmi/subscriber.rs b/crates/health/src/collectors/nvue/gnmi/subscriber.rs new file mode 100644 index 0000000000..1b1fa36778 --- /dev/null +++ 
b/crates/health/src/collectors/nvue/gnmi/subscriber.rs @@ -0,0 +1,408 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use std::collections::HashMap; +use std::sync::Arc; +use std::time::Duration; + +use prometheus::{Counter, Gauge, Histogram, HistogramOpts, IntGauge, Opts}; +use tokio_util::sync::CancellationToken; + +use super::client::{GnmiClient, nvue_subscribe_paths}; +use super::proto; +use super::sample_processor::{GnmiSampleProcessor, NVUE_GNMI_SAMPLE_STREAM_ID, now_unix_secs}; +use crate::HealthError; +use crate::collectors::Collector; +use crate::collectors::runtime::{BackoffConfig, ExponentialBackoff, StreamingConnectionGuard}; +use crate::config::NvueGnmiConfig; +use crate::endpoint::BmcEndpoint; +use crate::metrics::CollectorRegistry; +use crate::sink::{DataSink, EventContext}; + +// gRPC ConnectivityState values for `connection_state`. 0 (UNKNOWN) is the gauge default. +const IDLE: i64 = 1; +const CONNECTING: i64 = 2; +const READY: i64 = 3; +const TRANSIENT_FAILURE: i64 = 4; +const SHUTDOWN: i64 = 5; + +pub(crate) struct GnmiStreamMetrics { + pub(crate) connection_state: IntGauge, + /// binary "is this stream live right now?" 
-- guard-managed, mirrors SSE's `connected` gauge + pub(crate) connected: IntGauge, + pub(crate) reconnections_total: Counter, + pub(crate) server_initiated_closures_total: Counter, + pub(crate) connection_established_timestamp: Gauge, + pub(crate) notifications_received_total: Counter, + pub(crate) last_notification_timestamp: Gauge, + pub(crate) notification_processing_seconds: Histogram, + pub(crate) stream_errors_total: Counter, + pub(crate) monitored_entities: Gauge, +} + +impl GnmiStreamMetrics { + fn new( + registry: &prometheus::Registry, + prefix: &str, + stream_name: &str, + const_labels: HashMap, + ) -> Result { + let connection_state = IntGauge::with_opts( + Opts::new( + format!("{prefix}_nvue_gnmi{stream_name}_connection_state"), + "gRPC connection state: 0=UNKNOWN, 1=IDLE, 2=CONNECTING, 3=READY, 4=TRANSIENT_FAILURE, 5=SHUTDOWN", + ) + .const_labels(const_labels.clone()), + )?; + registry.register(Box::new(connection_state.clone()))?; + + let connected = IntGauge::with_opts( + Opts::new( + format!("{prefix}_nvue_gnmi{stream_name}_stream_connected"), + "1 while the stream is connected (READY), 0 otherwise. 
Mirrors the SSE collector's stream_connected gauge for aggregate streaming dashboards.", + ) + .const_labels(const_labels.clone()), + )?; + registry.register(Box::new(connected.clone()))?; + + let reconnections_total = Counter::with_opts( + Opts::new( + format!("{prefix}_nvue_gnmi{stream_name}_reconnections_total"), + "Total reconnection attempts", + ) + .const_labels(const_labels.clone()), + )?; + registry.register(Box::new(reconnections_total.clone()))?; + + let server_initiated_closures_total = Counter::with_opts( + Opts::new( + format!("{prefix}_nvue_gnmi{stream_name}_server_initiated_closures_total"), + "Total times the server closed the stream cleanly", + ) + .const_labels(const_labels.clone()), + )?; + registry.register(Box::new(server_initiated_closures_total.clone()))?; + + let connection_established_timestamp = Gauge::with_opts( + Opts::new( + format!("{prefix}_nvue_gnmi{stream_name}_connection_established_timestamp"), + "Unix timestamp when current connection was established. 
Compute uptime via time() - this_metric.", + ) + .const_labels(const_labels.clone()), + )?; + registry.register(Box::new(connection_established_timestamp.clone()))?; + + let notifications_received_total = Counter::with_opts( + Opts::new( + format!("{prefix}_nvue_gnmi{stream_name}_notifications_received_total"), + "Total notification messages received", + ) + .const_labels(const_labels.clone()), + )?; + registry.register(Box::new(notifications_received_total.clone()))?; + + let last_notification_timestamp = Gauge::with_opts( + Opts::new( + format!("{prefix}_nvue_gnmi{stream_name}_last_notification_timestamp"), + "Unix timestamp of most recent notification", + ) + .const_labels(const_labels.clone()), + )?; + registry.register(Box::new(last_notification_timestamp.clone()))?; + + let notification_processing_seconds = Histogram::with_opts( + HistogramOpts::new( + format!("{prefix}_nvue_gnmi{stream_name}_notification_processing_seconds"), + "Per-notification processing time", + ) + .const_labels(const_labels.clone()) + .buckets(vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0]), + )?; + registry.register(Box::new(notification_processing_seconds.clone()))?; + + let stream_errors_total = Counter::with_opts( + Opts::new( + format!("{prefix}_nvue_gnmi{stream_name}_stream_errors_total"), + "Total stream errors", + ) + .const_labels(const_labels.clone()), + )?; + registry.register(Box::new(stream_errors_total.clone()))?; + + let monitored_entities = Gauge::with_opts( + Opts::new( + format!("{prefix}_nvue_gnmi{stream_name}_monitored_entities"), + "Unique entities in most recent notification batch", + ) + .const_labels(const_labels), + )?; + registry.register(Box::new(monitored_entities.clone()))?; + + Ok(Self { + connection_state, + connected, + reconnections_total, + server_initiated_closures_total, + connection_established_timestamp, + notifications_received_total, + last_notification_timestamp, + notification_processing_seconds, + stream_errors_total, + monitored_entities, + }) 
+ } +} + +struct GnmiStreamConfig { + client: GnmiClient, + paths: Vec, + sample_interval_nanos: u64, +} + +pub fn spawn_gnmi_collector( + endpoint: &BmcEndpoint, + gnmi_config: &NvueGnmiConfig, + collector_registry: Arc, + data_sink: Option>, +) -> Result { + let switch_id = endpoint + .metadata + .as_ref() + .and_then(|m| m.serial_number().map(str::to_string)) + .unwrap_or_else(|| endpoint.addr.mac.to_string()); + let switch_ip = endpoint.addr.ip.to_string(); + let sample_event_context = EventContext::from_endpoint(endpoint, NVUE_GNMI_SAMPLE_STREAM_ID); + + let (username, password) = match endpoint.credentials() { + crate::endpoint::BmcCredentials::UsernamePassword { username, password } => { + (Some(username), password) + } + crate::endpoint::BmcCredentials::SessionToken { .. } => { + return Err(HealthError::GnmiError( + "gNMI collector does not support SessionToken credentials; expected UsernamePassword" + .into(), + )); + } + }; + let client = GnmiClient::new( + switch_id.clone(), + &switch_ip, + gnmi_config.gnmi_port, + username, + password, + gnmi_config.request_timeout, + ); + + let registry = collector_registry.registry(); + let prefix = collector_registry.prefix().clone(); + + let sample_const_labels = HashMap::from([ + ( + "collector_type".to_string(), + NVUE_GNMI_SAMPLE_STREAM_ID.to_string(), + ), + ("endpoint_key".to_string(), endpoint.hash_key().into_owned()), + ]); + + let sample_stream_metrics = GnmiStreamMetrics::new(registry, &prefix, "", sample_const_labels)?; + + let sample_config = GnmiStreamConfig { + client, + paths: nvue_subscribe_paths(&gnmi_config.paths), + sample_interval_nanos: gnmi_config.sample_interval.as_nanos() as u64, + }; + + let sample_processor = GnmiSampleProcessor { + data_sink, + event_context: sample_event_context, + switch_id, + }; + + Ok(Collector::spawn_task(move |cancel_token| async move { + gnmi_sample_task( + cancel_token, + sample_config, + sample_stream_metrics, + sample_processor, + ) + .await; + })) +} + +async fn 
gnmi_sample_task( + cancel_token: CancellationToken, + config: GnmiStreamConfig, + stream_metrics: GnmiStreamMetrics, + sample_processor: GnmiSampleProcessor, +) { + let mut backoff = ExponentialBackoff::new(&BackoffConfig { + initial: Duration::from_secs(2), + max: Duration::from_secs(60), + }); + + loop { + stream_metrics.connection_state.set(CONNECTING); + + let Some(stream) = cancel_token + .run_until_cancelled( + config + .client + .subscribe_sample(&config.paths, config.sample_interval_nanos), + ) + .await + else { + stream_metrics.connection_state.set(SHUTDOWN); + return; + }; + + match stream { + Err(e) => { + stream_metrics.connection_state.set(TRANSIENT_FAILURE); + stream_metrics.reconnections_total.inc(); + tracing::warn!( + error = ?e, + switch_id = %sample_processor.switch_id, + "nvue_gnmi SAMPLE: connection failed, backing off" + ); + } + Ok(mut stream) => { + stream_metrics.connection_state.set(READY); + stream_metrics + .connection_established_timestamp + .set(now_unix_secs()); + let _conn_guard = StreamingConnectionGuard::inc(stream_metrics.connected.clone()); + backoff.reset(); + tracing::info!( + switch_id = %sample_processor.switch_id, + "nvue_gnmi SAMPLE: stream connected" + ); + + loop { + let Some(msg) = cancel_token.run_until_cancelled(stream.message()).await else { + stream_metrics.connection_state.set(SHUTDOWN); + tracing::info!( + switch_id = %sample_processor.switch_id, + "nvue_gnmi SAMPLE: cancelled, shutting down" + ); + return; + }; + + match msg { + Ok(Some(resp)) => { + sample_processor.process_subscribe_response(&resp, &stream_metrics); + } + Ok(None) => { + stream_metrics.connection_state.set(IDLE); + stream_metrics.server_initiated_closures_total.inc(); + tracing::info!( + switch_id = %sample_processor.switch_id, + "nvue_gnmi SAMPLE: stream closed by server, reconnecting" + ); + backoff.reset(); + break; + } + Err(e) => { + stream_metrics.connection_state.set(TRANSIENT_FAILURE); + stream_metrics.stream_errors_total.inc(); + 
stream_metrics.reconnections_total.inc(); + tracing::warn!( + error = ?e, + switch_id = %sample_processor.switch_id, + "nvue_gnmi SAMPLE: stream error, reconnecting" + ); + break; + } + } + } + } + } + + if cancel_token + .run_until_cancelled(tokio::time::sleep(backoff.next_delay())) + .await + .is_none() + { + stream_metrics.connection_state.set(SHUTDOWN); + return; + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn test_labels() -> HashMap { + HashMap::from([ + ("switch_id".to_string(), "test-switch".to_string()), + ("switch_ip".to_string(), "10.0.0.1".to_string()), + ]) + } + + #[test] + fn test_stream_metrics_registers_all_counters() { + let registry = prometheus::Registry::new(); + let metrics = GnmiStreamMetrics::new(®istry, "test", "", test_labels()).unwrap(); + + metrics.reconnections_total.inc(); + assert_eq!(metrics.reconnections_total.get(), 1.0); + + metrics.server_initiated_closures_total.inc(); + assert_eq!(metrics.server_initiated_closures_total.get(), 1.0); + + metrics.stream_errors_total.inc(); + assert_eq!(metrics.stream_errors_total.get(), 1.0); + } + + #[test] + fn test_stream_metrics_server_closures_independent_from_reconnections() { + let registry = prometheus::Registry::new(); + let metrics = GnmiStreamMetrics::new(®istry, "test", "", test_labels()).unwrap(); + + metrics.server_initiated_closures_total.inc(); + metrics.server_initiated_closures_total.inc(); + assert_eq!(metrics.server_initiated_closures_total.get(), 2.0); + assert_eq!(metrics.reconnections_total.get(), 0.0); + + metrics.reconnections_total.inc(); + assert_eq!(metrics.reconnections_total.get(), 1.0); + assert_eq!(metrics.server_initiated_closures_total.get(), 2.0); + } + + #[test] + fn test_stream_metrics_duplicate_registration_fails() { + let registry = prometheus::Registry::new(); + let _ = GnmiStreamMetrics::new(®istry, "test", "", test_labels()).unwrap(); + let result = GnmiStreamMetrics::new(®istry, "test", "", test_labels()); + assert!(result.is_err()); + } + 
+ #[test] + fn test_stream_metrics_distinct_stream_names_coexist() { + let registry = prometheus::Registry::new(); + let sample = GnmiStreamMetrics::new(®istry, "test", "", test_labels()).unwrap(); + let events_labels = HashMap::from([ + ("switch_id".to_string(), "test-switch".to_string()), + ("switch_ip".to_string(), "10.0.0.2".to_string()), + ]); + let events = GnmiStreamMetrics::new(®istry, "test", "_events", events_labels).unwrap(); + + sample.server_initiated_closures_total.inc(); + assert_eq!(sample.server_initiated_closures_total.get(), 1.0); + assert_eq!(events.server_initiated_closures_total.get(), 0.0); + } +} diff --git a/crates/health/src/collectors/nvue/mod.rs b/crates/health/src/collectors/nvue/mod.rs index 592d1df205..f58a2999fe 100644 --- a/crates/health/src/collectors/nvue/mod.rs +++ b/crates/health/src/collectors/nvue/mod.rs @@ -15,4 +15,6 @@ * limitations under the License. */ +pub(crate) mod gnmi; pub(in crate::collectors) mod rest; +pub(crate) mod tls; diff --git a/crates/health/src/collectors/nvue/tls.rs b/crates/health/src/collectors/nvue/tls.rs new file mode 100644 index 0000000000..a715e644c0 --- /dev/null +++ b/crates/health/src/collectors/nvue/tls.rs @@ -0,0 +1,74 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +use std::sync::Arc; + +use rustls::client::danger::{HandshakeSignatureValid, ServerCertVerified, ServerCertVerifier}; +use rustls::pki_types::{CertificateDer, ServerName, UnixTime}; +use rustls::{ClientConfig, DigitallySignedStruct, SignatureScheme}; + +// ! dangerous cert verifier that accepts any server certificate without validation. +// ! only enable in test environments where you cannot replace NVOS self-signed certificates. +#[derive(Debug)] +struct AcceptAnyCertVerifier; + +impl ServerCertVerifier for AcceptAnyCertVerifier { + fn verify_server_cert( + &self, + _end_entity: &CertificateDer<'_>, + _intermediates: &[CertificateDer<'_>], + _server_name: &ServerName, + _ocsp_response: &[u8], + _now: UnixTime, + ) -> Result { + Ok(ServerCertVerified::assertion()) + } + + fn verify_tls12_signature( + &self, + _message: &[u8], + _cert: &CertificateDer<'_>, + _dss: &DigitallySignedStruct, + ) -> Result { + Ok(HandshakeSignatureValid::assertion()) + } + + fn verify_tls13_signature( + &self, + _message: &[u8], + _cert: &CertificateDer<'_>, + _dss: &DigitallySignedStruct, + ) -> Result { + Ok(HandshakeSignatureValid::assertion()) + } + + fn supported_verify_schemes(&self) -> Vec { + rustls::crypto::aws_lc_rs::default_provider() + .signature_verification_algorithms + .supported_schemes() + } +} + +/// build a rustls ClientConfig that dangerously skips server certificate verification. 
+pub fn self_signed_tls_config() -> ClientConfig { + ClientConfig::builder_with_provider(Arc::new(rustls::crypto::aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .expect("default protocol versions are valid") + .dangerous() + .with_custom_certificate_verifier(Arc::new(AcceptAnyCertVerifier)) + .with_no_client_auth() +} diff --git a/crates/health/src/collectors/runtime.rs b/crates/health/src/collectors/runtime.rs index 5b9afc41e7..849dd02522 100644 --- a/crates/health/src/collectors/runtime.rs +++ b/crates/health/src/collectors/runtime.rs @@ -581,6 +581,23 @@ impl Collector { }) } + /// spawn helper for streaming collectors that don't fit `StreamingCollector` + /// (e.g. gNMI bidi subscribe with in-loop multiplexing). The closure gets a + /// CancellationToken and should return once it's cancelled. + pub fn spawn_task(task_fn: F) -> Self + where + F: FnOnce(CancellationToken) -> Fut + Send + 'static, + Fut: std::future::Future + Send + 'static, + { + let cancel_token = CancellationToken::new(); + let cancel_clone = cancel_token.clone(); + let handle = tokio::spawn(task_fn(cancel_clone)); + Self { + handle, + cancel_token, + } + } + pub async fn stop(self) { self.cancel_token.cancel(); let _ = self.handle.await; diff --git a/crates/health/src/config.rs b/crates/health/src/config.rs index c94d99ce6b..f853550aad 100644 --- a/crates/health/src/config.rs +++ b/crates/health/src/config.rs @@ -638,12 +638,61 @@ impl Default for NmxtCollectorConfig { #[serde(default)] pub struct NvueCollectorConfig { pub rest: Configurable, + pub gnmi: Configurable, } impl Default for NvueCollectorConfig { fn default() -> Self { Self { rest: Configurable::Enabled(NvueRestConfig::default()), + gnmi: Configurable::Disabled, + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(default)] +pub struct NvueGnmiConfig { + /// gNMI server port on the switch. + pub gnmi_port: u16, + + /// Interval between SAMPLE mode subscription updates. 
+ #[serde(with = "humantime_serde")] + pub sample_interval: Duration, + + /// Timeout for gRPC connection attempts. + #[serde(with = "humantime_serde")] + pub request_timeout: Duration, + + /// gNMI SAMPLE subscription paths. + pub paths: NvueGnmiPaths, +} + +impl Default for NvueGnmiConfig { + fn default() -> Self { + Self { + gnmi_port: 9339, + sample_interval: Duration::from_secs(300), + request_timeout: Duration::from_secs(30), + paths: NvueGnmiPaths::default(), + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(default)] +pub struct NvueGnmiPaths { + pub components_enabled: bool, + pub interfaces_enabled: bool, + pub leak_sensors_enabled: bool, +} + +impl Default for NvueGnmiPaths { + fn default() -> Self { + Self { + components_enabled: true, + interfaces_enabled: true, + leak_sensors_enabled: true, } } } diff --git a/crates/health/src/discovery/context.rs b/crates/health/src/discovery/context.rs index 0c38c32d10..fb9f94bb60 100644 --- a/crates/health/src/discovery/context.rs +++ b/crates/health/src/discovery/context.rs @@ -46,6 +46,7 @@ pub(super) enum CollectorKind { LeakDetector, Nmxt, NvueRest, + NvueGnmi, } impl CollectorKind { @@ -56,6 +57,7 @@ impl CollectorKind { CollectorKind::LeakDetector, CollectorKind::Nmxt, CollectorKind::NvueRest, + CollectorKind::NvueGnmi, ]; pub(super) fn stop_message(self) -> &'static str { @@ -68,6 +70,9 @@ impl CollectorKind { } CollectorKind::Nmxt => "Stopping NMX-T collector for removed BMC endpoint", CollectorKind::NvueRest => "Stopping NVUE REST collector for removed BMC endpoint", + CollectorKind::NvueGnmi => { + "Stopping NVUE gNMI streaming collector for removed switch endpoint" + } } } } @@ -79,6 +84,7 @@ pub(super) struct CollectorState { logs: HashMap, Collector>, nmxt: HashMap, Collector>, nvue_rest: HashMap, Collector>, + nvue_gnmi: HashMap, Collector>, } impl CollectorState { @@ -90,6 +96,7 @@ impl CollectorState { logs: HashMap::new(), nmxt: HashMap::new(), nvue_rest: HashMap::new(), + 
nvue_gnmi: HashMap::new(), } } @@ -101,6 +108,7 @@ impl CollectorState { CollectorKind::LeakDetector => &self.leak_detector, CollectorKind::Nmxt => &self.nmxt, CollectorKind::NvueRest => &self.nvue_rest, + CollectorKind::NvueGnmi => &self.nvue_gnmi, } } @@ -115,6 +123,7 @@ impl CollectorState { CollectorKind::LeakDetector => &mut self.leak_detector, CollectorKind::Nmxt => &mut self.nmxt, CollectorKind::NvueRest => &mut self.nvue_rest, + CollectorKind::NvueGnmi => &mut self.nvue_gnmi, } } diff --git a/crates/health/src/discovery/spawn.rs b/crates/health/src/discovery/spawn.rs index 2bfbc901e8..483dd210db 100644 --- a/crates/health/src/discovery/spawn.rs +++ b/crates/health/src/discovery/spawn.rs @@ -25,7 +25,7 @@ use crate::collectors::{ LeakDetectorCollector, LeakDetectorCollectorConfig, LogsCollector, LogsCollectorConfig, NmxtCollector, NmxtCollectorConfig, NvueRestCollector, NvueRestCollectorConfig, SensorCollector, SensorCollectorConfig, SseLogCollector, SseLogCollectorConfig, - StreamingCollectorStartContext, + StreamingCollectorStartContext, spawn_gnmi_collector, }; use crate::config::{Configurable, LogCollectionMode}; use crate::endpoint::{BmcEndpoint, EndpointMetadata}; @@ -335,6 +335,35 @@ pub(super) async fn spawn_collectors_for_endpoint( } } + if let Configurable::Enabled(nvue_cfg) = &ctx.nvue_config + && let Configurable::Enabled(gnmi_cfg) = &nvue_cfg.gnmi + && !ctx.collectors.contains(CollectorKind::NvueGnmi, &key) + && matches!(endpoint.metadata, Some(EndpointMetadata::Switch(_))) + { + let collector_registry = Arc::new( + ctx.metrics_manager + .create_collector_registry(format!("nvue_gnmi_collector_{key}"), metrics_prefix)?, + ); + match spawn_gnmi_collector(endpoint, gnmi_cfg, collector_registry, data_sink.clone()) { + Ok(handle) => { + ctx.collectors + .insert(CollectorKind::NvueGnmi, key.clone(), handle); + tracing::info!( + endpoint_key = %key, + total_nvue_gnmi_collectors = ctx.collectors.len(CollectorKind::NvueGnmi), + "Started NVUE gNMI 
streaming collection for switch endpoint" + ); + } + Err(error) => { + tracing::error!( + ?error, + endpoint_key = %key, + "Could not start NVUE gNMI collector for switch" + ); + } + } + } + Ok(()) } diff --git a/crates/health/src/lib.rs b/crates/health/src/lib.rs index 12fbd024e3..d0f3dcb08d 100644 --- a/crates/health/src/lib.rs +++ b/crates/health/src/lib.rs @@ -85,6 +85,9 @@ pub enum HealthError { #[error("Redfish SSE not available: {0}")] SseNotAvailable(String), + + #[error("gNMI error: {0}")] + GnmiError(String), } impl From for HealthError { From 8c0f234d7d78643a4090c60562cd7946974ea7f6 Mon Sep 17 00:00:00 2001 From: mkoci Date: Thu, 23 Apr 2026 13:24:18 +0200 Subject: [PATCH 10/30] feat(health): add OTLP metrics export via MetricsService Signed-off-by: mkoci --- crates/health/build.rs | 5 +- .../metrics/v1/metrics_service.proto | 79 ++ .../proto/metrics/v1/metrics.proto | 714 ++++++++++++++++++ crates/health/src/otlp/convert.rs | 67 +- crates/health/src/otlp/metrics_drain.rs | 198 +++++ crates/health/src/otlp/mod.rs | 13 + crates/health/src/sink/otlp.rs | 104 ++- 7 files changed, 1163 insertions(+), 17 deletions(-) create mode 100644 crates/health/proto/opentelemetry/proto/collector/metrics/v1/metrics_service.proto create mode 100644 crates/health/proto/opentelemetry/proto/metrics/v1/metrics.proto create mode 100644 crates/health/src/otlp/metrics_drain.rs diff --git a/crates/health/build.rs b/crates/health/build.rs index 9120b06a10..0576fca63a 100644 --- a/crates/health/build.rs +++ b/crates/health/build.rs @@ -29,7 +29,10 @@ fn main() -> Result<(), Box> { .build_server(false) .build_client(true) .compile_protos( - &[proto_dir.join("opentelemetry/proto/collector/logs/v1/logs_service.proto")], + &[ + proto_dir.join("opentelemetry/proto/collector/logs/v1/logs_service.proto"), + proto_dir.join("opentelemetry/proto/collector/metrics/v1/metrics_service.proto"), + ], std::slice::from_ref(&proto_dir), )?; diff --git 
a/crates/health/proto/opentelemetry/proto/collector/metrics/v1/metrics_service.proto b/crates/health/proto/opentelemetry/proto/collector/metrics/v1/metrics_service.proto new file mode 100644 index 0000000000..dd48f1ad3a --- /dev/null +++ b/crates/health/proto/opentelemetry/proto/collector/metrics/v1/metrics_service.proto @@ -0,0 +1,79 @@ +// Copyright 2019, OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package opentelemetry.proto.collector.metrics.v1; + +import "opentelemetry/proto/metrics/v1/metrics.proto"; + +option csharp_namespace = "OpenTelemetry.Proto.Collector.Metrics.V1"; +option java_multiple_files = true; +option java_package = "io.opentelemetry.proto.collector.metrics.v1"; +option java_outer_classname = "MetricsServiceProto"; +option go_package = "go.opentelemetry.io/proto/otlp/collector/metrics/v1"; + +// Service that can be used to push metrics between one Application +// instrumented with OpenTelemetry and a collector, or between a collector and a +// central collector. +service MetricsService { + // For performance reasons, it is recommended to keep this RPC + // alive for the entire life of the application. + rpc Export(ExportMetricsServiceRequest) returns (ExportMetricsServiceResponse) {} +} + +message ExportMetricsServiceRequest { + // An array of ResourceMetrics. + // For data coming from a single resource this array will typically contain one + // element. 
Intermediary nodes (such as OpenTelemetry Collector) that receive + // data from multiple origins typically batch the data before forwarding further and + // in that case this array will contain multiple elements. + repeated opentelemetry.proto.metrics.v1.ResourceMetrics resource_metrics = 1; +} + +message ExportMetricsServiceResponse { + // The details of a partially successful export request. + // + // If the request is only partially accepted + // (i.e. when the server accepts only parts of the data and rejects the rest) + // the server MUST initialize the `partial_success` field and MUST + // set the `rejected_<signal>` with the number of items it rejected. + // + // Servers MAY also make use of the `partial_success` field to convey + // warnings/suggestions to senders even when the request was fully accepted. + // In such cases, the `rejected_<signal>` MUST have a value of `0` and + // the `error_message` MUST be non-empty. + // + // A `partial_success` message with an empty value (rejected_<signal> = 0 and + // `error_message` = "") is equivalent to it not being set/present. Senders + // SHOULD interpret it the same way as in the full success case. + ExportMetricsPartialSuccess partial_success = 1; +} + +message ExportMetricsPartialSuccess { + // The number of rejected data points. + // + // A `rejected_<signal>` field holding a `0` value indicates that the + // request was fully accepted. + int64 rejected_data_points = 1; + + // A developer-facing human-readable message in English. It should be used + // either to explain why the server rejected parts of the data during a partial + // success or to convey warnings/suggestions during a full success. The message + // should offer guidance on how users can address such issues. + // + // error_message is an optional field. An error_message with an empty value + // is equivalent to it not being set.
+ string error_message = 2; +} diff --git a/crates/health/proto/opentelemetry/proto/metrics/v1/metrics.proto b/crates/health/proto/opentelemetry/proto/metrics/v1/metrics.proto new file mode 100644 index 0000000000..00c5112ce8 --- /dev/null +++ b/crates/health/proto/opentelemetry/proto/metrics/v1/metrics.proto @@ -0,0 +1,714 @@ +// Copyright 2019, OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package opentelemetry.proto.metrics.v1; + +import "opentelemetry/proto/common/v1/common.proto"; +import "opentelemetry/proto/resource/v1/resource.proto"; + +option csharp_namespace = "OpenTelemetry.Proto.Metrics.V1"; +option java_multiple_files = true; +option java_package = "io.opentelemetry.proto.metrics.v1"; +option java_outer_classname = "MetricsProto"; +option go_package = "go.opentelemetry.io/proto/otlp/metrics/v1"; + +// MetricsData represents the metrics data that can be stored in a persistent +// storage, OR can be embedded by other protocols that transfer OTLP metrics +// data but do not implement the OTLP protocol. 
+// +// MetricsData +// └─── ResourceMetrics +// ├── Resource +// ├── SchemaURL +// └── ScopeMetrics +// ├── Scope +// ├── SchemaURL +// └── Metric +// ├── Name +// ├── Description +// ├── Unit +// └── data +// ├── Gauge +// ├── Sum +// ├── Histogram +// ├── ExponentialHistogram +// └── Summary +// +// The main difference between this message and collector protocol is that +// in this message there will not be any "control" or "metadata" specific to +// OTLP protocol. +// +// When new fields are added into this message, the OTLP request MUST be updated +// as well. +message MetricsData { + // An array of ResourceMetrics. + // For data coming from a single resource this array will typically contain + // one element. Intermediary nodes that receive data from multiple origins + // typically batch the data before forwarding further and in that case this + // array will contain multiple elements. + repeated ResourceMetrics resource_metrics = 1; +} + +// A collection of ScopeMetrics from a Resource. +message ResourceMetrics { + reserved 1000; + + // The resource for the metrics in this message. + // If this field is not set then no resource info is known. + opentelemetry.proto.resource.v1.Resource resource = 1; + + // A list of metrics that originate from a resource. + repeated ScopeMetrics scope_metrics = 2; + + // The Schema URL, if known. This is the identifier of the Schema that the resource data + // is recorded in. Notably, the last part of the URL path is the version number of the + // schema: http[s]://server[:port]/path/<version>. To learn more about Schema URL see + // https://opentelemetry.io/docs/specs/otel/schemas/#schema-url + // This schema_url applies to the data in the "resource" field. It does not apply + // to the data in the "scope_metrics" field which have their own schema_url field. + string schema_url = 3; +} + +// A collection of Metrics produced by a Scope.
+message ScopeMetrics { + // The instrumentation scope information for the metrics in this message. + // Semantically when InstrumentationScope isn't set, it is equivalent with + // an empty instrumentation scope name (unknown). + opentelemetry.proto.common.v1.InstrumentationScope scope = 1; + + // A list of metrics that originate from an instrumentation library. + repeated Metric metrics = 2; + + // The Schema URL, if known. This is the identifier of the Schema that the metric data + // is recorded in. Notably, the last part of the URL path is the version number of the + // schema: http[s]://server[:port]/path/<version>. To learn more about Schema URL see + // https://opentelemetry.io/docs/specs/otel/schemas/#schema-url + // This schema_url applies to all metrics in the "metrics" field. + string schema_url = 3; +} + +// Defines a Metric which has one or more timeseries. The following is a +// brief summary of the Metric data model. For more details, see: +// +// https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/metrics/data-model.md +// +// The data model and relation between entities is shown in the +// diagram below. Here, "DataPoint" is the term used to refer to any +// one of the specific data point value types, and "points" is the term used +// to refer to any one of the lists of points contained in the Metric. +// +// - Metric is composed of a metadata and data. +// - Metadata part contains a name, description, unit. +// - Data is one of the possible types (Sum, Gauge, Histogram, Summary). +// - DataPoint contains timestamps, attributes, and one of the possible value type +// fields. +// +// Metric +// +------------+ +// |name | +// |description | +// |unit | +------------------------------------+ +// |data |---> |Gauge, Sum, Histogram, Summary, ... | +// +------------+ +------------------------------------+ +// +// Data [One of Gauge, Sum, Histogram, Summary, ...] +// +-----------+ +// |... | // Metadata about the Data.
+// |points |--+ +// +-----------+ | +// | +---------------------------+ +// | |DataPoint 1 | +// v |+------+------+ +------+ | +// +-----+ ||label |label |...|label | | +// | 1 |-->||value1|value2|...|valueN| | +// +-----+ |+------+------+ +------+ | +// | . | |+-----+ | +// | . | ||value| | +// | . | |+-----+ | +// | . | +---------------------------+ +// | . | . +// | . | . +// | . | . +// | . | +---------------------------+ +// | . | |DataPoint M | +// +-----+ |+------+------+ +------+ | +// | M |-->||label |label |...|label | | +// +-----+ ||value1|value2|...|valueN| | +// |+------+------+ +------+ | +// |+-----+ | +// ||value| | +// |+-----+ | +// +---------------------------+ +// +// Each distinct type of DataPoint represents the output of a specific +// aggregation function, the result of applying the DataPoint's +// associated function of to one or more measurements. +// +// All DataPoint types have three common fields: +// - Attributes includes key-value pairs associated with the data point +// - TimeUnixNano is required, set to the end time of the aggregation +// - StartTimeUnixNano is optional, but strongly encouraged for DataPoints +// having an AggregationTemporality field, as discussed below. +// +// Both TimeUnixNano and StartTimeUnixNano values are expressed as +// UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January 1970. +// +// # TimeUnixNano +// +// This field is required, having consistent interpretation across +// DataPoint types. TimeUnixNano is the moment corresponding to when +// the data point's aggregate value was captured. +// +// Data points with the 0 value for TimeUnixNano SHOULD be rejected +// by consumers. +// +// # StartTimeUnixNano +// +// StartTimeUnixNano in general allows detecting when a sequence of +// observations is unbroken. 
This field indicates to consumers the +// start time for points with cumulative and delta +// AggregationTemporality, and it should be included whenever possible +// to support correct rate calculation. Although it may be omitted +// when the start time is truly unknown, setting StartTimeUnixNano is +// strongly encouraged. +message Metric { + reserved 4, 6, 8; + + // name of the metric. + string name = 1; + + // description of the metric, which can be used in documentation. + string description = 2; + + // unit in which the metric value is reported. Follows the format + // described by http://unitsofmeasure.org/ucum.html. + string unit = 3; + + // Data determines the aggregation type (if any) of the metric, what is the + // reported value type for the data points, as well as the relationship to + // the time interval over which they are reported. + oneof data { + Gauge gauge = 5; + Sum sum = 7; + Histogram histogram = 9; + ExponentialHistogram exponential_histogram = 10; + Summary summary = 11; + } + + // Additional metadata attributes that describe the metric. [Optional]. + // Attributes are non-identifying. + // Consumers SHOULD NOT need to be aware of these attributes. + // These attributes MAY be used to encode information allowing + // for lossless roundtrip translation to / from another data model. + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated opentelemetry.proto.common.v1.KeyValue metadata = 12; +} + +// Gauge represents the type of a scalar metric that always exports the +// "current value" for every data point. It should be used for an "unknown" +// aggregation. +// +// A Gauge does not support different aggregation temporalities. Given the +// aggregation is unknown, points cannot be combined using the same +// aggregation, regardless of aggregation temporalities. Therefore, +// AggregationTemporality is not included.
Consequently, this also means +// "StartTimeUnixNano" is ignored for all data points. +message Gauge { + repeated NumberDataPoint data_points = 1; +} + +// Sum represents the type of a scalar metric that is calculated as a sum of all +// reported measurements over a time interval. +message Sum { + repeated NumberDataPoint data_points = 1; + + // aggregation_temporality describes if the aggregator reports delta changes + // since last report time, or cumulative changes since a fixed start time. + AggregationTemporality aggregation_temporality = 2; + + // If "true" means that the sum is monotonic. + bool is_monotonic = 3; +} + +// Histogram represents the type of a metric that is calculated by aggregating +// as a Histogram of all reported measurements over a time interval. +message Histogram { + repeated HistogramDataPoint data_points = 1; + + // aggregation_temporality describes if the aggregator reports delta changes + // since last report time, or cumulative changes since a fixed start time. + AggregationTemporality aggregation_temporality = 2; +} + +// ExponentialHistogram represents the type of a metric that is calculated by aggregating +// as a ExponentialHistogram of all reported double measurements over a time interval. +message ExponentialHistogram { + repeated ExponentialHistogramDataPoint data_points = 1; + + // aggregation_temporality describes if the aggregator reports delta changes + // since last report time, or cumulative changes since a fixed start time. + AggregationTemporality aggregation_temporality = 2; +} + +// Summary metric data are used to convey quantile summaries, +// a Prometheus (see: https://prometheus.io/docs/concepts/metric_types/#summary) +// and OpenMetrics (see: https://github.com/OpenObservability/OpenMetrics/blob/4dbf6075567ab43296eed941037c12951faafb92/protos/prometheus.proto#L45) +// data type. These data points cannot always be merged in a meaningful way. 
+// While they can be useful in some applications, histogram data points are +// recommended for new applications. +// Summary metrics do not have an aggregation temporality field. This is +// because the count and sum fields of a SummaryDataPoint are assumed to be +// cumulative values. +message Summary { + repeated SummaryDataPoint data_points = 1; +} + +// AggregationTemporality defines how a metric aggregator reports aggregated +// values. It describes how those values relate to the time interval over +// which they are aggregated. +enum AggregationTemporality { + // UNSPECIFIED is the default AggregationTemporality, it MUST not be used. + AGGREGATION_TEMPORALITY_UNSPECIFIED = 0; + + // DELTA is an AggregationTemporality for a metric aggregator which reports + // changes since last report time. Successive metrics contain aggregation of + // values from continuous and non-overlapping intervals. + // + // The values for a DELTA metric are based only on the time interval + // associated with one measurement cycle. There is no dependency on + // previous measurements like is the case for CUMULATIVE metrics. + // + // For example, consider a system measuring the number of requests that + // it receives and reports the sum of these requests every second as a + // DELTA metric: + // + // 1. The system starts receiving at time=t_0. + // 2. A request is received, the system measures 1 request. + // 3. A request is received, the system measures 1 request. + // 4. A request is received, the system measures 1 request. + // 5. The 1 second collection cycle ends. A metric is exported for the + // number of requests received over the interval of time t_0 to + // t_0+1 with a value of 3. + // 6. A request is received, the system measures 1 request. + // 7. A request is received, the system measures 1 request. + // 8. The 1 second collection cycle ends. A metric is exported for the + // number of requests received over the interval of time t_0+1 to + // t_0+2 with a value of 2. 
+ AGGREGATION_TEMPORALITY_DELTA = 1; + + // CUMULATIVE is an AggregationTemporality for a metric aggregator which + // reports changes since a fixed start time. This means that current values + // of a CUMULATIVE metric depend on all previous measurements since the + // start time. Because of this, the sender is required to retain this state + // in some form. If this state is lost or invalidated, the CUMULATIVE metric + // values MUST be reset and a new fixed start time following the last + // reported measurement time sent MUST be used. + // + // For example, consider a system measuring the number of requests that + // it receives and reports the sum of these requests every second as a + // CUMULATIVE metric: + // + // 1. The system starts receiving at time=t_0. + // 2. A request is received, the system measures 1 request. + // 3. A request is received, the system measures 1 request. + // 4. A request is received, the system measures 1 request. + // 5. The 1 second collection cycle ends. A metric is exported for the + // number of requests received over the interval of time t_0 to + // t_0+1 with a value of 3. + // 6. A request is received, the system measures 1 request. + // 7. A request is received, the system measures 1 request. + // 8. The 1 second collection cycle ends. A metric is exported for the + // number of requests received over the interval of time t_0 to + // t_0+2 with a value of 5. + // 9. The system experiences a fault and loses state. + // 10. The system recovers and resumes receiving at time=t_1. + // 11. A request is received, the system measures 1 request. + // 12. The 1 second collection cycle ends. A metric is exported for the + // number of requests received over the interval of time t_1 to + // t_0+1 with a value of 1. + // + // Note: Even though, when reporting changes since last report time, using + // CUMULATIVE is valid, it is not recommended. 
This may cause problems for + // systems that do not use start_time to determine when the aggregation + // value was reset (e.g. Prometheus). + AGGREGATION_TEMPORALITY_CUMULATIVE = 2; +} + +// DataPointFlags is defined as a protobuf 'uint32' type and is to be used as a +// bit-field representing 32 distinct boolean flags. Each flag defined in this +// enum is a bit-mask. To test the presence of a single flag in the flags of +// a data point, for example, use an expression like: +// +// (point.flags & DATA_POINT_FLAGS_NO_RECORDED_VALUE_MASK) == DATA_POINT_FLAGS_NO_RECORDED_VALUE_MASK +// +enum DataPointFlags { + // The zero value for the enum. Should not be used for comparisons. + // Instead use bitwise "and" with the appropriate mask as shown above. + DATA_POINT_FLAGS_DO_NOT_USE = 0; + + // This DataPoint is valid but has no recorded value. This value + // SHOULD be used to reflect explicitly missing data in a series, as + // for an equivalent to the Prometheus "staleness marker". + DATA_POINT_FLAGS_NO_RECORDED_VALUE_MASK = 1; + + // Bits 2-31 are reserved for future use. +} + +// NumberDataPoint is a single data point in a timeseries that describes the +// time-varying scalar value of a metric. +message NumberDataPoint { + reserved 1; + + // The set of key/value pairs that uniquely identify the timeseries from + // where this point belongs. The list may be empty (may contain 0 elements). + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated opentelemetry.proto.common.v1.KeyValue attributes = 7; + + // StartTimeUnixNano is optional but strongly encouraged, see the + // the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 start_time_unix_nano = 2; + + // TimeUnixNano is required, see the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. 
+ fixed64 time_unix_nano = 3; + + // The value itself. A point is considered invalid when one of the recognized + // value fields is not present inside this oneof. + oneof value { + double as_double = 4; + sfixed64 as_int = 6; + } + + // (Optional) List of exemplars collected from + // measurements that were used to form the data point + repeated Exemplar exemplars = 5; + + // Flags that apply to this specific data point. See DataPointFlags + // for the available flags and their meaning. + uint32 flags = 8; +} + +// HistogramDataPoint is a single data point in a timeseries that describes the +// time-varying values of a Histogram. A Histogram contains summary statistics +// for a population of values, it may optionally contain the distribution of +// those values across a set of buckets. +// +// If the histogram contains the distribution of values, then both +// "explicit_bounds" and "bucket counts" fields must be defined. +// If the histogram does not contain the distribution of values, then both +// "explicit_bounds" and "bucket_counts" must be omitted and only "count" and +// "sum" are known. +message HistogramDataPoint { + reserved 1; + + // The set of key/value pairs that uniquely identify the timeseries from + // where this point belongs. The list may be empty (may contain 0 elements). + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated opentelemetry.proto.common.v1.KeyValue attributes = 9; + + // StartTimeUnixNano is optional but strongly encouraged, see the + // the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 start_time_unix_nano = 2; + + // TimeUnixNano is required, see the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 time_unix_nano = 3; + + // count is the number of values in the population. 
Must be non-negative. This + // value must be equal to the sum of the "count" fields in buckets if a + // histogram is provided. + fixed64 count = 4; + + // sum of the values in the population. If count is zero then this field + // must be zero. + // + // Note: Sum should only be filled out when measuring non-negative discrete + // events, and is assumed to be monotonic over the values of these events. + // Negative events *can* be recorded, but sum should not be filled out when + // doing so. This is specifically to enforce compatibility w/ OpenMetrics, + // see: https://github.com/prometheus/OpenMetrics/blob/v1.0.0/specification/OpenMetrics.md#histogram + optional double sum = 5; + + // bucket_counts is an optional field contains the count values of histogram + // for each bucket. + // + // The sum of the bucket_counts must equal the value in the count field. + // + // The number of elements in bucket_counts array must be by one greater than + // the number of elements in explicit_bounds array. + repeated fixed64 bucket_counts = 6; + + // explicit_bounds specifies buckets with explicitly defined bounds for values. + // + // The boundaries for bucket at index i are: + // + // (-infinity, explicit_bounds[i]] for i == 0 + // (explicit_bounds[i-1], explicit_bounds[i]] for 0 < i < size(explicit_bounds) + // (explicit_bounds[i-1], +infinity) for i == size(explicit_bounds) + // + // The values in the explicit_bounds array must be strictly increasing. + // + // Histogram buckets are inclusive of their upper boundary, except the last + // bucket where the boundary is at infinity. This format is intentionally + // compatible with the OpenMetrics histogram definition. + repeated double explicit_bounds = 7; + + // (Optional) List of exemplars collected from + // measurements that were used to form the data point + repeated Exemplar exemplars = 8; + + // Flags that apply to this specific data point. See DataPointFlags + // for the available flags and their meaning. 
+ uint32 flags = 10; + + // min is the minimum value over (start_time, end_time]. + optional double min = 11; + + // max is the maximum value over (start_time, end_time]. + optional double max = 12; +} + +// ExponentialHistogramDataPoint is a single data point in a timeseries that describes the +// time-varying values of a ExponentialHistogram of double values. A ExponentialHistogram contains +// summary statistics for a population of values, it may optionally contain the +// distribution of those values across a set of buckets. +// +message ExponentialHistogramDataPoint { + // The set of key/value pairs that uniquely identify the timeseries from + // where this point belongs. The list may be empty (may contain 0 elements). + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated opentelemetry.proto.common.v1.KeyValue attributes = 1; + + // StartTimeUnixNano is optional but strongly encouraged, see the + // the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 start_time_unix_nano = 2; + + // TimeUnixNano is required, see the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 time_unix_nano = 3; + + // count is the number of values in the population. Must be + // non-negative. This value must be equal to the sum of the "bucket_counts" + // values in the positive and negative Buckets plus the "zero_count" field. + fixed64 count = 4; + + // sum of the values in the population. If count is zero then this field + // must be zero. + // + // Note: Sum should only be filled out when measuring non-negative discrete + // events, and is assumed to be monotonic over the values of these events. + // Negative events *can* be recorded, but sum should not be filled out when + // doing so. 
This is specifically to enforce compatibility w/ OpenMetrics, + // see: https://github.com/prometheus/OpenMetrics/blob/v1.0.0/specification/OpenMetrics.md#histogram + optional double sum = 5; + + // scale describes the resolution of the histogram. Boundaries are + // located at powers of the base, where: + // + // base = (2^(2^-scale)) + // + // The histogram bucket identified by `index`, a signed integer, + // contains values that are greater than (base^index) and + // less than or equal to (base^(index+1)). + // + // The positive and negative ranges of the histogram are expressed + // separately. Negative values are mapped by their absolute value + // into the negative range using the same scale as the positive range. + // + // scale is not restricted by the protocol, as the permissible + // values depend on the range of the data. + sint32 scale = 6; + + // zero_count is the count of values that are either exactly zero or + // within the region considered zero by the instrumentation at the + // tolerated degree of precision. This bucket stores values that + // cannot be expressed using the standard exponential formula as + // well as values that have been rounded to zero. + // + // Implementations MAY consider the zero bucket to have probability + // mass equal to (zero_count / count). + fixed64 zero_count = 7; + + // positive carries the positive range of exponential bucket counts. + Buckets positive = 8; + + // negative carries the negative range of exponential bucket counts. + Buckets negative = 9; + + // Buckets are a set of bucket counts, encoded in a contiguous array + // of counts. + message Buckets { + // Offset is the bucket index of the first entry in the bucket_counts array. + // + // Note: This uses a varint encoding as a simple form of compression. + sint32 offset = 1; + + // bucket_counts is an array of count values, where bucket_counts[i] carries + // the count of the bucket at index (offset+i). 
bucket_counts[i] is the count + // of values greater than base^(offset+i) and less than or equal to + // base^(offset+i+1). + // + // Note: By contrast, the explicit HistogramDataPoint uses + // fixed64. This field is expected to have many buckets, + // especially zeros, so uint64 has been selected to ensure + // varint encoding. + repeated uint64 bucket_counts = 2; + } + + // Flags that apply to this specific data point. See DataPointFlags + // for the available flags and their meaning. + uint32 flags = 10; + + // (Optional) List of exemplars collected from + // measurements that were used to form the data point + repeated Exemplar exemplars = 11; + + // min is the minimum value over (start_time, end_time]. + optional double min = 12; + + // max is the maximum value over (start_time, end_time]. + optional double max = 13; + + // ZeroThreshold may be optionally set to convey the width of the zero + // region. Where the zero region is defined as the closed interval + // [-ZeroThreshold, ZeroThreshold]. + // When ZeroThreshold is 0, zero count bucket stores values that cannot be + // expressed using the standard exponential formula as well as values that + // have been rounded to zero. + double zero_threshold = 14; +} + +// SummaryDataPoint is a single data point in a timeseries that describes the +// time-varying values of a Summary metric. The count and sum fields represent +// cumulative values. +message SummaryDataPoint { + reserved 1; + + // The set of key/value pairs that uniquely identify the timeseries from + // where this point belongs. The list may be empty (may contain 0 elements). + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated opentelemetry.proto.common.v1.KeyValue attributes = 7; + + // StartTimeUnixNano is optional but strongly encouraged, see the + // the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. 
+ fixed64 start_time_unix_nano = 2; + + // TimeUnixNano is required, see the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 time_unix_nano = 3; + + // count is the number of values in the population. Must be non-negative. + fixed64 count = 4; + + // sum of the values in the population. If count is zero then this field + // must be zero. + // + // Note: Sum should only be filled out when measuring non-negative discrete + // events, and is assumed to be monotonic over the values of these events. + // Negative events *can* be recorded, but sum should not be filled out when + // doing so. This is specifically to enforce compatibility w/ OpenMetrics, + // see: https://github.com/prometheus/OpenMetrics/blob/v1.0.0/specification/OpenMetrics.md#summary + double sum = 5; + + // Represents the value at a given quantile of a distribution. + // + // To record Min and Max values following conventions are used: + // - The 1.0 quantile is equivalent to the maximum value observed. + // - The 0.0 quantile is equivalent to the minimum value observed. + // + // See the following issue for more context: + // https://github.com/open-telemetry/opentelemetry-proto/issues/125 + message ValueAtQuantile { + // The quantile of a distribution. Must be in the interval + // [0.0, 1.0]. + double quantile = 1; + + // The value at the given quantile of a distribution. + // + // Quantile values must NOT be negative. + double value = 2; + } + + // (Optional) list of values at different quantiles of the distribution calculated + // from the current snapshot. The quantiles must be strictly increasing. + repeated ValueAtQuantile quantile_values = 6; + + // Flags that apply to this specific data point. See DataPointFlags + // for the available flags and their meaning. + uint32 flags = 8; +} + +// A representation of an exemplar, which is a sample input measurement. 
+// Exemplars also hold information about the environment when the measurement +// was recorded, for example the span and trace ID of the active span when the +// exemplar was recorded. +message Exemplar { + reserved 1; + + // The set of key/value pairs that were filtered out by the aggregator, but + // recorded alongside the original measurement. Only key/value pairs that were + // filtered out by the aggregator should be included + repeated opentelemetry.proto.common.v1.KeyValue filtered_attributes = 7; + + // time_unix_nano is the exact time when this exemplar was recorded + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 time_unix_nano = 2; + + // The value of the measurement that was recorded. An exemplar is + // considered invalid when one of the recognized value fields is not present + // inside this oneof. + oneof value { + double as_double = 3; + sfixed64 as_int = 6; + } + + // (Optional) Span ID of the exemplar trace. + // span_id may be missing if the measurement is not recorded inside a trace + // or if the trace is not sampled. + bytes span_id = 4; + + // (Optional) Trace ID of the exemplar trace. + // trace_id may be missing if the measurement is not recorded inside a trace + // or if the trace is not sampled. 
+ bytes trace_id = 5; +} diff --git a/crates/health/src/otlp/convert.rs b/crates/health/src/otlp/convert.rs index fab1b521a6..d7d85798f3 100644 --- a/crates/health/src/otlp/convert.rs +++ b/crates/health/src/otlp/convert.rs @@ -19,10 +19,15 @@ use std::collections::HashMap; use std::time::SystemTime; use super::collector_logs::ExportLogsServiceRequest; +use super::collector_metrics::ExportMetricsServiceRequest; use super::common::{AnyValue, KeyValue, any_value}; use super::logs::{LogRecord as OtlpLogRecord, ResourceLogs, ScopeLogs, SeverityNumber}; +use super::metrics::{ + Gauge as OtlpGauge, Metric as OtlpMetric, NumberDataPoint, ResourceMetrics, ScopeMetrics, + metric, number_data_point, +}; use super::resource::Resource; -use crate::sink::{CollectorEvent, EventContext}; +use crate::sink::{CollectorEvent, EventContext, SensorHealthData}; fn severity_text_to_number(severity: &str) -> i32 { match severity.to_uppercase().as_str() { @@ -193,6 +198,66 @@ pub fn build_export_request(batch: &[(EventContext, CollectorEvent)]) -> ExportL ExportLogsServiceRequest { resource_logs } } +/// group metric samples by endpoint and build an ExportMetricsServiceRequest. +/// every sample maps to an OTLP `Gauge` point; Sum/Histogram is a follow-up. 
+pub fn build_metrics_export_request( + batch: &[(EventContext, SensorHealthData)], +) -> ExportMetricsServiceRequest { + let observed_nanos = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() as u64; + + let mut by_endpoint: HashMap, Vec)> = HashMap::new(); + + for (context, sample) in batch { + let data_point = NumberDataPoint { + attributes: sample + .labels + .iter() + .map(|(k, v)| kv(k, v.clone())) + .collect(), + time_unix_nano: observed_nanos, + value: Some(number_data_point::Value::AsDouble(sample.value)), + ..Default::default() + }; + + let otlp_metric = OtlpMetric { + name: sample.metric_type.clone(), + description: String::new(), + unit: sample.unit.clone(), + data: Some(metric::Data::Gauge(OtlpGauge { + data_points: vec![data_point], + })), + ..Default::default() + }; + + by_endpoint + .entry(context.endpoint_key.clone()) + .or_insert_with(|| (resource_attributes(context), Vec::new())) + .1 + .push(otlp_metric); + } + + let resource_metrics = by_endpoint + .into_values() + .map(|(attrs, metrics)| ResourceMetrics { + resource: Some(Resource { + attributes: attrs, + dropped_attributes_count: 0, + }), + scope_metrics: vec![ScopeMetrics { + scope: None, + metrics, + schema_url: String::new(), + }], + schema_url: String::new(), + }) + .collect(); + + ExportMetricsServiceRequest { resource_metrics } +} + #[cfg(test)] mod tests { use std::borrow::Cow; diff --git a/crates/health/src/otlp/metrics_drain.rs b/crates/health/src/otlp/metrics_drain.rs new file mode 100644 index 0000000000..a3d281c4b5 --- /dev/null +++ b/crates/health/src/otlp/metrics_drain.rs @@ -0,0 +1,198 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use std::sync::Arc; +use std::time::Duration; + +use tonic::transport::Channel; + +use super::collector_metrics::metrics_service_client::MetricsServiceClient; +use super::convert::build_metrics_export_request; +use crate::collectors::{BackoffConfig, ExponentialBackoff}; +use crate::sink::otlp::OtlpMetricsQueue; +use crate::sink::{EventContext, SensorHealthData}; + +pub(crate) struct OtlpMetricsDrainTask { + queue: Arc, + endpoint: String, + batch_size: usize, + flush_interval: Duration, +} + +impl OtlpMetricsDrainTask { + pub fn new( + queue: Arc, + endpoint: String, + batch_size: usize, + flush_interval: Duration, + ) -> Self { + Self { + queue, + endpoint, + batch_size, + flush_interval, + } + } + + fn drain_batch(&self, batch: &mut Vec<(EventContext, SensorHealthData)>) { + let remaining = self.batch_size.saturating_sub(batch.len()); + for _ in 0..remaining { + match self.queue.pop() { + Some((_key, value)) => batch.push(value), + None => break, + } + } + } + + pub async fn run(self) { + let mut client = match self.connect().await { + Some(c) => c, + None => return, + }; + + let mut batch = Vec::with_capacity(self.batch_size); + let mut interval = tokio::time::interval(self.flush_interval); + + loop { + tokio::select! 
{ + _ = self.queue.notified() => { + self.drain_batch(&mut batch); + if batch.len() >= self.batch_size { + self.flush(&mut client, &mut batch).await; + interval.reset(); + } + } + _ = interval.tick() => { + self.drain_batch(&mut batch); + if !batch.is_empty() { + self.flush(&mut client, &mut batch).await; + } + } + } + } + } + + async fn connect(&self) -> Option> { + let endpoint = match Channel::from_shared(self.endpoint.clone()) { + Ok(e) => e, + Err(error) => { + tracing::error!( + ?error, + endpoint = %self.endpoint, + "invalid otlp metrics endpoint uri, stopping drain" + ); + return None; + } + }; + + let mut backoff = ExponentialBackoff::new(&BackoffConfig { + initial: Duration::from_secs(1), + max: Duration::from_secs(30), + }); + + loop { + match endpoint.connect().await { + Ok(channel) => { + tracing::info!(endpoint = %self.endpoint, "connected to otlp metrics collector"); + return Some(MetricsServiceClient::new(channel)); + } + Err(error) => { + let delay = backoff.next_delay(); + tracing::warn!( + ?error, + endpoint = %self.endpoint, + retry_in = ?delay, + "failed to connect to otlp metrics collector" + ); + tokio::time::sleep(delay).await; + } + } + } + } + + async fn flush( + &self, + client: &mut MetricsServiceClient, + batch: &mut Vec<(EventContext, SensorHealthData)>, + ) { + if batch.is_empty() { + return; + } + + let request = build_metrics_export_request(batch); + batch.clear(); + + let point_count = request + .resource_metrics + .iter() + .flat_map(|rm| &rm.scope_metrics) + .flat_map(|sm| &sm.metrics) + .count(); + + if point_count == 0 { + return; + } + + const MAX_RETRIES: usize = 5; + + let mut backoff = ExponentialBackoff::new(&BackoffConfig { + initial: Duration::from_millis(100), + max: Duration::from_secs(10), + }); + + for attempt in 0..=MAX_RETRIES { + match client.export(request.clone()).await { + Ok(_) => { + tracing::debug!(point_count, "exported metrics to otlp collector"); + break; + } + Err(status) if is_retryable(&status) && 
attempt < MAX_RETRIES => { + let delay = backoff.next_delay(); + tracing::warn!( + code = ?status.code(), + message = status.message(), + attempt, + retry_in = ?delay, + "retryable otlp metrics export error" + ); + tokio::time::sleep(delay).await; + } + Err(status) => { + tracing::error!( + code = ?status.code(), + message = status.message(), + point_count, + attempt, + "otlp metrics export failed, dropping batch" + ); + break; + } + } + } + } +} + +fn is_retryable(status: &tonic::Status) -> bool { + matches!( + status.code(), + tonic::Code::Unavailable + | tonic::Code::DeadlineExceeded + | tonic::Code::ResourceExhausted + | tonic::Code::Aborted + | tonic::Code::Internal + ) +} diff --git a/crates/health/src/otlp/mod.rs b/crates/health/src/otlp/mod.rs index ecd76b6c47..632212b9ea 100644 --- a/crates/health/src/otlp/mod.rs +++ b/crates/health/src/otlp/mod.rs @@ -17,6 +17,7 @@ pub mod convert; pub mod drain; +pub mod metrics_drain; #[allow(clippy::all)] pub mod opentelemetry { @@ -36,17 +37,29 @@ pub mod opentelemetry { tonic::include_proto!("opentelemetry.proto.logs.v1"); } } + pub mod metrics { + pub mod v1 { + tonic::include_proto!("opentelemetry.proto.metrics.v1"); + } + } pub mod collector { pub mod logs { pub mod v1 { tonic::include_proto!("opentelemetry.proto.collector.logs.v1"); } } + pub mod metrics { + pub mod v1 { + tonic::include_proto!("opentelemetry.proto.collector.metrics.v1"); + } + } } } } pub use opentelemetry::proto::collector::logs::v1 as collector_logs; +pub use opentelemetry::proto::collector::metrics::v1 as collector_metrics; pub use opentelemetry::proto::common::v1 as common; pub use opentelemetry::proto::logs::v1 as logs; +pub use opentelemetry::proto::metrics::v1 as metrics; pub use opentelemetry::proto::resource::v1 as resource; diff --git a/crates/health/src/sink/otlp.rs b/crates/health/src/sink/otlp.rs index be183376df..7bf775eacd 100644 --- a/crates/health/src/sink/otlp.rs +++ b/crates/health/src/sink/otlp.rs @@ -19,31 +19,38 @@ use 
std::sync::Arc; use prometheus::Counter; -use super::dedup_queue::DedupQueue; use super::event_mapper::RedfishEventMapper; -use super::{CollectorEvent, DataSink, EventContext}; +use super::override_queue::OverrideQueue; +use super::{CollectorEvent, DataSink, EventContext, SensorHealthData}; use crate::HealthError; use crate::config::OtlpSinkConfig; use crate::metrics::MetricsManager; use crate::otlp::drain::OtlpDrainTask; +use crate::otlp::metrics_drain::OtlpMetricsDrainTask; -pub(crate) type OtlpQueue = DedupQueue; +pub(crate) type OtlpQueue = OverrideQueue; +pub(crate) type OtlpMetricsQueue = OverrideQueue; #[cfg(not(feature = "bench-hooks"))] pub(crate) struct OtlpSink { queue: Arc, + metrics_queue: Arc, replaced_total: Counter, + metrics_replaced_total: Counter, mapper: Arc, } #[cfg(feature = "bench-hooks")] pub struct OtlpSink { queue: Arc, + metrics_queue: Arc, replaced_total: Counter, + metrics_replaced_total: Counter, mapper: Arc, } -pub(crate) fn is_otlp_relevant(event: &CollectorEvent) -> bool { +/// true for events that belong in the logs drain; metrics and collection sentinels are not. 
+pub(crate) fn is_otlp_log_relevant(event: &CollectorEvent) -> bool { !matches!( event, CollectorEvent::Metric(_) @@ -64,16 +71,25 @@ impl OtlpSink { HealthError::GenericError(format!("otlp sink requires active tokio runtime: {e}")) })?; - let queue: Arc = Arc::new(DedupQueue::new()); + let queue: Arc = Arc::new(OverrideQueue::new()); + let metrics_queue: Arc = Arc::new(OverrideQueue::new()); let replaced_total = Counter::new( format!("{prefix}_otlp_sink_replaced_total"), - "total events replaced in the otlp queue before drain could process them", + "total log events replaced in the otlp queue before drain could process them", )?; metrics_manager .global_registry() .register(Box::new(replaced_total.clone()))?; + let metrics_replaced_total = Counter::new( + format!("{prefix}_otlp_sink_metrics_replaced_total"), + "total metric samples replaced in the otlp queue before drain could process them", + )?; + metrics_manager + .global_registry() + .register(Box::new(metrics_replaced_total.clone()))?; + let drain = OtlpDrainTask::new( queue.clone(), config.endpoint.clone(), @@ -82,9 +98,20 @@ impl OtlpSink { ); handle.spawn(drain.run()); + // separate drain task so metrics don't head-of-line-block the logs export and vice versa + let metrics_drain = OtlpMetricsDrainTask::new( + metrics_queue.clone(), + config.endpoint.clone(), + config.batch_size, + config.flush_interval, + ); + handle.spawn(metrics_drain.run()); + Ok(Self { queue, + metrics_queue, replaced_total, + metrics_replaced_total, mapper, }) } @@ -94,8 +121,10 @@ impl OtlpSink { impl OtlpSink { pub fn new_for_bench(mapper: Arc) -> Self { Self { - queue: Arc::new(DedupQueue::new()), + queue: Arc::new(OverrideQueue::new()), + metrics_queue: Arc::new(OverrideQueue::new()), replaced_total: Counter::new("bench_replaced", "bench").unwrap(), + metrics_replaced_total: Counter::new("bench_metrics_replaced", "bench").unwrap(), mapper, } } @@ -106,6 +135,10 @@ impl OtlpSink { pub fn pop_for_bench(&self) -> 
Option<(EventContext, CollectorEvent)> { self.queue.pop().map(|(_key, value)| value) } + + pub fn pop_metric_for_bench(&self) -> Option<(EventContext, SensorHealthData)> { + self.metrics_queue.pop().map(|(_key, value)| value) + } } impl DataSink for OtlpSink { @@ -114,7 +147,18 @@ impl DataSink for OtlpSink { } fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { - if !is_otlp_relevant(event) { + if let CollectorEvent::Metric(sample) = event { + let key = format!("{}|{}", context.endpoint_key, sample.key); + if self + .metrics_queue + .save_latest(key, (context.clone(), (**sample).clone())) + { + self.metrics_replaced_total.inc(); + } + return; + } + + if !is_otlp_log_relevant(event) { return; } @@ -198,23 +242,53 @@ mod tests { } #[test] - fn is_otlp_relevant_excludes_metric_events() { - assert!(!is_otlp_relevant(&metric_event())); - assert!(!is_otlp_relevant(&CollectorEvent::MetricCollectionStart)); - assert!(!is_otlp_relevant(&CollectorEvent::MetricCollectionEnd)); + fn is_otlp_log_relevant_excludes_metric_events() { + assert!(!is_otlp_log_relevant(&metric_event())); + assert!(!is_otlp_log_relevant( + &CollectorEvent::MetricCollectionStart + )); + assert!(!is_otlp_log_relevant(&CollectorEvent::MetricCollectionEnd)); } #[test] - fn is_otlp_relevant_includes_log_events() { - assert!(is_otlp_relevant(&log_event("OpenBMC.0.1.Test", "[]"))); + fn is_otlp_log_relevant_includes_log_events() { + assert!(is_otlp_log_relevant(&log_event("OpenBMC.0.1.Test", "[]"))); } #[test] - fn metric_events_are_not_queued() { + fn metric_events_go_to_metrics_queue_not_logs_queue() { let sink = test_sink(); let ctx = test_context(); sink.handle_event(&ctx, &metric_event()); + assert!(sink.queue.pop().is_none(), "logs queue should be empty"); + assert!( + sink.metrics_queue.pop().is_some(), + "metrics queue should have the sample" + ); + } + + #[test] + fn metric_collection_sentinels_are_no_op() { + let sink = test_sink(); + let ctx = test_context(); + 
sink.handle_event(&ctx, &CollectorEvent::MetricCollectionStart); + sink.handle_event(&ctx, &CollectorEvent::MetricCollectionEnd); assert!(sink.queue.pop().is_none()); + assert!(sink.metrics_queue.pop().is_none()); + } + + #[test] + fn metric_events_dedup_by_sample_key() { + let sink = test_sink(); + let ctx = test_context(); + sink.handle_event(&ctx, &metric_event()); + sink.handle_event(&ctx, &metric_event()); + let mut count = 0; + while sink.metrics_queue.pop().is_some() { + count += 1; + } + assert_eq!(count, 1, "same key should dedup to one entry"); + assert_eq!(sink.metrics_replaced_total.get() as u64, 1); } #[test] From c8e3c83b221b971603ab55a726bd34fb8df31e46 Mon Sep 17 00:00:00 2001 From: mkoci Date: Thu, 14 May 2026 14:52:59 -0400 Subject: [PATCH 11/30] fix(health): adapt nvos streaming to metadata stack Signed-off-by: mkoci --- .../opentelemetry/proto/metrics/v1/metrics.proto | 6 +++--- crates/health/src/collectors/nvue/gnmi/client.rs | 2 ++ .../src/collectors/nvue/gnmi/sample_processor.rs | 2 ++ crates/health/src/discovery/context.rs | 2 +- crates/health/src/discovery/spawn.rs | 2 +- crates/health/src/sink/otlp.rs | 14 +++++++------- 6 files changed, 16 insertions(+), 12 deletions(-) diff --git a/crates/health/proto/opentelemetry/proto/metrics/v1/metrics.proto b/crates/health/proto/opentelemetry/proto/metrics/v1/metrics.proto index 00c5112ce8..e8587fb54e 100644 --- a/crates/health/proto/opentelemetry/proto/metrics/v1/metrics.proto +++ b/crates/health/proto/opentelemetry/proto/metrics/v1/metrics.proto @@ -533,7 +533,7 @@ message ExponentialHistogramDataPoint { // doing so. This is specifically to enforce compatibility w/ OpenMetrics, // see: https://github.com/prometheus/OpenMetrics/blob/v1.0.0/specification/OpenMetrics.md#histogram optional double sum = 5; - + // scale describes the resolution of the histogram. Boundaries are // located at powers of the base, where: // @@ -571,7 +571,7 @@ message ExponentialHistogramDataPoint { // of counts. 
message Buckets { // Offset is the bucket index of the first entry in the bucket_counts array. - // + // // Note: This uses a varint encoding as a simple form of compression. sint32 offset = 1; @@ -585,7 +585,7 @@ message ExponentialHistogramDataPoint { // especially zeros, so uint64 has been selected to ensure // varint encoding. repeated uint64 bucket_counts = 2; - } + } // Flags that apply to this specific data point. See DataPointFlags // for the available flags and their meaning. diff --git a/crates/health/src/collectors/nvue/gnmi/client.rs b/crates/health/src/collectors/nvue/gnmi/client.rs index 1c0537098f..28d4d02714 100644 --- a/crates/health/src/collectors/nvue/gnmi/client.rs +++ b/crates/health/src/collectors/nvue/gnmi/client.rs @@ -235,6 +235,7 @@ fn build_auth_metadata( /// Extract a string from a `TypedValue`, handling JSON-encoded bytes as well /// as native string values. +#[allow(deprecated)] pub fn typed_value_to_string(val: &proto::TypedValue) -> Option { use proto::typed_value::Value; match &val.value { @@ -256,6 +257,7 @@ pub fn typed_value_to_string(val: &proto::TypedValue) -> Option { /// Extract a float from a `TypedValue`, handling JSON-encoded bytes, native /// numeric values, and string representations. 
+#[allow(deprecated)] pub fn typed_value_to_f64(val: &proto::TypedValue) -> Option { use proto::typed_value::Value; match &val.value { diff --git a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs index 5d54f3a7c0..993752c2a2 100644 --- a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs +++ b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs @@ -35,6 +35,7 @@ pub(crate) struct GnmiSampleProcessor { } impl GnmiSampleProcessor { + #[allow(deprecated)] pub(crate) fn process_subscribe_response( &self, resp: &proto::SubscribeResponse, @@ -824,6 +825,7 @@ mod tests { } #[test] + #[allow(deprecated)] fn test_process_subscribe_response_error_increments_counter() { let proc = test_processor(); let metrics = test_stream_metrics(); diff --git a/crates/health/src/discovery/context.rs b/crates/health/src/discovery/context.rs index fb9f94bb60..5f4abe43f1 100644 --- a/crates/health/src/discovery/context.rs +++ b/crates/health/src/discovery/context.rs @@ -50,7 +50,7 @@ pub(super) enum CollectorKind { } impl CollectorKind { - pub(super) const ALL: [CollectorKind; 6] = [ + pub(super) const ALL: [CollectorKind; 7] = [ CollectorKind::Sensor, CollectorKind::Logs, CollectorKind::Firmware, diff --git a/crates/health/src/discovery/spawn.rs b/crates/health/src/discovery/spawn.rs index 483dd210db..44f4d6c2fe 100644 --- a/crates/health/src/discovery/spawn.rs +++ b/crates/health/src/discovery/spawn.rs @@ -347,7 +347,7 @@ pub(super) async fn spawn_collectors_for_endpoint( match spawn_gnmi_collector(endpoint, gnmi_cfg, collector_registry, data_sink.clone()) { Ok(handle) => { ctx.collectors - .insert(CollectorKind::NvueGnmi, key.clone(), handle); + .insert(CollectorKind::NvueGnmi, key.clone().into(), handle); tracing::info!( endpoint_key = %key, total_nvue_gnmi_collectors = ctx.collectors.len(CollectorKind::NvueGnmi), diff --git a/crates/health/src/sink/otlp.rs b/crates/health/src/sink/otlp.rs index 
7bf775eacd..26c1a50909 100644 --- a/crates/health/src/sink/otlp.rs +++ b/crates/health/src/sink/otlp.rs @@ -19,8 +19,8 @@ use std::sync::Arc; use prometheus::Counter; +use super::dedup_queue::DedupQueue; use super::event_mapper::RedfishEventMapper; -use super::override_queue::OverrideQueue; use super::{CollectorEvent, DataSink, EventContext, SensorHealthData}; use crate::HealthError; use crate::config::OtlpSinkConfig; @@ -28,8 +28,8 @@ use crate::metrics::MetricsManager; use crate::otlp::drain::OtlpDrainTask; use crate::otlp::metrics_drain::OtlpMetricsDrainTask; -pub(crate) type OtlpQueue = OverrideQueue; -pub(crate) type OtlpMetricsQueue = OverrideQueue; +pub(crate) type OtlpQueue = DedupQueue; +pub(crate) type OtlpMetricsQueue = DedupQueue; #[cfg(not(feature = "bench-hooks"))] pub(crate) struct OtlpSink { @@ -71,8 +71,8 @@ impl OtlpSink { HealthError::GenericError(format!("otlp sink requires active tokio runtime: {e}")) })?; - let queue: Arc = Arc::new(OverrideQueue::new()); - let metrics_queue: Arc = Arc::new(OverrideQueue::new()); + let queue: Arc = Arc::new(DedupQueue::new()); + let metrics_queue: Arc = Arc::new(DedupQueue::new()); let replaced_total = Counter::new( format!("{prefix}_otlp_sink_replaced_total"), @@ -121,8 +121,8 @@ impl OtlpSink { impl OtlpSink { pub fn new_for_bench(mapper: Arc) -> Self { Self { - queue: Arc::new(OverrideQueue::new()), - metrics_queue: Arc::new(OverrideQueue::new()), + queue: Arc::new(DedupQueue::new()), + metrics_queue: Arc::new(DedupQueue::new()), replaced_total: Counter::new("bench_replaced", "bench").unwrap(), metrics_replaced_total: Counter::new("bench_metrics_replaced", "bench").unwrap(), mapper, From 3aafde5551ab8011fb0ddf1915f79cee26ff3f40 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Fri, 15 May 2026 02:34:56 +0000 Subject: [PATCH 12/30] fix(health): add support for positional metadata in telemetry Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- 
crates/health/src/api_client.rs | 110 ++++++++++++++++-- .../collectors/nvue/gnmi/sample_processor.rs | 97 +++++++++++++++ crates/health/src/config.rs | 46 ++++++++ crates/health/src/otlp/convert.rs | 10 +- crates/health/src/sink/prometheus.rs | 10 +- docs/architecture/health_aggregation.md | 4 +- 6 files changed, 260 insertions(+), 17 deletions(-) diff --git a/crates/health/src/api_client.rs b/crates/health/src/api_client.rs index b49c78310b..f8832cc894 100644 --- a/crates/health/src/api_client.rs +++ b/crates/health/src/api_client.rs @@ -15,6 +15,7 @@ * limitations under the License. */ +use std::collections::HashMap; use std::convert::TryFrom; use std::net::IpAddr; use std::str::FromStr; @@ -68,6 +69,34 @@ impl CredentialProvider for ApiCredentialProvider { } } +fn machine_slot_number( + machine: &rpc::forge::Machine, + position: Option<&rpc::forge::MachinePositionInfo>, +) -> Option { + position + .and_then(|position| position.physical_slot_number) + .or_else(|| { + machine + .placement_in_rack + .as_ref() + .and_then(|placement| placement.slot_number) + }) +} + +fn machine_tray_index( + machine: &rpc::forge::Machine, + position: Option<&rpc::forge::MachinePositionInfo>, +) -> Option { + position + .and_then(|position| position.compute_tray_index) + .or_else(|| { + machine + .placement_in_rack + .as_ref() + .and_then(|placement| placement.tray_index) + }) +} + impl ApiClientWrapper { pub fn new(root_ca: String, client_cert: String, client_key: String, api_url: &Url) -> Self { let client_config = ForgeClientConfig::new( @@ -118,13 +147,15 @@ impl ApiClientWrapper { .find_machines_by_ids(request) .await .map_err(HealthError::ApiInvocationError)?; + let positions = self.fetch_machine_position_info(ids_chunk).await; tracing::debug!( "Fetched details for {} machines with chunk size of 100", machines.machines.len(), ); for machine in machines.machines { - match self.extract_machine_endpoint(&machine).await { + let position = machine.id.as_ref().and_then(|id| 
positions.get(id)); + match self.extract_machine_endpoint(&machine, position).await { Ok(endpoint) => endpoints.push(Arc::new(endpoint)), Err(error) => tracing::warn!( ?machine, @@ -138,6 +169,30 @@ impl ApiClientWrapper { Ok(endpoints) } + async fn fetch_machine_position_info( + &self, + machine_ids: &[carbide_uuid::machine::MachineId], + ) -> HashMap { + let request = rpc::forge::MachinePositionQuery { + machine_ids: machine_ids.to_vec(), + }; + + match self.client.get_machine_position_info(request).await { + Ok(response) => response + .machine_position_info + .into_iter() + .filter_map(|info| info.machine_id.map(|id| (id, info))) + .collect(), + Err(error) => { + tracing::warn!( + ?error, + "failed to fetch machine position info; falling back to machine placement metadata" + ); + HashMap::new() + } + } + } + async fn fetch_switch_endpoints(&self) -> Vec> { let switch_request = rpc::forge::SwitchQuery { name: None, @@ -203,6 +258,7 @@ impl ApiClientWrapper { async fn extract_machine_endpoint( &self, machine: &rpc::forge::Machine, + position: Option<&rpc::forge::MachinePositionInfo>, ) -> Result { let Some(bmc_info) = &machine.bmc_info else { return Err(HealthError::GenericError( @@ -218,14 +274,8 @@ impl ApiClientWrapper { .as_ref() .and_then(|info| info.dmi_data.as_ref()) .map(|dmi| dmi.chassis_serial.clone()), - slot_number: machine - .placement_in_rack - .as_ref() - .and_then(|placement| placement.slot_number), - tray_index: machine - .placement_in_rack - .as_ref() - .and_then(|placement| placement.tray_index), + slot_number: machine_slot_number(machine, position), + tray_index: machine_tray_index(machine, position), nvlink_domain_uuid: machine .nvlink_info .as_ref() @@ -252,7 +302,7 @@ impl ApiClientWrapper { .as_ref() .map(|config| config.name.clone()) .ok_or(HealthError::GenericError( - "Switch endpont does not have serial".to_string(), + "switch endpoint does not have serial".to_string(), ))?; self.endpoint_with_auth( @@ -269,7 +319,7 @@ impl 
ApiClientWrapper { .as_ref() .and_then(|placement| placement.tray_index), })), - None, + switch.rack_id.clone(), ) .await } @@ -470,3 +520,41 @@ impl From for BmcCredentials { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn machine_position_info_takes_precedence_over_machine_placement() { + let machine = rpc::forge::Machine { + placement_in_rack: Some(rpc::forge::PlacementInRack { + slot_number: Some(2), + tray_index: Some(1), + }), + ..Default::default() + }; + let position = rpc::forge::MachinePositionInfo { + physical_slot_number: Some(11), + compute_tray_index: Some(4), + ..Default::default() + }; + + assert_eq!(machine_slot_number(&machine, Some(&position)), Some(11)); + assert_eq!(machine_tray_index(&machine, Some(&position)), Some(4)); + } + + #[test] + fn machine_placement_is_fallback_when_position_info_is_absent() { + let machine = rpc::forge::Machine { + placement_in_rack: Some(rpc::forge::PlacementInRack { + slot_number: Some(2), + tray_index: Some(1), + }), + ..Default::default() + }; + + assert_eq!(machine_slot_number(&machine, None), Some(2)); + assert_eq!(machine_tray_index(&machine, None), Some(1)); + } +} diff --git a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs index 993752c2a2..378fc0d7f6 100644 --- a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs +++ b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs @@ -324,9 +324,33 @@ pub(crate) fn now_unix_secs() -> f64 { #[cfg(test)] mod tests { use std::collections::HashMap; + use std::sync::{Arc, Mutex}; + + use carbide_uuid::rack::RackId; + use carbide_uuid::switch::{SwitchId, SwitchIdSource, SwitchType}; + + use crate::endpoint::{EndpointMetadata, SwitchData}; use super::*; + #[derive(Default)] + struct CapturingSink { + events: Mutex>, + } + + impl DataSink for CapturingSink { + fn sink_type(&self) -> &'static str { + "capturing_sink" + } + + fn handle_event(&self, context: 
&EventContext, event: &CollectorEvent) { + self.events + .lock() + .expect("lock poisoned") + .push((context.clone(), event.clone())); + } + } + #[test] fn test_leaf_matches() { let elems: Vec = ["interfaces", "interface", "state", "oper-status"] @@ -440,6 +464,13 @@ mod tests { } } + fn test_switch_id(label: &str) -> SwitchId { + let mut hash = [0u8; 32]; + let bytes = label.as_bytes(); + hash[..bytes.len().min(32)].copy_from_slice(&bytes[..bytes.len().min(32)]); + SwitchId::new(SwitchIdSource::Tpm, hash, SwitchType::NvLink) + } + #[test] fn test_process_notification_interface_oper_status() { let proc = test_processor(); @@ -470,6 +501,72 @@ mod tests { assert_eq!(count, 1); } + #[test] + fn emitted_metrics_preserve_switch_position_context() { + use std::str::FromStr; + + use mac_address::MacAddress; + + use crate::endpoint::BmcAddr; + + let sink = Arc::new(CapturingSink::default()); + let switch_id = test_switch_id("switch-a"); + let proc = GnmiSampleProcessor { + data_sink: Some(sink.clone()), + event_context: EventContext { + endpoint_key: "aa:bb:cc:dd:ee:ff".to_string(), + addr: BmcAddr { + ip: "10.0.0.1".parse().unwrap(), + port: None, + mac: MacAddress::from_str("AA:BB:CC:DD:EE:FF").unwrap(), + }, + collector_type: NVUE_GNMI_SAMPLE_STREAM_ID, + metadata: Some(EndpointMetadata::Switch(SwitchData { + id: Some(switch_id), + serial: "SN-SWITCH-001".to_string(), + slot_number: Some(7), + tray_index: Some(3), + })), + rack_id: Some(RackId::new("RACK_2")), + }, + switch_id: "SN-SWITCH-001".to_string(), + }; + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "nvl4")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("state", &[]), + make_path_elem("oper-status", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("UP")), + 
..Default::default() + }], + ..Default::default() + }; + + let count = proc.process_notification(¬ification); + assert_eq!(count, 1); + + let events = sink.events.lock().expect("lock poisoned"); + assert_eq!(events.len(), 1); + let (context, event) = &events[0]; + assert_eq!(context.switch_id(), Some(switch_id)); + assert_eq!(context.switch_slot_number(), Some(7)); + assert_eq!(context.switch_tray_index(), Some(3)); + assert_eq!(context.rack_id().map(RackId::as_str), Some("RACK_2")); + assert!(matches!(event, CollectorEvent::Metric(_))); + } + #[test] fn test_process_notification_component_temperature() { let proc = test_processor(); diff --git a/crates/health/src/config.rs b/crates/health/src/config.rs index f853550aad..6eb8baec14 100644 --- a/crates/health/src/config.rs +++ b/crates/health/src/config.rs @@ -111,7 +111,9 @@ pub struct StaticBmcEndpoint { pub struct StaticMachineEndpoint { pub id: String, pub serial: Option, + #[serde(alias = "physical_slot_number")] pub slot_number: Option, + #[serde(alias = "compute_tray_index")] pub tray_index: Option, pub nvlink_domain_uuid: Option, } @@ -128,7 +130,9 @@ pub struct StaticPowerShelfEndpoint { pub struct StaticSwitchEndpoint { pub id: Option, pub serial: Option, + #[serde(alias = "physical_slot_number")] pub slot_number: Option, + #[serde(alias = "compute_tray_index")] pub tray_index: Option, } @@ -1489,6 +1493,48 @@ machine = { id = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", ); } + #[test] + fn test_static_endpoints_accept_position_field_aliases() { + let toml_content = r#" +[endpoint_sources.carbide_api] +enabled = false + +[[endpoint_sources.static_bmc_endpoints]] +ip = "10.0.1.2" +mac = "11:22:33:44:55:11" +username = "admin" +password = "pass" +machine = { id = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", physical_slot_number = 15, compute_tray_index = 5 } + +[[endpoint_sources.static_bmc_endpoints]] +ip = "10.0.1.1" +mac = "11:22:33:44:55:66" +username = "cumulus" 
+password = "pass" +switch = { serial = "SN-SW-001", physical_slot_number = 7, compute_tray_index = 3 } +"#; + + let config: Config = Figment::new() + .merge(Serialized::defaults(Config::default())) + .merge(Toml::string(toml_content)) + .extract() + .expect("failed to parse static endpoint config"); + + let machine = config.endpoint_sources.static_bmc_endpoints[0] + .machine + .as_ref() + .expect("machine metadata"); + assert_eq!(machine.slot_number, Some(15)); + assert_eq!(machine.tray_index, Some(5)); + + let switch = config.endpoint_sources.static_bmc_endpoints[1] + .switch + .as_ref() + .expect("switch metadata"); + assert_eq!(switch.slot_number, Some(7)); + assert_eq!(switch.tray_index, Some(3)); + } + #[test] fn test_static_endpoint_rejects_multiple_identity_types() { let toml_content = r#" diff --git a/crates/health/src/otlp/convert.rs b/crates/health/src/otlp/convert.rs index d7d85798f3..257dd345ce 100644 --- a/crates/health/src/otlp/convert.rs +++ b/crates/health/src/otlp/convert.rs @@ -79,6 +79,9 @@ fn resource_attributes(context: &EventContext) -> Vec { if let Some(switch_id) = context.switch_id() { attrs.push(kv("switch.id", switch_id.to_string())); } + if let Some(rack_id) = context.rack_id() { + attrs.push(kv("rack.id", rack_id.to_string())); + } if let Some(slot) = context.slot_number() { attrs.push(int_kv("machine.slot_number", i64::from(slot))); } @@ -265,6 +268,7 @@ mod tests { use std::str::FromStr; use carbide_uuid::nvlink::NvLinkDomainId; + use carbide_uuid::rack::RackId; use carbide_uuid::switch::{SwitchId, SwitchIdSource, SwitchType}; use mac_address::MacAddress; @@ -337,11 +341,12 @@ mod tests { tray_index: Some(5), nvlink_domain_uuid: Some(domain_uuid), })), - rack_id: None, + rack_id: Some(RackId::new("RACK_1")), }; let attrs = resource_attributes(&context); + assert_eq!(attr_value(&attrs, "rack.id"), Some("RACK_1")); assert_eq!(attr_int_value(&attrs, "machine.slot_number"), Some(15)); assert_eq!(attr_int_value(&attrs, 
"machine.tray_index"), Some(5)); assert_eq!( @@ -368,7 +373,7 @@ mod tests { slot_number: Some(7), tray_index: Some(3), })), - rack_id: None, + rack_id: Some(RackId::new("RACK_2")), }; let attrs = resource_attributes(&context); @@ -377,6 +382,7 @@ mod tests { attr_value(&attrs, "switch.id"), Some(switch_id_attr.as_str()) ); + assert_eq!(attr_value(&attrs, "rack.id"), Some("RACK_2")); assert_eq!(attr_int_value(&attrs, "switch.slot_number"), Some(7)); assert_eq!(attr_int_value(&attrs, "switch.tray_index"), Some(3)); } diff --git a/crates/health/src/sink/prometheus.rs b/crates/health/src/sink/prometheus.rs index e1c70aae6f..5eb3108851 100644 --- a/crates/health/src/sink/prometheus.rs +++ b/crates/health/src/sink/prometheus.rs @@ -103,6 +103,9 @@ impl PrometheusSink { if let Some(serial) = context.serial_number() { labels.push((Cow::Borrowed("serial_number"), serial.to_string())); } + if let Some(rack_id) = context.rack_id() { + labels.push((Cow::Borrowed("rack_id"), rack_id.to_string())); + } if let Some(slot) = context.slot_number() { labels.push((Cow::Borrowed("machine_slot_number"), slot.to_string())); } @@ -237,6 +240,7 @@ mod tests { use std::str::FromStr; use carbide_uuid::nvlink::NvLinkDomainId; + use carbide_uuid::rack::RackId; use carbide_uuid::switch::{SwitchId, SwitchIdSource, SwitchType}; use mac_address::MacAddress; @@ -269,7 +273,7 @@ mod tests { tray_index: Some(5), nvlink_domain_uuid: Some(NvLinkDomainId::nil()), })), - rack_id: None, + rack_id: Some(RackId::new("RACK_1")), }; let labels = PrometheusSink::stream_static_labels(&context); @@ -284,6 +288,7 @@ mod tests { Some("fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0") ); assert_eq!(label_value("serial_number"), Some("MN-001")); + assert_eq!(label_value("rack_id"), Some("RACK_1")); assert_eq!(label_value("machine_slot_number"), Some("15")); assert_eq!(label_value("machine_tray_index"), Some("5")); assert_eq!( @@ -310,7 +315,7 @@ mod tests { slot_number: Some(7), tray_index: Some(3), 
})), - rack_id: None, + rack_id: Some(RackId::new("RACK_2")), }; let labels = PrometheusSink::stream_static_labels(&context); @@ -322,6 +327,7 @@ mod tests { assert_eq!(label_value("switch_id"), Some(switch_id_label.as_str())); assert_eq!(label_value("serial_number"), Some("SN-SWITCH-001")); + assert_eq!(label_value("rack_id"), Some("RACK_2")); assert_eq!(label_value("switch_slot_number"), Some("7")); assert_eq!(label_value("switch_tray_index"), Some("3")); } diff --git a/docs/architecture/health_aggregation.md b/docs/architecture/health_aggregation.md index c5100b2dca..bdccbdb2ef 100644 --- a/docs/architecture/health_aggregation.md +++ b/docs/architecture/health_aggregation.md @@ -268,8 +268,8 @@ ranges or by interpreting the `health_ok` values provided by BMCs. For production deployments, `carbide-hw-health` discovers machine, switch, and power-shelf BMC endpoints from Carbide API via `[endpoint_sources.carbide_api]`. Machine endpoints carry the inventory metadata needed to interpret hardware health in fleet context, including machine ID, serial number, rack ID, rack placement, and NVLink domain UUID when present. Switch endpoints carry switch ID, serial number, and rack placement when present. Local and test deployments can instead configure explicit machine, switch, or power-shelf identity with `[[endpoint_sources.static_bmc_endpoints]]`; static machine endpoints can include the same serial number, rack placement, and NVLink domain UUID metadata, static switch endpoints can include serial number and rack placement metadata, and all static endpoints can provide `rack_id` when rack-level rollups are needed. 
The publishing sinks expose that inventory context using the conventions of the target backend: -- `[sinks.prometheus]` adds machine metadata as metric labels named `machine_id`, `serial_number`, `machine_slot_number`, `machine_tray_index`, and `nvlink_domain_uuid`; switch metadata uses `switch_id`, `serial_number`, `switch_slot_number`, and `switch_tray_index`. -- `[sinks.otlp]` adds machine metadata as OTLP resource attributes named `machine.id`, integer `machine.slot_number`, integer `machine.tray_index`, and `nvlink.domain.uuid`; switch metadata uses `switch.id`, integer `switch.slot_number`, and integer `switch.tray_index`. +- `[sinks.prometheus]` adds machine metadata as metric labels named `machine_id`, `serial_number`, `rack_id`, `machine_slot_number`, `machine_tray_index`, and `nvlink_domain_uuid`; switch metadata uses `switch_id`, `serial_number`, `rack_id`, `switch_slot_number`, and `switch_tray_index`. +- `[sinks.otlp]` adds machine metadata as OTLP resource attributes named `machine.id`, `rack.id`, integer `machine.slot_number`, integer `machine.tray_index`, and `nvlink.domain.uuid`; switch metadata uses `switch.id`, `rack.id`, integer `switch.slot_number`, and integer `switch.tray_index`. - `[sinks.health_report]`, `[sinks.rack_health_report]`, `[sinks.switch_health_report]`, and `[sinks.power_shelf_health_report]` use the same event context when submitting assessed health reports back to Carbide API. The persisted `HealthReport` and `HealthProbeAlert` schemas remain the probe success/alert model described above. 
### BMC inventory monitoring From 37371fbcc5fae0ffe2b76ac42932216e2e716cec Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Fri, 15 May 2026 15:06:38 -0400 Subject: [PATCH 13/30] feat(health): restore nvue gnmi on-change events Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/example/config.example.toml | 2 + crates/health/src/api_client.rs | 4 +- .../health/src/collectors/nvue/gnmi/client.rs | 126 ++++ crates/health/src/collectors/nvue/gnmi/mod.rs | 1 + .../nvue/gnmi/on_change_processor.rs | 570 ++++++++++++++++++ .../src/collectors/nvue/gnmi/subscriber.rs | 169 +++++- crates/health/src/config.rs | 44 ++ 7 files changed, 906 insertions(+), 10 deletions(-) create mode 100644 crates/health/src/collectors/nvue/gnmi/on_change_processor.rs diff --git a/crates/health/example/config.example.toml b/crates/health/example/config.example.toml index d57a5e1b3a..ab47ffa48e 100644 --- a/crates/health/example/config.example.toml +++ b/crates/health/example/config.example.toml @@ -164,6 +164,8 @@ interfaces_enabled = true gnmi_port = 9339 sample_interval = "5m" request_timeout = "30s" +# gNMI ON_CHANGE subscription for system events +system_events_enabled = true [collectors.nvue.gnmi.paths] components_enabled = true diff --git a/crates/health/src/api_client.rs b/crates/health/src/api_client.rs index f8832cc894..f9271fd455 100644 --- a/crates/health/src/api_client.rs +++ b/crates/health/src/api_client.rs @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -buse std::collections::HashMap; +use std::collections::HashMap; use std::convert::TryFrom; use std::net::IpAddr; use std::str::FromStr; @@ -348,7 +348,7 @@ impl ApiClientWrapper { id: power_shelf.id, serial, })), - None, + power_shelf.rack_id.clone(), ) .await } diff --git a/crates/health/src/collectors/nvue/gnmi/client.rs b/crates/health/src/collectors/nvue/gnmi/client.rs index 28d4d02714..336bd30daa 100644 --- a/crates/health/src/collectors/nvue/gnmi/client.rs +++ b/crates/health/src/collectors/nvue/gnmi/client.rs @@ -183,6 +183,77 @@ impl GnmiClient { Ok(response.into_inner()) } + + /// open a gNMI ON_CHANGE streaming subscription + pub async fn subscribe_on_change( + &self, + prefix: &Path, + paths: &[Path], + ) -> Result, HealthError> { + let mut client = self.connect().await?; + + let subscribe_request = build_on_change_subscribe_request(prefix, paths); + + let auth = build_auth_metadata(&self.username, &self.password)?; + let stream = tokio_stream::once(subscribe_request); + let request = Request::from_parts(auth, Extensions::default(), stream); + + let response = client.subscribe(request).await.map_err(|e| { + HealthError::GnmiError(format!( + "switch {}: subscribe_on_change RPC failed: {e}", + self.switch_id + )) + })?; + + tracing::debug!( + switch_id = %self.switch_id, + "gNMI ON_CHANGE stream opened" + ); + + Ok(response.into_inner()) + } +} + +pub(crate) fn system_events_prefix() -> Path { + Path { + target: "nvos".to_string(), + elem: vec![PathElem { + name: "system-events".to_string(), + key: Default::default(), + }], + ..Default::default() + } +} + +/// gNMI path for ON_CHANGE system event subscriptions. An empty path subscribes +/// to all events below the `system-events` prefix. 
+pub(crate) fn system_events_subscribe_path() -> Vec { + vec![Path::default()] +} + +fn build_on_change_subscribe_request(prefix: &Path, paths: &[Path]) -> SubscribeRequest { + let subscription_list = SubscriptionList { + prefix: Some(prefix.clone()), + subscription: paths + .iter() + .map(|path| Subscription { + path: Some(path.clone()), + mode: SubscriptionMode::OnChange.into(), + ..Default::default() + }) + .collect(), + mode: SubscriptionListMode::Stream.into(), + encoding: Encoding::Json.into(), + updates_only: true, + ..Default::default() + }; + + SubscribeRequest { + request: Some(proto::subscribe_request::Request::Subscribe( + subscription_list, + )), + extension: vec![], + } } fn build_sample_subscribe_request(paths: &[Path], sample_interval_nanos: u64) -> SubscribeRequest { @@ -467,4 +538,59 @@ mod tests { assert!(sub.path.is_some(), "each subscription must have a path"); } } + + #[test] + fn test_system_events_prefix() { + let prefix = system_events_prefix(); + assert_eq!(prefix.target, "nvos"); + assert_eq!(prefix.elem.len(), 1); + assert_eq!(prefix.elem[0].name, "system-events"); + } + + #[test] + fn test_system_events_subscribe_path() { + let paths = system_events_subscribe_path(); + assert_eq!(paths.len(), 1); + assert!( + paths[0].elem.is_empty(), + "empty path subscribes to all events under prefix" + ); + } + + #[test] + fn test_build_on_change_subscribe_request() { + let prefix = system_events_prefix(); + let paths = system_events_subscribe_path(); + + let req = build_on_change_subscribe_request(&prefix, &paths); + + let sub_list = match req.request { + Some(proto::subscribe_request::Request::Subscribe(sl)) => sl, + _ => panic!("expected Subscribe variant"), + }; + + assert_eq!( + sub_list.mode, + i32::from(SubscriptionListMode::Stream), + "must use Stream mode" + ); + assert_eq!( + sub_list.encoding, + i32::from(Encoding::Json), + "encoding must be JSON" + ); + assert!(sub_list.updates_only, "ON_CHANGE must use updates_only"); + + let req_prefix 
= sub_list.prefix.expect("prefix must be set"); + assert_eq!(req_prefix.target, "nvos"); + assert_eq!(req_prefix.elem.len(), 1); + assert_eq!(req_prefix.elem[0].name, "system-events"); + + assert_eq!(sub_list.subscription.len(), 1); + assert_eq!( + sub_list.subscription[0].mode, + i32::from(SubscriptionMode::OnChange), + "subscription must use OnChange mode" + ); + } } diff --git a/crates/health/src/collectors/nvue/gnmi/mod.rs b/crates/health/src/collectors/nvue/gnmi/mod.rs index 7a7b5b7338..3ce40c9d3d 100644 --- a/crates/health/src/collectors/nvue/gnmi/mod.rs +++ b/crates/health/src/collectors/nvue/gnmi/mod.rs @@ -16,6 +16,7 @@ */ pub(crate) mod client; +pub(crate) mod on_change_processor; pub(crate) mod sample_processor; pub(crate) mod subscriber; diff --git a/crates/health/src/collectors/nvue/gnmi/on_change_processor.rs b/crates/health/src/collectors/nvue/gnmi/on_change_processor.rs new file mode 100644 index 0000000000..376a7fcb7b --- /dev/null +++ b/crates/health/src/collectors/nvue/gnmi/on_change_processor.rs @@ -0,0 +1,570 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +use std::borrow::Cow; +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; +use std::time::Instant; + +use prometheus::{CounterVec, Gauge, Opts}; + +use super::client::typed_value_to_string; +use super::proto::{self, PathElem}; +use super::sample_processor::now_unix_secs; +use super::subscriber::GnmiStreamMetrics; +use crate::HealthError; +use crate::sink::{CollectorEvent, DataSink, EventContext, SensorHealthData}; + +type ParsedRow = HashMap; +type TableSnapshot = HashMap; + +pub(crate) const ON_CHANGE_STREAM_ID_SYSTEM_EVENTS: &str = "nvue_gnmi_events"; + +pub(crate) struct OnChangeStreamMetrics { + pub(crate) rows_total: CounterVec, + pub(crate) last_row_timestamp: Gauge, +} + +impl OnChangeStreamMetrics { + pub(crate) fn new( + registry: &prometheus::Registry, + prefix: &str, + stream_id: &str, + const_labels: HashMap, + ) -> Result { + let rows_total = CounterVec::new( + Opts::new( + format!("{prefix}_{stream_id}_total"), + "ON_CHANGE rows received by severity (field 'severity' if present)", + ) + .const_labels(const_labels.clone()), + &["severity"], + )?; + registry.register(Box::new(rows_total.clone()))?; + + let last_row_timestamp = Gauge::with_opts( + Opts::new( + format!("{prefix}_{stream_id}_last_timestamp"), + "Unix timestamp of most recent ON_CHANGE row", + ) + .const_labels(const_labels), + )?; + registry.register(Box::new(last_row_timestamp.clone()))?; + + Ok(Self { + rows_total, + last_row_timestamp, + }) + } +} + +pub(crate) struct GnmiOnChangeProcessor { + pub(crate) collector_name: String, + pub(crate) stream_metrics: OnChangeStreamMetrics, + pub(crate) data_sink: Option>, + pub(crate) event_context: EventContext, + pub(crate) switch_id: String, + previous_snapshot: Mutex, +} + +impl GnmiOnChangeProcessor { + pub(crate) fn new( + collector_name: String, + stream_metrics: OnChangeStreamMetrics, + data_sink: Option>, + event_context: EventContext, + switch_id: String, + ) -> Self { + Self { + collector_name, + stream_metrics, + 
data_sink, + event_context, + switch_id, + previous_snapshot: Mutex::new(HashMap::new()), + } + } + + #[allow(deprecated)] + pub(crate) fn process_subscribe_response( + &self, + resp: &proto::SubscribeResponse, + stream_metrics: &GnmiStreamMetrics, + ) { + let notification = match &resp.response { + Some(proto::subscribe_response::Response::Update(n)) => n, + Some(proto::subscribe_response::Response::SyncResponse(_)) => return, + Some(proto::subscribe_response::Response::Error(e)) => { + stream_metrics.stream_errors_total.inc(); + tracing::warn!( + code = e.code, + message = %e.message, + stream = %self.collector_name, + "nvue_gnmi ON_CHANGE: server error in stream" + ); + return; + } + None => return, + }; + + stream_metrics.notifications_received_total.inc(); + stream_metrics + .last_notification_timestamp + .set(now_unix_secs()); + + let start = Instant::now(); + let entity_count = self.process_notification(notification); + stream_metrics + .notification_processing_seconds + .observe(start.elapsed().as_secs_f64()); + stream_metrics.monitored_entities.set(entity_count as f64); + } + + fn process_notification(&self, notification: &proto::Notification) -> usize { + let prefix_elems: &[PathElem] = notification + .prefix + .as_ref() + .map(|p| p.elem.as_slice()) + .unwrap_or_default(); + + let mut current: TableSnapshot = HashMap::new(); + + for update in ¬ification.update { + let val = match update.val.as_ref() { + Some(v) => v, + None => continue, + }; + + let update_elems: &[PathElem] = update + .path + .as_ref() + .map(|p| p.elem.as_slice()) + .unwrap_or_default(); + + let combined: Vec<&PathElem> = prefix_elems.iter().chain(update_elems.iter()).collect(); + + let Some(instance_key) = find_instance_key(&combined) else { + continue; + }; + let Some(leaf_elem) = combined.last() else { + continue; + }; + + let value = typed_value_to_string(val).unwrap_or_default(); + current + .entry(instance_key.to_string()) + .or_default() + .insert(leaf_elem.name.clone(), value); 
+ } + + let mut previous = match self.previous_snapshot.lock() { + Ok(guard) => guard, + Err(poisoned) => poisoned.into_inner(), + }; + for (instance_id, row) in ¤t { + let is_new_or_changed = previous.get(instance_id).map(|p| p != row).unwrap_or(true); + if is_new_or_changed { + self.emit_row_as_metric(instance_id, row); + } + } + + let entity_count = current.len(); + *previous = current; + entity_count + } + + fn emit_row_as_metric(&self, instance_id: &str, row: &ParsedRow) { + let severity = row.get("severity").map(String::as_str).unwrap_or("unknown"); + let text = row.get("text").map(String::as_str).unwrap_or(""); + + self.stream_metrics.last_row_timestamp.set(now_unix_secs()); + self.stream_metrics + .rows_total + .with_label_values(&[severity]) + .inc(); + + tracing::info!( + switch_id = %self.switch_id, + stream = %self.collector_name, + instance_id, + severity, + text, + "nvue_gnmi ON_CHANGE: row received" + ); + + let Some(sink) = &self.data_sink else { return }; + + let key = format!("{}:{}", self.collector_name, instance_id); + let mut labels = vec![ + (Cow::Borrowed("instance_id"), instance_id.to_string()), + (Cow::Borrowed("text"), text.to_string()), + ]; + for (key, value) in row { + if key != "text" { + labels.push((Cow::Owned(key.clone()), value.clone())); + } + } + + sink.handle_event( + &self.event_context, + &CollectorEvent::Metric(Box::new(SensorHealthData { + key, + name: self.collector_name.clone(), + metric_type: "on_change_row".to_string(), + unit: "severity".to_string(), + value: severity_to_f64(Some(severity)), + labels, + context: None, + })), + ); + } +} + +fn find_instance_key<'a>(elems: &[&'a PathElem]) -> Option<&'a str> { + elems + .iter() + .find(|e| !e.key.is_empty()) + .and_then(|e| e.key.values().next().map(String::as_str)) +} + +fn severity_to_f64(severity: Option<&str>) -> f64 { + match severity { + Some(s) if s.eq_ignore_ascii_case("informational") => 1.0, + Some(s) if s.eq_ignore_ascii_case("warning") => 2.0, + Some(s) if 
s.eq_ignore_ascii_case("error") => 3.0, + Some(s) if s.eq_ignore_ascii_case("critical") => 4.0, + _ => 0.0, + } +} + +#[cfg(test)] +mod tests { + use std::str::FromStr; + + use carbide_uuid::rack::RackId; + use carbide_uuid::switch::{SwitchId, SwitchIdSource, SwitchType}; + use mac_address::MacAddress; + + use crate::endpoint::{BmcAddr, EndpointMetadata, SwitchData}; + + use super::*; + + const TEST_COLLECTOR_NAME: &str = "nvue_gnmi_system_events"; + + #[derive(Default)] + struct CapturingSink { + events: Mutex>, + } + + impl DataSink for CapturingSink { + fn sink_type(&self) -> &'static str { + "capturing_sink" + } + + fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { + self.events + .lock() + .expect("lock poisoned") + .push((context.clone(), event.clone())); + } + } + + fn test_labels() -> HashMap { + HashMap::from([( + "collector_type".to_string(), + ON_CHANGE_STREAM_ID_SYSTEM_EVENTS.to_string(), + )]) + } + + fn test_switch_id(label: &str) -> SwitchId { + let mut hash = [0u8; 32]; + let bytes = label.as_bytes(); + hash[..bytes.len().min(32)].copy_from_slice(&bytes[..bytes.len().min(32)]); + SwitchId::new(SwitchIdSource::Tpm, hash, SwitchType::NvLink) + } + + fn test_event_context(collector_type: &'static str) -> EventContext { + EventContext { + endpoint_key: "aa:bb:cc:dd:ee:ff".to_string(), + addr: BmcAddr { + ip: "10.0.0.1".parse().unwrap(), + port: None, + mac: MacAddress::from_str("AA:BB:CC:DD:EE:FF").unwrap(), + }, + collector_type, + metadata: None, + rack_id: None, + } + } + + fn test_processor(data_sink: Option>) -> GnmiOnChangeProcessor { + let registry = prometheus::Registry::new(); + let stream_metrics = + OnChangeStreamMetrics::new(®istry, "test", TEST_COLLECTOR_NAME, test_labels()) + .unwrap(); + GnmiOnChangeProcessor::new( + TEST_COLLECTOR_NAME.to_string(), + stream_metrics, + data_sink, + test_event_context(TEST_COLLECTOR_NAME), + "SN1234".to_string(), + ) + } + + fn make_path_elem(name: &str, keys: &[(&str, &str)]) -> 
PathElem { + PathElem { + name: name.to_string(), + key: keys + .iter() + .map(|(key, value)| (key.to_string(), value.to_string())) + .collect(), + } + } + + fn make_typed_value_string(value: &str) -> proto::TypedValue { + proto::TypedValue { + value: Some(proto::typed_value::Value::StringVal(value.to_string())), + } + } + + #[test] + fn test_find_instance_key() { + let elems = vec![ + make_path_elem("system-events", &[]), + make_path_elem("system-event", &[("event-id", "38")]), + make_path_elem("state", &[]), + make_path_elem("severity", &[]), + ]; + let refs: Vec<&PathElem> = elems.iter().collect(); + assert_eq!(find_instance_key(&refs), Some("38")); + } + + #[test] + fn test_find_instance_key_missing() { + let elems = [ + make_path_elem("system-events", &[]), + make_path_elem("state", &[]), + ]; + let refs: Vec<&PathElem> = elems.iter().collect(); + assert_eq!(find_instance_key(&refs), None); + } + + #[test] + fn test_severity_to_f64() { + assert_eq!(severity_to_f64(Some("informational")), 1.0); + assert_eq!(severity_to_f64(Some("warning")), 2.0); + assert_eq!(severity_to_f64(Some("error")), 3.0); + assert_eq!(severity_to_f64(Some("critical")), 4.0); + assert_eq!(severity_to_f64(Some("CRITICAL")), 4.0); + assert_eq!(severity_to_f64(Some("other")), 0.0); + assert_eq!(severity_to_f64(None), 0.0); + } + + #[test] + fn test_on_change_stream_metrics_duplicate_registration_fails() { + let registry = prometheus::Registry::new(); + let _ = OnChangeStreamMetrics::new(®istry, "test", "stream_a", test_labels()).unwrap(); + let result = OnChangeStreamMetrics::new(®istry, "test", "stream_a", test_labels()); + assert!(result.is_err()); + } + + #[test] + fn test_process_notification_severity_and_text() { + let processor = test_processor(None); + let notification = proto::Notification { + prefix: Some(proto::Path { + elem: vec![make_path_elem("system-events", &[])], + ..Default::default() + }), + update: vec![ + proto::Update { + path: Some(proto::Path { + elem: vec![ + 
make_path_elem("system-event", &[("event-id", "5")]), + make_path_elem("state", &[]), + make_path_elem("severity", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("critical")), + ..Default::default() + }, + proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("system-event", &[("event-id", "5")]), + make_path_elem("state", &[]), + make_path_elem("text", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("System fatal state detected")), + ..Default::default() + }, + ], + ..Default::default() + }; + + let count = processor.process_notification(¬ification); + assert_eq!(count, 1); + assert_eq!( + processor + .stream_metrics + .rows_total + .with_label_values(&["critical"]) + .get(), + 1.0 + ); + assert!(processor.stream_metrics.last_row_timestamp.get() > 0.0); + } + + #[test] + fn test_process_notification_snapshot_diff_no_duplicate_emit() { + let processor = test_processor(None); + let notification = proto::Notification { + prefix: Some(proto::Path { + elem: vec![make_path_elem("system-events", &[])], + ..Default::default() + }), + update: vec![ + proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("system-event", &[("event-id", "7")]), + make_path_elem("state", &[]), + make_path_elem("severity", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("error")), + ..Default::default() + }, + proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("system-event", &[("event-id", "7")]), + make_path_elem("state", &[]), + make_path_elem("text", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("same event")), + ..Default::default() + }, + ], + ..Default::default() + }; + + processor.process_notification(¬ification); + processor.process_notification(¬ification); + + assert_eq!( + processor + .stream_metrics + .rows_total + .with_label_values(&["error"]) + .get(), + 1.0 + ); + } + + #[test] + fn 
emitted_metrics_preserve_switch_position_context() { + let sink = Arc::new(CapturingSink::default()); + let switch_id = test_switch_id("switch-a"); + let registry = prometheus::Registry::new(); + let stream_metrics = + OnChangeStreamMetrics::new(®istry, "test", TEST_COLLECTOR_NAME, test_labels()) + .unwrap(); + let processor = GnmiOnChangeProcessor::new( + TEST_COLLECTOR_NAME.to_string(), + stream_metrics, + Some(sink.clone()), + EventContext { + endpoint_key: "aa:bb:cc:dd:ee:ff".to_string(), + addr: BmcAddr { + ip: "10.0.0.1".parse().unwrap(), + port: None, + mac: MacAddress::from_str("AA:BB:CC:DD:EE:FF").unwrap(), + }, + collector_type: ON_CHANGE_STREAM_ID_SYSTEM_EVENTS, + metadata: Some(EndpointMetadata::Switch(SwitchData { + id: Some(switch_id), + serial: "SN-SWITCH-001".to_string(), + slot_number: Some(7), + tray_index: Some(3), + })), + rack_id: Some(RackId::new("RACK_2")), + }, + "SN-SWITCH-001".to_string(), + ); + let notification = proto::Notification { + prefix: Some(proto::Path { + elem: vec![make_path_elem("system-events", &[])], + ..Default::default() + }), + update: vec![ + proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("system-event", &[("event-id", "42")]), + make_path_elem("state", &[]), + make_path_elem("severity", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("warning")), + ..Default::default() + }, + proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("system-event", &[("event-id", "42")]), + make_path_elem("state", &[]), + make_path_elem("text", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("Link down detected on swp1")), + ..Default::default() + }, + ], + ..Default::default() + }; + + assert_eq!(processor.process_notification(¬ification), 1); + + let events = sink.events.lock().expect("lock poisoned"); + assert_eq!(events.len(), 1); + let (context, event) = &events[0]; + assert_eq!(context.switch_id(), Some(switch_id)); + 
assert_eq!(context.switch_slot_number(), Some(7)); + assert_eq!(context.switch_tray_index(), Some(3)); + assert_eq!(context.rack_id().map(RackId::as_str), Some("RACK_2")); + let CollectorEvent::Metric(metric) = event else { + panic!("expected metric event"); + }; + assert_eq!(metric.metric_type, "on_change_row"); + assert_eq!(metric.value, 2.0); + assert!( + metric + .labels + .iter() + .any(|(key, value)| key == "instance_id" && value == "42") + ); + } +} diff --git a/crates/health/src/collectors/nvue/gnmi/subscriber.rs b/crates/health/src/collectors/nvue/gnmi/subscriber.rs index 1b1fa36778..da7939639d 100644 --- a/crates/health/src/collectors/nvue/gnmi/subscriber.rs +++ b/crates/health/src/collectors/nvue/gnmi/subscriber.rs @@ -22,7 +22,12 @@ use std::time::Duration; use prometheus::{Counter, Gauge, Histogram, HistogramOpts, IntGauge, Opts}; use tokio_util::sync::CancellationToken; -use super::client::{GnmiClient, nvue_subscribe_paths}; +use super::client::{ + GnmiClient, nvue_subscribe_paths, system_events_prefix, system_events_subscribe_path, +}; +use super::on_change_processor::{ + GnmiOnChangeProcessor, ON_CHANGE_STREAM_ID_SYSTEM_EVENTS, OnChangeStreamMetrics, +}; use super::proto; use super::sample_processor::{GnmiSampleProcessor, NVUE_GNMI_SAMPLE_STREAM_ID, now_unix_secs}; use crate::HealthError; @@ -221,25 +226,71 @@ pub fn spawn_gnmi_collector( let sample_stream_metrics = GnmiStreamMetrics::new(registry, &prefix, "", sample_const_labels)?; let sample_config = GnmiStreamConfig { - client, + client: client.clone(), paths: nvue_subscribe_paths(&gnmi_config.paths), sample_interval_nanos: gnmi_config.sample_interval.as_nanos() as u64, }; let sample_processor = GnmiSampleProcessor { - data_sink, + data_sink: data_sink.clone(), event_context: sample_event_context, - switch_id, + switch_id: switch_id.clone(), + }; + + let on_change_state = if gnmi_config.system_events_enabled { + let on_change_const_labels = HashMap::from([ + ( + "collector_type".to_string(), + 
ON_CHANGE_STREAM_ID_SYSTEM_EVENTS.to_string(), + ), + ("endpoint_key".to_string(), endpoint.hash_key().into_owned()), + ]); + + let on_change_stream_metrics = + GnmiStreamMetrics::new(registry, &prefix, "_events", on_change_const_labels.clone())?; + let on_change_row_metrics = OnChangeStreamMetrics::new( + registry, + &prefix, + ON_CHANGE_STREAM_ID_SYSTEM_EVENTS, + on_change_const_labels, + )?; + let on_change_event_context = + EventContext::from_endpoint(endpoint, ON_CHANGE_STREAM_ID_SYSTEM_EVENTS); + let on_change_processor = GnmiOnChangeProcessor::new( + ON_CHANGE_STREAM_ID_SYSTEM_EVENTS.to_string(), + on_change_row_metrics, + data_sink, + on_change_event_context, + switch_id, + ); + + Some((client, on_change_stream_metrics, on_change_processor)) + } else { + None }; Ok(Collector::spawn_task(move |cancel_token| async move { - gnmi_sample_task( - cancel_token, + let sample_handle = tokio::spawn(gnmi_sample_task( + cancel_token.clone(), sample_config, sample_stream_metrics, sample_processor, - ) - .await; + )); + + let on_change_handle = + on_change_state.map(|(client, stream_metrics, on_change_processor)| { + tokio::spawn(gnmi_on_change_task( + cancel_token, + client, + stream_metrics, + on_change_processor, + )) + }); + + let _ = sample_handle.await; + if let Some(handle) = on_change_handle { + let _ = handle.await; + } })) } @@ -342,6 +393,108 @@ async fn gnmi_sample_task( } } +async fn gnmi_on_change_task( + cancel_token: CancellationToken, + client: GnmiClient, + stream_metrics: GnmiStreamMetrics, + on_change_processor: GnmiOnChangeProcessor, +) { + let mut backoff = ExponentialBackoff::new(&BackoffConfig { + initial: Duration::from_secs(2), + max: Duration::from_secs(60), + }); + let prefix = system_events_prefix(); + let paths = system_events_subscribe_path(); + + loop { + stream_metrics.connection_state.set(CONNECTING); + + let Some(stream) = cancel_token + .run_until_cancelled(client.subscribe_on_change(&prefix, &paths)) + .await + else { + 
stream_metrics.connection_state.set(SHUTDOWN); + return; + }; + + match stream { + Err(e) => { + stream_metrics.connection_state.set(TRANSIENT_FAILURE); + stream_metrics.reconnections_total.inc(); + tracing::warn!( + error = ?e, + switch_id = %on_change_processor.switch_id, + stream = %on_change_processor.collector_name, + "nvue_gnmi ON_CHANGE: connection failed, backing off" + ); + } + Ok(mut stream) => { + stream_metrics.connection_state.set(READY); + stream_metrics + .connection_established_timestamp + .set(now_unix_secs()); + let _conn_guard = StreamingConnectionGuard::inc(stream_metrics.connected.clone()); + backoff.reset(); + tracing::info!( + switch_id = %on_change_processor.switch_id, + stream = %on_change_processor.collector_name, + "nvue_gnmi ON_CHANGE: stream connected" + ); + + loop { + let Some(msg) = cancel_token.run_until_cancelled(stream.message()).await else { + stream_metrics.connection_state.set(SHUTDOWN); + tracing::info!( + switch_id = %on_change_processor.switch_id, + stream = %on_change_processor.collector_name, + "nvue_gnmi ON_CHANGE: cancelled, shutting down" + ); + return; + }; + + match msg { + Ok(Some(resp)) => { + on_change_processor.process_subscribe_response(&resp, &stream_metrics); + } + Ok(None) => { + stream_metrics.connection_state.set(IDLE); + stream_metrics.server_initiated_closures_total.inc(); + tracing::info!( + switch_id = %on_change_processor.switch_id, + stream = %on_change_processor.collector_name, + "nvue_gnmi ON_CHANGE: stream closed by server, reconnecting" + ); + backoff.reset(); + break; + } + Err(e) => { + stream_metrics.connection_state.set(TRANSIENT_FAILURE); + stream_metrics.stream_errors_total.inc(); + stream_metrics.reconnections_total.inc(); + tracing::warn!( + error = ?e, + switch_id = %on_change_processor.switch_id, + stream = %on_change_processor.collector_name, + "nvue_gnmi ON_CHANGE: stream error, reconnecting" + ); + break; + } + } + } + } + } + + if cancel_token + 
.run_until_cancelled(tokio::time::sleep(backoff.next_delay())) + .await + .is_none() + { + stream_metrics.connection_state.set(SHUTDOWN); + return; + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/health/src/config.rs b/crates/health/src/config.rs index 6eb8baec14..863a343f3d 100644 --- a/crates/health/src/config.rs +++ b/crates/health/src/config.rs @@ -668,6 +668,10 @@ pub struct NvueGnmiConfig { #[serde(with = "humantime_serde")] pub request_timeout: Duration, + /// Enable gNMI ON_CHANGE subscription for live system-event messages. + #[serde(alias = "system_events_subscription_enabled", alias = "events_enabled")] + pub system_events_enabled: bool, + /// gNMI SAMPLE subscription paths. pub paths: NvueGnmiPaths, } @@ -678,6 +682,7 @@ impl Default for NvueGnmiConfig { gnmi_port: 9339, sample_interval: Duration::from_secs(300), request_timeout: Duration::from_secs(30), + system_events_enabled: true, paths: NvueGnmiPaths::default(), } } @@ -1043,6 +1048,14 @@ mod tests { } else { panic!("nvue rest config should be enabled in example config"); } + if let Configurable::Enabled(ref gnmi) = nvue.gnmi { + assert_eq!(gnmi.gnmi_port, 9339); + assert_eq!(gnmi.sample_interval, Duration::from_secs(300)); + assert_eq!(gnmi.request_timeout, Duration::from_secs(30)); + assert!(gnmi.system_events_enabled); + } else { + panic!("nvue gnmi config should be enabled in example config"); + } } else { panic!("nvue config should be enabled in example config"); } @@ -1347,6 +1360,37 @@ interfaces_enabled = false } } + #[test] + fn test_nvue_gnmi_events_disabled() { + let toml_content = r#" +[endpoint_sources.carbide_api] +enabled = false + +[sinks.health_report] +enabled = false + +[collectors.nvue.gnmi] +gnmi_port = 9339 +system_events_enabled = false +"#; + + let config: Config = Figment::new() + .merge(Serialized::defaults(Config::default())) + .merge(Toml::string(toml_content)) + .extract() + .expect("failed to parse"); + + if let Configurable::Enabled(ref nvue) = 
config.collectors.nvue { + if let Configurable::Enabled(ref gnmi) = nvue.gnmi { + assert!(!gnmi.system_events_enabled); + } else { + panic!("gnmi config should be enabled"); + } + } else { + panic!("nvue config should be enabled"); + } + } + #[test] fn test_static_endpoint_with_switch_serial() { let toml_content = r#" From 543be1a47dde146f049c67897205e583781cfb6a Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Sun, 17 May 2026 02:43:54 +0000 Subject: [PATCH 14/30] fix(health): typo --- crates/health/src/api_client.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/health/src/api_client.rs b/crates/health/src/api_client.rs index f8832cc894..70531d5905 100644 --- a/crates/health/src/api_client.rs +++ b/crates/health/src/api_client.rs @@ -15,7 +15,7 @@ * limitations under the License. */ -buse std::collections::HashMap; +use std::collections::HashMap; use std::convert::TryFrom; use std::net::IpAddr; use std::str::FromStr; From 0f3fd25161a6ece4e0123c0cc821026846496cd2 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Mon, 18 May 2026 09:56:47 -0400 Subject: [PATCH 15/30] lint(health): tidy up Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- .../health/src/collectors/nvue/gnmi/on_change_processor.rs | 5 ++--- crates/health/src/collectors/nvue/gnmi/sample_processor.rs | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/crates/health/src/collectors/nvue/gnmi/on_change_processor.rs b/crates/health/src/collectors/nvue/gnmi/on_change_processor.rs index 376a7fcb7b..2297ca4c75 100644 --- a/crates/health/src/collectors/nvue/gnmi/on_change_processor.rs +++ b/crates/health/src/collectors/nvue/gnmi/on_change_processor.rs @@ -259,9 +259,8 @@ mod tests { use carbide_uuid::switch::{SwitchId, SwitchIdSource, SwitchType}; use mac_address::MacAddress; - use crate::endpoint::{BmcAddr, EndpointMetadata, SwitchData}; - use super::*; + use 
crate::endpoint::{BmcAddr, EndpointMetadata, SwitchData}; const TEST_COLLECTOR_NAME: &str = "nvue_gnmi_system_events"; @@ -343,7 +342,7 @@ mod tests { #[test] fn test_find_instance_key() { - let elems = vec![ + let elems = [ make_path_elem("system-events", &[]), make_path_elem("system-event", &[("event-id", "38")]), make_path_elem("state", &[]), diff --git a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs index 378fc0d7f6..9525115707 100644 --- a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs +++ b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs @@ -329,9 +329,8 @@ mod tests { use carbide_uuid::rack::RackId; use carbide_uuid::switch::{SwitchId, SwitchIdSource, SwitchType}; - use crate::endpoint::{EndpointMetadata, SwitchData}; - use super::*; + use crate::endpoint::{EndpointMetadata, SwitchData}; #[derive(Default)] struct CapturingSink { From 438019dedcc40a0c554bf32837d62e0ae0967347 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Sun, 17 May 2026 02:13:31 +0000 Subject: [PATCH 16/30] bug(health): gate switch hosts and bmcs in spawn to avoid redfish calls on switch hosts Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/src/discovery/spawn.rs | 212 ++++++++++++++++++++------- crates/health/src/endpoint/model.rs | 11 ++ 2 files changed, 169 insertions(+), 54 deletions(-) diff --git a/crates/health/src/discovery/spawn.rs b/crates/health/src/discovery/spawn.rs index 2bfbc901e8..d9b91460f9 100644 --- a/crates/health/src/discovery/spawn.rs +++ b/crates/health/src/discovery/spawn.rs @@ -28,7 +28,7 @@ use crate::collectors::{ StreamingCollectorStartContext, }; use crate::config::{Configurable, LogCollectionMode}; -use crate::endpoint::{BmcEndpoint, EndpointMetadata}; +use crate::endpoint::BmcEndpoint; use crate::sink::DataSink; fn logs_state_file_path(template: &str, endpoint_id: &str) -> PathBuf { @@ -40,9 
+40,23 @@ pub(super) async fn spawn_collectors_for_endpoint( endpoint: &Arc, data_sink: Option>, metrics_prefix: &str, +) -> Result<(), HealthError> { + if endpoint.switch_data().is_some() { + spawn_switch_collectors(ctx, endpoint, data_sink, metrics_prefix) + } else { + spawn_generic_redfish_collectors(ctx, endpoint, data_sink, metrics_prefix) + } +} + +fn spawn_generic_redfish_collectors( + ctx: &mut DiscoveryLoopContext, + endpoint: &Arc, + data_sink: Option>, + metrics_prefix: &str, ) -> Result<(), HealthError> { let key = endpoint.key(); let endpoint_arc = endpoint.clone(); + if let Configurable::Enabled(sensor_cfg) = &ctx.sensors_config && !ctx.collectors.contains(CollectorKind::Sensor, &key) { @@ -216,7 +230,7 @@ pub(super) async fn spawn_collectors_for_endpoint( metrics_prefix, )?); match Collector::start::>( - endpoint_arc.clone(), + endpoint_arc, LeakDetectorCollectorConfig { data_sink: data_sink.clone(), state_refresh_interval: leak_detector_cfg.state_refresh_interval, @@ -250,9 +264,20 @@ pub(super) async fn spawn_collectors_for_endpoint( } } + Ok(()) +} + +fn spawn_switch_collectors( + ctx: &mut DiscoveryLoopContext, + endpoint: &Arc, + data_sink: Option>, + metrics_prefix: &str, +) -> Result<(), HealthError> { + let key = endpoint.key(); + let endpoint_arc = endpoint.clone(); + if let Configurable::Enabled(nmxt_cfg) = &ctx.nmxt_config && !ctx.collectors.contains(CollectorKind::Nmxt, &key) - && matches!(endpoint.metadata, Some(EndpointMetadata::Switch(_))) { let collector_registry = Arc::new( ctx.metrics_manager @@ -279,7 +304,7 @@ pub(super) async fn spawn_collectors_for_endpoint( tracing::info!( endpoint_key = %key, total_nmxt_collectors = ctx.collectors.len(CollectorKind::Nmxt), - "Started NMX-T collection for BMC endpoint" + "Started NMX-T collection for switch endpoint" ); } Err(error) => { @@ -295,7 +320,6 @@ pub(super) async fn spawn_collectors_for_endpoint( if let Configurable::Enabled(nvue_cfg) = &ctx.nvue_config && let 
Configurable::Enabled(rest_cfg) = &nvue_cfg.rest && !ctx.collectors.contains(CollectorKind::NvueRest, &key) - && matches!(endpoint.metadata, Some(EndpointMetadata::Switch(_))) { let collector_registry = Arc::new( ctx.metrics_manager @@ -322,7 +346,7 @@ pub(super) async fn spawn_collectors_for_endpoint( tracing::info!( endpoint_key = %key, total_nvue_rest_collectors = ctx.collectors.len(CollectorKind::NvueRest), - "Started NVUE REST collection for BMC endpoint" + "Started NVUE REST collection for switch endpoint" ); } Err(error) => { @@ -347,59 +371,155 @@ mod tests { use super::*; use crate::config::{Config, Configurable}; - use crate::endpoint::{BmcAddr, BmcCredentials, EndpointMetadata, SwitchData}; + use crate::endpoint::{BmcAddr, BmcCredentials, EndpointMetadata, MachineData, SwitchData}; use crate::limiter::{NoopLimiter, RateLimiter}; use crate::metrics::MetricsManager; + use crate::sink::{CollectorEvent, EventContext}; - #[test] - fn test_logs_state_file_path_replaces_endpoint_id() { - let path = logs_state_file_path("/tmp/logs_{machine_id}.json", "endpoint-42"); - assert_eq!(path, PathBuf::from("/tmp/logs_endpoint-42.json")); + struct NoopSink; + + impl DataSink for NoopSink { + fn sink_type(&self) -> &'static str { + "noop" + } + + fn handle_event(&self, _context: &EventContext, _event: &CollectorEvent) {} } - #[test] - fn test_endpoint_log_identity_falls_back_to_mac_without_metadata() { - let endpoint = BmcEndpoint::with_fixed_credentials( + fn context_with_config(config: Config, metrics_name: &str) -> DiscoveryLoopContext { + let limiter: Arc = Arc::new(NoopLimiter); + let metrics_manager = + Arc::new(MetricsManager::new(metrics_name).expect("metrics manager should initialize")); + DiscoveryLoopContext::new(limiter, metrics_manager, Arc::new(config)) + .expect("context should initialize") + } + + fn test_endpoint( + ip: Ipv4Addr, + mac: &str, + metadata: Option, + ) -> Arc { + Arc::new(BmcEndpoint::with_fixed_credentials( BmcAddr { - ip: 
IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1)), + ip: IpAddr::V4(ip), port: Some(443), - mac: MacAddress::from_str("aa:bb:cc:dd:ee:ff").unwrap(), + mac: MacAddress::from_str(mac).expect("valid mac address"), }, BmcCredentials::UsernamePassword { username: "user".to_string(), password: Some("pass".to_string()), }, + metadata, None, - None, - ); + )) + } + + fn switch_metadata() -> EndpointMetadata { + EndpointMetadata::Switch(SwitchData { + id: None, + serial: "switch-serial-1".to_string(), + slot_number: None, + tray_index: None, + }) + } + + fn machine_metadata() -> EndpointMetadata { + EndpointMetadata::Machine(MachineData { + machine_id: "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0" + .parse() + .expect("valid machine id"), + machine_serial: None, + slot_number: None, + tray_index: None, + nvlink_domain_uuid: None, + }) + } + + #[test] + fn test_logs_state_file_path_replaces_endpoint_id() { + let path = logs_state_file_path("/tmp/logs_{machine_id}.json", "endpoint-42"); + assert_eq!(path, PathBuf::from("/tmp/logs_endpoint-42.json")); + } + + #[test] + fn test_endpoint_log_identity_falls_back_to_mac_without_metadata() { + let endpoint = test_endpoint(Ipv4Addr::new(10, 0, 0, 1), "aa:bb:cc:dd:ee:ff", None); assert_eq!(endpoint.log_identity().as_ref(), "AA:BB:CC:DD:EE:FF"); } #[test] fn test_endpoint_log_identity_uses_switch_serial_when_available() { - let endpoint = BmcEndpoint::with_fixed_credentials( - BmcAddr { - ip: IpAddr::V4(Ipv4Addr::new(10, 0, 0, 2)), - port: Some(443), - mac: MacAddress::from_str("11:22:33:44:55:66").unwrap(), - }, - BmcCredentials::UsernamePassword { - username: "user".to_string(), - password: Some("pass".to_string()), - }, - Some(EndpointMetadata::Switch(SwitchData { - id: None, - serial: "switch-serial-1".to_string(), - slot_number: None, - tray_index: None, - })), - None, + let endpoint = test_endpoint( + Ipv4Addr::new(10, 0, 0, 2), + "11:22:33:44:55:66", + Some(switch_metadata()), ); 
assert_eq!(endpoint.log_identity().as_ref(), "switch-serial-1"); } + #[tokio::test] + async fn test_switch_endpoint_does_not_start_generic_redfish_collectors() { + let mut config = Config::default(); + config.collectors.sensors = Configurable::Enabled(Default::default()); + config.collectors.logs = Configurable::Enabled(Default::default()); + config.collectors.firmware = Configurable::Enabled(Default::default()); + config.collectors.leak_detector = Configurable::Enabled(Default::default()); + config.collectors.nmxt = Configurable::Disabled; + config.collectors.nvue = Configurable::Disabled; + + let mut ctx = context_with_config(config, "test_switch_generic_redfish_gate"); + let endpoint = test_endpoint( + Ipv4Addr::new(10, 0, 0, 6), + "55:66:77:88:99:aa", + Some(switch_metadata()), + ); + + spawn_collectors_for_endpoint( + &mut ctx, + &endpoint, + Some(Arc::new(NoopSink)), + "test_switch_generic_redfish_gate", + ) + .await + .expect("spawn should succeed"); + + assert_eq!(ctx.collectors.len(CollectorKind::Sensor), 0); + assert_eq!(ctx.collectors.len(CollectorKind::Logs), 0); + assert_eq!(ctx.collectors.len(CollectorKind::Firmware), 0); + assert_eq!(ctx.collectors.len(CollectorKind::LeakDetector), 0); + } + + #[tokio::test] + async fn test_machine_endpoint_still_starts_sse_logs_collector() { + let mut config = Config::default(); + config.collectors.sensors = Configurable::Disabled; + config.collectors.logs = Configurable::Enabled(Default::default()); + config.collectors.firmware = Configurable::Disabled; + config.collectors.leak_detector = Configurable::Disabled; + config.collectors.nmxt = Configurable::Disabled; + config.collectors.nvue = Configurable::Disabled; + + let mut ctx = context_with_config(config, "test_machine_sse_logs_collector"); + let endpoint = test_endpoint( + Ipv4Addr::new(10, 0, 0, 7), + "66:77:88:99:aa:bb", + Some(machine_metadata()), + ); + + spawn_collectors_for_endpoint( + &mut ctx, + &endpoint, + Some(Arc::new(NoopSink)), + 
"test_machine_sse_logs_collector", + ) + .await + .expect("spawn should succeed"); + + assert_eq!(ctx.collectors.len(CollectorKind::Logs), 1); + } + #[tokio::test] async fn test_spawn_is_idempotent_when_collectors_are_disabled() { let mut config = Config::default(); @@ -408,26 +528,10 @@ mod tests { config.collectors.firmware = Configurable::Disabled; config.collectors.leak_detector = Configurable::Disabled; config.collectors.nmxt = Configurable::Disabled; + config.collectors.nvue = Configurable::Disabled; - let limiter: Arc = Arc::new(NoopLimiter); - let metrics_manager = - Arc::new(MetricsManager::new("test").expect("metrics manager should initialize")); - let mut ctx = DiscoveryLoopContext::new(limiter, metrics_manager, Arc::new(config)) - .expect("context should initialize"); - - let endpoint = Arc::new(BmcEndpoint::with_fixed_credentials( - BmcAddr { - ip: IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1)), - port: Some(443), - mac: MacAddress::from_str("aa:bb:cc:dd:ee:ff").unwrap(), - }, - BmcCredentials::UsernamePassword { - username: "user".to_string(), - password: Some("pass".to_string()), - }, - None, - None, - )); + let mut ctx = context_with_config(config, "test_disabled_collectors"); + let endpoint = test_endpoint(Ipv4Addr::new(10, 0, 0, 1), "aa:bb:cc:dd:ee:ff", None); spawn_collectors_for_endpoint(&mut ctx, &endpoint, None, "test") .await diff --git a/crates/health/src/endpoint/model.rs b/crates/health/src/endpoint/model.rs index 06b2cf2a3a..825a3497c8 100644 --- a/crates/health/src/endpoint/model.rs +++ b/crates/health/src/endpoint/model.rs @@ -106,6 +106,10 @@ impl BmcEndpoint { } } + pub fn switch_data(&self) -> Option<&SwitchData> { + self.metadata.as_ref().and_then(EndpointMetadata::as_switch) + } + pub fn credentials(&self) -> BmcCredentials { self.credentials.read().expect("lock poisoned").to_owned() } @@ -128,6 +132,13 @@ pub enum EndpointMetadata { } impl EndpointMetadata { + pub fn as_switch(&self) -> Option<&SwitchData> { + match self { + 
EndpointMetadata::Switch(switch) => Some(switch), + _ => None, + } + } + pub fn serial_number(&self) -> Option<&str> { match self { EndpointMetadata::Machine(machine) => machine.machine_serial.as_deref(), From db6dca42f64931cf735ba8ed8344427b70efad69 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Mon, 18 May 2026 23:45:12 +0000 Subject: [PATCH 17/30] feat(health): add SwitchEndpointRole to distinguish switch BMC from Host Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/src/api_client.rs | 5 ++++- crates/health/src/endpoint/mod.rs | 2 +- crates/health/src/endpoint/model.rs | 9 +++++++++ crates/health/src/endpoint/sources.rs | 5 ++++- 4 files changed, 18 insertions(+), 3 deletions(-) diff --git a/crates/health/src/api_client.rs b/crates/health/src/api_client.rs index b49c78310b..6390f69bc2 100644 --- a/crates/health/src/api_client.rs +++ b/crates/health/src/api_client.rs @@ -31,7 +31,7 @@ use url::Url; use crate::HealthError; use crate::endpoint::{ BmcAddr, BmcCredentials, BmcEndpoint, BoxFuture, CredentialProvider, EndpointMetadata, - EndpointSource, MachineData, PowerShelfData, SwitchData, + EndpointSource, MachineData, PowerShelfData, SwitchData, SwitchEndpointRole, }; #[derive(Clone)] @@ -268,6 +268,9 @@ impl ApiClientWrapper { .placement_in_rack .as_ref() .and_then(|placement| placement.tray_index), + endpoint_role: SwitchEndpointRole::Bmc, + is_primary: switch.is_primary, + nmxt_enabled: false, })), None, ) diff --git a/crates/health/src/endpoint/mod.rs b/crates/health/src/endpoint/mod.rs index 191679ad79..b44a985d45 100644 --- a/crates/health/src/endpoint/mod.rs +++ b/crates/health/src/endpoint/mod.rs @@ -20,7 +20,7 @@ mod sources; pub use model::{ BmcAddr, BmcCredentials, BmcEndpoint, BoxFuture, CredentialProvider, EndpointMetadata, - EndpointSource, MachineData, PowerShelfData, SwitchData, + EndpointSource, MachineData, PowerShelfData, SwitchData, SwitchEndpointRole, }; pub use 
sources::{CompositeEndpointSource, StaticEndpointSource}; diff --git a/crates/health/src/endpoint/model.rs b/crates/health/src/endpoint/model.rs index 825a3497c8..bebfba1d10 100644 --- a/crates/health/src/endpoint/model.rs +++ b/crates/health/src/endpoint/model.rs @@ -163,12 +163,21 @@ pub struct PowerShelfData { pub serial: String, } +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum SwitchEndpointRole { + Bmc, + Host, +} + #[derive(Clone, Debug)] pub struct SwitchData { pub id: Option, pub serial: String, pub slot_number: Option, pub tray_index: Option, + pub endpoint_role: SwitchEndpointRole, + pub is_primary: bool, + pub nmxt_enabled: bool, } #[derive(Clone)] diff --git a/crates/health/src/endpoint/sources.rs b/crates/health/src/endpoint/sources.rs index e2bc6bf8a0..f71af23227 100644 --- a/crates/health/src/endpoint/sources.rs +++ b/crates/health/src/endpoint/sources.rs @@ -26,7 +26,7 @@ use crate::HealthError; use crate::config::StaticBmcEndpoint; use crate::endpoint::{ BmcAddr, BmcCredentials, BmcEndpoint, BoxFuture, EndpointMetadata, EndpointSource, MachineData, - PowerShelfData, SwitchData, + PowerShelfData, SwitchData, SwitchEndpointRole, }; pub struct StaticEndpointSource { @@ -99,6 +99,9 @@ impl StaticEndpointSource { serial, slot_number: switch.slot_number, tray_index: switch.tray_index, + endpoint_role: SwitchEndpointRole::Host, + is_primary: false, + nmxt_enabled: false, })) } else if let Some(machine) = &cfg.machine { let machine_id = &machine.id; From 3fbc376006fbba725fcfbfb58384738549fbba65 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Tue, 19 May 2026 00:18:54 +0000 Subject: [PATCH 18/30] feat(health): add static config shape for switch bmc/host Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/example/config.example.toml | 10 +++++++++- crates/health/src/config.rs | 17 +++++++++++++++++ crates/health/src/endpoint/sources.rs | 13 +++++++++---- 3 files changed, 35 
insertions(+), 5 deletions(-) diff --git a/crates/health/example/config.example.toml b/crates/health/example/config.example.toml index 12067a6f3f..5f1f3c5c65 100644 --- a/crates/health/example/config.example.toml +++ b/crates/health/example/config.example.toml @@ -46,7 +46,15 @@ port = 443 mac = "11:22:33:44:55:66" username = "admin" password = "secret" -switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SWITCH-001", slot_number = 7, tray_index = 3 } +switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SWITCH-BMC-001", endpoint_role = "bmc", slot_number = 7, tray_index = 3 } + +[[endpoint_sources.static_bmc_endpoints]] +ip = "10.0.1.2" +port = 443 +mac = "11:22:33:44:55:77" +username = "admin" +password = "secret" +switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SWITCH-HOST-001", endpoint_role = "host", is_primary = true, slot_number = 7, tray_index = 3 } [[endpoint_sources.static_bmc_endpoints]] ip = "10.0.2.1" diff --git a/crates/health/src/config.rs b/crates/health/src/config.rs index c94d99ce6b..837f1be96b 100644 --- a/crates/health/src/config.rs +++ b/crates/health/src/config.rs @@ -123,6 +123,17 @@ pub struct StaticPowerShelfEndpoint { pub serial: Option, } +#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Deserialize, serde::Serialize)] +#[serde(rename_all = "snake_case")] +pub enum StaticSwitchEndpointRole { + Bmc, + Host, +} + +fn default_static_switch_endpoint_role() -> StaticSwitchEndpointRole { + StaticSwitchEndpointRole::Host +} + #[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] #[serde(deny_unknown_fields)] pub struct StaticSwitchEndpoint { @@ -130,6 +141,12 @@ pub struct StaticSwitchEndpoint { pub serial: Option, pub slot_number: Option, pub tray_index: Option, + #[serde(default = "default_static_switch_endpoint_role")] + pub endpoint_role: StaticSwitchEndpointRole, + #[serde(default)] + pub is_primary: bool, 
+ #[serde(default)] + pub nmxt_enabled: Option, } impl Debug for StaticBmcEndpoint { diff --git a/crates/health/src/endpoint/sources.rs b/crates/health/src/endpoint/sources.rs index f71af23227..6023f20c2b 100644 --- a/crates/health/src/endpoint/sources.rs +++ b/crates/health/src/endpoint/sources.rs @@ -23,7 +23,7 @@ use carbide_uuid::rack::RackId; use mac_address::MacAddress; use crate::HealthError; -use crate::config::StaticBmcEndpoint; +use crate::config::{StaticBmcEndpoint, StaticSwitchEndpointRole}; use crate::endpoint::{ BmcAddr, BmcCredentials, BmcEndpoint, BoxFuture, EndpointMetadata, EndpointSource, MachineData, PowerShelfData, SwitchData, SwitchEndpointRole, @@ -93,15 +93,20 @@ impl StaticEndpointSource { .clone() .or_else(|| switch.id.clone()) .unwrap_or_else(|| cfg.mac.clone()); + let endpoint_role = match switch.endpoint_role { + StaticSwitchEndpointRole::Bmc => SwitchEndpointRole::Bmc, + StaticSwitchEndpointRole::Host => SwitchEndpointRole::Host, + }; + let nmxt_enabled = switch.nmxt_enabled.unwrap_or(switch.is_primary); Some(EndpointMetadata::Switch(SwitchData { id, serial, slot_number: switch.slot_number, tray_index: switch.tray_index, - endpoint_role: SwitchEndpointRole::Host, - is_primary: false, - nmxt_enabled: false, + endpoint_role, + is_primary: switch.is_primary, + nmxt_enabled, })) } else if let Some(machine) = &cfg.machine { let machine_id = &machine.id; From 817c2f34178f317263c4b8eb699f8e72cf734185 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Tue, 19 May 2026 00:32:51 +0000 Subject: [PATCH 19/30] feat(health): gate switch collection by endpoint role (host/bmc) Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/src/discovery/spawn.rs | 41 ++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/crates/health/src/discovery/spawn.rs b/crates/health/src/discovery/spawn.rs index d9b91460f9..d33a9e0753 100644 --- 
a/crates/health/src/discovery/spawn.rs +++ b/crates/health/src/discovery/spawn.rs @@ -28,7 +28,7 @@ use crate::collectors::{ StreamingCollectorStartContext, }; use crate::config::{Configurable, LogCollectionMode}; -use crate::endpoint::BmcEndpoint; +use crate::endpoint::{BmcEndpoint, SwitchEndpointRole}; use crate::sink::DataSink; fn logs_state_file_path(template: &str, endpoint_id: &str) -> PathBuf { @@ -41,13 +41,28 @@ pub(super) async fn spawn_collectors_for_endpoint( data_sink: Option>, metrics_prefix: &str, ) -> Result<(), HealthError> { - if endpoint.switch_data().is_some() { - spawn_switch_collectors(ctx, endpoint, data_sink, metrics_prefix) - } else { - spawn_generic_redfish_collectors(ctx, endpoint, data_sink, metrics_prefix) + match endpoint.switch_data().map(|switch| switch.endpoint_role) { + Some(SwitchEndpointRole::Host) => { + spawn_switch_host_collectors(ctx, endpoint, data_sink, metrics_prefix) + } + Some(SwitchEndpointRole::Bmc) | None => { + spawn_generic_redfish_collectors(ctx, endpoint, data_sink, metrics_prefix) + } } } +fn is_switch_host_endpoint(endpoint: &BmcEndpoint) -> bool { + endpoint + .switch_data() + .is_some_and(|switch| switch.endpoint_role == SwitchEndpointRole::Host) +} + +fn switch_host_nmxt_enabled(endpoint: &BmcEndpoint) -> bool { + endpoint.switch_data().is_some_and(|switch| { + switch.endpoint_role == SwitchEndpointRole::Host && switch.nmxt_enabled + }) +} + fn spawn_generic_redfish_collectors( ctx: &mut DiscoveryLoopContext, endpoint: &Arc, @@ -267,7 +282,7 @@ fn spawn_generic_redfish_collectors( Ok(()) } -fn spawn_switch_collectors( +fn spawn_switch_host_collectors( ctx: &mut DiscoveryLoopContext, endpoint: &Arc, data_sink: Option>, @@ -276,7 +291,8 @@ fn spawn_switch_collectors( let key = endpoint.key(); let endpoint_arc = endpoint.clone(); - if let Configurable::Enabled(nmxt_cfg) = &ctx.nmxt_config + if switch_host_nmxt_enabled(endpoint) + && let Configurable::Enabled(nmxt_cfg) = &ctx.nmxt_config && 
!ctx.collectors.contains(CollectorKind::Nmxt, &key) { let collector_registry = Arc::new( @@ -304,20 +320,21 @@ fn spawn_switch_collectors( tracing::info!( endpoint_key = %key, total_nmxt_collectors = ctx.collectors.len(CollectorKind::Nmxt), - "Started NMX-T collection for switch endpoint" + "Started NMX-T collection for switch host endpoint" ); } Err(error) => { tracing::error!( ?error, - "Could not start NMX-T collector for: {:?}", + "Could not start NMX-T collector for switch host: {:?}", endpoint.addr ) } } } - if let Configurable::Enabled(nvue_cfg) = &ctx.nvue_config + if is_switch_host_endpoint(endpoint) + && let Configurable::Enabled(nvue_cfg) = &ctx.nvue_config && let Configurable::Enabled(rest_cfg) = &nvue_cfg.rest && !ctx.collectors.contains(CollectorKind::NvueRest, &key) { @@ -346,13 +363,13 @@ fn spawn_switch_collectors( tracing::info!( endpoint_key = %key, total_nvue_rest_collectors = ctx.collectors.len(CollectorKind::NvueRest), - "Started NVUE REST collection for switch endpoint" + "Started NVUE REST collection for switch host endpoint" ); } Err(error) => { tracing::error!( ?error, - "Could not start NVUE REST collector for: {:?}", + "Could not start NVUE REST collector for switch host: {:?}", endpoint.addr ) } From 83f05fe82a7fc61f8c51e92807394609c5d6bc73 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Tue, 19 May 2026 01:31:31 +0000 Subject: [PATCH 20/30] feat(api): expose switch host endpoints and nvos credentials for host monitoring Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/api/src/api.rs | 14 ++++++++ crates/api/src/auth/internal_rbac_rules.rs | 5 +++ crates/api/src/handlers/credential.rs | 31 ++++++++++++++++ crates/api/src/handlers/switch.rs | 41 ++++++++++++++++++++++ crates/rpc/build.rs | 15 +++++++- crates/rpc/proto/forge.proto | 17 +++++++++ 6 files changed, 122 insertions(+), 1 deletion(-) diff --git a/crates/api/src/api.rs b/crates/api/src/api.rs index 
6eeed31538..0689bd244b 100644 --- a/crates/api/src/api.rs +++ b/crates/api/src/api.rs @@ -366,6 +366,13 @@ impl Forge for Api { crate::handlers::switch::find_by_ids(self, request).await } + async fn find_switch_host_endpoints( + &self, + request: Request, + ) -> Result, Status> { + crate::handlers::switch::find_host_endpoints(self, request).await + } + async fn delete_switch( &self, request: Request, @@ -899,6 +906,13 @@ impl Forge for Api { crate::handlers::credential::get_bmc_credentals(self, request).await } + async fn get_switch_nvos_credentials( + &self, + request: Request, + ) -> Result, Status> { + crate::handlers::credential::get_switch_nvos_credentials(self, request).await + } + /// Network status of each managed host, as reported by forge-dpu-agent. /// For use by forge-admin-cli /// diff --git a/crates/api/src/auth/internal_rbac_rules.rs b/crates/api/src/auth/internal_rbac_rules.rs index 57e2d2ab09..fba8237069 100644 --- a/crates/api/src/auth/internal_rbac_rules.rs +++ b/crates/api/src/auth/internal_rbac_rules.rs @@ -249,6 +249,7 @@ impl InternalRBACRules { x.perm("DeleteTenantKeyset", vec![SiteAgent]); x.perm("ValidateTenantPublicKey", vec![SiteAgent, Ssh, SshRs]); x.perm("GetBmcCredentials", vec![Health]); + x.perm("GetSwitchNvosCredentials", vec![Health]); x.perm("GetAllManagedHostNetworkStatus", vec![ForgeAdminCLI]); x.perm( "GetSiteExplorationReport", @@ -718,6 +719,10 @@ impl InternalRBACRules { "FindSwitchesByIds", vec![ForgeAdminCLI, Machineatron, Flow, Health], ); + x.perm( + "FindSwitchHostEndpoints", + vec![ForgeAdminCLI, Machineatron, Flow, Health], + ); x.perm("CreateSwitch", vec![ForgeAdminCLI, Machineatron]); x.perm("DeleteSwitch", vec![ForgeAdminCLI, Machineatron]); x.perm("AddExpectedSwitch", vec![ForgeAdminCLI, Machineatron, Flow]); diff --git a/crates/api/src/handlers/credential.rs b/crates/api/src/handlers/credential.rs index f93a0ef3eb..140fb6eae2 100644 --- a/crates/api/src/handlers/credential.rs +++ 
b/crates/api/src/handlers/credential.rs @@ -424,6 +424,37 @@ pub(crate) async fn get_bmc_credentals( })) } +pub(crate) async fn get_switch_nvos_credentials( + api: &Api, + request: tonic::Request, +) -> Result, tonic::Status> { + crate::api::log_request_data(&request); + + let req = request.into_inner(); + + let bmc_mac_address: mac_address::MacAddress = req + .bmc_mac_addr + .parse() + .map_err(CarbideError::MacAddressParseError)?; + + let credentials = api + .credential_manager + .get_credentials(&CredentialKey::SwitchNvosAdmin { bmc_mac_address }) + .await + .map_err(|e| CarbideError::internal(e.to_string()))? + .ok_or_else(|| CarbideError::internal("missing credentials".to_string()))?; + + let Credentials::UsernamePassword { username, password } = credentials; + + Ok(Response::new(rpc::GetBmcCredentialsResponse { + credentials: Some(rpc::BmcCredentials { + r#type: Some(rpc::bmc_credentials::Type::UsernamePassword( + rpc::UsernamePassword { username, password }, + )), + }), + })) +} + async fn set_sitewide_bmc_root_credentials( api: &Api, password: String, diff --git a/crates/api/src/handlers/switch.rs b/crates/api/src/handlers/switch.rs index 599b50b085..c8479f1486 100644 --- a/crates/api/src/handlers/switch.rs +++ b/crates/api/src/handlers/switch.rs @@ -203,6 +203,47 @@ pub async fn find_by_ids( Ok(Response::new(rpc::SwitchList { switches })) } +pub async fn find_host_endpoints( + api: &Api, + request: Request, +) -> Result, Status> { + log_request_data(&request); + + let switch_ids = request.into_inner().switch_ids; + + let max_find_by_ids = api.runtime_config.max_find_by_ids as usize; + if switch_ids.len() > max_find_by_ids { + return Err(CarbideError::InvalidArgument(format!( + "no more than {max_find_by_ids} IDs can be accepted" + )) + .into()); + } else if switch_ids.is_empty() { + return Err( + CarbideError::InvalidArgument("at least one ID must be provided".to_string()).into(), + ); + } + + let rows = db_switch::find_switch_endpoints_by_ids(&mut 
api.db_reader(), &switch_ids).await?; + + let endpoints = rows + .into_iter() + .filter_map(|row| { + let (Some(host_mac), Some(host_ip)) = (row.nvos_mac, row.nvos_ip) else { + return None; + }; + + Some(rpc::SwitchHostEndpoint { + switch_id: Some(row.switch_id), + bmc_mac: row.bmc_mac.to_string(), + host_mac: host_mac.to_string(), + host_ip: host_ip.to_string(), + }) + }) + .collect(); + + Ok(Response::new(rpc::SwitchHostEndpointList { endpoints })) +} + pub async fn find_switch_state_histories( api: &Api, request: Request, diff --git a/crates/rpc/build.rs b/crates/rpc/build.rs index 03ac72ab11..16bb61480c 100644 --- a/crates/rpc/build.rs +++ b/crates/rpc/build.rs @@ -837,7 +837,20 @@ fn main() -> Result<(), Box> { .type_attribute( "forge.GetBmcCredentialsRequest", "#[derive(serde::Serialize)]", - ).type_attribute( + ) + .type_attribute( + "forge.GetSwitchNvosCredentialsRequest", + "#[derive(serde::Serialize)]", + ) + .type_attribute( + "forge.SwitchHostEndpoint", + "#[derive(serde::Serialize)]", + ) + .type_attribute( + "forge.SwitchHostEndpointList", + "#[derive(serde::Serialize)]", + ) + .type_attribute( "forge.PlacementInRack", "#[derive(serde::Serialize)]", ) diff --git a/crates/rpc/proto/forge.proto b/crates/rpc/proto/forge.proto index 5b1783c71a..a476af963d 100644 --- a/crates/rpc/proto/forge.proto +++ b/crates/rpc/proto/forge.proto @@ -101,6 +101,7 @@ service Forge { rpc FindSwitches(SwitchQuery) returns (SwitchList); rpc FindSwitchIds(SwitchSearchFilter) returns (SwitchIdList); rpc FindSwitchesByIds(SwitchesByIdsRequest) returns (SwitchList); + rpc FindSwitchHostEndpoints(SwitchesByIdsRequest) returns (SwitchHostEndpointList); rpc DeleteSwitch(SwitchDeletionRequest) returns (SwitchDeletionResult); // Force deletes a Switch and optionally its associated interfaces from the database. 
rpc AdminForceDeleteSwitch(AdminForceDeleteSwitchRequest) returns (AdminForceDeleteSwitchResponse); @@ -263,6 +264,7 @@ service Forge { // Query for BMC Credentials rpc GetBmcCredentials(GetBmcCredentialsRequest) returns (GetBmcCredentialsResponse); + rpc GetSwitchNvosCredentials(GetSwitchNvosCredentialsRequest) returns (GetBmcCredentialsResponse); // Admin CLI actions @@ -2192,6 +2194,17 @@ message SwitchesByIdsRequest { repeated common.SwitchId switch_ids = 1; } +message SwitchHostEndpoint { + common.SwitchId switch_id = 1; + string bmc_mac = 2; + string host_mac = 3; + string host_ip = 4; +} + +message SwitchHostEndpointList { + repeated SwitchHostEndpoint endpoints = 1; +} + message ExpectedSwitch { string bmc_mac_address = 1; string bmc_username = 2; @@ -3741,6 +3754,10 @@ message GetBmcCredentialsRequest { string mac_addr = 1; } +message GetSwitchNvosCredentialsRequest { + string bmc_mac_addr = 1; +} + message GetBmcCredentialsResponse { BmcCredentials credentials = 1; } From 039a020eb9b9ea90056f0daeb5c1024a4ed2c673 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Tue, 19 May 2026 16:03:18 +0000 Subject: [PATCH 21/30] feat(health): wire switch host endpoint to health discovery Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/src/api_client.rs | 188 ++++++++++++++++++++++++++------ 1 file changed, 154 insertions(+), 34 deletions(-) diff --git a/crates/health/src/api_client.rs b/crates/health/src/api_client.rs index 6390f69bc2..d8f5c75161 100644 --- a/crates/health/src/api_client.rs +++ b/crates/health/src/api_client.rs @@ -15,12 +15,14 @@ * limitations under the License. 
*/ +use std::collections::HashMap; use std::convert::TryFrom; use std::net::IpAddr; use std::str::FromStr; use std::sync::{Arc, RwLock}; use carbide_uuid::rack::RackId; +use carbide_uuid::switch::SwitchId; use forge_tls::client_config::ClientCert; use mac_address::MacAddress; use rpc::forge::MachineSearchConfig; @@ -42,6 +44,13 @@ pub struct ApiClientWrapper { #[derive(Clone)] struct ApiCredentialProvider { client: ForgeApiClient, + kind: ApiCredentialKind, +} + +#[derive(Clone)] +enum ApiCredentialKind { + Bmc, + SwitchNvosAdmin { bmc_mac: MacAddress }, } impl CredentialProvider for ApiCredentialProvider { @@ -50,24 +59,68 @@ impl CredentialProvider for ApiCredentialProvider { endpoint: &'a BmcAddr, ) -> BoxFuture<'a, Result> { Box::pin(async move { - let request = rpc::forge::GetBmcCredentialsRequest { - mac_addr: endpoint.mac.to_string(), + let response = match &self.kind { + ApiCredentialKind::Bmc => { + let request = rpc::forge::GetBmcCredentialsRequest { + mac_addr: endpoint.mac.to_string(), + }; + + self.client + .get_bmc_credentials(request) + .await + .map_err(HealthError::ApiInvocationError)? + } + ApiCredentialKind::SwitchNvosAdmin { bmc_mac } => { + let request = rpc::forge::GetSwitchNvosCredentialsRequest { + bmc_mac_addr: bmc_mac.to_string(), + }; + + self.client + .get_switch_nvos_credentials(request) + .await + .map_err(HealthError::ApiInvocationError)? + } }; - self.client - .get_bmc_credentials(request) - .await - .map_err(HealthError::ApiInvocationError)? 
+ response .credentials .and_then(|credentials| credentials.r#type) .map(Into::into) .ok_or_else(|| { - HealthError::GenericError("missing BMC credentials in API response".to_string()) + HealthError::GenericError("missing credentials in API response".to_string()) }) }) } } +fn switch_endpoint_metadata( + switch: &rpc::forge::Switch, + endpoint_role: SwitchEndpointRole, + nmxt_enabled: bool, +) -> Result { + let serial = switch + .config + .as_ref() + .map(|config| config.name.clone()) + .ok_or_else(|| HealthError::GenericError("switch endpoint does not have serial".into()))?; + + Ok(EndpointMetadata::Switch(SwitchData { + id: switch.id, + serial, + slot_number: switch + .placement_in_rack + .as_ref() + .and_then(|placement| placement.slot_number), + tray_index: switch + .placement_in_rack + .as_ref() + .and_then(|placement| placement.tray_index), + endpoint_role, + is_primary: switch.is_primary, + nmxt_enabled, + })) +} + impl ApiClientWrapper { pub fn new(root_ca: String, client_cert: String, client_key: String, api_url: &Url) -> Self { let client_config = ForgeClientConfig::new( @@ -146,10 +199,20 @@ impl ApiClientWrapper { match self.client.find_switches(switch_request).await { Ok(response) => { + let switches = response.switches; + let switch_ids = switches + .iter() + .filter_map(|switch| switch.id) + .collect::>(); + let switches_by_id = switches + .iter() + .filter_map(|switch| switch.id.map(|id| (id, switch))) + .collect::>(); + let mut endpoints = Vec::new(); - for switch in response.switches { - match self.extract_switch_endpoint(&switch).await { + for switch in &switches { + match self.extract_switch_endpoint(switch).await { Ok(endpoint) => endpoints.push(Arc::new(endpoint)), Err(error) => tracing::warn!( ?switch, @@ -159,6 +222,33 @@ impl ApiClientWrapper { } } + if !switch_ids.is_empty() { + match self + .client + .find_switch_host_endpoints(rpc::forge::SwitchesByIdsRequest { switch_ids }) + .await + { + Ok(response) => { + for host_endpoint in 
response.endpoints { + match self + .extract_switch_host_endpoint(&host_endpoint, &switches_by_id) + .await + { + Ok(endpoint) => endpoints.push(Arc::new(endpoint)), + Err(error) => tracing::warn!( + ?host_endpoint, + ?error, + "Could not add switch host endpoint due to error" + ), + } + } + } + Err(error) => { + tracing::warn!(?error, "Failed to fetch switch host endpoints") + } + } + } + tracing::debug!(count = endpoints.len(), "Fetched switch endpoints"); endpoints } @@ -233,8 +323,13 @@ impl ApiClientWrapper { }) }); - self.endpoint_with_auth(addr, metadata, machine.rack_id.clone()) - .await + self.endpoint_with_auth( + addr, + metadata, + machine.rack_id.clone(), + ApiCredentialKind::Bmc, + ) + .await } async fn extract_switch_endpoint( @@ -247,32 +342,54 @@ impl ApiClientWrapper { )); }; let addr = BmcAddr::try_from(bmc_info)?; - let serial = switch - .config - .as_ref() - .map(|config| config.name.clone()) - .ok_or(HealthError::GenericError( - "Switch endpont does not have serial".to_string(), - ))?; self.endpoint_with_auth( addr, - Some(EndpointMetadata::Switch(SwitchData { - id: switch.id, - serial, - slot_number: switch - .placement_in_rack - .as_ref() - .and_then(|placement| placement.slot_number), - tray_index: switch - .placement_in_rack - .as_ref() - .and_then(|placement| placement.tray_index), - endpoint_role: SwitchEndpointRole::Bmc, - is_primary: switch.is_primary, - nmxt_enabled: false, - })), - None, + Some(switch_endpoint_metadata( + switch, + SwitchEndpointRole::Bmc, + false, + )?), + switch.rack_id.clone(), + ApiCredentialKind::Bmc, + ) + .await + } + + async fn extract_switch_host_endpoint( + &self, + host_endpoint: &rpc::forge::SwitchHostEndpoint, + switches_by_id: &HashMap, + ) -> Result { + let switch_id = host_endpoint.switch_id.ok_or_else(|| { + HealthError::GenericError("switch host endpoint missing switch ID".to_string()) + })?; + let switch = *switches_by_id.get(&switch_id).ok_or_else(|| { + HealthError::GenericError( + "switch host 
endpoint did not match fetched switch".to_string(), + ) + })?; + let addr = BmcAddr { + ip: host_endpoint + .host_ip + .parse::() + .map_err(|error| HealthError::GenericError(error.to_string()))?, + port: None, + mac: MacAddress::from_str(&host_endpoint.host_mac) + .map_err(|error| HealthError::GenericError(error.to_string()))?, + }; + let bmc_mac = MacAddress::from_str(&host_endpoint.bmc_mac) + .map_err(|error| HealthError::GenericError(error.to_string()))?; + + self.endpoint_with_auth( + addr, + Some(switch_endpoint_metadata( + switch, + SwitchEndpointRole::Host, + switch.is_primary, + )?), + switch.rack_id.clone(), + ApiCredentialKind::SwitchNvosAdmin { bmc_mac }, ) .await } @@ -302,6 +419,7 @@ impl ApiClientWrapper { serial, })), None, + ApiCredentialKind::Bmc, ) .await } @@ -311,9 +429,11 @@ impl ApiClientWrapper { addr: BmcAddr, metadata: Option, rack_id: Option, + credential_kind: ApiCredentialKind, ) -> Result { let provider = ApiCredentialProvider { client: self.client.clone(), + kind: credential_kind, }; let credentials = provider.fetch_credentials(&addr).await?; From 58e8c24408e3677b4f6cb578d65dac0d857f5d2e Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Tue, 19 May 2026 22:15:17 +0000 Subject: [PATCH 22/30] feat(health): discover switch bmc and host endpoints, respectively Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/api/src/tests/credential.rs | 40 ++++++- crates/api/src/tests/switch_find.rs | 57 +++++++++ crates/health/src/config.rs | 87 +++++++++++++- crates/health/src/discovery/iteration.rs | 7 +- crates/health/src/discovery/spawn.rs | 142 ++++++++++++++++++++++- crates/health/src/endpoint/mod.rs | 6 + crates/health/src/otlp/convert.rs | 5 +- crates/health/src/sink/prometheus.rs | 5 +- 8 files changed, 339 insertions(+), 10 deletions(-) diff --git a/crates/api/src/tests/credential.rs b/crates/api/src/tests/credential.rs index 2fa72f01ba..f9d25a72d1 100644 --- 
a/crates/api/src/tests/credential.rs +++ b/crates/api/src/tests/credential.rs @@ -16,7 +16,7 @@ */ use forge_secrets::credentials::{ - BgpCredentialType, CredentialKey, CredentialReader, CredentialType, Credentials, + BgpCredentialType, CredentialKey, CredentialReader, CredentialType, Credentials, CredentialWriter, }; use rpc::forge::forge_server::Forge; use rpc::forge::{ @@ -243,3 +243,41 @@ async fn test_create_bgp_credential_validates_max_password_length(pool: sqlx::Pg }) ); } + +#[crate::sqlx_test] +async fn test_get_switch_nvos_credentials(pool: sqlx::PgPool) -> eyre::Result<()> { + let env = create_test_env(pool).await; + let bmc_mac_address = "00:11:22:33:44:55".parse()?; + + env.test_credential_manager + .set_credentials( + &CredentialKey::SwitchNvosAdmin { bmc_mac_address }, + &Credentials::UsernamePassword { + username: "nvos-admin".to_string(), + password: "nvos-secret".to_string(), + }, + ) + .await?; + + let response = env + .api + .get_switch_nvos_credentials(tonic::Request::new( + rpc::forge::GetSwitchNvosCredentialsRequest { + bmc_mac_addr: bmc_mac_address.to_string(), + }, + )) + .await? 
+ .into_inner(); + + let credentials = response.credentials.expect("credentials"); + let Some(rpc::forge::bmc_credentials::Type::UsernamePassword(username_password)) = + credentials.r#type + else { + panic!("expected username/password credentials"); + }; + + assert_eq!(username_password.username, "nvos-admin"); + assert_eq!(username_password.password, "nvos-secret"); + + Ok(()) +} diff --git a/crates/api/src/tests/switch_find.rs b/crates/api/src/tests/switch_find.rs index bbbe39904a..aa3c574677 100644 --- a/crates/api/src/tests/switch_find.rs +++ b/crates/api/src/tests/switch_find.rs @@ -262,3 +262,60 @@ async fn test_find_switches_by_ids_response_fields( Ok(()) } + +#[crate::sqlx_test] +async fn test_find_switch_host_endpoints_returns_resolved_nvos_host( + pool: sqlx::PgPool, +) -> Result<(), Box> { + let env = create_test_env(pool).await; + let switch_id = new_switch(&env, Some("Switch1".to_string()), None).await?; + + let mut rows = db::switch::find_switch_endpoints_by_ids(&env.pool, &[switch_id]).await?; + let expected = rows.pop().expect("switch endpoint row"); + let host_mac = expected.nvos_mac.expect("nvos mac"); + let host_ip = expected.nvos_ip.expect("nvos ip"); + + let response = env + .api + .find_switch_host_endpoints(tonic::Request::new(rpc::forge::SwitchesByIdsRequest { + switch_ids: vec![switch_id], + })) + .await? 
+ .into_inner(); + + assert_eq!(response.endpoints.len(), 1); + assert_eq!(response.endpoints[0].switch_id, Some(switch_id)); + assert_eq!(response.endpoints[0].bmc_mac, expected.bmc_mac.to_string()); + assert_eq!(response.endpoints[0].host_mac, host_mac.to_string()); + assert_eq!(response.endpoints[0].host_ip, host_ip.to_string()); + + Ok(()) +} + +#[crate::sqlx_test] +async fn test_find_switch_host_endpoints_skips_switch_without_nvos_host( + pool: sqlx::PgPool, +) -> Result<(), Box> { + let env = create_test_env(pool).await; + let switch_id = new_switch(&env, Some("Switch1".to_string()), None).await?; + let rows = db::switch::find_switch_endpoints_by_ids(&env.pool, &[switch_id]).await?; + let bmc_mac = rows.first().expect("switch endpoint row").bmc_mac; + + { + let mut txn = env.pool.begin().await?; + db::expected_switch::update_nvos_mac_addresses(txn.as_mut(), bmc_mac, &[]).await?; + txn.commit().await?; + } + + let response = env + .api + .find_switch_host_endpoints(tonic::Request::new(rpc::forge::SwitchesByIdsRequest { + switch_ids: vec![switch_id], + })) + .await? 
+ .into_inner(); + + assert!(response.endpoints.is_empty()); + + Ok(()) +} diff --git a/crates/health/src/config.rs b/crates/health/src/config.rs index 837f1be96b..eba3028ac9 100644 --- a/crates/health/src/config.rs +++ b/crates/health/src/config.rs @@ -1424,6 +1424,66 @@ power_shelf = { id = "fps100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1 ); } + #[test] + fn test_static_switch_host_accepts_primary_without_nmxt_override() { + let toml_content = r#" +[endpoint_sources.carbide_api] +enabled = false + +[[endpoint_sources.static_bmc_endpoints]] +ip = "10.0.1.1" +mac = "11:22:33:44:55:66" +username = "admin" +password = "pass" +switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SW-001", endpoint_role = "host", is_primary = true } +"#; + + let config: Config = Figment::new() + .merge(Serialized::defaults(Config::default())) + .merge(Toml::string(toml_content)) + .extract() + .expect("static switch host config should parse"); + + let switch = config.endpoint_sources.static_bmc_endpoints[0] + .switch + .as_ref() + .expect("switch metadata"); + + assert_eq!(switch.endpoint_role, StaticSwitchEndpointRole::Host); + assert!(switch.is_primary); + assert_eq!(switch.nmxt_enabled, None); + } + + #[test] + fn test_static_switch_host_accepts_nmxt_override() { + let toml_content = r#" +[endpoint_sources.carbide_api] +enabled = false + +[[endpoint_sources.static_bmc_endpoints]] +ip = "10.0.1.2" +mac = "11:22:33:44:55:77" +username = "admin" +password = "pass" +switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SW-002", endpoint_role = "host", is_primary = false, nmxt_enabled = true } +"#; + + let config: Config = Figment::new() + .merge(Serialized::defaults(Config::default())) + .merge(Toml::string(toml_content)) + .extract() + .expect("static switch host config should parse"); + + let switch = config.endpoint_sources.static_bmc_endpoints[0] + .switch + .as_ref() + .expect("switch 
metadata"); + + assert_eq!(switch.endpoint_role, StaticSwitchEndpointRole::Host); + assert!(!switch.is_primary); + assert_eq!(switch.nmxt_enabled, Some(true)); + } + #[test] fn test_static_machine_endpoint_accepts_placement_and_nvlink_metadata() { let toml_content = r#" @@ -1492,7 +1552,7 @@ switch = { serial = "SN-SW-001" } .extract() .expect("could not parse config toml file"); - assert_eq!(config.endpoint_sources.static_bmc_endpoints.len(), 3); + assert_eq!(config.endpoint_sources.static_bmc_endpoints.len(), 4); assert!( config.endpoint_sources.static_bmc_endpoints[0] .switch @@ -1521,17 +1581,38 @@ switch = { serial = "SN-SW-001" } .switch .as_ref() .and_then(|switch| switch.serial.as_deref()), - Some("SN-SWITCH-001") + Some("SN-SWITCH-BMC-001") + ); + assert_eq!( + config.endpoint_sources.static_bmc_endpoints[1] + .switch + .as_ref() + .map(|switch| switch.endpoint_role), + Some(StaticSwitchEndpointRole::Bmc) + ); + assert_eq!( + config.endpoint_sources.static_bmc_endpoints[2] + .switch + .as_ref() + .and_then(|switch| switch.serial.as_deref()), + Some("SN-SWITCH-HOST-001") ); assert_eq!( config.endpoint_sources.static_bmc_endpoints[2] + .switch + .as_ref() + .map(|switch| switch.endpoint_role), + Some(StaticSwitchEndpointRole::Host) + ); + assert_eq!( + config.endpoint_sources.static_bmc_endpoints[3] .power_shelf .as_ref() .and_then(|power_shelf| power_shelf.id.as_deref()), Some("fps100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0") ); assert_eq!( - config.endpoint_sources.static_bmc_endpoints[2] + config.endpoint_sources.static_bmc_endpoints[3] .power_shelf .as_ref() .and_then(|power_shelf| power_shelf.serial.as_deref()), diff --git a/crates/health/src/discovery/iteration.rs b/crates/health/src/discovery/iteration.rs index 5eecd13878..4bb1407f5c 100644 --- a/crates/health/src/discovery/iteration.rs +++ b/crates/health/src/discovery/iteration.rs @@ -100,7 +100,9 @@ mod tests { use mac_address::MacAddress; use super::*; - use 
crate::endpoint::{BmcAddr, BmcCredentials, EndpointMetadata, SwitchData}; + use crate::endpoint::{ + BmcAddr, BmcCredentials, EndpointMetadata, SwitchData, SwitchEndpointRole, + }; fn endpoint(mac: MacAddress, switch: bool) -> Arc { Arc::new(BmcEndpoint::with_fixed_credentials( @@ -119,6 +121,9 @@ mod tests { serial: format!("serial-{mac}"), slot_number: None, tray_index: None, + endpoint_role: SwitchEndpointRole::Host, + is_primary: false, + nmxt_enabled: false, })) } else { None diff --git a/crates/health/src/discovery/spawn.rs b/crates/health/src/discovery/spawn.rs index d33a9e0753..d10f06ccd3 100644 --- a/crates/health/src/discovery/spawn.rs +++ b/crates/health/src/discovery/spawn.rs @@ -388,7 +388,9 @@ mod tests { use super::*; use crate::config::{Config, Configurable}; - use crate::endpoint::{BmcAddr, BmcCredentials, EndpointMetadata, MachineData, SwitchData}; + use crate::endpoint::{ + BmcAddr, BmcCredentials, EndpointMetadata, MachineData, SwitchData, SwitchEndpointRole, + }; use crate::limiter::{NoopLimiter, RateLimiter}; use crate::metrics::MetricsManager; use crate::sink::{CollectorEvent, EventContext}; @@ -431,15 +433,27 @@ mod tests { )) } - fn switch_metadata() -> EndpointMetadata { + fn switch_metadata_with_role( + endpoint_role: SwitchEndpointRole, + is_primary: bool, + nmxt_enabled: bool, + serial: &str, + ) -> EndpointMetadata { EndpointMetadata::Switch(SwitchData { id: None, - serial: "switch-serial-1".to_string(), + serial: serial.to_string(), slot_number: None, tray_index: None, + endpoint_role, + is_primary, + nmxt_enabled, }) } + fn switch_metadata() -> EndpointMetadata { + switch_metadata_with_role(SwitchEndpointRole::Host, false, false, "switch-serial-1") + } + fn machine_metadata() -> EndpointMetadata { EndpointMetadata::Machine(MachineData { machine_id: "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0" @@ -508,6 +522,128 @@ mod tests { assert_eq!(ctx.collectors.len(CollectorKind::LeakDetector), 0); } + #[tokio::test] + async 
fn test_switch_bmc_endpoint_starts_redfish_but_not_switch_host_collectors() { + let mut config = Config::default(); + config.collectors.sensors = Configurable::Enabled(Default::default()); + config.collectors.logs = Configurable::Disabled; + config.collectors.firmware = Configurable::Disabled; + config.collectors.leak_detector = Configurable::Disabled; + config.collectors.nmxt = Configurable::Enabled(Default::default()); + config.collectors.nvue = Configurable::Enabled(Default::default()); + + let mut ctx = context_with_config(config, "test_switch_bmc_redfish_only"); + let endpoint = test_endpoint( + Ipv4Addr::new(10, 0, 0, 8), + "55:66:77:88:99:bb", + Some(switch_metadata_with_role( + SwitchEndpointRole::Bmc, + true, + false, + "switch-bmc", + )), + ); + + spawn_collectors_for_endpoint(&mut ctx, &endpoint, None, "test_switch_bmc_redfish_only") + .await + .expect("spawn should succeed"); + + assert_eq!(ctx.collectors.len(CollectorKind::Sensor), 1); + assert_eq!(ctx.collectors.len(CollectorKind::Nmxt), 0); + assert_eq!(ctx.collectors.len(CollectorKind::NvueRest), 0); + } + + #[tokio::test] + async fn test_switch_host_primary_starts_nmxt_and_nvue_rest_when_globally_enabled() { + let mut config = Config::default(); + config.collectors.sensors = Configurable::Disabled; + config.collectors.logs = Configurable::Disabled; + config.collectors.firmware = Configurable::Disabled; + config.collectors.leak_detector = Configurable::Disabled; + config.collectors.nmxt = Configurable::Enabled(Default::default()); + config.collectors.nvue = Configurable::Enabled(Default::default()); + + let mut ctx = context_with_config(config, "test_switch_host_nmxt_nvue_enabled"); + let endpoint = test_endpoint( + Ipv4Addr::new(10, 0, 0, 9), + "55:66:77:88:99:cc", + Some(switch_metadata_with_role( + SwitchEndpointRole::Host, + true, + true, + "switch-host", + )), + ); + + spawn_collectors_for_endpoint(&mut ctx, &endpoint, None, "test") + .await + .expect("spawn should succeed"); + + 
assert_eq!(ctx.collectors.len(CollectorKind::Sensor), 0); + assert_eq!(ctx.collectors.len(CollectorKind::Nmxt), 1); + assert_eq!(ctx.collectors.len(CollectorKind::NvueRest), 1); + } + + #[tokio::test] + async fn test_switch_host_policy_gates_nmxt_but_not_nvue_rest() { + let mut config = Config::default(); + config.collectors.sensors = Configurable::Disabled; + config.collectors.logs = Configurable::Disabled; + config.collectors.firmware = Configurable::Disabled; + config.collectors.leak_detector = Configurable::Disabled; + config.collectors.nmxt = Configurable::Enabled(Default::default()); + config.collectors.nvue = Configurable::Enabled(Default::default()); + + let mut ctx = context_with_config(config, "test_switch_host_nmxt_endpoint_disabled"); + let endpoint = test_endpoint( + Ipv4Addr::new(10, 0, 0, 10), + "55:66:77:88:99:dd", + Some(switch_metadata_with_role( + SwitchEndpointRole::Host, + false, + false, + "switch-host", + )), + ); + + spawn_collectors_for_endpoint(&mut ctx, &endpoint, None, "test") + .await + .expect("spawn should succeed"); + + assert_eq!(ctx.collectors.len(CollectorKind::Nmxt), 0); + assert_eq!(ctx.collectors.len(CollectorKind::NvueRest), 1); + } + + #[tokio::test] + async fn test_switch_host_does_not_start_host_collectors_when_globally_disabled() { + let mut config = Config::default(); + config.collectors.sensors = Configurable::Disabled; + config.collectors.logs = Configurable::Disabled; + config.collectors.firmware = Configurable::Disabled; + config.collectors.leak_detector = Configurable::Disabled; + config.collectors.nmxt = Configurable::Disabled; + config.collectors.nvue = Configurable::Disabled; + + let mut ctx = context_with_config(config, "test_switch_host_collectors_global_disabled"); + let endpoint = test_endpoint( + Ipv4Addr::new(10, 0, 0, 11), + "55:66:77:88:99:ee", + Some(switch_metadata_with_role( + SwitchEndpointRole::Host, + true, + true, + "switch-host", + )), + ); + + spawn_collectors_for_endpoint(&mut ctx, &endpoint, 
None, "test") + .await + .expect("spawn should succeed"); + + assert_eq!(ctx.collectors.len(CollectorKind::Nmxt), 0); + assert_eq!(ctx.collectors.len(CollectorKind::NvueRest), 0); + } + #[tokio::test] async fn test_machine_endpoint_still_starts_sse_logs_collector() { let mut config = Config::default(); diff --git a/crates/health/src/endpoint/mod.rs b/crates/health/src/endpoint/mod.rs index b44a985d45..081caef8d8 100644 --- a/crates/health/src/endpoint/mod.rs +++ b/crates/health/src/endpoint/mod.rs @@ -186,6 +186,9 @@ mod tests { serial: Some("SN-001".to_string()), slot_number: Some(7), tray_index: Some(3), + endpoint_role: crate::config::StaticSwitchEndpointRole::Host, + is_primary: true, + nmxt_enabled: None, }), rack_id: None, }]; @@ -200,6 +203,9 @@ mod tests { assert_eq!(s.serial, "SN-001"); assert_eq!(s.slot_number, Some(7)); assert_eq!(s.tray_index, Some(3)); + assert_eq!(s.endpoint_role, SwitchEndpointRole::Host); + assert!(s.is_primary); + assert!(s.nmxt_enabled); } other => panic!("expected Switch metadata, got {other:?}"), } diff --git a/crates/health/src/otlp/convert.rs b/crates/health/src/otlp/convert.rs index fab1b521a6..168c453862 100644 --- a/crates/health/src/otlp/convert.rs +++ b/crates/health/src/otlp/convert.rs @@ -204,7 +204,7 @@ mod tests { use mac_address::MacAddress; use super::*; - use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData, SwitchData}; + use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData, SwitchData, SwitchEndpointRole}; use crate::sink::{ Classification, HealthReport, HealthReportAlert, LogRecord, Probe, ReportSource, }; @@ -302,6 +302,9 @@ mod tests { serial: "SN-SWITCH-001".to_string(), slot_number: Some(7), tray_index: Some(3), + endpoint_role: SwitchEndpointRole::Host, + is_primary: false, + nmxt_enabled: false, })), rack_id: None, }; diff --git a/crates/health/src/sink/prometheus.rs b/crates/health/src/sink/prometheus.rs index e1c70aae6f..e36c1e472f 100644 --- a/crates/health/src/sink/prometheus.rs +++ 
b/crates/health/src/sink/prometheus.rs @@ -241,7 +241,7 @@ mod tests { use mac_address::MacAddress; use super::*; - use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData, SwitchData}; + use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData, SwitchData, SwitchEndpointRole}; fn test_switch_id(label: &str) -> SwitchId { let mut hash = [0u8; 32]; @@ -309,6 +309,9 @@ mod tests { serial: "SN-SWITCH-001".to_string(), slot_number: Some(7), tray_index: Some(3), + endpoint_role: SwitchEndpointRole::Host, + is_primary: false, + nmxt_enabled: false, })), rack_id: None, }; From 4f8e492b3301d7186de80790c86364d3d89f66bb Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Wed, 20 May 2026 14:30:35 +0200 Subject: [PATCH 23/30] fix(health): remove unnecessary helpers Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/src/api_client.rs | 11 ++++------- crates/health/src/discovery/spawn.rs | 17 ++--------------- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/crates/health/src/api_client.rs b/crates/health/src/api_client.rs index d8f5c75161..e6f1d08fd3 100644 --- a/crates/health/src/api_client.rs +++ b/crates/health/src/api_client.rs @@ -200,14 +200,10 @@ impl ApiClientWrapper { match self.client.find_switches(switch_request).await { Ok(response) => { let switches = response.switches; - let switch_ids = switches - .iter() - .filter_map(|switch| switch.id) - .collect::>(); - let switches_by_id = switches + let switches_by_id: HashMap<_, _> = switches .iter() .filter_map(|switch| switch.id.map(|id| (id, switch))) - .collect::>(); + .collect(); let mut endpoints = Vec::new(); @@ -222,7 +218,8 @@ impl ApiClientWrapper { } } - if !switch_ids.is_empty() { + if !switches_by_id.is_empty() { + let switch_ids = switches_by_id.keys().copied().collect(); match self .client .find_switch_host_endpoints(rpc::forge::SwitchesByIdsRequest { switch_ids }) diff --git a/crates/health/src/discovery/spawn.rs 
b/crates/health/src/discovery/spawn.rs index d10f06ccd3..e50f2fcb88 100644 --- a/crates/health/src/discovery/spawn.rs +++ b/crates/health/src/discovery/spawn.rs @@ -51,18 +51,6 @@ pub(super) async fn spawn_collectors_for_endpoint( } } -fn is_switch_host_endpoint(endpoint: &BmcEndpoint) -> bool { - endpoint - .switch_data() - .is_some_and(|switch| switch.endpoint_role == SwitchEndpointRole::Host) -} - -fn switch_host_nmxt_enabled(endpoint: &BmcEndpoint) -> bool { - endpoint.switch_data().is_some_and(|switch| { - switch.endpoint_role == SwitchEndpointRole::Host && switch.nmxt_enabled - }) -} - fn spawn_generic_redfish_collectors( ctx: &mut DiscoveryLoopContext, endpoint: &Arc, @@ -291,7 +279,7 @@ fn spawn_switch_host_collectors( let key = endpoint.key(); let endpoint_arc = endpoint.clone(); - if switch_host_nmxt_enabled(endpoint) + if endpoint.switch_data().is_some_and(|switch| switch.nmxt_enabled) && let Configurable::Enabled(nmxt_cfg) = &ctx.nmxt_config && !ctx.collectors.contains(CollectorKind::Nmxt, &key) { @@ -333,8 +321,7 @@ fn spawn_switch_host_collectors( } } - if is_switch_host_endpoint(endpoint) - && let Configurable::Enabled(nvue_cfg) = &ctx.nvue_config + if let Configurable::Enabled(nvue_cfg) = &ctx.nvue_config && let Configurable::Enabled(rest_cfg) = &nvue_cfg.rest && !ctx.collectors.contains(CollectorKind::NvueRest, &key) { From eb11bb4652e707648368a61b573c1a175fa33ad9 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Wed, 20 May 2026 15:57:48 +0200 Subject: [PATCH 24/30] fix(api): use proper errors when bmc/switch host credentials are not found Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/api/src/handlers/credential.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/crates/api/src/handlers/credential.rs b/crates/api/src/handlers/credential.rs index 140fb6eae2..eb956237ec 100644 --- a/crates/api/src/handlers/credential.rs +++ 
b/crates/api/src/handlers/credential.rs @@ -409,7 +409,10 @@ pub(crate) async fn get_bmc_credentals( }) .await .map_err(|e| CarbideError::internal(e.to_string()))? - .ok_or_else(|| CarbideError::internal("missing credentials".to_string()))?; + .ok_or_else(|| CarbideError::NotFoundError { + kind: "bmc_root_credentials", + id: req.mac_addr.clone(), + })?; let (username, password) = match credentials { Credentials::UsernamePassword { username, password } => (username, password), @@ -442,7 +445,10 @@ pub(crate) async fn get_switch_nvos_credentials( .get_credentials(&CredentialKey::SwitchNvosAdmin { bmc_mac_address }) .await .map_err(|e| CarbideError::internal(e.to_string()))? - .ok_or_else(|| CarbideError::internal("missing credentials".to_string()))?; + .ok_or_else(|| CarbideError::NotFoundError { + kind: "switch_nvos_credentials", + id: req.bmc_mac_addr.clone(), + })?; let Credentials::UsernamePassword { username, password } = credentials; From 4099f6204ad41eb59a5047623f08a3c8e00100e4 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Wed, 20 May 2026 21:18:25 +0200 Subject: [PATCH 25/30] fix(health): lint-police --- crates/api/src/tests/credential.rs | 3 ++- crates/health/src/discovery/spawn.rs | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/crates/api/src/tests/credential.rs b/crates/api/src/tests/credential.rs index f9d25a72d1..1d69c312ce 100644 --- a/crates/api/src/tests/credential.rs +++ b/crates/api/src/tests/credential.rs @@ -16,7 +16,8 @@ */ use forge_secrets::credentials::{ - BgpCredentialType, CredentialKey, CredentialReader, CredentialType, Credentials, CredentialWriter, + BgpCredentialType, CredentialKey, CredentialReader, CredentialType, CredentialWriter, + Credentials, }; use rpc::forge::forge_server::Forge; use rpc::forge::{ diff --git a/crates/health/src/discovery/spawn.rs b/crates/health/src/discovery/spawn.rs index e50f2fcb88..e86401c8d6 100644 --- a/crates/health/src/discovery/spawn.rs +++ 
b/crates/health/src/discovery/spawn.rs @@ -279,7 +279,9 @@ fn spawn_switch_host_collectors( let key = endpoint.key(); let endpoint_arc = endpoint.clone(); - if endpoint.switch_data().is_some_and(|switch| switch.nmxt_enabled) + if endpoint + .switch_data() + .is_some_and(|switch| switch.nmxt_enabled) && let Configurable::Enabled(nmxt_cfg) = &ctx.nmxt_config && !ctx.collectors.contains(CollectorKind::Nmxt, &key) { From 804fd0f3086c5500595a54f0de51f9c5c3048064 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Thu, 21 May 2026 02:26:12 +0200 Subject: [PATCH 26/30] fix(api, health): reshape Switch API surface with switch bmc and nvos info in one endpoint. Remove dead code --- crates/api-db/src/switch.rs | 61 +------- crates/api-model/src/rpc_conv/switch.rs | 1 + crates/api/src/api.rs | 7 - crates/api/src/auth/internal_rbac_rules.rs | 4 - crates/api/src/handlers/credential.rs | 34 ++++- crates/api/src/handlers/switch.rs | 155 +++++++++------------ crates/api/src/tests/credential.rs | 10 +- crates/api/src/tests/switch_find.rs | 60 ++++++-- crates/health/src/api_client.rs | 79 +++-------- crates/rpc/build.rs | 8 -- crates/rpc/proto/forge.proto | 17 +-- 11 files changed, 180 insertions(+), 256 deletions(-) diff --git a/crates/api-db/src/switch.rs b/crates/api-db/src/switch.rs index 40c1fcb638..9b2510279a 100644 --- a/crates/api-db/src/switch.rs +++ b/crates/api-db/src/switch.rs @@ -22,6 +22,7 @@ use carbide_uuid::switch::SwitchId; use chrono::prelude::*; use config_version::{ConfigVersion, Versioned}; use health_report::{HealthReport, HealthReportApplyMode}; +use mac_address::MacAddress; use model::controller_outcome::PersistentStateHandlerOutcome; use model::metadata::Metadata; use model::rack::RackFirmwareUpgradeStatus; @@ -459,34 +460,6 @@ pub async fn update(switch: &Switch, txn: &mut PgConnection) -> Result DatabaseResult> { - let sql = r#" - SELECT - es.serial_number, - es.bmc_mac_address, - mia.address as ip_address - FROM 
expected_switches es - JOIN machine_interfaces mi ON mi.mac_address = es.bmc_mac_address - JOIN machine_interface_addresses mia ON mia.interface_id = mi.id - JOIN network_segments ns ON ns.id = mi.segment_id - WHERE ns.network_segment_type = 'underlay' - "#; - - sqlx::query_as(sql) - .fetch_all(txn) - .await - .map_err(|err| DatabaseError::new("list_switch_bmc_info", err)) -} - /// Resolve SwitchIds to BMC IPs via the FK path: /// switches.bmc_mac_address -> expected_switches.bmc_mac_address /// -> machine_interfaces -> machine_interface_addresses (underlay) -> IP @@ -611,38 +584,6 @@ pub async fn update_metadata( } } -#[derive(Debug, sqlx::FromRow)] -pub struct SwitchBmcRow { - pub switch_id: SwitchId, - pub bmc_mac: MacAddress, - pub bmc_ip: IpAddr, -} - -/// Resolve SwitchIds to BMC MAC + IP via machine_interfaces. -pub async fn find_bmc_info_by_switch_ids( - db: impl crate::db_read::DbReader<'_>, - switch_ids: &[SwitchId], -) -> DatabaseResult> { - let sql = r#" - SELECT DISTINCT ON (mi.switch_id) - mi.switch_id, - mi.mac_address AS bmc_mac, - mia.address AS bmc_ip - FROM machine_interfaces mi - JOIN machine_interface_addresses mia ON mia.interface_id = mi.id - JOIN network_segments ns ON ns.id = mi.segment_id - WHERE mi.switch_id = ANY($1) - AND ns.network_segment_type = 'underlay' - ORDER BY mi.switch_id - "#; - - sqlx::query_as(sql) - .bind(switch_ids) - .fetch_all(db) - .await - .map_err(|err| DatabaseError::new("switch::find_bmc_info_by_switch_ids", err)) -} - /// A switch resolved by its BMC MAC address, along with the rack it belongs /// to. Used by the Component Manager state controller wrapper to build a /// rack-level `MaintenanceScope` for the switches it's been asked to act on. 
diff --git a/crates/api-model/src/rpc_conv/switch.rs b/crates/api-model/src/rpc_conv/switch.rs index 5a2c1fd1f5..8dde250c2c 100644 --- a/crates/api-model/src/rpc_conv/switch.rs +++ b/crates/api-model/src/rpc_conv/switch.rs @@ -187,6 +187,7 @@ impl TryFrom for rpc::Switch { deleted, controller_state, bmc_info: None, + nvos_info: None, state_version, metadata: Some(src.metadata.into()), version: src.version.version_string(), diff --git a/crates/api/src/api.rs b/crates/api/src/api.rs index 48146aaec8..5e6278fdc8 100644 --- a/crates/api/src/api.rs +++ b/crates/api/src/api.rs @@ -366,13 +366,6 @@ impl Forge for Api { crate::handlers::switch::find_by_ids(self, request).await } - async fn find_switch_host_endpoints( - &self, - request: Request, - ) -> Result, Status> { - crate::handlers::switch::find_host_endpoints(self, request).await - } - async fn delete_switch( &self, request: Request, diff --git a/crates/api/src/auth/internal_rbac_rules.rs b/crates/api/src/auth/internal_rbac_rules.rs index 152a1e15f8..bd1a7c78c2 100644 --- a/crates/api/src/auth/internal_rbac_rules.rs +++ b/crates/api/src/auth/internal_rbac_rules.rs @@ -723,10 +723,6 @@ impl InternalRBACRules { "FindSwitchesByIds", vec![ForgeAdminCLI, Machineatron, Flow, Health], ); - x.perm( - "FindSwitchHostEndpoints", - vec![ForgeAdminCLI, Machineatron, Flow, Health], - ); x.perm("CreateSwitch", vec![ForgeAdminCLI, Machineatron]); x.perm("DeleteSwitch", vec![ForgeAdminCLI, Machineatron]); x.perm("AddExpectedSwitch", vec![ForgeAdminCLI, Machineatron, Flow]); diff --git a/crates/api/src/handlers/credential.rs b/crates/api/src/handlers/credential.rs index eb956237ec..79c7085686 100644 --- a/crates/api/src/handlers/credential.rs +++ b/crates/api/src/handlers/credential.rs @@ -434,11 +434,33 @@ pub(crate) async fn get_switch_nvos_credentials( crate::api::log_request_data(&request); let req = request.into_inner(); - - let bmc_mac_address: mac_address::MacAddress = req - .bmc_mac_addr - .parse() - 
.map_err(CarbideError::MacAddressParseError)?; + let switch_id = req + .switch_id + .ok_or_else(|| CarbideError::InvalidArgument("switch_id is required".to_string()))?; + + let bmc_mac_address = { + let mut txn = api.txn_begin().await?; + let switches = db::switch::find_by( + &mut txn, + db::ObjectColumnFilter::One(db::switch::IdColumn, &switch_id), + ) + .await?; + let _ = txn.rollback().await; + + let switch = switches + .first() + .ok_or_else(|| CarbideError::NotFoundError { + kind: "switch", + id: switch_id.to_string(), + })?; + + switch + .bmc_mac_address + .ok_or_else(|| CarbideError::NotFoundError { + kind: "switch_bmc_mac_address", + id: switch_id.to_string(), + })? + }; let credentials = api .credential_manager @@ -447,7 +469,7 @@ pub(crate) async fn get_switch_nvos_credentials( .map_err(|e| CarbideError::internal(e.to_string()))? .ok_or_else(|| CarbideError::NotFoundError { kind: "switch_nvos_credentials", - id: req.bmc_mac_addr.clone(), + id: switch_id.to_string(), })?; let Credentials::UsernamePassword { username, password } = credentials; diff --git a/crates/api/src/handlers/switch.rs b/crates/api/src/handlers/switch.rs index c8479f1486..c15a3ef3a8 100644 --- a/crates/api/src/handlers/switch.rs +++ b/crates/api/src/handlers/switch.rs @@ -68,27 +68,17 @@ pub async fn find_switch( })? 
}; - let bmc_info_map: std::collections::HashMap = { - let rows = db_switch::list_switch_bmc_info(&mut txn) + let switch_ids: Vec<_> = switch_list.iter().map(|switch| switch.id).collect(); + let endpoint_info_map: std::collections::HashMap<_, _> = if switch_ids.is_empty() { + std::collections::HashMap::new() + } else { + db_switch::find_switch_endpoints_by_ids(&mut *txn, &switch_ids) .await .map_err(|e| CarbideError::Internal { - message: format!("Failed to get switch BMC info: {}", e), - })?; - - rows.into_iter() - .map(|row| { - ( - row.bmc_mac_address.to_string(), - rpc::BmcInfo { - ip: Some(row.ip_address.to_string()), - mac: Some(row.bmc_mac_address.to_string()), - version: None, - firmware_version: None, - port: None, - machine_interface_id: None, - }, - ) - }) + message: format!("Failed to get switch endpoint info: {}", e), + })? + .into_iter() + .map(|row| (row.switch_id, row)) .collect() }; @@ -99,13 +89,33 @@ pub async fn find_switch( let switches: Vec = switch_list .into_iter() .map(|s| { - let bmc_info = s - .bmc_mac_address - .as_ref() - .and_then(|mac| bmc_info_map.get(&mac.to_string()).cloned()); + let endpoint_info = endpoint_info_map.get(&s.id); rpc::Switch::try_from(s).map(|mut rpc_switch| { - rpc_switch.bmc_info = bmc_info; + rpc_switch.bmc_info = endpoint_info.map(|row| rpc::BmcInfo { + ip: Some(row.bmc_ip.to_string()), + mac: Some(row.bmc_mac.to_string()), + version: None, + firmware_version: None, + port: None, + machine_interface_id: None, + }); + rpc_switch.nvos_info = endpoint_info.and_then(|row| { + let (Some(nvos_mac), Some(nvos_ip)) = + (row.nvos_mac.as_ref(), row.nvos_ip.as_ref()) + else { + return None; + }; + + Some(rpc::BmcInfo { + ip: Some(nvos_ip.to_string()), + mac: Some(nvos_mac.to_string()), + version: None, + firmware_version: None, + port: None, + machine_interface_id: None, + }) + }); rpc_switch }) }) @@ -158,40 +168,48 @@ pub async fn find_by_ids( ) .await?; - let bmc_info_map: std::collections::HashMap<_, _> = { - let rows 
= db_switch::find_bmc_info_by_switch_ids(&mut txn, &switch_ids) + let endpoint_info_map: std::collections::HashMap<_, _> = + db_switch::find_switch_endpoints_by_ids(&mut txn, &switch_ids) .await .map_err(|e| CarbideError::Internal { - message: format!("Failed to get switch BMC info: {}", e), - })?; - - rows.into_iter() - .map(|row| { - ( - row.switch_id, - rpc::BmcInfo { - ip: Some(row.bmc_ip.to_string()), - mac: Some(row.bmc_mac.to_string()), - version: None, - firmware_version: None, - port: None, - machine_interface_id: None, - }, - ) - }) - .collect() - }; + message: format!("Failed to get switch endpoint info: {}", e), + })? + .into_iter() + .map(|row| (row.switch_id, row)) + .collect(); let _ = txn.rollback().await; let switches: Vec = switch_list .into_iter() .map(|s| { - let id = s.id; - let bmc_info = bmc_info_map.get(&id).cloned(); + let endpoint_info = endpoint_info_map.get(&s.id); rpc::Switch::try_from(s).map(|mut rpc_switch| { - rpc_switch.bmc_info = bmc_info; + rpc_switch.bmc_info = endpoint_info.map(|row| rpc::BmcInfo { + ip: Some(row.bmc_ip.to_string()), + mac: Some(row.bmc_mac.to_string()), + version: None, + firmware_version: None, + port: None, + machine_interface_id: None, + }); + rpc_switch.nvos_info = endpoint_info.and_then(|row| { + let (Some(nvos_mac), Some(nvos_ip)) = + (row.nvos_mac.as_ref(), row.nvos_ip.as_ref()) + else { + return None; + }; + + Some(rpc::BmcInfo { + ip: Some(nvos_ip.to_string()), + mac: Some(nvos_mac.to_string()), + version: None, + firmware_version: None, + port: None, + machine_interface_id: None, + }) + }); rpc_switch }) }) @@ -203,47 +221,6 @@ pub async fn find_by_ids( Ok(Response::new(rpc::SwitchList { switches })) } -pub async fn find_host_endpoints( - api: &Api, - request: Request, -) -> Result, Status> { - log_request_data(&request); - - let switch_ids = request.into_inner().switch_ids; - - let max_find_by_ids = api.runtime_config.max_find_by_ids as usize; - if switch_ids.len() > max_find_by_ids { - return 
Err(CarbideError::InvalidArgument(format!( - "no more than {max_find_by_ids} IDs can be accepted" - )) - .into()); - } else if switch_ids.is_empty() { - return Err( - CarbideError::InvalidArgument("at least one ID must be provided".to_string()).into(), - ); - } - - let rows = db_switch::find_switch_endpoints_by_ids(&mut api.db_reader(), &switch_ids).await?; - - let endpoints = rows - .into_iter() - .filter_map(|row| { - let (Some(host_mac), Some(host_ip)) = (row.nvos_mac, row.nvos_ip) else { - return None; - }; - - Some(rpc::SwitchHostEndpoint { - switch_id: Some(row.switch_id), - bmc_mac: row.bmc_mac.to_string(), - host_mac: host_mac.to_string(), - host_ip: host_ip.to_string(), - }) - }) - .collect(); - - Ok(Response::new(rpc::SwitchHostEndpointList { endpoints })) -} - pub async fn find_switch_state_histories( api: &Api, request: Request, diff --git a/crates/api/src/tests/credential.rs b/crates/api/src/tests/credential.rs index 1d69c312ce..ad45c83168 100644 --- a/crates/api/src/tests/credential.rs +++ b/crates/api/src/tests/credential.rs @@ -27,6 +27,7 @@ use tonic::Code; use crate::handlers::credential::MAX_BGP_PASSWORD_LENGTH; use crate::tests::common::api_fixtures::create_test_env; +use crate::tests::common::api_fixtures::site_explorer::new_switch; #[crate::sqlx_test] async fn test_create_host_uefi_credential_when_missing(pool: sqlx::PgPool) { @@ -248,7 +249,12 @@ async fn test_create_bgp_credential_validates_max_password_length(pool: sqlx::Pg #[crate::sqlx_test] async fn test_get_switch_nvos_credentials(pool: sqlx::PgPool) -> eyre::Result<()> { let env = create_test_env(pool).await; - let bmc_mac_address = "00:11:22:33:44:55".parse()?; + let switch_id = new_switch(&env, Some("Switch1".to_string()), None).await?; + let bmc_mac_address = db::switch::find_switch_endpoints_by_ids(&env.pool, &[switch_id]) + .await? 
+ .first() + .expect("switch endpoint row") + .bmc_mac; env.test_credential_manager .set_credentials( @@ -264,7 +270,7 @@ async fn test_get_switch_nvos_credentials(pool: sqlx::PgPool) -> eyre::Result<() .api .get_switch_nvos_credentials(tonic::Request::new( rpc::forge::GetSwitchNvosCredentialsRequest { - bmc_mac_addr: bmc_mac_address.to_string(), + switch_id: Some(switch_id), }, )) .await? diff --git a/crates/api/src/tests/switch_find.rs b/crates/api/src/tests/switch_find.rs index aa3c574677..73de7cd3b4 100644 --- a/crates/api/src/tests/switch_find.rs +++ b/crates/api/src/tests/switch_find.rs @@ -264,7 +264,7 @@ async fn test_find_switches_by_ids_response_fields( } #[crate::sqlx_test] -async fn test_find_switch_host_endpoints_returns_resolved_nvos_host( +async fn test_find_switches_by_ids_includes_resolved_nvos_info( pool: sqlx::PgPool, ) -> Result<(), Box> { let env = create_test_env(pool).await; @@ -277,23 +277,62 @@ async fn test_find_switch_host_endpoints_returns_resolved_nvos_host( let response = env .api - .find_switch_host_endpoints(tonic::Request::new(rpc::forge::SwitchesByIdsRequest { + .find_switches_by_ids(tonic::Request::new(rpc::forge::SwitchesByIdsRequest { switch_ids: vec![switch_id], })) .await? 
.into_inner(); - assert_eq!(response.endpoints.len(), 1); - assert_eq!(response.endpoints[0].switch_id, Some(switch_id)); - assert_eq!(response.endpoints[0].bmc_mac, expected.bmc_mac.to_string()); - assert_eq!(response.endpoints[0].host_mac, host_mac.to_string()); - assert_eq!(response.endpoints[0].host_ip, host_ip.to_string()); + assert_eq!(response.switches.len(), 1); + let switch = &response.switches[0]; + assert_eq!(switch.id, Some(switch_id)); + assert_eq!( + switch.bmc_info.as_ref().and_then(|info| info.mac.clone()), + Some(expected.bmc_mac.to_string()) + ); + assert_eq!( + switch.bmc_info.as_ref().and_then(|info| info.ip.clone()), + Some(expected.bmc_ip.to_string()) + ); + + let nvos_info = switch.nvos_info.as_ref().expect("nvos info"); + assert_eq!(nvos_info.mac, Some(host_mac.to_string())); + assert_eq!(nvos_info.ip, Some(host_ip.to_string())); + + Ok(()) +} + +#[crate::sqlx_test] +async fn test_find_switches_includes_resolved_nvos_info( + pool: sqlx::PgPool, +) -> Result<(), Box> { + let env = create_test_env(pool).await; + let switch_id = new_switch(&env, Some("Switch1".to_string()), None).await?; + + let mut rows = db::switch::find_switch_endpoints_by_ids(&env.pool, &[switch_id]).await?; + let expected = rows.pop().expect("switch endpoint row"); + let host_mac = expected.nvos_mac.expect("nvos mac"); + let host_ip = expected.nvos_ip.expect("nvos ip"); + + let response = env + .api + .find_switches(tonic::Request::new(rpc::forge::SwitchQuery { + name: None, + switch_id: Some(switch_id), + })) + .await? 
+ .into_inner(); + + assert_eq!(response.switches.len(), 1); + let nvos_info = response.switches[0].nvos_info.as_ref().expect("nvos info"); + assert_eq!(nvos_info.mac, Some(host_mac.to_string())); + assert_eq!(nvos_info.ip, Some(host_ip.to_string())); Ok(()) } #[crate::sqlx_test] -async fn test_find_switch_host_endpoints_skips_switch_without_nvos_host( +async fn test_find_switches_by_ids_returns_no_nvos_info_when_unresolved( pool: sqlx::PgPool, ) -> Result<(), Box> { let env = create_test_env(pool).await; @@ -309,13 +348,14 @@ async fn test_find_switch_host_endpoints_skips_switch_without_nvos_host( let response = env .api - .find_switch_host_endpoints(tonic::Request::new(rpc::forge::SwitchesByIdsRequest { + .find_switches_by_ids(tonic::Request::new(rpc::forge::SwitchesByIdsRequest { switch_ids: vec![switch_id], })) .await? .into_inner(); - assert!(response.endpoints.is_empty()); + assert_eq!(response.switches.len(), 1); + assert!(response.switches[0].nvos_info.is_none()); Ok(()) } diff --git a/crates/health/src/api_client.rs b/crates/health/src/api_client.rs index e6f1d08fd3..50e1f098b3 100644 --- a/crates/health/src/api_client.rs +++ b/crates/health/src/api_client.rs @@ -15,7 +15,6 @@ * limitations under the License. */ -use std::collections::HashMap; use std::convert::TryFrom; use std::net::IpAddr; use std::str::FromStr; @@ -50,7 +49,7 @@ struct ApiCredentialProvider { #[derive(Clone)] enum ApiCredentialKind { Bmc, - SwitchNvosAdmin { bmc_mac: MacAddress }, + SwitchNvosAdmin { switch_id: SwitchId }, } impl CredentialProvider for ApiCredentialProvider { @@ -70,9 +69,9 @@ impl CredentialProvider for ApiCredentialProvider { .await .map_err(HealthError::ApiInvocationError)? 
} - ApiCredentialKind::SwitchNvosAdmin { bmc_mac } => { + ApiCredentialKind::SwitchNvosAdmin { switch_id } => { let request = rpc::forge::GetSwitchNvosCredentialsRequest { - bmc_mac_addr: bmc_mac.to_string(), + switch_id: Some(*switch_id), }; self.client @@ -199,16 +198,10 @@ impl ApiClientWrapper { match self.client.find_switches(switch_request).await { Ok(response) => { - let switches = response.switches; - let switches_by_id: HashMap<_, _> = switches - .iter() - .filter_map(|switch| switch.id.map(|id| (id, switch))) - .collect(); - let mut endpoints = Vec::new(); - for switch in &switches { - match self.extract_switch_endpoint(switch).await { + for switch in response.switches { + match self.extract_switch_endpoint(&switch).await { Ok(endpoint) => endpoints.push(Arc::new(endpoint)), Err(error) => tracing::warn!( ?switch, @@ -216,32 +209,16 @@ impl ApiClientWrapper { "Could not add switch endpoint due to error" ), } - } - if !switches_by_id.is_empty() { - let switch_ids = switches_by_id.keys().copied().collect(); - match self - .client - .find_switch_host_endpoints(rpc::forge::SwitchesByIdsRequest { switch_ids }) - .await - { - Ok(response) => { - for host_endpoint in response.endpoints { - match self - .extract_switch_host_endpoint(&host_endpoint, &switches_by_id) - .await - { - Ok(endpoint) => endpoints.push(Arc::new(endpoint)), - Err(error) => tracing::warn!( - ?host_endpoint, - ?error, - "Could not add switch host endpoint due to error" - ), - } - } - } + match self.extract_switch_host_endpoint(&switch).await { + Ok(Some(endpoint)) => endpoints.push(Arc::new(endpoint)), + Ok(None) => {} Err(error) => { - tracing::warn!(?error, "Failed to fetch switch host endpoints") + tracing::warn!( + ?switch, + ?error, + "Could not add switch host endpoint due to error" + ); } } } @@ -355,28 +332,15 @@ impl ApiClientWrapper { async fn extract_switch_host_endpoint( &self, - host_endpoint: &rpc::forge::SwitchHostEndpoint, - switches_by_id: &HashMap, - ) -> Result { - let 
switch_id = host_endpoint.switch_id.ok_or_else(|| { + switch: &rpc::forge::Switch, + ) -> Result, HealthError> { + let Some(nvos_info) = switch.nvos_info.as_ref() else { + return Ok(None); + }; + let switch_id = switch.id.ok_or_else(|| { HealthError::GenericError("switch host endpoint missing switch ID".to_string()) })?; - let switch = *switches_by_id.get(&switch_id).ok_or_else(|| { - HealthError::GenericError( - "switch host endpoint did not match fetched switch".to_string(), - ) - })?; - let addr = BmcAddr { - ip: host_endpoint - .host_ip - .parse::() - .map_err(|error| HealthError::GenericError(error.to_string()))?, - port: None, - mac: MacAddress::from_str(&host_endpoint.host_mac) - .map_err(|error| HealthError::GenericError(error.to_string()))?, - }; - let bmc_mac = MacAddress::from_str(&host_endpoint.bmc_mac) - .map_err(|error| HealthError::GenericError(error.to_string()))?; + let addr = BmcAddr::try_from(nvos_info)?; self.endpoint_with_auth( addr, @@ -386,9 +350,10 @@ impl ApiClientWrapper { switch.is_primary, )?), switch.rack_id.clone(), - ApiCredentialKind::SwitchNvosAdmin { bmc_mac }, + ApiCredentialKind::SwitchNvosAdmin { switch_id }, ) .await + .map(Some) } async fn extract_power_shelf_endpoint( diff --git a/crates/rpc/build.rs b/crates/rpc/build.rs index a1a352ba22..48c0aea609 100644 --- a/crates/rpc/build.rs +++ b/crates/rpc/build.rs @@ -848,14 +848,6 @@ fn main() -> Result<(), Box> { "forge.GetSwitchNvosCredentialsRequest", "#[derive(serde::Serialize)]", ) - .type_attribute( - "forge.SwitchHostEndpoint", - "#[derive(serde::Serialize)]", - ) - .type_attribute( - "forge.SwitchHostEndpointList", - "#[derive(serde::Serialize)]", - ) .type_attribute( "forge.PlacementInRack", "#[derive(serde::Serialize)]", diff --git a/crates/rpc/proto/forge.proto b/crates/rpc/proto/forge.proto index 31b96d8492..a23c2df851 100644 --- a/crates/rpc/proto/forge.proto +++ b/crates/rpc/proto/forge.proto @@ -101,7 +101,6 @@ service Forge { rpc FindSwitches(SwitchQuery) returns 
(SwitchList); rpc FindSwitchIds(SwitchSearchFilter) returns (SwitchIdList); rpc FindSwitchesByIds(SwitchesByIdsRequest) returns (SwitchList); - rpc FindSwitchHostEndpoints(SwitchesByIdsRequest) returns (SwitchHostEndpointList); rpc DeleteSwitch(SwitchDeletionRequest) returns (SwitchDeletionResult); // Force deletes a Switch and optionally its associated interfaces from the database. rpc AdminForceDeleteSwitch(AdminForceDeleteSwitchRequest) returns (AdminForceDeleteSwitchResponse); @@ -2136,6 +2135,9 @@ message Switch { optional PlacementInRack placement_in_rack = 11; reserved 12, 13; bool is_primary = 14; + + // NVOS host endpoint + BmcInfo nvos_info = 15; } message SwitchList { @@ -2202,17 +2204,6 @@ message SwitchesByIdsRequest { repeated common.SwitchId switch_ids = 1; } -message SwitchHostEndpoint { - common.SwitchId switch_id = 1; - string bmc_mac = 2; - string host_mac = 3; - string host_ip = 4; -} - -message SwitchHostEndpointList { - repeated SwitchHostEndpoint endpoints = 1; -} - message ExpectedSwitch { string bmc_mac_address = 1; string bmc_username = 2; @@ -3762,7 +3753,7 @@ message GetBmcCredentialsRequest { } message GetSwitchNvosCredentialsRequest { - string bmc_mac_addr = 1; + common.SwitchId switch_id = 1; } message GetBmcCredentialsResponse { From 4ebcf2e33fcb8679b1c4a2b7b67264b148c7eb60 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Thu, 21 May 2026 05:43:39 +0200 Subject: [PATCH 27/30] fix(api): broken test --- crates/api/src/tests/switch_find.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/api/src/tests/switch_find.rs b/crates/api/src/tests/switch_find.rs index 73de7cd3b4..34b2021d4d 100644 --- a/crates/api/src/tests/switch_find.rs +++ b/crates/api/src/tests/switch_find.rs @@ -254,10 +254,10 @@ async fn test_find_switches_by_ids_response_fields( // state_version should be populated assert!(!switch.state_version.is_empty()); - // bmc_info is None when no machine_interface 
discovery data exists + // bmc_info should be populated from the seeded machine_interface discovery data assert!( - switch.bmc_info.is_none(), - "bmc_info should be None when no discovery data exists" + switch.bmc_info.is_some(), + "bmc_info should be present when discovery data exists" ); Ok(()) From 66709485f8e1056dc8b85e1c9d8713354e89210f Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Tue, 26 May 2026 23:11:03 +0200 Subject: [PATCH 28/30] fix(health): adds SwitchNvosInfo, explicit `spawn_collectors_for_endpoint` for BMCs. --- crates/api/src/handlers/switch.rs | 10 ++-------- crates/api/src/tests/switch_find.rs | 2 ++ crates/health/src/api_client.rs | 24 ++++++++++++++++++++++++ crates/health/src/discovery/spawn.rs | 13 ++++++------- crates/rpc/build.rs | 1 + crates/rpc/proto/forge.proto | 8 +++++++- 6 files changed, 42 insertions(+), 16 deletions(-) diff --git a/crates/api/src/handlers/switch.rs b/crates/api/src/handlers/switch.rs index c15a3ef3a8..4bda23fbf5 100644 --- a/crates/api/src/handlers/switch.rs +++ b/crates/api/src/handlers/switch.rs @@ -107,13 +107,10 @@ pub async fn find_switch( return None; }; - Some(rpc::BmcInfo { + Some(rpc::SwitchNvosInfo { ip: Some(nvos_ip.to_string()), mac: Some(nvos_mac.to_string()), - version: None, - firmware_version: None, port: None, - machine_interface_id: None, }) }); rpc_switch @@ -201,13 +198,10 @@ pub async fn find_by_ids( return None; }; - Some(rpc::BmcInfo { + Some(rpc::SwitchNvosInfo { ip: Some(nvos_ip.to_string()), mac: Some(nvos_mac.to_string()), - version: None, - firmware_version: None, port: None, - machine_interface_id: None, }) }); rpc_switch diff --git a/crates/api/src/tests/switch_find.rs b/crates/api/src/tests/switch_find.rs index 34b2021d4d..b25f9d9d19 100644 --- a/crates/api/src/tests/switch_find.rs +++ b/crates/api/src/tests/switch_find.rs @@ -296,6 +296,7 @@ async fn test_find_switches_by_ids_includes_resolved_nvos_info( ); let nvos_info = 
switch.nvos_info.as_ref().expect("nvos info"); + let _: &rpc::forge::SwitchNvosInfo = nvos_info; assert_eq!(nvos_info.mac, Some(host_mac.to_string())); assert_eq!(nvos_info.ip, Some(host_ip.to_string())); @@ -325,6 +326,7 @@ async fn test_find_switches_includes_resolved_nvos_info( assert_eq!(response.switches.len(), 1); let nvos_info = response.switches[0].nvos_info.as_ref().expect("nvos info"); + let _: &rpc::forge::SwitchNvosInfo = nvos_info; assert_eq!(nvos_info.mac, Some(host_mac.to_string())); assert_eq!(nvos_info.ip, Some(host_ip.to_string())); diff --git a/crates/health/src/api_client.rs b/crates/health/src/api_client.rs index 50e1f098b3..4ea2fe5607 100644 --- a/crates/health/src/api_client.rs +++ b/crates/health/src/api_client.rs @@ -532,6 +532,30 @@ impl TryFrom<&rpc::forge::BmcInfo> for BmcAddr { } } +impl TryFrom<&rpc::forge::SwitchNvosInfo> for BmcAddr { + type Error = HealthError; + + fn try_from(nvos_info: &rpc::forge::SwitchNvosInfo) -> Result { + let ip = nvos_info + .ip + .as_ref() + .ok_or_else(|| HealthError::GenericError("missing NVOS IP address".to_string()))? 
+ .parse::() + .map_err(|error| HealthError::GenericError(error.to_string()))?; + let mac = nvos_info + .mac + .as_ref() + .ok_or_else(|| HealthError::GenericError("missing NVOS MAC address".to_string())) + .and_then(|mac| { + MacAddress::from_str(mac) + .map_err(|error| HealthError::GenericError(error.to_string())) + })?; + let port = nvos_info.port.map(|port| port.try_into().unwrap_or(443)); + + Ok(Self { ip, port, mac }) + } +} + impl From for BmcCredentials { fn from(value: rpc::forge::UsernamePassword) -> Self { Self::UsernamePassword { diff --git a/crates/health/src/discovery/spawn.rs b/crates/health/src/discovery/spawn.rs index e86401c8d6..2cdcbf3a9a 100644 --- a/crates/health/src/discovery/spawn.rs +++ b/crates/health/src/discovery/spawn.rs @@ -41,13 +41,12 @@ pub(super) async fn spawn_collectors_for_endpoint( data_sink: Option>, metrics_prefix: &str, ) -> Result<(), HealthError> { - match endpoint.switch_data().map(|switch| switch.endpoint_role) { - Some(SwitchEndpointRole::Host) => { - spawn_switch_host_collectors(ctx, endpoint, data_sink, metrics_prefix) - } - Some(SwitchEndpointRole::Bmc) | None => { - spawn_generic_redfish_collectors(ctx, endpoint, data_sink, metrics_prefix) - } + let endpoint_role = endpoint.switch_data().map(|switch| switch.endpoint_role); + + if matches!(endpoint_role, Some(SwitchEndpointRole::Host)) { + spawn_switch_host_collectors(ctx, endpoint, data_sink, metrics_prefix) + } else { + spawn_generic_redfish_collectors(ctx, endpoint, data_sink, metrics_prefix) } } diff --git a/crates/rpc/build.rs b/crates/rpc/build.rs index a81131b51b..e4ce9f0dbf 100644 --- a/crates/rpc/build.rs +++ b/crates/rpc/build.rs @@ -851,6 +851,7 @@ fn main() -> Result<(), Box> { "forge.GetSwitchNvosCredentialsRequest", "#[derive(serde::Serialize)]", ) + .type_attribute("forge.SwitchNvosInfo", "#[derive(serde::Serialize)]") .type_attribute( "forge.PlacementInRack", "#[derive(serde::Serialize)]", diff --git a/crates/rpc/proto/forge.proto 
b/crates/rpc/proto/forge.proto index 575f1bdfbc..83974df044 100644 --- a/crates/rpc/proto/forge.proto +++ b/crates/rpc/proto/forge.proto @@ -2140,7 +2140,7 @@ message Switch { bool is_primary = 14; // NVOS host endpoint - BmcInfo nvos_info = 15; + SwitchNvosInfo nvos_info = 15; } message SwitchList { @@ -3323,6 +3323,12 @@ message BmcInfo { optional common.MachineInterfaceId machine_interface_id = 6; } +message SwitchNvosInfo { + optional string ip = 1; + optional string mac = 2; + optional uint32 port = 3; +} + message Machine { // Uniquely identifies a Forge machine. // The value of this field is globally unique. From b048f0357357d205cf46a40ffb2295217824c55c Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Wed, 27 May 2026 17:57:48 +0000 Subject: [PATCH 29/30] fix(health): parse nvl partition num-gpus properly (it returns a string :/) --- .../health/src/collectors/nvue/rest/client.rs | 46 ++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/crates/health/src/collectors/nvue/rest/client.rs b/crates/health/src/collectors/nvue/rest/client.rs index 5076f5abe0..b2cc7c82f5 100644 --- a/crates/health/src/collectors/nvue/rest/client.rs +++ b/crates/health/src/collectors/nvue/rest/client.rs @@ -21,6 +21,7 @@ use std::time::Duration; use reqwest::Client; use reqwest::header::ACCEPT; use serde::Deserialize; +use serde::de::Error as _; use url::Url; use crate::HealthError; @@ -228,11 +229,37 @@ pub struct ClusterApp { pub type SdnPartitionsResponse = HashMap; +fn deserialize_optional_u32_from_number_or_string<'de, D>( + deserializer: D, +) -> Result, D::Error> +where + D: serde::Deserializer<'de>, +{ + #[derive(Deserialize)] + #[serde(untagged)] + enum U32OrString { + Number(u32), + String(String), + } + + match Option::::deserialize(deserializer)? 
{ + Some(U32OrString::Number(value)) => Ok(Some(value)), + Some(U32OrString::String(value)) => value.parse::().map(Some).map_err(|error| { + D::Error::custom(format!("invalid numeric string for num-gpus: {error}")) + }), + None => Ok(None), + } +} + #[derive(Debug, Clone, Deserialize, Default)] pub struct SdnPartition { pub name: Option, pub health: Option, - #[serde(rename = "num-gpus")] + #[serde( + default, + rename = "num-gpus", + deserialize_with = "deserialize_optional_u32_from_number_or_string" + )] pub num_gpus: Option, } @@ -354,6 +381,23 @@ mod tests { assert_eq!(resp.num_gpus, Some(8)); } + #[test] + fn test_parse_sdn_partition_string_num_gpus() { + let json = r#"{ + "name": "Default Partition", + "num-gpus": "8", + "health": "unhealthy", + "resiliency-mode": "adaptive_bandwidth", + "mcast-limit": 1024, + "partition-type": "gpuuid_based" + }"#; + + let resp: SdnPartition = serde_json::from_str(json).unwrap(); + assert_eq!(resp.name.as_deref(), Some("Default Partition")); + assert_eq!(resp.health.as_deref(), Some("unhealthy")); + assert_eq!(resp.num_gpus, Some(8)); + } + #[test] fn test_parse_sdn_partitions_map() { let json = r#"{ From 4a38257a3b7367a2292171d4a0bc79abb504cc02 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Thu, 28 May 2026 03:52:23 +0200 Subject: [PATCH 30/30] fix(health): ensure collector removal like #1302 --- .../src/collectors/nvue/gnmi/subscriber.rs | 19 ++++++++-- crates/health/src/discovery/context.rs | 35 +++++++++++++++++++ 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/crates/health/src/collectors/nvue/gnmi/subscriber.rs b/crates/health/src/collectors/nvue/gnmi/subscriber.rs index da7939639d..37843e6d9d 100644 --- a/crates/health/src/collectors/nvue/gnmi/subscriber.rs +++ b/crates/health/src/collectors/nvue/gnmi/subscriber.rs @@ -36,7 +36,7 @@ use crate::collectors::runtime::{BackoffConfig, ExponentialBackoff, StreamingCon use crate::config::NvueGnmiConfig; use 
crate::endpoint::BmcEndpoint; use crate::metrics::CollectorRegistry; -use crate::sink::{DataSink, EventContext}; +use crate::sink::{CollectorEvent, DataSink, EventContext}; // gRPC ConnectivityState values for `connection_state`. 0 (UNKNOWN) is the gauge default. const IDLE: i64 = 1; @@ -214,6 +214,8 @@ pub fn spawn_gnmi_collector( let registry = collector_registry.registry(); let prefix = collector_registry.prefix().clone(); + let collector_removed_sample_context = sample_event_context.clone(); + let mut collector_removed_on_change_context = None; let sample_const_labels = HashMap::from([ ( @@ -256,10 +258,11 @@ pub fn spawn_gnmi_collector( )?; let on_change_event_context = EventContext::from_endpoint(endpoint, ON_CHANGE_STREAM_ID_SYSTEM_EVENTS); + collector_removed_on_change_context = Some(on_change_event_context.clone()); let on_change_processor = GnmiOnChangeProcessor::new( ON_CHANGE_STREAM_ID_SYSTEM_EVENTS.to_string(), on_change_row_metrics, - data_sink, + data_sink.clone(), on_change_event_context, switch_id, ); @@ -268,6 +271,7 @@ pub fn spawn_gnmi_collector( } else { None }; + let collector_removed_data_sink = data_sink; Ok(Collector::spawn_task(move |cancel_token| async move { let sample_handle = tokio::spawn(gnmi_sample_task( @@ -291,6 +295,17 @@ pub fn spawn_gnmi_collector( if let Some(handle) = on_change_handle { let _ = handle.await; } + + if let Some(data_sink) = collector_removed_data_sink.as_deref() { + data_sink.handle_event( + &collector_removed_sample_context, + &CollectorEvent::CollectorRemoved, + ); + + if let Some(event_context) = &collector_removed_on_change_context { + data_sink.handle_event(event_context, &CollectorEvent::CollectorRemoved); + } + } })) } diff --git a/crates/health/src/discovery/context.rs b/crates/health/src/discovery/context.rs index 5f4abe43f1..be7fa59ff8 100644 --- a/crates/health/src/discovery/context.rs +++ b/crates/health/src/discovery/context.rs @@ -155,6 +155,7 @@ impl CollectorState { 
.chain(self.leak_detector.keys()) .chain(self.nmxt.keys()) .chain(self.nvue_rest.keys()) + .chain(self.nvue_gnmi.keys()) .filter(|key| !active_keys.contains(*key)) .cloned() .collect() @@ -233,3 +234,37 @@ impl DiscoveryLoopContext { }) } } + +#[cfg(test)] +mod tests { + use std::borrow::Cow; + use std::collections::HashSet; + + use super::*; + use crate::collectors::Collector; + + fn noop_collector() -> Collector { + Collector::spawn_task(|_| async {}) + } + + #[tokio::test] + async fn removed_keys_includes_nvue_gnmi_collectors() { + let mut state = CollectorState::new(); + state.insert( + CollectorKind::NvueGnmi, + Cow::Borrowed("removed-gNMI-endpoint"), + noop_collector(), + ); + state.insert( + CollectorKind::NvueRest, + Cow::Borrowed("active-rest-endpoint"), + noop_collector(), + ); + + let active = HashSet::from([Cow::Borrowed("active-rest-endpoint")]); + let removed = state.removed_keys(&active); + + assert!(removed.contains(&Cow::Borrowed("removed-gNMI-endpoint"))); + assert!(!removed.contains(&Cow::Borrowed("active-rest-endpoint"))); + } +}