diff --git a/Cargo.lock b/Cargo.lock index 5ff7ef2b6d..3469771a54 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1186,11 +1186,15 @@ dependencies = [ "carbide-macros", "carbide-measured-boot", "carbide-metrics-utils", + "carbide-mqtt-common", "carbide-network", "carbide-network-segment-controller", "carbide-nvlink-manager", + "carbide-power-shelf-controller", "carbide-preingestion-manager", "carbide-prost-builder", + "carbide-rack", + "carbide-rack-controller", "carbide-redfish", "carbide-rpc", "carbide-rpc-utils", @@ -1199,6 +1203,7 @@ dependencies = [ "carbide-spdm-controller", "carbide-sqlx-testing", "carbide-ssh", + "carbide-state-controller-common", "carbide-switch-controller", "carbide-tls", "carbide-utils", @@ -1331,7 +1336,7 @@ dependencies = [ "eyre", "futures", "futures-util", - "hickory-proto 0.26.1", + "hickory-proto", "ipnetwork", "itertools 0.14.0", "lazy_static", @@ -1884,6 +1889,7 @@ dependencies = [ "humantime", "humantime-serde", "hyper", + "hyper-rustls", "hyper-util", "logfmt", "mac_address", @@ -1893,12 +1899,15 @@ dependencies = [ "prost-types", "rand 0.10.1", "reqwest 0.13.3", + "rustls", + "rustls-pki-types", "serde", "serde_json", "serde_with", "tempfile", "thiserror 2.0.18", "tokio", + "tokio-stream", "tokio-util", "tonic", "tonic-prost", @@ -2259,6 +2268,18 @@ dependencies = [ "tracing-subscriber", ] +[[package]] +name = "carbide-mqtt-common" +version = "0.1.0" +dependencies = [ + "async-trait", + "mqttea", + "opentelemetry", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "carbide-network" version = "0.0.0" @@ -2321,6 +2342,28 @@ dependencies = [ "uuid", ] +[[package]] +name = "carbide-power-shelf-controller" +version = "0.0.0" +dependencies = [ + "async-trait", + "carbide-api-db", + "carbide-api-model", + "carbide-health-metrics", + "carbide-rack", + "carbide-secrets", + "carbide-utils", + "carbide-uuid", + "config-version", + "eyre", + "librms", + "mac_address", + "opentelemetry", + "sqlx", + "state-controller", + 
"tracing", +] + [[package]] name = "carbide-preingestion-manager" version = "0.0.1" @@ -2389,6 +2432,60 @@ dependencies = [ "uuid", ] +[[package]] +name = "carbide-rack" +version = "0.0.0" +dependencies = [ + "async-trait", + "bms-dsx-exchange", + "carbide-api-db", + "carbide-api-model", + "carbide-health-report", + "carbide-mqtt-common", + "carbide-secrets", + "carbide-uuid", + "chrono", + "eyre", + "librms", + "mac_address", + "mqttea", + "opentelemetry", + "serde_json", + "sqlx", + "state-controller", + "tokio", + "tokio-util", + "tonic", + "tracing", +] + +[[package]] +name = "carbide-rack-controller" +version = "0.0.0" +dependencies = [ + "async-trait", + "carbide-api-db", + "carbide-api-model", + "carbide-health-metrics", + "carbide-rack", + "carbide-secrets", + "carbide-utils", + "carbide-uuid", + "chrono", + "config-version", + "duration-str", + "eyre", + "librms", + "mac_address", + "opentelemetry", + "serde", + "serde_json", + "sqlx", + "state-controller", + "tonic", + "tracing", +] + [[package]] name = "carbide-redfish" version = "0.0.1" @@ -2744,6 +2841,16 @@ dependencies = [ "uuid", ] +[[package]] +name = "carbide-state-controller-common" +version = "0.0.0" +dependencies = [ + "carbide-utils", + "duration-str", + "serde", + "state-controller", +] + [[package]] name = "carbide-switch-controller" version = "0.0.0" @@ -3945,14 +4052,14 @@ checksum = "abd57806937c9cc163efc8ea3910e00a62e2aeb0b8119f1793a978088f8f6b04" [[package]] name = "dhcproto" -version = "0.14.0" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "425ab19f6a915beac79cac8ec2810c1311b502ae14d7f294682081cf5ae4c5bb" +checksum = "c278d2f17dbcb7332f3b31788be67f76017096c5eedc293e1259f2d48b0f891f" dependencies = [ "dhcproto-macros", - "hickory-proto 0.25.2", + "hickory-proto", "ipnet", - "rand 0.9.4", + "rand 0.10.1", "thiserror 2.0.18", ] @@ -5127,7 +5234,7 @@ dependencies = [ "futures-channel", "futures-io", "futures-util", - "hickory-proto 
0.26.1", + "hickory-proto", "idna 1.1.0", "ipnet", "jni", @@ -5139,28 +5246,6 @@ dependencies = [ "url", ] -[[package]] -name = "hickory-proto" -version = "0.25.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8a6fe56c0038198998a6f217ca4e7ef3a5e51f46163bd6dd60b5c71ca6c6502" -dependencies = [ - "async-trait", - "cfg-if", - "data-encoding", - "enum-as-inner", - "futures-channel", - "futures-util", - "idna 1.1.0", - "ipnet", - "once_cell", - "rand 0.9.4", - "thiserror 2.0.18", - "tinyvec", - "tracing", - "url", -] - [[package]] name = "hickory-proto" version = "0.26.1" @@ -5190,7 +5275,7 @@ dependencies = [ "cfg-if", "futures-util", "hickory-net", - "hickory-proto 0.26.1", + "hickory-proto", "ipconfig", "ipnet", "jni", @@ -9111,12 +9196,8 @@ dependencies = [ "flume", "futures-util", "log", - "rustls-native-certs", - "rustls-pemfile", - "rustls-webpki 0.102.8", "thiserror 2.0.18", "tokio", - "tokio-rustls", "tokio-stream", "tokio-util", ] @@ -9353,7 +9434,7 @@ dependencies = [ "once_cell", "ring", "rustls-pki-types", - "rustls-webpki 0.103.13", + "rustls-webpki", "subtle", "zeroize", ] @@ -9403,7 +9484,7 @@ dependencies = [ "rustls", "rustls-native-certs", "rustls-platform-verifier-android", - "rustls-webpki 0.103.13", + "rustls-webpki", "security-framework", "security-framework-sys", "webpki-root-certs", @@ -9416,17 +9497,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f" -[[package]] -name = "rustls-webpki" -version = "0.102.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" -dependencies = [ - "ring", - "rustls-pki-types", - "untrusted 0.9.0", -] - [[package]] name = "rustls-webpki" version = "0.103.13" diff --git a/crates/health/Cargo.toml b/crates/health/Cargo.toml index f645e06221..6ce872a46a 100644 --- 
a/crates/health/Cargo.toml +++ b/crates/health/Cargo.toml @@ -37,13 +37,17 @@ http = { workspace = true } humantime = { workspace = true } humantime-serde = { workspace = true } hyper = { workspace = true } +hyper-rustls = { workspace = true, features = ["http2"] } hyper-util = { workspace = true } mac_address = { workspace = true } prometheus = { workspace = true } reqwest = { workspace = true, features = ["query", "json"] } +rustls = { workspace = true } +rustls-pki-types = { workspace = true } serde = { features = ["derive"], workspace = true } serde_json = { workspace = true } serde_with = { workspace = true } +tokio-stream = { workspace = true } tokio-util = { workspace = true } tracing = { workspace = true } tracing-subscriber = { features = [ diff --git a/crates/health/build.rs b/crates/health/build.rs index dbc1d6f838..0576fca63a 100644 --- a/crates/health/build.rs +++ b/crates/health/build.rs @@ -20,17 +20,42 @@ use std::path::PathBuf; fn main() -> Result<(), Box<dyn std::error::Error>> { carbide_version::build(); - // vendored from opentelemetry-proto v1.5.0 let proto_dir = PathBuf::from("proto"); println!("cargo:rerun-if-changed=proto/"); + // vendored from opentelemetry-proto v1.5.0 + tonic_prost_build::configure() + .build_server(false) + .build_client(true) + .compile_protos( + &[ + proto_dir.join("opentelemetry/proto/collector/logs/v1/logs_service.proto"), + proto_dir.join("opentelemetry/proto/collector/metrics/v1/metrics_service.proto"), + ], + std::slice::from_ref(&proto_dir), + )?; + + // vendored from openconfig/gnmi v0.11.0 + // gnmi_ext compiled separately so gnmi.proto can extern_path it and reuse the types tonic_prost_build::configure() + .build_client(true) .build_server(false) + .compile_protos( + &[proto_dir.join("github.com/openconfig/gnmi/proto/gnmi_ext/gnmi_ext.proto")], + std::slice::from_ref(&proto_dir), + )?; + + tonic_prost_build::configure() .build_client(true) + .build_server(false) + .extern_path( + ".gnmi_ext", +
"crate::collectors::nvue::gnmi::proto::gnmi_ext", + ) .compile_protos( - &[proto_dir.join("opentelemetry/proto/collector/logs/v1/logs_service.proto")], - &[proto_dir], + &[proto_dir.join("github.com/openconfig/gnmi/proto/gnmi/gnmi.proto")], + std::slice::from_ref(&proto_dir), )?; Ok(()) diff --git a/crates/health/example/config.example.toml b/crates/health/example/config.example.toml index 19e215fcc3..f0e6f20233 100644 --- a/crates/health/example/config.example.toml +++ b/crates/health/example/config.example.toml @@ -168,6 +168,22 @@ cluster_apps_enabled = true sdn_partitions_enabled = true interfaces_enabled = true +# NVUE gNMI streaming collector (switches only, disabled by default). +# Subscribes to gNMI SAMPLE paths and pushes metrics through the DataSink +# pipeline. PrometheusSink serves the /metrics endpoint; OtlpSink (when +# configured separately) pushes to an OTel Collector. +[collectors.nvue.gnmi] +gnmi_port = 9339 +sample_interval = "5m" +request_timeout = "30s" +# gNMI ON_CHANGE subscription for system events +system_events_enabled = true + +[collectors.nvue.gnmi.paths] +components_enabled = true +interfaces_enabled = true +leak_sensors_enabled = true + # ============================================================================== # Processors # ============================================================================== diff --git a/crates/health/proto/github.com/openconfig/gnmi/proto/gnmi/gnmi.proto b/crates/health/proto/github.com/openconfig/gnmi/proto/gnmi/gnmi.proto new file mode 100644 index 0000000000..5738aedd2b --- /dev/null +++ b/crates/health/proto/github.com/openconfig/gnmi/proto/gnmi/gnmi.proto @@ -0,0 +1,467 @@ +// +// Copyright 2016 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +syntax = "proto3"; + +import "google/protobuf/any.proto"; +import "google/protobuf/descriptor.proto"; +import "github.com/openconfig/gnmi/proto/gnmi_ext/gnmi_ext.proto"; + +// Package gNMI defines a service specification for the gRPC Network Management +// Interface. This interface is defined to be a standard interface via which +// a network management system ("client") can subscribe to state values, +// retrieve snapshots of state information, and manipulate the state of a data +// tree supported by a device ("target"). +// +// This document references the gNMI Specification which can be found at +// http://github.com/openconfig/reference/blob/master/rpc/gnmi +package gnmi; + +// Define a protobuf FileOption that defines the gNMI service version. +extend google.protobuf.FileOptions { + // The gNMI service semantic version. + string gnmi_service = 1001; +} + +// gNMI_service is the current version of the gNMI service, returned through +// the Capabilities RPC. +option (gnmi_service) = "0.10.0"; + +option go_package = "github.com/openconfig/gnmi/proto/gnmi"; +option java_multiple_files = true; +option java_outer_classname = "GnmiProto"; +option java_package = "com.github.gnmi.proto"; + + +service gNMI { + // Capabilities allows the client to retrieve the set of capabilities that + // is supported by the target. This allows the target to validate the + // service version that is implemented and retrieve the set of models that + // the target supports. 
The models can then be specified in subsequent RPCs + // to restrict the set of data that is utilized. + // Reference: gNMI Specification Section 3.2 + rpc Capabilities(CapabilityRequest) returns (CapabilityResponse); + // Retrieve a snapshot of data from the target. A Get RPC requests that the + // target snapshots a subset of the data tree as specified by the paths + // included in the message and serializes this to be returned to the + // client using the specified encoding. + // Reference: gNMI Specification Section 3.3 + rpc Get(GetRequest) returns (GetResponse); + // Set allows the client to modify the state of data on the target. The + // paths to modified along with the new values that the client wishes + // to set the value to. + // Reference: gNMI Specification Section 3.4 + rpc Set(SetRequest) returns (SetResponse); + // Subscribe allows a client to request the target to send it values + // of particular paths within the data tree. These values may be streamed + // at a particular cadence (STREAM), sent one off on a long-lived channel + // (POLL), or sent as a one-off retrieval (ONCE). + // Reference: gNMI Specification Section 3.5 + rpc Subscribe(stream SubscribeRequest) returns (stream SubscribeResponse); +} + +// Notification is a re-usable message that is used to encode data from the +// target to the client. A Notification carries two types of changes to the data +// tree: +// - Deleted values (delete) - a set of paths that have been removed from the +// data tree. +// - Updated values (update) - a set of path-value pairs indicating the path +// whose value has changed in the data tree. +// Reference: gNMI Specification Section 2.1 +message Notification { + int64 timestamp = 1; // Timestamp in nanoseconds since Epoch. + Path prefix = 2; // Prefix used for paths in the message. + repeated Update update = 4; // Data elements that have changed values. + repeated Path delete = 5; // Data elements that have been deleted. 
+ // This notification contains a set of paths that are always updated together + // referenced by a globally unique prefix. + bool atomic = 6; + // Reserved field numbers and identifiers. + reserved "alias"; + reserved 3; +} + +// Update is a re-usable message that is used to store a particular Path, +// Value pair. +// Reference: gNMI Specification Section 2.1 +message Update { + Path path = 1; // The path (key) for the update. + Value value = 2 [deprecated = true]; // The value (value) for the update. + TypedValue val = 3; // The explicitly typed update value. + uint32 duplicates = 4; // Number of coalesced duplicates. +} + +// TypedValue is used to encode a value being sent between the client and +// target (originated by either entity). +message TypedValue { + // One of the fields within the val oneof is populated with the value + // of the update. The type of the value being included in the Update + // determines which field should be populated. In the case that the + // encoding is a particular form of the base protobuf type, a specific + // field is used to store the value (e.g., json_val). + oneof value { + string string_val = 1; // String value. + int64 int_val = 2; // Integer value. + uint64 uint_val = 3; // Unsigned integer value. + bool bool_val = 4; // Bool value. + bytes bytes_val = 5; // Arbitrary byte sequence value. + float float_val = 6 [deprecated = true]; // Deprecated - use double_val. + double double_val = 14; // Floating point value. + Decimal64 decimal_val = 7 + [deprecated = true]; // Deprecated - use double_val. + ScalarArray leaflist_val = 8; // Mixed type scalar array value. + google.protobuf.Any any_val = 9; // protobuf.Any encoded bytes. + bytes json_val = 10; // JSON-encoded text. + bytes json_ietf_val = 11; // JSON-encoded text per RFC7951. + string ascii_val = 12; // Arbitrary ASCII text. + // Protobuf binary encoded bytes. The message type is not included. 
+ // See the specification at + // github.com/openconfig/reference/blob/master/rpc/gnmi/protobuf-vals.md + // for a complete specification. [Experimental] + bytes proto_bytes = 13; + } +} + +// Path encodes a data tree path as a series of repeated strings, with +// each element of the path representing a data tree node name and the +// associated attributes. +// Reference: gNMI Specification Section 2.2.2. +message Path { + // Elements of the path are no longer encoded as a string, but rather within + // the elem field as a PathElem message. + repeated string element = 1 [deprecated = true]; + string origin = 2; // Label to disambiguate path. + repeated PathElem elem = 3; // Elements of the path. + string target = 4; // The name of the target + // (Sec. 2.2.2.1) +} + +// PathElem encodes an element of a gNMI path, along with any attributes (keys) +// that may be associated with it. +// Reference: gNMI Specification Section 2.2.2. +message PathElem { + string name = 1; // The name of the element in the path. + map<string, string> key = 2; // Map of key (attribute) name to value. +} + +// Value encodes a data tree node's value - along with the way in which +// the value is encoded. This message is deprecated by gNMI 0.3.0. +// Reference: gNMI Specification Section 2.2.3. +message Value { + option deprecated = true; + + bytes value = 1; // Value of the variable being transmitted. + Encoding type = 2; // Encoding used for the value field. +} + +// Encoding defines the value encoding formats that are supported by the gNMI +// protocol. These encodings are used by both the client (when sending Set +// messages to modify the state of the target) and the target when serializing +// data to be returned to the client (in both Subscribe and Get RPCs). +// Reference: gNMI Specification Section 2.3 +enum Encoding { + JSON = 0; // JSON encoded text. + BYTES = 1; // Arbitrarily encoded bytes. + PROTO = 2; // Encoded according to scalar values of TypedValue.
+ ASCII = 3; // ASCII text of an out-of-band agreed format. + JSON_IETF = 4; // JSON encoded text as per RFC7951. +} + +// Error message previously utilised to return errors to the client. Deprecated +// in favour of using the google.golang.org/genproto/googleapis/rpc/status +// message in the RPC response. +// Reference: gNMI Specification Section 2.5 +message Error { + option deprecated = true; + + uint32 code = 1; // Canonical gRPC error code. + string message = 2; // Human readable error. + google.protobuf.Any data = 3; // Optional additional information. +} + +// Decimal64 is used to encode a fixed precision decimal number. The value +// is expressed as a set of digits with the precision specifying the +// number of digits following the decimal point in the digit set. +// This message is deprecated in favor of encoding all floating point types +// as double precision. +message Decimal64 { + option deprecated = true; + + int64 digits = 1; // Set of digits. + uint32 precision = 2; // Number of digits following the decimal point. +} + +// ScalarArray is used to encode a mixed-type array of values. +message ScalarArray { + // The set of elements within the array. Each TypedValue message should + // specify only elements that have a field identifier of 1-7 (i.e., the + // values are scalar values). + repeated TypedValue element = 1; +} + +// SubscribeRequest is the message sent by the client to the target when +// initiating a subscription to a set of paths within the data tree. The +// request field must be populated and the initial message must specify a +// SubscriptionList to initiate a subscription. +// Reference: gNMI Specification Section 3.5.1.1 +message SubscribeRequest { + oneof request { + SubscriptionList subscribe = 1; // Specify the paths within a subscription. + Poll poll = 3; // Trigger a polled update. + } + // Extension messages associated with the SubscribeRequest. See the + // gNMI extension specification for further definition. 
+ repeated gnmi_ext.Extension extension = 5; + // Reserved field numbers and identifiers. + reserved 4; + reserved "aliases"; +} + +// Poll is sent within a SubscribeRequest to trigger the device to +// send telemetry updates for the paths that are associated with the +// subscription. +// Reference: gNMI Specification Section Section 3.5.1.4 +message Poll {} + +// SubscribeResponse is the message used by the target within a Subscribe RPC. +// The target includes a Notification message which is used to transmit values +// of the path(s) that are associated with the subscription. The same message +// is to indicate that the target has sent all data values once (is +// synchronized). +// Reference: gNMI Specification Section 3.5.1.4 +message SubscribeResponse { + oneof response { + Notification update = 1; // Changed or sampled value for a path. + // Indicate target has sent all values associated with the subscription + // at least once. + bool sync_response = 3; + // Deprecated in favour of google.golang.org/genproto/googleapis/rpc/status + Error error = 4 [deprecated = true]; + } + // Extension messages associated with the SubscribeResponse. See the + // gNMI extension specification for further definition. + repeated gnmi_ext.Extension extension = 5; +} + +// SubscriptionList is used within a Subscribe message to specify the list of +// paths that the client wishes to subscribe to. The message consists of a +// list of (possibly prefixed) paths, and options that relate to the +// subscription. +// Reference: gNMI Specification Section 3.5.1.2 +message SubscriptionList { + Path prefix = 1; // Prefix used for paths. + repeated Subscription subscription = 2; // Set of subscriptions to create. + QOSMarking qos = 4; // DSCP marking to be used. + // Mode of the subscription. + enum Mode { + STREAM = 0; // Values streamed by the target (Sec. 3.5.1.5.2). + ONCE = 1; // Values sent once-off by the target (Sec. 3.5.1.5.1). 
+ POLL = 2; // Values sent in response to a poll request (Sec. 3.5.1.5.3). + } + Mode mode = 5; + // Whether elements of the schema that are marked as eligible for aggregation + // should be aggregated or not. + bool allow_aggregation = 6; + // The set of schemas that define the elements of the data tree that should + // be sent by the target. + repeated ModelData use_models = 7; + // The encoding that the target should use within the Notifications generated + // corresponding to the SubscriptionList. + Encoding encoding = 8; + // An optional field to specify that only updates to current state should be + // sent to a client. If set, the initial state is not sent to the client but + // rather only the sync message followed by any subsequent updates to the + // current state. For ONCE and POLL modes, this causes the server to send only + // the sync message (Sec. 3.5.2.3). + bool updates_only = 9; + // Reserved field numbers and identifiers. + reserved 3; + reserved "use_aliases"; +} + +// Subscription is a single request within a SubscriptionList. The path +// specified is interpreted (along with the prefix) as the elements of the data +// tree that the client is subscribing to. The mode determines how the target +// should trigger updates to be sent. +// Reference: gNMI Specification Section 3.5.1.3 +message Subscription { + Path path = 1; // The data tree path. + SubscriptionMode mode = 2; // Subscription mode to be used. + uint64 sample_interval = 3; // ns between samples in SAMPLE mode. + // Indicates whether values that have not changed should be sent in a SAMPLE + // subscription. + bool suppress_redundant = 4; + // 1. A heartbeat interval MAY be specified along with an “on change” + // subscription - in this case, the value of the data item(s) MUST be re-sent + // once per heartbeat interval regardless of whether the value has changed or + // not. + // 2. 
A heartbeat_interval MAY be specified to modify the behavior of + // suppress_redundant in a sampled subscription. In this case, the + // target MUST generate one telemetry update per heartbeat interval, + // regardless of whether the suppress_redundant flag is set to true. + // This value is specified as an unsigned 64-bit integer in nanoseconds + uint64 heartbeat_interval = 5; +} + +// SubscriptionMode is the mode of the subscription, specifying how the +// target must return values in a subscription. +// Reference: gNMI Specification Section 3.5.1.3 +enum SubscriptionMode { + TARGET_DEFINED = 0; // The target selects the relevant mode for each element. + ON_CHANGE = 1; // The target sends an update on element value change. + SAMPLE = 2; // The target samples values according to the interval. +} + +// QOSMarking specifies the DSCP value to be set on transmitted telemetry +// updates from the target. +// Reference: gNMI Specification Section 3.5.1.2 +message QOSMarking { + uint32 marking = 1; +} + +// SetRequest is sent from a client to the target to update values in the data +// tree. Paths are either deleted by the client, or modified by means of being +// updated, or replaced. Where a replace is used, unspecified values are +// considered to be replaced, whereas when update is used the changes are +// considered to be incremental. The set of changes that are specified within +// a single SetRequest are considered to be a transaction. +// Reference: gNMI Specification Section 3.4.1 +message SetRequest { + Path prefix = 1; // Prefix used for paths in the message. + repeated Path delete = 2; // Paths to be deleted from the data tree. + repeated Update replace = 3; // Updates specifying elements to be replaced. + repeated Update update = 4; // Updates specifying elements to updated. + // Updates specifying elements to union and then replace the data tree. 
+ // See the gNMI specification at + // https://github.com/openconfig/reference/blob/master/rpc/gnmi/gnmi-specification.md + // for details. + repeated Update union_replace = 6; + // Extension messages associated with the SetRequest. See the + // gNMI extension specification for further definition. + repeated gnmi_ext.Extension extension = 5; +} + +// SetResponse is the response to a SetRequest, sent from the target to the +// client. It reports the result of the modifications to the data tree that were +// specified by the client. Errors for this RPC should be reported using the +// https://github.com/googleapis/googleapis/blob/master/google/rpc/status.proto +// message in the RPC return. The gnmi.Error message can be used to add +// additional details where required. Reference: gNMI Specification +// Section 3.4.2 +message SetResponse { + Path prefix = 1; // Prefix used for paths. + // A set of responses specifying the result of the operations specified in + // the SetRequest. + repeated UpdateResult response = 2; + Error message = 3 + [deprecated = true]; // The overall status of the transaction. + int64 timestamp = 4; // Timestamp of transaction (ns since epoch). + // Extension messages associated with the SetResponse. See the + // gNMI extension specification for further definition. + repeated gnmi_ext.Extension extension = 5; +} + +// UpdateResult is used within the SetResponse message to communicate the +// result of an operation specified within a SetRequest message. +// Reference: gNMI Specification Section 3.4.2 +message UpdateResult { + // The operation that was associated with the Path specified. + enum Operation { + INVALID = 0; + DELETE = 1; // The result relates to a delete of Path. + REPLACE = 2; // The result relates to a replace of Path. + UPDATE = 3; // The result relates to an update of Path. + UNION_REPLACE = 4; // The result of a union_replace of Path or CLI origin. 
+ } + // Deprecated timestamp for the UpdateResult, this field has been + // replaced by the timestamp within the SetResponse message, since + // all mutations effected by a set should be applied as a single + // transaction. + int64 timestamp = 1 [deprecated = true]; + Path path = 2; // Path associated with the update. + Error message = 3 [deprecated = true]; // Status of the update operation. + Operation op = 4; // Update operation type. +} + +// GetRequest is sent when a client initiates a Get RPC. It is used to specify +// the set of data elements for which the target should return a snapshot of +// data. The use_models field specifies the set of schema modules that are to +// be used by the target - where use_models is not specified then the target +// must use all schema models that it has. +// Reference: gNMI Specification Section 3.3.1 +message GetRequest { + Path prefix = 1; // Prefix used for paths. + repeated Path path = 2; // Paths requested by the client. + // Type of elements within the data tree. + enum DataType { + ALL = 0; // All data elements. + CONFIG = 1; // Config (rw) only elements. + STATE = 2; // State (ro) only elements. + // Data elements marked in the schema as operational. This refers to data + // elements whose value relates to the state of processes or interactions + // running on the device. + OPERATIONAL = 3; + } + DataType type = 3; // The type of data being requested. + Encoding encoding = 5; // Encoding to be used. + repeated ModelData use_models = 6; // The schema models to be used. + // Extension messages associated with the GetRequest. See the + // gNMI extension specification for further definition. + repeated gnmi_ext.Extension extension = 7; +} + +// GetResponse is used by the target to respond to a GetRequest from a client. +// The set of Notifications corresponds to the data values that are requested +// by the client in the GetRequest. 
+// Reference: gNMI Specification Section 3.3.2 +message GetResponse { + repeated Notification notification = 1; // Data values. + Error error = 2 [deprecated = true]; // Errors that occurred in the Get. + // Extension messages associated with the GetResponse. See the + // gNMI extension specification for further definition. + repeated gnmi_ext.Extension extension = 3; +} + +// CapabilityRequest is sent by the client in the Capabilities RPC to request +// that the target reports its capabilities. +// Reference: gNMI Specification Section 3.2.1 +message CapabilityRequest { + // Extension messages associated with the CapabilityRequest. See the + // gNMI extension specification for further definition. + repeated gnmi_ext.Extension extension = 1; +} + +// CapabilityResponse is used by the target to report its capabilities to the +// client within the Capabilities RPC. +// Reference: gNMI Specification Section 3.2.2 +message CapabilityResponse { + repeated ModelData supported_models = 1; // Supported schema models. + repeated Encoding supported_encodings = 2; // Supported encodings. + string gNMI_version = 3; // Supported gNMI version. + // Extension messages associated with the CapabilityResponse. See the + // gNMI extension specification for further definition. + repeated gnmi_ext.Extension extension = 4; +} + +// ModelData is used to describe a set of schema modules. It can be used in a +// CapabilityResponse where a target reports the set of modules that it +// supports, and within the SubscribeRequest and GetRequest messages to specify +// the set of models from which data tree elements should be reported. +// Reference: gNMI Specification Section 3.2.3 +message ModelData { + string name = 1; // Name of the model. + string organization = 2; // Organization publishing the model. + string version = 3; // Semantic version of the model. 
+} diff --git a/crates/health/proto/github.com/openconfig/gnmi/proto/gnmi_ext/gnmi_ext.proto b/crates/health/proto/github.com/openconfig/gnmi/proto/gnmi_ext/gnmi_ext.proto new file mode 100644 index 0000000000..ada5e39a5d --- /dev/null +++ b/crates/health/proto/github.com/openconfig/gnmi/proto/gnmi_ext/gnmi_ext.proto @@ -0,0 +1,161 @@ +// +// Copyright 2018 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +syntax = "proto3"; + +import "google/protobuf/duration.proto"; + +// Package gnmi_ext defines a set of extensions messages which can be optionally +// included with the request and response messages of gNMI RPCs. A set of +// well-known extensions are defined within this file, along with a registry for +// extensions defined outside of this package. +package gnmi_ext; + +option go_package = "github.com/openconfig/gnmi/proto/gnmi_ext"; + +// The Extension message contains a single gNMI extension. +message Extension { + oneof ext { + RegisteredExtension registered_ext = 1; // A registered extension. + // Well known extensions. + MasterArbitration master_arbitration = 2; // Master arbitration extension. + History history = 3; // History extension. + Commit commit = 4; // Commit confirmed extension. + Depth depth = 5; // Depth extension. + } +} + +// The RegisteredExtension message defines an extension which is defined outside +// of this file. 
+message RegisteredExtension { + ExtensionID id = 1; // The unique ID assigned to this extension. + bytes msg = 2; // The binary-marshalled protobuf extension payload. +} + +// ExtensionID is an enumeration acting as a registry for extensions +// defined by external sources. +enum ExtensionID { + EID_UNSET = 0; + // New extensions are to be defined within this enumeration - their definition + // MUST link to a reference describing their implementation. + + // An experimental extension that may be used during prototyping of a new + // extension. + EID_EXPERIMENTAL = 999; +} + +// MasterArbitration is used to select the master among multiple gNMI clients +// with the same Roles. The client with the largest election_id is honored as +// the master. +// The document about gNMI master arbitration can be found at +// https://github.com/openconfig/reference/blob/master/rpc/gnmi/gnmi-master-arbitration.md +message MasterArbitration { + Role role = 1; + Uint128 election_id = 2; +} + +// Representation of unsigned 128-bit integer. +message Uint128 { + uint64 high = 1; + uint64 low = 2; +} + +// There can be one master for each role. The role is identified by its id. +message Role { + string id = 1; + // More fields can be added if needed, for example, to specify what paths the + // role can read/write. +} + +// The History extension allows clients to request historical data. Its +// spec can be found at +// https://github.com/openconfig/reference/blob/master/rpc/gnmi/gnmi-history.md +message History { + oneof request { + int64 snapshot_time = 1; // Nanoseconds since the epoch + TimeRange range = 2; + } +} + +message TimeRange { + int64 start = 1; // Nanoseconds since the epoch + int64 end = 2; // Nanoseconds since the epoch +} + +// Commit confirmed extension allows automated revert of the configuration after +// certain duration if an explicit confirmation is not issued. It allows +// explicit cancellation of the commit during the rollback window.
There cannot +// be more than one commit active at a given time. The document about gNMI +// commit confirmed can be found at +// https://github.com/openconfig/reference/blob/master/rpc/gnmi/gnmi-commit-confirmed.md +message Commit { + // ID is provided by the client during the commit request. During confirm and + // cancel actions the provided ID should match the ID provided during commit. + // If ID is not passed in any actions server shall return error. + // Required. + string id = 1; + oneof action { + // commit action creates a new commit. If a commit is on-going, server + // returns error. + CommitRequest commit = 2; + // confirm action will confirm an on-going commit, the ID provided during + // confirm should match the on-going commit ID. + CommitConfirm confirm = 3; + // cancel action will cancel an on-going commit, the ID provided during + // cancel should match the on-going commit ID. + CommitCancel cancel = 4; + // set rollback duration action sets the rollback duration of an on-going commit + // to a new value. + // The ID provided with the Commit message should match the on-going commit ID. + CommitSetRollbackDuration set_rollback_duration = 5; + } +} + +// CommitRequest is used to create a new confirmed commit. It holds additional +// parameters required for commit action. +message CommitRequest { + // Maximum duration to wait for a confirmation before reverting the commit. + google.protobuf.Duration rollback_duration = 1; +} + +// CommitConfirm is used to confirm an on-going commit. It holds additional +// parameters required for confirm action. +message CommitConfirm {} + +// CommitCancel is used to cancel an on-going commit. It holds additional +// parameters required for cancel action. +message CommitCancel {} + +// CommitSetRollbackDuration is used to set the existing rollback duration value +// of an on-going commit to a new desired value. +message CommitSetRollbackDuration { + // Maximum duration to wait for a confirmation before reverting the commit.
+ google.protobuf.Duration rollback_duration = 1; +} + +// Depth allows clients to specify the depth of the subtree to be returned in +// the response. The depth is specified as the number of levels below the +// specified path. +// The depth is applied to all paths in the Get or Subscribe request. +// The document about gNMI depth can be found at +// https://github.com/openconfig/reference/tree/master/rpc/gnmi/gnmi-depth.md +message Depth { + // The level of the subtree to be returned in the response. + // Value of 0 means no depth limit and behaves the same as if the extension + // was not specified. + // Value of 1 means only the specified path and its direct children will be + // returned. + uint32 level = 1; +} diff --git a/crates/health/proto/opentelemetry/proto/collector/metrics/v1/metrics_service.proto b/crates/health/proto/opentelemetry/proto/collector/metrics/v1/metrics_service.proto new file mode 100644 index 0000000000..dd48f1ad3a --- /dev/null +++ b/crates/health/proto/opentelemetry/proto/collector/metrics/v1/metrics_service.proto @@ -0,0 +1,79 @@ +// Copyright 2019, OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +syntax = "proto3"; + +package opentelemetry.proto.collector.metrics.v1; + +import "opentelemetry/proto/metrics/v1/metrics.proto"; + +option csharp_namespace = "OpenTelemetry.Proto.Collector.Metrics.V1"; +option java_multiple_files = true; +option java_package = "io.opentelemetry.proto.collector.metrics.v1"; +option java_outer_classname = "MetricsServiceProto"; +option go_package = "go.opentelemetry.io/proto/otlp/collector/metrics/v1"; + +// Service that can be used to push metrics between one Application +// instrumented with OpenTelemetry and a collector, or between a collector and a +// central collector. +service MetricsService { + // For performance reasons, it is recommended to keep this RPC + // alive for the entire life of the application. + rpc Export(ExportMetricsServiceRequest) returns (ExportMetricsServiceResponse) {} +} + +message ExportMetricsServiceRequest { + // An array of ResourceMetrics. + // For data coming from a single resource this array will typically contain one + // element. Intermediary nodes (such as OpenTelemetry Collector) that receive + // data from multiple origins typically batch the data before forwarding further and + // in that case this array will contain multiple elements. + repeated opentelemetry.proto.metrics.v1.ResourceMetrics resource_metrics = 1; +} + +message ExportMetricsServiceResponse { + // The details of a partially successful export request. + // + // If the request is only partially accepted + // (i.e. when the server accepts only parts of the data and rejects the rest) + // the server MUST initialize the `partial_success` field and MUST + // set the `rejected_data_points` with the number of items it rejected. + // + // Servers MAY also make use of the `partial_success` field to convey + // warnings/suggestions to senders even when the request was fully accepted. + // In such cases, the `rejected_data_points` MUST have a value of `0` and + // the `error_message` MUST be non-empty.
+ // + // A `partial_success` message with an empty value (rejected_data_points = 0 and + // `error_message` = "") is equivalent to it not being set/present. Senders + // SHOULD interpret it the same way as in the full success case. + ExportMetricsPartialSuccess partial_success = 1; +} + +message ExportMetricsPartialSuccess { + // The number of rejected data points. + // + // A `rejected_data_points` field holding a `0` value indicates that the + // request was fully accepted. + int64 rejected_data_points = 1; + + // A developer-facing human-readable message in English. It should be used + // either to explain why the server rejected parts of the data during a partial + // success or to convey warnings/suggestions during a full success. The message + // should offer guidance on how users can address such issues. + // + // error_message is an optional field. An error_message with an empty value + // is equivalent to it not being set. + string error_message = 2; +} diff --git a/crates/health/proto/opentelemetry/proto/metrics/v1/metrics.proto b/crates/health/proto/opentelemetry/proto/metrics/v1/metrics.proto new file mode 100644 index 0000000000..e8587fb54e --- /dev/null +++ b/crates/health/proto/opentelemetry/proto/metrics/v1/metrics.proto @@ -0,0 +1,714 @@ +// Copyright 2019, OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +syntax = "proto3"; + +package opentelemetry.proto.metrics.v1; + +import "opentelemetry/proto/common/v1/common.proto"; +import "opentelemetry/proto/resource/v1/resource.proto"; + +option csharp_namespace = "OpenTelemetry.Proto.Metrics.V1"; +option java_multiple_files = true; +option java_package = "io.opentelemetry.proto.metrics.v1"; +option java_outer_classname = "MetricsProto"; +option go_package = "go.opentelemetry.io/proto/otlp/metrics/v1"; + +// MetricsData represents the metrics data that can be stored in a persistent +// storage, OR can be embedded by other protocols that transfer OTLP metrics +// data but do not implement the OTLP protocol. +// +// MetricsData +// └─── ResourceMetrics +// ├── Resource +// ├── SchemaURL +// └── ScopeMetrics +// ├── Scope +// ├── SchemaURL +// └── Metric +// ├── Name +// ├── Description +// ├── Unit +// └── data +// ├── Gauge +// ├── Sum +// ├── Histogram +// ├── ExponentialHistogram +// └── Summary +// +// The main difference between this message and collector protocol is that +// in this message there will not be any "control" or "metadata" specific to +// OTLP protocol. +// +// When new fields are added into this message, the OTLP request MUST be updated +// as well. +message MetricsData { + // An array of ResourceMetrics. + // For data coming from a single resource this array will typically contain + // one element. Intermediary nodes that receive data from multiple origins + // typically batch the data before forwarding further and in that case this + // array will contain multiple elements. + repeated ResourceMetrics resource_metrics = 1; +} + +// A collection of ScopeMetrics from a Resource. +message ResourceMetrics { + reserved 1000; + + // The resource for the metrics in this message. + // If this field is not set then no resource info is known. + opentelemetry.proto.resource.v1.Resource resource = 1; + + // A list of metrics that originate from a resource. 
+ repeated ScopeMetrics scope_metrics = 2; + + // The Schema URL, if known. This is the identifier of the Schema that the resource data + // is recorded in. Notably, the last part of the URL path is the version number of the + // schema: http[s]://server[:port]/path/<version>. To learn more about Schema URL see + // https://opentelemetry.io/docs/specs/otel/schemas/#schema-url + // This schema_url applies to the data in the "resource" field. It does not apply + // to the data in the "scope_metrics" field which have their own schema_url field. + string schema_url = 3; +} + +// A collection of Metrics produced by a Scope. +message ScopeMetrics { + // The instrumentation scope information for the metrics in this message. + // Semantically when InstrumentationScope isn't set, it is equivalent with + // an empty instrumentation scope name (unknown). + opentelemetry.proto.common.v1.InstrumentationScope scope = 1; + + // A list of metrics that originate from an instrumentation library. + repeated Metric metrics = 2; + + // The Schema URL, if known. This is the identifier of the Schema that the metric data + // is recorded in. Notably, the last part of the URL path is the version number of the + // schema: http[s]://server[:port]/path/<version>. To learn more about Schema URL see + // https://opentelemetry.io/docs/specs/otel/schemas/#schema-url + // This schema_url applies to all metrics in the "metrics" field. + string schema_url = 3; +} + +// Defines a Metric which has one or more timeseries. The following is a +// brief summary of the Metric data model. For more details, see: +// +// https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/metrics/data-model.md +// +// The data model and relation between entities is shown in the +// diagram below. Here, "DataPoint" is the term used to refer to any +// one of the specific data point value types, and "points" is the term used +// to refer to any one of the lists of points contained in the Metric.
+// +// - Metric is composed of a metadata and data. +// - Metadata part contains a name, description, unit. +// - Data is one of the possible types (Sum, Gauge, Histogram, Summary). +// - DataPoint contains timestamps, attributes, and one of the possible value type +// fields. +// +// Metric +// +------------+ +// |name | +// |description | +// |unit | +------------------------------------+ +// |data |---> |Gauge, Sum, Histogram, Summary, ... | +// +------------+ +------------------------------------+ +// +// Data [One of Gauge, Sum, Histogram, Summary, ...] +// +-----------+ +// |... | // Metadata about the Data. +// |points |--+ +// +-----------+ | +// | +---------------------------+ +// | |DataPoint 1 | +// v |+------+------+ +------+ | +// +-----+ ||label |label |...|label | | +// | 1 |-->||value1|value2|...|valueN| | +// +-----+ |+------+------+ +------+ | +// | . | |+-----+ | +// | . | ||value| | +// | . | |+-----+ | +// | . | +---------------------------+ +// | . | . +// | . | . +// | . | . +// | . | +---------------------------+ +// | . | |DataPoint M | +// +-----+ |+------+------+ +------+ | +// | M |-->||label |label |...|label | | +// +-----+ ||value1|value2|...|valueN| | +// |+------+------+ +------+ | +// |+-----+ | +// ||value| | +// |+-----+ | +// +---------------------------+ +// +// Each distinct type of DataPoint represents the output of a specific +// aggregation function, the result of applying the DataPoint's +// associated function of to one or more measurements. +// +// All DataPoint types have three common fields: +// - Attributes includes key-value pairs associated with the data point +// - TimeUnixNano is required, set to the end time of the aggregation +// - StartTimeUnixNano is optional, but strongly encouraged for DataPoints +// having an AggregationTemporality field, as discussed below. +// +// Both TimeUnixNano and StartTimeUnixNano values are expressed as +// UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January 1970. 
+// +// # TimeUnixNano +// +// This field is required, having consistent interpretation across +// DataPoint types. TimeUnixNano is the moment corresponding to when +// the data point's aggregate value was captured. +// +// Data points with the 0 value for TimeUnixNano SHOULD be rejected +// by consumers. +// +// # StartTimeUnixNano +// +// StartTimeUnixNano in general allows detecting when a sequence of +// observations is unbroken. This field indicates to consumers the +// start time for points with cumulative and delta +// AggregationTemporality, and it should be included whenever possible +// to support correct rate calculation. Although it may be omitted +// when the start time is truly unknown, setting StartTimeUnixNano is +// strongly encouraged. +message Metric { + reserved 4, 6, 8; + + // name of the metric. + string name = 1; + + // description of the metric, which can be used in documentation. + string description = 2; + + // unit in which the metric value is reported. Follows the format + // described by http://unitsofmeasure.org/ucum.html. + string unit = 3; + + // Data determines the aggregation type (if any) of the metric, what is the + // reported value type for the data points, as well as the relationship to + // the time interval over which they are reported. + oneof data { + Gauge gauge = 5; + Sum sum = 7; + Histogram histogram = 9; + ExponentialHistogram exponential_histogram = 10; + Summary summary = 11; + } + + // Additional metadata attributes that describe the metric. [Optional]. + // Attributes are non-identifying. + // Consumers SHOULD NOT need to be aware of these attributes. + // These attributes MAY be used to encode information allowing + // for lossless roundtrip translation to / from another data model. + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key).
+ repeated opentelemetry.proto.common.v1.KeyValue metadata = 12; +} + +// Gauge represents the type of a scalar metric that always exports the +// "current value" for every data point. It should be used for an "unknown" +// aggregation. +// +// A Gauge does not support different aggregation temporalities. Given the +// aggregation is unknown, points cannot be combined using the same +// aggregation, regardless of aggregation temporalities. Therefore, +// AggregationTemporality is not included. Consequently, this also means +// "StartTimeUnixNano" is ignored for all data points. +message Gauge { + repeated NumberDataPoint data_points = 1; +} + +// Sum represents the type of a scalar metric that is calculated as a sum of all +// reported measurements over a time interval. +message Sum { + repeated NumberDataPoint data_points = 1; + + // aggregation_temporality describes if the aggregator reports delta changes + // since last report time, or cumulative changes since a fixed start time. + AggregationTemporality aggregation_temporality = 2; + + // If "true" means that the sum is monotonic. + bool is_monotonic = 3; +} + +// Histogram represents the type of a metric that is calculated by aggregating +// as a Histogram of all reported measurements over a time interval. +message Histogram { + repeated HistogramDataPoint data_points = 1; + + // aggregation_temporality describes if the aggregator reports delta changes + // since last report time, or cumulative changes since a fixed start time. + AggregationTemporality aggregation_temporality = 2; +} + +// ExponentialHistogram represents the type of a metric that is calculated by aggregating +// as a ExponentialHistogram of all reported double measurements over a time interval. +message ExponentialHistogram { + repeated ExponentialHistogramDataPoint data_points = 1; + + // aggregation_temporality describes if the aggregator reports delta changes + // since last report time, or cumulative changes since a fixed start time. 
+ AggregationTemporality aggregation_temporality = 2; +} + +// Summary metric data are used to convey quantile summaries, +// a Prometheus (see: https://prometheus.io/docs/concepts/metric_types/#summary) +// and OpenMetrics (see: https://github.com/OpenObservability/OpenMetrics/blob/4dbf6075567ab43296eed941037c12951faafb92/protos/prometheus.proto#L45) +// data type. These data points cannot always be merged in a meaningful way. +// While they can be useful in some applications, histogram data points are +// recommended for new applications. +// Summary metrics do not have an aggregation temporality field. This is +// because the count and sum fields of a SummaryDataPoint are assumed to be +// cumulative values. +message Summary { + repeated SummaryDataPoint data_points = 1; +} + +// AggregationTemporality defines how a metric aggregator reports aggregated +// values. It describes how those values relate to the time interval over +// which they are aggregated. +enum AggregationTemporality { + // UNSPECIFIED is the default AggregationTemporality, it MUST not be used. + AGGREGATION_TEMPORALITY_UNSPECIFIED = 0; + + // DELTA is an AggregationTemporality for a metric aggregator which reports + // changes since last report time. Successive metrics contain aggregation of + // values from continuous and non-overlapping intervals. + // + // The values for a DELTA metric are based only on the time interval + // associated with one measurement cycle. There is no dependency on + // previous measurements like is the case for CUMULATIVE metrics. + // + // For example, consider a system measuring the number of requests that + // it receives and reports the sum of these requests every second as a + // DELTA metric: + // + // 1. The system starts receiving at time=t_0. + // 2. A request is received, the system measures 1 request. + // 3. A request is received, the system measures 1 request. + // 4. A request is received, the system measures 1 request. + // 5. 
The 1 second collection cycle ends. A metric is exported for the + // number of requests received over the interval of time t_0 to + // t_0+1 with a value of 3. + // 6. A request is received, the system measures 1 request. + // 7. A request is received, the system measures 1 request. + // 8. The 1 second collection cycle ends. A metric is exported for the + // number of requests received over the interval of time t_0+1 to + // t_0+2 with a value of 2. + AGGREGATION_TEMPORALITY_DELTA = 1; + + // CUMULATIVE is an AggregationTemporality for a metric aggregator which + // reports changes since a fixed start time. This means that current values + // of a CUMULATIVE metric depend on all previous measurements since the + // start time. Because of this, the sender is required to retain this state + // in some form. If this state is lost or invalidated, the CUMULATIVE metric + // values MUST be reset and a new fixed start time following the last + // reported measurement time sent MUST be used. + // + // For example, consider a system measuring the number of requests that + // it receives and reports the sum of these requests every second as a + // CUMULATIVE metric: + // + // 1. The system starts receiving at time=t_0. + // 2. A request is received, the system measures 1 request. + // 3. A request is received, the system measures 1 request. + // 4. A request is received, the system measures 1 request. + // 5. The 1 second collection cycle ends. A metric is exported for the + // number of requests received over the interval of time t_0 to + // t_0+1 with a value of 3. + // 6. A request is received, the system measures 1 request. + // 7. A request is received, the system measures 1 request. + // 8. The 1 second collection cycle ends. A metric is exported for the + // number of requests received over the interval of time t_0 to + // t_0+2 with a value of 5. + // 9. The system experiences a fault and loses state. + // 10. The system recovers and resumes receiving at time=t_1. 
+ // 11. A request is received, the system measures 1 request. + // 12. The 1 second collection cycle ends. A metric is exported for the + // number of requests received over the interval of time t_1 to + // t_1+1 with a value of 1. + // + // Note: Even though, when reporting changes since last report time, using + // CUMULATIVE is valid, it is not recommended. This may cause problems for + // systems that do not use start_time to determine when the aggregation + // value was reset (e.g. Prometheus). + AGGREGATION_TEMPORALITY_CUMULATIVE = 2; +} + +// DataPointFlags is defined as a protobuf 'uint32' type and is to be used as a +// bit-field representing 32 distinct boolean flags. Each flag defined in this +// enum is a bit-mask. To test the presence of a single flag in the flags of +// a data point, for example, use an expression like: +// +// (point.flags & DATA_POINT_FLAGS_NO_RECORDED_VALUE_MASK) == DATA_POINT_FLAGS_NO_RECORDED_VALUE_MASK +// +enum DataPointFlags { + // The zero value for the enum. Should not be used for comparisons. + // Instead use bitwise "and" with the appropriate mask as shown above. + DATA_POINT_FLAGS_DO_NOT_USE = 0; + + // This DataPoint is valid but has no recorded value. This value + // SHOULD be used to reflect explicitly missing data in a series, as + // for an equivalent to the Prometheus "staleness marker". + DATA_POINT_FLAGS_NO_RECORDED_VALUE_MASK = 1; + + // Bits 2-31 are reserved for future use. +} + +// NumberDataPoint is a single data point in a timeseries that describes the +// time-varying scalar value of a metric. +message NumberDataPoint { + reserved 1; + + // The set of key/value pairs that uniquely identify the timeseries from + // where this point belongs. The list may be empty (may contain 0 elements). + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key).
+ repeated opentelemetry.proto.common.v1.KeyValue attributes = 7; + + // StartTimeUnixNano is optional but strongly encouraged, see the + // the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 start_time_unix_nano = 2; + + // TimeUnixNano is required, see the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 time_unix_nano = 3; + + // The value itself. A point is considered invalid when one of the recognized + // value fields is not present inside this oneof. + oneof value { + double as_double = 4; + sfixed64 as_int = 6; + } + + // (Optional) List of exemplars collected from + // measurements that were used to form the data point + repeated Exemplar exemplars = 5; + + // Flags that apply to this specific data point. See DataPointFlags + // for the available flags and their meaning. + uint32 flags = 8; +} + +// HistogramDataPoint is a single data point in a timeseries that describes the +// time-varying values of a Histogram. A Histogram contains summary statistics +// for a population of values, it may optionally contain the distribution of +// those values across a set of buckets. +// +// If the histogram contains the distribution of values, then both +// "explicit_bounds" and "bucket counts" fields must be defined. +// If the histogram does not contain the distribution of values, then both +// "explicit_bounds" and "bucket_counts" must be omitted and only "count" and +// "sum" are known. +message HistogramDataPoint { + reserved 1; + + // The set of key/value pairs that uniquely identify the timeseries from + // where this point belongs. The list may be empty (may contain 0 elements). + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). 
+ repeated opentelemetry.proto.common.v1.KeyValue attributes = 9; + + // StartTimeUnixNano is optional but strongly encouraged, see the + // the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 start_time_unix_nano = 2; + + // TimeUnixNano is required, see the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 time_unix_nano = 3; + + // count is the number of values in the population. Must be non-negative. This + // value must be equal to the sum of the "count" fields in buckets if a + // histogram is provided. + fixed64 count = 4; + + // sum of the values in the population. If count is zero then this field + // must be zero. + // + // Note: Sum should only be filled out when measuring non-negative discrete + // events, and is assumed to be monotonic over the values of these events. + // Negative events *can* be recorded, but sum should not be filled out when + // doing so. This is specifically to enforce compatibility w/ OpenMetrics, + // see: https://github.com/prometheus/OpenMetrics/blob/v1.0.0/specification/OpenMetrics.md#histogram + optional double sum = 5; + + // bucket_counts is an optional field contains the count values of histogram + // for each bucket. + // + // The sum of the bucket_counts must equal the value in the count field. + // + // The number of elements in bucket_counts array must be by one greater than + // the number of elements in explicit_bounds array. + repeated fixed64 bucket_counts = 6; + + // explicit_bounds specifies buckets with explicitly defined bounds for values. 
+ // + // The boundaries for bucket at index i are: + // + // (-infinity, explicit_bounds[i]] for i == 0 + // (explicit_bounds[i-1], explicit_bounds[i]] for 0 < i < size(explicit_bounds) + // (explicit_bounds[i-1], +infinity) for i == size(explicit_bounds) + // + // The values in the explicit_bounds array must be strictly increasing. + // + // Histogram buckets are inclusive of their upper boundary, except the last + // bucket where the boundary is at infinity. This format is intentionally + // compatible with the OpenMetrics histogram definition. + repeated double explicit_bounds = 7; + + // (Optional) List of exemplars collected from + // measurements that were used to form the data point + repeated Exemplar exemplars = 8; + + // Flags that apply to this specific data point. See DataPointFlags + // for the available flags and their meaning. + uint32 flags = 10; + + // min is the minimum value over (start_time, end_time]. + optional double min = 11; + + // max is the maximum value over (start_time, end_time]. + optional double max = 12; +} + +// ExponentialHistogramDataPoint is a single data point in a timeseries that describes the +// time-varying values of a ExponentialHistogram of double values. A ExponentialHistogram contains +// summary statistics for a population of values, it may optionally contain the +// distribution of those values across a set of buckets. +// +message ExponentialHistogramDataPoint { + // The set of key/value pairs that uniquely identify the timeseries from + // where this point belongs. The list may be empty (may contain 0 elements). + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated opentelemetry.proto.common.v1.KeyValue attributes = 1; + + // StartTimeUnixNano is optional but strongly encouraged, see the + // the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. 
+ fixed64 start_time_unix_nano = 2; + + // TimeUnixNano is required, see the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 time_unix_nano = 3; + + // count is the number of values in the population. Must be + // non-negative. This value must be equal to the sum of the "bucket_counts" + // values in the positive and negative Buckets plus the "zero_count" field. + fixed64 count = 4; + + // sum of the values in the population. If count is zero then this field + // must be zero. + // + // Note: Sum should only be filled out when measuring non-negative discrete + // events, and is assumed to be monotonic over the values of these events. + // Negative events *can* be recorded, but sum should not be filled out when + // doing so. This is specifically to enforce compatibility w/ OpenMetrics, + // see: https://github.com/prometheus/OpenMetrics/blob/v1.0.0/specification/OpenMetrics.md#histogram + optional double sum = 5; + + // scale describes the resolution of the histogram. Boundaries are + // located at powers of the base, where: + // + // base = (2^(2^-scale)) + // + // The histogram bucket identified by `index`, a signed integer, + // contains values that are greater than (base^index) and + // less than or equal to (base^(index+1)). + // + // The positive and negative ranges of the histogram are expressed + // separately. Negative values are mapped by their absolute value + // into the negative range using the same scale as the positive range. + // + // scale is not restricted by the protocol, as the permissible + // values depend on the range of the data. + sint32 scale = 6; + + // zero_count is the count of values that are either exactly zero or + // within the region considered zero by the instrumentation at the + // tolerated degree of precision. 
This bucket stores values that + // cannot be expressed using the standard exponential formula as + // well as values that have been rounded to zero. + // + // Implementations MAY consider the zero bucket to have probability + // mass equal to (zero_count / count). + fixed64 zero_count = 7; + + // positive carries the positive range of exponential bucket counts. + Buckets positive = 8; + + // negative carries the negative range of exponential bucket counts. + Buckets negative = 9; + + // Buckets are a set of bucket counts, encoded in a contiguous array + // of counts. + message Buckets { + // Offset is the bucket index of the first entry in the bucket_counts array. + // + // Note: This uses a varint encoding as a simple form of compression. + sint32 offset = 1; + + // bucket_counts is an array of count values, where bucket_counts[i] carries + // the count of the bucket at index (offset+i). bucket_counts[i] is the count + // of values greater than base^(offset+i) and less than or equal to + // base^(offset+i+1). + // + // Note: By contrast, the explicit HistogramDataPoint uses + // fixed64. This field is expected to have many buckets, + // especially zeros, so uint64 has been selected to ensure + // varint encoding. + repeated uint64 bucket_counts = 2; + } + + // Flags that apply to this specific data point. See DataPointFlags + // for the available flags and their meaning. + uint32 flags = 10; + + // (Optional) List of exemplars collected from + // measurements that were used to form the data point + repeated Exemplar exemplars = 11; + + // min is the minimum value over (start_time, end_time]. + optional double min = 12; + + // max is the maximum value over (start_time, end_time]. + optional double max = 13; + + // ZeroThreshold may be optionally set to convey the width of the zero + // region. Where the zero region is defined as the closed interval + // [-ZeroThreshold, ZeroThreshold]. 
+ // When ZeroThreshold is 0, zero count bucket stores values that cannot be + // expressed using the standard exponential formula as well as values that + // have been rounded to zero. + double zero_threshold = 14; +} + +// SummaryDataPoint is a single data point in a timeseries that describes the +// time-varying values of a Summary metric. The count and sum fields represent +// cumulative values. +message SummaryDataPoint { + reserved 1; + + // The set of key/value pairs that uniquely identify the timeseries from + // where this point belongs. The list may be empty (may contain 0 elements). + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated opentelemetry.proto.common.v1.KeyValue attributes = 7; + + // StartTimeUnixNano is optional but strongly encouraged, see the + // the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 start_time_unix_nano = 2; + + // TimeUnixNano is required, see the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 time_unix_nano = 3; + + // count is the number of values in the population. Must be non-negative. + fixed64 count = 4; + + // sum of the values in the population. If count is zero then this field + // must be zero. + // + // Note: Sum should only be filled out when measuring non-negative discrete + // events, and is assumed to be monotonic over the values of these events. + // Negative events *can* be recorded, but sum should not be filled out when + // doing so. This is specifically to enforce compatibility w/ OpenMetrics, + // see: https://github.com/prometheus/OpenMetrics/blob/v1.0.0/specification/OpenMetrics.md#summary + double sum = 5; + + // Represents the value at a given quantile of a distribution. 
+ // + // To record Min and Max values following conventions are used: + // - The 1.0 quantile is equivalent to the maximum value observed. + // - The 0.0 quantile is equivalent to the minimum value observed. + // + // See the following issue for more context: + // https://github.com/open-telemetry/opentelemetry-proto/issues/125 + message ValueAtQuantile { + // The quantile of a distribution. Must be in the interval + // [0.0, 1.0]. + double quantile = 1; + + // The value at the given quantile of a distribution. + // + // Quantile values must NOT be negative. + double value = 2; + } + + // (Optional) list of values at different quantiles of the distribution calculated + // from the current snapshot. The quantiles must be strictly increasing. + repeated ValueAtQuantile quantile_values = 6; + + // Flags that apply to this specific data point. See DataPointFlags + // for the available flags and their meaning. + uint32 flags = 8; +} + +// A representation of an exemplar, which is a sample input measurement. +// Exemplars also hold information about the environment when the measurement +// was recorded, for example the span and trace ID of the active span when the +// exemplar was recorded. +message Exemplar { + reserved 1; + + // The set of key/value pairs that were filtered out by the aggregator, but + // recorded alongside the original measurement. Only key/value pairs that were + // filtered out by the aggregator should be included + repeated opentelemetry.proto.common.v1.KeyValue filtered_attributes = 7; + + // time_unix_nano is the exact time when this exemplar was recorded + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 time_unix_nano = 2; + + // The value of the measurement that was recorded. An exemplar is + // considered invalid when one of the recognized value fields is not present + // inside this oneof. 
+ oneof value { + double as_double = 3; + sfixed64 as_int = 6; + } + + // (Optional) Span ID of the exemplar trace. + // span_id may be missing if the measurement is not recorded inside a trace + // or if the trace is not sampled. + bytes span_id = 4; + + // (Optional) Trace ID of the exemplar trace. + // trace_id may be missing if the measurement is not recorded inside a trace + // or if the trace is not sampled. + bytes trace_id = 5; +} diff --git a/crates/health/src/api_client.rs b/crates/health/src/api_client.rs index 4ea2fe5607..98a9d45168 100644 --- a/crates/health/src/api_client.rs +++ b/crates/health/src/api_client.rs @@ -15,6 +15,7 @@ * limitations under the License. */ +use std::collections::HashMap; use std::convert::TryFrom; use std::net::IpAddr; use std::str::FromStr; @@ -92,6 +93,34 @@ impl CredentialProvider for ApiCredentialProvider { } } +fn machine_slot_number( + machine: &rpc::forge::Machine, + position: Option<&rpc::forge::MachinePositionInfo>, +) -> Option { + position + .and_then(|position| position.physical_slot_number) + .or_else(|| { + machine + .placement_in_rack + .as_ref() + .and_then(|placement| placement.slot_number) + }) +} + +fn machine_tray_index( + machine: &rpc::forge::Machine, + position: Option<&rpc::forge::MachinePositionInfo>, +) -> Option { + position + .and_then(|position| position.compute_tray_index) + .or_else(|| { + machine + .placement_in_rack + .as_ref() + .and_then(|placement| placement.tray_index) + }) +} + fn switch_endpoint_metadata( switch: &rpc::forge::Switch, endpoint_role: SwitchEndpointRole, @@ -170,13 +199,15 @@ impl ApiClientWrapper { .find_machines_by_ids(request) .await .map_err(HealthError::ApiInvocationError)?; + let positions = self.fetch_machine_position_info(ids_chunk).await; tracing::debug!( "Fetched details for {} machines with chunk size of 100", machines.machines.len(), ); for machine in machines.machines { - match self.extract_machine_endpoint(&machine).await { + let position = 
machine.id.as_ref().and_then(|id| positions.get(id)); + match self.extract_machine_endpoint(&machine, position).await { Ok(endpoint) => endpoints.push(Arc::new(endpoint)), Err(error) => tracing::warn!( ?machine, @@ -190,6 +221,30 @@ impl ApiClientWrapper { Ok(endpoints) } + async fn fetch_machine_position_info( + &self, + machine_ids: &[carbide_uuid::machine::MachineId], + ) -> HashMap { + let request = rpc::forge::MachinePositionQuery { + machine_ids: machine_ids.to_vec(), + }; + + match self.client.get_machine_position_info(request).await { + Ok(response) => response + .machine_position_info + .into_iter() + .filter_map(|info| info.machine_id.map(|id| (id, info))) + .collect(), + Err(error) => { + tracing::warn!( + ?error, + "failed to fetch machine position info; falling back to machine placement metadata" + ); + HashMap::new() + } + } + } + async fn fetch_switch_endpoints(&self) -> Vec> { let switch_request = rpc::forge::SwitchQuery { name: None, @@ -267,6 +322,7 @@ impl ApiClientWrapper { async fn extract_machine_endpoint( &self, machine: &rpc::forge::Machine, + position: Option<&rpc::forge::MachinePositionInfo>, ) -> Result { let Some(bmc_info) = &machine.bmc_info else { return Err(HealthError::GenericError( @@ -282,14 +338,8 @@ impl ApiClientWrapper { .as_ref() .and_then(|info| info.dmi_data.as_ref()) .map(|dmi| dmi.chassis_serial.clone()), - slot_number: machine - .placement_in_rack - .as_ref() - .and_then(|placement| placement.slot_number), - tray_index: machine - .placement_in_rack - .as_ref() - .and_then(|placement| placement.tray_index), + slot_number: machine_slot_number(machine, position), + tray_index: machine_tray_index(machine, position), nvlink_domain_uuid: machine .nvlink_info .as_ref() @@ -380,7 +430,7 @@ impl ApiClientWrapper { id: power_shelf.id, serial, })), - None, + power_shelf.rack_id.clone(), ApiCredentialKind::Bmc, ) .await @@ -579,3 +629,41 @@ impl From for BmcCredentials { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] 
+ fn machine_position_info_takes_precedence_over_machine_placement() { + let machine = rpc::forge::Machine { + placement_in_rack: Some(rpc::forge::PlacementInRack { + slot_number: Some(2), + tray_index: Some(1), + }), + ..Default::default() + }; + let position = rpc::forge::MachinePositionInfo { + physical_slot_number: Some(11), + compute_tray_index: Some(4), + ..Default::default() + }; + + assert_eq!(machine_slot_number(&machine, Some(&position)), Some(11)); + assert_eq!(machine_tray_index(&machine, Some(&position)), Some(4)); + } + + #[test] + fn machine_placement_is_fallback_when_position_info_is_absent() { + let machine = rpc::forge::Machine { + placement_in_rack: Some(rpc::forge::PlacementInRack { + slot_number: Some(2), + tray_index: Some(1), + }), + ..Default::default() + }; + + assert_eq!(machine_slot_number(&machine, None), Some(2)); + assert_eq!(machine_tray_index(&machine, None), Some(1)); + } +} diff --git a/crates/health/src/collectors/mod.rs b/crates/health/src/collectors/mod.rs index 5b281ff687..bd1de1750b 100644 --- a/crates/health/src/collectors/mod.rs +++ b/crates/health/src/collectors/mod.rs @@ -27,6 +27,7 @@ pub use firmware::{FirmwareCollector, FirmwareCollectorConfig}; pub use leak_detector::{LeakDetectorCollector, LeakDetectorCollectorConfig}; pub use logs::{LogsCollector, LogsCollectorConfig, SseLogCollector, SseLogCollectorConfig}; pub use nmxt::{NmxtCollector, NmxtCollectorConfig}; +pub use nvue::gnmi::subscriber::spawn_gnmi_collector; pub use nvue::rest::collector::{NvueRestCollector, NvueRestCollectorConfig}; pub use runtime::{ BackoffConfig, Collector, CollectorStartContext, EventStream, ExponentialBackoff, diff --git a/crates/health/src/collectors/nvue/gnmi/client.rs b/crates/health/src/collectors/nvue/gnmi/client.rs new file mode 100644 index 0000000000..336bd30daa --- /dev/null +++ b/crates/health/src/collectors/nvue/gnmi/client.rs @@ -0,0 +1,596 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use std::time::Duration; + +use tonic::metadata::MetadataMap; +use tonic::transport::{Channel, Endpoint}; +use tonic::{Extensions, Request}; + +use super::proto::g_nmi_client::GNmiClient as TonicGnmiClient; +use super::proto::subscription_list::Mode as SubscriptionListMode; +use super::proto::{ + self, Encoding, Path, PathElem, SubscribeRequest, Subscription, SubscriptionList, + SubscriptionMode, +}; +use crate::HealthError; +use crate::config::NvueGnmiPaths; + +pub fn nvue_subscribe_paths(paths_config: &NvueGnmiPaths) -> Vec { + let mut paths = Vec::with_capacity(3); + if paths_config.components_enabled { + paths.push(Path { + elem: vec![ + PathElem { + name: "components".into(), + key: Default::default(), + }, + PathElem { + name: "component".into(), + key: Default::default(), + }, + ], + ..Default::default() + }); + } + if paths_config.interfaces_enabled { + paths.push(Path { + elem: vec![ + PathElem { + name: "interfaces".into(), + key: Default::default(), + }, + PathElem { + name: "interface".into(), + key: Default::default(), + }, + ], + ..Default::default() + }); + } + if paths_config.leak_sensors_enabled { + paths.push(Path { + elem: vec![ + PathElem { + name: "platform-general".into(), + key: Default::default(), + }, + PathElem { + name: "leak-sensors".into(), + key: Default::default(), + }, + ], + ..Default::default() + }); + } + 
paths +} + +#[derive(Clone)] +pub struct GnmiClient { + switch_id: String, + host: String, + port: u16, + username: Option, + password: Option, + request_timeout: Duration, +} + +impl GnmiClient { + pub fn new( + switch_id: String, + host: &str, + port: u16, + username: Option, + password: Option, + request_timeout: Duration, + ) -> Self { + Self { + switch_id, + host: host.to_string(), + port, + username, + password, + request_timeout, + } + } + + async fn connect(&self) -> Result, HealthError> { + let target = format!("{}:{}", self.host, self.port); + + let uri = http::Uri::builder() + .scheme("https") + .authority(target.as_str()) + .path_and_query("/") + .build() + .map_err(|e| { + HealthError::GnmiError(format!( + "switch {}: invalid endpoint URI: {e}", + self.switch_id + )) + })?; + + let endpoint = Endpoint::from(uri) + .connect_timeout(self.request_timeout) + .timeout(self.request_timeout); + + let tls_config = crate::collectors::nvue::tls::self_signed_tls_config(); + let connector = hyper_rustls::HttpsConnectorBuilder::new() + .with_tls_config(tls_config) + .https_only() + .enable_http2() + .build(); + + let channel = endpoint + .connect_with_connector(connector) + .await + .map_err(|e| { + HealthError::GnmiError(format!( + "switch {}: connection failed to {target}: {e}", + self.switch_id + )) + })?; + + tracing::debug!( + switch_id = %self.switch_id, + target = %target, + "gNMI TLS channel established (skip-verify)" + ); + + Ok(TonicGnmiClient::new(channel)) + } + + /// open a gNMI SAMPLE streaming subscription + pub async fn subscribe_sample( + &self, + paths: &[Path], + sample_interval_nanos: u64, + ) -> Result, HealthError> { + let mut client = self.connect().await?; + + let subscribe_request = build_sample_subscribe_request(paths, sample_interval_nanos); + + let auth = build_auth_metadata(&self.username, &self.password)?; + let stream = tokio_stream::once(subscribe_request); + let request = Request::from_parts(auth, Extensions::default(), stream); + + 
let response = client.subscribe(request).await.map_err(|e| { + HealthError::GnmiError(format!( + "switch {}: subscribe_sample RPC failed: {e}", + self.switch_id + )) + })?; + + tracing::debug!( + switch_id = %self.switch_id, + sample_interval_nanos, + "gNMI SAMPLE stream opened" + ); + + Ok(response.into_inner()) + } + + /// open a gNMI ON_CHANGE streaming subscription + pub async fn subscribe_on_change( + &self, + prefix: &Path, + paths: &[Path], + ) -> Result, HealthError> { + let mut client = self.connect().await?; + + let subscribe_request = build_on_change_subscribe_request(prefix, paths); + + let auth = build_auth_metadata(&self.username, &self.password)?; + let stream = tokio_stream::once(subscribe_request); + let request = Request::from_parts(auth, Extensions::default(), stream); + + let response = client.subscribe(request).await.map_err(|e| { + HealthError::GnmiError(format!( + "switch {}: subscribe_on_change RPC failed: {e}", + self.switch_id + )) + })?; + + tracing::debug!( + switch_id = %self.switch_id, + "gNMI ON_CHANGE stream opened" + ); + + Ok(response.into_inner()) + } +} + +pub(crate) fn system_events_prefix() -> Path { + Path { + target: "nvos".to_string(), + elem: vec![PathElem { + name: "system-events".to_string(), + key: Default::default(), + }], + ..Default::default() + } +} + +/// gNMI path for ON_CHANGE system event subscriptions. An empty path subscribes +/// to all events below the `system-events` prefix. 
+pub(crate) fn system_events_subscribe_path() -> Vec { + vec![Path::default()] +} + +fn build_on_change_subscribe_request(prefix: &Path, paths: &[Path]) -> SubscribeRequest { + let subscription_list = SubscriptionList { + prefix: Some(prefix.clone()), + subscription: paths + .iter() + .map(|path| Subscription { + path: Some(path.clone()), + mode: SubscriptionMode::OnChange.into(), + ..Default::default() + }) + .collect(), + mode: SubscriptionListMode::Stream.into(), + encoding: Encoding::Json.into(), + updates_only: true, + ..Default::default() + }; + + SubscribeRequest { + request: Some(proto::subscribe_request::Request::Subscribe( + subscription_list, + )), + extension: vec![], + } +} + +fn build_sample_subscribe_request(paths: &[Path], sample_interval_nanos: u64) -> SubscribeRequest { + let subscription_list = SubscriptionList { + prefix: Some(Path { + target: "nvos".to_string(), + ..Default::default() + }), + subscription: paths + .iter() + .map(|path| Subscription { + path: Some(path.clone()), + mode: SubscriptionMode::Sample.into(), + sample_interval: sample_interval_nanos, + ..Default::default() + }) + .collect(), + mode: SubscriptionListMode::Stream.into(), + encoding: Encoding::Json.into(), + ..Default::default() + }; + + SubscribeRequest { + request: Some(proto::subscribe_request::Request::Subscribe( + subscription_list, + )), + extension: vec![], + } +} + +fn build_auth_metadata( + username: &Option, + password: &Option, +) -> Result { + let mut meta = MetadataMap::new(); + if let Some(username) = username { + let value = username.parse().map_err(|e| { + HealthError::GnmiError(format!("invalid username for gRPC metadata: {e}")) + })?; + meta.insert("username", value); + } + if let Some(password) = password { + let value = password + .parse() + .map_err(|_e| HealthError::GnmiError("invalid password for gRPC metadata".into()))?; + meta.insert("password", value); + } + Ok(meta) +} + +/// Extract a string from a `TypedValue`, handling JSON-encoded bytes as 
well +/// as native string values. +#[allow(deprecated)] +pub fn typed_value_to_string(val: &proto::TypedValue) -> Option { + use proto::typed_value::Value; + match &val.value { + Some(Value::StringVal(s)) => Some(s.clone()), + Some(Value::JsonVal(bytes)) | Some(Value::JsonIetfVal(bytes)) => { + let s = String::from_utf8_lossy(bytes); + let trimmed = s.trim().trim_matches('"'); + Some(trimmed.to_string()) + } + Some(Value::AsciiVal(s)) => Some(s.clone()), + Some(Value::IntVal(v)) => Some(v.to_string()), + Some(Value::UintVal(v)) => Some(v.to_string()), + Some(Value::BoolVal(v)) => Some(v.to_string()), + Some(Value::FloatVal(v)) => Some(v.to_string()), + Some(Value::DoubleVal(v)) => Some(v.to_string()), + _ => None, + } +} + +/// Extract a float from a `TypedValue`, handling JSON-encoded bytes, native +/// numeric values, and string representations. +#[allow(deprecated)] +pub fn typed_value_to_f64(val: &proto::TypedValue) -> Option { + use proto::typed_value::Value; + match &val.value { + Some(Value::DoubleVal(v)) => Some(*v), + Some(Value::FloatVal(v)) => Some(*v as f64), + Some(Value::IntVal(v)) => Some(*v as f64), + Some(Value::UintVal(v)) => Some(*v as f64), + Some(Value::StringVal(s)) => s.parse().ok(), + Some(Value::JsonVal(bytes)) | Some(Value::JsonIetfVal(bytes)) => { + let s = String::from_utf8_lossy(bytes); + s.trim().trim_matches('"').parse().ok() + } + _ => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_typed_value_to_string_string_val() { + let val = proto::TypedValue { + value: Some(proto::typed_value::Value::StringVal("healthy".to_string())), + }; + assert_eq!(typed_value_to_string(&val), Some("healthy".to_string())); + } + + #[test] + fn test_typed_value_to_string_json_val() { + let val = proto::TypedValue { + value: Some(proto::typed_value::Value::JsonVal(b"\"degraded\"".to_vec())), + }; + assert_eq!(typed_value_to_string(&val), Some("degraded".to_string())); + } + + #[test] + fn 
test_typed_value_to_string_json_unquoted() { + let val = proto::TypedValue { + value: Some(proto::typed_value::Value::JsonVal(b"42".to_vec())), + }; + assert_eq!(typed_value_to_string(&val), Some("42".to_string())); + } + + #[test] + fn test_typed_value_to_string_int() { + let val = proto::TypedValue { + value: Some(proto::typed_value::Value::IntVal(-5)), + }; + assert_eq!(typed_value_to_string(&val), Some("-5".to_string())); + } + + #[test] + fn test_typed_value_to_string_uint() { + let val = proto::TypedValue { + value: Some(proto::typed_value::Value::UintVal(100)), + }; + assert_eq!(typed_value_to_string(&val), Some("100".to_string())); + } + + #[test] + fn test_typed_value_to_string_bool() { + let val = proto::TypedValue { + value: Some(proto::typed_value::Value::BoolVal(true)), + }; + assert_eq!(typed_value_to_string(&val), Some("true".to_string())); + } + + #[test] + fn test_typed_value_to_string_none() { + let val = proto::TypedValue { value: None }; + assert_eq!(typed_value_to_string(&val), None); + } + + #[test] + fn test_typed_value_to_f64_double() { + let val = proto::TypedValue { + value: Some(proto::typed_value::Value::DoubleVal(42.5)), + }; + assert_eq!(typed_value_to_f64(&val), Some(42.5)); + } + + #[test] + fn test_typed_value_to_f64_int() { + let val = proto::TypedValue { + value: Some(proto::typed_value::Value::IntVal(42)), + }; + assert_eq!(typed_value_to_f64(&val), Some(42.0)); + } + + #[test] + fn test_typed_value_to_f64_json_string() { + let val = proto::TypedValue { + value: Some(proto::typed_value::Value::JsonVal(b"\"1.5e-3\"".to_vec())), + }; + assert_eq!(typed_value_to_f64(&val), Some(0.0015)); + } + + #[test] + fn test_typed_value_to_f64_json_number() { + let val = proto::TypedValue { + value: Some(proto::typed_value::Value::JsonVal(b"99.9".to_vec())), + }; + assert_eq!(typed_value_to_f64(&val), Some(99.9)); + } + + #[test] + fn test_typed_value_to_f64_string() { + let val = proto::TypedValue { + value: 
Some(proto::typed_value::Value::StringVal("1.23".to_string())), + }; + assert_eq!(typed_value_to_f64(&val), Some(1.23)); + } + + #[test] + fn test_typed_value_to_f64_non_numeric_string() { + let val = proto::TypedValue { + value: Some(proto::typed_value::Value::StringVal("hello".to_string())), + }; + assert_eq!(typed_value_to_f64(&val), None); + } + + #[test] + fn test_typed_value_to_f64_none() { + let val = proto::TypedValue { value: None }; + assert_eq!(typed_value_to_f64(&val), None); + } + + #[test] + fn test_nvue_subscribe_paths_all_enabled() { + let paths = nvue_subscribe_paths(&NvueGnmiPaths::default()); + assert_eq!(paths.len(), 3); + + assert_eq!(paths[0].elem.len(), 2); + assert_eq!(paths[0].elem[0].name, "components"); + assert_eq!(paths[0].elem[1].name, "component"); + + assert_eq!(paths[1].elem.len(), 2); + assert_eq!(paths[1].elem[0].name, "interfaces"); + assert_eq!(paths[1].elem[1].name, "interface"); + + assert_eq!(paths[2].elem.len(), 2); + assert_eq!(paths[2].elem[0].name, "platform-general"); + assert_eq!(paths[2].elem[1].name, "leak-sensors"); + } + + #[test] + fn test_nvue_subscribe_paths_selective() { + let paths = nvue_subscribe_paths(&NvueGnmiPaths { + components_enabled: false, + interfaces_enabled: true, + leak_sensors_enabled: false, + }); + assert_eq!(paths.len(), 1); + assert_eq!(paths[0].elem.len(), 2); + assert_eq!(paths[0].elem[0].name, "interfaces"); + assert_eq!(paths[0].elem[1].name, "interface"); + } + + #[test] + fn test_nvue_subscribe_paths_none_enabled() { + let paths = nvue_subscribe_paths(&NvueGnmiPaths { + components_enabled: false, + interfaces_enabled: false, + leak_sensors_enabled: false, + }); + assert!(paths.is_empty()); + } + + #[test] + fn test_build_sample_subscribe_request() { + let paths = nvue_subscribe_paths(&NvueGnmiPaths::default()); + let interval_nanos = 300_000_000_000u64; + + let req = build_sample_subscribe_request(&paths, interval_nanos); + + let sub_list = match req.request { + 
Some(proto::subscribe_request::Request::Subscribe(sl)) => sl, + _ => panic!("expected Subscribe variant"), + }; + + assert_eq!( + sub_list.mode, + i32::from(SubscriptionListMode::Stream), + "must use Stream mode for SAMPLE subscriptions" + ); + assert_eq!( + sub_list.encoding, + i32::from(Encoding::Json), + "encoding must be JSON" + ); + + let prefix = sub_list.prefix.expect("prefix must be set"); + assert_eq!(prefix.target, "nvos", "target must be nvos"); + + assert_eq!(sub_list.subscription.len(), 3); + for sub in &sub_list.subscription { + assert_eq!( + sub.mode, + i32::from(SubscriptionMode::Sample), + "each subscription must use Sample mode" + ); + assert_eq!( + sub.sample_interval, interval_nanos, + "sample_interval must match the requested interval" + ); + assert!(sub.path.is_some(), "each subscription must have a path"); + } + } + + #[test] + fn test_system_events_prefix() { + let prefix = system_events_prefix(); + assert_eq!(prefix.target, "nvos"); + assert_eq!(prefix.elem.len(), 1); + assert_eq!(prefix.elem[0].name, "system-events"); + } + + #[test] + fn test_system_events_subscribe_path() { + let paths = system_events_subscribe_path(); + assert_eq!(paths.len(), 1); + assert!( + paths[0].elem.is_empty(), + "empty path subscribes to all events under prefix" + ); + } + + #[test] + fn test_build_on_change_subscribe_request() { + let prefix = system_events_prefix(); + let paths = system_events_subscribe_path(); + + let req = build_on_change_subscribe_request(&prefix, &paths); + + let sub_list = match req.request { + Some(proto::subscribe_request::Request::Subscribe(sl)) => sl, + _ => panic!("expected Subscribe variant"), + }; + + assert_eq!( + sub_list.mode, + i32::from(SubscriptionListMode::Stream), + "must use Stream mode" + ); + assert_eq!( + sub_list.encoding, + i32::from(Encoding::Json), + "encoding must be JSON" + ); + assert!(sub_list.updates_only, "ON_CHANGE must use updates_only"); + + let req_prefix = sub_list.prefix.expect("prefix must be set"); + 
assert_eq!(req_prefix.target, "nvos"); + assert_eq!(req_prefix.elem.len(), 1); + assert_eq!(req_prefix.elem[0].name, "system-events"); + + assert_eq!(sub_list.subscription.len(), 1); + assert_eq!( + sub_list.subscription[0].mode, + i32::from(SubscriptionMode::OnChange), + "subscription must use OnChange mode" + ); + } +} diff --git a/crates/health/src/collectors/nvue/gnmi/mod.rs b/crates/health/src/collectors/nvue/gnmi/mod.rs new file mode 100644 index 0000000000..3ce40c9d3d --- /dev/null +++ b/crates/health/src/collectors/nvue/gnmi/mod.rs @@ -0,0 +1,33 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +pub(crate) mod client; +pub(crate) mod on_change_processor; +pub(crate) mod sample_processor; +pub(crate) mod subscriber; + +// prost generates ExtensionId::EidUnset / EidExperimental from gnmi_ext.proto, +// where the proto convention prefixes every value with the enum abbreviation. +// clippy flags the shared "Eid" prefix but we can't control generated code. 
+#[allow(clippy::enum_variant_names)] +pub mod proto { + #[allow(clippy::enum_variant_names)] + pub mod gnmi_ext { + tonic::include_proto!("gnmi_ext"); + } + tonic::include_proto!("gnmi"); +} diff --git a/crates/health/src/collectors/nvue/gnmi/on_change_processor.rs b/crates/health/src/collectors/nvue/gnmi/on_change_processor.rs new file mode 100644 index 0000000000..d8064e488f --- /dev/null +++ b/crates/health/src/collectors/nvue/gnmi/on_change_processor.rs @@ -0,0 +1,572 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +use std::borrow::Cow; +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; +use std::time::Instant; + +use prometheus::{CounterVec, Gauge, Opts}; + +use super::client::typed_value_to_string; +use super::proto::{self, PathElem}; +use super::sample_processor::now_unix_secs; +use super::subscriber::GnmiStreamMetrics; +use crate::HealthError; +use crate::sink::{CollectorEvent, DataSink, EventContext, SensorHealthData}; + +type ParsedRow = HashMap; +type TableSnapshot = HashMap; + +pub(crate) const ON_CHANGE_STREAM_ID_SYSTEM_EVENTS: &str = "nvue_gnmi_events"; + +pub(crate) struct OnChangeStreamMetrics { + pub(crate) rows_total: CounterVec, + pub(crate) last_row_timestamp: Gauge, +} + +impl OnChangeStreamMetrics { + pub(crate) fn new( + registry: &prometheus::Registry, + prefix: &str, + stream_id: &str, + const_labels: HashMap, + ) -> Result { + let rows_total = CounterVec::new( + Opts::new( + format!("{prefix}_{stream_id}_total"), + "ON_CHANGE rows received by severity (field 'severity' if present)", + ) + .const_labels(const_labels.clone()), + &["severity"], + )?; + registry.register(Box::new(rows_total.clone()))?; + + let last_row_timestamp = Gauge::with_opts( + Opts::new( + format!("{prefix}_{stream_id}_last_timestamp"), + "Unix timestamp of most recent ON_CHANGE row", + ) + .const_labels(const_labels), + )?; + registry.register(Box::new(last_row_timestamp.clone()))?; + + Ok(Self { + rows_total, + last_row_timestamp, + }) + } +} + +pub(crate) struct GnmiOnChangeProcessor { + pub(crate) collector_name: String, + pub(crate) stream_metrics: OnChangeStreamMetrics, + pub(crate) data_sink: Option>, + pub(crate) event_context: EventContext, + pub(crate) switch_id: String, + previous_snapshot: Mutex, +} + +impl GnmiOnChangeProcessor { + pub(crate) fn new( + collector_name: String, + stream_metrics: OnChangeStreamMetrics, + data_sink: Option>, + event_context: EventContext, + switch_id: String, + ) -> Self { + Self { + collector_name, + stream_metrics, + 
data_sink, + event_context, + switch_id, + previous_snapshot: Mutex::new(HashMap::new()), + } + } + + #[allow(deprecated)] + pub(crate) fn process_subscribe_response( + &self, + resp: &proto::SubscribeResponse, + stream_metrics: &GnmiStreamMetrics, + ) { + let notification = match &resp.response { + Some(proto::subscribe_response::Response::Update(n)) => n, + Some(proto::subscribe_response::Response::SyncResponse(_)) => return, + Some(proto::subscribe_response::Response::Error(e)) => { + stream_metrics.stream_errors_total.inc(); + tracing::warn!( + code = e.code, + message = %e.message, + stream = %self.collector_name, + "nvue_gnmi ON_CHANGE: server error in stream" + ); + return; + } + None => return, + }; + + stream_metrics.notifications_received_total.inc(); + stream_metrics + .last_notification_timestamp + .set(now_unix_secs()); + + let start = Instant::now(); + let entity_count = self.process_notification(notification); + stream_metrics + .notification_processing_seconds + .observe(start.elapsed().as_secs_f64()); + stream_metrics.monitored_entities.set(entity_count as f64); + } + + fn process_notification(&self, notification: &proto::Notification) -> usize { + let prefix_elems: &[PathElem] = notification + .prefix + .as_ref() + .map(|p| p.elem.as_slice()) + .unwrap_or_default(); + + let mut current: TableSnapshot = HashMap::new(); + + for update in ¬ification.update { + let val = match update.val.as_ref() { + Some(v) => v, + None => continue, + }; + + let update_elems: &[PathElem] = update + .path + .as_ref() + .map(|p| p.elem.as_slice()) + .unwrap_or_default(); + + let combined: Vec<&PathElem> = prefix_elems.iter().chain(update_elems.iter()).collect(); + + let Some(instance_key) = find_instance_key(&combined) else { + continue; + }; + let Some(leaf_elem) = combined.last() else { + continue; + }; + + let value = typed_value_to_string(val).unwrap_or_default(); + current + .entry(instance_key.to_string()) + .or_default() + .insert(leaf_elem.name.clone(), value); 
+ } + + let mut previous = match self.previous_snapshot.lock() { + Ok(guard) => guard, + Err(poisoned) => poisoned.into_inner(), + }; + for (instance_id, row) in ¤t { + let is_new_or_changed = previous.get(instance_id).map(|p| p != row).unwrap_or(true); + if is_new_or_changed { + self.emit_row_as_metric(instance_id, row); + } + } + + let entity_count = current.len(); + *previous = current; + entity_count + } + + fn emit_row_as_metric(&self, instance_id: &str, row: &ParsedRow) { + let severity = row.get("severity").map(String::as_str).unwrap_or("unknown"); + let text = row.get("text").map(String::as_str).unwrap_or(""); + + self.stream_metrics.last_row_timestamp.set(now_unix_secs()); + self.stream_metrics + .rows_total + .with_label_values(&[severity]) + .inc(); + + tracing::info!( + switch_id = %self.switch_id, + stream = %self.collector_name, + instance_id, + severity, + text, + "nvue_gnmi ON_CHANGE: row received" + ); + + let Some(sink) = &self.data_sink else { return }; + + let key = format!("{}:{}", self.collector_name, instance_id); + let mut labels = vec![ + (Cow::Borrowed("instance_id"), instance_id.to_string()), + (Cow::Borrowed("text"), text.to_string()), + ]; + for (key, value) in row { + if key != "text" { + labels.push((Cow::Owned(key.clone()), value.clone())); + } + } + + sink.handle_event( + &self.event_context, + &CollectorEvent::Metric(Box::new(SensorHealthData { + key, + name: self.collector_name.clone(), + metric_type: "on_change_row".to_string(), + unit: "severity".to_string(), + value: severity_to_f64(Some(severity)), + labels, + context: None, + })), + ); + } +} + +fn find_instance_key<'a>(elems: &[&'a PathElem]) -> Option<&'a str> { + elems + .iter() + .find(|e| !e.key.is_empty()) + .and_then(|e| e.key.values().next().map(String::as_str)) +} + +fn severity_to_f64(severity: Option<&str>) -> f64 { + match severity { + Some(s) if s.eq_ignore_ascii_case("informational") => 1.0, + Some(s) if s.eq_ignore_ascii_case("warning") => 2.0, + Some(s) if 
s.eq_ignore_ascii_case("error") => 3.0, + Some(s) if s.eq_ignore_ascii_case("critical") => 4.0, + _ => 0.0, + } +} + +#[cfg(test)] +mod tests { + use std::str::FromStr; + + use carbide_uuid::rack::RackId; + use carbide_uuid::switch::{SwitchId, SwitchIdSource, SwitchType}; + use mac_address::MacAddress; + + use super::*; + use crate::endpoint::{BmcAddr, EndpointMetadata, SwitchData, SwitchEndpointRole}; + + const TEST_COLLECTOR_NAME: &str = "nvue_gnmi_system_events"; + + #[derive(Default)] + struct CapturingSink { + events: Mutex>, + } + + impl DataSink for CapturingSink { + fn sink_type(&self) -> &'static str { + "capturing_sink" + } + + fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { + self.events + .lock() + .expect("lock poisoned") + .push((context.clone(), event.clone())); + } + } + + fn test_labels() -> HashMap { + HashMap::from([( + "collector_type".to_string(), + ON_CHANGE_STREAM_ID_SYSTEM_EVENTS.to_string(), + )]) + } + + fn test_switch_id(label: &str) -> SwitchId { + let mut hash = [0u8; 32]; + let bytes = label.as_bytes(); + hash[..bytes.len().min(32)].copy_from_slice(&bytes[..bytes.len().min(32)]); + SwitchId::new(SwitchIdSource::Tpm, hash, SwitchType::NvLink) + } + + fn test_event_context(collector_type: &'static str) -> EventContext { + EventContext { + endpoint_key: "aa:bb:cc:dd:ee:ff".to_string(), + addr: BmcAddr { + ip: "10.0.0.1".parse().unwrap(), + port: None, + mac: MacAddress::from_str("AA:BB:CC:DD:EE:FF").unwrap(), + }, + collector_type, + metadata: None, + rack_id: None, + } + } + + fn test_processor(data_sink: Option>) -> GnmiOnChangeProcessor { + let registry = prometheus::Registry::new(); + let stream_metrics = + OnChangeStreamMetrics::new(®istry, "test", TEST_COLLECTOR_NAME, test_labels()) + .unwrap(); + GnmiOnChangeProcessor::new( + TEST_COLLECTOR_NAME.to_string(), + stream_metrics, + data_sink, + test_event_context(TEST_COLLECTOR_NAME), + "SN1234".to_string(), + ) + } + + fn make_path_elem(name: &str, keys: 
&[(&str, &str)]) -> PathElem { + PathElem { + name: name.to_string(), + key: keys + .iter() + .map(|(key, value)| (key.to_string(), value.to_string())) + .collect(), + } + } + + fn make_typed_value_string(value: &str) -> proto::TypedValue { + proto::TypedValue { + value: Some(proto::typed_value::Value::StringVal(value.to_string())), + } + } + + #[test] + fn test_find_instance_key() { + let elems = [ + make_path_elem("system-events", &[]), + make_path_elem("system-event", &[("event-id", "38")]), + make_path_elem("state", &[]), + make_path_elem("severity", &[]), + ]; + let refs: Vec<&PathElem> = elems.iter().collect(); + assert_eq!(find_instance_key(&refs), Some("38")); + } + + #[test] + fn test_find_instance_key_missing() { + let elems = [ + make_path_elem("system-events", &[]), + make_path_elem("state", &[]), + ]; + let refs: Vec<&PathElem> = elems.iter().collect(); + assert_eq!(find_instance_key(&refs), None); + } + + #[test] + fn test_severity_to_f64() { + assert_eq!(severity_to_f64(Some("informational")), 1.0); + assert_eq!(severity_to_f64(Some("warning")), 2.0); + assert_eq!(severity_to_f64(Some("error")), 3.0); + assert_eq!(severity_to_f64(Some("critical")), 4.0); + assert_eq!(severity_to_f64(Some("CRITICAL")), 4.0); + assert_eq!(severity_to_f64(Some("other")), 0.0); + assert_eq!(severity_to_f64(None), 0.0); + } + + #[test] + fn test_on_change_stream_metrics_duplicate_registration_fails() { + let registry = prometheus::Registry::new(); + let _ = OnChangeStreamMetrics::new(®istry, "test", "stream_a", test_labels()).unwrap(); + let result = OnChangeStreamMetrics::new(®istry, "test", "stream_a", test_labels()); + assert!(result.is_err()); + } + + #[test] + fn test_process_notification_severity_and_text() { + let processor = test_processor(None); + let notification = proto::Notification { + prefix: Some(proto::Path { + elem: vec![make_path_elem("system-events", &[])], + ..Default::default() + }), + update: vec![ + proto::Update { + path: Some(proto::Path { + elem: 
vec![ + make_path_elem("system-event", &[("event-id", "5")]), + make_path_elem("state", &[]), + make_path_elem("severity", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("critical")), + ..Default::default() + }, + proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("system-event", &[("event-id", "5")]), + make_path_elem("state", &[]), + make_path_elem("text", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("System fatal state detected")), + ..Default::default() + }, + ], + ..Default::default() + }; + + let count = processor.process_notification(¬ification); + assert_eq!(count, 1); + assert_eq!( + processor + .stream_metrics + .rows_total + .with_label_values(&["critical"]) + .get(), + 1.0 + ); + assert!(processor.stream_metrics.last_row_timestamp.get() > 0.0); + } + + #[test] + fn test_process_notification_snapshot_diff_no_duplicate_emit() { + let processor = test_processor(None); + let notification = proto::Notification { + prefix: Some(proto::Path { + elem: vec![make_path_elem("system-events", &[])], + ..Default::default() + }), + update: vec![ + proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("system-event", &[("event-id", "7")]), + make_path_elem("state", &[]), + make_path_elem("severity", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("error")), + ..Default::default() + }, + proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("system-event", &[("event-id", "7")]), + make_path_elem("state", &[]), + make_path_elem("text", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("same event")), + ..Default::default() + }, + ], + ..Default::default() + }; + + processor.process_notification(¬ification); + processor.process_notification(¬ification); + + assert_eq!( + processor + .stream_metrics + .rows_total + .with_label_values(&["error"]) + .get(), + 1.0 + ); + } + + #[test] + fn 
emitted_metrics_preserve_switch_position_context() { + let sink = Arc::new(CapturingSink::default()); + let switch_id = test_switch_id("switch-a"); + let registry = prometheus::Registry::new(); + let stream_metrics = + OnChangeStreamMetrics::new(®istry, "test", TEST_COLLECTOR_NAME, test_labels()) + .unwrap(); + let processor = GnmiOnChangeProcessor::new( + TEST_COLLECTOR_NAME.to_string(), + stream_metrics, + Some(sink.clone()), + EventContext { + endpoint_key: "aa:bb:cc:dd:ee:ff".to_string(), + addr: BmcAddr { + ip: "10.0.0.1".parse().unwrap(), + port: None, + mac: MacAddress::from_str("AA:BB:CC:DD:EE:FF").unwrap(), + }, + collector_type: ON_CHANGE_STREAM_ID_SYSTEM_EVENTS, + metadata: Some(EndpointMetadata::Switch(SwitchData { + id: Some(switch_id), + serial: "SN-SWITCH-001".to_string(), + slot_number: Some(7), + tray_index: Some(3), + endpoint_role: SwitchEndpointRole::Host, + is_primary: false, + nmxt_enabled: false, + })), + rack_id: Some(RackId::new("RACK_2")), + }, + "SN-SWITCH-001".to_string(), + ); + let notification = proto::Notification { + prefix: Some(proto::Path { + elem: vec![make_path_elem("system-events", &[])], + ..Default::default() + }), + update: vec![ + proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("system-event", &[("event-id", "42")]), + make_path_elem("state", &[]), + make_path_elem("severity", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("warning")), + ..Default::default() + }, + proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("system-event", &[("event-id", "42")]), + make_path_elem("state", &[]), + make_path_elem("text", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("Link down detected on swp1")), + ..Default::default() + }, + ], + ..Default::default() + }; + + assert_eq!(processor.process_notification(¬ification), 1); + + let events = sink.events.lock().expect("lock poisoned"); + assert_eq!(events.len(), 1); + let (context, 
event) = &events[0]; + assert_eq!(context.switch_id(), Some(switch_id)); + assert_eq!(context.switch_slot_number(), Some(7)); + assert_eq!(context.switch_tray_index(), Some(3)); + assert_eq!(context.rack_id().map(RackId::as_str), Some("RACK_2")); + let CollectorEvent::Metric(metric) = event else { + panic!("expected metric event"); + }; + assert_eq!(metric.metric_type, "on_change_row"); + assert_eq!(metric.value, 2.0); + assert!( + metric + .labels + .iter() + .any(|(key, value)| key == "instance_id" && value == "42") + ); + } +} diff --git a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs new file mode 100644 index 0000000000..ab301e8486 --- /dev/null +++ b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs @@ -0,0 +1,999 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +use std::borrow::Cow; +use std::collections::HashSet; +use std::sync::Arc; +use std::time::Instant; + +use super::client::{typed_value_to_f64, typed_value_to_string}; +use super::proto::{self, PathElem}; +use super::subscriber::GnmiStreamMetrics; +use crate::sink::{CollectorEvent, DataSink, EventContext, SensorHealthData}; + +pub(crate) const NVUE_GNMI_SAMPLE_STREAM_ID: &str = "nvue_gnmi"; + +/// process NVUE gNMI SAMPLE notifications and emit them as `CollectorEvent::Metric` +pub(crate) struct GnmiSampleProcessor { + pub(crate) data_sink: Option>, + pub(crate) event_context: EventContext, + pub(crate) switch_id: String, +} + +impl GnmiSampleProcessor { + #[allow(deprecated)] + pub(crate) fn process_subscribe_response( + &self, + resp: &proto::SubscribeResponse, + stream_metrics: &GnmiStreamMetrics, + ) { + let notification = match &resp.response { + Some(proto::subscribe_response::Response::Update(n)) => n, + Some(proto::subscribe_response::Response::SyncResponse(_)) => return, + Some(proto::subscribe_response::Response::Error(e)) => { + stream_metrics.stream_errors_total.inc(); + tracing::warn!( + code = e.code, + message = %e.message, + "nvue_gnmi SAMPLE: server error in stream" + ); + return; + } + None => return, + }; + + stream_metrics.notifications_received_total.inc(); + stream_metrics + .last_notification_timestamp + .set(now_unix_secs()); + + let start = Instant::now(); + let entity_count = self.process_notification(notification); + stream_metrics + .notification_processing_seconds + .observe(start.elapsed().as_secs_f64()); + stream_metrics.monitored_entities.set(entity_count as f64); + } + + fn process_notification(&self, notification: &proto::Notification) -> usize { + let prefix_elems: &[PathElem] = notification + .prefix + .as_ref() + .map(|p| p.elem.as_slice()) + .unwrap_or_default(); + + let mut entities: HashSet<(&str, &str)> = HashSet::new(); + + for update in ¬ification.update { + let val = match update.val.as_ref() { + Some(v) => v, + 
None => continue, + }; + + let update_elems: &[PathElem] = update + .path + .as_ref() + .map(|p| p.elem.as_slice()) + .unwrap_or_default(); + + let combined: Vec<&PathElem> = prefix_elems.iter().chain(update_elems.iter()).collect(); + + if let Some(iface) = find_elem_key_ref(&combined, "interface", "name") { + entities.insert(("interface", iface)); + self.process_interface_metric(&combined, iface, val); + } else if let Some(comp) = find_elem_key_ref(&combined, "component", "name") { + entities.insert(("component", comp)); + self.process_component_metric(&combined, comp, val); + } else if let Some(sensor_id) = find_elem_key_ref(&combined, "leak-sensor", "id") + && leaf_matches(&combined, &["state", "state"]) + { + entities.insert(("sensor", sensor_id)); + self.process_leak_sensor_metric(val, sensor_id); + } + } + + entities.len() + } + + fn process_interface_metric( + &self, + elems: &[&PathElem], + iface_name: &str, + val: &proto::TypedValue, + ) { + if leaf_matches(elems, &["state", "oper-status"]) { + let v = oper_status_to_f64(typed_value_to_string(val).as_deref()); + self.emit_data_metric( + "interface_oper_status", + iface_name, + v, + "state", + "interface_name", + iface_name, + ); + } else if leaf_matches(elems, &["state", "counters", "in-errors"]) + && let Some(v) = typed_value_to_f64(val) + { + self.emit_data_metric( + "interface_in_errors", + iface_name, + v, + "count", + "interface_name", + iface_name, + ); + } else if leaf_matches(elems, &["state", "counters", "out-errors"]) + && let Some(v) = typed_value_to_f64(val) + { + self.emit_data_metric( + "interface_out_errors", + iface_name, + v, + "count", + "interface_name", + iface_name, + ); + } else if leaf_matches(elems, &["phy-diag", "state", "effective-ber"]) + && let Some(v) = typed_value_to_f64(val) + { + self.emit_data_metric( + "interface_effective_ber", + iface_name, + v, + "ratio", + "interface_name", + iface_name, + ); + } else if leaf_matches(elems, &["phy-diag", "state", "symbol-ber"]) + && 
let Some(v) = typed_value_to_f64(val) + { + self.emit_data_metric( + "interface_symbol_ber", + iface_name, + v, + "ratio", + "interface_name", + iface_name, + ); + } else if leaf_matches( + elems, + &["phy-diag", "state", "unintentional-link-down-events"], + ) && let Some(v) = typed_value_to_f64(val) + { + self.emit_data_metric( + "interface_link_down_events", + iface_name, + v, + "count", + "interface_name", + iface_name, + ); + } + } + + fn process_component_metric( + &self, + elems: &[&PathElem], + comp_name: &str, + val: &proto::TypedValue, + ) { + if leaf_matches(elems, &["healthz", "state", "status"]) { + let v = component_health_to_f64(typed_value_to_string(val).as_deref()); + self.emit_data_metric( + "component_health_status", + comp_name, + v, + "state", + "component_name", + comp_name, + ); + } else if leaf_matches(elems, &["state", "temperature", "instant"]) + && let Some(v) = typed_value_to_f64(val) + { + self.emit_data_metric( + "component_temperature_celsius", + comp_name, + v, + "celsius", + "component_name", + comp_name, + ); + } + } + + fn process_leak_sensor_metric(&self, val: &proto::TypedValue, sensor_id: &str) { + let v = leak_sensor_to_f64(typed_value_to_string(val).as_deref()); + self.emit_data_metric( + "leak_sensor_state", + sensor_id, + v, + "state", + "sensor_id", + sensor_id, + ); + } + + fn emit_data_metric( + &self, + metric_type: &str, + entity_id: &str, + value: f64, + unit: &str, + entity_label_name: &'static str, + entity_label_value: &str, + ) { + let Some(sink) = &self.data_sink else { return }; + + let mut key = String::with_capacity(metric_type.len() + 1 + entity_id.len()); + key.push_str(metric_type); + key.push(':'); + key.push_str(entity_id); + + // only the domain-specific entity label; endpoint identity (ip, mac, + // serial_number, collector_type) is added by PrometheusSink from EventContext + let labels = vec![( + Cow::Borrowed(entity_label_name), + entity_label_value.to_string(), + )]; + + sink.handle_event( + 
&self.event_context, + &CollectorEvent::Metric(Box::new(SensorHealthData { + key, + name: NVUE_GNMI_SAMPLE_STREAM_ID.to_string(), + metric_type: metric_type.to_string(), + unit: unit.to_string(), + value, + labels, + context: None, + })), + ); + } +} + +fn find_elem_key_ref<'a>( + elems: &[&'a PathElem], + elem_name: &str, + key_name: &str, +) -> Option<&'a str> { + elems + .iter() + .find(|e| e.name == elem_name) + .and_then(|e| e.key.get(key_name).map(String::as_str)) +} + +fn leaf_matches(elems: &[&PathElem], expected: &[&str]) -> bool { + if elems.len() < expected.len() { + return false; + } + let start = elems.len() - expected.len(); + elems[start..] + .iter() + .zip(expected) + .all(|(elem, name)| elem.name == *name) +} + +fn oper_status_to_f64(status: Option<&str>) -> f64 { + match status { + Some(s) if s.eq_ignore_ascii_case("up") => 1.0, + _ => 0.0, + } +} + +fn component_health_to_f64(status: Option<&str>) -> f64 { + match status { + Some(s) if s.eq_ignore_ascii_case("healthy") => 1.0, + Some(s) if s.eq_ignore_ascii_case("unhealthy") => 2.0, + _ => 0.0, + } +} + +// /platform-general/leak-sensors/leak-sensor[id=X]/state/state +// NVOS values from nvidia-platform-general-ext LeakSensors type: +// "OK" -> 0.0 (no leak) +// "LEAK" -> 1.0 (leak detected) +// "UNSET" -> 0.0 (default / unmapped internal value) +fn leak_sensor_to_f64(status: Option<&str>) -> f64 { + match status { + Some(s) if s.eq_ignore_ascii_case("LEAK") => 1.0, + _ => 0.0, + } +} + +pub(crate) fn now_unix_secs() -> f64 { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_secs_f64()) + .unwrap_or(0.0) +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::sync::{Arc, Mutex}; + + use carbide_uuid::rack::RackId; + use carbide_uuid::switch::{SwitchId, SwitchIdSource, SwitchType}; + + use super::*; + use crate::endpoint::{EndpointMetadata, SwitchData, SwitchEndpointRole}; + + #[derive(Default)] + struct CapturingSink { + events: Mutex>, + 
} + + impl DataSink for CapturingSink { + fn sink_type(&self) -> &'static str { + "capturing_sink" + } + + fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { + self.events + .lock() + .expect("lock poisoned") + .push((context.clone(), event.clone())); + } + } + + #[test] + fn test_leaf_matches() { + let elems: Vec = ["interfaces", "interface", "state", "oper-status"] + .iter() + .map(|n| PathElem { + name: n.to_string(), + key: Default::default(), + }) + .collect(); + let refs: Vec<&PathElem> = elems.iter().collect(); + + assert!(leaf_matches(&refs, &["state", "oper-status"])); + assert!(leaf_matches(&refs, &["oper-status"])); + assert!(!leaf_matches(&refs, &["counters", "oper-status"])); + assert!(!leaf_matches(&refs, &["a", "b", "c", "d", "e"])); + } + + #[test] + fn test_find_elem_key_ref() { + let mut key_map = HashMap::new(); + key_map.insert("name".to_string(), "nvl0".to_string()); + let elems = [ + PathElem { + name: "interfaces".to_string(), + key: Default::default(), + }, + PathElem { + name: "interface".to_string(), + key: key_map, + }, + ]; + let refs: Vec<&PathElem> = elems.iter().collect(); + + assert_eq!(find_elem_key_ref(&refs, "interface", "name"), Some("nvl0")); + assert_eq!(find_elem_key_ref(&refs, "interface", "id"), None); + assert_eq!(find_elem_key_ref(&refs, "component", "name"), None); + } + + #[test] + fn test_oper_status_mapping() { + assert_eq!(oper_status_to_f64(Some("UP")), 1.0); + assert_eq!(oper_status_to_f64(Some("up")), 1.0); + assert_eq!(oper_status_to_f64(Some("DOWN")), 0.0); + assert_eq!(oper_status_to_f64(None), 0.0); + } + + #[test] + fn test_component_health_mapping() { + assert_eq!(component_health_to_f64(Some("healthy")), 1.0); + assert_eq!(component_health_to_f64(Some("HEALTHY")), 1.0); + assert_eq!(component_health_to_f64(Some("unhealthy")), 2.0); + assert_eq!(component_health_to_f64(None), 0.0); + } + + #[test] + fn test_leak_sensor_mapping() { + assert_eq!(leak_sensor_to_f64(Some("OK")), 0.0); + 
assert_eq!(leak_sensor_to_f64(Some("ok")), 0.0); + assert_eq!(leak_sensor_to_f64(Some("LEAK")), 1.0); + assert_eq!(leak_sensor_to_f64(Some("leak")), 1.0); + assert_eq!(leak_sensor_to_f64(Some("Leak")), 1.0); + assert_eq!(leak_sensor_to_f64(Some("UNSET")), 0.0); + assert_eq!(leak_sensor_to_f64(None), 0.0); + } + + fn make_path_elem(name: &str, keys: &[(&str, &str)]) -> PathElem { + PathElem { + name: name.to_string(), + key: keys + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(), + } + } + + fn make_typed_value_string(s: &str) -> proto::TypedValue { + proto::TypedValue { + value: Some(proto::typed_value::Value::StringVal(s.to_string())), + } + } + + fn make_typed_value_uint(v: u64) -> proto::TypedValue { + proto::TypedValue { + value: Some(proto::typed_value::Value::UintVal(v)), + } + } + + fn test_processor() -> GnmiSampleProcessor { + use std::str::FromStr; + + use mac_address::MacAddress; + + use crate::endpoint::BmcAddr; + + let addr = BmcAddr { + ip: "10.0.0.1".parse().unwrap(), + port: None, + mac: MacAddress::from_str("AA:BB:CC:DD:EE:FF").unwrap(), + }; + let event_context = EventContext { + endpoint_key: "aa:bb:cc:dd:ee:ff".to_string(), + addr, + collector_type: NVUE_GNMI_SAMPLE_STREAM_ID, + metadata: None, + rack_id: None, + }; + GnmiSampleProcessor { + data_sink: None, + event_context, + switch_id: "serial-abc".to_string(), + } + } + + fn test_switch_id(label: &str) -> SwitchId { + let mut hash = [0u8; 32]; + let bytes = label.as_bytes(); + hash[..bytes.len().min(32)].copy_from_slice(&bytes[..bytes.len().min(32)]); + SwitchId::new(SwitchIdSource::Tpm, hash, SwitchType::NvLink) + } + + #[test] + fn test_process_notification_interface_oper_status() { + let proc = test_processor(); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "nvl4")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + 
path: Some(proto::Path { + elem: vec![ + make_path_elem("state", &[]), + make_path_elem("oper-status", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("UP")), + ..Default::default() + }], + ..Default::default() + }; + + let count = proc.process_notification(¬ification); + assert_eq!(count, 1); + } + + #[test] + fn emitted_metrics_preserve_switch_position_context() { + use std::str::FromStr; + + use mac_address::MacAddress; + + use crate::endpoint::BmcAddr; + + let sink = Arc::new(CapturingSink::default()); + let switch_id = test_switch_id("switch-a"); + let proc = GnmiSampleProcessor { + data_sink: Some(sink.clone()), + event_context: EventContext { + endpoint_key: "aa:bb:cc:dd:ee:ff".to_string(), + addr: BmcAddr { + ip: "10.0.0.1".parse().unwrap(), + port: None, + mac: MacAddress::from_str("AA:BB:CC:DD:EE:FF").unwrap(), + }, + collector_type: NVUE_GNMI_SAMPLE_STREAM_ID, + metadata: Some(EndpointMetadata::Switch(SwitchData { + id: Some(switch_id), + serial: "SN-SWITCH-001".to_string(), + slot_number: Some(7), + tray_index: Some(3), + endpoint_role: SwitchEndpointRole::Host, + is_primary: false, + nmxt_enabled: false, + })), + rack_id: Some(RackId::new("RACK_2")), + }, + switch_id: "SN-SWITCH-001".to_string(), + }; + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "nvl4")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("state", &[]), + make_path_elem("oper-status", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("UP")), + ..Default::default() + }], + ..Default::default() + }; + + let count = proc.process_notification(¬ification); + assert_eq!(count, 1); + + let events = sink.events.lock().expect("lock poisoned"); + assert_eq!(events.len(), 1); + let (context, event) = &events[0]; + 
assert_eq!(context.switch_id(), Some(switch_id)); + assert_eq!(context.switch_slot_number(), Some(7)); + assert_eq!(context.switch_tray_index(), Some(3)); + assert_eq!(context.rack_id().map(RackId::as_str), Some("RACK_2")); + assert!(matches!(event, CollectorEvent::Metric(_))); + } + + #[test] + fn test_process_notification_component_temperature() { + let proc = test_processor(); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("components", &[]), + make_path_elem("component", &[("name", "PSU-1")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("state", &[]), + make_path_elem("temperature", &[]), + make_path_elem("instant", &[]), + ], + ..Default::default() + }), + val: Some(proto::TypedValue { + value: Some(proto::typed_value::Value::DoubleVal(42.5)), + }), + ..Default::default() + }], + ..Default::default() + }; + + let count = proc.process_notification(¬ification); + assert_eq!(count, 1); + } + + #[test] + fn test_process_notification_multiple_updates() { + let proc = test_processor(); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "nvl0")]), + ], + ..Default::default() + }), + update: vec![ + proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("state", &[]), + make_path_elem("oper-status", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("UP")), + ..Default::default() + }, + proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("state", &[]), + make_path_elem("counters", &[]), + make_path_elem("in-errors", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_uint(42)), + ..Default::default() + }, + ], + ..Default::default() + }; + + // same interface, so entity count is 1 even with multiple updates + 
let count = proc.process_notification(¬ification); + assert_eq!(count, 1); + } + + #[test] + fn test_process_notification_mixed_entities() { + let proc = test_processor(); + + let iface_update = proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "nvl0")]), + make_path_elem("state", &[]), + make_path_elem("oper-status", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("DOWN")), + ..Default::default() + }; + + let comp_update = proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("components", &[]), + make_path_elem("component", &[("name", "FAN-1")]), + make_path_elem("healthz", &[]), + make_path_elem("state", &[]), + make_path_elem("status", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("healthy")), + ..Default::default() + }; + + let notification = proto::Notification { + timestamp: 0, + prefix: None, + update: vec![iface_update, comp_update], + ..Default::default() + }; + + let count = proc.process_notification(¬ification); + assert_eq!(count, 2); + } + + #[test] + fn test_process_notification_leak_sensor() { + let proc = test_processor(); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("platform-general", &[]), + make_path_elem("leak-sensors", &[]), + make_path_elem("leak-sensor", &[("id", "1")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![make_path_elem("state", &[]), make_path_elem("state", &[])], + ..Default::default() + }), + val: Some(make_typed_value_string("LEAK")), + ..Default::default() + }], + ..Default::default() + }; + + let count = proc.process_notification(¬ification); + assert_eq!(count, 1); + } + + #[test] + fn test_process_notification_leak_sensor_ok() { + let proc = test_processor(); + let notification = proto::Notification { + timestamp: 0, + prefix: 
Some(proto::Path { + elem: vec![ + make_path_elem("platform-general", &[]), + make_path_elem("leak-sensors", &[]), + make_path_elem("leak-sensor", &[("id", "2")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![make_path_elem("state", &[]), make_path_elem("state", &[])], + ..Default::default() + }), + val: Some(make_typed_value_string("OK")), + ..Default::default() + }], + ..Default::default() + }; + + let count = proc.process_notification(¬ification); + assert_eq!(count, 1); + } + + #[test] + fn test_process_notification_update_without_val_is_skipped() { + let proc = test_processor(); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "nvl0")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("state", &[]), + make_path_elem("oper-status", &[]), + ], + ..Default::default() + }), + val: None, + ..Default::default() + }], + ..Default::default() + }; + + let count = proc.process_notification(¬ification); + assert_eq!(count, 0); + } + + #[test] + fn test_process_notification_effective_ber() { + let proc = test_processor(); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "nvl1")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("phy-diag", &[]), + make_path_elem("state", &[]), + make_path_elem("effective-ber", &[]), + ], + ..Default::default() + }), + val: Some(proto::TypedValue { + value: Some(proto::typed_value::Value::DoubleVal(1.5e-12)), + }), + ..Default::default() + }], + ..Default::default() + }; + + let count = proc.process_notification(¬ification); + assert_eq!(count, 1); + } + + #[test] + fn 
test_process_notification_symbol_ber_and_link_down_events() { + let proc = test_processor(); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "nvl2")]), + ], + ..Default::default() + }), + update: vec![ + proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("phy-diag", &[]), + make_path_elem("state", &[]), + make_path_elem("symbol-ber", &[]), + ], + ..Default::default() + }), + val: Some(proto::TypedValue { + value: Some(proto::typed_value::Value::DoubleVal(3.2e-10)), + }), + ..Default::default() + }, + proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("phy-diag", &[]), + make_path_elem("state", &[]), + make_path_elem("unintentional-link-down-events", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_uint(7)), + ..Default::default() + }, + ], + ..Default::default() + }; + + let count = proc.process_notification(¬ification); + assert_eq!(count, 1); + } + + #[test] + fn test_process_notification_out_errors() { + let proc = test_processor(); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "nvl3")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("state", &[]), + make_path_elem("counters", &[]), + make_path_elem("out-errors", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_uint(99)), + ..Default::default() + }], + ..Default::default() + }; + + let count = proc.process_notification(¬ification); + assert_eq!(count, 1); + } + + fn test_stream_metrics() -> super::super::subscriber::GnmiStreamMetrics { + use prometheus::{Counter, Gauge, Histogram, HistogramOpts, IntGauge}; + super::super::subscriber::GnmiStreamMetrics { + connection_state: 
IntGauge::new("test_conn_state", "test").unwrap(), + connected: IntGauge::new("test_connected", "test").unwrap(), + reconnections_total: Counter::new("test_reconn", "test").unwrap(), + server_initiated_closures_total: Counter::new("test_closures", "test").unwrap(), + connection_established_timestamp: Gauge::new("test_conn_ts", "test").unwrap(), + notifications_received_total: Counter::new("test_notif_total", "test").unwrap(), + last_notification_timestamp: Gauge::new("test_last_notif_ts", "test").unwrap(), + notification_processing_seconds: Histogram::with_opts(HistogramOpts::new( + "test_proc_secs", + "test", + )) + .unwrap(), + stream_errors_total: Counter::new("test_errors", "test").unwrap(), + monitored_entities: Gauge::new("test_entities", "test").unwrap(), + } + } + + #[test] + fn test_process_subscribe_response_sync_response_is_noop() { + let proc = test_processor(); + let metrics = test_stream_metrics(); + let resp = proto::SubscribeResponse { + response: Some(proto::subscribe_response::Response::SyncResponse(true)), + ..Default::default() + }; + + proc.process_subscribe_response(&resp, &metrics); + + assert_eq!(metrics.notifications_received_total.get(), 0.0); + assert_eq!(metrics.stream_errors_total.get(), 0.0); + } + + #[test] + #[allow(deprecated)] + fn test_process_subscribe_response_error_increments_counter() { + let proc = test_processor(); + let metrics = test_stream_metrics(); + let resp = proto::SubscribeResponse { + response: Some(proto::subscribe_response::Response::Error(proto::Error { + code: 13, + message: "internal server error".into(), + ..Default::default() + })), + ..Default::default() + }; + + proc.process_subscribe_response(&resp, &metrics); + + assert_eq!(metrics.stream_errors_total.get(), 1.0); + assert_eq!(metrics.notifications_received_total.get(), 0.0); + } + + #[test] + fn test_process_subscribe_response_none_is_noop() { + let proc = test_processor(); + let metrics = test_stream_metrics(); + let resp = proto::SubscribeResponse { + 
response: None, + ..Default::default() + }; + + proc.process_subscribe_response(&resp, &metrics); + + assert_eq!(metrics.notifications_received_total.get(), 0.0); + assert_eq!(metrics.stream_errors_total.get(), 0.0); + } + + #[test] + fn test_process_subscribe_response_update_increments_notification_counter() { + let proc = test_processor(); + let metrics = test_stream_metrics(); + let resp = proto::SubscribeResponse { + response: Some(proto::subscribe_response::Response::Update( + proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "nvl0")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("state", &[]), + make_path_elem("oper-status", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("UP")), + ..Default::default() + }], + ..Default::default() + }, + )), + ..Default::default() + }; + + proc.process_subscribe_response(&resp, &metrics); + + assert_eq!(metrics.notifications_received_total.get(), 1.0); + assert_eq!(metrics.monitored_entities.get(), 1.0); + assert_eq!(metrics.stream_errors_total.get(), 0.0); + } +} diff --git a/crates/health/src/collectors/nvue/gnmi/subscriber.rs b/crates/health/src/collectors/nvue/gnmi/subscriber.rs new file mode 100644 index 0000000000..37843e6d9d --- /dev/null +++ b/crates/health/src/collectors/nvue/gnmi/subscriber.rs @@ -0,0 +1,576 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use std::collections::HashMap; +use std::sync::Arc; +use std::time::Duration; + +use prometheus::{Counter, Gauge, Histogram, HistogramOpts, IntGauge, Opts}; +use tokio_util::sync::CancellationToken; + +use super::client::{ + GnmiClient, nvue_subscribe_paths, system_events_prefix, system_events_subscribe_path, +}; +use super::on_change_processor::{ + GnmiOnChangeProcessor, ON_CHANGE_STREAM_ID_SYSTEM_EVENTS, OnChangeStreamMetrics, +}; +use super::proto; +use super::sample_processor::{GnmiSampleProcessor, NVUE_GNMI_SAMPLE_STREAM_ID, now_unix_secs}; +use crate::HealthError; +use crate::collectors::Collector; +use crate::collectors::runtime::{BackoffConfig, ExponentialBackoff, StreamingConnectionGuard}; +use crate::config::NvueGnmiConfig; +use crate::endpoint::BmcEndpoint; +use crate::metrics::CollectorRegistry; +use crate::sink::{CollectorEvent, DataSink, EventContext}; + +// gRPC ConnectivityState values for `connection_state`. 0 (UNKNOWN) is the gauge default. +const IDLE: i64 = 1; +const CONNECTING: i64 = 2; +const READY: i64 = 3; +const TRANSIENT_FAILURE: i64 = 4; +const SHUTDOWN: i64 = 5; + +pub(crate) struct GnmiStreamMetrics { + pub(crate) connection_state: IntGauge, + /// binary "is this stream live right now?" 
-- guard-managed, mirrors SSE's `connected` gauge + pub(crate) connected: IntGauge, + pub(crate) reconnections_total: Counter, + pub(crate) server_initiated_closures_total: Counter, + pub(crate) connection_established_timestamp: Gauge, + pub(crate) notifications_received_total: Counter, + pub(crate) last_notification_timestamp: Gauge, + pub(crate) notification_processing_seconds: Histogram, + pub(crate) stream_errors_total: Counter, + pub(crate) monitored_entities: Gauge, +} + +impl GnmiStreamMetrics { + fn new( + registry: &prometheus::Registry, + prefix: &str, + stream_name: &str, + const_labels: HashMap, + ) -> Result { + let connection_state = IntGauge::with_opts( + Opts::new( + format!("{prefix}_nvue_gnmi{stream_name}_connection_state"), + "gRPC connection state: 0=UNKNOWN, 1=IDLE, 2=CONNECTING, 3=READY, 4=TRANSIENT_FAILURE, 5=SHUTDOWN", + ) + .const_labels(const_labels.clone()), + )?; + registry.register(Box::new(connection_state.clone()))?; + + let connected = IntGauge::with_opts( + Opts::new( + format!("{prefix}_nvue_gnmi{stream_name}_stream_connected"), + "1 while the stream is connected (READY), 0 otherwise. 
Mirrors the SSE collector's stream_connected gauge for aggregate streaming dashboards.", + ) + .const_labels(const_labels.clone()), + )?; + registry.register(Box::new(connected.clone()))?; + + let reconnections_total = Counter::with_opts( + Opts::new( + format!("{prefix}_nvue_gnmi{stream_name}_reconnections_total"), + "Total reconnection attempts", + ) + .const_labels(const_labels.clone()), + )?; + registry.register(Box::new(reconnections_total.clone()))?; + + let server_initiated_closures_total = Counter::with_opts( + Opts::new( + format!("{prefix}_nvue_gnmi{stream_name}_server_initiated_closures_total"), + "Total times the server closed the stream cleanly", + ) + .const_labels(const_labels.clone()), + )?; + registry.register(Box::new(server_initiated_closures_total.clone()))?; + + let connection_established_timestamp = Gauge::with_opts( + Opts::new( + format!("{prefix}_nvue_gnmi{stream_name}_connection_established_timestamp"), + "Unix timestamp when current connection was established. 
Compute uptime via time() - this_metric.", + ) + .const_labels(const_labels.clone()), + )?; + registry.register(Box::new(connection_established_timestamp.clone()))?; + + let notifications_received_total = Counter::with_opts( + Opts::new( + format!("{prefix}_nvue_gnmi{stream_name}_notifications_received_total"), + "Total notification messages received", + ) + .const_labels(const_labels.clone()), + )?; + registry.register(Box::new(notifications_received_total.clone()))?; + + let last_notification_timestamp = Gauge::with_opts( + Opts::new( + format!("{prefix}_nvue_gnmi{stream_name}_last_notification_timestamp"), + "Unix timestamp of most recent notification", + ) + .const_labels(const_labels.clone()), + )?; + registry.register(Box::new(last_notification_timestamp.clone()))?; + + let notification_processing_seconds = Histogram::with_opts( + HistogramOpts::new( + format!("{prefix}_nvue_gnmi{stream_name}_notification_processing_seconds"), + "Per-notification processing time", + ) + .const_labels(const_labels.clone()) + .buckets(vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0]), + )?; + registry.register(Box::new(notification_processing_seconds.clone()))?; + + let stream_errors_total = Counter::with_opts( + Opts::new( + format!("{prefix}_nvue_gnmi{stream_name}_stream_errors_total"), + "Total stream errors", + ) + .const_labels(const_labels.clone()), + )?; + registry.register(Box::new(stream_errors_total.clone()))?; + + let monitored_entities = Gauge::with_opts( + Opts::new( + format!("{prefix}_nvue_gnmi{stream_name}_monitored_entities"), + "Unique entities in most recent notification batch", + ) + .const_labels(const_labels), + )?; + registry.register(Box::new(monitored_entities.clone()))?; + + Ok(Self { + connection_state, + connected, + reconnections_total, + server_initiated_closures_total, + connection_established_timestamp, + notifications_received_total, + last_notification_timestamp, + notification_processing_seconds, + stream_errors_total, + monitored_entities, + }) 
+ } +} + +struct GnmiStreamConfig { + client: GnmiClient, + paths: Vec, + sample_interval_nanos: u64, +} + +pub fn spawn_gnmi_collector( + endpoint: &BmcEndpoint, + gnmi_config: &NvueGnmiConfig, + collector_registry: Arc, + data_sink: Option>, +) -> Result { + let switch_id = endpoint + .metadata + .as_ref() + .and_then(|m| m.serial_number().map(str::to_string)) + .unwrap_or_else(|| endpoint.addr.mac.to_string()); + let switch_ip = endpoint.addr.ip.to_string(); + let sample_event_context = EventContext::from_endpoint(endpoint, NVUE_GNMI_SAMPLE_STREAM_ID); + + let (username, password) = match endpoint.credentials() { + crate::endpoint::BmcCredentials::UsernamePassword { username, password } => { + (Some(username), password) + } + crate::endpoint::BmcCredentials::SessionToken { .. } => { + return Err(HealthError::GnmiError( + "gNMI collector does not support SessionToken credentials; expected UsernamePassword" + .into(), + )); + } + }; + let client = GnmiClient::new( + switch_id.clone(), + &switch_ip, + gnmi_config.gnmi_port, + username, + password, + gnmi_config.request_timeout, + ); + + let registry = collector_registry.registry(); + let prefix = collector_registry.prefix().clone(); + let collector_removed_sample_context = sample_event_context.clone(); + let mut collector_removed_on_change_context = None; + + let sample_const_labels = HashMap::from([ + ( + "collector_type".to_string(), + NVUE_GNMI_SAMPLE_STREAM_ID.to_string(), + ), + ("endpoint_key".to_string(), endpoint.hash_key().into_owned()), + ]); + + let sample_stream_metrics = GnmiStreamMetrics::new(registry, &prefix, "", sample_const_labels)?; + + let sample_config = GnmiStreamConfig { + client: client.clone(), + paths: nvue_subscribe_paths(&gnmi_config.paths), + sample_interval_nanos: gnmi_config.sample_interval.as_nanos() as u64, + }; + + let sample_processor = GnmiSampleProcessor { + data_sink: data_sink.clone(), + event_context: sample_event_context, + switch_id: switch_id.clone(), + }; + + let 
on_change_state = if gnmi_config.system_events_enabled { + let on_change_const_labels = HashMap::from([ + ( + "collector_type".to_string(), + ON_CHANGE_STREAM_ID_SYSTEM_EVENTS.to_string(), + ), + ("endpoint_key".to_string(), endpoint.hash_key().into_owned()), + ]); + + let on_change_stream_metrics = + GnmiStreamMetrics::new(registry, &prefix, "_events", on_change_const_labels.clone())?; + let on_change_row_metrics = OnChangeStreamMetrics::new( + registry, + &prefix, + ON_CHANGE_STREAM_ID_SYSTEM_EVENTS, + on_change_const_labels, + )?; + let on_change_event_context = + EventContext::from_endpoint(endpoint, ON_CHANGE_STREAM_ID_SYSTEM_EVENTS); + collector_removed_on_change_context = Some(on_change_event_context.clone()); + let on_change_processor = GnmiOnChangeProcessor::new( + ON_CHANGE_STREAM_ID_SYSTEM_EVENTS.to_string(), + on_change_row_metrics, + data_sink.clone(), + on_change_event_context, + switch_id, + ); + + Some((client, on_change_stream_metrics, on_change_processor)) + } else { + None + }; + let collector_removed_data_sink = data_sink; + + Ok(Collector::spawn_task(move |cancel_token| async move { + let sample_handle = tokio::spawn(gnmi_sample_task( + cancel_token.clone(), + sample_config, + sample_stream_metrics, + sample_processor, + )); + + let on_change_handle = + on_change_state.map(|(client, stream_metrics, on_change_processor)| { + tokio::spawn(gnmi_on_change_task( + cancel_token, + client, + stream_metrics, + on_change_processor, + )) + }); + + let _ = sample_handle.await; + if let Some(handle) = on_change_handle { + let _ = handle.await; + } + + if let Some(data_sink) = collector_removed_data_sink.as_deref() { + data_sink.handle_event( + &collector_removed_sample_context, + &CollectorEvent::CollectorRemoved, + ); + + if let Some(event_context) = &collector_removed_on_change_context { + data_sink.handle_event(event_context, &CollectorEvent::CollectorRemoved); + } + } + })) +} + +async fn gnmi_sample_task( + cancel_token: CancellationToken, + config: 
GnmiStreamConfig, + stream_metrics: GnmiStreamMetrics, + sample_processor: GnmiSampleProcessor, +) { + let mut backoff = ExponentialBackoff::new(&BackoffConfig { + initial: Duration::from_secs(2), + max: Duration::from_secs(60), + }); + + loop { + stream_metrics.connection_state.set(CONNECTING); + + let Some(stream) = cancel_token + .run_until_cancelled( + config + .client + .subscribe_sample(&config.paths, config.sample_interval_nanos), + ) + .await + else { + stream_metrics.connection_state.set(SHUTDOWN); + return; + }; + + match stream { + Err(e) => { + stream_metrics.connection_state.set(TRANSIENT_FAILURE); + stream_metrics.reconnections_total.inc(); + tracing::warn!( + error = ?e, + switch_id = %sample_processor.switch_id, + "nvue_gnmi SAMPLE: connection failed, backing off" + ); + } + Ok(mut stream) => { + stream_metrics.connection_state.set(READY); + stream_metrics + .connection_established_timestamp + .set(now_unix_secs()); + let _conn_guard = StreamingConnectionGuard::inc(stream_metrics.connected.clone()); + backoff.reset(); + tracing::info!( + switch_id = %sample_processor.switch_id, + "nvue_gnmi SAMPLE: stream connected" + ); + + loop { + let Some(msg) = cancel_token.run_until_cancelled(stream.message()).await else { + stream_metrics.connection_state.set(SHUTDOWN); + tracing::info!( + switch_id = %sample_processor.switch_id, + "nvue_gnmi SAMPLE: cancelled, shutting down" + ); + return; + }; + + match msg { + Ok(Some(resp)) => { + sample_processor.process_subscribe_response(&resp, &stream_metrics); + } + Ok(None) => { + stream_metrics.connection_state.set(IDLE); + stream_metrics.server_initiated_closures_total.inc(); + tracing::info!( + switch_id = %sample_processor.switch_id, + "nvue_gnmi SAMPLE: stream closed by server, reconnecting" + ); + backoff.reset(); + break; + } + Err(e) => { + stream_metrics.connection_state.set(TRANSIENT_FAILURE); + stream_metrics.stream_errors_total.inc(); + stream_metrics.reconnections_total.inc(); + tracing::warn!( + error 
= ?e, + switch_id = %sample_processor.switch_id, + "nvue_gnmi SAMPLE: stream error, reconnecting" + ); + break; + } + } + } + } + } + + if cancel_token + .run_until_cancelled(tokio::time::sleep(backoff.next_delay())) + .await + .is_none() + { + stream_metrics.connection_state.set(SHUTDOWN); + return; + } + } +} + +async fn gnmi_on_change_task( + cancel_token: CancellationToken, + client: GnmiClient, + stream_metrics: GnmiStreamMetrics, + on_change_processor: GnmiOnChangeProcessor, +) { + let mut backoff = ExponentialBackoff::new(&BackoffConfig { + initial: Duration::from_secs(2), + max: Duration::from_secs(60), + }); + let prefix = system_events_prefix(); + let paths = system_events_subscribe_path(); + + loop { + stream_metrics.connection_state.set(CONNECTING); + + let Some(stream) = cancel_token + .run_until_cancelled(client.subscribe_on_change(&prefix, &paths)) + .await + else { + stream_metrics.connection_state.set(SHUTDOWN); + return; + }; + + match stream { + Err(e) => { + stream_metrics.connection_state.set(TRANSIENT_FAILURE); + stream_metrics.reconnections_total.inc(); + tracing::warn!( + error = ?e, + switch_id = %on_change_processor.switch_id, + stream = %on_change_processor.collector_name, + "nvue_gnmi ON_CHANGE: connection failed, backing off" + ); + } + Ok(mut stream) => { + stream_metrics.connection_state.set(READY); + stream_metrics + .connection_established_timestamp + .set(now_unix_secs()); + let _conn_guard = StreamingConnectionGuard::inc(stream_metrics.connected.clone()); + backoff.reset(); + tracing::info!( + switch_id = %on_change_processor.switch_id, + stream = %on_change_processor.collector_name, + "nvue_gnmi ON_CHANGE: stream connected" + ); + + loop { + let Some(msg) = cancel_token.run_until_cancelled(stream.message()).await else { + stream_metrics.connection_state.set(SHUTDOWN); + tracing::info!( + switch_id = %on_change_processor.switch_id, + stream = %on_change_processor.collector_name, + "nvue_gnmi ON_CHANGE: cancelled, shutting down" + 
); + return; + }; + + match msg { + Ok(Some(resp)) => { + on_change_processor.process_subscribe_response(&resp, &stream_metrics); + } + Ok(None) => { + stream_metrics.connection_state.set(IDLE); + stream_metrics.server_initiated_closures_total.inc(); + tracing::info!( + switch_id = %on_change_processor.switch_id, + stream = %on_change_processor.collector_name, + "nvue_gnmi ON_CHANGE: stream closed by server, reconnecting" + ); + backoff.reset(); + break; + } + Err(e) => { + stream_metrics.connection_state.set(TRANSIENT_FAILURE); + stream_metrics.stream_errors_total.inc(); + stream_metrics.reconnections_total.inc(); + tracing::warn!( + error = ?e, + switch_id = %on_change_processor.switch_id, + stream = %on_change_processor.collector_name, + "nvue_gnmi ON_CHANGE: stream error, reconnecting" + ); + break; + } + } + } + } + } + + if cancel_token + .run_until_cancelled(tokio::time::sleep(backoff.next_delay())) + .await + .is_none() + { + stream_metrics.connection_state.set(SHUTDOWN); + return; + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn test_labels() -> HashMap { + HashMap::from([ + ("switch_id".to_string(), "test-switch".to_string()), + ("switch_ip".to_string(), "10.0.0.1".to_string()), + ]) + } + + #[test] + fn test_stream_metrics_registers_all_counters() { + let registry = prometheus::Registry::new(); + let metrics = GnmiStreamMetrics::new(®istry, "test", "", test_labels()).unwrap(); + + metrics.reconnections_total.inc(); + assert_eq!(metrics.reconnections_total.get(), 1.0); + + metrics.server_initiated_closures_total.inc(); + assert_eq!(metrics.server_initiated_closures_total.get(), 1.0); + + metrics.stream_errors_total.inc(); + assert_eq!(metrics.stream_errors_total.get(), 1.0); + } + + #[test] + fn test_stream_metrics_server_closures_independent_from_reconnections() { + let registry = prometheus::Registry::new(); + let metrics = GnmiStreamMetrics::new(®istry, "test", "", test_labels()).unwrap(); + + 
metrics.server_initiated_closures_total.inc(); + metrics.server_initiated_closures_total.inc(); + assert_eq!(metrics.server_initiated_closures_total.get(), 2.0); + assert_eq!(metrics.reconnections_total.get(), 0.0); + + metrics.reconnections_total.inc(); + assert_eq!(metrics.reconnections_total.get(), 1.0); + assert_eq!(metrics.server_initiated_closures_total.get(), 2.0); + } + + #[test] + fn test_stream_metrics_duplicate_registration_fails() { + let registry = prometheus::Registry::new(); + let _ = GnmiStreamMetrics::new(®istry, "test", "", test_labels()).unwrap(); + let result = GnmiStreamMetrics::new(®istry, "test", "", test_labels()); + assert!(result.is_err()); + } + + #[test] + fn test_stream_metrics_distinct_stream_names_coexist() { + let registry = prometheus::Registry::new(); + let sample = GnmiStreamMetrics::new(®istry, "test", "", test_labels()).unwrap(); + let events_labels = HashMap::from([ + ("switch_id".to_string(), "test-switch".to_string()), + ("switch_ip".to_string(), "10.0.0.2".to_string()), + ]); + let events = GnmiStreamMetrics::new(®istry, "test", "_events", events_labels).unwrap(); + + sample.server_initiated_closures_total.inc(); + assert_eq!(sample.server_initiated_closures_total.get(), 1.0); + assert_eq!(events.server_initiated_closures_total.get(), 0.0); + } +} diff --git a/crates/health/src/collectors/nvue/mod.rs b/crates/health/src/collectors/nvue/mod.rs index 592d1df205..f58a2999fe 100644 --- a/crates/health/src/collectors/nvue/mod.rs +++ b/crates/health/src/collectors/nvue/mod.rs @@ -15,4 +15,6 @@ * limitations under the License. 
*/ +pub(crate) mod gnmi; pub(in crate::collectors) mod rest; +pub(crate) mod tls; diff --git a/crates/health/src/collectors/nvue/rest/client.rs b/crates/health/src/collectors/nvue/rest/client.rs index 5076f5abe0..b2cc7c82f5 100644 --- a/crates/health/src/collectors/nvue/rest/client.rs +++ b/crates/health/src/collectors/nvue/rest/client.rs @@ -21,6 +21,7 @@ use std::time::Duration; use reqwest::Client; use reqwest::header::ACCEPT; use serde::Deserialize; +use serde::de::Error as _; use url::Url; use crate::HealthError; @@ -228,11 +229,37 @@ pub struct ClusterApp { pub type SdnPartitionsResponse = HashMap; +fn deserialize_optional_u32_from_number_or_string<'de, D>( + deserializer: D, +) -> Result, D::Error> +where + D: serde::Deserializer<'de>, +{ + #[derive(Deserialize)] + #[serde(untagged)] + enum U32OrString { + Number(u32), + String(String), + } + + match Option::::deserialize(deserializer)? { + Some(U32OrString::Number(value)) => Ok(Some(value)), + Some(U32OrString::String(value)) => value.parse::().map(Some).map_err(|error| { + D::Error::custom(format!("invalid numeric string for num-gpus: {error}")) + }), + None => Ok(None), + } +} + #[derive(Debug, Clone, Deserialize, Default)] pub struct SdnPartition { pub name: Option, pub health: Option, - #[serde(rename = "num-gpus")] + #[serde( + default, + rename = "num-gpus", + deserialize_with = "deserialize_optional_u32_from_number_or_string" + )] pub num_gpus: Option, } @@ -354,6 +381,23 @@ mod tests { assert_eq!(resp.num_gpus, Some(8)); } + #[test] + fn test_parse_sdn_partition_string_num_gpus() { + let json = r#"{ + "name": "Default Partition", + "num-gpus": "8", + "health": "unhealthy", + "resiliency-mode": "adaptive_bandwidth", + "mcast-limit": 1024, + "partition-type": "gpuuid_based" + }"#; + + let resp: SdnPartition = serde_json::from_str(json).unwrap(); + assert_eq!(resp.name.as_deref(), Some("Default Partition")); + assert_eq!(resp.health.as_deref(), Some("unhealthy")); + assert_eq!(resp.num_gpus, Some(8)); 
+ } + #[test] fn test_parse_sdn_partitions_map() { let json = r#"{ diff --git a/crates/health/src/collectors/nvue/tls.rs b/crates/health/src/collectors/nvue/tls.rs new file mode 100644 index 0000000000..a715e644c0 --- /dev/null +++ b/crates/health/src/collectors/nvue/tls.rs @@ -0,0 +1,74 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use std::sync::Arc; + +use rustls::client::danger::{HandshakeSignatureValid, ServerCertVerified, ServerCertVerifier}; +use rustls::pki_types::{CertificateDer, ServerName, UnixTime}; +use rustls::{ClientConfig, DigitallySignedStruct, SignatureScheme}; + +// ! dangerous cert verifier that accepts any server certificate without validation. +// ! only enable in test environments where you cannot replace NVOS self-signed certificates. 
+#[derive(Debug)] +struct AcceptAnyCertVerifier; + +impl ServerCertVerifier for AcceptAnyCertVerifier { + fn verify_server_cert( + &self, + _end_entity: &CertificateDer<'_>, + _intermediates: &[CertificateDer<'_>], + _server_name: &ServerName, + _ocsp_response: &[u8], + _now: UnixTime, + ) -> Result { + Ok(ServerCertVerified::assertion()) + } + + fn verify_tls12_signature( + &self, + _message: &[u8], + _cert: &CertificateDer<'_>, + _dss: &DigitallySignedStruct, + ) -> Result { + Ok(HandshakeSignatureValid::assertion()) + } + + fn verify_tls13_signature( + &self, + _message: &[u8], + _cert: &CertificateDer<'_>, + _dss: &DigitallySignedStruct, + ) -> Result { + Ok(HandshakeSignatureValid::assertion()) + } + + fn supported_verify_schemes(&self) -> Vec { + rustls::crypto::aws_lc_rs::default_provider() + .signature_verification_algorithms + .supported_schemes() + } +} + +/// build a rustls ClientConfig that dangerously skips server certificate verification. +pub fn self_signed_tls_config() -> ClientConfig { + ClientConfig::builder_with_provider(Arc::new(rustls::crypto::aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .expect("default protocol versions are valid") + .dangerous() + .with_custom_certificate_verifier(Arc::new(AcceptAnyCertVerifier)) + .with_no_client_auth() +} diff --git a/crates/health/src/collectors/runtime.rs b/crates/health/src/collectors/runtime.rs index 94203d659a..849dd02522 100644 --- a/crates/health/src/collectors/runtime.rs +++ b/crates/health/src/collectors/runtime.rs @@ -235,18 +235,18 @@ impl StreamMetrics { } } -/// RAII guard: increments `active_sse_connections` on construction, decrements on drop. +/// RAII guard: increments the passed IntGauge on construction, decrements on drop. /// Ensures every exit path from a connected stream (cancel, error, end, reconnect) dec's. 
-struct SseConnectionGuard(IntGauge); +pub(crate) struct StreamingConnectionGuard(IntGauge); -impl SseConnectionGuard { - fn inc(gauge: IntGauge) -> Self { +impl StreamingConnectionGuard { + pub(crate) fn inc(gauge: IntGauge) -> Self { gauge.inc(); Self(gauge) } } -impl Drop for SseConnectionGuard { +impl Drop for StreamingConnectionGuard { fn drop(&mut self) { self.0.dec(); } @@ -516,7 +516,7 @@ impl Collector { Ok(mut stream) => { // the guard lives exactly as long as we hold an open stream; Drop // handles dec for every exit path (shutdown, error, stream end). - let _conn_guard = SseConnectionGuard::inc(metrics.connected.clone()); + let _conn_guard = StreamingConnectionGuard::inc(metrics.connected.clone()); backoff.reset(); tracing::info!( collector_type, @@ -581,6 +581,23 @@ impl Collector { }) } + /// spawn helper for streaming collectors that don't fit `StreamingCollector` + /// (e.g. gNMI bidi subscribe with in-loop multiplexing). The closure gets a + /// CancellationToken and should return once it's cancelled. 
+ pub fn spawn_task(task_fn: F) -> Self + where + F: FnOnce(CancellationToken) -> Fut + Send + 'static, + Fut: std::future::Future + Send + 'static, + { + let cancel_token = CancellationToken::new(); + let cancel_clone = cancel_token.clone(); + let handle = tokio::spawn(task_fn(cancel_clone)); + Self { + handle, + cancel_token, + } + } + pub async fn stop(self) { self.cancel_token.cancel(); let _ = self.handle.await; diff --git a/crates/health/src/config.rs b/crates/health/src/config.rs index 894998531e..234dd1e568 100644 --- a/crates/health/src/config.rs +++ b/crates/health/src/config.rs @@ -111,7 +111,9 @@ pub struct StaticBmcEndpoint { pub struct StaticMachineEndpoint { pub id: String, pub serial: Option, + #[serde(alias = "physical_slot_number")] pub slot_number: Option, + #[serde(alias = "compute_tray_index")] pub tray_index: Option, pub nvlink_domain_uuid: Option, } @@ -139,7 +141,9 @@ fn default_static_switch_endpoint_role() -> StaticSwitchEndpointRole { pub struct StaticSwitchEndpoint { pub id: Option, pub serial: Option, + #[serde(alias = "physical_slot_number")] pub slot_number: Option, + #[serde(alias = "compute_tray_index")] pub tray_index: Option, #[serde(default = "default_static_switch_endpoint_role")] pub endpoint_role: StaticSwitchEndpointRole, @@ -671,12 +675,66 @@ impl Default for NmxtCollectorConfig { #[serde(default)] pub struct NvueCollectorConfig { pub rest: Configurable, + pub gnmi: Configurable, } impl Default for NvueCollectorConfig { fn default() -> Self { Self { rest: Configurable::Enabled(NvueRestConfig::default()), + gnmi: Configurable::Disabled, + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(default)] +pub struct NvueGnmiConfig { + /// gNMI server port on the switch. + pub gnmi_port: u16, + + /// Interval between SAMPLE mode subscription updates. + #[serde(with = "humantime_serde")] + pub sample_interval: Duration, + + /// Timeout for gRPC connection attempts. 
+ #[serde(with = "humantime_serde")] + pub request_timeout: Duration, + + /// Enable gNMI ON_CHANGE subscription for live system-event messages. + #[serde(alias = "system_events_subscription_enabled", alias = "events_enabled")] + pub system_events_enabled: bool, + + /// gNMI SAMPLE subscription paths. + pub paths: NvueGnmiPaths, +} + +impl Default for NvueGnmiConfig { + fn default() -> Self { + Self { + gnmi_port: 9339, + sample_interval: Duration::from_secs(300), + request_timeout: Duration::from_secs(30), + system_events_enabled: true, + paths: NvueGnmiPaths::default(), + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(default)] +pub struct NvueGnmiPaths { + pub components_enabled: bool, + pub interfaces_enabled: bool, + pub leak_sensors_enabled: bool, +} + +impl Default for NvueGnmiPaths { + fn default() -> Self { + Self { + components_enabled: true, + interfaces_enabled: true, + leak_sensors_enabled: true, } } } @@ -1024,6 +1082,14 @@ mod tests { } else { panic!("nvue rest config should be enabled in example config"); } + if let Configurable::Enabled(ref gnmi) = nvue.gnmi { + assert_eq!(gnmi.gnmi_port, 9339); + assert_eq!(gnmi.sample_interval, Duration::from_secs(300)); + assert_eq!(gnmi.request_timeout, Duration::from_secs(30)); + assert!(gnmi.system_events_enabled); + } else { + panic!("nvue gnmi config should be enabled in example config"); + } } else { panic!("nvue config should be enabled in example config"); } @@ -1353,6 +1419,37 @@ interfaces_enabled = false } } + #[test] + fn test_nvue_gnmi_events_disabled() { + let toml_content = r#" +[endpoint_sources.carbide_api] +enabled = false + +[sinks.health_report] +enabled = false + +[collectors.nvue.gnmi] +gnmi_port = 9339 +system_events_enabled = false +"#; + + let config: Config = Figment::new() + .merge(Serialized::defaults(Config::default())) + .merge(Toml::string(toml_content)) + .extract() + .expect("failed to parse"); + + if let Configurable::Enabled(ref nvue) = 
config.collectors.nvue { + if let Configurable::Enabled(ref gnmi) = nvue.gnmi { + assert!(!gnmi.system_events_enabled); + } else { + panic!("gnmi config should be enabled"); + } + } else { + panic!("nvue config should be enabled"); + } + } + #[test] fn test_static_endpoint_with_switch_serial() { let toml_content = r#" @@ -1559,6 +1656,48 @@ machine = { id = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", ); } + #[test] + fn test_static_endpoints_accept_position_field_aliases() { + let toml_content = r#" +[endpoint_sources.carbide_api] +enabled = false + +[[endpoint_sources.static_bmc_endpoints]] +ip = "10.0.1.2" +mac = "11:22:33:44:55:11" +username = "admin" +password = "pass" +machine = { id = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", physical_slot_number = 15, compute_tray_index = 5 } + +[[endpoint_sources.static_bmc_endpoints]] +ip = "10.0.1.1" +mac = "11:22:33:44:55:66" +username = "cumulus" +password = "pass" +switch = { serial = "SN-SW-001", physical_slot_number = 7, compute_tray_index = 3 } +"#; + + let config: Config = Figment::new() + .merge(Serialized::defaults(Config::default())) + .merge(Toml::string(toml_content)) + .extract() + .expect("failed to parse static endpoint config"); + + let machine = config.endpoint_sources.static_bmc_endpoints[0] + .machine + .as_ref() + .expect("machine metadata"); + assert_eq!(machine.slot_number, Some(15)); + assert_eq!(machine.tray_index, Some(5)); + + let switch = config.endpoint_sources.static_bmc_endpoints[1] + .switch + .as_ref() + .expect("switch metadata"); + assert_eq!(switch.slot_number, Some(7)); + assert_eq!(switch.tray_index, Some(3)); + } + #[test] fn test_static_endpoint_rejects_multiple_identity_types() { let toml_content = r#" diff --git a/crates/health/src/discovery/context.rs b/crates/health/src/discovery/context.rs index 0c38c32d10..be7fa59ff8 100644 --- a/crates/health/src/discovery/context.rs +++ b/crates/health/src/discovery/context.rs @@ -46,16 +46,18 @@ 
pub(super) enum CollectorKind { LeakDetector, Nmxt, NvueRest, + NvueGnmi, } impl CollectorKind { - pub(super) const ALL: [CollectorKind; 6] = [ + pub(super) const ALL: [CollectorKind; 7] = [ CollectorKind::Sensor, CollectorKind::Logs, CollectorKind::Firmware, CollectorKind::LeakDetector, CollectorKind::Nmxt, CollectorKind::NvueRest, + CollectorKind::NvueGnmi, ]; pub(super) fn stop_message(self) -> &'static str { @@ -68,6 +70,9 @@ impl CollectorKind { } CollectorKind::Nmxt => "Stopping NMX-T collector for removed BMC endpoint", CollectorKind::NvueRest => "Stopping NVUE REST collector for removed BMC endpoint", + CollectorKind::NvueGnmi => { + "Stopping NVUE gNMI streaming collector for removed switch endpoint" + } } } } @@ -79,6 +84,7 @@ pub(super) struct CollectorState { logs: HashMap, Collector>, nmxt: HashMap, Collector>, nvue_rest: HashMap, Collector>, + nvue_gnmi: HashMap, Collector>, } impl CollectorState { @@ -90,6 +96,7 @@ impl CollectorState { logs: HashMap::new(), nmxt: HashMap::new(), nvue_rest: HashMap::new(), + nvue_gnmi: HashMap::new(), } } @@ -101,6 +108,7 @@ impl CollectorState { CollectorKind::LeakDetector => &self.leak_detector, CollectorKind::Nmxt => &self.nmxt, CollectorKind::NvueRest => &self.nvue_rest, + CollectorKind::NvueGnmi => &self.nvue_gnmi, } } @@ -115,6 +123,7 @@ impl CollectorState { CollectorKind::LeakDetector => &mut self.leak_detector, CollectorKind::Nmxt => &mut self.nmxt, CollectorKind::NvueRest => &mut self.nvue_rest, + CollectorKind::NvueGnmi => &mut self.nvue_gnmi, } } @@ -146,6 +155,7 @@ impl CollectorState { .chain(self.leak_detector.keys()) .chain(self.nmxt.keys()) .chain(self.nvue_rest.keys()) + .chain(self.nvue_gnmi.keys()) .filter(|key| !active_keys.contains(*key)) .cloned() .collect() @@ -224,3 +234,37 @@ impl DiscoveryLoopContext { }) } } + +#[cfg(test)] +mod tests { + use std::borrow::Cow; + use std::collections::HashSet; + + use super::*; + use crate::collectors::Collector; + + fn noop_collector() -> Collector { + 
Collector::spawn_task(|_| async {}) + } + + #[tokio::test] + async fn removed_keys_includes_nvue_gnmi_collectors() { + let mut state = CollectorState::new(); + state.insert( + CollectorKind::NvueGnmi, + Cow::Borrowed("removed-gNMI-endpoint"), + noop_collector(), + ); + state.insert( + CollectorKind::NvueRest, + Cow::Borrowed("active-rest-endpoint"), + noop_collector(), + ); + + let active = HashSet::from([Cow::Borrowed("active-rest-endpoint")]); + let removed = state.removed_keys(&active); + + assert!(removed.contains(&Cow::Borrowed("removed-gNMI-endpoint"))); + assert!(!removed.contains(&Cow::Borrowed("active-rest-endpoint"))); + } +} diff --git a/crates/health/src/discovery/spawn.rs b/crates/health/src/discovery/spawn.rs index 2cdcbf3a9a..9e29441943 100644 --- a/crates/health/src/discovery/spawn.rs +++ b/crates/health/src/discovery/spawn.rs @@ -25,10 +25,10 @@ use crate::collectors::{ LeakDetectorCollector, LeakDetectorCollectorConfig, LogsCollector, LogsCollectorConfig, NmxtCollector, NmxtCollectorConfig, NvueRestCollector, NvueRestCollectorConfig, SensorCollector, SensorCollectorConfig, SseLogCollector, SseLogCollectorConfig, - StreamingCollectorStartContext, + StreamingCollectorStartContext, spawn_gnmi_collector, }; use crate::config::{Configurable, LogCollectionMode}; -use crate::endpoint::{BmcEndpoint, SwitchEndpointRole}; +use crate::endpoint::{BmcEndpoint, EndpointMetadata, SwitchEndpointRole}; use crate::sink::DataSink; fn logs_state_file_path(template: &str, endpoint_id: &str) -> PathBuf { @@ -364,6 +364,35 @@ fn spawn_switch_host_collectors( } } + if let Configurable::Enabled(nvue_cfg) = &ctx.nvue_config + && let Configurable::Enabled(gnmi_cfg) = &nvue_cfg.gnmi + && !ctx.collectors.contains(CollectorKind::NvueGnmi, &key) + && matches!(endpoint.metadata, Some(EndpointMetadata::Switch(_))) + { + let collector_registry = Arc::new( + ctx.metrics_manager + .create_collector_registry(format!("nvue_gnmi_collector_{key}"), metrics_prefix)?, + ); + match 
spawn_gnmi_collector(endpoint, gnmi_cfg, collector_registry, data_sink.clone()) { + Ok(handle) => { + ctx.collectors + .insert(CollectorKind::NvueGnmi, key.clone().into(), handle); + tracing::info!( + endpoint_key = %key, + total_nvue_gnmi_collectors = ctx.collectors.len(CollectorKind::NvueGnmi), + "Started NVUE gNMI streaming collection for switch endpoint" + ); + } + Err(error) => { + tracing::error!( + ?error, + endpoint_key = %key, + "Could not start NVUE gNMI collector for switch" + ); + } + } + } + Ok(()) } diff --git a/crates/health/src/lib.rs b/crates/health/src/lib.rs index 12fbd024e3..d0f3dcb08d 100644 --- a/crates/health/src/lib.rs +++ b/crates/health/src/lib.rs @@ -85,6 +85,9 @@ pub enum HealthError { #[error("Redfish SSE not available: {0}")] SseNotAvailable(String), + + #[error("gNMI error: {0}")] + GnmiError(String), } impl From for HealthError { diff --git a/crates/health/src/otlp/convert.rs b/crates/health/src/otlp/convert.rs index 168c453862..5c605f4bb0 100644 --- a/crates/health/src/otlp/convert.rs +++ b/crates/health/src/otlp/convert.rs @@ -19,10 +19,15 @@ use std::collections::HashMap; use std::time::SystemTime; use super::collector_logs::ExportLogsServiceRequest; +use super::collector_metrics::ExportMetricsServiceRequest; use super::common::{AnyValue, KeyValue, any_value}; use super::logs::{LogRecord as OtlpLogRecord, ResourceLogs, ScopeLogs, SeverityNumber}; +use super::metrics::{ + Gauge as OtlpGauge, Metric as OtlpMetric, NumberDataPoint, ResourceMetrics, ScopeMetrics, + metric, number_data_point, +}; use super::resource::Resource; -use crate::sink::{CollectorEvent, EventContext}; +use crate::sink::{CollectorEvent, EventContext, SensorHealthData}; fn severity_text_to_number(severity: &str) -> i32 { match severity.to_uppercase().as_str() { @@ -74,6 +79,9 @@ fn resource_attributes(context: &EventContext) -> Vec { if let Some(switch_id) = context.switch_id() { attrs.push(kv("switch.id", switch_id.to_string())); } + if let Some(rack_id) = 
context.rack_id() { + attrs.push(kv("rack.id", rack_id.to_string())); + } if let Some(slot) = context.slot_number() { attrs.push(int_kv("machine.slot_number", i64::from(slot))); } @@ -193,6 +201,66 @@ pub fn build_export_request(batch: &[(EventContext, CollectorEvent)]) -> ExportL ExportLogsServiceRequest { resource_logs } } +/// group metric samples by endpoint and build an ExportMetricsServiceRequest. +/// every sample maps to an OTLP `Gauge` point; Sum/Histogram is a follow-up. +pub fn build_metrics_export_request( + batch: &[(EventContext, SensorHealthData)], +) -> ExportMetricsServiceRequest { + let observed_nanos = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() as u64; + + let mut by_endpoint: HashMap, Vec)> = HashMap::new(); + + for (context, sample) in batch { + let data_point = NumberDataPoint { + attributes: sample + .labels + .iter() + .map(|(k, v)| kv(k, v.clone())) + .collect(), + time_unix_nano: observed_nanos, + value: Some(number_data_point::Value::AsDouble(sample.value)), + ..Default::default() + }; + + let otlp_metric = OtlpMetric { + name: sample.metric_type.clone(), + description: String::new(), + unit: sample.unit.clone(), + data: Some(metric::Data::Gauge(OtlpGauge { + data_points: vec![data_point], + })), + ..Default::default() + }; + + by_endpoint + .entry(context.endpoint_key.clone()) + .or_insert_with(|| (resource_attributes(context), Vec::new())) + .1 + .push(otlp_metric); + } + + let resource_metrics = by_endpoint + .into_values() + .map(|(attrs, metrics)| ResourceMetrics { + resource: Some(Resource { + attributes: attrs, + dropped_attributes_count: 0, + }), + scope_metrics: vec![ScopeMetrics { + scope: None, + metrics, + schema_url: String::new(), + }], + schema_url: String::new(), + }) + .collect(); + + ExportMetricsServiceRequest { resource_metrics } +} + #[cfg(test)] mod tests { use std::borrow::Cow; @@ -200,6 +268,7 @@ mod tests { use std::str::FromStr; use 
carbide_uuid::nvlink::NvLinkDomainId; + use carbide_uuid::rack::RackId; use carbide_uuid::switch::{SwitchId, SwitchIdSource, SwitchType}; use mac_address::MacAddress; @@ -272,11 +341,12 @@ mod tests { tray_index: Some(5), nvlink_domain_uuid: Some(domain_uuid), })), - rack_id: None, + rack_id: Some(RackId::new("RACK_1")), }; let attrs = resource_attributes(&context); + assert_eq!(attr_value(&attrs, "rack.id"), Some("RACK_1")); assert_eq!(attr_int_value(&attrs, "machine.slot_number"), Some(15)); assert_eq!(attr_int_value(&attrs, "machine.tray_index"), Some(5)); assert_eq!( @@ -306,7 +376,7 @@ mod tests { is_primary: false, nmxt_enabled: false, })), - rack_id: None, + rack_id: Some(RackId::new("RACK_2")), }; let attrs = resource_attributes(&context); @@ -315,6 +385,7 @@ mod tests { attr_value(&attrs, "switch.id"), Some(switch_id_attr.as_str()) ); + assert_eq!(attr_value(&attrs, "rack.id"), Some("RACK_2")); assert_eq!(attr_int_value(&attrs, "switch.slot_number"), Some(7)); assert_eq!(attr_int_value(&attrs, "switch.tray_index"), Some(3)); } diff --git a/crates/health/src/otlp/metrics_drain.rs b/crates/health/src/otlp/metrics_drain.rs new file mode 100644 index 0000000000..a3d281c4b5 --- /dev/null +++ b/crates/health/src/otlp/metrics_drain.rs @@ -0,0 +1,198 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +use std::sync::Arc; +use std::time::Duration; + +use tonic::transport::Channel; + +use super::collector_metrics::metrics_service_client::MetricsServiceClient; +use super::convert::build_metrics_export_request; +use crate::collectors::{BackoffConfig, ExponentialBackoff}; +use crate::sink::otlp::OtlpMetricsQueue; +use crate::sink::{EventContext, SensorHealthData}; + +pub(crate) struct OtlpMetricsDrainTask { + queue: Arc, + endpoint: String, + batch_size: usize, + flush_interval: Duration, +} + +impl OtlpMetricsDrainTask { + pub fn new( + queue: Arc, + endpoint: String, + batch_size: usize, + flush_interval: Duration, + ) -> Self { + Self { + queue, + endpoint, + batch_size, + flush_interval, + } + } + + fn drain_batch(&self, batch: &mut Vec<(EventContext, SensorHealthData)>) { + let remaining = self.batch_size.saturating_sub(batch.len()); + for _ in 0..remaining { + match self.queue.pop() { + Some((_key, value)) => batch.push(value), + None => break, + } + } + } + + pub async fn run(self) { + let mut client = match self.connect().await { + Some(c) => c, + None => return, + }; + + let mut batch = Vec::with_capacity(self.batch_size); + let mut interval = tokio::time::interval(self.flush_interval); + + loop { + tokio::select! 
{ + _ = self.queue.notified() => { + self.drain_batch(&mut batch); + if batch.len() >= self.batch_size { + self.flush(&mut client, &mut batch).await; + interval.reset(); + } + } + _ = interval.tick() => { + self.drain_batch(&mut batch); + if !batch.is_empty() { + self.flush(&mut client, &mut batch).await; + } + } + } + } + } + + async fn connect(&self) -> Option> { + let endpoint = match Channel::from_shared(self.endpoint.clone()) { + Ok(e) => e, + Err(error) => { + tracing::error!( + ?error, + endpoint = %self.endpoint, + "invalid otlp metrics endpoint uri, stopping drain" + ); + return None; + } + }; + + let mut backoff = ExponentialBackoff::new(&BackoffConfig { + initial: Duration::from_secs(1), + max: Duration::from_secs(30), + }); + + loop { + match endpoint.connect().await { + Ok(channel) => { + tracing::info!(endpoint = %self.endpoint, "connected to otlp metrics collector"); + return Some(MetricsServiceClient::new(channel)); + } + Err(error) => { + let delay = backoff.next_delay(); + tracing::warn!( + ?error, + endpoint = %self.endpoint, + retry_in = ?delay, + "failed to connect to otlp metrics collector" + ); + tokio::time::sleep(delay).await; + } + } + } + } + + async fn flush( + &self, + client: &mut MetricsServiceClient, + batch: &mut Vec<(EventContext, SensorHealthData)>, + ) { + if batch.is_empty() { + return; + } + + let request = build_metrics_export_request(batch); + batch.clear(); + + let point_count = request + .resource_metrics + .iter() + .flat_map(|rm| &rm.scope_metrics) + .flat_map(|sm| &sm.metrics) + .count(); + + if point_count == 0 { + return; + } + + const MAX_RETRIES: usize = 5; + + let mut backoff = ExponentialBackoff::new(&BackoffConfig { + initial: Duration::from_millis(100), + max: Duration::from_secs(10), + }); + + for attempt in 0..=MAX_RETRIES { + match client.export(request.clone()).await { + Ok(_) => { + tracing::debug!(point_count, "exported metrics to otlp collector"); + break; + } + Err(status) if is_retryable(&status) && 
attempt < MAX_RETRIES => { + let delay = backoff.next_delay(); + tracing::warn!( + code = ?status.code(), + message = status.message(), + attempt, + retry_in = ?delay, + "retryable otlp metrics export error" + ); + tokio::time::sleep(delay).await; + } + Err(status) => { + tracing::error!( + code = ?status.code(), + message = status.message(), + point_count, + attempt, + "otlp metrics export failed, dropping batch" + ); + break; + } + } + } + } +} + +fn is_retryable(status: &tonic::Status) -> bool { + matches!( + status.code(), + tonic::Code::Unavailable + | tonic::Code::DeadlineExceeded + | tonic::Code::ResourceExhausted + | tonic::Code::Aborted + | tonic::Code::Internal + ) +} diff --git a/crates/health/src/otlp/mod.rs b/crates/health/src/otlp/mod.rs index ecd76b6c47..632212b9ea 100644 --- a/crates/health/src/otlp/mod.rs +++ b/crates/health/src/otlp/mod.rs @@ -17,6 +17,7 @@ pub mod convert; pub mod drain; +pub mod metrics_drain; #[allow(clippy::all)] pub mod opentelemetry { @@ -36,17 +37,29 @@ pub mod opentelemetry { tonic::include_proto!("opentelemetry.proto.logs.v1"); } } + pub mod metrics { + pub mod v1 { + tonic::include_proto!("opentelemetry.proto.metrics.v1"); + } + } pub mod collector { pub mod logs { pub mod v1 { tonic::include_proto!("opentelemetry.proto.collector.logs.v1"); } } + pub mod metrics { + pub mod v1 { + tonic::include_proto!("opentelemetry.proto.collector.metrics.v1"); + } + } } } } pub use opentelemetry::proto::collector::logs::v1 as collector_logs; +pub use opentelemetry::proto::collector::metrics::v1 as collector_metrics; pub use opentelemetry::proto::common::v1 as common; pub use opentelemetry::proto::logs::v1 as logs; +pub use opentelemetry::proto::metrics::v1 as metrics; pub use opentelemetry::proto::resource::v1 as resource; diff --git a/crates/health/src/sink/otlp.rs b/crates/health/src/sink/otlp.rs index be183376df..26c1a50909 100644 --- a/crates/health/src/sink/otlp.rs +++ b/crates/health/src/sink/otlp.rs @@ -21,29 +21,36 @@ use 
prometheus::Counter; use super::dedup_queue::DedupQueue; use super::event_mapper::RedfishEventMapper; -use super::{CollectorEvent, DataSink, EventContext}; +use super::{CollectorEvent, DataSink, EventContext, SensorHealthData}; use crate::HealthError; use crate::config::OtlpSinkConfig; use crate::metrics::MetricsManager; use crate::otlp::drain::OtlpDrainTask; +use crate::otlp::metrics_drain::OtlpMetricsDrainTask; pub(crate) type OtlpQueue = DedupQueue; +pub(crate) type OtlpMetricsQueue = DedupQueue; #[cfg(not(feature = "bench-hooks"))] pub(crate) struct OtlpSink { queue: Arc, + metrics_queue: Arc, replaced_total: Counter, + metrics_replaced_total: Counter, mapper: Arc, } #[cfg(feature = "bench-hooks")] pub struct OtlpSink { queue: Arc, + metrics_queue: Arc, replaced_total: Counter, + metrics_replaced_total: Counter, mapper: Arc, } -pub(crate) fn is_otlp_relevant(event: &CollectorEvent) -> bool { +/// true for events that belong in the logs drain; metrics and collection sentinels are not. 
+pub(crate) fn is_otlp_log_relevant(event: &CollectorEvent) -> bool { !matches!( event, CollectorEvent::Metric(_) @@ -65,15 +72,24 @@ impl OtlpSink { })?; let queue: Arc = Arc::new(DedupQueue::new()); + let metrics_queue: Arc = Arc::new(DedupQueue::new()); let replaced_total = Counter::new( format!("{prefix}_otlp_sink_replaced_total"), - "total events replaced in the otlp queue before drain could process them", + "total log events replaced in the otlp queue before drain could process them", )?; metrics_manager .global_registry() .register(Box::new(replaced_total.clone()))?; + let metrics_replaced_total = Counter::new( + format!("{prefix}_otlp_sink_metrics_replaced_total"), + "total metric samples replaced in the otlp queue before drain could process them", + )?; + metrics_manager + .global_registry() + .register(Box::new(metrics_replaced_total.clone()))?; + let drain = OtlpDrainTask::new( queue.clone(), config.endpoint.clone(), @@ -82,9 +98,20 @@ impl OtlpSink { ); handle.spawn(drain.run()); + // separate drain task so metrics don't head-of-line-block the logs export and vice versa + let metrics_drain = OtlpMetricsDrainTask::new( + metrics_queue.clone(), + config.endpoint.clone(), + config.batch_size, + config.flush_interval, + ); + handle.spawn(metrics_drain.run()); + Ok(Self { queue, + metrics_queue, replaced_total, + metrics_replaced_total, mapper, }) } @@ -95,7 +122,9 @@ impl OtlpSink { pub fn new_for_bench(mapper: Arc) -> Self { Self { queue: Arc::new(DedupQueue::new()), + metrics_queue: Arc::new(DedupQueue::new()), replaced_total: Counter::new("bench_replaced", "bench").unwrap(), + metrics_replaced_total: Counter::new("bench_metrics_replaced", "bench").unwrap(), mapper, } } @@ -106,6 +135,10 @@ impl OtlpSink { pub fn pop_for_bench(&self) -> Option<(EventContext, CollectorEvent)> { self.queue.pop().map(|(_key, value)| value) } + + pub fn pop_metric_for_bench(&self) -> Option<(EventContext, SensorHealthData)> { + self.metrics_queue.pop().map(|(_key, value)| 
value) + } } impl DataSink for OtlpSink { @@ -114,7 +147,18 @@ impl DataSink for OtlpSink { } fn handle_event(&self, context: &EventContext, event: &CollectorEvent) { - if !is_otlp_relevant(event) { + if let CollectorEvent::Metric(sample) = event { + let key = format!("{}|{}", context.endpoint_key, sample.key); + if self + .metrics_queue + .save_latest(key, (context.clone(), (**sample).clone())) + { + self.metrics_replaced_total.inc(); + } + return; + } + + if !is_otlp_log_relevant(event) { return; } @@ -198,23 +242,53 @@ mod tests { } #[test] - fn is_otlp_relevant_excludes_metric_events() { - assert!(!is_otlp_relevant(&metric_event())); - assert!(!is_otlp_relevant(&CollectorEvent::MetricCollectionStart)); - assert!(!is_otlp_relevant(&CollectorEvent::MetricCollectionEnd)); + fn is_otlp_log_relevant_excludes_metric_events() { + assert!(!is_otlp_log_relevant(&metric_event())); + assert!(!is_otlp_log_relevant( + &CollectorEvent::MetricCollectionStart + )); + assert!(!is_otlp_log_relevant(&CollectorEvent::MetricCollectionEnd)); } #[test] - fn is_otlp_relevant_includes_log_events() { - assert!(is_otlp_relevant(&log_event("OpenBMC.0.1.Test", "[]"))); + fn is_otlp_log_relevant_includes_log_events() { + assert!(is_otlp_log_relevant(&log_event("OpenBMC.0.1.Test", "[]"))); } #[test] - fn metric_events_are_not_queued() { + fn metric_events_go_to_metrics_queue_not_logs_queue() { let sink = test_sink(); let ctx = test_context(); sink.handle_event(&ctx, &metric_event()); + assert!(sink.queue.pop().is_none(), "logs queue should be empty"); + assert!( + sink.metrics_queue.pop().is_some(), + "metrics queue should have the sample" + ); + } + + #[test] + fn metric_collection_sentinels_are_no_op() { + let sink = test_sink(); + let ctx = test_context(); + sink.handle_event(&ctx, &CollectorEvent::MetricCollectionStart); + sink.handle_event(&ctx, &CollectorEvent::MetricCollectionEnd); assert!(sink.queue.pop().is_none()); + assert!(sink.metrics_queue.pop().is_none()); + } + + #[test] + fn 
metric_events_dedup_by_sample_key() { + let sink = test_sink(); + let ctx = test_context(); + sink.handle_event(&ctx, &metric_event()); + sink.handle_event(&ctx, &metric_event()); + let mut count = 0; + while sink.metrics_queue.pop().is_some() { + count += 1; + } + assert_eq!(count, 1, "same key should dedup to one entry"); + assert_eq!(sink.metrics_replaced_total.get() as u64, 1); } #[test] diff --git a/crates/health/src/sink/prometheus.rs b/crates/health/src/sink/prometheus.rs index e36c1e472f..ac0ddbf6b6 100644 --- a/crates/health/src/sink/prometheus.rs +++ b/crates/health/src/sink/prometheus.rs @@ -103,6 +103,9 @@ impl PrometheusSink { if let Some(serial) = context.serial_number() { labels.push((Cow::Borrowed("serial_number"), serial.to_string())); } + if let Some(rack_id) = context.rack_id() { + labels.push((Cow::Borrowed("rack_id"), rack_id.to_string())); + } if let Some(slot) = context.slot_number() { labels.push((Cow::Borrowed("machine_slot_number"), slot.to_string())); } @@ -237,6 +240,7 @@ mod tests { use std::str::FromStr; use carbide_uuid::nvlink::NvLinkDomainId; + use carbide_uuid::rack::RackId; use carbide_uuid::switch::{SwitchId, SwitchIdSource, SwitchType}; use mac_address::MacAddress; @@ -269,7 +273,7 @@ mod tests { tray_index: Some(5), nvlink_domain_uuid: Some(NvLinkDomainId::nil()), })), - rack_id: None, + rack_id: Some(RackId::new("RACK_1")), }; let labels = PrometheusSink::stream_static_labels(&context); @@ -284,6 +288,7 @@ mod tests { Some("fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0") ); assert_eq!(label_value("serial_number"), Some("MN-001")); + assert_eq!(label_value("rack_id"), Some("RACK_1")); assert_eq!(label_value("machine_slot_number"), Some("15")); assert_eq!(label_value("machine_tray_index"), Some("5")); assert_eq!( @@ -313,7 +318,7 @@ mod tests { is_primary: false, nmxt_enabled: false, })), - rack_id: None, + rack_id: Some(RackId::new("RACK_2")), }; let labels = PrometheusSink::stream_static_labels(&context); @@ 
-325,6 +330,7 @@ mod tests { assert_eq!(label_value("switch_id"), Some(switch_id_label.as_str())); assert_eq!(label_value("serial_number"), Some("SN-SWITCH-001")); + assert_eq!(label_value("rack_id"), Some("RACK_2")); assert_eq!(label_value("switch_slot_number"), Some("7")); assert_eq!(label_value("switch_tray_index"), Some("3")); } diff --git a/crates/mqtt-common/Cargo.toml b/crates/mqtt-common/Cargo.toml index 6f47dbf996..69c16bef1c 100644 --- a/crates/mqtt-common/Cargo.toml +++ b/crates/mqtt-common/Cargo.toml @@ -29,4 +29,3 @@ opentelemetry = { workspace = true } tokio = { workspace = true } tokio-util = { workspace = true } tracing = { workspace = true } - diff --git a/docs/architecture/health_aggregation.md b/docs/architecture/health_aggregation.md index c5100b2dca..bdccbdb2ef 100644 --- a/docs/architecture/health_aggregation.md +++ b/docs/architecture/health_aggregation.md @@ -268,8 +268,8 @@ ranges or by interpreting the `health_ok` values provided by BMCs. For production deployments, `carbide-hw-health` discovers machine, switch, and power-shelf BMC endpoints from Carbide API via `[endpoint_sources.carbide_api]`. Machine endpoints carry the inventory metadata needed to interpret hardware health in fleet context, including machine ID, serial number, rack ID, rack placement, and NVLink domain UUID when present. Switch endpoints carry switch ID, serial number, and rack placement when present. Local and test deployments can instead configure explicit machine, switch, or power-shelf identity with `[[endpoint_sources.static_bmc_endpoints]]`; static machine endpoints can include the same serial number, rack placement, and NVLink domain UUID metadata, static switch endpoints can include serial number and rack placement metadata, and all static endpoints can provide `rack_id` when rack-level rollups are needed. 
The publishing sinks expose that inventory context using the conventions of the target backend: -- `[sinks.prometheus]` adds machine metadata as metric labels named `machine_id`, `serial_number`, `machine_slot_number`, `machine_tray_index`, and `nvlink_domain_uuid`; switch metadata uses `switch_id`, `serial_number`, `switch_slot_number`, and `switch_tray_index`. -- `[sinks.otlp]` adds machine metadata as OTLP resource attributes named `machine.id`, integer `machine.slot_number`, integer `machine.tray_index`, and `nvlink.domain.uuid`; switch metadata uses `switch.id`, integer `switch.slot_number`, and integer `switch.tray_index`. +- `[sinks.prometheus]` adds machine metadata as metric labels named `machine_id`, `serial_number`, `rack_id`, `machine_slot_number`, `machine_tray_index`, and `nvlink_domain_uuid`; switch metadata uses `switch_id`, `serial_number`, `rack_id`, `switch_slot_number`, and `switch_tray_index`. +- `[sinks.otlp]` adds machine metadata as OTLP resource attributes named `machine.id`, `rack.id`, integer `machine.slot_number`, integer `machine.tray_index`, and `nvlink.domain.uuid`; switch metadata uses `switch.id`, `rack.id`, integer `switch.slot_number`, and integer `switch.tray_index`. - `[sinks.health_report]`, `[sinks.rack_health_report]`, `[sinks.switch_health_report]`, and `[sinks.power_shelf_health_report]` use the same event context when submitting assessed health reports back to Carbide API. The persisted `HealthReport` and `HealthProbeAlert` schemas remain the probe success/alert model described above. ### BMC inventory monitoring