Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
0e1d965
feat(health): add machine placement metadata to events
mkoci May 12, 2026
0053968
feat(health): forward machine metadata from API
mkoci May 12, 2026
037dd50
feat(health): expose machine metadata as Prometheus labels
mkoci May 12, 2026
40c3d49
feat(health): emit machine metadata as OTLP attributes
mkoci May 12, 2026
f22528b
docs(health): document hardware health metadata surfaces
mkoci May 14, 2026
83149ec
feat(health): expose switch placement metadata
mkoci May 14, 2026
c52200e
Merge branch 'main' into feature/hw-health-machine-metadata
mkoci May 14, 2026
6331ef8
refactor(health): rename SseConnectionGuard to StreamingConnectionGuard
mkoci Apr 23, 2026
89acbe5
feat(health): vendor openconfig/gnmi protos for reproducible builds
mkoci Apr 23, 2026
57f9b0b
feat(health): add NVUE gNMI streaming collector
mkoci Apr 23, 2026
8c0f234
feat(health): add OTLP metrics export via MetricsService
mkoci Apr 23, 2026
c8e3c83
fix(health): adapt nvos streaming to metadata stack
mkoci May 14, 2026
7f1445a
Merge remote-tracking branch 'mk-origin/main' into feature/nvos_strea…
mkoci May 15, 2026
3aafde5
fix(health): add support for positional metadata in telemetry
mkoci May 15, 2026
cc59345
Merge branch 'NVIDIA:main' into feature/nvos_streaming
mkoci May 15, 2026
3885ae9
Merge remote-tracking branch 'NVIDIA/main' into feature-nvos-health
mkoci May 15, 2026
37371fb
feat(health): restore nvue gnmi on-change events
mkoci May 15, 2026
543be1a
fix(health): typo
mkoci May 17, 2026
11efebf
Merge remote-tracking branch 'NVIDIA/main' into feature/nvos_streaming
mkoci May 18, 2026
57221f2
Merge branch 'feature/nvos_streaming' into feature-nvos-health
mkoci May 18, 2026
0f3fd25
lint(health): tidy up
mkoci May 18, 2026
438019d
bug(health): gate switch hosts and bmcs in spawn to avoid redfish cal…
mkoci May 17, 2026
db6dca4
feat(health): add SwitchEndpointRole to distinguish switch BMC from Host
mkoci May 18, 2026
3fbc376
feat(health): add static config shape for switch bmc/host
mkoci May 19, 2026
817c2f3
feat(health): gate switch collection by endpoint role (host/bmc)
mkoci May 19, 2026
83f05fe
feat(api): expose switch host endpoints and nvos credentials for host…
mkoci May 19, 2026
039a020
feat(health): wire switch host endpoint to health discovery
mkoci May 19, 2026
58e8c24
feat(health): discover switch bmc and host endpoints, respectively
mkoci May 19, 2026
4f8e492
fix(health): remove unnecessary helpers
mkoci May 20, 2026
7277eef
Merge branch 'NVIDIA:main' into bug_1744_no_redfish_for_switch_hosts
mkoci May 20, 2026
eb11bb4
fix(api): use proper errors when bmc/switch host credentials are not …
mkoci May 20, 2026
4099f62
fix(health): lint-police
mkoci May 20, 2026
804fd0f
fix(api, health): reshape Switch API surface with switch bmc and nvos…
mkoci May 21, 2026
4ebcf2e
fix(api): broken test
mkoci May 21, 2026
135d494
Merge branch 'main' into bug_1744_no_redfish_for_switch_hosts
mkoci May 21, 2026
f93a8ee
Merge branch 'NVIDIA:main' into feature-nvos-health
mkoci May 21, 2026
230c2b9
Merge remote-tracking branch 'upstream/main' into bug_1744_no_redfish…
mkoci May 21, 2026
eadec73
Merge branch 'main' into bug_1744_no_redfish_for_switch_hosts
mkoci May 22, 2026
877af27
Merge branch 'NVIDIA:main' into feature-nvos-health
mkoci May 22, 2026
9f7ec17
Merge branch 'bug_1744_no_redfish_for_switch_hosts' into feature-nvos…
mkoci May 22, 2026
6670948
fix(health): adds SwitchNvosInfo, explicit `spawn_collectors_for_endp…
mkoci May 26, 2026
bb5ec30
Merge branch 'main' into bug_1744_no_redfish_for_switch_hosts
mkoci May 26, 2026
04a4d9f
Merge branch 'bug_1744_no_redfish_for_switch_hosts' into feature-nvos…
mkoci May 26, 2026
b048f03
fix(health): parse nvl partition num-gpus properly (it returns a stri…
mkoci May 27, 2026
4a38257
fix(health): ensure collector removal like #1302
mkoci May 28, 2026
a1f996c
Merge remote-tracking branch 'origin/main' into feature-nvos-health
mkoci May 28, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 116 additions & 46 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions crates/health/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,17 @@ http = { workspace = true }
humantime = { workspace = true }
humantime-serde = { workspace = true }
hyper = { workspace = true }
hyper-rustls = { workspace = true, features = ["http2"] }
hyper-util = { workspace = true }
mac_address = { workspace = true }
prometheus = { workspace = true }
reqwest = { workspace = true, features = ["query", "json"] }
rustls = { workspace = true }
rustls-pki-types = { workspace = true }
serde = { features = ["derive"], workspace = true }
serde_json = { workspace = true }
serde_with = { workspace = true }
tokio-stream = { workspace = true }
tokio-util = { workspace = true }
tracing = { workspace = true }
tracing-subscriber = { features = [
Expand Down
31 changes: 28 additions & 3 deletions crates/health/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,42 @@ use std::path::PathBuf;
fn main() -> Result<(), Box<dyn std::error::Error>> {
carbide_version::build();

// vendored from opentelemetry-proto v1.5.0
let proto_dir = PathBuf::from("proto");

println!("cargo:rerun-if-changed=proto/");

// vendored from opentelemetry-proto v1.5.0
tonic_prost_build::configure()
.build_server(false)
.build_client(true)
.compile_protos(
&[
proto_dir.join("opentelemetry/proto/collector/logs/v1/logs_service.proto"),
proto_dir.join("opentelemetry/proto/collector/metrics/v1/metrics_service.proto"),
],
std::slice::from_ref(&proto_dir),
)?;

// vendored from openconfig/gnmi v0.11.0
// gnmi_ext compiled separately so gnmi.proto can extern_path it and reuse the types
tonic_prost_build::configure()
.build_client(true)
.build_server(false)
.compile_protos(
&[proto_dir.join("github.com/openconfig/gnmi/proto/gnmi_ext/gnmi_ext.proto")],
std::slice::from_ref(&proto_dir),
)?;

tonic_prost_build::configure()
.build_client(true)
.build_server(false)
.extern_path(
".gnmi_ext",
"crate::collectors::nvue::gnmi::proto::gnmi_ext",
)
.compile_protos(
&[proto_dir.join("opentelemetry/proto/collector/logs/v1/logs_service.proto")],
&[proto_dir],
&[proto_dir.join("github.com/openconfig/gnmi/proto/gnmi/gnmi.proto")],
std::slice::from_ref(&proto_dir),
)?;

Ok(())
Expand Down
16 changes: 16 additions & 0 deletions crates/health/example/config.example.toml
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,22 @@ cluster_apps_enabled = true
sdn_partitions_enabled = true
interfaces_enabled = true

# NVUE gNMI streaming collector (switches only, disabled by default).
# Subscribes to gNMI SAMPLE paths and pushes metrics through the DataSink
# pipeline. PrometheusSink serves the /metrics endpoint; OtlpSink (when
# configured separately) pushes to an OTel Collector.
[collectors.nvue.gnmi]
gnmi_port = 9339
sample_interval = "5m"
request_timeout = "30s"
# gNMI ON_CHANGE subscription for system events
system_events_enabled = true

[collectors.nvue.gnmi.paths]
components_enabled = true
interfaces_enabled = true
leak_sensors_enabled = true

# ==============================================================================
# Processors
# ==============================================================================
Expand Down
Loading
Loading