From 72675aefe7baac425a3f26df0f15faa7e9422035 Mon Sep 17 00:00:00 2001 From: oskarasbrink Date: Thu, 23 Apr 2026 12:03:17 +0200 Subject: [PATCH 1/7] add grafana httproute, add user info in openbao script, require login for grafana --- .../templates/openbao-secret-definitions.yaml | 8 +++++ .../templates/grafana-externalsecret.yaml | 24 ++++++++++++++ .../v1.0.7/templates/grafana-httproute.yaml | 31 +++++++++++++++++++ .../v1.0.7/templates/lgtm-stack.yaml | 21 +++++++++++-- 4 files changed, 82 insertions(+), 2 deletions(-) create mode 100644 sources/otel-lgtm-stack/v1.0.7/templates/grafana-externalsecret.yaml create mode 100644 sources/otel-lgtm-stack/v1.0.7/templates/grafana-httproute.yaml diff --git a/sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml b/sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml index ee820eed..8f247230 100644 --- a/sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml +++ b/sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml @@ -34,6 +34,14 @@ data: # Cluster domain for ingress and routing secrets/cluster-domain|static|{{ .Values.domain }}|0 + + # ============================================================================= + # OBSERVABILITY SECRETS + # ============================================================================= + + # Grafana bootstrap admin credentials + secrets/grafana-admin-user|static|admin|0 + secrets/grafana-admin-password|random||24 # ============================================================================= # AIRM APPLICATION SECRETS diff --git a/sources/otel-lgtm-stack/v1.0.7/templates/grafana-externalsecret.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/grafana-externalsecret.yaml new file mode 100644 index 00000000..5c024f9b --- /dev/null +++ b/sources/otel-lgtm-stack/v1.0.7/templates/grafana-externalsecret.yaml @@ -0,0 +1,24 @@ +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: 
grafana-admin-credentials + namespace: {{ .Release.Namespace }} + annotations: + argocd.argoproj.io/hook: PreSync +spec: + refreshInterval: 1h + secretStoreRef: + kind: ClusterSecretStore + name: openbao-secret-store + target: + creationPolicy: Owner + name: grafana-admin-credentials + data: + - secretKey: username + remoteRef: + key: grafana-admin-user + property: value + - secretKey: password + remoteRef: + key: grafana-admin-password + property: value diff --git a/sources/otel-lgtm-stack/v1.0.7/templates/grafana-httproute.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/grafana-httproute.yaml new file mode 100644 index 00000000..408a0e29 --- /dev/null +++ b/sources/otel-lgtm-stack/v1.0.7/templates/grafana-httproute.yaml @@ -0,0 +1,31 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: grafana-route + namespace: {{ .Release.Namespace }} +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: https + namespace: kgateway-system + rules: + - backendRefs: + - group: "" + kind: Service + name: lgtm-stack + port: {{ .Values.services.lgtm.grafana }} + weight: 1 + matches: + - headers: + - name: Host + type: RegularExpression + value: grafana\..* + path: + type: PathPrefix + value: / diff --git a/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml index cb76e752..d61cff65 100644 --- a/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml +++ b/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml @@ -266,9 +266,15 @@ spec: - name: FOLDER_ANNOTATION value: "grafana_folder" - name: REQ_USERNAME - value: admin + valueFrom: + secretKeyRef: + name: grafana-admin-credentials + key: username - name: REQ_PASSWORD - value: admin + valueFrom: + secretKeyRef: + name: grafana-admin-credentials + key: password - name: REQ_URL value: 
http://localhost:3000/api/admin/provisioning/dashboards/reload - name: REQ_METHOD @@ -285,6 +291,17 @@ spec: mountPath: "/tmp/dashboards" - name: lgtm image: ghcr.io/silogen/docker-otel-lgtm:v1.0.7 + env: + - name: GF_SECURITY_ADMIN_USER + valueFrom: + secretKeyRef: + name: grafana-admin-credentials + key: username + - name: GF_SECURITY_ADMIN_PASSWORD + valueFrom: + secretKeyRef: + name: grafana-admin-credentials + key: password ports: - containerPort: 3000 - containerPort: 4317 From 27da0e5efd4bfd39275cae962bf78075c70944ea Mon Sep 17 00:00:00 2001 From: oskarasbrink Date: Thu, 23 Apr 2026 13:16:42 +0200 Subject: [PATCH 2/7] require grafana login, not yet required --- sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml index d61cff65..8c79909c 100644 --- a/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml +++ b/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml @@ -292,6 +292,10 @@ spec: - name: lgtm image: ghcr.io/silogen/docker-otel-lgtm:v1.0.7 env: + - name: GF_AUTH_ANONYMOUS_ENABLED + value: "false" + - name: GF_AUTH_BASIC_ENABLED + value: "true" - name: GF_SECURITY_ADMIN_USER valueFrom: secretKeyRef: From b3592f3d7779481b9d7cf007ccb3fe0eb5bfbfd7 Mon Sep 17 00:00:00 2001 From: oskarasbrink Date: Thu, 23 Apr 2026 13:41:32 +0200 Subject: [PATCH 3/7] initial cluster-health-overview dashboard --- .../dashboards-cluster-health-overview.yaml | 567 ++++++++++++++++++ 1 file changed, 567 insertions(+) create mode 100644 sources/otel-lgtm-stack/v1.0.7/templates/dashboards-cluster-health-overview.yaml diff --git a/sources/otel-lgtm-stack/v1.0.7/templates/dashboards-cluster-health-overview.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/dashboards-cluster-health-overview.yaml new file mode 100644 index 00000000..e44e89a6 --- /dev/null +++ 
b/sources/otel-lgtm-stack/v1.0.7/templates/dashboards-cluster-health-overview.yaml @@ -0,0 +1,567 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: lgtm-cluster-health-overview + namespace: {{ .Release.Namespace }} + labels: + grafana_dashboard: "1" + annotations: + grafana_folder: "common" +data: + cluster-health-overview.json: |- + { + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + }, + { + "name": "DS_LOKI", + "label": "Loki", + "description": "", + "type": "datasource", + "pluginId": "loki", + "pluginName": "Loki" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "10.0.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "datasource", + "id": "loki", + "name": "Loki", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "panel", + "id": "logs", + "name": "Logs", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "Cluster health overview: disk space, node health, failing pods, kube-apiserver error logs, and cert-manager TLS certificate expiry", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "custom": { + "align": "left", + "cellOptions": { "type": "auto" }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "decimals": 1 + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Used (GB)" }, + "properties": [ + { "id": "unit", "value": "decgbytes" } + ] + }, + { + "matcher": { "id": "byName", "options": "Capacity (GB)" }, + "properties": [ + { "id": "unit", "value": "decgbytes" } + ] + 
}, + { + "matcher": { "id": "byName", "options": "Used %" }, + "properties": [ + { "id": "unit", "value": "percent" }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] } }, + { "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "gradient" } } + ] + } + ] + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 0 }, + "id": 2, + "options": { + "cellHeight": "sm", + "footer": { "show": false }, + "showHeader": true, + "sortBy": [{ "desc": true, "displayName": "Used (GB)" }] + }, + "title": "Disk Space Details", + "type": "table", + "transformations": [ + { "id": "merge", "options": {} }, + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true }, + "renameByName": { + "device": "Device", + "mountpoint": "Mount Point", + "nodename": "Hostname", + "Value #A": "Capacity (GB)", + "Value #B": "Used (GB)", + "Value #C": "Used %", + "Value #total": "Capacity (GB)", + "Value #used_gb": "Used (GB)", + "Value #used_pct": "Used %" + }, + "indexByName": { + "nodename": 0, + "device": 1, + "mountpoint": 2, + "Value #used_gb": 3, + "Value #total": 4, + "Value #used_pct": 5, + "Value #B": 3, + "Value #A": 4, + "Value #C": 5 + } + } + } + ], + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum by (nodename, device, mountpoint) ((node_filesystem_size_bytes{instance=~\"$instance\", mountpoint=~\"^/$|^/mnt(/.*)?$|^/var/lib/rancher(/.*)?$\", fstype!~\"tmpfs|overlay|squashfs|fuse.lxcfs\"} / 1024 / 1024 / 1024) * on(instance) group_left(nodename) node_uname_info{instance=~\"$instance\"})", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum by (nodename, device, mountpoint) 
((((node_filesystem_size_bytes{instance=~\"$instance\", mountpoint=~\"^/$|^/mnt(/.*)?$|^/var/lib/rancher(/.*)?$\", fstype!~\"tmpfs|overlay|squashfs|fuse.lxcfs\"} - node_filesystem_avail_bytes{instance=~\"$instance\", mountpoint=~\"^/$|^/mnt(/.*)?$|^/var/lib/rancher(/.*)?$\", fstype!~\"tmpfs|overlay|squashfs|fuse.lxcfs\"}) / 1024 / 1024 / 1024) * on(instance) group_left(nodename) node_uname_info{instance=~\"$instance\"}))", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum by (nodename, device, mountpoint) (((100 - (node_filesystem_avail_bytes{instance=~\"$instance\", mountpoint=~\"^/$|^/mnt(/.*)?$|^/var/lib/rancher(/.*)?$\", fstype!~\"tmpfs|overlay|squashfs|fuse.lxcfs\"} / node_filesystem_size_bytes{instance=~\"$instance\", mountpoint=~\"^/$|^/mnt(/.*)?$|^/var/lib/rancher(/.*)?$\", fstype!~\"tmpfs|overlay|squashfs|fuse.lxcfs\"} * 100)) * on(instance) group_left(nodename) node_uname_info{instance=~\"$instance\"}))", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "C" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "custom": { + "align": "left", + "cellOptions": { "type": "auto" }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "decimals": 1 + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Role" }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "0": { "text": "Agent" }, + "1": { "text": "Server" } + }, + "type": "value" + }, + { + "options": { + "match": "null", + "result": { "text": "Agent" } + }, + "type": "special" + } + ] + } + ] + }, + { + "matcher": { "id": "byName", "options": "Node Status" }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + 
"0": { "text": "NotReady" }, + "1": { "text": "Ready" } + }, + "type": "value" + } + ] + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + }, + { "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "gradient" } } + ] + }, + { + "matcher": { "id": "byName", "options": "CPU Usage %" }, + "properties": [ + { "id": "unit", "value": "percent" }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ] + } + }, + { "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "gradient" } } + ] + }, + { + "matcher": { "id": "byName", "options": "Memory Usage %" }, + "properties": [ + { "id": "unit", "value": "percent" }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ] + } + }, + { "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "gradient" } } + ] + }, + { + "matcher": { "id": "byName", "options": "Since Reboot" }, + "properties": [ + { "id": "unit", "value": "s" } + ] + } + ] + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 8 }, + "id": 3, + "options": { + "cellHeight": "sm", + "footer": { "show": false }, + "showHeader": true, + "sortBy": [{ "desc": true, "displayName": "Memory Usage %" }] + }, + "title": "Node Health Details", + "type": "table", + "transformations": [ + { "id": "merge", "options": {} }, + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "__name__": true, "instance": true, "job": true, "node": true }, + "renameByName": { + "hostname": "Hostname", + "Value #role": "Role", + "Value #status": "Node Status", + "Value #cpu": "CPU Usage %", + "Value #mem": "Memory Usage %", + "Value #reboot": "Since 
Reboot" + }, + "indexByName": { + "hostname": 0, + "Value #role": 1, + "Value #status": 2, + "Value #cpu": 3, + "Value #mem": 4, + "Value #reboot": 5 + } + } + } + ], + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "max by (hostname) (label_replace(kube_node_role{role=~\"control-plane|master\"}, \"hostname\", \"$1\", \"node\", \"(.*)\"))", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "role" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "max by (hostname) (label_replace(kube_node_status_condition{condition=\"Ready\",status=\"true\"}, \"hostname\", \"$1\", \"node\", \"(.*)\"))", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "status" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "100 - (avg by (hostname) (rate(node_cpu_seconds_total{mode=\"idle\",instance=~\"$instance\"}[5m]) * on(instance) group_left(hostname) label_replace(node_uname_info{instance=~\"$instance\"}, \"hostname\", \"$1\", \"nodename\", \"(.*)\")) * 100)", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "cpu" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "100 * (1 - avg by (hostname) ((node_memory_MemAvailable_bytes{instance=~\"$instance\"} * on(instance) group_left(hostname) label_replace(node_uname_info{instance=~\"$instance\"}, \"hostname\", \"$1\", \"nodename\", \"(.*)\")) / (node_memory_MemTotal_bytes{instance=~\"$instance\"} * on(instance) group_left(hostname) label_replace(node_uname_info{instance=~\"$instance\"}, \"hostname\", \"$1\", \"nodename\", \"(.*)\"))))", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "mem" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "avg by (hostname) ((time() - node_boot_time_seconds{instance=~\"$instance\"}) * on(instance) group_left(hostname) 
label_replace(node_uname_info{instance=~\"$instance\"}, \"hostname\", \"$1\", \"nodename\", \"(.*)\"))", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "reboot" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "custom": { + "align": "left", + "cellOptions": { "type": "auto" }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Age" }, + "properties": [ + { "id": "unit", "value": "s" } + ] + } + ] + }, + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 16 }, + "id": 4, + "options": { + "cellHeight": "sm", + "footer": { "show": false }, + "showHeader": true, + "sortBy": [{ "desc": true, "displayName": "Age" }] + }, + "title": "Crashing/Error/Not-Running Pods", + "type": "table", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "__name__": true, "condition": true, "instance": true, "job": true, "uid": true }, + "renameByName": { + "namespace": "Namespace", + "pod": "Pod", + "phase": "Pod Status", + "Value": "Age" + }, + "indexByName": { + "namespace": 0, + "pod": 1, + "phase": 2, + "Value": 3 + } + } + } + ], + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "max by (namespace, pod, phase) (((time() - kube_pod_created) * on(namespace, pod) group_left(phase) max by (namespace, pod, phase) (kube_pod_status_phase{phase=~\"Pending|Running|Failed|Unknown\"} == 1)) * on(namespace, pod) group_left() (max by (namespace, pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff|ErrImagePull|CreateContainerConfigError|CreateContainerError\"} == 1) or max by (namespace, pod) (kube_pod_container_status_terminated_reason{reason=~\"Error|OOMKilled|ContainerCannotRun\"} == 1) 
or max by (namespace, pod) (kube_pod_status_phase{phase=~\"Pending|Failed|Unknown\"} == 1) or max by (namespace, pod) (kube_pod_status_ready{condition=\"true\"} == 0)))", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "loki", "uid": "${DS_LOKI}" }, + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 26 }, + "id": 5, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": true, + "showCommonLabels": false, + "showLabels": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "${DS_LOKI}" }, + "expr": "{pod=~\".*kube-apiserver.*\"} |~ \"(?i)error\"", + "queryType": "range", + "refId": "A" + } + ], + "title": "kube-apiserver ERROR Logs (pod + node)", + "type": "logs" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "custom": { + "align": "left", + "cellOptions": { "type": "auto" }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 14 }, + { "color": "green", "value": 30 } + ] + }, + "decimals": 1 + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Days Until Expiry" }, + "properties": [ + { "id": "unit", "value": "d" }, + { "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "gradient" } } + ] + } + ] + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 36 }, + "id": 6, + "options": { + "cellHeight": "sm", + "footer": { "show": false }, + "showHeader": true, + "sortBy": [{ "desc": false, "displayName": "Days Until Expiry" }] + }, + "title": "TLS Certificate Expiry (cert-manager)", + "type": "table", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "__name__": true, "instance": true, "job": true }, + 
"renameByName": { + "namespace": "Namespace", + "name": "Certificate", + "Value": "Days Until Expiry" + }, + "indexByName": { + "namespace": 0, + "name": 1, + "Value": 2 + } + } + } + ], + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "max by (namespace, name) ((certmanager_certificate_expiration_timestamp_seconds - time()) / 86400)", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "schemaVersion": 38, + "tags": ["disk", "storage", "node-exporter"], + "templating": { + "list": [ + { + "current": {}, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "definition": "label_values(node_filesystem_size_bytes, instance)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "instance", + "options": [], + "query": { + "query": "label_values(node_filesystem_size_bytes, instance)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "sort": 1, + "label": "Instance", + "type": "query" + } + ] + }, + "time": { "from": "now-5m", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "Disk Space Monitor", + "uid": "disk-space-monitor", + "version": 1 + } From 23275e0963d693543a12236486597554bfad766d Mon Sep 17 00:00:00 2001 From: oskarasbrink Date: Thu, 23 Apr 2026 13:52:33 +0200 Subject: [PATCH 4/7] fix: add premade dashboards, rename cluster-overview dashboard --- root/values.yaml | 10 +++++ .../dashboards-cluster-health-overview.yaml | 38 +++++++++---------- .../v1.0.7/templates/lgtm-stack.yaml | 33 ++++++++++++++++ sources/otel-lgtm-stack/v1.0.7/values.yaml | 5 +++ 4 files changed, 67 insertions(+), 19 deletions(-) diff --git a/root/values.yaml b/root/values.yaml index 762b834d..2c98bc6b 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -791,6 +791,16 @@ apps: memory: 1Gi dashboards: enabled: true + github: + enabled: true + folder: kubernetes + urls: + - 
https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-system-api-server.json + - https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-system-coredns.json + - https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-global.json + - https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-namespaces.json + - https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-nodes.json + - https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-pods.json kubeStateMetrics: enabled: true lgtm: diff --git a/sources/otel-lgtm-stack/v1.0.7/templates/dashboards-cluster-health-overview.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/dashboards-cluster-health-overview.yaml index e44e89a6..fe078563 100644 --- a/sources/otel-lgtm-stack/v1.0.7/templates/dashboards-cluster-health-overview.yaml +++ b/sources/otel-lgtm-stack/v1.0.7/templates/dashboards-cluster-health-overview.yaml @@ -72,7 +72,7 @@ data: "links": [], "panels": [ { - "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, @@ -154,7 +154,7 @@ data: ], "targets": [ { - "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (nodename, device, mountpoint) ((node_filesystem_size_bytes{instance=~\"$instance\", mountpoint=~\"^/$|^/mnt(/.*)?$|^/var/lib/rancher(/.*)?$\", fstype!~\"tmpfs|overlay|squashfs|fuse.lxcfs\"} / 1024 / 1024 / 1024) * on(instance) group_left(nodename) node_uname_info{instance=~\"$instance\"})", "format": "table", "instant": true, @@ -162,7 +162,7 @@ data: "refId": "A" }, { - "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + 
"datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (nodename, device, mountpoint) ((((node_filesystem_size_bytes{instance=~\"$instance\", mountpoint=~\"^/$|^/mnt(/.*)?$|^/var/lib/rancher(/.*)?$\", fstype!~\"tmpfs|overlay|squashfs|fuse.lxcfs\"} - node_filesystem_avail_bytes{instance=~\"$instance\", mountpoint=~\"^/$|^/mnt(/.*)?$|^/var/lib/rancher(/.*)?$\", fstype!~\"tmpfs|overlay|squashfs|fuse.lxcfs\"}) / 1024 / 1024 / 1024) * on(instance) group_left(nodename) node_uname_info{instance=~\"$instance\"}))", "format": "table", "instant": true, @@ -170,7 +170,7 @@ data: "refId": "B" }, { - "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (nodename, device, mountpoint) (((100 - (node_filesystem_avail_bytes{instance=~\"$instance\", mountpoint=~\"^/$|^/mnt(/.*)?$|^/var/lib/rancher(/.*)?$\", fstype!~\"tmpfs|overlay|squashfs|fuse.lxcfs\"} / node_filesystem_size_bytes{instance=~\"$instance\", mountpoint=~\"^/$|^/mnt(/.*)?$|^/var/lib/rancher(/.*)?$\", fstype!~\"tmpfs|overlay|squashfs|fuse.lxcfs\"} * 100)) * on(instance) group_left(nodename) node_uname_info{instance=~\"$instance\"}))", "format": "table", "instant": true, @@ -180,7 +180,7 @@ data: ] }, { - "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, @@ -330,7 +330,7 @@ data: ], "targets": [ { - "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "max by (hostname) (label_replace(kube_node_role{role=~\"control-plane|master\"}, \"hostname\", \"$1\", \"node\", \"(.*)\"))", "format": "table", "instant": true, @@ -338,7 +338,7 @@ data: "refId": "role" }, { - "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "datasource": { "type": "prometheus", "uid": 
"prometheus" }, "expr": "max by (hostname) (label_replace(kube_node_status_condition{condition=\"Ready\",status=\"true\"}, \"hostname\", \"$1\", \"node\", \"(.*)\"))", "format": "table", "instant": true, @@ -346,7 +346,7 @@ data: "refId": "status" }, { - "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "100 - (avg by (hostname) (rate(node_cpu_seconds_total{mode=\"idle\",instance=~\"$instance\"}[5m]) * on(instance) group_left(hostname) label_replace(node_uname_info{instance=~\"$instance\"}, \"hostname\", \"$1\", \"nodename\", \"(.*)\")) * 100)", "format": "table", "instant": true, @@ -354,7 +354,7 @@ data: "refId": "cpu" }, { - "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "100 * (1 - avg by (hostname) ((node_memory_MemAvailable_bytes{instance=~\"$instance\"} * on(instance) group_left(hostname) label_replace(node_uname_info{instance=~\"$instance\"}, \"hostname\", \"$1\", \"nodename\", \"(.*)\")) / (node_memory_MemTotal_bytes{instance=~\"$instance\"} * on(instance) group_left(hostname) label_replace(node_uname_info{instance=~\"$instance\"}, \"hostname\", \"$1\", \"nodename\", \"(.*)\"))))", "format": "table", "instant": true, @@ -362,7 +362,7 @@ data: "refId": "mem" }, { - "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "avg by (hostname) ((time() - node_boot_time_seconds{instance=~\"$instance\"}) * on(instance) group_left(hostname) label_replace(node_uname_info{instance=~\"$instance\"}, \"hostname\", \"$1\", \"nodename\", \"(.*)\"))", "format": "table", "instant": true, @@ -372,7 +372,7 @@ data: ] }, { - "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, @@ 
-428,7 +428,7 @@ data: ], "targets": [ { - "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "max by (namespace, pod, phase) (((time() - kube_pod_created) * on(namespace, pod) group_left(phase) max by (namespace, pod, phase) (kube_pod_status_phase{phase=~\"Pending|Running|Failed|Unknown\"} == 1)) * on(namespace, pod) group_left() (max by (namespace, pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff|ErrImagePull|CreateContainerConfigError|CreateContainerError\"} == 1) or max by (namespace, pod) (kube_pod_container_status_terminated_reason{reason=~\"Error|OOMKilled|ContainerCannotRun\"} == 1) or max by (namespace, pod) (kube_pod_status_phase{phase=~\"Pending|Failed|Unknown\"} == 1) or max by (namespace, pod) (kube_pod_status_ready{condition=\"true\"} == 0)))", "format": "table", "instant": true, @@ -438,7 +438,7 @@ data: ] }, { - "datasource": { "type": "loki", "uid": "${DS_LOKI}" }, + "datasource": { "type": "loki", "uid": "loki" }, "gridPos": { "h": 10, "w": 24, "x": 0, "y": 26 }, "id": 5, "options": { @@ -452,7 +452,7 @@ data: }, "targets": [ { - "datasource": { "type": "loki", "uid": "${DS_LOKI}" }, + "datasource": { "type": "loki", "uid": "loki" }, "expr": "{pod=~\".*kube-apiserver.*\"} |~ \"(?i)error\"", "queryType": "range", "refId": "A" @@ -462,7 +462,7 @@ data: "type": "logs" }, { - "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, @@ -522,7 +522,7 @@ data: ], "targets": [ { - "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "max by (namespace, name) ((certmanager_certificate_expiration_timestamp_seconds - time()) / 86400)", "format": "table", "instant": true, @@ -539,7 +539,7 @@ data: "list": [ { 
"current": {}, - "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, "definition": "label_values(node_filesystem_size_bytes, instance)", "hide": 0, "includeAll": true, @@ -561,7 +561,7 @@ data: "time": { "from": "now-5m", "to": "now" }, "timepicker": {}, "timezone": "browser", - "title": "Disk Space Monitor", - "uid": "disk-space-monitor", + "title": "Cluster Health Overview", + "uid": "cluster-health-overview", "version": 1 } diff --git a/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml index 8c79909c..f0199fe4 100644 --- a/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml +++ b/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml @@ -250,6 +250,39 @@ spec: spec: serviceAccountName: grafana-sidecar automountServiceAccountToken: true + {{- if and .Values.dashboards.enabled .Values.dashboards.github.enabled (gt (len .Values.dashboards.github.urls) 0) }} + initContainers: + - name: grafana-github-dashboards + image: {{ .Values.dashboards.github.image | quote }} + imagePullPolicy: IfNotPresent + command: + - /bin/sh + - -ec + args: + - | + set -eu + target_dir="/tmp/dashboards/{{ .Values.dashboards.github.folder }}" + mkdir -p "$target_dir" + i=1 + {{- range $url := .Values.dashboards.github.urls }} + if curl -fsSL {{ $url | quote }} -o "$target_dir/dashboard-$i.json"; then + echo "Downloaded dashboard $i" + else + echo "WARN: failed to download dashboard $i from {{ $url }}" + fi + i=$((i+1)) + {{- end }} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + seccompProfile: + type: RuntimeDefault + volumeMounts: + - name: sc-dashboard-volume + mountPath: /tmp/dashboards + {{- end }} containers: - name: grafana-sc-dashboard image: "quay.io/kiwigrid/k8s-sidecar:1.27.4" diff --git a/sources/otel-lgtm-stack/v1.0.7/values.yaml b/sources/otel-lgtm-stack/v1.0.7/values.yaml index 
6ea2fd2e..b6ef2f7b 100644 --- a/sources/otel-lgtm-stack/v1.0.7/values.yaml +++ b/sources/otel-lgtm-stack/v1.0.7/values.yaml @@ -69,6 +69,11 @@ services: # Component enablement dashboards: enabled: true + github: + enabled: false + folder: kubernetes + image: curlimages/curl:8.8.0 + urls: [] nodeExporter: enabled: true From ed6bc13c69d7235585b3df961c5ec8626a6db478 Mon Sep 17 00:00:00 2001 From: oskarasbrink Date: Thu, 23 Apr 2026 14:47:54 +0200 Subject: [PATCH 5/7] fix: remove preinstall dashboards from root/values --- root/values.yaml | 12 ------------ sources/otel-lgtm-stack/v1.0.7/values.yaml | 10 ++++++++-- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/root/values.yaml b/root/values.yaml index 2c98bc6b..70c21005 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -789,18 +789,6 @@ apps: requests: cpu: 500m memory: 1Gi - dashboards: - enabled: true - github: - enabled: true - folder: kubernetes - urls: - - https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-system-api-server.json - - https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-system-coredns.json - - https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-global.json - - https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-namespaces.json - - https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-nodes.json - - https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-pods.json kubeStateMetrics: enabled: true lgtm: diff --git a/sources/otel-lgtm-stack/v1.0.7/values.yaml b/sources/otel-lgtm-stack/v1.0.7/values.yaml index b6ef2f7b..517b156e 100644 --- a/sources/otel-lgtm-stack/v1.0.7/values.yaml +++ b/sources/otel-lgtm-stack/v1.0.7/values.yaml @@ -70,10 +70,16 @@ services: dashboards: enabled: true github: - enabled: false + enabled: true 
folder: kubernetes image: curlimages/curl:8.8.0 - urls: [] + urls: + - https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-system-api-server.json + - https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-system-coredns.json + - https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-global.json + - https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-namespaces.json + - https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-nodes.json + - https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-pods.json nodeExporter: enabled: true From 94578b504de11fd599195f60209d000412356f4c Mon Sep 17 00:00:00 2001 From: oskarasbrink Date: Thu, 30 Apr 2026 16:06:40 +0200 Subject: [PATCH 6/7] feat: enhance cluster health overview dashboard with GPU detection and improved CPU usage metrics --- .../dashboards-cluster-health-overview.yaml | 225 +++++++++++------- 1 file changed, 142 insertions(+), 83 deletions(-) diff --git a/sources/otel-lgtm-stack/v1.0.7/templates/dashboards-cluster-health-overview.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/dashboards-cluster-health-overview.yaml index fe078563..10dcf59f 100644 --- a/sources/otel-lgtm-stack/v1.0.7/templates/dashboards-cluster-health-overview.yaml +++ b/sources/otel-lgtm-stack/v1.0.7/templates/dashboards-cluster-health-overview.yaml @@ -290,6 +290,41 @@ data: "properties": [ { "id": "unit", "value": "s" } ] + }, + { + "matcher": { "id": "byName", "options": "GPU Detected" }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "0": { "text": "No" }, + "1": { "text": "Yes" } + }, + "type": "value" + }, + { + "options": { + "match": "null", + "result": { "text": "No" } + }, + "type": "special" + } + ] + }, + { + "id": "thresholds", + "value": { + "mode": 
"absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + }, + { "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "gradient" } } + ] } ] }, @@ -315,7 +350,8 @@ data: "Value #status": "Node Status", "Value #cpu": "CPU Usage %", "Value #mem": "Memory Usage %", - "Value #reboot": "Since Reboot" + "Value #reboot": "Since Reboot", + "Value #gpu": "GPU Detected" }, "indexByName": { "hostname": 0, @@ -323,7 +359,8 @@ data: "Value #status": 2, "Value #cpu": 3, "Value #mem": 4, - "Value #reboot": 5 + "Value #reboot": 5, + "Value #gpu": 6 } } } @@ -368,98 +405,124 @@ data: "instant": true, "legendFormat": "", "refId": "reboot" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "max by (hostname) ((label_replace((kube_node_status_capacity{resource=~\".*gpu.*\"} > 0), \"hostname\", \"$1\", \"node\", \"(.*)\")) or on(hostname) (label_replace(node_uname_info{instance=~\"$instance\"}, \"hostname\", \"$1\", \"nodename\", \"(.*)\") * 0))", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "gpu" } ] }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "Per-node CPU usage (1 - idle). 
Same expression as the **Node Health Details** table column, plotted over time.", "fieldConfig": { "defaults": { - "color": { "mode": "thresholds" }, + "color": { "mode": "palette-classic" }, "custom": { - "align": "left", - "cellOptions": { "type": "auto" }, - "inspect": false + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] - } + }, + "unit": "percent", + "min": 0, + "max": 100 }, - "overrides": [ - { - "matcher": { "id": "byName", "options": "Age" }, - "properties": [ - { "id": "unit", "value": "s" } - ] - } - ] + "overrides": [] }, - "gridPos": { "h": 10, "w": 24, "x": 0, "y": 16 }, - "id": 4, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, + "id": 7, "options": { - "cellHeight": "sm", - "footer": { "show": false }, - "showHeader": true, - "sortBy": [{ "desc": true, "displayName": "Age" }] + "legend": { + "calcs": ["mean", "max", "lastNotNull"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } }, - "title": "Crashing/Error/Not-Running Pods", - "type": "table", - "transformations": [ - { - "id": "organize", - "options": { - "excludeByName": { "Time": true, "__name__": true, "condition": true, "instance": true, "job": true, "uid": true }, - "renameByName": { - "namespace": "Namespace", - "pod": "Pod", - "phase": "Pod Status", - "Value": "Age" - }, - "indexByName": { - "namespace": 0, - "pod": 1, - "phase": 2, - "Value": 3 - } - } - } - ], "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "max 
by (namespace, pod, phase) (((time() - kube_pod_created) * on(namespace, pod) group_left(phase) max by (namespace, pod, phase) (kube_pod_status_phase{phase=~\"Pending|Running|Failed|Unknown\"} == 1)) * on(namespace, pod) group_left() (max by (namespace, pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff|ErrImagePull|CreateContainerConfigError|CreateContainerError\"} == 1) or max by (namespace, pod) (kube_pod_container_status_terminated_reason{reason=~\"Error|OOMKilled|ContainerCannotRun\"} == 1) or max by (namespace, pod) (kube_pod_status_phase{phase=~\"Pending|Failed|Unknown\"} == 1) or max by (namespace, pod) (kube_pod_status_ready{condition=\"true\"} == 0)))", - "format": "table", - "instant": true, - "legendFormat": "", + "expr": "100 - (avg by (hostname) (rate(node_cpu_seconds_total{mode=\"idle\",instance=~\"$instance\"}[5m]) * on(instance) group_left(hostname) label_replace(node_uname_info{instance=~\"$instance\"}, \"hostname\", \"$1\", \"nodename\", \"(.*)\")) * 100)", + "legendFormat": "{{hostname}}", "refId": "A" } - ] + ], + "title": "CPU Usage % by Node", + "type": "timeseries" }, { - "datasource": { "type": "loki", "uid": "loki" }, - "gridPos": { "h": 10, "w": 24, "x": 0, "y": 26 }, - "id": 5, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "Per-node memory usage (1 - MemAvailable / MemTotal). 
Same expression as the **Node Health Details** table column, plotted over time.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "percent", + "min": 0, + "max": 100 + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, + "id": 8, "options": { - "dedupStrategy": "none", - "enableLogDetails": true, - "prettifyLogMessage": true, - "showCommonLabels": false, - "showLabels": true, - "sortOrder": "Descending", - "wrapLogMessage": true + "legend": { + "calcs": ["mean", "max", "lastNotNull"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } }, "targets": [ { - "datasource": { "type": "loki", "uid": "loki" }, - "expr": "{pod=~\".*kube-apiserver.*\"} |~ \"(?i)error\"", - "queryType": "range", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "100 * (1 - avg by (hostname) ((node_memory_MemAvailable_bytes{instance=~\"$instance\"} * on(instance) group_left(hostname) label_replace(node_uname_info{instance=~\"$instance\"}, \"hostname\", \"$1\", \"nodename\", \"(.*)\")) / (node_memory_MemTotal_bytes{instance=~\"$instance\"} * on(instance) group_left(hostname) label_replace(node_uname_info{instance=~\"$instance\"}, \"hostname\", \"$1\", \"nodename\", \"(.*)\"))))", + "legendFormat": "{{hostname}}", "refId": "A" } ], - "title": "kube-apiserver ERROR Logs (pod + node)", - "type": "logs" + 
"title": "Memory Usage % by Node", + "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, @@ -474,48 +537,44 @@ data: "mappings": [], "thresholds": { "mode": "absolute", - "steps": [ - { "color": "red", "value": null }, - { "color": "yellow", "value": 14 }, - { "color": "green", "value": 30 } - ] - }, - "decimals": 1 + "steps": [{ "color": "green", "value": null }] + } }, "overrides": [ { - "matcher": { "id": "byName", "options": "Days Until Expiry" }, + "matcher": { "id": "byName", "options": "Age" }, "properties": [ - { "id": "unit", "value": "d" }, - { "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "gradient" } } + { "id": "unit", "value": "s" } ] } ] }, - "gridPos": { "h": 8, "w": 24, "x": 0, "y": 36 }, - "id": 6, + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 24 }, + "id": 4, "options": { "cellHeight": "sm", "footer": { "show": false }, "showHeader": true, - "sortBy": [{ "desc": false, "displayName": "Days Until Expiry" }] + "sortBy": [{ "desc": true, "displayName": "Age" }] }, - "title": "TLS Certificate Expiry (cert-manager)", + "title": "Crashing/Error/Not-Running Pods", "type": "table", "transformations": [ { "id": "organize", "options": { - "excludeByName": { "Time": true, "__name__": true, "instance": true, "job": true }, + "excludeByName": { "Time": true, "__name__": true, "condition": true, "instance": true, "job": true, "uid": true }, "renameByName": { "namespace": "Namespace", - "name": "Certificate", - "Value": "Days Until Expiry" + "pod": "Pod", + "phase": "Pod Status", + "Value": "Age" }, "indexByName": { "namespace": 0, - "name": 1, - "Value": 2 + "pod": 1, + "phase": 2, + "Value": 3 } } } @@ -523,7 +582,7 @@ data: "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "max by (namespace, name) ((certmanager_certificate_expiration_timestamp_seconds - time()) / 86400)", + "expr": "max by (namespace, pod, phase) (((time() - kube_pod_created) * on(namespace, 
pod) group_left(phase) max by (namespace, pod, phase) (kube_pod_status_phase{phase=~\"Pending|Running|Failed|Unknown\"} == 1)) * on(namespace, pod) group_left() (max by (namespace, pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff|ErrImagePull|CreateContainerConfigError|CreateContainerError\"} == 1) or max by (namespace, pod) (kube_pod_container_status_terminated_reason{reason=~\"Error|OOMKilled|ContainerCannotRun\"} == 1) or max by (namespace, pod) (kube_pod_status_phase{phase=~\"Pending|Failed|Unknown\"} == 1) or max by (namespace, pod) (kube_pod_status_ready{condition=\"true\"} == 0)))", "format": "table", "instant": true, "legendFormat": "", From 2b2b61f29921513dead7054bde5ad4d724888305 Mon Sep 17 00:00:00 2001 From: oskarasbrink Date: Fri, 8 May 2026 10:16:29 +0200 Subject: [PATCH 7/7] add basic GPU utilization monitoring to cluster-health-overview dashboard --- .../dashboards-cluster-health-overview.yaml | 143 +++++++++++++++++- 1 file changed, 142 insertions(+), 1 deletion(-) diff --git a/sources/otel-lgtm-stack/v1.0.7/templates/dashboards-cluster-health-overview.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/dashboards-cluster-health-overview.yaml index 10dcf59f..8c6d29d5 100644 --- a/sources/otel-lgtm-stack/v1.0.7/templates/dashboards-cluster-health-overview.yaml +++ b/sources/otel-lgtm-stack/v1.0.7/templates/dashboards-cluster-health-overview.yaml @@ -524,6 +524,147 @@ data: "title": "Memory Usage % by Node", "type": "timeseries" }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "Cluster-wide GPU utilization across all GPUs reporting `gpu_gfx_activity`. Three lines: minimum, average and maximum across the fleet. 
Stays empty if no GPU metrics are present.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "percent", + "min": 0, + "max": 100 + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Avg" }, + "properties": [ + { "id": "color", "value": { "mode": "fixed", "fixedColor": "blue" } }, + { "id": "custom.lineWidth", "value": 3 } + ] + }, + { + "matcher": { "id": "byName", "options": "Max" }, + "properties": [ + { "id": "color", "value": { "mode": "fixed", "fixedColor": "red" } } + ] + }, + { + "matcher": { "id": "byName", "options": "Min" }, + "properties": [ + { "id": "color", "value": { "mode": "fixed", "fixedColor": "green" } } + ] + } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, + "id": 9, + "options": { + "legend": { + "calcs": ["mean", "max", "lastNotNull"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "avg(gpu_gfx_activity)", + "legendFormat": "Avg", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "max(gpu_gfx_activity)", + "legendFormat": "Max", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "min(gpu_gfx_activity)", + "legendFormat": "Min", + "refId": "C" + } + ], + "timeFrom": "24h", + 
"title": "GPU Utilization % (Cluster Aggregate)", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "Per-GPU activity from the AMD SMI exporter (`gpu_gfx_activity`). Each line is one GPU keyed by node and GPU UUID. Stays empty if no GPU metrics are present.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "percent", + "min": 0, + "max": 100 + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, + "id": 10, + "options": { + "legend": { + "calcs": ["mean", "max", "lastNotNull"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "gpu_gfx_activity", + "legendFormat": "{{node}} / {{gpu_uuid}}", + "refId": "A" + } + ], + "title": "GPU Utilization % by GPU", + "type": "timeseries" + }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { @@ -549,7 +690,7 @@ data: } ] }, - "gridPos": { "h": 10, "w": 24, "x": 0, "y": 24 }, + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 32 }, "id": 4, "options": { "cellHeight": "sm",