diff --git a/root/values.yaml b/root/values.yaml index 762b834d..70c21005 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -789,8 +789,6 @@ apps: requests: cpu: 500m memory: 1Gi - dashboards: - enabled: true kubeStateMetrics: enabled: true lgtm: diff --git a/sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml b/sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml index ee820eed..8f247230 100644 --- a/sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml +++ b/sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml @@ -34,6 +34,14 @@ data: # Cluster domain for ingress and routing secrets/cluster-domain|static|{{ .Values.domain }}|0 + + # ============================================================================= + # OBSERVABILITY SECRETS + # ============================================================================= + + # Grafana bootstrap admin credentials + secrets/grafana-admin-user|static|admin|0 + secrets/grafana-admin-password|random||24 # ============================================================================= # AIRM APPLICATION SECRETS diff --git a/sources/otel-lgtm-stack/v1.0.7/templates/dashboards-cluster-health-overview.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/dashboards-cluster-health-overview.yaml new file mode 100644 index 00000000..0ad20b55 --- /dev/null +++ b/sources/otel-lgtm-stack/v1.0.7/templates/dashboards-cluster-health-overview.yaml @@ -0,0 +1,767 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: lgtm-cluster-health-overview + namespace: {{ .Release.Namespace }} + labels: + grafana_dashboard: "1" + annotations: + grafana_folder: "common" +data: + cluster-health-overview.json: |- + { + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + }, + { + "name": "DS_LOKI", + "label": "Loki", + "description": "", + "type": 
"datasource", + "pluginId": "loki", + "pluginName": "Loki" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "10.0.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "datasource", + "id": "loki", + "name": "Loki", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "panel", + "id": "logs", + "name": "Logs", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "Disk space usage for root and /mnt/disk* partitions", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "custom": { + "align": "left", + "cellOptions": { "type": "auto" }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "decimals": 1 + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Used (GB)" }, + "properties": [ + { "id": "unit", "value": "decgbytes" } + ] + }, + { + "matcher": { "id": "byName", "options": "Capacity (GB)" }, + "properties": [ + { "id": "unit", "value": "decgbytes" } + ] + }, + { + "matcher": { "id": "byName", "options": "Used %" }, + "properties": [ + { "id": "unit", "value": "percent" }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] } }, + { "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "gradient" } } + ] + } + ] + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 0 }, + "id": 2, + "options": { + "cellHeight": "sm", + "footer": { "show": false }, + "showHeader": true, + "sortBy": [{ "desc": true, "displayName": "Used 
(GB)" }] + }, + "title": "Disk Space Details", + "type": "table", + "transformations": [ + { "id": "merge", "options": {} }, + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true }, + "renameByName": { + "device": "Device", + "mountpoint": "Mount Point", + "nodename": "Hostname", + "Value #A": "Capacity (GB)", + "Value #B": "Used (GB)", + "Value #C": "Used %", + "Value #total": "Capacity (GB)", + "Value #used_gb": "Used (GB)", + "Value #used_pct": "Used %" + }, + "indexByName": { + "nodename": 0, + "device": 1, + "mountpoint": 2, + "Value #used_gb": 3, + "Value #total": 4, + "Value #used_pct": 5, + "Value #B": 3, + "Value #A": 4, + "Value #C": 5 + } + } + } + ], + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum by (nodename, device, mountpoint) ((node_filesystem_size_bytes{instance=~\"$instance\", mountpoint=~\"^/$|^/mnt(/.*)?$|^/var/lib/rancher(/.*)?$\", fstype!~\"tmpfs|overlay|squashfs|fuse.lxcfs\"} / 1024 / 1024 / 1024) * on(instance) group_left(nodename) node_uname_info{instance=~\"$instance\"})", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum by (nodename, device, mountpoint) ((((node_filesystem_size_bytes{instance=~\"$instance\", mountpoint=~\"^/$|^/mnt(/.*)?$|^/var/lib/rancher(/.*)?$\", fstype!~\"tmpfs|overlay|squashfs|fuse.lxcfs\"} - node_filesystem_avail_bytes{instance=~\"$instance\", mountpoint=~\"^/$|^/mnt(/.*)?$|^/var/lib/rancher(/.*)?$\", fstype!~\"tmpfs|overlay|squashfs|fuse.lxcfs\"}) / 1024 / 1024 / 1024) * on(instance) group_left(nodename) node_uname_info{instance=~\"$instance\"}))", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum by (nodename, device, mountpoint) (((100 - 
(node_filesystem_avail_bytes{instance=~\"$instance\", mountpoint=~\"^/$|^/mnt(/.*)?$|^/var/lib/rancher(/.*)?$\", fstype!~\"tmpfs|overlay|squashfs|fuse.lxcfs\"} / node_filesystem_size_bytes{instance=~\"$instance\", mountpoint=~\"^/$|^/mnt(/.*)?$|^/var/lib/rancher(/.*)?$\", fstype!~\"tmpfs|overlay|squashfs|fuse.lxcfs\"} * 100)) * on(instance) group_left(nodename) node_uname_info{instance=~\"$instance\"}))", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "C" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "custom": { + "align": "left", + "cellOptions": { "type": "auto" }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "decimals": 1 + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Role" }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "0": { "text": "Agent" }, + "1": { "text": "Server" } + }, + "type": "value" + }, + { + "options": { + "match": "null", + "result": { "text": "Agent" } + }, + "type": "special" + } + ] + } + ] + }, + { + "matcher": { "id": "byName", "options": "Node Status" }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "0": { "text": "NotReady" }, + "1": { "text": "Ready" } + }, + "type": "value" + } + ] + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + }, + { "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "gradient" } } + ] + }, + { + "matcher": { "id": "byName", "options": "CPU Usage %" }, + "properties": [ + { "id": "unit", "value": "percent" }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", 
"value": 85 } + ] + } + }, + { "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "gradient" } } + ] + }, + { + "matcher": { "id": "byName", "options": "Memory Usage %" }, + "properties": [ + { "id": "unit", "value": "percent" }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ] + } + }, + { "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "gradient" } } + ] + }, + { + "matcher": { "id": "byName", "options": "Since Reboot" }, + "properties": [ + { "id": "unit", "value": "s" } + ] + }, + { + "matcher": { "id": "byName", "options": "GPU Detected" }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "0": { "text": "No" }, + "1": { "text": "Yes" } + }, + "type": "value" + }, + { + "options": { + "match": "null", + "result": { "text": "No" } + }, + "type": "special" + } + ] + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + }, + { "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "gradient" } } + ] + } + ] + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 8 }, + "id": 3, + "options": { + "cellHeight": "sm", + "footer": { "show": false }, + "showHeader": true, + "sortBy": [{ "desc": true, "displayName": "Memory Usage %" }] + }, + "title": "Node Health Details", + "type": "table", + "transformations": [ + { "id": "merge", "options": {} }, + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "__name__": true, "instance": true, "job": true, "node": true }, + "renameByName": { + "hostname": "Hostname", + "Value #role": "Role", + "Value #status": "Node Status", + "Value #cpu": "CPU Usage %", + "Value #mem": "Memory Usage %", + "Value #reboot": "Since Reboot", + "Value #gpu": "GPU Detected" + }, + "indexByName": 
{ + "hostname": 0, + "Value #role": 1, + "Value #status": 2, + "Value #cpu": 3, + "Value #mem": 4, + "Value #reboot": 5, + "Value #gpu": 6 + } + } + } + ], + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "max by (hostname) (label_replace(kube_node_role{role=~\"control-plane|master\"}, \"hostname\", \"$1\", \"node\", \"(.*)\"))", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "role" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "max by (hostname) (label_replace(kube_node_status_condition{condition=\"Ready\",status=\"true\"}, \"hostname\", \"$1\", \"node\", \"(.*)\"))", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "status" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "100 - (avg by (hostname) (rate(node_cpu_seconds_total{mode=\"idle\",instance=~\"$instance\"}[5m]) * on(instance) group_left(hostname) label_replace(node_uname_info{instance=~\"$instance\"}, \"hostname\", \"$1\", \"nodename\", \"(.*)\")) * 100)", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "cpu" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "100 * (1 - avg by (hostname) ((node_memory_MemAvailable_bytes{instance=~\"$instance\"} * on(instance) group_left(hostname) label_replace(node_uname_info{instance=~\"$instance\"}, \"hostname\", \"$1\", \"nodename\", \"(.*)\")) / (node_memory_MemTotal_bytes{instance=~\"$instance\"} * on(instance) group_left(hostname) label_replace(node_uname_info{instance=~\"$instance\"}, \"hostname\", \"$1\", \"nodename\", \"(.*)\"))))", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "mem" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "avg by (hostname) ((time() - node_boot_time_seconds{instance=~\"$instance\"}) * on(instance) group_left(hostname) 
label_replace(node_uname_info{instance=~\"$instance\"}, \"hostname\", \"$1\", \"nodename\", \"(.*)\"))", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "reboot" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "max by (hostname) ((label_replace((kube_node_status_capacity{resource=~\".*gpu.*\"} > 0), \"hostname\", \"$1\", \"node\", \"(.*)\")) or on(hostname) (label_replace(node_uname_info{instance=~\"$instance\"}, \"hostname\", \"$1\", \"nodename\", \"(.*)\") * 0))", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "gpu" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "Per-node CPU usage (1 - idle). Same expression as the **Node Health Details** table column, plotted over time.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "percent", + "min": 0, + "max": 100 + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, + "id": 7, + "options": { + "legend": { + "calcs": ["mean", "max", "lastNotNull"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "100 - (avg by (hostname) (rate(node_cpu_seconds_total{mode=\"idle\",instance=~\"$instance\"}[5m]) * on(instance) 
group_left(hostname) label_replace(node_uname_info{instance=~\"$instance\"}, \"hostname\", \"$1\", \"nodename\", \"(.*)\")) * 100)", + "legendFormat": "{{`{{hostname}}`}}", + "refId": "A" + } + ], + "title": "CPU Usage % by Node", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "Per-node memory usage (1 - MemAvailable / MemTotal). Same expression as the **Node Health Details** table column, plotted over time.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "percent", + "min": 0, + "max": 100 + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, + "id": 8, + "options": { + "legend": { + "calcs": ["mean", "max", "lastNotNull"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "100 * (1 - avg by (hostname) ((node_memory_MemAvailable_bytes{instance=~\"$instance\"} * on(instance) group_left(hostname) label_replace(node_uname_info{instance=~\"$instance\"}, \"hostname\", \"$1\", \"nodename\", \"(.*)\")) / (node_memory_MemTotal_bytes{instance=~\"$instance\"} * on(instance) group_left(hostname) label_replace(node_uname_info{instance=~\"$instance\"}, \"hostname\", \"$1\", \"nodename\", \"(.*)\"))))", + "legendFormat": "{{`{{hostname}}`}}", + "refId": "A" 
+ } + ], + "title": "Memory Usage % by Node", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "Cluster-wide GPU utilization across all GPUs reporting `gpu_gfx_activity`. Three lines: minimum, average and maximum across the fleet. Stays empty if no GPU metrics are present.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "percent", + "min": 0, + "max": 100 + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Avg" }, + "properties": [ + { "id": "color", "value": { "mode": "fixed", "fixedColor": "blue" } }, + { "id": "custom.lineWidth", "value": 3 } + ] + }, + { + "matcher": { "id": "byName", "options": "Max" }, + "properties": [ + { "id": "color", "value": { "mode": "fixed", "fixedColor": "red" } } + ] + }, + { + "matcher": { "id": "byName", "options": "Min" }, + "properties": [ + { "id": "color", "value": { "mode": "fixed", "fixedColor": "green" } } + ] + } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, + "id": 9, + "options": { + "legend": { + "calcs": ["mean", "max", "lastNotNull"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "avg(gpu_gfx_activity)", + "legendFormat": "Avg", + "refId": "A" + }, + { + "datasource": { "type": 
"prometheus", "uid": "prometheus" }, + "expr": "max(gpu_gfx_activity)", + "legendFormat": "Max", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "min(gpu_gfx_activity)", + "legendFormat": "Min", + "refId": "C" + } + ], + "timeFrom": "24h", + "title": "GPU Utilization % (Cluster Aggregate)", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "Per-GPU activity from the AMD SMI exporter (`gpu_gfx_activity`). Each line is one GPU keyed by node and GPU UUID. Stays empty if no GPU metrics are present.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "percent", + "min": 0, + "max": 100 + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, + "id": 10, + "options": { + "legend": { + "calcs": ["mean", "max", "lastNotNull"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "gpu_gfx_activity", + "legendFormat": "{{`{{node}} / {{gpu_uuid}}`}}", + "refId": "A" + } + ], + "title": "GPU Utilization % by GPU", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "custom": { + "align": "left", + 
"cellOptions": { "type": "auto" }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Age" }, + "properties": [ + { "id": "unit", "value": "s" } + ] + } + ] + }, + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 32 }, + "id": 4, + "options": { + "cellHeight": "sm", + "footer": { "show": false }, + "showHeader": true, + "sortBy": [{ "desc": true, "displayName": "Age" }] + }, + "title": "Crashing/Error/Not-Running Pods", + "type": "table", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "__name__": true, "condition": true, "instance": true, "job": true, "uid": true }, + "renameByName": { + "namespace": "Namespace", + "pod": "Pod", + "phase": "Pod Status", + "Value": "Age" + }, + "indexByName": { + "namespace": 0, + "pod": 1, + "phase": 2, + "Value": 3 + } + } + } + ], + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "max by (namespace, pod, phase) (((time() - kube_pod_created) * on(namespace, pod) group_left(phase) max by (namespace, pod, phase) (kube_pod_status_phase{phase=~\"Pending|Running|Failed|Unknown\"} == 1)) * on(namespace, pod) group_left() (max by (namespace, pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff|ErrImagePull|CreateContainerConfigError|CreateContainerError\"} == 1) or max by (namespace, pod) (kube_pod_container_status_terminated_reason{reason=~\"Error|OOMKilled|ContainerCannotRun\"} == 1) or max by (namespace, pod) (kube_pod_status_phase{phase=~\"Pending|Failed|Unknown\"} == 1) or max by (namespace, pod) (kube_pod_status_ready{condition=\"true\"} == 0)))", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "schemaVersion": 38, + "tags": ["disk", "storage", "node-exporter"], + "templating": { + "list": [ + 
{ + "current": {}, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "definition": "label_values(node_filesystem_size_bytes, instance)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "instance", + "options": [], + "query": { + "query": "label_values(node_filesystem_size_bytes, instance)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "sort": 1, + "label": "Instance", + "type": "query" + } + ] + }, + "time": { "from": "now-5m", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "Cluster Health Overview", + "uid": "cluster-health-overview", + "version": 1 + } diff --git a/sources/otel-lgtm-stack/v1.0.7/templates/grafana-externalsecret.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/grafana-externalsecret.yaml new file mode 100644 index 00000000..5c024f9b --- /dev/null +++ b/sources/otel-lgtm-stack/v1.0.7/templates/grafana-externalsecret.yaml @@ -0,0 +1,24 @@ +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: grafana-admin-credentials + namespace: {{ .Release.Namespace }} + annotations: + argocd.argoproj.io/hook: PreSync +spec: + refreshInterval: 1h + secretStoreRef: + kind: ClusterSecretStore + name: openbao-secret-store + target: + creationPolicy: Owner + name: grafana-admin-credentials + data: + - secretKey: username + remoteRef: + key: grafana-admin-user + property: value + - secretKey: password + remoteRef: + key: grafana-admin-password + property: value diff --git a/sources/otel-lgtm-stack/v1.0.7/templates/grafana-httproute.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/grafana-httproute.yaml new file mode 100644 index 00000000..408a0e29 --- /dev/null +++ b/sources/otel-lgtm-stack/v1.0.7/templates/grafana-httproute.yaml @@ -0,0 +1,31 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+# +# SPDX-License-Identifier: MIT + +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: grafana-route + namespace: {{ .Release.Namespace }} +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: https + namespace: kgateway-system + rules: + - backendRefs: + - group: "" + kind: Service + name: lgtm-stack + port: {{ .Values.services.lgtm.grafana }} + weight: 1 + matches: + - headers: + - name: Host + type: RegularExpression + value: grafana\..* + path: + type: PathPrefix + value: / diff --git a/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml index cb76e752..f0199fe4 100644 --- a/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml +++ b/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml @@ -250,6 +250,39 @@ spec: spec: serviceAccountName: grafana-sidecar automountServiceAccountToken: true + {{- if and .Values.dashboards.enabled .Values.dashboards.github.enabled (gt (len .Values.dashboards.github.urls) 0) }} + initContainers: + - name: grafana-github-dashboards + image: {{ .Values.dashboards.github.image | quote }} + imagePullPolicy: IfNotPresent + command: + - /bin/sh + - -ec + args: + - | + set -eu + target_dir="/tmp/dashboards/{{ .Values.dashboards.github.folder }}" + mkdir -p "$target_dir" + i=1 + {{- range $url := .Values.dashboards.github.urls }} + if curl -fsSL --connect-timeout 10 --max-time 60 {{ $url | quote }} -o "$target_dir/dashboard-$i.json"; then + echo "Downloaded dashboard $i" + else + echo "WARN: failed to download dashboard $i from {{ $url }}" + fi + i=$((i+1)) + {{- end }} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + seccompProfile: + type: RuntimeDefault + volumeMounts: + - name: sc-dashboard-volume + mountPath: /tmp/dashboards + {{- end }} containers: - name: grafana-sc-dashboard image: "quay.io/kiwigrid/k8s-sidecar:1.27.4" @@ -266,9 +299,15 @@ spec: - name: FOLDER_ANNOTATION value: "grafana_folder" - 
name: REQ_USERNAME - value: admin + valueFrom: + secretKeyRef: + name: grafana-admin-credentials + key: username - name: REQ_PASSWORD - value: admin + valueFrom: + secretKeyRef: + name: grafana-admin-credentials + key: password - name: REQ_URL value: http://localhost:3000/api/admin/provisioning/dashboards/reload - name: REQ_METHOD @@ -285,6 +324,21 @@ spec: mountPath: "/tmp/dashboards" - name: lgtm image: ghcr.io/silogen/docker-otel-lgtm:v1.0.7 + env: + - name: GF_AUTH_ANONYMOUS_ENABLED + value: "false" + - name: GF_AUTH_BASIC_ENABLED + value: "true" + - name: GF_SECURITY_ADMIN_USER + valueFrom: + secretKeyRef: + name: grafana-admin-credentials + key: username + - name: GF_SECURITY_ADMIN_PASSWORD + valueFrom: + secretKeyRef: + name: grafana-admin-credentials + key: password ports: - containerPort: 3000 - containerPort: 4317 diff --git a/sources/otel-lgtm-stack/v1.0.7/values.yaml b/sources/otel-lgtm-stack/v1.0.7/values.yaml index 6ea2fd2e..517b156e 100644 --- a/sources/otel-lgtm-stack/v1.0.7/values.yaml +++ b/sources/otel-lgtm-stack/v1.0.7/values.yaml @@ -69,6 +69,17 @@ services: # Component enablement dashboards: enabled: true + github: + enabled: true + folder: kubernetes + image: curlimages/curl:8.8.0 + urls: + - https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-system-api-server.json + - https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-system-coredns.json + - https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-global.json + - https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-namespaces.json + - https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-nodes.json + - https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-pods.json nodeExporter: enabled: true