AET-DevOps26 · imol-ai · Jun 25, 2026 · Jun 25, 2026 · Jun 26, 2026 · jschoedl
diff --git a/.github/workflows/deploy-k8s.yml b/.github/workflows/deploy-k8s.yml
@@ -106,6 +106,24 @@ jobs:
           kubectl apply -f infra/k8s/web-client/
           kubectl apply -f infra/k8s/ingress.yaml
 
+      # Create or update the Secret that grafana.yaml reads the admin password
+      # from. `create --dry-run=client -o yaml | apply` makes the step idempotent.
+      # NOTE(review): assumes the `monitoring` namespace already exists — confirm.
+      - name: Upsert Grafana secret
+        env:
+          # Pass the secret through the environment instead of expanding it into
+          # the script text, so quotes, `$` or backticks in the password cannot
+          # break or alter the shell command.
+          GRAFANA_ADMIN_PASSWORD: ${{ secrets.GRAFANA_ADMIN_PASSWORD }}
+        run: |
+          kubectl create secret generic grafana-secret -n monitoring \
+            --from-literal=admin-password="$GRAFANA_ADMIN_PASSWORD" \
+            --dry-run=client -o yaml | kubectl apply -f -
+
+      # Apply every manifest in the monitoring directory.
+      - name: Deploy monitoring
+        run: kubectl apply -f infra/k8s/monitoring/
+
       - name: Restart deployments to pull latest images
         if: github.event_name != 'push'
         run: |

diff --git a/README.md b/README.md
@@ -9,6 +9,8 @@ Azure deployment (Docker Compose): http://135.116.196.120/
 Coverage reports: https://aet-devops26.github.io/team-devsecops/
 
 API scheme (Swagger UI): https://devsecops.stud.k8s.aet.cit.tum.de/swagger-ui/index.html
+
+Monitoring (Grafana): https://devsecops.stud.k8s.aet.cit.tum.de/grafana
 ## Local development
 
 The full stack runs under Docker Compose with live-reload:

diff --git a/infra/k8s/monitoring/grafana-ingress.yaml b/infra/k8s/monitoring/grafana-ingress.yaml
@@ -0,0 +1,28 @@
+---
+# Exposes Grafana at https://devsecops.stud.k8s.aet.cit.tum.de/grafana/
+# Grafana is configured with GF_SERVER_SERVE_FROM_SUB_PATH=true so it handles
+# the /grafana/ prefix itself — no rewrite-target needed.
+#
+# This Ingress deliberately has no cert-manager annotation and no `tls:` block:
+# ingress.yaml already requests the certificate for this host, and a `tls:`
+# entry here would point at a secret that nothing in this change creates.
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: grafana-ingress
+  namespace: monitoring
+  annotations:
+    nginx.ingress.kubernetes.io/proxy-read-timeout: "60"
+spec:
+  ingressClassName: nginx
+  rules:
+    - host: devsecops.stud.k8s.aet.cit.tum.de
+      http:
+        paths:
+          - path: /grafana
+            pathType: Prefix
+            backend:
+              service:
+                name: grafana
+                port:
+                  number: 3000
diff --git a/infra/k8s/monitoring/grafana.yaml b/infra/k8s/monitoring/grafana.yaml
@@ -0,0 +1,106 @@
+---
+# Provisions Grafana's default datasource: the in-cluster Prometheus Service
+# on port 9090 (mounted at /etc/grafana/provisioning/datasources below).
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-datasources
+  namespace: monitoring
+data:
+  prometheus.yaml: |
+    apiVersion: 1
+    datasources:
+      - name: Prometheus
+        type: prometheus
+        url: http://prometheus.monitoring.svc.cluster.local:9090
+        isDefault: true
+        access: proxy
+---
+# Persistent storage for Grafana, mounted at /var/lib/grafana.
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: grafana-data
+  namespace: monitoring
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 1Gi
+---
+# Single Grafana instance served under the /grafana/ sub-path.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: grafana
+  namespace: monitoring
+spec:
+  replicas: 1
+  # The data volume is ReadWriteOnce, so the old pod must release it before its
+  # replacement starts. The default RollingUpdate starts the new pod first, which
+  # can stall the rollout or run two instances against the same data directory.
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: grafana
+  template:
+    metadata:
+      labels:
+        app: grafana
+    spec:
+      securityContext:
+        # NOTE(review): presumably the Grafana image's gid, making the PVC writable — confirm.
+        fsGroup: 472
+      containers:
+        - name: grafana
+          image: grafana/grafana:11.6.1
+          env:
+            # Admin password comes from the Secret upserted by the deploy workflow.
+            - name: GF_SECURITY_ADMIN_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  name: grafana-secret
+                  key: admin-password
+            # Grafana serves the /grafana/ prefix itself, so the Ingress needs no
+            # rewrite rule; the root URL must match the externally visible address.
+            - name: GF_SERVER_ROOT_URL
+              value: https://devsecops.stud.k8s.aet.cit.tum.de/grafana/
+            - name: GF_SERVER_SERVE_FROM_SUB_PATH
+              value: "true"
+          ports:
+            - containerPort: 3000
+          volumeMounts:
+            - name: datasources
+              mountPath: /etc/grafana/provisioning/datasources
+            - name: data
+              mountPath: /var/lib/grafana
+          resources:
+            requests:
+              cpu: 100m
+              memory: 128Mi
+            limits:
+              cpu: 500m
+              memory: 256Mi
+      volumes:
+        - name: datasources
+          configMap:
+            name: grafana-datasources
+        - name: data
+          persistentVolumeClaim:
+            claimName: grafana-data
+---
+# Service on port 3000 selecting the Grafana pods.
+apiVersion: v1
+kind: Service
+metadata:
+  name: grafana
+  namespace: monitoring
+spec:
+  selector:
+    app: grafana
+  ports:
+    - name: http
+      port: 3000
+      targetPort: 3000
diff --git a/infra/k8s/monitoring/prometheus.yaml b/infra/k8s/monitoring/prometheus.yaml
@@ -0,0 +1,114 @@
+---
+# Prometheus scrape configuration. The Spring API is scraped at
+# /actuator/prometheus; the two Python services set no metrics_path, so
+# Prometheus falls back to its default path for them.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: prometheus-config
+  namespace: monitoring
+data:
+  prometheus.yml: |
+    global:
+      scrape_interval: 15s
+      evaluation_interval: 15s
+
+    scrape_configs:
+      - job_name: spring-api
+        metrics_path: /actuator/prometheus
+        static_configs:
+          - targets: ['spring-api.app.svc.cluster.local:8080']
+            labels:
+              service: spring-api
+
+      - job_name: py-help-service
+        static_configs:
+          - targets: ['py-help-service.app.svc.cluster.local:8080']
+            labels:
+              service: py-help-service
+
+      - job_name: py-recipe-service
+        static_configs:
+          - targets: ['py-recipe-service.app.svc.cluster.local:8080']
+            labels:
+              service: py-recipe-service
+---
+# Persistent storage for the TSDB, mounted at /prometheus.
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: prometheus-data
+  namespace: monitoring
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 5Gi
+---
+# Single Prometheus instance with 7 days of retention.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: prometheus
+  namespace: monitoring
+spec:
+  replicas: 1
+  # The data volume is ReadWriteOnce, so the old pod must release it before its
+  # replacement starts. The default RollingUpdate starts the new pod first, which
+  # can stall the rollout or run two instances against the same data directory.
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: prometheus
+  template:
+    metadata:
+      labels:
+        app: prometheus
+    spec:
+      securityContext:
+        # NOTE(review): presumably the Prometheus image's gid, making the PVC writable — confirm.
+        fsGroup: 65534
+      containers:
+        - name: prometheus
+          image: prom/prometheus:v3.4.1
+          args:
+            - --config.file=/etc/prometheus/prometheus.yml
+            - --storage.tsdb.path=/prometheus
+            - --storage.tsdb.retention.time=7d
+          ports:
+            - containerPort: 9090
+          volumeMounts:
+            - name: config
+              mountPath: /etc/prometheus
+            - name: data
+              mountPath: /prometheus
+          resources:
+            requests:
+              cpu: 100m
+              memory: 256Mi
+            limits:
+              cpu: 500m
+              memory: 512Mi
+      volumes:
+        - name: config
+          configMap:
+            name: prometheus-config
+        - name: data
+          persistentVolumeClaim:
+            claimName: prometheus-data
+---
+# Service on port 9090; the Grafana datasource URL points at it.
+apiVersion: v1
+kind: Service
+metadata:
+  name: prometheus
+  namespace: monitoring
+spec:
+  selector:
+    app: prometheus
+  ports:
+    - name: http
+      port: 9090
+      targetPort: 9090
diff --git a/services/py-help-service/main.py b/services/py-help-service/main.py
@@ -10,6 +10,7 @@
 from langchain.chat_models import init_chat_model
 from langchain_core.messages import HumanMessage, SystemMessage
 
+from prometheus_fastapi_instrumentator import Instrumentator
 from client.cooking_assistant_gen_ai_services_api_internal_client.models.help_request_forwarded import (
 	HelpRequestForwarded,
 )
@@ -21,6 +22,7 @@
 load_dotenv()
 
 app = FastAPI(title="Cooking Assistant GenAI Service")
+Instrumentator().instrument(app).expose(app)  # instrument the app and expose its Prometheus metrics endpoint (presumably the library default /metrics — confirm)
 
 
 @app.exception_handler(HTTPException)

diff --git a/services/py-help-service/requirements.txt b/services/py-help-service/requirements.txt
@@ -8,3 +8,4 @@ pydantic==2.7.4
 python-dotenv==1.0.1
 attrs==23.2.0
 python-dateutil==2.9.0.post0
+prometheus-fastapi-instrumentator==7.1.0
diff --git a/services/py-recipe-service/main.py b/services/py-recipe-service/main.py
@@ -12,6 +12,7 @@
 from langchain.chat_models import init_chat_model
 from langchain_core.messages import HumanMessage, SystemMessage
 
+from prometheus_fastapi_instrumentator import Instrumentator
 from client.cooking_assistant_gen_ai_services_api_internal_client.models.recipe_request_forwarded import (
 	RecipeRequestForwarded,
 )
@@ -20,6 +21,7 @@
 load_dotenv()
 
 app = FastAPI(title="Cooking Assistant GenAI Service")
+Instrumentator().instrument(app).expose(app)  # instrument the app and expose its Prometheus metrics endpoint (presumably the library default /metrics — confirm)
 
 
 @app.exception_handler(HTTPException)

diff --git a/services/py-recipe-service/requirements.txt b/services/py-recipe-service/requirements.txt
@@ -8,3 +8,4 @@ pydantic==2.7.4
 python-dotenv==1.0.1
 attrs==23.2.0
 python-dateutil==2.9.0.post0
+prometheus-fastapi-instrumentator==7.1.0