From 60ef761c0110bca70a887b227162381678792f12 Mon Sep 17 00:00:00 2001
From: Scot Wells <wells.scot@gmail.com>
Date: Thu, 2 Apr 2026 17:41:58 -0400
Subject: [PATCH 1/3] fix: eliminate 502 errors during rolling deployments

Add gateway-level retries, active and passive health checks, a
PodDisruptionBudget, a zero-unavailable rollout strategy, and an
API server shutdown delay so clients do not see 502s while the API
server is being updated.

Closes #559

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 config/apiserver/deployment.yaml              |  7 +++-
 config/apiserver/kustomization.yaml           |  1 +
 config/apiserver/pdb.yaml                     |  9 +++++
 .../gateway-api/backend-traffic-policy.yaml   | 37 +++++++++++++++++++
 .../components/gateway-api/kustomization.yaml |  1 +
 5 files changed, 53 insertions(+), 2 deletions(-)
 create mode 100644 config/apiserver/pdb.yaml
 create mode 100644 config/components/gateway-api/backend-traffic-policy.yaml

diff --git a/config/apiserver/deployment.yaml b/config/apiserver/deployment.yaml
index e13dd69f..782c7da7 100644
--- a/config/apiserver/deployment.yaml
+++ b/config/apiserver/deployment.yaml
@@ -10,8 +10,8 @@ spec:
       app.kubernetes.io/part-of: milo-control-plane
   strategy:
     rollingUpdate:
-      maxSurge: 25%
-      maxUnavailable: 25%
+      maxSurge: 1
+      maxUnavailable: 0
     type: RollingUpdate
   template:
     metadata:
@@ -83,6 +83,7 @@ spec:
           - --events-provider-timeout=$(EVENTS_PROVIDER_TIMEOUT)
           - --events-provider-retries=$(EVENTS_PROVIDER_RETRIES)
           - --events-forward-extras=$(EVENTS_FORWARD_EXTRAS)
+          - --shutdown-delay-duration=$(SHUTDOWN_DELAY_DURATION)
         env:
           # Feature gates configuration
           # Sessions and UserIdentities are GA (enabled by default)
@@ -184,6 +185,8 @@ spec:
             value: "3"
           - name: EVENTS_FORWARD_EXTRAS
             value: "iam.miloapis.com/parent-api-group,iam.miloapis.com/parent-type,iam.miloapis.com/parent-name"
+          - name: SHUTDOWN_DELAY_DURATION
+            value: "10s"
         livenessProbe:
           failureThreshold: 3
           httpGet:
diff --git a/config/apiserver/kustomization.yaml b/config/apiserver/kustomization.yaml
index a33121c3..e309f29e 100644
--- a/config/apiserver/kustomization.yaml
+++ b/config/apiserver/kustomization.yaml
@@ -3,3 +3,4 @@ kind: Kustomization
 resources:
   - deployment.yaml
   - service.yaml
+  - pdb.yaml
diff --git a/config/apiserver/pdb.yaml b/config/apiserver/pdb.yaml
new file mode 100644
index 00000000..4031993a
--- /dev/null
+++ b/config/apiserver/pdb.yaml
@@ -0,0 +1,9 @@
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  name: milo-apiserver
+spec:
+  maxUnavailable: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: milo-apiserver
diff --git a/config/components/gateway-api/backend-traffic-policy.yaml b/config/components/gateway-api/backend-traffic-policy.yaml
new file mode 100644
index 00000000..96bdf468
--- /dev/null
+++ b/config/components/gateway-api/backend-traffic-policy.yaml
@@ -0,0 +1,37 @@
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: BackendTrafficPolicy
+metadata:
+  name: milo-apiserver
+  namespace: milo-system
+spec:
+  targetRefs:
+    - group: gateway.networking.k8s.io
+      kind: HTTPRoute
+      name: milo-apiserver
+  retry:
+    numRetries: 3
+    retryOn:
+      triggers:
+        - gateway-error
+        - connect-failure
+        - reset
+    perRetry:
+      backOff:
+        baseInterval: 100ms
+        maxInterval: 1s
+      timeout: 2s
+  healthCheck:
+    active:
+      type: HTTP
+      http:
+        path: /readyz
+      interval: 5s
+      timeout: 3s
+      unhealthyThreshold: 2
+      healthyThreshold: 1
+    passive:
+      consecutive5XxErrors: 2
+      consecutiveGatewayErrors: 1
+      interval: 3s
+      baseEjectionTime: 15s
+      maxEjectionPercent: 33
diff --git a/config/components/gateway-api/kustomization.yaml b/config/components/gateway-api/kustomization.yaml
index 02c1ab2f..befaa445 100644
--- a/config/components/gateway-api/kustomization.yaml
+++ b/config/components/gateway-api/kustomization.yaml
@@ -4,3 +4,4 @@ kind: Component
 resources:
   - httproute.yaml
   - backend-tls-policy.yaml
+  - backend-traffic-policy.yaml

From 46f25f2905ddfc71914df338adfa6fe2cae77925 Mon Sep 17 00:00:00 2001
From: Scot Wells <wells.scot@gmail.com>
Date: Thu, 2 Apr 2026 17:43:38 -0400
Subject: [PATCH 2/3] fix: use percentage-based PDB and maxSurge to scale with
 HPA

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 config/apiserver/deployment.yaml | 2 +-
 config/apiserver/pdb.yaml        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/config/apiserver/deployment.yaml b/config/apiserver/deployment.yaml
index 782c7da7..25ccd554 100644
--- a/config/apiserver/deployment.yaml
+++ b/config/apiserver/deployment.yaml
@@ -10,7 +10,7 @@ spec:
       app.kubernetes.io/part-of: milo-control-plane
   strategy:
     rollingUpdate:
-      maxSurge: 1
+      maxSurge: 25%
       maxUnavailable: 0
     type: RollingUpdate
   template:
diff --git a/config/apiserver/pdb.yaml b/config/apiserver/pdb.yaml
index 4031993a..4a32a8df 100644
--- a/config/apiserver/pdb.yaml
+++ b/config/apiserver/pdb.yaml
@@ -3,7 +3,7 @@ kind: PodDisruptionBudget
 metadata:
   name: milo-apiserver
 spec:
-  maxUnavailable: 1
+  maxUnavailable: 20%
   selector:
     matchLabels:
       app.kubernetes.io/name: milo-apiserver

From 86e004f73c216df7b8366e4b62b69998f5b5981a Mon Sep 17 00:00:00 2001
From: Scot Wells <wells.scot@gmail.com>
Date: Fri, 10 Apr 2026 13:53:12 -0500
Subject: [PATCH 3/3] fix: increase apiserver resource limits to prevent
 OOMKill

The apiserver was being OOMKilled at 512Mi during e2e tests when
multiple chainsaw tests ran concurrently, causing cascading failures.

Bump the memory limit from 512Mi to 1Gi and the CPU limit from 500m
to 1 core. Raise the requests from 128Mi to 512Mi memory and from
100m to 200m CPU.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 config/apiserver/deployment.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/config/apiserver/deployment.yaml b/config/apiserver/deployment.yaml
index 25ccd554..f8de2057 100644
--- a/config/apiserver/deployment.yaml
+++ b/config/apiserver/deployment.yaml
@@ -214,11 +214,11 @@ spec:
           timeoutSeconds: 15
         resources:
           requests:
-            cpu: 100m
-            memory: 128Mi
-          limits:
-            cpu: 500m
+            cpu: 200m
             memory: 512Mi
+          limits:
+            cpu: "1"
+            memory: 1Gi
         startupProbe:
           failureThreshold: 30
           httpGet: