From 60ef761c0110bca70a887b227162381678792f12 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Thu, 2 Apr 2026 17:41:58 -0400 Subject: [PATCH 1/3] fix: eliminate 502 errors during rolling deployments Add gateway-level retry, health checking, and disruption protection so clients never see errors when the API server is updated. Closes #559 Co-Authored-By: Claude Opus 4.6 (1M context) --- config/apiserver/deployment.yaml | 7 +++- config/apiserver/kustomization.yaml | 1 + config/apiserver/pdb.yaml | 9 +++++ .../gateway-api/backend-traffic-policy.yaml | 37 +++++++++++++++++++ .../components/gateway-api/kustomization.yaml | 1 + 5 files changed, 53 insertions(+), 2 deletions(-) create mode 100644 config/apiserver/pdb.yaml create mode 100644 config/components/gateway-api/backend-traffic-policy.yaml diff --git a/config/apiserver/deployment.yaml b/config/apiserver/deployment.yaml index e13dd69f..782c7da7 100644 --- a/config/apiserver/deployment.yaml +++ b/config/apiserver/deployment.yaml @@ -10,8 +10,8 @@ spec: app.kubernetes.io/part-of: milo-control-plane strategy: rollingUpdate: - maxSurge: 25% - maxUnavailable: 25% + maxSurge: 1 + maxUnavailable: 0 type: RollingUpdate template: metadata: @@ -83,6 +83,7 @@ spec: - --events-provider-timeout=$(EVENTS_PROVIDER_TIMEOUT) - --events-provider-retries=$(EVENTS_PROVIDER_RETRIES) - --events-forward-extras=$(EVENTS_FORWARD_EXTRAS) + - --shutdown-delay-duration=$(SHUTDOWN_DELAY_DURATION) env: # Feature gates configuration # Sessions and UserIdentities are GA (enabled by default) @@ -184,6 +185,8 @@ spec: value: "3" - name: EVENTS_FORWARD_EXTRAS value: "iam.miloapis.com/parent-api-group,iam.miloapis.com/parent-type,iam.miloapis.com/parent-name" + - name: SHUTDOWN_DELAY_DURATION + value: "10s" livenessProbe: failureThreshold: 3 httpGet: diff --git a/config/apiserver/kustomization.yaml b/config/apiserver/kustomization.yaml index a33121c3..e309f29e 100644 --- a/config/apiserver/kustomization.yaml +++ b/config/apiserver/kustomization.yaml @@ -3,3 +3,4 @@ kind: Kustomization resources: - deployment.yaml - service.yaml + - pdb.yaml diff --git a/config/apiserver/pdb.yaml b/config/apiserver/pdb.yaml new file mode 100644 index 00000000..4031993a --- /dev/null +++ b/config/apiserver/pdb.yaml @@ -0,0 +1,9 @@ +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: milo-apiserver +spec: + maxUnavailable: 1 + selector: + matchLabels: + app.kubernetes.io/name: milo-apiserver diff --git a/config/components/gateway-api/backend-traffic-policy.yaml b/config/components/gateway-api/backend-traffic-policy.yaml new file mode 100644 index 00000000..96bdf468 --- /dev/null +++ b/config/components/gateway-api/backend-traffic-policy.yaml @@ -0,0 +1,37 @@ +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: BackendTrafficPolicy +metadata: + name: milo-apiserver + namespace: milo-system +spec: + targetRefs: + - group: gateway.networking.k8s.io + kind: HTTPRoute + name: milo-apiserver + retry: + numRetries: 3 + retryOn: + triggers: + - gateway-error + - connect-failure + - reset + perRetry: + backOff: + baseInterval: 100ms + maxInterval: 1s + timeout: 2s + healthCheck: + active: + type: HTTP + http: + path: /readyz + interval: 5s + timeout: 3s + unhealthyThreshold: 2 + healthyThreshold: 1 + passive: + consecutive5XxErrors: 2 + consecutiveGatewayErrors: 1 + interval: 3s + baseEjectionTime: 15s + maxEjectionPercent: 33 diff --git a/config/components/gateway-api/kustomization.yaml b/config/components/gateway-api/kustomization.yaml index 02c1ab2f..befaa445 100644 --- a/config/components/gateway-api/kustomization.yaml +++ b/config/components/gateway-api/kustomization.yaml @@ -4,3 +4,4 @@ kind: Component resources: - httproute.yaml - backend-tls-policy.yaml + - backend-traffic-policy.yaml From 46f25f2905ddfc71914df338adfa6fe2cae77925 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Thu, 2 Apr 2026 17:43:38 -0400 Subject: [PATCH 2/3] fix: use percentage-based PDB and maxSurge to scale with HPA Co-Authored-By: Claude Opus 4.6 (1M context) --- config/apiserver/deployment.yaml | 2 +- config/apiserver/pdb.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/config/apiserver/deployment.yaml b/config/apiserver/deployment.yaml index 782c7da7..25ccd554 100644 --- a/config/apiserver/deployment.yaml +++ b/config/apiserver/deployment.yaml @@ -10,7 +10,7 @@ spec: app.kubernetes.io/part-of: milo-control-plane strategy: rollingUpdate: - maxSurge: 1 + maxSurge: 25% maxUnavailable: 0 type: RollingUpdate template: diff --git a/config/apiserver/pdb.yaml b/config/apiserver/pdb.yaml index 4031993a..4a32a8df 100644 --- a/config/apiserver/pdb.yaml +++ b/config/apiserver/pdb.yaml @@ -3,7 +3,7 @@ kind: PodDisruptionBudget metadata: name: milo-apiserver spec: - maxUnavailable: 1 + maxUnavailable: 20% selector: matchLabels: app.kubernetes.io/name: milo-apiserver From 86e004f73c216df7b8366e4b62b69998f5b5981a Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Fri, 10 Apr 2026 13:53:12 -0500 Subject: [PATCH 3/3] fix: increase apiserver resource limits to prevent OOMKill The apiserver was being OOMKilled at 512Mi during e2e tests when multiple chainsaw tests run concurrently, causing cascading failures. Bump memory limit to 1Gi and CPU limit to 1 core, with requests raised proportionally to 512Mi memory and 200m CPU. Co-Authored-By: Claude Opus 4.6 (1M context) --- config/apiserver/deployment.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/config/apiserver/deployment.yaml b/config/apiserver/deployment.yaml index 25ccd554..f8de2057 100644 --- a/config/apiserver/deployment.yaml +++ b/config/apiserver/deployment.yaml @@ -214,11 +214,11 @@ spec: timeoutSeconds: 15 resources: requests: - cpu: 100m - memory: 128Mi - limits: - cpu: 500m + cpu: 200m memory: 512Mi + limits: + cpu: "1" + memory: 1Gi startupProbe: failureThreshold: 30 httpGet: