From 5f2a6f317723747a03d4f70e72d21cddc1ccc956 Mon Sep 17 00:00:00 2001 From: Thiago Souza Date: Fri, 22 May 2026 19:17:34 +0000 Subject: [PATCH] build(compat): make Dockerfile.local go mod download resilient to proxy.golang.org HTTP/2 flakes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The three compatibility harnesses (prom/loki/tempo) all build cerberus from Dockerfile.local on every CI run. The `RUN go mod download` step has no retry logic and no module cache mount, so a single transient `proxy.golang.org` HTTP/2 `stream error ... INTERNAL_ERROR; received from peer` mid-stream takes the whole compat job down with it. Observed on PR #708 / run 26306912141, compatibility/loki job 77445902857: `go: github.com/grpc-ecosystem/grpc-gateway/v2@v2.29.0: read "https://proxy.golang.org/.../v2.29.0.zip": stream error; INTERNAL_ERROR; received from peer`. The mandate is no-retry-rerun — fix the underlying fragility instead of band-aiding. Two structural changes to Dockerfile.local: 1. Wrap `go mod download` in a 5-attempt retry loop with linear backoff (3/6/9/12s). The Go module resolver does not retry past a bad HTTP/2 frame, so the wrapper is needed at the shell layer. 2. Add BuildKit `--mount=type=cache` for /go/pkg/mod and /root/.cache/go-build (sharing=locked because the three compat harnesses build this Dockerfile in parallel on the same runner). With warm caches, subsequent builds only fetch modules that are not already cached, so exposure to transient proxy failures narrows to cold-cache builds and newly added or bumped dependencies. This fixes a class of flakes, not a single occurrence; the same outage would have hit prom or tempo if the unlucky frame had landed there first. 
--- Dockerfile.local | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/Dockerfile.local b/Dockerfile.local index 38affbf5..ee9614bc 100644 --- a/Dockerfile.local +++ b/Dockerfile.local @@ -8,9 +8,37 @@ FROM golang:1.26 AS build WORKDIR /src COPY go.mod go.sum ./ -RUN go mod download +# `go mod download` occasionally fails with a transient +# `proxy.golang.org` HTTP/2 `stream error ... INTERNAL_ERROR; received +# from peer` mid-stream (observed on compatibility/loki run +# 26306912141 against `grpc-ecosystem/grpc-gateway/v2`). The Go module +# resolver does not retry past that frame — it surfaces the error and +# exits non-zero, taking the whole compat job with it. Wrap the fetch +# in a bounded retry loop so a single bad HTTP/2 frame from the public +# proxy can't take down a compat run, and mount BuildKit caches for +# `/go/pkg/mod` + `/root/.cache/go-build` so warm runners skip the +# fetch entirely on subsequent builds. The `,sharing=locked` is +# required because the three compat harnesses (prom/loki/tempo) build +# this same Dockerfile in parallel on the same runner via separate +# `docker compose build` invocations. +RUN --mount=type=cache,target=/go/pkg/mod,sharing=locked \ + --mount=type=cache,target=/root/.cache/go-build,sharing=locked \ + set -eu; \ + for attempt in 1 2 3 4 5; do \ + if go mod download; then \ + break; \ + fi; \ + if [ "$attempt" = "5" ]; then \ + echo "go mod download failed after 5 attempts" >&2; \ + exit 1; \ + fi; \ + echo "go mod download attempt $attempt failed, retrying after backoff" >&2; \ + sleep $(( attempt * 3 )); \ + done COPY . . 
-RUN CGO_ENABLED=0 go build -trimpath -ldflags="-s -w -X main.Version=e2e" -o /out/cerberus ./cmd/cerberus +RUN --mount=type=cache,target=/go/pkg/mod,sharing=locked \ + --mount=type=cache,target=/root/.cache/go-build,sharing=locked \ + CGO_ENABLED=0 go build -trimpath -ldflags="-s -w -X main.Version=e2e" -o /out/cerberus ./cmd/cerberus FROM gcr.io/distroless/static-debian12:nonroot