From 5a72e8b46577d6005b3359e6c8f80e5e333bad7d Mon Sep 17 00:00:00 2001
From: Sujan Rao <sujan@nvidia.com>
Date: Fri, 26 Jun 2026 10:58:42 -0400
Subject: [PATCH 1/2] feat(ci): add testgrid-publish workflow (TG5)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wires tools/testgrid-publish into CI so every completed UAT run
(GCP or AWS) automatically publishes to the AICR TestGrid dashboard.

Trigger: workflow_run on "UAT - GCP" / "UAT - AWS" (success or failure).
Also supports workflow_dispatch (main branch only) for backfills.

Pipeline:
  1. Validate producer run is from this repository (pre-credential guard)
  2. Download evidence pointer artifact; distinguish network errors from
     benign "conformance did not complete" skips
  3. Extract OCI bundle ref + sha256 digest from pointer.yaml; validate
     both fields and enforce ghcr.io/nvidia/...@sha256:<64hex> format
     before writing to GITHUB_OUTPUT (guards against newline injection)
  4. Authenticate to GCP via WIF (publish SA, objectCreator on groups/ only)
  5. Login to GHCR (packages: read) so testgrid-publish can pull the bundle
  6. Build testgrid-publish from source (vendor — no integrity risk from
     downloading a release artifact into a privileged WIF context)
  7. Publish: pull OCI bundle → parse predicate → write GCS in order

Security hardening:
  - Top-level permissions: contents: read (least privilege default)
  - Job-level: contents:read + actions:read + packages:read + id-token:write
  - No project IDs/numbers in source — all from vars.GCP_PROJECT_NUMBER
    and vars.GCP_PROJECT_ID (repo variables, not secrets)
  - All actions pinned to SHA
  - All user-controlled values through env: (no inline ${{ }} in run:)
  - set -euo pipefail in every shell script
  - workflow_dispatch restricted to refs/heads/main
  - workflow_dispatch inputs use type: choice / type: string to prevent
    typos silently deriving non-existent WIF pools
  - BUNDLE_REF validated against ghcr.io/nvidia/...@sha256:<64hex> regex
  - concurrency block (one publish per triggering run, no cancel)
  - timeout-minutes: 15
  - Release branches intentionally allowed (qualify release → publish)

Environment-aware: defaults to prod (gs://aicr-testgrid). Override to
staging via workflow_dispatch(environment=staging).

Pending before this can merge:
  - TG2 (#1447) must be merged first (tools/testgrid-publish must exist)
  - Apply prod Terraform in aicr-testgrid (creates aicr-testgrid-prod-github
    WIF pool and aicr-testgrid-prod-publish SA)
  - Set vars.GCP_PROJECT_NUMBER and vars.GCP_PROJECT_ID in repo settings
---
 .github/workflows/testgrid-publish.yml | 295 +++++++++++++++++++++++++
 1 file changed, 295 insertions(+)
 create mode 100644 .github/workflows/testgrid-publish.yml

diff --git a/.github/workflows/testgrid-publish.yml b/.github/workflows/testgrid-publish.yml
new file mode 100644
index 000000000..865092bac
--- /dev/null
+++ b/.github/workflows/testgrid-publish.yml
@@ -0,0 +1,295 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# TestGrid Publish (TG5)
+#
+# Publishes UAT evidence bundles to the AICR TestGrid dashboard after
+# every completed cluster run (GCP or AWS, success or failure).
+#
+# Pipeline:
+#   1. Validate the producer run is from this repository
+#   2. Download the evidence pointer artifact from the triggering UAT run
+#   3. Extract the OCI bundle ref + digest from pointer.yaml
+#   4. Authenticate to GCP via WIF (publish SA, groups/ prefix write-only)
+#   5. Build testgrid-publish from source
+#   6. Pull the OCI bundle and write started/junit/finished to GCS
+#
+# Auth:
+#   WIF pool:  aicr-testgrid[-<env>]-github   (aicr-testgrid Terraform)
+#   SA:        aicr-testgrid[-<env>]-publish@<project>
+#   Scope:     objectCreator on gs://aicr-testgrid[-<env>]/groups/ prefix only
+#
+# Configuration (GitHub repo variables — no secrets in source):
+#   vars.GCP_PROJECT_NUMBER   GCP project number (for WIF provider resource name)
+#   vars.GCP_PROJECT_ID       GCP project ID (for SA email)
+#
+# The workflow filename must start with "testgrid-publish" — the WIF binding's
+# attribute_condition in wif-publish.tf restricts impersonation to workflows
+# matching this prefix.
+
+name: TestGrid Publish
+
+# Least-privilege default — job elevates only what it needs.
+permissions:
+  contents: read
+
+on:
+  workflow_run:
+    # Must match the exact `name:` field in uat-gcp.yaml / uat-aws.yaml.
+    workflows:
+      - "UAT - GCP"
+      - "UAT - AWS"
+    types:
+      - completed
+  # Allow manual dispatch for testing and backfills.
+  # Restricted to main — WIF attribute_condition gates on workflow filename
+  # but not branch; manual dispatch from a feature branch with modified
+  # workflow code could otherwise obtain GCS write credentials.
+  workflow_dispatch:
+    inputs:
+      bundle_ref:
+        description: "OCI bundle ref (ghcr.io/nvidia/...@sha256:...)"
+        type: string
+        required: true
+      source_class:
+        description: "Source class"
+        type: choice
+        default: "uat"
+        options:
+          - uat
+          - community
+      environment:
+        description: "Target environment"
+        type: choice
+        default: "prod"
+        options:
+          - prod
+          - staging
+
+concurrency:
+  # One publish per triggering run — prevents duplicate columns if the
+  # workflow is re-run manually while a prior publish is still in flight.
+  group: testgrid-publish-${{ github.event.workflow_run.id || github.run_id }}
+  cancel-in-progress: false  # never cancel an in-flight GCS write
+
+jobs:
+  publish:
+    name: Publish to TestGrid
+    runs-on: ubuntu-latest
+    timeout-minutes: 15  # Go build ~2 min + gcloud uploads ~3 min; 15 min is generous
+
+    # For workflow_run: run whether UAT succeeded or failed —
+    # a failure run is a valid TestGrid column showing what broke.
+    # For workflow_dispatch: restrict to main to prevent feature-branch
+    # code from obtaining WIF credentials (see comment on trigger above).
+    if: >
+      (github.event_name == 'workflow_dispatch' && github.ref == 'refs/heads/main') ||
+      github.event.workflow_run.conclusion == 'success' ||
+      github.event.workflow_run.conclusion == 'failure'
+
+    permissions:
+      contents: read
+      actions: read    # required to download artifacts from the triggering run
+      packages: read   # required to pull the evidence OCI bundle from ghcr.io
+      id-token: write  # required for WIF token exchange
+
+    env:
+      TG_SOURCE_CLASS: ${{ inputs.source_class || 'uat' }}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0  # v7.0.0
+        with:
+          persist-credentials: false
+
+      # Guard: reject workflow_run events from forks or unexpected branches
+      # before any credential exchange. WIF is gated by the attribute_condition
+      # in wif-publish.tf, but an explicit check here fails fast and produces
+      # a clear error message rather than a cryptic WIF rejection.
+      - name: Validate producer
+        if: github.event_name == 'workflow_run'
+        env:
+          PRODUCER_REPO: ${{ github.event.workflow_run.repository.full_name }}
+          PRODUCER_BRANCH: ${{ github.event.workflow_run.head_branch }}
+        run: |
+          set -euo pipefail
+          if [ "${PRODUCER_REPO}" != "${GITHUB_REPOSITORY}" ]; then
+            echo "::error::Producer workflow is from '${PRODUCER_REPO}', expected '${GITHUB_REPOSITORY}' — rejecting"
+            exit 1
+          fi
+          # Release branches (e.g. release/v1.x) intentionally publish —
+          # their UAT runs qualify the release and belong in TestGrid.
+          if [ "${PRODUCER_BRANCH}" != "main" ]; then
+            echo "::notice::Producer ran on branch '${PRODUCER_BRANCH}' — publishing (release branches are intentional)"
+          fi
+
+      # Resolve environment-specific GCP resource names.
+      # prod  → aicr-testgrid          (no env suffix — matches Terraform cluster_name)
+      # other → aicr-testgrid-<env>
+      - name: Resolve GCP config
+        id: gcp
+        env:
+          TG_ENV: ${{ inputs.environment || 'prod' }}
+          GCP_PROJECT_NUMBER: ${{ vars.GCP_PROJECT_NUMBER }}
+          GCP_PROJECT_ID: ${{ vars.GCP_PROJECT_ID }}
+        run: |
+          set -euo pipefail
+          if [ "${TG_ENV}" = "prod" ]; then
+            CLUSTER="aicr-testgrid"
+          else
+            CLUSTER="aicr-testgrid-${TG_ENV}"
+          fi
+          echo "cluster=${CLUSTER}" >> "${GITHUB_OUTPUT}"
+          echo "bucket=${CLUSTER}" >> "${GITHUB_OUTPUT}"
+          echo "wif_provider=projects/${GCP_PROJECT_NUMBER}/locations/global/workloadIdentityPools/${CLUSTER}-github/providers/github-actions" >> "${GITHUB_OUTPUT}"
+          echo "wif_sa=${CLUSTER}-publish@${GCP_PROJECT_ID}.iam.gserviceaccount.com" >> "${GITHUB_OUTPUT}"
+
+      - name: Load versions
+        id: versions
+        uses: ./.github/actions/load-versions
+
+      # Install yq (used to parse pointer.yaml) and Go using the repo's
+      # shared composite action — keeps tool versions in sync with .settings.yaml.
+      - name: Set up build tools
+        uses: ./.github/actions/setup-build-tools
+        with:
+          install_yq: 'true'
+          yq_version: ${{ steps.versions.outputs.yq }}
+
+      - name: Set up Go
+        uses: actions/setup-go@d60b41a563a30eb6578e24a53e1f85c8e82d1623  # v6.4.0
+        with:
+          go-version: ${{ steps.versions.outputs.go }}
+          cache: false
+
+      # Download the evidence pointer artifact from the triggering UAT run.
+      # The pointer is only uploaded when conformance succeeds — on failure runs
+      # with no pointer artifact the download action exits non-zero; we capture
+      # the outcome and distinguish "not found" (benign) from network errors.
+      - name: Download evidence pointer
+        id: download-pointer
+        if: github.event_name == 'workflow_run'
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c  # v8.0.1
+        with:
+          github-token: ${{ github.token }}
+          run-id: ${{ github.event.workflow_run.id }}
+          pattern: "*-evidence-pointer-${{ github.event.workflow_run.id }}"
+          merge-multiple: true
+          path: evidence-pointer/
+        continue-on-error: true  # outcome checked explicitly in Resolve bundle ref
+
+      # Extract the OCI ref + digest from pointer.yaml.
+      # For workflow_dispatch the bundle_ref input is used directly.
+      # All user-controlled values flow through env — never inline ${{ }}
+      # in shell scripts to prevent script injection.
+      #
+      # pointer.yaml schema:
+      #   attestations:
+      #     - bundle:
+      #         oci: ghcr.io/nvidia/aicr-uat-gke-h100-training-<run_id>
+      #         digest: sha256:abc123...
+      - name: Resolve bundle ref
+        id: bundle
+        env:
+          EVENT_NAME: ${{ github.event_name }}
+          DISPATCH_BUNDLE_REF: ${{ inputs.bundle_ref }}
+          DOWNLOAD_POINTER_OUTCOME: ${{ steps.download-pointer.outcome }}
+          UAT_CONCLUSION: ${{ github.event.workflow_run.conclusion }}
+        run: |
+          set -euo pipefail
+          if [ "${EVENT_NAME}" = "workflow_dispatch" ]; then
+            BUNDLE_REF="${DISPATCH_BUNDLE_REF}"
+          else
+            # If the download step failed, distinguish "artifact not found"
+            # (conformance did not complete — benign skip) from a real error
+            # (network failure — should not silently drop the publish).
+            POINTER="$(find evidence-pointer/ -name 'pointer.yaml' 2>/dev/null | head -1)"
+            DOWNLOAD_OUTCOME="${DOWNLOAD_POINTER_OUTCOME}"
+            if [ -z "${POINTER}" ]; then
+              if [ "${DOWNLOAD_OUTCOME}" = "failure" ] && [ "${UAT_CONCLUSION}" = "success" ]; then
+                echo "::error::UAT succeeded but evidence pointer download failed — possible network error"
+                exit 1
+              fi
+              echo "::notice::No evidence pointer found (conformance did not complete) — skipping TestGrid publish"
+              echo "skip=true" >> "${GITHUB_OUTPUT}"
+              exit 0
+            fi
+            OCI=$(yq '.attestations[0].bundle.oci' "${POINTER}")
+            DIGEST=$(yq '.attestations[0].bundle.digest' "${POINTER}")
+            if [ -z "${OCI}" ] || [ "${OCI}" = "null" ]; then
+              echo "::error::Could not extract OCI ref from ${POINTER}"
+              cat "${POINTER}"
+              exit 1
+            fi
+            if [ -z "${DIGEST}" ] || [ "${DIGEST}" = "null" ]; then
+              echo "::error::Could not extract digest from ${POINTER}"
+              cat "${POINTER}"
+              exit 1
+            fi
+            BUNDLE_REF="${OCI}@${DIGEST}"
+          fi
+          # Validate format: ghcr.io/nvidia/...@sha256:<64 hex chars>
+          # Also guards against newline injection into GITHUB_OUTPUT.
+          if ! echo "${BUNDLE_REF}" | grep -qE '^ghcr\.io/nvidia/[^@]+@sha256:[0-9a-f]{64}$'; then
+            echo "::error::BUNDLE_REF '${BUNDLE_REF}' does not match expected format ghcr.io/nvidia/...@sha256:<64hex>"
+            exit 1
+          fi
+          echo "bundle_ref=${BUNDLE_REF}" >> "${GITHUB_OUTPUT}"
+          echo "skip=false" >> "${GITHUB_OUTPUT}"
+          echo "Resolved bundle: ${BUNDLE_REF}"
+
+      # Authenticate to GCP via WIF using the testgrid publish SA.
+      # This SA has objectCreator on gs://aicr-testgrid[-<env>]/groups/ only.
+      - name: Authenticate to GCP
+        if: steps.bundle.outputs.skip != 'true'
+        id: gcp-auth
+        uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093  # v3.0.0
+        with:
+          workload_identity_provider: ${{ steps.gcp.outputs.wif_provider }}
+          service_account: ${{ steps.gcp.outputs.wif_sa }}
+
+      - name: Set up gcloud
+        if: steps.bundle.outputs.skip != 'true'
+        uses: google-github-actions/setup-gcloud@aa5489c8933f4cc7a4f7d45035b3b1440c9c10db  # v3.0.1
+
+      # Authenticate to GHCR so testgrid-publish can pull the evidence OCI
+      # bundle. The packages: read permission above gates the token scope.
+      - name: Login to GHCR
+        if: steps.bundle.outputs.skip != 'true'
+        uses: ./.github/actions/ghcr-login
+
+      # Always build from source — ensures the binary matches the current
+      # commit and avoids integrity risks from downloading a release artifact
+      # into a privileged WIF context. goreleaser packages testgrid-publish
+      # for local / manual use; the workflow always builds from vendor.
+      - name: Build testgrid-publish
+        if: steps.bundle.outputs.skip != 'true'
+        run: |
+          set -euo pipefail
+          GOFLAGS="-mod=vendor" go build \
+            -o /tmp/testgrid-publish ./tools/testgrid-publish
+
+      # Pass all user-controlled values through env — never inline ${{ }}
+      # expressions in run: scripts to prevent script injection.
+      - name: Publish to TestGrid
+        if: steps.bundle.outputs.skip != 'true'
+        env:
+          BUNDLE_REF: ${{ steps.bundle.outputs.bundle_ref }}
+          TG_BUCKET: ${{ steps.gcp.outputs.bucket }}
+        run: |
+          set -euo pipefail
+          /tmp/testgrid-publish \
+            --bundle "${BUNDLE_REF}" \
+            --bucket "${TG_BUCKET}" \
+            --source-class "${TG_SOURCE_CLASS}"

From 3f348edecb966418a2545a84ed8a6f105b3614f8 Mon Sep 17 00:00:00 2001
From: Sujan Rao <sujan@nvidia.com>
Date: Thu, 2 Jul 2026 13:51:44 -0400
Subject: [PATCH 2/2] fix(ci): retarget TG5 trigger to uat-run.yaml, correct
 WIF trust docs

workflow_run watched "UAT - GCP"/"UAT - AWS", which are workflow_call-only
reusable workflows and never emit workflow_run events - the trigger would
never have fired. Retarget to "UAT Run" (uat-run.yaml), the only UAT
workflow that always executes top-level.

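Also:
- correct the workflow_dispatch trust-model comment: the real enforcement
  is the publish SA's IAM binding (principalSet pinned to
  testgrid-publish.yml@refs/heads/main in wif-publish.tf), not the in-file
  branch check
- surface a warning instead of silently skipping when the evidence-pointer
  download fails on a non-success UAT conclusion
- include environment in the workflow_dispatch concurrency key so staging
  and prod publishes of the same bundle don't serialize behind each other
- gate the job on vars.TESTGRID_PUBLISH_ENABLED == 'true' so the workflow
  is skipped (not failed) until the prod WIF pool/SA exist
- run on any completed UAT conclusion instead of only success/failure
- restrict workflow_run publishing to main and release/* producer branches
  (previously any branch published with a notice)
- relax the bundle_ref namespace check from ghcr.io/nvidia/ to any
  ghcr.io/<org>/, and validate with bash [[ =~ ]] instead of grep so an
  embedded newline cannot reach GITHUB_OUTPUT
- fail fast when vars.GCP_PROJECT_NUMBER / vars.GCP_PROJECT_ID are unset
- bump actions/setup-go to v6.5.0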
---
 .github/workflows/testgrid-publish.yml | 192 +++++++++++++++++++------
 1 file changed, 149 insertions(+), 43 deletions(-)

diff --git a/.github/workflows/testgrid-publish.yml b/.github/workflows/testgrid-publish.yml
index 865092bac..f3ee8bf97 100644
--- a/.github/workflows/testgrid-publish.yml
+++ b/.github/workflows/testgrid-publish.yml
@@ -15,24 +15,35 @@
 # TestGrid Publish (TG5)
 #
 # Publishes UAT evidence bundles to the AICR TestGrid dashboard after
-# every completed cluster run (GCP or AWS, success or failure).
+# every completed UAT run (uat-run.yaml, which wraps the GCP or AWS
+# pipeline — any conclusion; a failure or cancelled run that never
+# reached conformance simply has nothing to publish and no-ops cleanly).
+#
+# Disabled by default — gated on vars.TESTGRID_PUBLISH_ENABLED == 'true' —
+# until the prod WIF pool/SA exist (see aicr-testgrid Terraform apply).
 #
 # Pipeline:
-#   1. Validate the producer run is from this repository
+#   1. Validate the producer run is from this repository and an allowed branch
 #   2. Download the evidence pointer artifact from the triggering UAT run
-#   3. Extract the OCI bundle ref + digest from pointer.yaml
+#   3. Extract + validate the OCI bundle ref + digest from pointer.yaml
 #   4. Authenticate to GCP via WIF (publish SA, groups/ prefix write-only)
-#   5. Build testgrid-publish from source
-#   6. Pull the OCI bundle and write started/junit/finished to GCS
+#   5. Log in to GHCR to pull the evidence OCI bundle
+#   6. Build testgrid-publish from source
+#   7. Pull the OCI bundle and write started/junit/finished to GCS
 #
 # Auth:
 #   WIF pool:  aicr-testgrid[-<env>]-github   (aicr-testgrid Terraform)
 #   SA:        aicr-testgrid[-<env>]-publish@<project>
 #   Scope:     objectCreator on gs://aicr-testgrid[-<env>]/groups/ prefix only
+#   For prod, <env> is empty (cluster_name = "aicr-testgrid", unsuffixed);
+#   for staging, <env> = "staging". This mirrors terraform/variables.tf and
+#   wif-publish.tf in the aicr-testgrid repo exactly — verify there before
+#   changing the naming logic in "Resolve GCP config" below.
 #
 # Configuration (GitHub repo variables — no secrets in source):
-#   vars.GCP_PROJECT_NUMBER   GCP project number (for WIF provider resource name)
-#   vars.GCP_PROJECT_ID       GCP project ID (for SA email)
+#   vars.TESTGRID_PUBLISH_ENABLED   'true' to enable (see gate above)
+#   vars.GCP_PROJECT_NUMBER         GCP project number (WIF provider resource name)
+#   vars.GCP_PROJECT_ID             GCP project ID (SA email)
 #
 # The workflow filename must start with "testgrid-publish" — the WIF binding's
 # attribute_condition in wif-publish.tf restricts impersonation to workflows
@@ -46,20 +57,51 @@ permissions:
 
 on:
   workflow_run:
-    # Must match the exact `name:` field in uat-gcp.yaml / uat-aws.yaml.
+    # "UAT Run" (uat-run.yaml) is the only UAT workflow that always executes
+    # top-level: both a human (workflow_dispatch) and the nightly/daytime
+    # controllers trigger it via `gh workflow run` (a real dispatch, not
+    # `uses:`) specifically to keep it top-level — see the "Dispatch (not
+    # workflow_call)" comments in uat-nightly-batch.yaml / uat-daytime.yaml.
+    # uat-gcp.yaml / uat-aws.yaml are workflow_call-only reusable workflows
+    # invoked exclusively via `uses:` from uat-run.yaml's run-gcp/run-aws
+    # jobs — GitHub never emits a workflow_run event for a workflow that
+    # only ever executes as a workflow_call target, so watching them
+    # directly (as this file previously did) never fires.
+    #
+    # Jobs in a called reusable workflow share the caller's github.run_id,
+    # so github.event.workflow_run.id below still correctly matches the
+    # ${{ github.run_id }}-suffixed evidence-pointer artifact names
+    # uploaded from inside uat-gcp.yaml / uat-aws.yaml, and
+    # github.event.workflow_run.conclusion still reflects the real UAT
+    # outcome (run-gcp/run-aws call the reusable workflows with no
+    # continue-on-error, so a conformance failure propagates to
+    # uat-run.yaml's own conclusion).
     workflows:
-      - "UAT - GCP"
-      - "UAT - AWS"
+      - "UAT Run"
     types:
       - completed
   # Allow manual dispatch for testing and backfills.
-  # Restricted to main — WIF attribute_condition gates on workflow filename
-  # but not branch; manual dispatch from a feature branch with modified
-  # workflow code could otherwise obtain GCS write credentials.
+  #
+  # Trust model: the real enforcement is at the WIF layer, not the
+  # github.ref == 'refs/heads/main' check below (that check is just a
+  # fast, readable fail-closed guard, evaluated from whatever copy of this
+  # file was dispatched — it cannot be the actual security boundary).
+  # The publish SA's IAM binding (aicr-testgrid's wif-publish.tf,
+  # google_service_account_iam_member.publish_wif) grants
+  # roles/iam.workloadIdentityUser via a principalSet pinned to
+  # attribute.workflow_ref == ".../testgrid-publish.yml@refs/heads/main".
+  # GitHub's OIDC token embeds the actual ref the workflow file was loaded
+  # from as job_workflow_ref, so dispatching a modified copy of this file
+  # from any other branch produces a token that does not match the
+  # principalSet — WIF impersonation is rejected at the GCP layer
+  # regardless of what the dispatched copy's own if: checks say.
+  #
+  # bundle_ref is validated for format (ghcr.io/<org>/...@sha256:<64hex>)
+  # but not provenance/signature before the tool pulls and parses it.
   workflow_dispatch:
     inputs:
       bundle_ref:
-        description: "OCI bundle ref (ghcr.io/nvidia/...@sha256:...)"
+        description: "OCI bundle ref (ghcr.io/<org>/...@sha256:...)"
         type: string
         required: true
       source_class:
@@ -78,9 +120,14 @@ on:
           - staging
 
 concurrency:
-  # One publish per triggering run — prevents duplicate columns if the
-  # workflow is re-run manually while a prior publish is still in flight.
-  group: testgrid-publish-${{ github.event.workflow_run.id || github.run_id }}
+  # One publish per triggering run (workflow_run) or per environment+bundle
+  # (workflow_dispatch) — prevents duplicate/racing columns if the workflow
+  # is re-run, or the same bundle_ref is dispatched twice, while a prior
+  # publish is still in flight. environment is in the dispatch key so staging
+  # and prod publishes of one bundle_ref (different buckets) don't serialize.
+  # The `inputs.bundle_ref &&` guard keeps the run_id fallback reachable:
+  # format() alone yields "-" (truthy) even when both inputs are empty.
+  group: testgrid-publish-${{ github.event.workflow_run.id || (inputs.bundle_ref && format('{0}-{1}', inputs.environment, inputs.bundle_ref)) || github.run_id }}
   cancel-in-progress: false  # never cancel an in-flight GCS write
 
 jobs:
@@ -89,14 +136,25 @@ jobs:
     runs-on: ubuntu-latest
     timeout-minutes: 15  # Go build ~2 min + gcloud uploads ~3 min; 15 min is generous
 
-    # For workflow_run: run whether UAT succeeded or failed —
-    # a failure run is a valid TestGrid column showing what broke.
-    # For workflow_dispatch: restrict to main to prevent feature-branch
-    # code from obtaining WIF credentials (see comment on trigger above).
+    # Gate on vars.TESTGRID_PUBLISH_ENABLED so this workflow is a true no-op
+    # (skipped, not failed) until the prod WIF pool/SA exist in Terraform.
+    # Flip to 'true' once aicr-testgrid prod Terraform is applied — before
+    # that, every successful UAT would otherwise fail at the GCP auth step
+    # and produce CI noise.
+    #
+    # For workflow_run: run for ANY completed conclusion (success, failure,
+    # cancelled, timed_out, skipped, ...) — "Resolve bundle ref" already
+    # no-ops cleanly when no evidence pointer exists (only uploaded on a
+    # successful conformance phase), so conclusions other than success or
+    # failure fall through to the same benign skip path instead of the job
+    # silently never running at all.
+    # For workflow_dispatch: restrict to main as a readable fail-closed
+    # guard — the actual security boundary is the WIF SA binding, not this
+    # check (see trust-model comment on the trigger above).
     if: >
-      (github.event_name == 'workflow_dispatch' && github.ref == 'refs/heads/main') ||
-      github.event.workflow_run.conclusion == 'success' ||
-      github.event.workflow_run.conclusion == 'failure'
+      vars.TESTGRID_PUBLISH_ENABLED == 'true' &&
+      ((github.event_name == 'workflow_dispatch' && github.ref == 'refs/heads/main') ||
+       github.event_name == 'workflow_run')
 
     permissions:
       contents: read
@@ -113,10 +171,14 @@ jobs:
         with:
           persist-credentials: false
 
-      # Guard: reject workflow_run events from forks or unexpected branches
-      # before any credential exchange. WIF is gated by the attribute_condition
-      # in wif-publish.tf, but an explicit check here fails fast and produces
-      # a clear error message rather than a cryptic WIF rejection.
+      # Sanity check before any credential exchange: confirm the producer
+      # run belongs to this repository. Note: github.event.workflow_run.
+      # repository is always the base repo that hosts the workflow (never
+      # a fork) — a repo-scoped workflow_run trigger cannot fire from a
+      # fork's own run in the first place, so this is defense-in-depth,
+      # not fork rejection. WIF's attribute_condition in wif-publish.tf is
+      # the actual security boundary; this check just fails fast with a
+      # clear message instead of a cryptic WIF rejection.
       - name: Validate producer
         if: github.event_name == 'workflow_run'
         env:
@@ -128,11 +190,18 @@ jobs:
             echo "::error::Producer workflow is from '${PRODUCER_REPO}', expected '${GITHUB_REPOSITORY}' — rejecting"
             exit 1
           fi
-          # Release branches (e.g. release/v1.x) intentionally publish —
-          # their UAT runs qualify the release and belong in TestGrid.
-          if [ "${PRODUCER_BRANCH}" != "main" ]; then
-            echo "::notice::Producer ran on branch '${PRODUCER_BRANCH}' — publishing (release branches are intentional)"
-          fi
+          # Allowlist: main and release/* branches qualify for TestGrid.
+          # UAT supports workflow_dispatch on any branch, so without this
+          # an experimental dev-branch UAT would land a column in the
+          # dashboard. Reject anything else explicitly (fail closed).
+          case "${PRODUCER_BRANCH}" in
+            main|release/*)
+              ;;
+            *)
+              echo "::error::Producer ran on branch '${PRODUCER_BRANCH}' — only main and release/* branches publish to TestGrid"
+              exit 1
+              ;;
+          esac
 
       # Resolve environment-specific GCP resource names.
       # prod  → aicr-testgrid          (no env suffix — matches Terraform cluster_name)
@@ -145,15 +214,25 @@ jobs:
           GCP_PROJECT_ID: ${{ vars.GCP_PROJECT_ID }}
         run: |
           set -euo pipefail
+          # Fail fast with a clear message rather than deriving a malformed
+          # WIF provider/SA (e.g. "projects//locations/...") that only
+          # surfaces as a cryptic rejection three steps later at auth time.
+          : "${GCP_PROJECT_NUMBER:?repo variable GCP_PROJECT_NUMBER is unset — set it in Settings > Secrets and variables > Actions > Variables}"
+          : "${GCP_PROJECT_ID:?repo variable GCP_PROJECT_ID is unset — set it in Settings > Secrets and variables > Actions > Variables}"
           if [ "${TG_ENV}" = "prod" ]; then
             CLUSTER="aicr-testgrid"
           else
             CLUSTER="aicr-testgrid-${TG_ENV}"
           fi
-          echo "cluster=${CLUSTER}" >> "${GITHUB_OUTPUT}"
-          echo "bucket=${CLUSTER}" >> "${GITHUB_OUTPUT}"
-          echo "wif_provider=projects/${GCP_PROJECT_NUMBER}/locations/global/workloadIdentityPools/${CLUSTER}-github/providers/github-actions" >> "${GITHUB_OUTPUT}"
-          echo "wif_sa=${CLUSTER}-publish@${GCP_PROJECT_ID}.iam.gserviceaccount.com" >> "${GITHUB_OUTPUT}"
+          {
+            echo "cluster=${CLUSTER}"
+            echo "bucket=${CLUSTER}"
+            echo "wif_provider=projects/${GCP_PROJECT_NUMBER}/locations/global/workloadIdentityPools/${CLUSTER}-github/providers/github-actions"
+            echo "wif_sa=${CLUSTER}-publish@${GCP_PROJECT_ID}.iam.gserviceaccount.com"
+          } >> "${GITHUB_OUTPUT}"
+          # Note: `environment` is fully encoded in `bucket` (bucket ==
+          # cluster == aicr-testgrid[-<env>]) and is not passed to
+          # testgrid-publish separately — the tool only needs --bucket.
 
       - name: Load versions
         id: versions
@@ -168,7 +247,7 @@ jobs:
           yq_version: ${{ steps.versions.outputs.yq }}
 
       - name: Set up Go
-        uses: actions/setup-go@d60b41a563a30eb6578e24a53e1f85c8e82d1623  # v6.4.0
+        uses: actions/setup-go@924ae3a1cded613372ab5595356fb5720e22ba16  # v6.5.0
         with:
           go-version: ${{ steps.versions.outputs.go }}
           cache: false
@@ -214,13 +293,30 @@ jobs:
             # If the download step failed, distinguish "artifact not found"
             # (conformance did not complete — benign skip) from a real error
             # (network failure — should not silently drop the publish).
-            POINTER="$(find evidence-pointer/ -name 'pointer.yaml' 2>/dev/null | head -1)"
+            # `-print -quit` + `|| true` tolerates an absent evidence-pointer/
+            # directory under set -euo pipefail (find would otherwise exit
+            # non-zero and kill the script before the -z check below runs).
+            POINTER="$(find evidence-pointer/ -name 'pointer.yaml' -print -quit 2>/dev/null || true)"
             DOWNLOAD_OUTCOME="${DOWNLOAD_POINTER_OUTCOME}"
             if [ -z "${POINTER}" ]; then
               if [ "${DOWNLOAD_OUTCOME}" = "failure" ] && [ "${UAT_CONCLUSION}" = "success" ]; then
+                # Narrow known edge case: a manual `skip_tests` UAT dispatch
+                # also concludes success with no pointer (conformance never
+                # ran), which looks identical to a network error from here.
+                # Accepted — this only affects operator-triggered skip_tests
+                # runs, which have nothing to publish anyway.
                 echo "::error::UAT succeeded but evidence pointer download failed — possible network error"
                 exit 1
               fi
+              if [ "${DOWNLOAD_OUTCOME}" = "failure" ]; then
+                # Unlike the success case above, a non-success UAT conclusion
+                # can't be disambiguated: artifact absence is expected here
+                # (conformance may never have run), so a genuine transient
+                # download error looks identical. Surface it for visibility
+                # without failing the job — there's nothing to publish
+                # either way.
+                echo "::warning::Evidence pointer download failed for a non-success UAT run (conclusion=${UAT_CONCLUSION}) — could be genuine absence or a transient error; treating as skip"
+              fi
               echo "::notice::No evidence pointer found (conformance did not complete) — skipping TestGrid publish"
               echo "skip=true" >> "${GITHUB_OUTPUT}"
               exit 0
@@ -239,10 +335,20 @@ jobs:
             fi
             BUNDLE_REF="${OCI}@${DIGEST}"
           fi
-          # Validate format: ghcr.io/nvidia/...@sha256:<64 hex chars>
-          # Also guards against newline injection into GITHUB_OUTPUT.
-          if ! echo "${BUNDLE_REF}" | grep -qE '^ghcr\.io/nvidia/[^@]+@sha256:[0-9a-f]{64}$'; then
-            echo "::error::BUNDLE_REF '${BUNDLE_REF}' does not match expected format ghcr.io/nvidia/...@sha256:<64hex>"
+          # Validate format: ghcr.io/<org>/...@sha256:<64 hex chars>
+          # Namespace is intentionally NOT restricted to nvidia/ — community
+          # bundles (source_class=community) are hosted under contributor
+          # namespaces, e.g. ghcr.io/yuanchen8911/aicr-evidence.
+          #
+          # Uses bash's [[ =~ ]] (whole-string match, POSIX ERE) rather than
+          # `echo | grep -qE`, which is line-based: a value containing an
+          # embedded newline (e.g. "valid-ref\nfoo=bar") would pass grep
+          # because at least one *line* matches, while the newline still
+          # reaches `echo >> GITHUB_OUTPUT` below and injects a second,
+          # attacker-controlled output key. [[ =~ ]] anchors ^ and $ to the
+          # start/end of the whole variable, so an embedded newline fails.
+          if [[ ! "${BUNDLE_REF}" =~ ^ghcr\.io/[A-Za-z0-9][A-Za-z0-9-]*/[A-Za-z0-9._/-]+@sha256:[0-9a-f]{64}$ ]]; then
+            echo "::error::BUNDLE_REF '${BUNDLE_REF}' does not match expected format ghcr.io/<org>/...@sha256:<64hex>"
             exit 1
           fi
           echo "bundle_ref=${BUNDLE_REF}" >> "${GITHUB_OUTPUT}"