NVIDIA · mohityadav8 · Jun 28, 2026 · Jun 29, 2026 · Jun 29, 2026 · Jun 29, 2026
@@ -1,4 +1,4 @@
-# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -32,7 +32,7 @@ on:
       - 'go.sum'
       - 'vendor/**'
       - '.github/workflows/kwok-recipes.yaml'
-      - '.github/workflows/kwok-tier3-shard.yaml'
+      - '.github/workflows/kwok-test-run.yaml'
       - '.github/actions/kwok-test/**'
       - '!**.md'
   pull_request:
@@ -52,7 +52,7 @@ on:
       - 'go.sum'
       - 'vendor/**'
       - '.github/workflows/kwok-recipes.yaml'
-      - '.github/workflows/kwok-tier3-shard.yaml'
+      - '.github/workflows/kwok-test-run.yaml'
       - '.github/actions/kwok-test/**'
       - '!**.md'
   schedule:
@@ -84,6 +84,8 @@ jobs:
       tier2: ${{ steps.classify.outputs.tier2 }}
       tier3: ${{ steps.classify.outputs.tier3 }}
       tier3_batches: ${{ steps.classify.outputs.tier3_batches }}
+      tier1_pairs: ${{ steps.classify.outputs.tier1_pairs }}
+      tier2_pairs: ${{ steps.classify.outputs.tier2_pairs }}
     steps:
       - name: Checkout
         uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0  # v7.0.0
@@ -100,13 +102,25 @@ jobs:
         run: |
           set -euo pipefail
 
+          # Deployer list — single source of truth: edit this one line to add or
+          # remove a deployer. Tier 1, Tier 3, and the workflow_dispatch early-exit
+          # below derive from it; Tier 2 is pinned to "helm" and does not read it.
+          readonly DEPLOYERS='["helm","argocd-oci","argocd-helm-oci","argocd-git","flux-oci","flux-git"]'
+
           # --- workflow_dispatch: test exactly the requested recipe ---
           if [[ -n "${DISPATCH_RECIPE}" ]]; then
             single=$(jq -nc '[$r]' --arg r "${DISPATCH_RECIPE}")
-            echo "tier1=${single}"   >> "$GITHUB_OUTPUT"
-            echo "tier2=[]"          >> "$GITHUB_OUTPUT"
-            echo "tier3=[]"          >> "$GITHUB_OUTPUT"
-            echo "tier3_batches=[]"  >> "$GITHUB_OUTPUT"
+            single_pairs=$(jq -cn \
+              --arg r "${DISPATCH_RECIPE}" \
+              --argjson deployers "${DEPLOYERS}" '
+                [ $deployers[] | {recipe: $r, deployer: .} ]
+              ')
+            echo "tier1=${single}"             >> "$GITHUB_OUTPUT"
+            echo "tier2=[]"                    >> "$GITHUB_OUTPUT"
+            echo "tier3=[]"                    >> "$GITHUB_OUTPUT"
+            echo "tier3_batches=[]"            >> "$GITHUB_OUTPUT"
+            echo "tier1_pairs=${single_pairs}" >> "$GITHUB_OUTPUT"
+            echo "tier2_pairs=[]"              >> "$GITHUB_OUTPUT"
             echo "Manual dispatch: ${DISPATCH_RECIPE}"
             exit 0
           fi
@@ -237,16 +251,42 @@ jobs:
           # --- Tier 3: full matrix (all testable overlays) ---
           tier3="$all"
 
+          # Lowercase alias of the readonly DEPLOYERS list, kept because the
+          # Tier 1 pair builder and the summary echo below still read $deployers
+          # (the name this list had before it was hoisted to the top of the step).
+          deployers="${DEPLOYERS}"
+
+          # --- Pre-build Tier 1 pairs: generic recipes × all deployers ---
+          tier1_pairs=$(jq -cn \
+            --argjson recipes "$tier1" \
+            --argjson deployers "$deployers" '
+              [ $recipes[] as $r | $deployers[] as $d | {recipe: $r, deployer: $d} ]
+            ')
+
+          # Guard: Tier 1 is passed unbatched, so its pair count must fit GitHub's
+          # 256-configuration matrix cap. Fail above 256; warn early above 200.
+          tier1_pair_count=$(echo "$tier1_pairs" | jq 'length')
+          if (( tier1_pair_count > 256 )); then
+            echo "::error::Tier 1 has ${tier1_pair_count} pairs (>256) — add batching before passing this to kwok-test-run.yaml"
+            exit 1
+          elif (( tier1_pair_count > 200 )); then
+            echo "::warning::Tier 1 has ${tier1_pair_count} pairs (>200) — consider adding batching before it reaches 256"
+          fi
+
+          # --- Pre-build Tier 2 pairs: diff-affected recipes, helm-only ---
+          # Coverage-policy decision: Tier 2 uses helm only to keep PR wall-clock
+          # time proportional to the change scope. Full deployer coverage runs in
+          # Tier 3 on every push to main and on the nightly schedule. See ADR-003
+          # §"Tier 2 deployer coverage" for rationale and how to revisit this.
+          tier2_pairs=$(echo "$tier2" | jq -c '[.[] | {recipe: ., deployer: "helm"}]')
+
           # --- Tier 3 batching ---
           # GitHub caps a single job's matrix at 256 configurations. Tier 3
           # crosses every testable recipe with every deployer, so the raw
-          # cross-product (recipes × deployers) outgrew the cap. Split the
+          # cross-product (recipes × deployers) can outgrow the cap. Split the
           # {recipe, deployer} pairs into batches of <= TIER3_BATCH_SIZE; the
-          # caller fans each batch out to kwok-tier3-shard.yaml, keeping every
-          # shard's matrix under the limit. Keep this deployer list in sync
-          # with the test-tier1 matrix above and the input doc in
-          # .github/actions/kwok-test/action.yml.
-          deployers='["helm","argocd-oci","argocd-helm-oci","argocd-git","flux-oci","flux-git"]'
+          # caller fans each batch out to kwok-test-run.yaml, keeping every
+          # shard's matrix under the limit.
           readonly TIER3_BATCH_SIZE=200  # headroom under GitHub's 256 cap
 
           # Fail closed if the batch size is ever raised past the hard limit —
@@ -259,7 +299,7 @@ jobs:
 
           tier3_batches=$(jq -cn \
             --argjson recipes "$tier3" \
-            --argjson deployers "$deployers" \
+            --argjson deployers "${DEPLOYERS}" \
             --argjson size "$TIER3_BATCH_SIZE" '
               [ $recipes[] as $r | $deployers[] as $d | {recipe: $r, deployer: $d} ]
               | [ range(0; length; $size) as $i
@@ -268,106 +308,50 @@ jobs:
             ')
 
           # --- Output ---
-          echo "tier1=${tier1}" >> "$GITHUB_OUTPUT"
-          echo "tier2=${tier2}" >> "$GITHUB_OUTPUT"
-          echo "tier3=${tier3}" >> "$GITHUB_OUTPUT"
+          echo "tier1=${tier1}"                 >> "$GITHUB_OUTPUT"
+          echo "tier2=${tier2}"                 >> "$GITHUB_OUTPUT"
+          echo "tier3=${tier3}"                 >> "$GITHUB_OUTPUT"
           echo "tier3_batches=${tier3_batches}" >> "$GITHUB_OUTPUT"
+          echo "tier1_pairs=${tier1_pairs}"     >> "$GITHUB_OUTPUT"
+          echo "tier2_pairs=${tier2_pairs}"     >> "$GITHUB_OUTPUT"
 
+          deployer_count=$(echo "$deployers" | jq 'length')
           tier3_pairs=$(echo "$tier3_batches" | jq '[.[].pairs[]] | length')
           tier3_batch_count=$(echo "$tier3_batches" | jq 'length')
-          echo "Tier 1 (generic):      $(echo "$tier1" | jq 'length') recipe(s)"
-          echo "Tier 2 (diff-aware):   $(echo "$tier2" | jq 'length') recipe(s)"
-          echo "Tier 3 (full matrix):  $(echo "$tier3" | jq 'length') recipe(s) × $(echo "$deployers" | jq 'length') deployer(s) = ${tier3_pairs} pair(s) in ${tier3_batch_count} batch(es)"
+          echo "Tier 1 (generic):      $(echo "$tier1" | jq 'length') recipe(s) × ${deployer_count} deployer(s) = ${tier1_pair_count} pair(s)"
+          echo "Tier 2 (diff-aware):   $(echo "$tier2" | jq 'length') recipe(s) × 1 deployer (helm) = $(echo "$tier2_pairs" | jq 'length') pair(s)"
+          echo "Tier 3 (full matrix):  $(echo "$tier3" | jq 'length') recipe(s) × ${deployer_count} deployer(s) = ${tier3_pairs} pair(s) in ${tier3_batch_count} batch(es)"
 
   # ── Tier 1: PR gate — generic overlays (PR + push, skip on schedule) ──
   test-tier1:
-    name: 'Tier 1: ${{ matrix.recipe }} (${{ matrix.deployer }})'
     needs: discover
     if: >-
       github.event_name != 'schedule' &&
-      needs.discover.outputs.tier1 != '[]' &&
-      needs.discover.outputs.tier1 != ''
-    runs-on: ubuntu-latest
-    timeout-minutes: 15
-    strategy:
-      fail-fast: false
-      matrix:
-        recipe: ${{ fromJSON(needs.discover.outputs.tier1) }}
-        deployer: [helm, argocd-oci, argocd-helm-oci, argocd-git, flux-oci, flux-git]
-    steps:
-      - name: Checkout Code
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0  # v7.0.0
-        with:
-          persist-credentials: false
-
-      - name: Load versions
-        id: versions
-        uses: ./.github/actions/load-versions
-
-      - name: Run KWOK test
-        uses: ./.github/actions/kwok-test
-        with:
-          recipe: ${{ matrix.recipe }}
-          deployer: ${{ matrix.deployer }}
-          go_version: ${{ steps.versions.outputs.go }}
-          goreleaser_version: ${{ steps.versions.outputs.goreleaser }}
-          kind_version: ${{ steps.versions.outputs.kind }}
-          helm_version: ${{ steps.versions.outputs.helm }}
-          kwok_version: ${{ steps.versions.outputs.kwok }}
-          kubectl_version: ${{ steps.versions.outputs.kubectl }}
-          yq_version: ${{ steps.versions.outputs.yq }}
-          flux_version: ${{ steps.versions.outputs.flux }}
-          chainsaw_version: ${{ steps.versions.outputs.chainsaw }}
-          chainsaw_sha256: ${{ steps.versions.outputs.chainsaw_sha256_linux_amd64 }}
-          kind_node_image: ${{ steps.versions.outputs.kind_node_image }}
+      needs.discover.outputs.tier1_pairs != '[]' &&
+      needs.discover.outputs.tier1_pairs != ''
+    uses: ./.github/workflows/kwok-test-run.yaml
+    with:
+      pairs: ${{ needs.discover.outputs.tier1_pairs }}
 
   # ── Tier 2: diff-aware accelerator tests (PR only, conditional) ──
+  # Coverage-policy decision: Tier 2 uses helm only (see ADR-003 §"Tier 2
+  # deployer coverage"). Full deployer coverage runs in Tier 3 on push/nightly.
   test-tier2:
-    name: 'Tier 2: ${{ matrix.recipe }}'
     needs: discover
     if: >-
       github.event_name == 'pull_request' &&
-      needs.discover.outputs.tier2 != '[]' &&
-      needs.discover.outputs.tier2 != ''
-    runs-on: ubuntu-latest
-    timeout-minutes: 15
-    strategy:
-      fail-fast: false
-      matrix:
-        recipe: ${{ fromJSON(needs.discover.outputs.tier2) }}
-    steps:
-      - name: Checkout Code
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0  # v7.0.0
-        with:
-          persist-credentials: false
-
-      - name: Load versions
-        id: versions
-        uses: ./.github/actions/load-versions
-
-      - name: Run KWOK test
-        uses: ./.github/actions/kwok-test
-        with:
-          recipe: ${{ matrix.recipe }}
-          go_version: ${{ steps.versions.outputs.go }}
-          goreleaser_version: ${{ steps.versions.outputs.goreleaser }}
-          kind_version: ${{ steps.versions.outputs.kind }}
-          helm_version: ${{ steps.versions.outputs.helm }}
-          kwok_version: ${{ steps.versions.outputs.kwok }}
-          kubectl_version: ${{ steps.versions.outputs.kubectl }}
-          yq_version: ${{ steps.versions.outputs.yq }}
-          flux_version: ${{ steps.versions.outputs.flux }}
-          chainsaw_version: ${{ steps.versions.outputs.chainsaw }}
-          chainsaw_sha256: ${{ steps.versions.outputs.chainsaw_sha256_linux_amd64 }}
-          kind_node_image: ${{ steps.versions.outputs.kind_node_image }}
+      needs.discover.outputs.tier2_pairs != '[]' &&
+      needs.discover.outputs.tier2_pairs != ''
+    uses: ./.github/workflows/kwok-test-run.yaml
+    with:
+      pairs: ${{ needs.discover.outputs.tier2_pairs }}
 
   # ── Tier 3: full matrix (push to main + nightly schedule) ──
-  # The recipe × deployer cross-product exceeds GitHub's 256-config matrix cap,
-  # so discover splits it into batches and we fan each batch out to the
-  # kwok-tier3-shard reusable workflow (one shard per batch, each <= 256).
-  # Per ADR-003: the concurrency group is keyed by SHA so successive merges to
-  # main never cancel in-flight Tier 3 runs; the batch id keeps every shard of a
-  # single run in its own group so they all run in parallel.
+  # The recipe × deployer cross-product can exceed GitHub's 256-config cap, so
+  # discover batches the pairs and we fan each batch out to kwok-test-run.yaml
+  # (one shard per batch, each <= 256 pairs). Per ADR-003: concurrency is keyed
+  # by SHA so successive merges never cancel in-flight Tier 3 runs; the batch id
+  # keeps every shard of a single run in its own group so they all run in parallel.
   test-tier3:
     needs: discover
     concurrency:
@@ -381,7 +365,7 @@ jobs:
       fail-fast: false
       matrix:
         batch: ${{ fromJSON(needs.discover.outputs.tier3_batches) }}
-    uses: ./.github/workflows/kwok-tier3-shard.yaml
+    uses: ./.github/workflows/kwok-test-run.yaml
     with:
       pairs: ${{ toJSON(matrix.batch.pairs) }}
 
@@ -434,3 +418,4 @@ jobs:
           fi
 
-          echo "All recipe validations passed" >> $GITHUB_STEP_SUMMARY
+          # Quote the summary path so it cannot word-split or glob (SC2086).
+          echo "All recipe validations passed" >> "$GITHUB_STEP_SUMMARY"
@@ -12,18 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Reusable shard for the Tier 3 KWOK full matrix. GitHub caps a single job's
-# matrix at 256 configurations; the caller (kwok-recipes.yaml) splits the full
-# recipe × deployer cross-product into batches of <= 256 pairs and invokes this
-# workflow once per batch, so the per-shard matrix always stays under the cap.
-name: KWOK Tier 3 Shard
+# Reusable KWOK test runner shared by all three tiers (kwok-recipes.yaml).
+# Takes a JSON array of {recipe, deployer} pairs and runs each as a matrix job.
+#
+# Tier 1 passes a flat pairs list (well under 256 → no batching needed).
+# Tier 2 passes a flat pairs list (helm-only for diff-affected recipes).
+# Tier 3 passes one batch at a time from the discover job's tier3_batches output;
+#   the caller (test-tier3 in kwok-recipes.yaml) shards across batches to keep
+#   every shard's matrix under GitHub's 256-configuration cap.
+name: KWOK Test Run
 
 on:
   workflow_call:
     inputs:
       pairs:
-        # Caller batches at TIER3_BATCH_SIZE (kwok-recipes.yaml); must never
-        # exceed GitHub's hard cap of 256 matrix configurations per job.
+        # JSON array of {recipe, deployer} objects. Must never exceed GitHub's
+        # hard cap of 256 matrix configurations per job.
         description: 'JSON array of {recipe, deployer} objects to test (<= 256 entries)'
         required: true
         type: string
@@ -33,7 +37,7 @@ permissions:
 
 jobs:
   test:
-    name: 'Tier 3: ${{ matrix.pair.recipe }} (${{ matrix.pair.deployer }})'
+    name: '${{ matrix.pair.recipe }} (${{ matrix.pair.deployer }})'
     runs-on: ubuntu-latest
     timeout-minutes: 15
     strategy:

@@ -2,12 +2,12 @@
 
 ## Status
 
-**Accepted, implemented** — 2026-03-18
+**Accepted, implemented** — 2026-03-18; updated for workflow consolidation (#1172)
 
 The tiered KWOK scaling strategy has shipped: tier discovery and per-overlay
-parallel jobs run in `.github/workflows/kwok-recipes.yaml`, with the Tier 3
-shard lane in `.github/workflows/kwok-tier3-shard.yaml`, backed by the
+parallel jobs run in `.github/workflows/kwok-recipes.yaml`, with the shared
+test runner in `.github/workflows/kwok-test-run.yaml`, backed by the
 `kwok/scripts/` and `kwok/profiles/` machinery.
 
 ## Scope
 
@@ -150,7 +154,7 @@ To stay under the limit without sacrificing coverage, the `discover` job builds
 the `{recipe, deployer}` pairs and chunks them into batches of `TIER3_BATCH_SIZE`
 (200, with headroom under 256), emitting a `tier3_batches` output of
 `{id, pairs}` objects. `test-tier3` is a thin matrix over those batches that fans
-each one out to the **`kwok-tier3-shard.yaml`** reusable workflow, which expands
+each one out to the **`kwok-test-run.yaml`** reusable workflow, which expands
 its batch as its own (≤ 256) matrix. Batch count grows automatically as overlays
 are added — no manual job duplication. A fail-closed guard in `discover` errors
 loudly if `TIER3_BATCH_SIZE` is ever raised past 256, rather than resurfacing
@@ -163,19 +167,18 @@ and a summary. Tier 3 fans out to a reusable shard workflow (see above):
 
 ```
 discover
-├── tier1: [eks, aks, gke, kind, ...]               # generic only
-├── tier2: [h100-eks-ubuntu-training, ...]           # diff-affected only
-├── tier3: [all 72+]                                 # full overlay set
-└── tier3_batches: [{id, pairs:[{recipe,deployer}]}] # cross-product, chunked ≤256
+├── tier1_pairs: [{recipe,deployer}]                 # generic overlays × all deployers
+├── tier2_pairs: [{recipe, deployer:"helm"}]         # diff-affected overlays, helm-only
+└── tier3_batches: [{id, pairs:[{recipe,deployer}]}] # all overlays × all deployers, chunked ≤256
 
 test-tier1  (PR + push to main)
-  matrix: tier1 × deployer
+  uses kwok-test-run.yaml  pairs=tier1_pairs
 
 test-tier2  (PR only, skip if empty)
-  matrix: tier2 × deployer
+  uses kwok-test-run.yaml  pairs=tier2_pairs  [helm-only]
 
 test-tier3  (push to main + schedule, skip on PR)
-  matrix: tier3_batches → uses kwok-tier3-shard.yaml (matrix: pairs)
+  matrix: tier3_batches → uses kwok-test-run.yaml (matrix: pairs)
 
 summary
   needs: [test-tier1, test-tier2, test-tier3]
@@ -194,6 +197,17 @@ The `summary` job gates on Tier 1 and Tier 2 for PRs, and on all three tiers for
 pushes to `main`. This avoids branch protection brittleness when the overlay set
 changes.
 
+### Tier 2 deployer coverage
+
+Tier 2 is **helm-only** by deliberate policy (#1172). The old Tier 2 passed no
+deployer argument to the action, which already defaulted to `helm` — this makes
+that behavior explicit. Full deployer coverage (all deployers × all overlays)
+runs in Tier 3 on every push to `main` and on the nightly schedule.
+
+To add full deployer coverage to Tier 2, change `tier2_pairs` in the `discover`
+classify step to cross the recipe list with the full `DEPLOYERS` array (same
+pattern as `tier1_pairs`).
+
 ## Consequences
 
 ### Positive