diff --git a/.github/workflows/kwok-recipes.yaml b/.github/workflows/kwok-recipes.yaml index bb103c32c..0ec768486 100644 --- a/.github/workflows/kwok-recipes.yaml +++ b/.github/workflows/kwok-recipes.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -32,7 +32,7 @@ on: - 'go.sum' - 'vendor/**' - '.github/workflows/kwok-recipes.yaml' - - '.github/workflows/kwok-tier3-shard.yaml' + - '.github/workflows/kwok-test-run.yaml' - '.github/actions/kwok-test/**' - '!**.md' pull_request: @@ -52,7 +52,7 @@ on: - 'go.sum' - 'vendor/**' - '.github/workflows/kwok-recipes.yaml' - - '.github/workflows/kwok-tier3-shard.yaml' + - '.github/workflows/kwok-test-run.yaml' - '.github/actions/kwok-test/**' - '!**.md' schedule: @@ -84,6 +84,8 @@ jobs: tier2: ${{ steps.classify.outputs.tier2 }} tier3: ${{ steps.classify.outputs.tier3 }} tier3_batches: ${{ steps.classify.outputs.tier3_batches }} + tier1_pairs: ${{ steps.classify.outputs.tier1_pairs }} + tier2_pairs: ${{ steps.classify.outputs.tier2_pairs }} steps: - name: Checkout uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 @@ -100,13 +102,25 @@ jobs: run: | set -euo pipefail + # Deployer list — single source of truth, consumed by all tiers and by + # the workflow_dispatch early-exit below. To add or remove a deployer, + # change this one line; Tier 1, Tier 2, and Tier 3 all derive from it. 
+ readonly DEPLOYERS='["helm","argocd-oci","argocd-helm-oci","argocd-git","flux-oci","flux-git"]' + # --- workflow_dispatch: test exactly the requested recipe --- if [[ -n "${DISPATCH_RECIPE}" ]]; then single=$(jq -nc '[$r]' --arg r "${DISPATCH_RECIPE}") - echo "tier1=${single}" >> "$GITHUB_OUTPUT" - echo "tier2=[]" >> "$GITHUB_OUTPUT" - echo "tier3=[]" >> "$GITHUB_OUTPUT" - echo "tier3_batches=[]" >> "$GITHUB_OUTPUT" + single_pairs=$(jq -cn \ + --arg r "${DISPATCH_RECIPE}" \ + --argjson deployers "${DEPLOYERS}" ' + [ $deployers[] | {recipe: $r, deployer: .} ] + ') + echo "tier1=${single}" >> "$GITHUB_OUTPUT" + echo "tier2=[]" >> "$GITHUB_OUTPUT" + echo "tier3=[]" >> "$GITHUB_OUTPUT" + echo "tier3_batches=[]" >> "$GITHUB_OUTPUT" + echo "tier1_pairs=${single_pairs}" >> "$GITHUB_OUTPUT" + echo "tier2_pairs=[]" >> "$GITHUB_OUTPUT" echo "Manual dispatch: ${DISPATCH_RECIPE}" exit 0 fi @@ -237,16 +251,42 @@ jobs: # --- Tier 3: full matrix (all testable overlays) --- tier3="$all" + # Local alias so jq --argjson calls below can use $deployers + # (DEPLOYERS is readonly; this assignment is intentional). + # shellcheck disable=SC2034 + deployers="${DEPLOYERS}" + + # --- Pre-build Tier 1 pairs: generic recipes × all deployers --- + tier1_pairs=$(jq -cn \ + --argjson recipes "$tier1" \ + --argjson deployers "$deployers" ' + [ $recipes[] as $r | $deployers[] as $d | {recipe: $r, deployer: $d} ] + ') + + # Guard: Tier 1 is expected to stay well under 256 (no batching needed). + # Warn early if organic growth is approaching the limit. 
+ tier1_pair_count=$(echo "$tier1_pairs" | jq 'length') + if (( tier1_pair_count > 256 )); then + echo "::error::Tier 1 has ${tier1_pair_count} pairs (>256) — add batching before passing this to kwok-test-run.yaml" + exit 1 + elif (( tier1_pair_count > 200 )); then + echo "::warning::Tier 1 has ${tier1_pair_count} pairs (>200) — consider adding batching before it reaches 256" + fi + + # --- Pre-build Tier 2 pairs: diff-affected recipes, helm-only --- + # Coverage-policy decision: Tier 2 uses helm only to keep PR wall-clock + # time proportional to the change scope. Full deployer coverage runs in + # Tier 3 on every push to main and on the nightly schedule. See ADR-003 + # §"Tier 2 deployer coverage" for rationale and how to revisit this. + tier2_pairs=$(echo "$tier2" | jq -c '[.[] | {recipe: ., deployer: "helm"}]') + # --- Tier 3 batching --- # GitHub caps a single job's matrix at 256 configurations. Tier 3 # crosses every testable recipe with every deployer, so the raw - # cross-product (recipes × deployers) outgrew the cap. Split the + # cross-product (recipes × deployers) can outgrow the cap. Split the # {recipe, deployer} pairs into batches of <= TIER3_BATCH_SIZE; the - # caller fans each batch out to kwok-tier3-shard.yaml, keeping every - # shard's matrix under the limit. Keep this deployer list in sync - # with the test-tier1 matrix above and the input doc in - # .github/actions/kwok-test/action.yml. - deployers='["helm","argocd-oci","argocd-helm-oci","argocd-git","flux-oci","flux-git"]' + # caller fans each batch out to kwok-test-run.yaml, keeping every + # shard's matrix under the limit. 
readonly TIER3_BATCH_SIZE=200 # headroom under GitHub's 256 cap # Fail closed if the batch size is ever raised past the hard limit — @@ -259,7 +299,7 @@ jobs: tier3_batches=$(jq -cn \ --argjson recipes "$tier3" \ - --argjson deployers "$deployers" \ + --argjson deployers "${DEPLOYERS}" \ --argjson size "$TIER3_BATCH_SIZE" ' [ $recipes[] as $r | $deployers[] as $d | {recipe: $r, deployer: $d} ] | [ range(0; length; $size) as $i @@ -268,106 +308,50 @@ jobs: ') # --- Output --- - echo "tier1=${tier1}" >> "$GITHUB_OUTPUT" - echo "tier2=${tier2}" >> "$GITHUB_OUTPUT" - echo "tier3=${tier3}" >> "$GITHUB_OUTPUT" + echo "tier1=${tier1}" >> "$GITHUB_OUTPUT" + echo "tier2=${tier2}" >> "$GITHUB_OUTPUT" + echo "tier3=${tier3}" >> "$GITHUB_OUTPUT" echo "tier3_batches=${tier3_batches}" >> "$GITHUB_OUTPUT" + echo "tier1_pairs=${tier1_pairs}" >> "$GITHUB_OUTPUT" + echo "tier2_pairs=${tier2_pairs}" >> "$GITHUB_OUTPUT" + deployer_count=$(echo "$deployers" | jq 'length') tier3_pairs=$(echo "$tier3_batches" | jq '[.[].pairs[]] | length') tier3_batch_count=$(echo "$tier3_batches" | jq 'length') - echo "Tier 1 (generic): $(echo "$tier1" | jq 'length') recipe(s)" - echo "Tier 2 (diff-aware): $(echo "$tier2" | jq 'length') recipe(s)" - echo "Tier 3 (full matrix): $(echo "$tier3" | jq 'length') recipe(s) × $(echo "$deployers" | jq 'length') deployer(s) = ${tier3_pairs} pair(s) in ${tier3_batch_count} batch(es)" + echo "Tier 1 (generic): $(echo "$tier1" | jq 'length') recipe(s) × ${deployer_count} deployer(s) = ${tier1_pair_count} pair(s)" + echo "Tier 2 (diff-aware): $(echo "$tier2" | jq 'length') recipe(s) × 1 deployer (helm) = $(echo "$tier2_pairs" | jq 'length') pair(s)" + echo "Tier 3 (full matrix): $(echo "$tier3" | jq 'length') recipe(s) × ${deployer_count} deployer(s) = ${tier3_pairs} pair(s) in ${tier3_batch_count} batch(es)" # ── Tier 1: PR gate — generic overlays (PR + push, skip on schedule) ── test-tier1: - name: 'Tier 1: ${{ matrix.recipe }} (${{ matrix.deployer }})' needs: 
discover if: >- github.event_name != 'schedule' && - needs.discover.outputs.tier1 != '[]' && - needs.discover.outputs.tier1 != '' - runs-on: ubuntu-latest - timeout-minutes: 15 - strategy: - fail-fast: false - matrix: - recipe: ${{ fromJSON(needs.discover.outputs.tier1) }} - deployer: [helm, argocd-oci, argocd-helm-oci, argocd-git, flux-oci, flux-git] - steps: - - name: Checkout Code - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 - with: - persist-credentials: false - - - name: Load versions - id: versions - uses: ./.github/actions/load-versions - - - name: Run KWOK test - uses: ./.github/actions/kwok-test - with: - recipe: ${{ matrix.recipe }} - deployer: ${{ matrix.deployer }} - go_version: ${{ steps.versions.outputs.go }} - goreleaser_version: ${{ steps.versions.outputs.goreleaser }} - kind_version: ${{ steps.versions.outputs.kind }} - helm_version: ${{ steps.versions.outputs.helm }} - kwok_version: ${{ steps.versions.outputs.kwok }} - kubectl_version: ${{ steps.versions.outputs.kubectl }} - yq_version: ${{ steps.versions.outputs.yq }} - flux_version: ${{ steps.versions.outputs.flux }} - chainsaw_version: ${{ steps.versions.outputs.chainsaw }} - chainsaw_sha256: ${{ steps.versions.outputs.chainsaw_sha256_linux_amd64 }} - kind_node_image: ${{ steps.versions.outputs.kind_node_image }} + needs.discover.outputs.tier1_pairs != '[]' && + needs.discover.outputs.tier1_pairs != '' + uses: ./.github/workflows/kwok-test-run.yaml + with: + pairs: ${{ needs.discover.outputs.tier1_pairs }} # ── Tier 2: diff-aware accelerator tests (PR only, conditional) ── + # Coverage-policy decision: Tier 2 uses helm only (see ADR-003 §"Tier 2 + # deployer coverage"). Full deployer coverage runs in Tier 3 on push/nightly. 
test-tier2: - name: 'Tier 2: ${{ matrix.recipe }}' needs: discover if: >- github.event_name == 'pull_request' && - needs.discover.outputs.tier2 != '[]' && - needs.discover.outputs.tier2 != '' - runs-on: ubuntu-latest - timeout-minutes: 15 - strategy: - fail-fast: false - matrix: - recipe: ${{ fromJSON(needs.discover.outputs.tier2) }} - steps: - - name: Checkout Code - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 - with: - persist-credentials: false - - - name: Load versions - id: versions - uses: ./.github/actions/load-versions - - - name: Run KWOK test - uses: ./.github/actions/kwok-test - with: - recipe: ${{ matrix.recipe }} - go_version: ${{ steps.versions.outputs.go }} - goreleaser_version: ${{ steps.versions.outputs.goreleaser }} - kind_version: ${{ steps.versions.outputs.kind }} - helm_version: ${{ steps.versions.outputs.helm }} - kwok_version: ${{ steps.versions.outputs.kwok }} - kubectl_version: ${{ steps.versions.outputs.kubectl }} - yq_version: ${{ steps.versions.outputs.yq }} - flux_version: ${{ steps.versions.outputs.flux }} - chainsaw_version: ${{ steps.versions.outputs.chainsaw }} - chainsaw_sha256: ${{ steps.versions.outputs.chainsaw_sha256_linux_amd64 }} - kind_node_image: ${{ steps.versions.outputs.kind_node_image }} + needs.discover.outputs.tier2_pairs != '[]' && + needs.discover.outputs.tier2_pairs != '' + uses: ./.github/workflows/kwok-test-run.yaml + with: + pairs: ${{ needs.discover.outputs.tier2_pairs }} # ── Tier 3: full matrix (push to main + nightly schedule) ── - # The recipe × deployer cross-product exceeds GitHub's 256-config matrix cap, - # so discover splits it into batches and we fan each batch out to the - # kwok-tier3-shard reusable workflow (one shard per batch, each <= 256). - # Per ADR-003: the concurrency group is keyed by SHA so successive merges to - # main never cancel in-flight Tier 3 runs; the batch id keeps every shard of a - # single run in its own group so they all run in parallel. 
+ # The recipe × deployer cross-product can exceed GitHub's 256-config cap, so + # discover batches the pairs and we fan each batch out to kwok-test-run.yaml + # (one shard per batch, each <= 256 pairs). Per ADR-003: concurrency is keyed + # by SHA so successive merges never cancel in-flight Tier 3 runs; the batch id + # keeps every shard of a single run in its own group so they all run in parallel. test-tier3: needs: discover concurrency: @@ -381,7 +365,7 @@ jobs: fail-fast: false matrix: batch: ${{ fromJSON(needs.discover.outputs.tier3_batches) }} - uses: ./.github/workflows/kwok-tier3-shard.yaml + uses: ./.github/workflows/kwok-test-run.yaml with: pairs: ${{ toJSON(matrix.batch.pairs) }} diff --git a/.github/workflows/kwok-tier3-shard.yaml b/.github/workflows/kwok-test-run.yaml similarity index 74% rename from .github/workflows/kwok-tier3-shard.yaml rename to .github/workflows/kwok-test-run.yaml index 13590aff9..0313ba4f2 100644 --- a/.github/workflows/kwok-tier3-shard.yaml +++ b/.github/workflows/kwok-test-run.yaml @@ -12,18 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Reusable shard for the Tier 3 KWOK full matrix. GitHub caps a single job's -# matrix at 256 configurations; the caller (kwok-recipes.yaml) splits the full -# recipe × deployer cross-product into batches of <= 256 pairs and invokes this -# workflow once per batch, so the per-shard matrix always stays under the cap. -name: KWOK Tier 3 Shard +# Reusable KWOK test runner shared by all three tiers (kwok-recipes.yaml). +# Takes a JSON array of {recipe, deployer} pairs and runs each as a matrix job. +# +# Tier 1 passes a flat pairs list (well under 256 → no batching needed). +# Tier 2 passes a flat pairs list (helm-only for diff-affected recipes). 
+# Tier 3 passes one batch at a time from the discover job's tier3_batches output; +# the caller (test-tier3 in kwok-recipes.yaml) shards across batches to keep +# every shard's matrix under GitHub's 256-configuration cap. +name: KWOK Test Run on: workflow_call: inputs: pairs: - # Caller batches at TIER3_BATCH_SIZE (kwok-recipes.yaml); must never - # exceed GitHub's hard cap of 256 matrix configurations per job. + # JSON array of {recipe, deployer} objects. Must never exceed GitHub's + # hard cap of 256 matrix configurations per job. description: 'JSON array of {recipe, deployer} objects to test (<= 256 entries)' required: true type: string @@ -33,7 +37,7 @@ permissions: jobs: test: - name: 'Tier 3: ${{ matrix.pair.recipe }} (${{ matrix.pair.deployer }})' + name: '${{ matrix.pair.recipe }} (${{ matrix.pair.deployer }})' runs-on: ubuntu-latest timeout-minutes: 15 strategy: diff --git a/docs/design/003-scaling-recipe-tests.md b/docs/design/003-scaling-recipe-tests.md index 076aef3f1..98865167e 100644 --- a/docs/design/003-scaling-recipe-tests.md +++ b/docs/design/003-scaling-recipe-tests.md @@ -2,12 +2,16 @@ ## Status **Accepted, implemented** — 2026-03-18 The tiered KWOK scaling strategy has shipped: tier discovery and per-overlay parallel jobs run in `.github/workflows/kwok-recipes.yaml`, with the Tier 3 shard lane in `.github/workflows/kwok-tier3-shard.yaml`, backed by the `kwok/scripts/` and `kwok/profiles/` machinery. + +**Updated (workflow consolidation, #1172):** all three tiers now run through the +`.github/workflows/kwok-test-run.yaml` reusable workflow, which replaces +`kwok-tier3-shard.yaml`. ## Scope @@ -150,7 +154,7 @@ To stay under the limit without sacrificing coverage, the `discover` job builds the `{recipe, deployer}` pairs and chunks them into batches of `TIER3_BATCH_SIZE` (200, with headroom under 256), emitting a `tier3_batches` output of `{id, pairs}` objects. 
`test-tier3` is a thin matrix over those batches that fans -each one out to the **`kwok-tier3-shard.yaml`** reusable workflow, which expands +each one out to the **`kwok-test-run.yaml`** reusable workflow, which expands its batch as its own (≤ 256) matrix. Batch count grows automatically as overlays are added — no manual job duplication. A fail-closed guard in `discover` errors loudly if `TIER3_BATCH_SIZE` is ever raised past 256, rather than resurfacing @@ -163,19 +167,18 @@ and a summary. Tier 3 fans out to a reusable shard workflow (see above): ``` discover -├── tier1: [eks, aks, gke, kind, ...] # generic only -├── tier2: [h100-eks-ubuntu-training, ...] # diff-affected only -├── tier3: [all 72+] # full overlay set -└── tier3_batches: [{id, pairs:[{recipe,deployer}]}] # cross-product, chunked ≤256 +├── tier1_pairs: [{recipe,deployer}] # generic overlays × all deployers +├── tier2_pairs: [{recipe, deployer:"helm"}] # diff-affected overlays, helm-only +└── tier3_batches: [{id, pairs:[{recipe,deployer}]}] # all overlays × all deployers, chunked ≤256 test-tier1 (PR + push to main) - matrix: tier1 × deployer + uses kwok-test-run.yaml pairs=tier1_pairs test-tier2 (PR only, skip if empty) - matrix: tier2 × deployer + uses kwok-test-run.yaml pairs=tier2_pairs [helm-only] test-tier3 (push to main + schedule, skip on PR) - matrix: tier3_batches → uses kwok-tier3-shard.yaml (matrix: pairs) + matrix: tier3_batches → uses kwok-test-run.yaml (matrix: pairs) summary needs: [test-tier1, test-tier2, test-tier3] @@ -194,6 +197,17 @@ The `summary` job gates on Tier 1 and Tier 2 for PRs, and on all three tiers for pushes to `main`. This avoids branch protection brittleness when the overlay set changes. +### Tier 2 deployer coverage + +Tier 2 is **helm-only** by deliberate policy (#1172). The old Tier 2 passed no +deployer argument to the action, which already defaulted to `helm` — this makes +that behavior explicit. 
Full deployer coverage (all deployers × all overlays) +runs in Tier 3 on every push to `main` and on the nightly schedule. + +To add full deployer coverage to Tier 2, change `tier2_pairs` in the `discover` +classify step to cross the recipe list with the full `DEPLOYERS` array (same +pattern as `tier1_pairs`). + ## Consequences ### Positive