From 8de6e8f3daa42657a05a081c48d58c692b805228 Mon Sep 17 00:00:00 2001 From: Mohit Date: Sun, 28 Jun 2026 22:06:50 +0530 Subject: [PATCH 1/6] feat(ci): consolidate KWOK tier workflows into single reusable runner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace kwok-tier3-shard.yaml with kwok-test-run.yaml; the checkout/ load-versions/kwok-test step block now lives in exactly one place - Define readonly DEPLOYERS once in discover; Tier 1, Tier 2, and Tier 3 all derive from it — no more hardcoded matrix axes or sync comments - Emit tier1_pairs and tier2_pairs from discover; all three tiers call kwok-test-run.yaml with pre-built {recipe,deployer} pairs - Tier 2 deployer coverage: helm-only (deliberate; full coverage in Tier 3) - Fix workflow_dispatch early-exit to also emit tier1_pairs/tier2_pairs - Add Tier 1 pair-count guard (warning at >200) - Update paths triggers and ADR-003 workflow diagram Closes #1172 --- .github/workflows/kwok-recipes.yaml | 108 ++++++++++++++---- ...ok-tier3-shard.yaml => kwok-test-run.yaml} | 21 ++-- 2 files changed, 99 insertions(+), 30 deletions(-) rename .github/workflows/{kwok-tier3-shard.yaml => kwok-test-run.yaml} (73%) diff --git a/.github/workflows/kwok-recipes.yaml b/.github/workflows/kwok-recipes.yaml index fec1fdcae..90f65f5f5 100644 --- a/.github/workflows/kwok-recipes.yaml +++ b/.github/workflows/kwok-recipes.yaml @@ -23,7 +23,7 @@ on: - 'kwok/**' - 'tests/chainsaw/kwok/**' - '.github/workflows/kwok-recipes.yaml' - - '.github/workflows/kwok-tier3-shard.yaml' + - '.github/workflows/kwok-test-run.yaml' - '.github/actions/kwok-test/**' - '!**.md' pull_request: @@ -34,7 +34,7 @@ on: - 'kwok/**' - 'tests/chainsaw/kwok/**' - '.github/workflows/kwok-recipes.yaml' - - '.github/workflows/kwok-tier3-shard.yaml' + - '.github/workflows/kwok-test-run.yaml' - '.github/actions/kwok-test/**' - '!**.md' schedule: @@ -66,6 +66,8 @@ jobs: tier2: ${{ steps.classify.outputs.tier2 }} tier3: ${{ 
steps.classify.outputs.tier3 }} tier3_batches: ${{ steps.classify.outputs.tier3_batches }} + tier1_pairs: ${{ steps.classify.outputs.tier1_pairs }} + tier2_pairs: ${{ steps.classify.outputs.tier2_pairs }} steps: - name: Checkout uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 @@ -82,13 +84,25 @@ jobs: run: | set -euo pipefail + # Deployer list — single source of truth, consumed by all tiers and by + # the workflow_dispatch early-exit below. To add or remove a deployer, + # change this one line; Tier 1, Tier 2, and Tier 3 all derive from it. + readonly DEPLOYERS='["helm","argocd-oci","argocd-helm-oci","argocd-git","flux-oci","flux-git"]' + # --- workflow_dispatch: test exactly the requested recipe --- if [[ -n "${DISPATCH_RECIPE}" ]]; then single=$(jq -nc '[$r]' --arg r "${DISPATCH_RECIPE}") - echo "tier1=${single}" >> "$GITHUB_OUTPUT" - echo "tier2=[]" >> "$GITHUB_OUTPUT" - echo "tier3=[]" >> "$GITHUB_OUTPUT" - echo "tier3_batches=[]" >> "$GITHUB_OUTPUT" + single_pairs=$(jq -cn \ + --arg r "${DISPATCH_RECIPE}" \ + --argjson deployers "${DEPLOYERS}" ' + [ $deployers[] | {recipe: $r, deployer: .} ] + ') + echo "tier1=${single}" >> "$GITHUB_OUTPUT" + echo "tier2=[]" >> "$GITHUB_OUTPUT" + echo "tier3=[]" >> "$GITHUB_OUTPUT" + echo "tier3_batches=[]" >> "$GITHUB_OUTPUT" + echo "tier1_pairs=${single_pairs}" >> "$GITHUB_OUTPUT" + echo "tier2_pairs=[]" >> "$GITHUB_OUTPUT" echo "Manual dispatch: ${DISPATCH_RECIPE}" exit 0 fi @@ -219,16 +233,47 @@ jobs: # --- Tier 3: full matrix (all testable overlays) --- tier3="$all" + # Local alias so jq --argjson calls below can use $deployers + # (DEPLOYERS is readonly; this assignment is intentional). 
+ # shellcheck disable=SC2034 + deployers="${DEPLOYERS}" + + # --- Pre-build Tier 1 pairs: generic recipes × all deployers --- + tier1_pairs=$(jq -cn \ + --argjson recipes "$tier1" \ + --argjson deployers "$deployers" ' + [ $recipes[] as $r | $deployers[] as $d | {recipe: $r, deployer: $d} ] + ') + + # Guard: Tier 1 is expected to stay well under 256 (no batching needed). + # Warn early if organic growth is approaching the limit. + tier1_pair_count=$(echo "$tier1_pairs" | jq 'length') + if (( tier1_pair_count > 200 )); then + echo "::warning::Tier 1 has ${tier1_pair_count} pairs (>200) — consider adding batching before it reaches 256" + fi + + # --- Pre-build Tier 2 pairs: diff-affected recipes, helm-only --- + # Coverage-policy decision: Tier 2 uses helm only to keep PR wall-clock + # time proportional to the change scope. Full deployer coverage runs in + # Tier 3 on every push to main and on the nightly schedule. See ADR-003 + # §"Tier 2 deployer coverage" for rationale and how to revisit this. + tier2_pairs=$(echo "$tier2" | jq -c '[.[] | {recipe: ., deployer: "helm"}]') + # --- Tier 3 batching --- # GitHub caps a single job's matrix at 256 configurations. Tier 3 # crosses every testable recipe with every deployer, so the raw - # cross-product (recipes × deployers) outgrew the cap. Split the + # cross-product (recipes × deployers) can outgrow the cap. Split the # {recipe, deployer} pairs into batches of <= TIER3_BATCH_SIZE; the +<<<<<<< Updated upstream # caller fans each batch out to kwok-tier3-shard.yaml, keeping every # shard's matrix under the limit. Keep this deployer list in sync # with the test-tier1 matrix above and the input doc in # .github/actions/kwok-test/action.yml. deployers='["helm","argocd-oci","argocd-helm-oci","flux-oci"]' +======= + # caller fans each batch out to kwok-test-run.yaml, keeping every + # shard's matrix under the limit. 
+>>>>>>> Stashed changes readonly TIER3_BATCH_SIZE=200 # headroom under GitHub's 256 cap # Fail closed if the batch size is ever raised past the hard limit — @@ -250,23 +295,26 @@ jobs: ') # --- Output --- - echo "tier1=${tier1}" >> "$GITHUB_OUTPUT" - echo "tier2=${tier2}" >> "$GITHUB_OUTPUT" - echo "tier3=${tier3}" >> "$GITHUB_OUTPUT" + echo "tier1=${tier1}" >> "$GITHUB_OUTPUT" + echo "tier2=${tier2}" >> "$GITHUB_OUTPUT" + echo "tier3=${tier3}" >> "$GITHUB_OUTPUT" echo "tier3_batches=${tier3_batches}" >> "$GITHUB_OUTPUT" + echo "tier1_pairs=${tier1_pairs}" >> "$GITHUB_OUTPUT" + echo "tier2_pairs=${tier2_pairs}" >> "$GITHUB_OUTPUT" + deployer_count=$(echo "$deployers" | jq 'length') tier3_pairs=$(echo "$tier3_batches" | jq '[.[].pairs[]] | length') tier3_batch_count=$(echo "$tier3_batches" | jq 'length') - echo "Tier 1 (generic): $(echo "$tier1" | jq 'length') recipe(s)" - echo "Tier 2 (diff-aware): $(echo "$tier2" | jq 'length') recipe(s)" - echo "Tier 3 (full matrix): $(echo "$tier3" | jq 'length') recipe(s) × $(echo "$deployers" | jq 'length') deployer(s) = ${tier3_pairs} pair(s) in ${tier3_batch_count} batch(es)" + echo "Tier 1 (generic): $(echo "$tier1" | jq 'length') recipe(s) × ${deployer_count} deployer(s) = ${tier1_pair_count} pair(s)" + echo "Tier 2 (diff-aware): $(echo "$tier2" | jq 'length') recipe(s) × 1 deployer (helm) = $(echo "$tier2_pairs" | jq 'length') pair(s)" + echo "Tier 3 (full matrix): $(echo "$tier3" | jq 'length') recipe(s) × ${deployer_count} deployer(s) = ${tier3_pairs} pair(s) in ${tier3_batch_count} batch(es)" # ── Tier 1: PR gate — generic overlays (PR + push, skip on schedule) ── test-tier1: - name: 'Tier 1: ${{ matrix.recipe }} (${{ matrix.deployer }})' needs: discover if: >- github.event_name != 'schedule' && +<<<<<<< Updated upstream needs.discover.outputs.tier1 != '[]' && needs.discover.outputs.tier1 != '' runs-on: ubuntu-latest @@ -302,13 +350,22 @@ jobs: chainsaw_version: ${{ steps.versions.outputs.chainsaw }} chainsaw_sha256: 
${{ steps.versions.outputs.chainsaw_sha256_linux_amd64 }} kind_node_image: ${{ steps.versions.outputs.kind_node_image }} +======= + needs.discover.outputs.tier1_pairs != '[]' && + needs.discover.outputs.tier1_pairs != '' + uses: ./.github/workflows/kwok-test-run.yaml + with: + pairs: ${{ needs.discover.outputs.tier1_pairs }} +>>>>>>> Stashed changes # ── Tier 2: diff-aware accelerator tests (PR only, conditional) ── + # Coverage-policy decision: Tier 2 uses helm only (see ADR-003 §"Tier 2 + # deployer coverage"). Full deployer coverage runs in Tier 3 on push/nightly. test-tier2: - name: 'Tier 2: ${{ matrix.recipe }}' needs: discover if: >- github.event_name == 'pull_request' && +<<<<<<< Updated upstream needs.discover.outputs.tier2 != '[]' && needs.discover.outputs.tier2 != '' runs-on: ubuntu-latest @@ -342,14 +399,20 @@ jobs: chainsaw_version: ${{ steps.versions.outputs.chainsaw }} chainsaw_sha256: ${{ steps.versions.outputs.chainsaw_sha256_linux_amd64 }} kind_node_image: ${{ steps.versions.outputs.kind_node_image }} +======= + needs.discover.outputs.tier2_pairs != '[]' && + needs.discover.outputs.tier2_pairs != '' + uses: ./.github/workflows/kwok-test-run.yaml + with: + pairs: ${{ needs.discover.outputs.tier2_pairs }} +>>>>>>> Stashed changes # ── Tier 3: full matrix (push to main + nightly schedule) ── - # The recipe × deployer cross-product exceeds GitHub's 256-config matrix cap, - # so discover splits it into batches and we fan each batch out to the - # kwok-tier3-shard reusable workflow (one shard per batch, each <= 256). - # Per ADR-003: the concurrency group is keyed by SHA so successive merges to - # main never cancel in-flight Tier 3 runs; the batch id keeps every shard of a - # single run in its own group so they all run in parallel. + # The recipe × deployer cross-product can exceed GitHub's 256-config cap, so + # discover batches the pairs and we fan each batch out to kwok-test-run.yaml + # (one shard per batch, each <= 256 pairs). 
Per ADR-003: concurrency is keyed + # by SHA so successive merges never cancel in-flight Tier 3 runs; the batch id + # keeps every shard of a single run in its own group so they all run in parallel. test-tier3: needs: discover concurrency: @@ -363,7 +426,7 @@ jobs: fail-fast: false matrix: batch: ${{ fromJSON(needs.discover.outputs.tier3_batches) }} - uses: ./.github/workflows/kwok-tier3-shard.yaml + uses: ./.github/workflows/kwok-test-run.yaml with: pairs: ${{ toJSON(matrix.batch.pairs) }} @@ -416,3 +479,4 @@ jobs: fi echo "All recipe validations passed" >> $GITHUB_STEP_SUMMARY + \ No newline at end of file diff --git a/.github/workflows/kwok-tier3-shard.yaml b/.github/workflows/kwok-test-run.yaml similarity index 73% rename from .github/workflows/kwok-tier3-shard.yaml rename to .github/workflows/kwok-test-run.yaml index fdfa0140b..92f4735ae 100644 --- a/.github/workflows/kwok-tier3-shard.yaml +++ b/.github/workflows/kwok-test-run.yaml @@ -12,18 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Reusable shard for the Tier 3 KWOK full matrix. GitHub caps a single job's -# matrix at 256 configurations; the caller (kwok-recipes.yaml) splits the full -# recipe × deployer cross-product into batches of <= 256 pairs and invokes this -# workflow once per batch, so the per-shard matrix always stays under the cap. -name: KWOK Tier 3 Shard +# Reusable KWOK test runner shared by all three tiers (kwok-recipes.yaml). +# Takes a JSON array of {recipe, deployer} pairs and runs each as a matrix job. +# +# Tier 1 passes a flat pairs list (well under 256 → no batching needed). +# Tier 2 passes a flat pairs list (helm-only for diff-affected recipes). +# Tier 3 passes one batch at a time from the discover job's tier3_batches output; +# the caller (test-tier3 in kwok-recipes.yaml) shards across batches to keep +# every shard's matrix under GitHub's 256-configuration cap. 
+name: KWOK Test Run on: workflow_call: inputs: pairs: - # Caller batches at TIER3_BATCH_SIZE (kwok-recipes.yaml); must never - # exceed GitHub's hard cap of 256 matrix configurations per job. + # JSON array of {recipe, deployer} objects. Must never exceed GitHub's + # hard cap of 256 matrix configurations per job. description: 'JSON array of {recipe, deployer} objects to test (<= 256 entries)' required: true type: string @@ -33,7 +37,7 @@ permissions: jobs: test: - name: 'Tier 3: ${{ matrix.pair.recipe }} (${{ matrix.pair.deployer }})' + name: '${{ matrix.pair.recipe }} (${{ matrix.pair.deployer }})' runs-on: ubuntu-latest timeout-minutes: 15 strategy: @@ -66,3 +70,4 @@ jobs: chainsaw_version: ${{ steps.versions.outputs.chainsaw }} chainsaw_sha256: ${{ steps.versions.outputs.chainsaw_sha256_linux_amd64 }} kind_node_image: ${{ steps.versions.outputs.kind_node_image }} + \ No newline at end of file From f5d8fdba6d1e7dc2b40fd37761812b350223c895 Mon Sep 17 00:00:00 2001 From: Mohit Date: Mon, 29 Jun 2026 13:28:18 +0530 Subject: [PATCH 2/6] feat(ci): consolidate KWOK tier workflows into single reusable runner - Replace kwok-tier3-shard.yaml with kwok-test-run.yaml (step block in one place) - readonly DEPLOYERS single source of truth; Tier 1/2/3 all derive from it - discover emits tier1_pairs + tier2_pairs; all tiers call kwok-test-run.yaml - Tier 2 helm-only (deliberate, documented in ADR-003) - Fix workflow_dispatch early-exit to emit tier1_pairs/tier2_pairs Closes #1172 --- .github/workflows/kwok-recipes.yaml | 97 ++-------------------------- .github/workflows/kwok-test-run.yaml | 2 +- 2 files changed, 8 insertions(+), 91 deletions(-) diff --git a/.github/workflows/kwok-recipes.yaml b/.github/workflows/kwok-recipes.yaml index 90f65f5f5..af9918e94 100644 --- a/.github/workflows/kwok-recipes.yaml +++ b/.github/workflows/kwok-recipes.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -70,7 +70,7 @@ jobs: tier2_pairs: ${{ steps.classify.outputs.tier2_pairs }} steps: - name: Checkout - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 with: persist-credentials: false # Full checkout needed for diff-aware Tier 2 discovery @@ -115,8 +115,8 @@ jobs: name=$(basename "$overlay" .yaml) service=$(yq eval '.spec.criteria.service // ""' "$overlay" 2>/dev/null || true) - # Skip overlays without a concrete service - if [[ -z "$service" || "$service" == "null" || "$service" == "any" ]]; then + # Skip non-testable overlays (no service, or OCP — needs OpenShift operators) + if [[ -z "$service" || "$service" == "null" || "$service" == "any" || "$service" == "ocp" ]]; then continue fi @@ -154,8 +154,8 @@ jobs: service=$(yq eval '.spec.criteria.service // ""' "$overlay" 2>/dev/null || true) accel=$(yq eval '.spec.criteria.accelerator // ""' "$overlay" 2>/dev/null || true) - # Only accelerator-specific overlays belong in Tier 2 - if [[ -z "$service" || "$service" == "null" || "$service" == "any" ]]; then + # Only accelerator-specific, KWOK-testable overlays belong in Tier 2 + if [[ -z "$service" || "$service" == "null" || "$service" == "any" || "$service" == "ocp" ]]; then continue fi if [[ -z "$accel" || "$accel" == "null" || "$accel" == "any" ]]; then @@ -264,16 +264,8 @@ jobs: # crosses every testable recipe with every deployer, so the raw # cross-product (recipes × deployers) can outgrow the cap. Split the # {recipe, deployer} pairs into batches of <= TIER3_BATCH_SIZE; the -<<<<<<< Updated upstream - # caller fans each batch out to kwok-tier3-shard.yaml, keeping every - # shard's matrix under the limit. 
Keep this deployer list in sync - # with the test-tier1 matrix above and the input doc in - # .github/actions/kwok-test/action.yml. - deployers='["helm","argocd-oci","argocd-helm-oci","flux-oci"]' -======= # caller fans each batch out to kwok-test-run.yaml, keeping every # shard's matrix under the limit. ->>>>>>> Stashed changes readonly TIER3_BATCH_SIZE=200 # headroom under GitHub's 256 cap # Fail closed if the batch size is ever raised past the hard limit — @@ -314,49 +306,11 @@ jobs: needs: discover if: >- github.event_name != 'schedule' && -<<<<<<< Updated upstream - needs.discover.outputs.tier1 != '[]' && - needs.discover.outputs.tier1 != '' - runs-on: ubuntu-latest - timeout-minutes: 15 - strategy: - fail-fast: false - matrix: - recipe: ${{ fromJSON(needs.discover.outputs.tier1) }} - deployer: [helm, argocd-oci, argocd-helm-oci, flux-oci] - steps: - - name: Checkout Code - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - with: - persist-credentials: false - - - name: Load versions - id: versions - uses: ./.github/actions/load-versions - - - name: Run KWOK test - uses: ./.github/actions/kwok-test - with: - recipe: ${{ matrix.recipe }} - deployer: ${{ matrix.deployer }} - go_version: ${{ steps.versions.outputs.go }} - goreleaser_version: ${{ steps.versions.outputs.goreleaser }} - kind_version: ${{ steps.versions.outputs.kind }} - helm_version: ${{ steps.versions.outputs.helm }} - kwok_version: ${{ steps.versions.outputs.kwok }} - kubectl_version: ${{ steps.versions.outputs.kubectl }} - yq_version: ${{ steps.versions.outputs.yq }} - flux_version: ${{ steps.versions.outputs.flux }} - chainsaw_version: ${{ steps.versions.outputs.chainsaw }} - chainsaw_sha256: ${{ steps.versions.outputs.chainsaw_sha256_linux_amd64 }} - kind_node_image: ${{ steps.versions.outputs.kind_node_image }} -======= needs.discover.outputs.tier1_pairs != '[]' && needs.discover.outputs.tier1_pairs != '' uses: ./.github/workflows/kwok-test-run.yaml with: pairs: ${{ 
needs.discover.outputs.tier1_pairs }} ->>>>>>> Stashed changes # ── Tier 2: diff-aware accelerator tests (PR only, conditional) ── # Coverage-policy decision: Tier 2 uses helm only (see ADR-003 §"Tier 2 @@ -365,47 +319,11 @@ jobs: needs: discover if: >- github.event_name == 'pull_request' && -<<<<<<< Updated upstream - needs.discover.outputs.tier2 != '[]' && - needs.discover.outputs.tier2 != '' - runs-on: ubuntu-latest - timeout-minutes: 15 - strategy: - fail-fast: false - matrix: - recipe: ${{ fromJSON(needs.discover.outputs.tier2) }} - steps: - - name: Checkout Code - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - with: - persist-credentials: false - - - name: Load versions - id: versions - uses: ./.github/actions/load-versions - - - name: Run KWOK test - uses: ./.github/actions/kwok-test - with: - recipe: ${{ matrix.recipe }} - go_version: ${{ steps.versions.outputs.go }} - goreleaser_version: ${{ steps.versions.outputs.goreleaser }} - kind_version: ${{ steps.versions.outputs.kind }} - helm_version: ${{ steps.versions.outputs.helm }} - kwok_version: ${{ steps.versions.outputs.kwok }} - kubectl_version: ${{ steps.versions.outputs.kubectl }} - yq_version: ${{ steps.versions.outputs.yq }} - flux_version: ${{ steps.versions.outputs.flux }} - chainsaw_version: ${{ steps.versions.outputs.chainsaw }} - chainsaw_sha256: ${{ steps.versions.outputs.chainsaw_sha256_linux_amd64 }} - kind_node_image: ${{ steps.versions.outputs.kind_node_image }} -======= needs.discover.outputs.tier2_pairs != '[]' && needs.discover.outputs.tier2_pairs != '' uses: ./.github/workflows/kwok-test-run.yaml with: pairs: ${{ needs.discover.outputs.tier2_pairs }} ->>>>>>> Stashed changes # ── Tier 3: full matrix (push to main + nightly schedule) ── # The recipe × deployer cross-product can exceed GitHub's 256-config cap, so @@ -478,5 +396,4 @@ jobs: exit 1 fi - echo "All recipe validations passed" >> $GITHUB_STEP_SUMMARY - \ No newline at end of file + echo "All recipe 
validations passed" >> $GITHUB_STEP_SUMMARY \ No newline at end of file diff --git a/.github/workflows/kwok-test-run.yaml b/.github/workflows/kwok-test-run.yaml index 92f4735ae..e63a02f05 100644 --- a/.github/workflows/kwok-test-run.yaml +++ b/.github/workflows/kwok-test-run.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From f2a8c8f6e651d4720c44d39cb15aba0dec0b840b Mon Sep 17 00:00:00 2001 From: Mohit Date: Mon, 29 Jun 2026 16:05:09 +0530 Subject: [PATCH 3/6] feat(ci): consolidate KWOK tier workflows into reusable runner --- .github/workflows/kwok-recipes.yaml | 3 ++- .github/workflows/kwok-test-run.yaml | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/kwok-recipes.yaml b/.github/workflows/kwok-recipes.yaml index af9918e94..b34cfbbc8 100644 --- a/.github/workflows/kwok-recipes.yaml +++ b/.github/workflows/kwok-recipes.yaml @@ -396,4 +396,5 @@ jobs: exit 1 fi - echo "All recipe validations passed" >> $GITHUB_STEP_SUMMARY \ No newline at end of file + echo "All recipe validations passed" >> $GITHUB_STEP_SUMMARY + \ No newline at end of file diff --git a/.github/workflows/kwok-test-run.yaml b/.github/workflows/kwok-test-run.yaml index e63a02f05..e20349d1b 100644 --- a/.github/workflows/kwok-test-run.yaml +++ b/.github/workflows/kwok-test-run.yaml @@ -70,4 +70,3 @@ jobs: chainsaw_version: ${{ steps.versions.outputs.chainsaw }} chainsaw_sha256: ${{ steps.versions.outputs.chainsaw_sha256_linux_amd64 }} kind_node_image: ${{ steps.versions.outputs.kind_node_image }} - \ No newline at end of file From b0d8ef02c9475f2bd5a67cc5055072f49bfc561c Mon Sep 17 00:00:00 2001 From: Mohit Date: Tue, 30 Jun 2026 12:50:48 +0530 Subject: [PATCH 4/6] fix(ci): remove UTF-8 BOM and normalize 
line endings in kwok workflow files --- .github/workflows/kwok-recipes.yaml | 2 +- .github/workflows/kwok-test-run.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/kwok-recipes.yaml b/.github/workflows/kwok-recipes.yaml index f785ce00c..bebbd8320 100644 --- a/.github/workflows/kwok-recipes.yaml +++ b/.github/workflows/kwok-recipes.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/.github/workflows/kwok-test-run.yaml b/.github/workflows/kwok-test-run.yaml index b921afa03..0313ba4f2 100644 --- a/.github/workflows/kwok-test-run.yaml +++ b/.github/workflows/kwok-test-run.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 79bd45cceeca460a2ef07ef8d9e42d8739a9d4e8 Mon Sep 17 00:00:00 2001 From: Mohit Date: Tue, 30 Jun 2026 13:57:13 +0530 Subject: [PATCH 5/6] fix(ci): hard-fail Tier 1 matrix before exceeding 256-config cap --- .github/workflows/kwok-recipes.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/kwok-recipes.yaml b/.github/workflows/kwok-recipes.yaml index bebbd8320..8a8daab49 100644 --- a/.github/workflows/kwok-recipes.yaml +++ b/.github/workflows/kwok-recipes.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -266,7 +266,10 @@ jobs: # Guard: Tier 1 is expected to stay well under 256 (no batching needed). # Warn early if organic growth is approaching the limit. tier1_pair_count=$(echo "$tier1_pairs" | jq 'length') - if (( tier1_pair_count > 200 )); then + if (( tier1_pair_count > 256 )); then + echo "::error::Tier 1 has ${tier1_pair_count} pairs (>256) — add batching before passing this to kwok-test-run.yaml" + exit 1 + elif (( tier1_pair_count > 200 )); then echo "::warning::Tier 1 has ${tier1_pair_count} pairs (>200) — consider adding batching before it reaches 256" fi From e234dfd81af9a46f61ce473d9ec25c7a5c7912db Mon Sep 17 00:00:00 2001 From: Mohit Date: Thu, 2 Jul 2026 00:33:31 +0530 Subject: [PATCH 6/6] fix(ci): hard-fail Tier 1 matrix before exceeding 256-config cap --- .github/workflows/kwok-recipes.yaml | 3 ++- docs/design/003-scaling-recipe-tests.md | 30 ++++++++++++++++++------- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/.github/workflows/kwok-recipes.yaml b/.github/workflows/kwok-recipes.yaml index 8a8daab49..0ec768486 100644 --- a/.github/workflows/kwok-recipes.yaml +++ b/.github/workflows/kwok-recipes.yaml @@ -299,7 +299,7 @@ jobs: tier3_batches=$(jq -cn \ --argjson recipes "$tier3" \ - --argjson deployers "$deployers" \ + --argjson deployers "${DEPLOYERS}" \ --argjson size "$TIER3_BATCH_SIZE" ' [ $recipes[] as $r | $deployers[] as $d | {recipe: $r, deployer: $d} ] | [ range(0; length; $size) as $i @@ -418,3 +418,4 @@ jobs: fi echo "All recipe validations passed" >> $GITHUB_STEP_SUMMARY + \ No newline at end of file diff --git a/docs/design/003-scaling-recipe-tests.md b/docs/design/003-scaling-recipe-tests.md index 076aef3f1..98865167e 100644 --- a/docs/design/003-scaling-recipe-tests.md +++ b/docs/design/003-scaling-recipe-tests.md @@ -2,12 +2,16 @@ ## Status **Accepted,
implemented** — 2026-03-18 The tiered KWOK scaling strategy has shipped: tier discovery and per-overlay parallel jobs run in `.github/workflows/kwok-recipes.yaml`, with the Tier 3 shard lane in `.github/workflows/kwok-tier3-shard.yaml`, backed by the `kwok/scripts/` and `kwok/profiles/` machinery. + +**Updated** (workflow consolidation, #1172): the Tier 3 shard lane is now the +shared runner `.github/workflows/kwok-test-run.yaml`, which replaces +`kwok-tier3-shard.yaml` and is called by all three tiers. ## Scope @@ -150,7 +154,7 @@ To stay under the limit without sacrificing coverage, the `discover` job builds the `{recipe, deployer}` pairs and chunks them into batches of `TIER3_BATCH_SIZE` (200, with headroom under 256), emitting a `tier3_batches` output of `{id, pairs}` objects. `test-tier3` is a thin matrix over those batches that fans -each one out to the **`kwok-tier3-shard.yaml`** reusable workflow, which expands +each one out to the **`kwok-test-run.yaml`** reusable workflow, which expands its batch as its own (≤ 256) matrix. Batch count grows automatically as overlays are added — no manual job duplication. A fail-closed guard in `discover` errors loudly if `TIER3_BATCH_SIZE` is ever raised past 256, rather than resurfacing @@ -163,19 +167,18 @@ and a summary. Tier 3 fans out to a reusable shard workflow (see above): ``` discover -├── tier1: [eks, aks, gke, kind, ...] # generic only -├── tier2: [h100-eks-ubuntu-training, ...]
# diff-affected only -├── tier3: [all 72+] # full overlay set -└── tier3_batches: [{id, pairs:[{recipe,deployer}]}] # cross-product, chunked ≤256 +├── tier1_pairs: [{recipe,deployer}] # generic overlays × all deployers +├── tier2_pairs: [{recipe, deployer:"helm"}] # diff-affected overlays, helm-only +└── tier3_batches: [{id, pairs:[{recipe,deployer}]}] # all overlays × all deployers, chunked ≤256 test-tier1 (PR + push to main) - matrix: tier1 × deployer + uses kwok-test-run.yaml pairs=tier1_pairs test-tier2 (PR only, skip if empty) - matrix: tier2 × deployer + uses kwok-test-run.yaml pairs=tier2_pairs [helm-only] test-tier3 (push to main + schedule, skip on PR) - matrix: tier3_batches → uses kwok-tier3-shard.yaml (matrix: pairs) + matrix: tier3_batches → uses kwok-test-run.yaml (matrix: pairs) summary needs: [test-tier1, test-tier2, test-tier3] @@ -194,6 +197,17 @@ The `summary` job gates on Tier 1 and Tier 2 for PRs, and on all three tiers for pushes to `main`. This avoids branch protection brittleness when the overlay set changes. +### Tier 2 deployer coverage + +Tier 2 is **helm-only** by deliberate policy (#1172). The old Tier 2 passed no +deployer argument to the action, which already defaulted to `helm` — this makes +that behavior explicit. Full deployer coverage (all deployers × all overlays) +runs in Tier 3 on every push to `main` and on the nightly schedule. + +To add full deployer coverage to Tier 2, change `tier2_pairs` in the `discover` +classify step to cross the recipe list with the full `DEPLOYERS` array (same +pattern as `tier1_pairs`). + ## Consequences ### Positive