From e66db6ec7e39d4a0e0cca40dae08e25d9ce14540 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Mon, 18 May 2026 10:43:34 -0700 Subject: [PATCH 01/14] run-sweep: gate full-sweep PRs behind a sequential canary When a PR carries `full-sweep-enabled` (and not `evals-only`), pick the lowest-conc single-node benchmark entry as a canary and run it before fanning out the full sweep. If the canary fails, the eight fan-out jobs are skipped to save cluster time on shared failures (bad image tag, removed CLI flag, etc.). Design choices: - Canary candidacy is restricted to single_node['1k1k' | '8k1k'] and excludes entries with run-eval: true, so the canary is always a pure benchmark smoke test using the existing single-node template. - The canary entry is removed from the regular fan-out's matrix (via remaining-search-space-config) only when the canary actually succeeded. On canary skip / cancel / canary-select failure, the regular fan-out falls back to the full search-space-config so coverage is preserved. - The fan-out gate blocks only on `canary-sweep.result == 'failure'` -- every other state (success, skipped, cancelled) proceeds, so a bug in the canary mechanism never blocks the rest of the sweep. - Non-full-sweep PRs, draft PRs, pushes to main, and the reuse path all behave identically to before via existing gates. The aggregated results_bmk artifact picks up both the canary's row and the regular fan-out's rows via the existing bmk_* glob -- each entry appears exactly once. 
--- .github/workflows/run-sweep.yml | 170 ++++++++++++++++++++++++++++---- 1 file changed, 152 insertions(+), 18 deletions(-) diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index c5ece9804..2b08a4a60 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -135,9 +135,88 @@ jobs: --ref "${{ github.ref }}" \ --workflow-id "run-sweep.yml" - sweep-multi-node-1k1k: + canary-select: needs: setup - if: ${{ needs.setup.outputs.reuse-enabled != 'true' && toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['1k1k']) != 'null' }} + if: >- + needs.setup.outputs.reuse-enabled != 'true' && + github.event_name == 'pull_request' && + contains(github.event.pull_request.labels.*.name, 'full-sweep-enabled') && + !contains(github.event.pull_request.labels.*.name, 'evals-only') + runs-on: ubuntu-latest + outputs: + canary-config: ${{ steps.pick.outputs.canary-config }} + remaining-search-space-config: ${{ steps.pick.outputs.remaining-search-space-config }} + steps: + - id: pick + env: + SEARCH_SPACE: ${{ needs.setup.outputs.search-space-config }} + run: | + selection=$(jq -c ' + def remove_one($needle): + if $needle == null then . + else + (index($needle)) as $idx + | if $idx == null then . else del(.[$idx]) end + end; + + # Canary is a benchmark-only smoke test — exclude entries + # whose primary purpose is eval (run-eval == true) so the + # picked canary never runs an eval pass. + (((.single_node["1k1k"] // []) + (.single_node["8k1k"] // [])) + | map(select(.["run-eval"] != true))) as $candidates + | (if ($candidates | length) == 0 then null else ($candidates | min_by(.conc)) end) as $canary + | { + canary: (if $canary == null then [] else [$canary] end), + remaining: ( + . 
+ | .single_node = (.single_node // {}) + | .single_node["1k1k"] = ((.single_node["1k1k"] // []) | remove_one($canary)) + | .single_node["8k1k"] = ((.single_node["8k1k"] // []) | remove_one($canary)) + ) + } + ' <<<"$SEARCH_SPACE") + echo "canary-config=$(jq -c '.canary' <<<"$selection")" >> "$GITHUB_OUTPUT" + echo "remaining-search-space-config=$(jq -c '.remaining' <<<"$selection")" >> "$GITHUB_OUTPUT" + + canary-sweep: + needs: canary-select + if: ${{ needs.canary-select.outputs.canary-config != '' && needs.canary-select.outputs.canary-config != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + name: canary / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.canary-select.outputs.canary-config) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + run-eval: false + + sweep-multi-node-1k1k: + needs: [setup, canary-select, canary-sweep] + if: >- + ${{ + always() && + needs.setup.result == 'success' && + needs.setup.outputs.reuse-enabled != 'true' && + needs.canary-sweep.result != 'failure' && + toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['1k1k']) != 'null' + }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml name: multi-node 1k1k / strategy: @@ -174,8 +253,15 @@ jobs: run-eval: false sweep-multi-node-8k1k: - needs: setup - if: ${{ needs.setup.outputs.reuse-enabled != 'true' && 
toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['8k1k']) != 'null' }} + needs: [setup, canary-select, canary-sweep] + if: >- + ${{ + always() && + needs.setup.result == 'success' && + needs.setup.outputs.reuse-enabled != 'true' && + needs.canary-sweep.result != 'failure' && + toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['8k1k']) != 'null' + }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml name: multi-node 8k1k / strategy: @@ -186,14 +272,22 @@ jobs: with: *multi-node-inputs sweep-single-node-1k1k: - needs: setup - if: ${{ needs.setup.outputs.reuse-enabled != 'true' && toJson(fromJson(needs.setup.outputs.search-space-config).single_node['1k1k']) != 'null' }} + needs: [setup, canary-select, canary-sweep] + if: >- + ${{ + always() && + needs.setup.result == 'success' && + needs.setup.outputs.reuse-enabled != 'true' && + needs.canary-sweep.result != 'failure' && + toJson(fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['1k1k']) != 'null' && + toJson(fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['1k1k']) != '[]' + }} uses: ./.github/workflows/benchmark-tmpl.yml name: single-node 1k1k / strategy: fail-fast: false matrix: - config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['1k1k'] }} + config: ${{ fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['1k1k'] }} secrets: inherit with: &single-node-inputs exp-name: ${{ matrix.config.exp-name }} @@ -215,20 +309,35 @@ jobs: run-eval: ${{ matrix.config.run-eval }} sweep-single-node-8k1k: - needs: setup - if: ${{ needs.setup.outputs.reuse-enabled != 'true' && 
toJson(fromJson(needs.setup.outputs.search-space-config).single_node['8k1k']) != 'null' }} + needs: [setup, canary-select, canary-sweep] + if: >- + ${{ + always() && + needs.setup.result == 'success' && + needs.setup.outputs.reuse-enabled != 'true' && + needs.canary-sweep.result != 'failure' && + toJson(fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['8k1k']) != 'null' && + toJson(fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['8k1k']) != '[]' + }} uses: ./.github/workflows/benchmark-tmpl.yml name: single-node 8k1k / strategy: fail-fast: false matrix: - config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['8k1k'] }} + config: ${{ fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['8k1k'] }} secrets: inherit with: *single-node-inputs sweep-agentic: - needs: setup - if: ${{ needs.setup.outputs.reuse-enabled != 'true' && toJson(fromJson(needs.setup.outputs.search-space-config).single_node['agentic']) != 'null' }} + needs: [setup, canary-select, canary-sweep] + if: >- + ${{ + always() && + needs.setup.result == 'success' && + needs.setup.outputs.reuse-enabled != 'true' && + needs.canary-sweep.result != 'failure' && + toJson(fromJson(needs.setup.outputs.search-space-config).single_node['agentic']) != 'null' + }} uses: ./.github/workflows/benchmark-tmpl.yml name: agentic / strategy: @@ -259,8 +368,15 @@ jobs: scenario-type: agentic-coding sweep-multi-node-agentic: - needs: setup - if: ${{ needs.setup.outputs.reuse-enabled != 'true' && toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['agentic']) != 'null' }} + needs: [setup, canary-select, canary-sweep] + if: >- + ${{ + always() && + 
needs.setup.result == 'success' && + needs.setup.outputs.reuse-enabled != 'true' && + needs.canary-sweep.result != 'failure' && + toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['agentic']) != 'null' + }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml name: multi-node agentic / strategy: @@ -298,8 +414,16 @@ jobs: scenario-type: agentic-coding sweep-evals: - needs: setup - if: ${{ needs.setup.outputs.reuse-enabled != 'true' && toJson(fromJson(needs.setup.outputs.search-space-config).evals) != '[]' && toJson(fromJson(needs.setup.outputs.search-space-config).evals) != 'null' }} + needs: [setup, canary-select, canary-sweep] + if: >- + ${{ + always() && + needs.setup.result == 'success' && + needs.setup.outputs.reuse-enabled != 'true' && + needs.canary-sweep.result != 'failure' && + toJson(fromJson(needs.setup.outputs.search-space-config).evals) != '[]' && + toJson(fromJson(needs.setup.outputs.search-space-config).evals) != 'null' + }} uses: ./.github/workflows/benchmark-tmpl.yml name: eval / strategy: @@ -328,8 +452,16 @@ jobs: eval-only: true sweep-multi-node-evals: - needs: setup - if: ${{ needs.setup.outputs.reuse-enabled != 'true' && toJson(fromJson(needs.setup.outputs.search-space-config).multinode_evals) != '[]' && toJson(fromJson(needs.setup.outputs.search-space-config).multinode_evals) != 'null' }} + needs: [setup, canary-select, canary-sweep] + if: >- + ${{ + always() && + needs.setup.result == 'success' && + needs.setup.outputs.reuse-enabled != 'true' && + needs.canary-sweep.result != 'failure' && + toJson(fromJson(needs.setup.outputs.search-space-config).multinode_evals) != '[]' && + toJson(fromJson(needs.setup.outputs.search-space-config).multinode_evals) != 'null' + }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml name: multi-node eval / strategy: @@ -368,6 +500,7 @@ jobs: collect-results: needs: [ + canary-sweep, sweep-single-node-1k1k, sweep-single-node-8k1k, sweep-agentic, @@ -381,6 +514,7 @@ jobs: always() && 
needs.setup.result == 'success' && ( + needs.canary-sweep.result == 'success' || needs.sweep-single-node-1k1k.result != 'skipped' || needs.sweep-single-node-8k1k.result != 'skipped' || needs.sweep-multi-node-1k1k.result != 'skipped' || From 81bfc26a34d427f53b2ff34364eb489163899f79 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 26 May 2026 11:20:20 -0700 Subject: [PATCH 02/14] run-sweep: tighten canary gate to success/skipped only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the fan-out gate was `needs.canary-sweep.result != 'failure'`, which let `cancelled` (and any unknown future result) fall through. A cancelled canary then ran the FULL fan-out matrix — including the canary's own entry — without canary validation, the worst-case outcome. Replace with an explicit allowlist: (result == 'success' || result == 'skipped') - success: canary passed → fan-out runs with deduped (remaining) matrix - skipped: no canary candidate (multi-node-only / evals-only) → fan-out runs with full matrix (no canary to dedup against) - failure / cancelled / anything else: fan-out blocked The matrix-ternary `result == 'success' && remaining-search-space-config || full-search-space-config` already had the right shape and is untouched. Applies to all 8 fan-out jobs (single/multi-node 1k1k+8k1k, agentic, multi-node-agentic, evals, multi-node-evals). 
--- .github/workflows/run-sweep.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index 2b08a4a60..774cff4d3 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -214,7 +214,7 @@ jobs: always() && needs.setup.result == 'success' && needs.setup.outputs.reuse-enabled != 'true' && - needs.canary-sweep.result != 'failure' && + (needs.canary-sweep.result == 'success' || needs.canary-sweep.result == 'skipped') && toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['1k1k']) != 'null' }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml @@ -259,7 +259,7 @@ jobs: always() && needs.setup.result == 'success' && needs.setup.outputs.reuse-enabled != 'true' && - needs.canary-sweep.result != 'failure' && + (needs.canary-sweep.result == 'success' || needs.canary-sweep.result == 'skipped') && toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['8k1k']) != 'null' }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml @@ -278,7 +278,7 @@ jobs: always() && needs.setup.result == 'success' && needs.setup.outputs.reuse-enabled != 'true' && - needs.canary-sweep.result != 'failure' && + (needs.canary-sweep.result == 'success' || needs.canary-sweep.result == 'skipped') && toJson(fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['1k1k']) != 'null' && toJson(fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['1k1k']) != '[]' }} @@ -315,7 +315,7 @@ jobs: always() && needs.setup.result == 'success' && needs.setup.outputs.reuse-enabled != 'true' && - needs.canary-sweep.result != 'failure' && + (needs.canary-sweep.result == 'success' || needs.canary-sweep.result == 'skipped') && 
toJson(fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['8k1k']) != 'null' && toJson(fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['8k1k']) != '[]' }} @@ -335,7 +335,7 @@ jobs: always() && needs.setup.result == 'success' && needs.setup.outputs.reuse-enabled != 'true' && - needs.canary-sweep.result != 'failure' && + (needs.canary-sweep.result == 'success' || needs.canary-sweep.result == 'skipped') && toJson(fromJson(needs.setup.outputs.search-space-config).single_node['agentic']) != 'null' }} uses: ./.github/workflows/benchmark-tmpl.yml @@ -374,7 +374,7 @@ jobs: always() && needs.setup.result == 'success' && needs.setup.outputs.reuse-enabled != 'true' && - needs.canary-sweep.result != 'failure' && + (needs.canary-sweep.result == 'success' || needs.canary-sweep.result == 'skipped') && toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['agentic']) != 'null' }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml @@ -420,7 +420,7 @@ jobs: always() && needs.setup.result == 'success' && needs.setup.outputs.reuse-enabled != 'true' && - needs.canary-sweep.result != 'failure' && + (needs.canary-sweep.result == 'success' || needs.canary-sweep.result == 'skipped') && toJson(fromJson(needs.setup.outputs.search-space-config).evals) != '[]' && toJson(fromJson(needs.setup.outputs.search-space-config).evals) != 'null' }} @@ -458,7 +458,7 @@ jobs: always() && needs.setup.result == 'success' && needs.setup.outputs.reuse-enabled != 'true' && - needs.canary-sweep.result != 'failure' && + (needs.canary-sweep.result == 'success' || needs.canary-sweep.result == 'skipped') && toJson(fromJson(needs.setup.outputs.search-space-config).multinode_evals) != '[]' && toJson(fromJson(needs.setup.outputs.search-space-config).multinode_evals) != 
'null' }} From afb408bc249a7bf6decd33bce997d6cb76c7cb39 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 26 May 2026 11:30:05 -0700 Subject: [PATCH 03/14] run-sweep: add non-canary-full-sweep-enabled label (full sweep, no canary) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New label that triggers the full sweep without running the canary gate. Acts as an escape hatch when: - The canary is known to be flaky / unreliable for this config - The user wants the full sweep without the canary delay - The canary's chosen entry is not a representative smoke test Behavior matrix: sweep-enabled - trims to max(conc) per parallelism, NO canary full-sweep-enabled - full intermediate conc sweep, with canary non-canary-full-sweep-enabled - full intermediate conc sweep, NO canary How it works (no changes to existing label semantics): - Setup gate triggers on any of the 3 labels (was: 2) - canary-select gate still requires `full-sweep-enabled` (exact array match, so `non-canary-full-sweep-enabled` does NOT match) → canary skips → all fan-out jobs run on full search space via the `== 'skipped'` allowlist - TRIM_CONC env is unchanged — only `sweep-enabled` enables trim, so the new label correctly behaves as "full sweep" - The reject-conflicting-labels step is now a 3-way exclusion: at most one of {sweep-enabled, full-sweep-enabled, non-canary-full-sweep-enabled} - The same gate updates apply to the comment-visualizer job - Concurrency-group filter excludes the new label too so toggling it uses the same `'active'` group key as the other sweep labels --- .github/workflows/run-sweep.yml | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index 774cff4d3..9d5233779 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -8,6 +8,7 @@ concurrency: 
(github.event.action == 'labeled' || github.event.action == 'unlabeled') && github.event.label.name != 'sweep-enabled' && github.event.label.name != 'full-sweep-enabled' && + github.event.label.name != 'non-canary-full-sweep-enabled' && github.run_id || 'active' }} @@ -39,7 +40,8 @@ jobs: ( (github.event.action != 'labeled' && github.event.action != 'unlabeled') || github.event.label.name == 'sweep-enabled' || - github.event.label.name == 'full-sweep-enabled' + github.event.label.name == 'full-sweep-enabled' || + github.event.label.name == 'non-canary-full-sweep-enabled' ) steps: - name: Checkout code @@ -61,12 +63,14 @@ jobs: !github.event.pull_request.draft && ( contains(github.event.pull_request.labels.*.name, 'sweep-enabled') || - contains(github.event.pull_request.labels.*.name, 'full-sweep-enabled') + contains(github.event.pull_request.labels.*.name, 'full-sweep-enabled') || + contains(github.event.pull_request.labels.*.name, 'non-canary-full-sweep-enabled') ) && ( (github.event.action != 'labeled' && github.event.action != 'unlabeled') || github.event.label.name == 'sweep-enabled' || - github.event.label.name == 'full-sweep-enabled' + github.event.label.name == 'full-sweep-enabled' || + github.event.label.name == 'non-canary-full-sweep-enabled' ) ) || ( @@ -85,10 +89,13 @@ jobs: - name: Reject conflicting sweep labels if: >- github.event_name == 'pull_request' && - contains(github.event.pull_request.labels.*.name, 'sweep-enabled') && - contains(github.event.pull_request.labels.*.name, 'full-sweep-enabled') + ( + (contains(github.event.pull_request.labels.*.name, 'sweep-enabled') && contains(github.event.pull_request.labels.*.name, 'full-sweep-enabled')) || + (contains(github.event.pull_request.labels.*.name, 'sweep-enabled') && contains(github.event.pull_request.labels.*.name, 'non-canary-full-sweep-enabled')) || + (contains(github.event.pull_request.labels.*.name, 'full-sweep-enabled') && contains(github.event.pull_request.labels.*.name, 
'non-canary-full-sweep-enabled')) + ) run: | - echo "::error::PR has both 'sweep-enabled' and 'full-sweep-enabled' labels. Remove one — 'full-sweep-enabled' runs the full intermediate concurrency sweep; 'sweep-enabled' trims to max(conc) per parallelism config." + echo "::error::PR has multiple conflicting sweep labels. Pick exactly one of: 'sweep-enabled' (trims to max(conc) per parallelism config), 'full-sweep-enabled' (full intermediate concurrency sweep, with canary gate), or 'non-canary-full-sweep-enabled' (full sweep, no canary gate)." exit 1 - name: Checkout code @@ -726,12 +733,14 @@ jobs: !github.event.pull_request.draft && ( contains(github.event.pull_request.labels.*.name, 'sweep-enabled') || - contains(github.event.pull_request.labels.*.name, 'full-sweep-enabled') + contains(github.event.pull_request.labels.*.name, 'full-sweep-enabled') || + contains(github.event.pull_request.labels.*.name, 'non-canary-full-sweep-enabled') ) && ( (github.event.action != 'labeled' && github.event.action != 'unlabeled') || github.event.label.name == 'sweep-enabled' || - github.event.label.name == 'full-sweep-enabled' + github.event.label.name == 'full-sweep-enabled' || + github.event.label.name == 'non-canary-full-sweep-enabled' ) runs-on: ubuntu-latest permissions: From 80f6171a57f640f02ce5ea4685a2ab29fc64737c Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 27 May 2026 09:13:24 -0700 Subject: [PATCH 04/14] TEMP T6: trigger sweep for P1 cancellation test --- perf-changelog.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 614b6104e..44239501b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3137,3 +3137,9 @@ - "Set vLLM serving knobs in benchmarks/single_node/minimaxm2.5_fp8_h200.sh: generated benchmark max-model-len, previous eval max-model-len handling, fp8 KV cache, FlashInfer attention/autotune, Triton MoE, and MiniMax QK norm fusion" pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/1354 +- config-keys: + - gptoss-fp4-h100-vllm + description: + - "TEMP T6 cancellation test for PR #1503 — DO NOT MERGE — verifies always() defeats workflow cancellation" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1503 + From 7d2317403c4e9f882668c0d8276f3ae4408f5d6c Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 27 May 2026 09:25:58 -0700 Subject: [PATCH 05/14] fix: resolve perf-changelog.yaml merge conflict markers The merge of main into sweep-canary-gate (e49c5bf1) committed unresolved conflict markers in perf-changelog.yaml, breaking process_changelog.py parsing and failing the Run Sweep setup job. Keep all three entries: the AMD/MI355X DSv4 image bump (#1568), the power-aggregation validation entry (#1558), and the TEMP T6 entry used to trigger this PR's own canary test. --- perf-changelog.yaml | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index cbf9bfdb0..62c2ea75f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3147,13 +3147,6 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1354 - config-keys: -<<<<<<< sweep-canary-gate - - gptoss-fp4-h100-vllm - description: - - "TEMP T6 cancellation test for PR #1503 — DO NOT MERGE — verifies always() defeats workflow cancellation" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1503 - -======= - dsv4-fp4-mi355x-sglang description: - "Bump image to rocm/sgl-dev:rocm720-mi35x-f96ac98-20260526-DSv4" @@ -3166,4 +3159,9 @@ description: - "Validates measured-power aggregation pipeline (PR #1558) on both NVIDIA (H200) and AMD (MI355X) hardware — different SMI tools (nvidia-smi vs amd-smi), different CSV schemas (power.draw [W] vs socket_power), same aggregator. No config change. 
Entry intentionally kept past merge so run-sweep produces canonical agg JSONs with avg_power_w + joules_per_output_token on main for both vendors, seeding the dashboard's day-zero data." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1558 ->>>>>>> main + +- config-keys: + - gptoss-fp4-h100-vllm + description: + - "TEMP T6 cancellation test for PR #1503 — DO NOT MERGE — verifies always() defeats workflow cancellation" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1503 From d51965fe9433ee41c51826f5275a76ce5319505e Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 27 May 2026 09:26:06 -0700 Subject: [PATCH 06/14] run-sweep: drop redundant evals-only PR-label check from canary-select MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit evals-only is a per-entry perf-changelog field (validation.py:484: evals_only: bool = Field(alias="evals-only", default=False)), not a PR label. The canary-select if-clause's !contains(github.event.pull_request.labels.*.name, 'evals-only') was a mis-design — it treated evals-only as a PR-level gate. The PR-label check is also redundant. process_changelog.py routes evals-only changelog entries through --evals-only in generate_sweep_configs.py, which leaves single_node['1k1k'] and single_node['8k1k'] empty (covered by test_evals_only_no_benchmarks). The canary-select jq filter walks exactly those arrays, so for an evals-only-only changelog it emits canary-config=[] and canary-sweep skips via its existing guard. No PR-label inspection needed. The previous T4 test (evals-only PR label + full-sweep-enabled) is obsolete and will be removed from the PR body. 
--- .github/workflows/run-sweep.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index 9d5233779..8912ff625 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -147,8 +147,7 @@ jobs: if: >- needs.setup.outputs.reuse-enabled != 'true' && github.event_name == 'pull_request' && - contains(github.event.pull_request.labels.*.name, 'full-sweep-enabled') && - !contains(github.event.pull_request.labels.*.name, 'evals-only') + contains(github.event.pull_request.labels.*.name, 'full-sweep-enabled') runs-on: ubuntu-latest outputs: canary-config: ${{ steps.pick.outputs.canary-config }} From 11ab3aec36dd2bcfd5419c372dbeda689e9f54f4 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 27 May 2026 09:40:34 -0700 Subject: [PATCH 07/14] run-sweep: !cancelled() so fan-out honors workflow cancellation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The eight sweep-* fan-out jobs gated their if: clause on `always() && needs.setup.result == 'success' && ...`. `always()` returns true regardless of workflow cancellation, so any matrix entry whose runner was already assigned would launch and run to completion even after `gh run cancel` — and queued matrix entries kept picking up runners after the cancel signal. Empirically reproduced on run 26524225403: 12 queued entries all picked up runners after cancel and finished with conclusion=success; zero entries were marked cancelled. Switch the eight gates to `!cancelled() && ...`. Per the GitHub Actions expression reference, `!cancelled()` evaluates false once the workflow is cancelled (by user, by concurrency, or by upstream), which is what we want here — fan-out should not start (and should not continue) once the user has cancelled. 
collect-results (line 520) and the housekeeping jobs at 537/607/626/ 664/697/730 retain `always()` on purpose so partial-result aggregation and cleanup still run on cancel; only the 8 expensive fan-out matrices are switched. --- .github/workflows/run-sweep.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index 8912ff625..62b8ff191 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -217,7 +217,7 @@ jobs: needs: [setup, canary-select, canary-sweep] if: >- ${{ - always() && + !cancelled() && needs.setup.result == 'success' && needs.setup.outputs.reuse-enabled != 'true' && (needs.canary-sweep.result == 'success' || needs.canary-sweep.result == 'skipped') && @@ -262,7 +262,7 @@ jobs: needs: [setup, canary-select, canary-sweep] if: >- ${{ - always() && + !cancelled() && needs.setup.result == 'success' && needs.setup.outputs.reuse-enabled != 'true' && (needs.canary-sweep.result == 'success' || needs.canary-sweep.result == 'skipped') && @@ -281,7 +281,7 @@ jobs: needs: [setup, canary-select, canary-sweep] if: >- ${{ - always() && + !cancelled() && needs.setup.result == 'success' && needs.setup.outputs.reuse-enabled != 'true' && (needs.canary-sweep.result == 'success' || needs.canary-sweep.result == 'skipped') && @@ -318,7 +318,7 @@ jobs: needs: [setup, canary-select, canary-sweep] if: >- ${{ - always() && + !cancelled() && needs.setup.result == 'success' && needs.setup.outputs.reuse-enabled != 'true' && (needs.canary-sweep.result == 'success' || needs.canary-sweep.result == 'skipped') && @@ -338,7 +338,7 @@ jobs: needs: [setup, canary-select, canary-sweep] if: >- ${{ - always() && + !cancelled() && needs.setup.result == 'success' && needs.setup.outputs.reuse-enabled != 'true' && (needs.canary-sweep.result == 'success' || needs.canary-sweep.result == 'skipped') && @@ -377,7 +377,7 @@ jobs: needs: [setup, canary-select, canary-sweep] if: 
>- ${{ - always() && + !cancelled() && needs.setup.result == 'success' && needs.setup.outputs.reuse-enabled != 'true' && (needs.canary-sweep.result == 'success' || needs.canary-sweep.result == 'skipped') && @@ -423,7 +423,7 @@ jobs: needs: [setup, canary-select, canary-sweep] if: >- ${{ - always() && + !cancelled() && needs.setup.result == 'success' && needs.setup.outputs.reuse-enabled != 'true' && (needs.canary-sweep.result == 'success' || needs.canary-sweep.result == 'skipped') && @@ -461,7 +461,7 @@ jobs: needs: [setup, canary-select, canary-sweep] if: >- ${{ - always() && + !cancelled() && needs.setup.result == 'success' && needs.setup.outputs.reuse-enabled != 'true' && (needs.canary-sweep.result == 'success' || needs.canary-sweep.result == 'skipped') && From e166509d4352da5329fb3b418912f0716c0b9646 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 27 May 2026 09:40:48 -0700 Subject: [PATCH 08/14] reuse: accept non-canary-full-sweep-enabled label alongside full-sweep-enabled The reuse path's two hard label checks only accepted full-sweep-enabled, so a PR that landed using the new non-canary-full-sweep-enabled escape hatch could not subsequently approve /reuse-sweep-run on main or be merged via utils/merge_with_reuse.sh. Both labels produce a complete full-sweep matrix in run-sweep.yml (the only difference is whether the canary smoke test runs first), so both should be reusable. Changes: - find_reusable_sweep_run.py --full-sweep-label now accepts a comma- separated list (mirroring --allowed-author-associations in the same file). Default extended to "full-sweep-enabled,non-canary-full-sweep-enabled"; reuse passes if the PR carries any one. - merge_with_reuse.sh preflight now accepts either label and updates the error message accordingly. - Two new pytest cases: - PR with only non-canary-full-sweep-enabled is accepted. - PR with neither label fails with both names in the error message. 
--- utils/find_reusable_sweep_run.py | 18 +++- utils/merge_with_reuse.sh | 9 +- utils/test_find_reusable_sweep_run.py | 130 ++++++++++++++++++++++++++ 3 files changed, 151 insertions(+), 6 deletions(-) diff --git a/utils/find_reusable_sweep_run.py b/utils/find_reusable_sweep_run.py index 8af018a8e..3f814d2e5 100644 --- a/utils/find_reusable_sweep_run.py +++ b/utils/find_reusable_sweep_run.py @@ -276,7 +276,11 @@ def main() -> int: parser.add_argument("--event-name", required=True) parser.add_argument("--ref", required=True) parser.add_argument("--workflow-id", default="run-sweep.yml") - parser.add_argument("--full-sweep-label", default="full-sweep-enabled") + parser.add_argument( + "--full-sweep-label", + default="full-sweep-enabled,non-canary-full-sweep-enabled", + help="Comma-separated PR labels treated as 'full sweep'; reuse requires at least one.", + ) parser.add_argument("--pinned-run-command", default="/reuse-sweep-run") parser.add_argument( "--allowed-author-associations", @@ -355,10 +359,16 @@ def main() -> int: pr = github_api(args.repo, f"/pulls/{pr_number}", token) labels = label_names(pr) - if args.full_sweep_label not in labels: + accepted_full_sweep_labels = { + value.strip() + for value in args.full_sweep_label.split(",") + if value.strip() + } + if not accepted_full_sweep_labels.intersection(labels): + accepted = ", ".join(sorted(accepted_full_sweep_labels)) raise RuntimeError( - f"PR #{pr_number} has {args.pinned_run_command} authorization but not " - f"{args.full_sweep_label}." + f"PR #{pr_number} has {args.pinned_run_command} authorization but is " + f"missing any of: {accepted}." 
) if not pr.get("merged_at"): raise RuntimeError(f"PR #{pr_number} is not marked as merged.") diff --git a/utils/merge_with_reuse.sh b/utils/merge_with_reuse.sh index 9336b81c2..a94ea0f69 100755 --- a/utils/merge_with_reuse.sh +++ b/utils/merge_with_reuse.sh @@ -38,8 +38,13 @@ PR_STATE="$(jq -r '.state' <<<"$PR_INFO")" [ "$PR_STATE" = "OPEN" ] || die "PR #${PR} is ${PR_STATE}, expected OPEN" HEAD_BRANCH="$(jq -r '.headRefName' <<<"$PR_INFO")" -HAS_FULL_SWEEP="$(jq -r '[.labels[].name] | index("full-sweep-enabled") // ""' <<<"$PR_INFO")" -[ -n "$HAS_FULL_SWEEP" ] || die "PR #${PR} is missing the 'full-sweep-enabled' label" +HAS_FULL_SWEEP="$(jq -r ' + [.labels[].name] as $names + | if (($names | index("full-sweep-enabled")) != null) + or (($names | index("non-canary-full-sweep-enabled")) != null) + then "1" else "" end +' <<<"$PR_INFO")" +[ -n "$HAS_FULL_SWEEP" ] || die "PR #${PR} is missing 'full-sweep-enabled' or 'non-canary-full-sweep-enabled' label" # Warn early if no successful run exists on any current PR commit. 
PR_SHAS="$(gh api "repos/${REPO}/pulls/${PR}/commits" --paginate --jq '.[].sha')" diff --git a/utils/test_find_reusable_sweep_run.py b/utils/test_find_reusable_sweep_run.py index e779bbe8b..0f8a09ac7 100644 --- a/utils/test_find_reusable_sweep_run.py +++ b/utils/test_find_reusable_sweep_run.py @@ -455,3 +455,133 @@ def fake_paginated_github_api(repo, path, token, item_key, params=None): assert outputs["reuse-enabled"] == "false" assert outputs["reuse-source-pr-number"] == "1321" assert outputs["reuse-reason"] == "PR #1321 has no /reuse-sweep-run authorization" + + +def test_main_accepts_non_canary_full_sweep_label(monkeypatch, tmp_path) -> None: + comments = [ + { + "created_at": "2026-05-13T00:00:00Z", + "author_association": "OWNER", + "body": "/reuse-sweep-run 25763404168", + }, + ] + run = { + "id": 25763404168, + "event": "pull_request", + "status": "completed", + "conclusion": "success", + "path": ".github/workflows/run-sweep.yml", + "pull_requests": [{"number": 1321}], + "run_attempt": 1, + "html_url": "https://github.com/SemiAnalysisAI/InferenceX/actions/runs/25763404168", + "head_sha": "abc123", + } + + def fake_github_api(repo, path, token, params=None): + if path == "/commits/merge-sha/pulls": + return [{"number": 1321}] + if path == "/pulls/1321": + return { + "merged_at": "2026-05-13T00:01:00Z", + "labels": [{"name": "non-canary-full-sweep-enabled"}], + "head": {"sha": "abc123"}, + } + if path == "/actions/runs/25763404168": + return run + raise AssertionError(f"unexpected GitHub API path: {path}") + + def fake_paginated_github_api(repo, path, token, item_key, params=None): + if path == "/issues/1321/comments": + return comments + if path == "/pulls/1321/commits": + return [{"sha": "abc123"}] + if path == "/actions/runs/25763404168/artifacts": + return [{"name": "results_bmk"}] + raise AssertionError(f"unexpected paginated GitHub API path: {path}") + + output_path = tmp_path / "outputs" + monkeypatch.setenv("GITHUB_TOKEN", "token") + 
monkeypatch.setattr(reuse, "github_api", fake_github_api) + monkeypatch.setattr(reuse, "paginated_github_api", fake_paginated_github_api) + monkeypatch.setattr( + reuse.sys, + "argv", + [ + "find_reusable_sweep_run.py", + "--repo", + "SemiAnalysisAI/InferenceX", + "--commit-sha", + "merge-sha", + "--event-name", + "push", + "--ref", + "refs/heads/main", + "--github-output", + str(output_path), + ], + ) + + assert reuse.main() == 0 + + outputs = dict(line.split("=", 1) for line in output_path.read_text().splitlines()) + assert outputs["reuse-enabled"] == "true" + + +def test_main_rejects_pr_with_neither_full_sweep_label(monkeypatch, tmp_path) -> None: + comments = [ + { + "created_at": "2026-05-13T00:00:00Z", + "author_association": "OWNER", + "body": "/reuse-sweep-run 25763404168", + }, + ] + + def fake_github_api(repo, path, token, params=None): + if path == "/commits/merge-sha/pulls": + return [{"number": 1321}] + if path == "/pulls/1321": + return { + "merged_at": "2026-05-13T00:01:00Z", + "labels": [{"name": "sweep-enabled"}], + "head": {"sha": "abc123"}, + } + raise AssertionError(f"unexpected GitHub API path: {path}") + + def fake_paginated_github_api(repo, path, token, item_key, params=None): + if path == "/issues/1321/comments": + return comments + raise AssertionError(f"unexpected paginated GitHub API path: {path}") + + output_path = tmp_path / "outputs" + monkeypatch.setenv("GITHUB_TOKEN", "token") + monkeypatch.setattr(reuse, "github_api", fake_github_api) + monkeypatch.setattr(reuse, "paginated_github_api", fake_paginated_github_api) + monkeypatch.setattr( + reuse.sys, + "argv", + [ + "find_reusable_sweep_run.py", + "--repo", + "SemiAnalysisAI/InferenceX", + "--commit-sha", + "merge-sha", + "--event-name", + "push", + "--ref", + "refs/heads/main", + "--github-output", + str(output_path), + ], + ) + + try: + reuse.main() + except RuntimeError as error: + msg = str(error) + assert "full-sweep-enabled" in msg + assert "non-canary-full-sweep-enabled" in msg 
+ else: + raise AssertionError( + "expected RuntimeError when PR has neither full-sweep-enabled nor " + "non-canary-full-sweep-enabled label" + ) From 398395256ccc843094fd55cb6a4fdb309dd7f17d Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 27 May 2026 09:55:42 -0700 Subject: [PATCH 09/14] perf-changelog: drop TEMP T6 cancellation-test entry Used by T6/T7 (cancellation-regression repro and !cancelled() fix verification, runs 26524225403 and 26525033406). Both tests are complete and recorded in the PR description; the entry is no longer needed. --- perf-changelog.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 62c2ea75f..ad37e0c27 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3159,9 +3159,3 @@ description: - "Validates measured-power aggregation pipeline (PR #1558) on both NVIDIA (H200) and AMD (MI355X) hardware — different SMI tools (nvidia-smi vs amd-smi), different CSV schemas (power.draw [W] vs socket_power), same aggregator. No config change. Entry intentionally kept past merge so run-sweep produces canonical agg JSONs with avg_power_w + joules_per_output_token on main for both vendors, seeding the dashboard's day-zero data." 
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1558 - -- config-keys: - - gptoss-fp4-h100-vllm - description: - - "TEMP T6 cancellation test for PR #1503 — DO NOT MERGE — verifies always() defeats workflow cancellation" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1503 From 13272b920a3154470a60a7cc5c8147a2647d5cd0 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 27 May 2026 11:17:03 -0700 Subject: [PATCH 10/14] TEMP T8 redo: trigger Run Sweep for skipped-canary fan-out test --- perf-changelog.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index ad37e0c27..6199b55d3 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3159,3 +3159,9 @@ description: - "Validates measured-power aggregation pipeline (PR #1558) on both NVIDIA (H200) and AMD (MI355X) hardware — different SMI tools (nvidia-smi vs amd-smi), different CSV schemas (power.draw [W] vs socket_power), same aggregator. No config change. Entry intentionally kept past merge so run-sweep produces canonical agg JSONs with avg_power_w + joules_per_output_token on main for both vendors, seeding the dashboard's day-zero data." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1558 + +- config-keys: + - gptoss-fp4-h100-vllm + description: + - "TEMP T8 redo for PR #1503 — DO NOT MERGE — verifies skipped-canary fan-out under !cancelled() gate with non-canary-full-sweep-enabled label" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1503 From d35e0b76afec5b41f3f4b5de3348922196443d0d Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 27 May 2026 11:52:43 -0700 Subject: [PATCH 11/14] =?UTF-8?q?TEMP:=20swap=20T8=20redo=20=E2=86=92=20T3?= =?UTF-8?q?=20redo=20(kimik2.5-int4-h100-vllm)=20for=20PR=20#1503?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DO NOT MERGE. 
Verifies empty-canary-config path on HEAD's !cancelled() gate. Agentic-only config → canary-select returns [] → canary-sweep skipped via inner guard → agentic fan-out on full search space. --- perf-changelog.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 6199b55d3..2ca38c3ce 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3161,7 +3161,7 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1558 - config-keys: - - gptoss-fp4-h100-vllm + - kimik2.5-int4-h100-vllm description: - - "TEMP T8 redo for PR #1503 — DO NOT MERGE — verifies skipped-canary fan-out under !cancelled() gate with non-canary-full-sweep-enabled label" + - "TEMP T3 redo for PR #1503 — DO NOT MERGE — verifies empty-canary-config skip path (agentic-only config) + agentic fan-out on full search space under !cancelled() gate" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1503 From b632ca848680e997934e8ac8564e072710137409 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 27 May 2026 14:28:21 -0700 Subject: [PATCH 12/14] =?UTF-8?q?TEMP:=20swap=20T3=20redo=20=E2=86=92=20T1?= =?UTF-8?q?=20redo=20(gptoss-fp4-h100-vllm)=20for=20PR=20#1503?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DO NOT MERGE. Verifies canary success → fan-out + dedup path on HEAD's !cancelled() gate. 
--- perf-changelog.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2ca38c3ce..890b745ac 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3161,7 +3161,7 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1558 - config-keys: - - kimik2.5-int4-h100-vllm + - gptoss-fp4-h100-vllm description: - - "TEMP T3 redo for PR #1503 — DO NOT MERGE — verifies empty-canary-config skip path (agentic-only config) + agentic fan-out on full search space under !cancelled() gate" + - "TEMP T1 redo for PR #1503 — DO NOT MERGE — verifies canary success → fan-out runs on remaining-search-space-config (canary entry de-duped) under !cancelled() gate" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1503 From e3f40988d0f5de1ef21878365444b7ccaab13f37 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 27 May 2026 14:36:05 -0700 Subject: [PATCH 13/14] Document non-canary-full-sweep-enabled label AGENTS.md: list all three sweep labels in the table; clarify multi-label rejection. .github/workflows/README.md: reuse section accepts either full-sweep label; fail-closed wording covers both. KLAUD_DEBUG.md / klaud-pr-status-html.md: mention the new label alongside full-sweep-enabled. --- .claude/commands/klaud-pr-status-html.md | 2 +- .github/workflows/README.md | 23 ++++++++++++----------- AGENTS.md | 7 ++++--- KLAUD_DEBUG.md | 2 +- 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/.claude/commands/klaud-pr-status-html.md b/.claude/commands/klaud-pr-status-html.md index 6051357c6..47d6024f8 100644 --- a/.claude/commands/klaud-pr-status-html.md +++ b/.claude/commands/klaud-pr-status-html.md @@ -30,7 +30,7 @@ State buckets: - **RUNNING** — no failed checks; at least one is `QUEUED` / `IN_PROGRESS` / `PENDING`. - **READY** — no failed, no pending, and at least one `Run Sweep` check is `SUCCESS`. 
- **NO_SUCCESS** — sweep ran but never produced a `SUCCESS` (e.g. all matrix jobs got SKIPPED). -- **NO_SWEEP** — no `Run Sweep` check exists for this head SHA at all (sweep never triggered — usually missing `full-sweep-enabled` label). +- **NO_SWEEP** — no `Run Sweep` check exists for this head SHA at all (sweep never triggered — usually missing a sweep label such as `full-sweep-enabled` or `non-canary-full-sweep-enabled`). ```bash : > /tmp/klaud_pr_status.tsv diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 2252d8b7c..8517d1580 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -180,18 +180,18 @@ test-config --config-keys *-b200-* --conc 4 8 --config-files .github/configs/nvi ## Reusing an Approved PR Full Sweep -If a PR has already run the full untrimmed sweep (`full-sweep-enabled` label), -a maintainer can avoid running the same sweep again after merge by leaving a -PR comment before merging: +If a PR has already run the full untrimmed sweep (`full-sweep-enabled` with a +sequential canary, or `non-canary-full-sweep-enabled` without one), a +maintainer can avoid running the same sweep again after merge by leaving a PR +comment before merging: ``` /reuse-sweep-run ``` -That reuses the latest successful `run-sweep.yml` `pull_request` run for the -PR's current head SHA. If the PR was rebased or had to merge `main` after the -successful sweep — so the current head no longer has a matching run — pin the -source run explicitly: +That reuses the latest successful `run-sweep.yml` `pull_request` run whose +commit is still part of the PR. To select a particular eligible successful +run, pin the source run explicitly: ``` /reuse-sweep-run <run-id> @@ -209,10 +209,11 @@ Only comments from `OWNER`, `MEMBER`, or `COLLABORATOR` users authorize reuse. The most recent matching comment wins, so a maintainer can supersede an earlier pin by leaving a new `/reuse-sweep-run [<run-id>]` comment. 
-Reuse fails closed: if the comment is present but the `full-sweep-enabled` -label, source PR run, or artifacts cannot be validated, the push-to-main -workflow fails instead of falling back to a cluster sweep. Without the comment, -the push-to-main workflow runs the normal full sweep. +Reuse fails closed: if the comment is present but neither full-sweep label +(`full-sweep-enabled` or `non-canary-full-sweep-enabled`) is present, or if +the source PR run or artifacts cannot be validated, the push-to-main workflow +fails instead of falling back to a cluster sweep. Without the comment, the +push-to-main workflow runs the normal full sweep. ## Validation Architecture diff --git a/AGENTS.md b/AGENTS.md index cd057f4d3..764dc2f80 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -59,12 +59,13 @@ Git: conventional commit messages. `[skip-sweep]` in commit message skips benchm ### Pull Request Sweep Labels -PRs do not run the sweep automatically - `run-sweep.yml` is gated on a label. Pick exactly one; setting both is rejected by the workflow's `setup` job. +PRs do not run the sweep automatically - `run-sweep.yml` is gated on a label. Pick exactly one; setting multiple sweep labels is rejected by the workflow's `setup` job. - `sweep-enabled` - runs the sweep with `--trim-conc` (each parallelism config reduced to its single highest concurrency). Default for most PRs. -- `full-sweep-enabled` - runs the full intermediate concurrency sweep, identical to push-to-main. Use when intermediate points matter (e.g. a recipe change shifts the throughput/latency curve, not just its endpoints). +- `full-sweep-enabled` - runs the full intermediate concurrency sweep behind a sequential single-node canary gate. Use when intermediate points matter (e.g. a recipe change shifts the throughput/latency curve, not just its endpoints). +- `non-canary-full-sweep-enabled` - runs the full intermediate concurrency sweep without the canary gate. 
Use when the canary is flaky or not representative of the affected configuration. -**The sweep does not trigger while the PR has merge conflicts.** Even with `sweep-enabled` / `full-sweep-enabled` applied, the `run-sweep.yml` workflow will not start until the PR cleanly merges into main — a stale claude/* or update-* branch with a `perf-changelog.yaml` conflict (the common case) will sit in NO_SWEEP / NO_SUCCESS until rebased. Resolution recipe is documented in `KLAUD_DEBUG.md §1.1`: `git merge origin/main`, then `git checkout origin/main -- perf-changelog.yaml`, then re-append the PR's own changelog entry at the tail. Don't 3-way merge `perf-changelog.yaml`; whitespace edits silently re-trigger the deletion check. +**The sweep does not trigger while the PR has merge conflicts.** Even with `sweep-enabled`, `full-sweep-enabled`, or `non-canary-full-sweep-enabled` applied, the `run-sweep.yml` workflow will not start until the PR cleanly merges into main — a stale claude/* or update-* branch with a `perf-changelog.yaml` conflict (the common case) will sit in NO_SWEEP / NO_SUCCESS until rebased. Resolution recipe is documented in `KLAUD_DEBUG.md §1.1`: `git merge origin/main`, then `git checkout origin/main -- perf-changelog.yaml`, then re-append the PR's own changelog entry at the tail. Don't 3-way merge `perf-changelog.yaml`; whitespace edits silently re-trigger the deletion check. Push-to-main always runs the full untrimmed sweep unless `[skip-sweep]` is in the commit message. Trim logic lives in `trim_conc()` in `utils/process_changelog.py`: single-node entries are grouped by every non-`conc` field and only the highest-`conc` entry per group is kept; multi-node entries have their `conc` list collapsed to `[max(conc)]`. diff --git a/KLAUD_DEBUG.md b/KLAUD_DEBUG.md index 92eb76bfc..1f81b6a9f 100644 --- a/KLAUD_DEBUG.md +++ b/KLAUD_DEBUG.md @@ -193,7 +193,7 @@ Or check whether any other recipe on main uses the proposed tag — if zero uses ## 9. 
PR conventions for this repo - Image-bump / new-recipe PRs I open on behalf of the user (or that the user creates) get the **`[Klaud Cold]`** title prefix. -- Add the `full-sweep-enabled` label so a full sweep actually runs (`gh api -X POST ... labels[]=full-sweep-enabled`). Without it, the sweep is mostly SKIPPED. +- Add the `full-sweep-enabled` label so a canary-gated full sweep actually runs (`gh api -X POST ... labels[]=full-sweep-enabled`). Use `non-canary-full-sweep-enabled` instead only when the single-node canary is flaky or unrepresentative; it runs the full sweep without the canary gate. Without one of the sweep labels, the sweep is mostly SKIPPED. - After any code change that shifts a PR's scope (drops a recipe, changes an image tag), **update the PR title AND body in the same step** and **verify** with `gh pr view --json title,body` — `gh pr edit` silently fails (see §8). - `utils/merge_with_reuse.sh <PR>` is the merge entrypoint; it handles the `perf-changelog.yaml` auto-append. From d87804d9dfff9b7408388a23a16e6152e5c300b5 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 27 May 2026 14:36:09 -0700 Subject: [PATCH 14/14] Drop TEMP T1-redo changelog entry for PR #1503 Final cleanup. T8 redo (run 26530059401, commit 13272b92, success) is the reusable green sweep for /reuse-sweep-run. --- perf-changelog.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 890b745ac..ad37e0c27 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3159,9 +3159,3 @@ description: - "Validates measured-power aggregation pipeline (PR #1558) on both NVIDIA (H200) and AMD (MI355X) hardware — different SMI tools (nvidia-smi vs amd-smi), different CSV schemas (power.draw [W] vs socket_power), same aggregator. No config change. 
Entry intentionally kept past merge so run-sweep produces canonical agg JSONs with avg_power_w + joules_per_output_token on main for both vendors, seeding the dashboard's day-zero data." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1558 - -- config-keys: - - gptoss-fp4-h100-vllm - description: - - "TEMP T1 redo for PR #1503 — DO NOT MERGE — verifies canary success → fan-out runs on remaining-search-space-config (canary entry de-duped) under !cancelled() gate" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1503