From 78b51e3e2c7bf766cf015fe71fe5df82c1e18007 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Mon, 18 May 2026 10:43:34 -0700
Subject: [PATCH] run-sweep: gate full-sweep PRs behind a sequential canary

When a PR carries `full-sweep-enabled` (and not `evals-only`), pick the
lowest-conc single-node benchmark entry as a canary and run it before
fanning out the full sweep. If the canary fails, the eight fan-out jobs
are skipped to save cluster time on shared failures (bad image tag,
removed CLI flag, etc.).

Design choices:

- Canary candidacy is restricted to single_node['1k1k' | '8k1k'] and
  excludes entries with run-eval: true, so the canary is always a pure
  benchmark smoke test using the existing single-node template.
- The canary entry is removed from the regular fan-out's matrix (via
  remaining-search-space-config) only when the canary actually
  succeeded. On canary skip / cancel / canary-select failure, the
  regular fan-out falls back to the full search-space-config so
  coverage is preserved.
- The fan-out gate blocks only on `canary-sweep.result == 'failure'` --
  every other state (success, skipped, cancelled) proceeds, so a bug in
  the canary mechanism never blocks the rest of the sweep.
- Non-full-sweep PRs, draft PRs, pushes to main, and the reuse path all
  behave identically to before via existing gates.

The aggregated results_bmk artifact picks up both the canary's row and
the regular fan-out's rows via the existing bmk_* glob -- each entry
appears exactly once.
--- .github/workflows/run-sweep.yml | 170 ++++++++++++++++++++++++++++---- 1 file changed, 152 insertions(+), 18 deletions(-) diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index c5ece9804..2b08a4a60 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -135,9 +135,88 @@ jobs: --ref "${{ github.ref }}" \ --workflow-id "run-sweep.yml" - sweep-multi-node-1k1k: + canary-select: needs: setup - if: ${{ needs.setup.outputs.reuse-enabled != 'true' && toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['1k1k']) != 'null' }} + if: >- + needs.setup.outputs.reuse-enabled != 'true' && + github.event_name == 'pull_request' && + contains(github.event.pull_request.labels.*.name, 'full-sweep-enabled') && + !contains(github.event.pull_request.labels.*.name, 'evals-only') + runs-on: ubuntu-latest + outputs: + canary-config: ${{ steps.pick.outputs.canary-config }} + remaining-search-space-config: ${{ steps.pick.outputs.remaining-search-space-config }} + steps: + - id: pick + env: + SEARCH_SPACE: ${{ needs.setup.outputs.search-space-config }} + run: | + selection=$(jq -c ' + def remove_one($needle): + if $needle == null then . + else + (index($needle)) as $idx + | if $idx == null then . else del(.[$idx]) end + end; + + # Canary is a benchmark-only smoke test — exclude entries + # whose primary purpose is eval (run-eval == true) so the + # picked canary never runs an eval pass. + (((.single_node["1k1k"] // []) + (.single_node["8k1k"] // [])) + | map(select(.["run-eval"] != true))) as $candidates + | (if ($candidates | length) == 0 then null else ($candidates | min_by(.conc)) end) as $canary + | { + canary: (if $canary == null then [] else [$canary] end), + remaining: ( + . 
+ | .single_node = (.single_node // {}) + | .single_node["1k1k"] = ((.single_node["1k1k"] // []) | remove_one($canary)) + | .single_node["8k1k"] = ((.single_node["8k1k"] // []) | remove_one($canary)) + ) + } + ' <<<"$SEARCH_SPACE") + echo "canary-config=$(jq -c '.canary' <<<"$selection")" >> "$GITHUB_OUTPUT" + echo "remaining-search-space-config=$(jq -c '.remaining' <<<"$selection")" >> "$GITHUB_OUTPUT" + + canary-sweep: + needs: canary-select + if: ${{ needs.canary-select.outputs.canary-config != '' && needs.canary-select.outputs.canary-config != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + name: canary / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.canary-select.outputs.canary-config) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + run-eval: false + + sweep-multi-node-1k1k: + needs: [setup, canary-select, canary-sweep] + if: >- + ${{ + always() && + needs.setup.result == 'success' && + needs.setup.outputs.reuse-enabled != 'true' && + needs.canary-sweep.result != 'failure' && + toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['1k1k']) != 'null' + }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml name: multi-node 1k1k / strategy: @@ -174,8 +253,15 @@ jobs: run-eval: false sweep-multi-node-8k1k: - needs: setup - if: ${{ needs.setup.outputs.reuse-enabled != 'true' && 
toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['8k1k']) != 'null' }} + needs: [setup, canary-select, canary-sweep] + if: >- + ${{ + always() && + needs.setup.result == 'success' && + needs.setup.outputs.reuse-enabled != 'true' && + needs.canary-sweep.result != 'failure' && + toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['8k1k']) != 'null' + }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml name: multi-node 8k1k / strategy: @@ -186,14 +272,22 @@ jobs: with: *multi-node-inputs sweep-single-node-1k1k: - needs: setup - if: ${{ needs.setup.outputs.reuse-enabled != 'true' && toJson(fromJson(needs.setup.outputs.search-space-config).single_node['1k1k']) != 'null' }} + needs: [setup, canary-select, canary-sweep] + if: >- + ${{ + always() && + needs.setup.result == 'success' && + needs.setup.outputs.reuse-enabled != 'true' && + needs.canary-sweep.result != 'failure' && + toJson(fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['1k1k']) != 'null' && + toJson(fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['1k1k']) != '[]' + }} uses: ./.github/workflows/benchmark-tmpl.yml name: single-node 1k1k / strategy: fail-fast: false matrix: - config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['1k1k'] }} + config: ${{ fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['1k1k'] }} secrets: inherit with: &single-node-inputs exp-name: ${{ matrix.config.exp-name }} @@ -215,20 +309,35 @@ jobs: run-eval: ${{ matrix.config.run-eval }} sweep-single-node-8k1k: - needs: setup - if: ${{ needs.setup.outputs.reuse-enabled != 'true' && 
toJson(fromJson(needs.setup.outputs.search-space-config).single_node['8k1k']) != 'null' }} + needs: [setup, canary-select, canary-sweep] + if: >- + ${{ + always() && + needs.setup.result == 'success' && + needs.setup.outputs.reuse-enabled != 'true' && + needs.canary-sweep.result != 'failure' && + toJson(fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['8k1k']) != 'null' && + toJson(fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['8k1k']) != '[]' + }} uses: ./.github/workflows/benchmark-tmpl.yml name: single-node 8k1k / strategy: fail-fast: false matrix: - config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['8k1k'] }} + config: ${{ fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['8k1k'] }} secrets: inherit with: *single-node-inputs sweep-agentic: - needs: setup - if: ${{ needs.setup.outputs.reuse-enabled != 'true' && toJson(fromJson(needs.setup.outputs.search-space-config).single_node['agentic']) != 'null' }} + needs: [setup, canary-select, canary-sweep] + if: >- + ${{ + always() && + needs.setup.result == 'success' && + needs.setup.outputs.reuse-enabled != 'true' && + needs.canary-sweep.result != 'failure' && + toJson(fromJson(needs.setup.outputs.search-space-config).single_node['agentic']) != 'null' + }} uses: ./.github/workflows/benchmark-tmpl.yml name: agentic / strategy: @@ -259,8 +368,15 @@ jobs: scenario-type: agentic-coding sweep-multi-node-agentic: - needs: setup - if: ${{ needs.setup.outputs.reuse-enabled != 'true' && toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['agentic']) != 'null' }} + needs: [setup, canary-select, canary-sweep] + if: >- + ${{ + always() && + 
needs.setup.result == 'success' && + needs.setup.outputs.reuse-enabled != 'true' && + needs.canary-sweep.result != 'failure' && + toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['agentic']) != 'null' + }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml name: multi-node agentic / strategy: @@ -298,8 +414,16 @@ jobs: scenario-type: agentic-coding sweep-evals: - needs: setup - if: ${{ needs.setup.outputs.reuse-enabled != 'true' && toJson(fromJson(needs.setup.outputs.search-space-config).evals) != '[]' && toJson(fromJson(needs.setup.outputs.search-space-config).evals) != 'null' }} + needs: [setup, canary-select, canary-sweep] + if: >- + ${{ + always() && + needs.setup.result == 'success' && + needs.setup.outputs.reuse-enabled != 'true' && + needs.canary-sweep.result != 'failure' && + toJson(fromJson(needs.setup.outputs.search-space-config).evals) != '[]' && + toJson(fromJson(needs.setup.outputs.search-space-config).evals) != 'null' + }} uses: ./.github/workflows/benchmark-tmpl.yml name: eval / strategy: @@ -328,8 +452,16 @@ jobs: eval-only: true sweep-multi-node-evals: - needs: setup - if: ${{ needs.setup.outputs.reuse-enabled != 'true' && toJson(fromJson(needs.setup.outputs.search-space-config).multinode_evals) != '[]' && toJson(fromJson(needs.setup.outputs.search-space-config).multinode_evals) != 'null' }} + needs: [setup, canary-select, canary-sweep] + if: >- + ${{ + always() && + needs.setup.result == 'success' && + needs.setup.outputs.reuse-enabled != 'true' && + needs.canary-sweep.result != 'failure' && + toJson(fromJson(needs.setup.outputs.search-space-config).multinode_evals) != '[]' && + toJson(fromJson(needs.setup.outputs.search-space-config).multinode_evals) != 'null' + }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml name: multi-node eval / strategy: @@ -368,6 +500,7 @@ jobs: collect-results: needs: [ + canary-sweep, sweep-single-node-1k1k, sweep-single-node-8k1k, sweep-agentic, @@ -381,6 +514,7 @@ jobs: always() && 
needs.setup.result == 'success' && ( + needs.canary-sweep.result == 'success' || needs.sweep-single-node-1k1k.result != 'skipped' || needs.sweep-single-node-8k1k.result != 'skipped' || needs.sweep-multi-node-1k1k.result != 'skipped' ||