From 78b51e3e2c7bf766cf015fe71fe5df82c1e18007 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Mon, 18 May 2026 10:43:34 -0700
Subject: [PATCH] run-sweep: gate full-sweep PRs behind a sequential canary

When a PR carries `full-sweep-enabled` (and not `evals-only`), pick the
lowest-concurrency single-node benchmark entry as a canary and run it
before fanning out the full sweep. If the canary fails, the eight fan-out
jobs are skipped, saving cluster time on failures that would hit every
job (bad image tag, removed CLI flag, etc.).

Design choices:
- Canary candidates come only from single_node['1k1k'] and
  single_node['8k1k'], excluding entries with run-eval: true, so the
  canary is always a pure benchmark smoke test using the existing
  single-node template.
- The canary entry is removed from the regular fan-out's matrix (via
  remaining-search-space-config) only when the canary actually succeeded.
  On canary skip / cancel / canary-select failure, the regular fan-out
  falls back to the full search-space-config so coverage is preserved.
- The fan-out gate blocks only on `canary-sweep.result == 'failure'` --
  every other state (success, skipped, cancelled) proceeds, so a bug in
  the canary mechanism never blocks the rest of the sweep.
- Non-full-sweep PRs, draft PRs, pushes to main, and the reuse path all
  behave identically to before via existing gates.

The aggregated results_bmk artifact picks up both the canary's row and
the regular fan-out's rows via the existing bmk_* glob -- each entry
appears exactly once.
---
 .github/workflows/run-sweep.yml | 170 ++++++++++++++++++++++++++++----
 1 file changed, 152 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml
index c5ece9804..2b08a4a60 100644
--- a/.github/workflows/run-sweep.yml
+++ b/.github/workflows/run-sweep.yml
@@ -135,9 +135,88 @@ jobs:
                     --ref "${{ github.ref }}" \
                     --workflow-id "run-sweep.yml"
 
-    sweep-multi-node-1k1k:
+    canary-select:
         needs: setup
-        if: ${{ needs.setup.outputs.reuse-enabled != 'true' && toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['1k1k']) != 'null' }}
+        if: >-
+            github.event_name == 'pull_request' &&
+            needs.setup.outputs.reuse-enabled != 'true' &&
+            !contains(github.event.pull_request.labels.*.name, 'evals-only') &&
+            contains(github.event.pull_request.labels.*.name, 'full-sweep-enabled')
+        runs-on: ubuntu-latest
+        outputs:
+            canary-config: ${{ steps.pick.outputs.canary-config }}
+            remaining-search-space-config: ${{ steps.pick.outputs.remaining-search-space-config }}
+        steps:
+            - id: pick
+              env:
+                  SEARCH_SPACE: ${{ needs.setup.outputs.search-space-config }}
+              run: |
+                  picked=$(jq -c '
+                    # Drop the first array element equal to $entry. A null $entry, or one
+                    # that is not present, leaves the array unchanged.
+                    def drop_first($entry):
+                      (if $entry == null then null else index($entry) end) as $at
+                      | if $at == null then . else del(.[$at]) end;
+
+                    # Pool = single-node 1k1k and 8k1k entries that do not request an eval
+                    # pass, so the chosen canary is always a benchmark-only run.
+                    ((.single_node["1k1k"] // []) + (.single_node["8k1k"] // [])
+                      | map(select(.["run-eval"] != true))) as $pool
+                    # Lowest concurrency wins; an empty pool yields no canary.
+                    | (if ($pool | length) > 0 then ($pool | min_by(.conc)) else null end) as $canary
+                    | {
+                        canary: [$canary | select(. != null)],
+                        remaining: (
+                          .single_node = (.single_node // {})
+                          | .single_node["1k1k"] |= ((. // []) | drop_first($canary))
+                          | .single_node["8k1k"] |= ((. // []) | drop_first($canary))
+                        )
+                      }
+                  ' <<<"$SEARCH_SPACE")
+                  {
+                    echo "canary-config=$(jq -c '.canary' <<<"$picked")"
+                    echo "remaining-search-space-config=$(jq -c '.remaining' <<<"$picked")"
+                  } >> "$GITHUB_OUTPUT"
+
+    canary-sweep:
+        name: canary /
+        needs: canary-select
+        if: ${{ !(needs.canary-select.outputs.canary-config == '' || needs.canary-select.outputs.canary-config == '[]') }}
+        uses: ./.github/workflows/benchmark-tmpl.yml
+        secrets: inherit
+        strategy:
+            fail-fast: false
+            matrix:
+                config: ${{ fromJson(needs.canary-select.outputs.canary-config) }}
+        with:
+            exp-name: ${{ matrix.config.exp-name }}
+            runner: ${{ matrix.config.runner }}
+            image: ${{ matrix.config.image }}
+            framework: ${{ matrix.config.framework }}
+            model: ${{ matrix.config.model }}
+            model-prefix: ${{ matrix.config.model-prefix }}
+            precision: ${{ matrix.config.precision }}
+            isl: ${{ matrix.config.isl }}
+            osl: ${{ matrix.config.osl }}
+            max-model-len: ${{ matrix.config.max-model-len }}
+            conc: ${{ matrix.config.conc }}
+            tp: ${{ matrix.config.tp }}
+            ep: ${{ matrix.config.ep }}
+            dp-attn: ${{ matrix.config.dp-attn }}
+            spec-decoding: ${{ matrix.config.spec-decoding }}
+            disagg: ${{ matrix.config.disagg }}
+            run-eval: false
+
+    sweep-multi-node-1k1k:
+        needs: [setup, canary-select, canary-sweep]
+        # !cancelled() (not always()): a skipped canary still fans out, a cancelled run does not.
+        if: >-
+            ${{
+              !cancelled() && needs.setup.result == 'success' &&
+              needs.setup.outputs.reuse-enabled != 'true' &&
+              needs.canary-sweep.result != 'failure' &&
+              toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['1k1k']) != 'null'
+            }}
         uses: ./.github/workflows/benchmark-multinode-tmpl.yml
         name: multi-node 1k1k /
         strategy:
@@ -174,8 +253,15 @@ jobs:
             run-eval: false
 
     sweep-multi-node-8k1k:
-        needs: setup
-        if: ${{ needs.setup.outputs.reuse-enabled != 'true' && toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['8k1k']) != 'null' }}
+        needs: [setup, canary-select, canary-sweep]
+        # !cancelled() (not always()): a skipped canary still fans out, a cancelled run does not.
+        if: >-
+            ${{
+              !cancelled() && needs.setup.result == 'success' &&
+              needs.setup.outputs.reuse-enabled != 'true' &&
+              needs.canary-sweep.result != 'failure' &&
+              toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['8k1k']) != 'null'
+            }}
         uses: ./.github/workflows/benchmark-multinode-tmpl.yml
         name: multi-node 8k1k /
         strategy:
@@ -186,14 +272,22 @@ jobs:
         with: *multi-node-inputs
 
     sweep-single-node-1k1k:
-        needs: setup
-        if: ${{ needs.setup.outputs.reuse-enabled != 'true' && toJson(fromJson(needs.setup.outputs.search-space-config).single_node['1k1k']) != 'null' }}
+        needs: [setup, canary-select, canary-sweep]
+        # !cancelled() (not always()): a skipped canary still fans out, a cancelled run does not.
+        if: >-
+            ${{
+              !cancelled() && needs.setup.result == 'success' &&
+              needs.setup.outputs.reuse-enabled != 'true' &&
+              needs.canary-sweep.result != 'failure' &&
+              toJson(fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['1k1k']) != 'null' &&
+              toJson(fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['1k1k']) != '[]'
+            }}
         uses: ./.github/workflows/benchmark-tmpl.yml
         name: single-node 1k1k /
         strategy:
             fail-fast: false
             matrix:
-                config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['1k1k'] }}
+                config: ${{ fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['1k1k'] }}
         secrets: inherit
         with: &single-node-inputs
             exp-name: ${{ matrix.config.exp-name }}
@@ -215,20 +309,35 @@ jobs:
             run-eval: ${{ matrix.config.run-eval }}
 
     sweep-single-node-8k1k:
-        needs: setup
-        if: ${{ needs.setup.outputs.reuse-enabled != 'true' && toJson(fromJson(needs.setup.outputs.search-space-config).single_node['8k1k']) != 'null' }}
+        needs: [setup, canary-select, canary-sweep]
+        # !cancelled() (not always()): a skipped canary still fans out, a cancelled run does not.
+        if: >-
+            ${{
+              !cancelled() && needs.setup.result == 'success' &&
+              needs.setup.outputs.reuse-enabled != 'true' &&
+              needs.canary-sweep.result != 'failure' &&
+              toJson(fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['8k1k']) != 'null' &&
+              toJson(fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['8k1k']) != '[]'
+            }}
         uses: ./.github/workflows/benchmark-tmpl.yml
         name: single-node 8k1k /
         strategy:
             fail-fast: false
             matrix:
-                config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['8k1k'] }}
+                config: ${{ fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['8k1k'] }}
         secrets: inherit
         with: *single-node-inputs
 
     sweep-agentic:
-        needs: setup
-        if: ${{ needs.setup.outputs.reuse-enabled != 'true' && toJson(fromJson(needs.setup.outputs.search-space-config).single_node['agentic']) != 'null' }}
+        needs: [setup, canary-select, canary-sweep]
+        # !cancelled() (not always()): a skipped canary still fans out, a cancelled run does not.
+        if: >-
+            ${{
+              !cancelled() && needs.setup.result == 'success' &&
+              needs.setup.outputs.reuse-enabled != 'true' &&
+              needs.canary-sweep.result != 'failure' &&
+              toJson(fromJson(needs.setup.outputs.search-space-config).single_node['agentic']) != 'null'
+            }}
         uses: ./.github/workflows/benchmark-tmpl.yml
         name: agentic /
         strategy:
@@ -259,8 +368,15 @@ jobs:
             scenario-type: agentic-coding
 
     sweep-multi-node-agentic:
-        needs: setup
-        if: ${{ needs.setup.outputs.reuse-enabled != 'true' && toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['agentic']) != 'null' }}
+        needs: [setup, canary-select, canary-sweep]
+        # !cancelled() (not always()): a skipped canary still fans out, a cancelled run does not.
+        if: >-
+            ${{
+              !cancelled() && needs.setup.result == 'success' &&
+              needs.setup.outputs.reuse-enabled != 'true' &&
+              needs.canary-sweep.result != 'failure' &&
+              toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['agentic']) != 'null'
+            }}
         uses: ./.github/workflows/benchmark-multinode-tmpl.yml
         name: multi-node agentic /
         strategy:
@@ -298,8 +414,16 @@ jobs:
             scenario-type: agentic-coding
 
     sweep-evals:
-        needs: setup
-        if: ${{ needs.setup.outputs.reuse-enabled != 'true' && toJson(fromJson(needs.setup.outputs.search-space-config).evals) != '[]' && toJson(fromJson(needs.setup.outputs.search-space-config).evals) != 'null' }}
+        needs: [setup, canary-select, canary-sweep]
+        # !cancelled() (not always()): a skipped canary still fans out, a cancelled run does not.
+        if: >-
+            ${{
+              !cancelled() && needs.setup.result == 'success' &&
+              needs.setup.outputs.reuse-enabled != 'true' &&
+              needs.canary-sweep.result != 'failure' &&
+              toJson(fromJson(needs.setup.outputs.search-space-config).evals) != '[]' &&
+              toJson(fromJson(needs.setup.outputs.search-space-config).evals) != 'null'
+            }}
         uses: ./.github/workflows/benchmark-tmpl.yml
         name: eval /
         strategy:
@@ -328,8 +452,16 @@ jobs:
             eval-only: true
 
     sweep-multi-node-evals:
-        needs: setup
-        if: ${{ needs.setup.outputs.reuse-enabled != 'true' && toJson(fromJson(needs.setup.outputs.search-space-config).multinode_evals) != '[]' && toJson(fromJson(needs.setup.outputs.search-space-config).multinode_evals) != 'null' }}
+        needs: [setup, canary-select, canary-sweep]
+        # !cancelled() (not always()): a skipped canary still fans out, a cancelled run does not.
+        if: >-
+            ${{
+              !cancelled() && needs.setup.result == 'success' &&
+              needs.setup.outputs.reuse-enabled != 'true' &&
+              needs.canary-sweep.result != 'failure' &&
+              toJson(fromJson(needs.setup.outputs.search-space-config).multinode_evals) != '[]' &&
+              toJson(fromJson(needs.setup.outputs.search-space-config).multinode_evals) != 'null'
+            }}
         uses: ./.github/workflows/benchmark-multinode-tmpl.yml
         name: multi-node eval /
         strategy:
@@ -368,6 +500,7 @@ jobs:
     collect-results:
         needs:
             [
+                canary-sweep,
                 sweep-single-node-1k1k,
                 sweep-single-node-8k1k,
                 sweep-agentic,
@@ -381,6 +514,7 @@ jobs:
               always() &&
               needs.setup.result == 'success' &&
               (
+                needs.canary-sweep.result == 'success' ||
                 needs.sweep-single-node-1k1k.result != 'skipped' ||
                 needs.sweep-single-node-8k1k.result != 'skipped' ||
                 needs.sweep-multi-node-1k1k.result != 'skipped' ||