Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
170 changes: 152 additions & 18 deletions .github/workflows/run-sweep.yml
Original file line number Diff line number Diff line change
Expand Up @@ -135,239 +135,372 @@
--ref "${{ github.ref }}" \
--workflow-id "run-sweep.yml"

canary-select:
  # Picks one benchmark-only entry (the "canary") from the single-node 1k1k
  # and 8k1k search space and publishes two outputs: the canary as a
  # one-element list, and the search space with that entry removed.
  needs: setup
  # Runs only for pull requests labelled `full-sweep-enabled` and not
  # labelled `evals-only`, and only when setup did not enable reuse.
  if: >-
    needs.setup.outputs.reuse-enabled != 'true' &&
    github.event_name == 'pull_request' &&
    contains(github.event.pull_request.labels.*.name, 'full-sweep-enabled') &&
    !contains(github.event.pull_request.labels.*.name, 'evals-only')
  runs-on: ubuntu-latest
  # This job only runs jq over a job output and never uses GITHUB_TOKEN,
  # so it is given no token scopes at all.
  permissions: {}
  outputs:
    canary-config: ${{ steps.pick.outputs.canary-config }}
    remaining-search-space-config: ${{ steps.pick.outputs.remaining-search-space-config }}
  steps:
    - id: pick
      env:
        SEARCH_SPACE: ${{ needs.setup.outputs.search-space-config }}
      run: |
        set -euo pipefail

        selection=$(jq -c '
          # Drop the first array element equal to $needle. A null needle, or
          # a needle that is not present, leaves the array unchanged.
          def remove_one($needle):
            if $needle == null then .
            else
              (index($needle)) as $idx
              | if $idx == null then . else del(.[$idx]) end
            end;

          # Canary is a benchmark-only smoke test — exclude entries
          # whose primary purpose is eval (run-eval == true) so the
          # picked canary never runs an eval pass.
          (((.single_node["1k1k"] // []) + (.single_node["8k1k"] // []))
            | map(select(.["run-eval"] != true))) as $candidates
          # The candidate with the lowest concurrency becomes the canary.
          | (if ($candidates | length) == 0 then null else ($candidates | min_by(.conc)) end) as $canary
          | {
              canary: (if $canary == null then [] else [$canary] end),
              remaining: (
                .
                | .single_node = (.single_node // {})
                | .single_node["1k1k"] = ((.single_node["1k1k"] // []) | remove_one($canary))
                | .single_node["8k1k"] = ((.single_node["8k1k"] // []) | remove_one($canary))
              )
            }
        ' <<<"$SEARCH_SPACE")

        # Extract each output in its own assignment so that a jq failure
        # aborts the step instead of silently writing an empty output.
        canary=$(jq -c '.canary' <<<"$selection")
        remaining=$(jq -c '.remaining' <<<"$selection")
        {
          echo "canary-config=${canary}"
          echo "remaining-search-space-config=${remaining}"
        } >> "$GITHUB_OUTPUT"

canary-sweep:
  # Runs the single entry chosen by canary-select through the single-node
  # benchmark template, with the eval pass forced off.
  name: canary /
  needs: canary-select
  # Skip when canary-select produced no output or an empty list.
  if: >-
    ${{
      needs.canary-select.outputs.canary-config != '' &&
      needs.canary-select.outputs.canary-config != '[]'
    }}
  uses: ./.github/workflows/benchmark-tmpl.yml
  secrets: inherit
  strategy:
    fail-fast: false
    matrix:
      config: ${{ fromJson(needs.canary-select.outputs.canary-config) }}
  with:
    # Identity of the experiment and where it runs.
    exp-name: ${{ matrix.config.exp-name }}
    runner: ${{ matrix.config.runner }}
    image: ${{ matrix.config.image }}
    model: ${{ matrix.config.model }}
    model-prefix: ${{ matrix.config.model-prefix }}
    framework: ${{ matrix.config.framework }}
    precision: ${{ matrix.config.precision }}
    # Sequence lengths and concurrency.
    isl: ${{ matrix.config.isl }}
    osl: ${{ matrix.config.osl }}
    max-model-len: ${{ matrix.config.max-model-len }}
    conc: ${{ matrix.config.conc }}
    # Parallelism and serving options.
    tp: ${{ matrix.config.tp }}
    ep: ${{ matrix.config.ep }}
    dp-attn: ${{ matrix.config.dp-attn }}
    spec-decoding: ${{ matrix.config.spec-decoding }}
    disagg: ${{ matrix.config.disagg }}
    # The canary is benchmark-only, so the eval pass is always disabled.
    run-eval: false

sweep-multi-node-1k1k:
  # Runs every `multi_node['1k1k']` entry of the setup search space through
  # the multi-node benchmark template, after the canary gate.
  #
  # NOTE(review): code scanning (CodeQL, "Workflow does not contain
  # permissions", Medium) flags this job for not limiting GITHUB_TOKEN
  # permissions. Add an explicit `permissions:` block once the scopes needed
  # by the called workflow are confirmed.
  needs: [setup, canary-select, canary-sweep]
  # `!cancelled()` rather than `always()`: the job must still run when the
  # canary jobs were skipped, but must not start after the run is cancelled.
  # A failed canary blocks the sweep.
  if: >-
    ${{
      !cancelled() &&
      needs.setup.result == 'success' &&
      needs.setup.outputs.reuse-enabled != 'true' &&
      needs.canary-sweep.result != 'failure' &&
      toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['1k1k']) != 'null'
    }}
  uses: ./.github/workflows/benchmark-multinode-tmpl.yml
  name: multi-node 1k1k /
  strategy:
    fail-fast: false
    matrix:
      config: ${{ fromJson(needs.setup.outputs.search-space-config).multi_node['1k1k'] }}
  secrets: inherit
  # Anchored so that sweep-multi-node-8k1k can reuse the same input mapping.
  with: &multi-node-inputs
    isl: ${{ matrix.config.isl }}
    osl: ${{ matrix.config.osl }}
    max-model-len: ${{ matrix.config.max-model-len }}
    runner: ${{ matrix.config.runner }}
    image: ${{ matrix.config.image }}
    model: ${{ matrix.config.model }}
    model-prefix: ${{ matrix.config.model-prefix }}
    framework: ${{ matrix.config.framework }}
    precision: ${{ matrix.config.precision }}
    exp-name: ${{ matrix.config.exp-name }}
    # `conc` is serialized to a JSON string for the `conc-list` input.
    conc-list: ${{ toJson(matrix.config.conc) }}
    spec-decoding: ${{ matrix.config.spec-decoding }}
    disagg: ${{ matrix.config.disagg }}

    prefill-num-worker: ${{ matrix.config.prefill.num-worker }}
    prefill-tp: ${{ matrix.config.prefill.tp }}
    prefill-ep: ${{ matrix.config.prefill.ep }}
    prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }}
    prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }}

    decode-num-worker: ${{ matrix.config.decode.num-worker }}
    decode-tp: ${{ matrix.config.decode.tp }}
    decode-ep: ${{ matrix.config.decode.ep }}
    decode-dp-attn: ${{ matrix.config.decode.dp-attn }}
    decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }}
    run-eval: false

sweep-multi-node-8k1k:
  # Runs every `multi_node['8k1k']` entry of the setup search space through
  # the multi-node benchmark template, after the canary gate.
  #
  # NOTE(review): code scanning (CodeQL, "Workflow does not contain
  # permissions", Medium) flags this job for not limiting GITHUB_TOKEN
  # permissions. Add an explicit `permissions:` block once the scopes needed
  # by the called workflow are confirmed.
  needs: [setup, canary-select, canary-sweep]
  # `!cancelled()` rather than `always()`: the job must still run when the
  # canary jobs were skipped, but must not start after the run is cancelled.
  # A failed canary blocks the sweep.
  if: >-
    ${{
      !cancelled() &&
      needs.setup.result == 'success' &&
      needs.setup.outputs.reuse-enabled != 'true' &&
      needs.canary-sweep.result != 'failure' &&
      toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['8k1k']) != 'null'
    }}
  uses: ./.github/workflows/benchmark-multinode-tmpl.yml
  name: multi-node 8k1k /
  strategy:
    fail-fast: false
    matrix:
      config: ${{ fromJson(needs.setup.outputs.search-space-config).multi_node['8k1k'] }}
  secrets: inherit
  # Same input mapping as sweep-multi-node-1k1k, reused through its anchor.
  with: *multi-node-inputs

sweep-single-node-1k1k:
  # Runs the single-node 1k1k entries through the benchmark template.
  # When canary-sweep succeeded, the matrix comes from canary-select's
  # `remaining-search-space-config` (the canary entry already removed);
  # otherwise it falls back to the full search space from setup.
  #
  # NOTE(review): code scanning (CodeQL, "Workflow does not contain
  # permissions", Medium) flags this job for not limiting GITHUB_TOKEN
  # permissions. Add an explicit `permissions:` block once the scopes needed
  # by the called workflow are confirmed.
  needs: [setup, canary-select, canary-sweep]
  # `!cancelled()` rather than `always()`: the job must still run when the
  # canary jobs were skipped, but must not start after the run is cancelled.
  # A failed canary blocks the sweep. The list must be neither missing
  # ('null') nor empty ('[]'), since removing the canary can empty it.
  if: >-
    ${{
      !cancelled() &&
      needs.setup.result == 'success' &&
      needs.setup.outputs.reuse-enabled != 'true' &&
      needs.canary-sweep.result != 'failure' &&
      toJson(fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['1k1k']) != 'null' &&
      toJson(fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['1k1k']) != '[]'
    }}
  uses: ./.github/workflows/benchmark-tmpl.yml
  name: single-node 1k1k /
  strategy:
    fail-fast: false
    matrix:
      config: ${{ fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['1k1k'] }}
  secrets: inherit
  # Anchored so that sweep-single-node-8k1k can reuse the same input mapping.
  with: &single-node-inputs
    exp-name: ${{ matrix.config.exp-name }}
    isl: ${{ matrix.config.isl }}
    osl: ${{ matrix.config.osl }}
    max-model-len: ${{ matrix.config.max-model-len }}
    runner: ${{ matrix.config.runner }}
    image: ${{ matrix.config.image }}
    model: ${{ matrix.config.model }}
    model-prefix: ${{ matrix.config.model-prefix }}
    framework: ${{ matrix.config.framework }}
    precision: ${{ matrix.config.precision }}
    tp: ${{ matrix.config.tp }}
    ep: ${{ matrix.config.ep }}
    dp-attn: ${{ matrix.config.dp-attn }}
    conc: ${{ matrix.config.conc }}
    spec-decoding: ${{ matrix.config.spec-decoding }}
    disagg: ${{ matrix.config.disagg }}
    # Unlike the canary, the sweep honours each entry's own run-eval flag.
    run-eval: ${{ matrix.config.run-eval }}

sweep-single-node-8k1k:
  # Runs the single-node 8k1k entries through the benchmark template.
  # When canary-sweep succeeded, the matrix comes from canary-select's
  # `remaining-search-space-config` (the canary entry already removed);
  # otherwise it falls back to the full search space from setup.
  #
  # NOTE(review): code scanning (CodeQL, "Workflow does not contain
  # permissions", Medium) flags this job for not limiting GITHUB_TOKEN
  # permissions. Add an explicit `permissions:` block once the scopes needed
  # by the called workflow are confirmed.
  needs: [setup, canary-select, canary-sweep]
  # `!cancelled()` rather than `always()`: the job must still run when the
  # canary jobs were skipped, but must not start after the run is cancelled.
  # A failed canary blocks the sweep. The list must be neither missing
  # ('null') nor empty ('[]'), since removing the canary can empty it.
  if: >-
    ${{
      !cancelled() &&
      needs.setup.result == 'success' &&
      needs.setup.outputs.reuse-enabled != 'true' &&
      needs.canary-sweep.result != 'failure' &&
      toJson(fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['8k1k']) != 'null' &&
      toJson(fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['8k1k']) != '[]'
    }}
  uses: ./.github/workflows/benchmark-tmpl.yml
  name: single-node 8k1k /
  strategy:
    fail-fast: false
    matrix:
      config: ${{ fromJson((needs.canary-sweep.result == 'success' && needs.canary-select.outputs.remaining-search-space-config) || needs.setup.outputs.search-space-config).single_node['8k1k'] }}
  secrets: inherit
  # Same input mapping as sweep-single-node-1k1k, reused through its anchor.
  with: *single-node-inputs

sweep-agentic:
  # Runs every `single_node['agentic']` entry of the setup search space
  # through the single-node benchmark template with the `agentic-coding`
  # scenario type, after the canary gate.
  #
  # NOTE(review): code scanning (CodeQL, "Workflow does not contain
  # permissions", Medium) flags this job for not limiting GITHUB_TOKEN
  # permissions. Add an explicit `permissions:` block once the scopes needed
  # by the called workflow are confirmed.
  needs: [setup, canary-select, canary-sweep]
  # `!cancelled()` rather than `always()`: the job must still run when the
  # canary jobs were skipped, but must not start after the run is cancelled.
  # A failed canary blocks the sweep.
  if: >-
    ${{
      !cancelled() &&
      needs.setup.result == 'success' &&
      needs.setup.outputs.reuse-enabled != 'true' &&
      needs.canary-sweep.result != 'failure' &&
      toJson(fromJson(needs.setup.outputs.search-space-config).single_node['agentic']) != 'null'
    }}
  uses: ./.github/workflows/benchmark-tmpl.yml
  name: agentic /
  strategy:
    fail-fast: false
    matrix:
      config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['agentic'] }}
  secrets: inherit
  with:
    exp-name: ${{ matrix.config.exp-name }}
    runner: ${{ matrix.config.runner }}
    image: ${{ matrix.config.image }}
    model: ${{ matrix.config.model }}
    model-prefix: ${{ matrix.config.model-prefix }}
    framework: ${{ matrix.config.framework }}
    precision: ${{ matrix.config.precision }}
    tp: ${{ matrix.config.tp }}
    ep: ${{ matrix.config.ep }}
    dp-attn: ${{ matrix.config.dp-attn }}
    conc: ${{ matrix.config.conc }}
    offloading: ${{ matrix.config.offloading }}
    duration: ${{ matrix.config.duration }}
    # These inputs are not taken from the matrix entry for agentic runs;
    # fixed placeholder values are passed instead.
    isl: '0'
    osl: '0'
    max-model-len: '0'
    spec-decoding: 'none'
    disagg: ${{ 'false' }}
    run-eval: false
    scenario-type: agentic-coding

sweep-multi-node-agentic:
  # Runs every `multi_node['agentic']` entry of the setup search space
  # through the multi-node benchmark template with the `agentic-coding`
  # scenario type, after the canary gate.
  #
  # NOTE(review): code scanning (CodeQL, "Workflow does not contain
  # permissions", Medium) flags this job for not limiting GITHUB_TOKEN
  # permissions. Add an explicit `permissions:` block once the scopes needed
  # by the called workflow are confirmed.
  needs: [setup, canary-select, canary-sweep]
  # `!cancelled()` rather than `always()`: the job must still run when the
  # canary jobs were skipped, but must not start after the run is cancelled.
  # A failed canary blocks the sweep.
  if: >-
    ${{
      !cancelled() &&
      needs.setup.result == 'success' &&
      needs.setup.outputs.reuse-enabled != 'true' &&
      needs.canary-sweep.result != 'failure' &&
      toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['agentic']) != 'null'
    }}
  uses: ./.github/workflows/benchmark-multinode-tmpl.yml
  name: multi-node agentic /
  strategy:
    fail-fast: false
    matrix:
      config: ${{ fromJson(needs.setup.outputs.search-space-config).multi_node['agentic'] }}
  secrets: inherit
  with:
    exp-name: ${{ matrix.config.exp-name }}
    # Sequence-length inputs are fixed placeholders for agentic runs.
    isl: '0'
    osl: '0'
    max-model-len: '0'
    runner: ${{ matrix.config.runner }}
    image: ${{ matrix.config.image }}
    model: ${{ matrix.config.model }}
    model-prefix: ${{ matrix.config.model-prefix }}
    framework: ${{ matrix.config.framework }}
    precision: ${{ matrix.config.precision }}
    # The entry's `conc` value is wrapped in brackets for `conc-list`;
    # the same value is also passed separately as `conc` below.
    conc-list: '[${{ matrix.config.conc }}]'
    spec-decoding: ${{ matrix.config.spec-decoding }}
    disagg: ${{ matrix.config.disagg }}
    prefill-num-worker: ${{ matrix.config.prefill.num-worker }}
    prefill-tp: ${{ matrix.config.prefill.tp }}
    prefill-ep: ${{ matrix.config.prefill.ep }}
    prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }}
    prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }}
    decode-num-worker: ${{ matrix.config.decode.num-worker }}
    decode-tp: ${{ matrix.config.decode.tp }}
    decode-ep: ${{ matrix.config.decode.ep }}
    decode-dp-attn: ${{ matrix.config.decode.dp-attn }}
    decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }}
    conc: ${{ matrix.config.conc }}
    duration: ${{ matrix.config.duration }}
    run-eval: false
    scenario-type: agentic-coding

sweep-evals:
  # Runs every `evals` entry of the setup search space through the
  # single-node benchmark template in eval-only mode, after the canary gate.
  #
  # NOTE(review): code scanning (CodeQL, "Workflow does not contain
  # permissions", Medium) flags this job for not limiting GITHUB_TOKEN
  # permissions. Add an explicit `permissions:` block once the scopes needed
  # by the called workflow are confirmed.
  needs: [setup, canary-select, canary-sweep]
  # `!cancelled()` rather than `always()`: the job must still run when the
  # canary jobs were skipped, but must not start after the run is cancelled.
  # A failed canary blocks the sweep. The list must be neither empty nor
  # missing.
  if: >-
    ${{
      !cancelled() &&
      needs.setup.result == 'success' &&
      needs.setup.outputs.reuse-enabled != 'true' &&
      needs.canary-sweep.result != 'failure' &&
      toJson(fromJson(needs.setup.outputs.search-space-config).evals) != '[]' &&
      toJson(fromJson(needs.setup.outputs.search-space-config).evals) != 'null'
    }}
  uses: ./.github/workflows/benchmark-tmpl.yml
  name: eval /
  strategy:
    fail-fast: false
    matrix:
      config: ${{ fromJson(needs.setup.outputs.search-space-config).evals }}
  secrets: inherit
  with:
    exp-name: ${{ matrix.config.exp-name }}
    isl: ${{ matrix.config.isl }}
    osl: ${{ matrix.config.osl }}
    max-model-len: ${{ matrix.config.max-model-len }}
    runner: ${{ matrix.config.runner }}
    image: ${{ matrix.config.image }}
    model: ${{ matrix.config.model }}
    model-prefix: ${{ matrix.config.model-prefix }}
    framework: ${{ matrix.config.framework }}
    precision: ${{ matrix.config.precision }}
    tp: ${{ matrix.config.tp }}
    ep: ${{ matrix.config.ep }}
    dp-attn: ${{ matrix.config.dp-attn }}
    conc: ${{ matrix.config.conc }}
    spec-decoding: ${{ matrix.config.spec-decoding }}
    disagg: ${{ matrix.config.disagg }}
    # Both eval flags are forced on for this job.
    run-eval: true
    eval-only: true

sweep-multi-node-evals:
  # Runs every `multinode_evals` entry of the setup search space through the
  # multi-node benchmark template in eval-only mode, after the canary gate.
  #
  # NOTE(review): code scanning (CodeQL, "Workflow does not contain
  # permissions", Medium) flags this job for not limiting GITHUB_TOKEN
  # permissions. Add an explicit `permissions:` block once the scopes needed
  # by the called workflow are confirmed.
  needs: [setup, canary-select, canary-sweep]
  # `!cancelled()` rather than `always()`: the job must still run when the
  # canary jobs were skipped, but must not start after the run is cancelled.
  # A failed canary blocks the sweep. The list must be neither empty nor
  # missing.
  if: >-
    ${{
      !cancelled() &&
      needs.setup.result == 'success' &&
      needs.setup.outputs.reuse-enabled != 'true' &&
      needs.canary-sweep.result != 'failure' &&
      toJson(fromJson(needs.setup.outputs.search-space-config).multinode_evals) != '[]' &&
      toJson(fromJson(needs.setup.outputs.search-space-config).multinode_evals) != 'null'
    }}
  uses: ./.github/workflows/benchmark-multinode-tmpl.yml
  name: multi-node eval /
  strategy:
    fail-fast: false
    matrix:
      config: ${{ fromJson(needs.setup.outputs.search-space-config).multinode_evals }}
  secrets: inherit
  with:
    exp-name: ${{ matrix.config.exp-name }}
    isl: ${{ matrix.config.isl }}
    osl: ${{ matrix.config.osl }}
    max-model-len: ${{ matrix.config.max-model-len }}
    runner: ${{ matrix.config.runner }}
    image: ${{ matrix.config.image }}
    model: ${{ matrix.config.model }}
    model-prefix: ${{ matrix.config.model-prefix }}
    framework: ${{ matrix.config.framework }}
    precision: ${{ matrix.config.precision }}
    # `conc` is serialized to a JSON string for the `conc-list` input.
    conc-list: ${{ toJson(matrix.config.conc) }}
    spec-decoding: ${{ matrix.config.spec-decoding }}
    disagg: ${{ matrix.config.disagg }}
    prefill-num-worker: ${{ matrix.config.prefill.num-worker }}
    prefill-tp: ${{ matrix.config.prefill.tp }}
    prefill-ep: ${{ matrix.config.prefill.ep }}
    prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }}
    prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }}
    decode-num-worker: ${{ matrix.config.decode.num-worker }}
    decode-tp: ${{ matrix.config.decode.tp }}
    decode-ep: ${{ matrix.config.decode.ep }}
    decode-dp-attn: ${{ matrix.config.decode.dp-attn }}
    decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }}
    # Both eval flags are forced on for this job.
    run-eval: true
    eval-only: true
    eval-conc: ${{ matrix.config.eval-conc }}

collect-results:

Check warning

Code scanning / CodeQL

Workflow does not contain permissions Medium

Actions job or workflow does not limit the permissions of the GITHUB_TOKEN. Consider setting an explicit permissions block, using the following as a minimal starting point: {}
needs:
[
canary-sweep,
sweep-single-node-1k1k,
sweep-single-node-8k1k,
sweep-agentic,
Expand All @@ -381,6 +514,7 @@
always() &&
needs.setup.result == 'success' &&
(
needs.canary-sweep.result == 'success' ||
needs.sweep-single-node-1k1k.result != 'skipped' ||
needs.sweep-single-node-8k1k.result != 'skipped' ||
needs.sweep-multi-node-1k1k.result != 'skipped' ||
Expand Down