Skip to content

Commit 47c949d

Browse files
committed
Add an e2e CI for CUDA windows workflow
1 parent 9591a67 commit 47c949d

4 files changed

Lines changed: 394 additions & 9 deletions

File tree

Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
#!/usr/bin/env pwsh
2+
# Copyright (c) Meta Platforms, Inc. and affiliates.
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under the BSD-style license found in the
6+
# LICENSE file in the root directory of this source tree.
7+
8+
param(
    # Target device; this script only implements the cuda-windows flow and
    # rejects anything else (see the guard below the preference settings).
    [Parameter(Mandatory = $true)]
    [string]$Device,

    # Hugging Face model id, e.g. "mistralai/Voxtral-Mini-3B-2507".
    [Parameter(Mandatory = $true)]
    [string]$HfModel,

    # Quantization label; informational — only echoed in the log banner.
    [Parameter(Mandatory = $true)]
    [string]$QuantName,

    # Directory containing the exported artifacts (model.pte, aoti_cuda_blob.ptd, ...).
    [string]$ModelDir = ".",

    # When non-empty, the CUDA release reported by nvcc must match this exactly
    # (major.minor, e.g. "12.8"); otherwise the version check is skipped.
    [string]$ExpectedCudaVersion = ""
)
# Fail fast: strict variable/property checking, stop on any error, and make
# native (non-cmdlet) command failures terminate the script too.
Set-StrictMode -Version Latest
$ErrorActionPreference = "Stop"
$PSNativeCommandUseErrorActionPreference = $true
# Suppress progress rendering (Invoke-WebRequest is markedly slower with it on).
$ProgressPreference = "SilentlyContinue"

# Everything below hardcodes cuda-windows presets and paths; refuse other devices.
if ($Device -ne "cuda-windows") {
    throw "Unsupported device '$Device'. Expected 'cuda-windows'."
}

Write-Host "Testing model: $HfModel (quantization: $QuantName)"
# Resolve the artifact directory to an absolute path and verify the two files
# every cuda-windows export is expected to produce.
$resolvedModelDir = (Resolve-Path -Path $ModelDir).Path
$modelPte = Join-Path -Path $resolvedModelDir -ChildPath "model.pte"
$cudaBlob = Join-Path -Path $resolvedModelDir -ChildPath "aoti_cuda_blob.ptd"

if (-not (Test-Path -Path $modelPte -PathType Leaf)) {
    throw "model.pte not found in '$resolvedModelDir'"
}
if (-not (Test-Path -Path $cudaBlob -PathType Leaf)) {
    throw "aoti_cuda_blob.ptd not found in '$resolvedModelDir'"
}
# Locate the executorch repo root (two levels above this script, which lives
# in .ci/scripts). $PSScriptRoot is the idiomatic automatic variable for this
# and, unlike $MyInvocation.MyCommand.Path, also works when the script is
# dot-sourced.
$scriptDir = $PSScriptRoot
$executorchRoot = (Resolve-Path -Path (Join-Path -Path $scriptDir -ChildPath "..\..")).Path

# Per-model configuration: cmake target/preset for the runner, a token the
# runner output must contain on success, and which auxiliary artifacts
# (preprocessor / tokenizer / audio sample) are required or downloadable.
switch ($HfModel) {
    "mistralai/Voxtral-Mini-3B-2507" {
        $runnerTarget = "voxtral_runner"
        $runnerPath = "voxtral"
        $runnerPreset = "voxtral-cuda"
        # Word expected to appear in the transcription of the test clip.
        $expectedOutput = "identity"
        $preprocessor = "voxtral_preprocessor.pte"
        $tokenizerUrl = "https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main" # @lint-ignore
        $tokenizerFile = "tekken.json"
        $audioUrl = "https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav"
        $audioFile = "poem.wav"
    }
    "nvidia/parakeet-tdt" {
        $runnerTarget = "parakeet_runner"
        $runnerPath = "parakeet"
        $runnerPreset = "parakeet-cuda"
        $expectedOutput = "Phoebe"
        # Parakeet needs no separate preprocessor .pte, and its tokenizer is
        # produced by the export itself (no download URL).
        $preprocessor = ""
        $tokenizerUrl = ""
        $tokenizerFile = "tokenizer.model"
        $audioUrl = "https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav"
        $audioFile = "test_audio.wav"
    }
    default {
        throw "Unsupported model '$HfModel'. Supported: mistralai/Voxtral-Mini-3B-2507, nvidia/parakeet-tdt"
    }
}
# Fetch a URL to a local path, skipping the download when the file already
# exists (lets re-runs on the same machine reuse cached artifacts).
function Download-IfNeeded {
    param(
        [Parameter(Mandatory = $true)]
        [string]$Url,
        [Parameter(Mandatory = $true)]
        [string]$OutFile
    )

    if (-not (Test-Path -Path $OutFile -PathType Leaf)) {
        Write-Host "Downloading $Url -> $OutFile"
        # $ErrorActionPreference = "Stop" (set at script scope) makes a failed
        # request terminate the script.
        Invoke-WebRequest -Uri $Url -OutFile $OutFile
    }
    else {
        Write-Host "Using existing file: $OutFile"
    }
}
# Main flow: toolchain check -> build -> gather artifacts -> run -> validate.
# Runs from the repo root; the outer finally restores the caller's directory.
Push-Location $executorchRoot
try {
    # --- Toolchain sanity check -------------------------------------------
    Write-Host "::group::Check CUDA toolchain"
    $nvccOutput = nvcc --version | Out-String
    Write-Host $nvccOutput
    nvidia-smi
    if (-not [string]::IsNullOrWhiteSpace($ExpectedCudaVersion)) {
        # nvcc prints e.g. "Cuda compilation tools, release 12.8, V12.8.93";
        # capture the major.minor release number.
        $versionMatch = [Regex]::Match($nvccOutput, "release\s+(\d+\.\d+)")
        if (-not $versionMatch.Success) {
            throw "Failed to parse CUDA version from nvcc output."
        }
        $actualCudaVersion = $versionMatch.Groups[1].Value
        if ($actualCudaVersion -ne $ExpectedCudaVersion) {
            throw "CUDA version mismatch. Expected: $ExpectedCudaVersion, Actual: $actualCudaVersion"
        }
        Write-Host "CUDA version check passed: $actualCudaVersion"
    }
    Write-Host "::endgroup::"

    # --- Build ExecuTorch core and the model runner -----------------------
    Write-Host "::group::Build ExecuTorch (CUDA)"
    # Leave one core free so the CI machine stays responsive during the build.
    $numCores = [Math]::Max([Environment]::ProcessorCount - 1, 1)
    cmake --preset llm-release-cuda
    cmake --build cmake-out --target install --config Release -j $numCores
    Write-Host "::endgroup::"

    Write-Host "::group::Build $runnerTarget"
    # The runner preset must be configured from its own example directory.
    Push-Location (Join-Path -Path $executorchRoot -ChildPath "examples\models\$runnerPath")
    try {
        cmake --preset $runnerPreset
        cmake --build (Join-Path -Path $executorchRoot -ChildPath "cmake-out\examples\models\$runnerPath") --target $runnerTarget --config Release -j $numCores
    }
    finally {
        Pop-Location
    }
    Write-Host "::endgroup::"

    # --- Collect model-specific artifacts ---------------------------------
    Write-Host "::group::Prepare Artifacts"
    if ($preprocessor -ne "") {
        $preprocessorPath = Join-Path -Path $resolvedModelDir -ChildPath $preprocessor
        if (-not (Test-Path -Path $preprocessorPath -PathType Leaf)) {
            throw "Required preprocessor artifact not found: $preprocessorPath"
        }
    }
    if ($tokenizerFile -ne "") {
        $tokenizerPath = Join-Path -Path $resolvedModelDir -ChildPath $tokenizerFile
        # With no download URL the tokenizer must already be in the model dir.
        if (-not (Test-Path -Path $tokenizerPath -PathType Leaf) -and $tokenizerUrl -eq "") {
            throw "Required tokenizer artifact not found: $tokenizerPath"
        }
    }
    if ($tokenizerUrl -ne "") {
        Download-IfNeeded -Url "$tokenizerUrl/$tokenizerFile" -OutFile (Join-Path -Path $resolvedModelDir -ChildPath $tokenizerFile)
    }
    if ($audioUrl -ne "") {
        Download-IfNeeded -Url $audioUrl -OutFile (Join-Path -Path $resolvedModelDir -ChildPath $audioFile)
    }
    Get-ChildItem -Path $resolvedModelDir
    Write-Host "::endgroup::"

    # --- Run the runner and validate its output ---------------------------
    Write-Host "::group::Run $runnerTarget"
    # Multi-config generators (Visual Studio) place the binary under Release\;
    # single-config generators (e.g. Ninja) place it next to the project.
    $runnerExeCandidates = @(
        (Join-Path -Path $executorchRoot -ChildPath "cmake-out\examples\models\$runnerPath\Release\$runnerTarget.exe"),
        (Join-Path -Path $executorchRoot -ChildPath "cmake-out\examples\models\$runnerPath\$runnerTarget.exe")
    )
    $runnerExe = $runnerExeCandidates | Where-Object { Test-Path -Path $_ -PathType Leaf } | Select-Object -First 1
    if (-not $runnerExe) {
        throw "Runner executable not found. Checked: $($runnerExeCandidates -join ', ')"
    }

    # Common arguments first, then per-model extras. Previously the parakeet
    # branch rebuilt the whole list from scratch, duplicating the
    # --model_path/--data_path literals already defined here; appending keeps
    # the common flags in exactly one place. (The runner parses these as named
    # flags, so relative order does not matter.)
    $runnerArgs = @("--model_path", $modelPte, "--data_path", $cudaBlob)
    switch ($HfModel) {
        "mistralai/Voxtral-Mini-3B-2507" {
            $runnerArgs += @(
                "--temperature", "0",
                "--tokenizer_path", (Join-Path -Path $resolvedModelDir -ChildPath $tokenizerFile),
                "--audio_path", (Join-Path -Path $resolvedModelDir -ChildPath $audioFile),
                "--processor_path", (Join-Path -Path $resolvedModelDir -ChildPath $preprocessor)
            )
        }
        "nvidia/parakeet-tdt" {
            $runnerArgs += @(
                "--audio_path", (Join-Path -Path $resolvedModelDir -ChildPath $audioFile),
                "--tokenizer_path", (Join-Path -Path $resolvedModelDir -ChildPath $tokenizerFile)
            )
        }
    }

    # Capture stdout/stderr to unique temp files so the combined output can be
    # both logged and pattern-matched (Start-Process cannot redirect straight
    # into variables).
    $stdoutFile = Join-Path -Path $env:TEMP -ChildPath ("et_runner_stdout_{0}.log" -f ([Guid]::NewGuid().ToString("N")))
    $stderrFile = Join-Path -Path $env:TEMP -ChildPath ("et_runner_stderr_{0}.log" -f ([Guid]::NewGuid().ToString("N")))
    try {
        $proc = Start-Process `
            -FilePath $runnerExe `
            -ArgumentList $runnerArgs `
            -NoNewWindow `
            -Wait `
            -PassThru `
            -RedirectStandardOutput $stdoutFile `
            -RedirectStandardError $stderrFile

        $stdout = if (Test-Path -Path $stdoutFile -PathType Leaf) { Get-Content -Path $stdoutFile -Raw } else { "" }
        $stderr = if (Test-Path -Path $stderrFile -PathType Leaf) { Get-Content -Path $stderrFile -Raw } else { "" }
        $output = @($stdout, $stderr) -join [Environment]::NewLine
        $exitCode = $proc.ExitCode
    }
    finally {
        # Best-effort cleanup; ignore failures so they cannot mask a runner error.
        Remove-Item -Path $stdoutFile -ErrorAction SilentlyContinue
        Remove-Item -Path $stderrFile -ErrorAction SilentlyContinue
    }
    Write-Host "Runner output:"
    Write-Host $output

    if ($exitCode -ne 0) {
        throw "Runner exited with code $exitCode`n$output"
    }

    # -notmatch treats its operand as a regex, so escape the literal needle.
    if ($expectedOutput -ne "" -and $output -notmatch [Regex]::Escape($expectedOutput)) {
        throw "Expected output '$expectedOutput' not found in runner output"
    }
    Write-Host "Success: '$expectedOutput' found in output"
    Write-Host "::endgroup::"
}
finally {
    Pop-Location
}

.github/workflows/cuda-windows.yml

Lines changed: 51 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
1-
# Test ExecuTorch CUDA Windows Cross-Compilation Export
2-
# This workflow tests model export targeting CUDA Windows using optimum-executorch.
3-
# It runs on a Linux machine with CUDA and uses the executorch-ubuntu-22.04-cuda-windows
4-
# Docker image which has mingw and Windows CUDA SDK pre-installed for cross-compilation.
1+
# Test ExecuTorch CUDA Windows Artifacts
2+
# This workflow exports models targeting CUDA Windows using optimum-executorch on Linux.
3+
# Then it runs those exported artifacts on a Windows CI machine.
54

6-
name: Test CUDA Windows Export
5+
name: Test CUDA Windows Export and E2E
76

87
on:
98
pull_request:
@@ -16,6 +15,9 @@ concurrency:
1615
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
1716
cancel-in-progress: false
1817

18+
env:
19+
CUDA_VERSION: "12.8"
20+
1921
jobs:
2022
export-model-cuda-windows-artifact:
2123
name: export-model-cuda-windows-artifact
@@ -35,14 +37,14 @@ jobs:
3537
- repo: "nvidia"
3638
name: "parakeet-tdt"
3739
quant:
38-
- "non-quantized"
39-
- "quantized-int4-weight-only"
40+
- "non-quantized"
41+
- "quantized-int4-weight-only"
4042
with:
4143
timeout: 90
4244
secrets-env: EXECUTORCH_HF_TOKEN
4345
runner: linux.g5.4xlarge.nvidia.gpu
4446
gpu-arch-type: cuda
45-
gpu-arch-version: 12.8
47+
gpu-arch-version: ${{ env.CUDA_VERSION }}
4648
docker-image: ci-image:executorch-ubuntu-22.04-cuda-windows
4749
submodules: recursive
4850
upload-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-windows-${{ matrix.quant }}
@@ -81,3 +83,44 @@ jobs:
8183
echo "::endgroup::"
8284
8385
source .ci/scripts/export_model_artifact.sh cuda-windows "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
86+
87+
test-model-cuda-windows-e2e:
88+
name: test-model-cuda-windows-e2e
89+
needs: export-model-cuda-windows-artifact
90+
uses: pytorch/test-infra/.github/workflows/windows_job.yml@main
91+
strategy:
92+
fail-fast: false
93+
matrix:
94+
model:
95+
- repo: "mistralai"
96+
name: "Voxtral-Mini-3B-2507"
97+
- repo: "nvidia"
98+
name: "parakeet-tdt"
99+
quant:
100+
- "non-quantized"
101+
- "quantized-int4-weight-only"
102+
with:
103+
timeout: 240
104+
submodules: recursive
105+
download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-windows-${{ matrix.quant }}
106+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
107+
script: |
108+
conda init powershell
109+
powershell -Command "& {
110+
Set-PSDebug -Trace 1
111+
\$ErrorActionPreference = 'Stop'
112+
\$PSNativeCommandUseErrorActionPreference = \$true
113+
114+
.ci/scripts/setup-windows.ps1
115+
\$artifactDir = \$env:RUNNER_ARTIFACT_DIR
116+
if ([string]::IsNullOrWhiteSpace(\$artifactDir)) {
117+
throw 'RUNNER_ARTIFACT_DIR is empty. Ensure download-artifact is configured for windows_job.yml.'
118+
}
119+
120+
.ci/scripts/test_model_e2e_windows.ps1 `
121+
-Device cuda-windows `
122+
-HfModel '${{ matrix.model.repo }}/${{ matrix.model.name }}' `
123+
-QuantName '${{ matrix.quant }}' `
124+
-ModelDir \$artifactDir `
125+
-ExpectedCudaVersion '${{ env.CUDA_VERSION }}'
126+
}"

examples/models/parakeet/README.md

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,47 @@ This generates:
132132
- `aoti_cuda_blob.ptd` - CUDA kernel blob required at runtime
133133
- `tokenizer.model` - SentencePiece tokenizer
134134

135+
### CUDA-Windows Export
136+
137+
Before running `cuda-windows` export, make sure these requirements are set up:
138+
- `x86_64-w64-mingw32-g++` is installed and on `PATH` (mingw-w64 cross-compiler).
139+
- `WINDOWS_CUDA_HOME` points to the extracted Windows CUDA package directory.
140+
141+
Example setup on Ubuntu:
142+
143+
```bash
144+
# 1) Install cross-compiler + extraction tools
145+
sudo apt-get update
146+
sudo apt-get install -y --no-install-recommends \
147+
g++-mingw-w64-x86-64-posix mingw-w64-tools p7zip-full wget
148+
149+
# 2) Verify cross-compiler
150+
x86_64-w64-mingw32-g++ --version
151+
152+
# 3) Download and extract Windows CUDA installer package
153+
CUDA_VERSION=12.8.1
154+
CUDA_DRIVER_VERSION=572.61
155+
CUDA_INSTALLER="cuda_${CUDA_VERSION}_${CUDA_DRIVER_VERSION}_windows.exe"
156+
CUDA_URL="https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${CUDA_INSTALLER}"
157+
158+
mkdir -p /opt/cuda-windows
159+
cd /opt/cuda-windows
160+
wget -q "${CUDA_URL}" -O "${CUDA_INSTALLER}"
161+
7z x "${CUDA_INSTALLER}" -oextracted -y
162+
163+
# 4) Point WINDOWS_CUDA_HOME to extracted Windows CUDA payload
164+
export WINDOWS_CUDA_HOME=/opt/cuda-windows/extracted/cuda_cudart/cudart
165+
```
166+
167+
```bash
168+
python export_parakeet_tdt.py --backend cuda-windows --output-dir ./parakeet_cuda_windows
169+
```
170+
171+
This generates:
172+
- `model.pte` - The compiled Parakeet TDT model
173+
- `aoti_cuda_blob.ptd` - CUDA kernel blob required at runtime
174+
- `tokenizer.model` - SentencePiece tokenizer
175+
135176
## C++ Runner
136177

137178
### Building
@@ -149,6 +190,15 @@ make parakeet-metal
149190
make parakeet-cuda
150191
```
151192

193+
On Windows (PowerShell), use CMake workflow presets directly:
194+
195+
```powershell
196+
cmake --workflow --preset llm-release-cuda
197+
Push-Location examples/models/parakeet
198+
cmake --workflow --preset parakeet-cuda
199+
Pop-Location
200+
```
201+
152202
### Running
153203

154204
From the executorch root directory:
@@ -174,12 +224,24 @@ DYLD_LIBRARY_PATH=/usr/lib ./cmake-out/examples/models/parakeet/parakeet_runner
174224
--tokenizer_path examples/models/parakeet/parakeet_cuda/tokenizer.model
175225
```
176226

227+
Windows (PowerShell):
228+
229+
```powershell
230+
.\cmake-out\examples\models\parakeet\Release\parakeet_runner.exe `
231+
--model_path C:\path\to\parakeet_cuda_windows\model.pte `
232+
--data_path C:\path\to\parakeet_cuda_windows\aoti_cuda_blob.ptd `
233+
--audio_path C:\path\to\audio.wav `
234+
--tokenizer_path C:\path\to\parakeet_cuda_windows\tokenizer.model
235+
```
236+
237+
If your generator is single-config, the runner may be at `.\cmake-out\examples\models\parakeet\parakeet_runner.exe` instead.
238+
177239
### Runner Arguments
178240

179241
| Argument | Description |
180242
|----------|-------------|
181243
| `--model_path` | Path to Parakeet model (.pte) |
182244
| `--audio_path` | Path to input audio file (.wav) |
183245
| `--tokenizer_path` | Path to tokenizer file (default: `tokenizer.json`) |
184-
| `--data_path` | Path to data file (.ptd) for delegate data (required for CUDA) |
246+
| `--data_path` | Path to data file (.ptd) for delegate data (required for CUDA/CUDA-Windows) |
185247
| `--timestamps` | Timestamp output mode: `none\|token\|word\|segment\|all` (default: `segment`) |

0 commit comments

Comments
 (0)