Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ help.sh
docker.log
docs/package-lock.json

__pycache__
*.pyc

# auto-generated script
build_wrapper.sh

Expand All @@ -14,6 +17,10 @@ logging/*
debug
debug/*

# benchmark outputs
benchmark/results
benchmark/results/*

compile_wrapper.sh

*.tar.gz
Expand Down
8 changes: 3 additions & 5 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,8 @@ name = "cuNumeric"
uuid = "0fd9ffd4-7e84-4cd0-b8f8-645bd8c73620"
version = "0.1.1"

[workspace]
projects = ["test", "dev"]

[deps]
CNPreferences = "3e078157-ea10-49d5-bf32-908f777cd46f"
CUDA_SDK_jll = "6cbf2f2e-7e60-5632-ac76-dca2274e0be0"
CxxWrap = "1f15a43c-97ca-5a2a-ae31-89f07a497df4"
JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899"
Legate = "1238f2cf-6593-4d60-9aca-2f5364e49909"
Expand All @@ -33,7 +29,6 @@ CUDAExt = "CUDA"
[compat]
CNPreferences = "0.1.2"
CUDA = "5.9"
CUDA_SDK_jll = "13"
CxxWrap = "0.17"
JuliaFormatter = "2.3.0"
Legate = "0.1.2"
Expand All @@ -47,3 +42,6 @@ StatsBase = "0.34"
cunumeric_jl_wrapper_jll = "25.10.3"
cupynumeric_jll = "25.10.3"
julia = "1.10"

[workspace]
projects = ["test", "dev"]
12 changes: 12 additions & 0 deletions benchmark/Project.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CUDACore = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
cuNumeric = "0fd9ffd4-7e84-4cd0-b8f8-645bd8c73620"

[extras]
CNPreferences = "3e078157-ea10-49d5-bf32-908f777cd46f"
LegatePreferences = "8028f36a-2b64-49e9-aa04-2d0933fd2ed9"
78 changes: 78 additions & 0 deletions benchmark/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Benchmark configuration

Benchmarks are declared in `benchmarks.toml`. `run.jl` parses it.

## Running

```bash
julia --project run.jl # runs whatever benchmarks.toml configures
```

`run.jl` runs each (benchmark, backend) pair in its own process via
`run_benchmark.sh`, so backends never share a GPU/runtime within a measurement.
cuNumeric always runs; extra comparison backends are toggled in `[Global]`:

- `cuda = true` → also run under CUDA.jl (single-GPU configs only; CUDA.jl is
single-device).
- `cupynumeric = true` → also run under cupynumeric (see below).

### Comparing against cupynumeric

cupynumeric runs in a conda env whose major.minor matches this project's
resolved `cupynumeric_jll`. Build it once:

```bash
./install_cupynumeric.sh # creates env cupynumeric-bench-<major.minor>
```

`run.jl` derives the env name automatically; override it with `CUPYNUMERIC_ENV`.

## Layout

```toml
[Global]
n_warmup = 5
n_iter = 1000
n_trial = 5

[[gemm]] # name registered in src/benchmarks.jl
T = "Float32" # element type
gpus = 1
cpus = 2
N = 150
M = 150 # optional, defaults to 1
```

Repeat a `[[name]]` block to add independent configs.

## Lists

Any of `T`, `gpus`, `cpus`, `N`, `M` may be a list. They expand along
two axes:
- **`T` multiplies.** The whole sweep runs once per type.
- **`gpus`, `cpus`, `N`, `M` zip** into a single lockstep sweep — element `i`
of each is paired together.

Each zipped field must be one of:

- a scalar or single-element list (`cpus = 2` or `[2]`) -> broadcast to every config
- a list whose length equals the sweep length

Any other length mismatch is an error.

```toml
[[gemm]]
T = ["Float64", "Float32"] # multiplies
gpus = [1, 2, 4] #
cpus = 2 # zip -> (1,2,150,150), (2,2,300,300), (4,2,600,600)
N = [150, 300, 600] #
M = [150, 300, 600] #
```

-> 2 types * 3 sweep points = **6 runs**.

### Gotcha

When `T = ["Float32", "Float64"]` is combined with a length-2 `N`/`M` sweep, you
get all **4** combinations, not a paired `Float32 -> N[1], Float64 -> N[2]`. To
pin a type to a specific size, use separate `[[name]]` blocks.
48 changes: 48 additions & 0 deletions benchmark/benchmarks.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
[Global]
n_warmup = 5
n_iter = 1000
n_trial = 5
cupynumeric = true # compare against cupynumeric (needs install_cupynumeric.sh)
cuda = false # compare against CUDA.jl (single-GPU configs only)

####################################
# GEMM #
# Work ~ 2*N^2*M. Hold N, scale M. #
####################################

[[gemm]]
T = ["Float32", "Float64"]
gpus = [1, 2, 4, 8]
cpus = 2
N = 4096
M = [4096, 8192, 16384, 32768]

#################################
# Gray-Scott #
# Work ~ N*M. Hold N, scale M. #
#################################

[[grayscott_baseline]]
T = "Float32"
gpus = [1, 2, 4, 8]
cpus = 2
N = 1024
M = [1024, 2048, 4096, 8192]

[[grayscott_lifetimes]]
T = "Float32"
gpus = [1, 2, 4, 8]
cpus = 2
N = 1024
M = [1024, 2048, 4096, 8192]

#################################
# Monte-Carlo Integration #
# Work ~ N. Scale N linearly #
#################################

[[montecarlo]]
T = "Float32"
gpus = [1, 2, 4, 8]
cpus = 2
N = [1_000_000, 2_000_000, 4_000_000, 8_000_000]
73 changes: 73 additions & 0 deletions benchmark/install_cupynumeric.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/bin/bash
# Install a cupynumeric conda env matching the cupynumeric_jll our project resolves.
# The conda package and the JLL share the calendar-versioning scheme (e.g. 25.10),
# so we pin major.minor (patch ignored) and install from the legate channel.
#
# Usage:
#   ./install_cupynumeric.sh                  # create a fresh env named cupynumeric-bench-<ver>
#   ./install_cupynumeric.sh --name myenv     # override the env name
#   ./install_cupynumeric.sh --into existing  # install into an existing env instead of creating one
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

ENV_NAME=""
INTO_ENV=""

# Print the one-line usage summary.
usage() {
    echo "Usage: $0 [--name <env>] [--into <existing-env>]"
}

# Abort with a readable message when an option is missing its value.
# Without this check `set -u` would kill the script with an opaque
# "$2: unbound variable" error.
require_value() {
    if [[ $# -lt 2 ]]; then
        echo "Error: $1 requires a value" >&2
        usage >&2
        exit 1
    fi
}

while [[ $# -gt 0 ]]; do
    case $1 in
        --name)
            require_value "$@"
            ENV_NAME=$2
            shift 2
            ;;
        --into)
            require_value "$@"
            INTO_ENV=$2
            shift 2
            ;;
        *)
            echo "Unknown argument: $1" >&2
            usage >&2
            exit 1
            ;;
    esac
done

# Resolve the JLL version Julia actually instantiated for this project, then keep
# major.minor only — conda packages are not published per patch.
# The `|| VER=""` keeps a failing julia invocation (pipefail + errexit) from
# terminating the script before the explanatory error below can be printed.
echo "Detecting cupynumeric_jll version from the benchmark project..."
VER=$(cd "$SCRIPT_DIR" && julia --project -e '
using Pkg
for (_, info) in Pkg.dependencies()
    info.name == "cupynumeric_jll" || continue
    v = info.version
    isnothing(v) && continue
    println("$(v.major).$(v.minor)")
end' | tail -1) || VER=""

if [[ -z "$VER" ]]; then
    echo "Error: could not detect cupynumeric_jll version. Has the project been instantiated?" >&2
    exit 1
fi

echo "cupynumeric_jll major.minor: $VER"
SPEC="cupynumeric=$VER.*"

if [[ -n "$INTO_ENV" ]]; then
    echo "Installing $SPEC into existing env '$INTO_ENV'..."
    conda install -y -n "$INTO_ENV" -c conda-forge -c legate "$SPEC"
    echo "Done. Activate with: conda activate $INTO_ENV"
    exit 0
fi

[[ -z "$ENV_NAME" ]] && ENV_NAME="cupynumeric-bench-$VER"

# Collect the env names first, then match with a fixed-string, whole-line grep:
#  * -F stops the "." in the version from acting as a regex wildcard;
#  * matching against a here-string (instead of `... | grep -q`) avoids a
#    pipefail false-negative if grep exits before the producer finishes writing.
EXISTING_ENVS=$(conda env list | awk '{print $1}')
if grep -Fqx -- "$ENV_NAME" <<<"$EXISTING_ENVS"; then
    echo "Env '$ENV_NAME' already exists with $SPEC; nothing to do."
    echo "Activate with: conda activate $ENV_NAME"
    exit 0
fi

echo "Creating env '$ENV_NAME' with $SPEC..."
conda create -y -n "$ENV_NAME" -c conda-forge -c legate "$SPEC"

echo "Done. Activate with: conda activate $ENV_NAME"
103 changes: 103 additions & 0 deletions benchmark/run.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# run.jl: orchestrator. Builds one run_benchmark.sh command per benchmark and
# dispatches it; the script sets LEGATE_CONFIG (from --gpus/--cpus) before
# launching the worker (single.jl) that actually runs the benchmark.
# no args -> one command per benchmarks.toml entry
# with args -> one command from <gpus> <cpus> <name> <T> <N> <M> <iter> <warmup> <trial>

# Orchestrator stays off the GPU: it only needs GlobalSettings + parse_config,
# both cuNumeric-free. The worker (single.jl) loads cuNumeric and the kernels.

using Pkg

include("src/core.jl")
include("src/parse_benchmarks.jl")

const RUNNER = joinpath(@__DIR__, "run_benchmark.sh")
const WORKER = joinpath(@__DIR__, "src/single.jl")
const PY_WORKER = joinpath(@__DIR__, "src_py/single.py")

"""
    banner(msg)

Print `msg` to stdout framed above and below by a 128-character rule of `=`,
preceded by a blank line.
"""
function banner(msg)
    rule = "="^128
    return println("\n", rule, "\n", msg, "\n", rule)
end

"""
    cunumeric_only(name)

Return `true` when the benchmark `name` ends in `_lifetimes`. That suffix marks
a cuNumeric-only code-path variant (`@analyze_lifetimes`), so such benchmarks
are not run under the comparison backends.
"""
function cunumeric_only(name)
    return endswith(name, "_lifetimes")
end

"""
    ensure_project_ready()

Make sure the active project is resolved and has the parent directory of this
file develop'd into it: `Pkg.develop` the package one level above this file,
then `Pkg.instantiate()`.
"""
function ensure_project_ready()
    parent_pkg = joinpath(@__DIR__, "..")
    Pkg.develop(; path=parent_pkg)
    return Pkg.instantiate()
end

"""
    cupynumeric_env_name()

Return the name of the conda env used for cupynumeric runs.

If the `CUPYNUMERIC_ENV` environment variable is set, its value is returned
as-is. Otherwise the name mirrors install_cupynumeric.sh:
`cupynumeric-bench-<major>.<minor>`, taken from the version of `cupynumeric_jll`
reported by `Pkg.dependencies()`. Throws an error if no such dependency with a
known version is found.
"""
function cupynumeric_env_name()
    override = get(ENV, "CUPYNUMERIC_ENV", nothing)
    override === nothing || return override

    for dep in values(Pkg.dependencies())
        # Skip everything but a cupynumeric_jll entry with a resolved version.
        if dep.name == "cupynumeric_jll" && dep.version !== nothing
            ver = dep.version
            return "cupynumeric-bench-$(ver.major).$(ver.minor)"
        end
    end
    return error("could not resolve cupynumeric_jll version; set CUPYNUMERIC_ENV explicitly")
end

"""
    dispatch(; gpus, cpus, name, T, N, M, n_iter, n_warmup, n_trial,
             cupynumeric=false, cudajl=false)

Run one benchmark configuration, launching each backend as its own
`run_benchmark.sh` process.

The cuNumeric worker always runs. The CUDA.jl worker is added only when
`cudajl` is set, `gpus == 1`, and the benchmark is not cuNumeric-only. The
cupynumeric Python worker is added when `cupynumeric` is set and the benchmark
is not cuNumeric-only. A command that fails is logged with `@error` and the
remaining commands still run.
"""
function dispatch(; gpus, cpus, name, T, N, M, n_iter, n_warmup, n_trial,
    cupynumeric=false, cudajl=false)
    header = "$(name): T=$(T) gpus=$(gpus) cpus=$(cpus) N=$(N) M=$(M) " *
             "n_iter=$(n_iter) n_warmup=$(n_warmup) n_trial=$(n_trial)"
    banner(header)

    # Arguments shared by every backend; each backend gets its own process.
    shared = `--gpus $gpus --cpus $cpus $name $T $N $M $n_iter $n_warmup $n_trial`
    queue = [`bash $RUNNER $WORKER $shared cunumeric`]

    # CUDA.jl is single-GPU only.
    wants_cudajl = cudajl && gpus == 1 && !cunumeric_only(name)
    wants_cudajl && push!(queue, `bash $RUNNER $WORKER $shared cudajl`)

    wants_cupynumeric = cupynumeric && !cunumeric_only(name)
    if wants_cupynumeric
        pyenv = cupynumeric_env_name()
        push!(queue, `bash $RUNNER $PY_WORKER --pyenv $pyenv $shared`)
    end

    for job in queue
        try
            run(job)
        catch err
            @error "Benchmark '$(name)' failed; continuing." exception = err
        end
    end
end

"""
    run_all_benchmarks(config="benchmarks.toml")

Parse `config` (resolved relative to this file's directory) with `parse_config`
and call `dispatch` once per returned spec, forwarding the global iteration,
warmup and trial counts together with the `cupynumeric` / `cuda` backend
toggles.
"""
function run_all_benchmarks(config="benchmarks.toml")
    config_path = joinpath(@__DIR__, config)
    settings, specs = parse_config(config_path)
    foreach(specs) do spec
        # spec.args unpacks into the two problem-size arguments.
        n, m = spec.args
        dispatch(;
            name=spec.name,
            T=spec.T,
            gpus=spec.gpus,
            cpus=spec.cpus,
            N=n,
            M=m,
            n_iter=settings.n_iter,
            n_warmup=settings.n_warmup,
            n_trial=settings.n_trial,
            cupynumeric=settings.cupynumeric,
            cudajl=settings.cuda,
        )
    end
end

# Entry point.
#   no args   -> run every configuration declared in benchmarks.toml
#   with args -> run a single configuration given positionally as
#                <gpus> <cpus> <name> <T> <N> <M> <iter> <warmup> <trial>
#
# Validate the argument count before the (slow) project setup so that a short
# command line fails immediately with a usage message instead of a BoundsError
# raised halfway through building the dispatch call. Extra trailing arguments
# are ignored.
if !isempty(ARGS) && length(ARGS) < 9
    error(
        "expected 9 arguments: <gpus> <cpus> <name> <T> <N> <M> <iter> <warmup> <trial>; " *
        "got $(length(ARGS))",
    )
end

ensure_project_ready()
if isempty(ARGS)
    run_all_benchmarks()
else
    # Single-run mode: only the cuNumeric backend runs, because the comparison
    # backends default to off in `dispatch` and are not passed here.
    dispatch(;
        gpus=parse(Int, ARGS[1]),
        cpus=parse(Int, ARGS[2]),
        name=ARGS[3],
        T=ARGS[4],
        N=parse(Int, ARGS[5]),
        M=parse(Int, ARGS[6]),
        n_iter=parse(Int, ARGS[7]),
        n_warmup=parse(Int, ARGS[8]),
        n_trial=parse(Int, ARGS[9]),
    )
end
Loading
Loading