JuliaLegate · ejmeitz · Jun 1, 2026 · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026
diff --git a/.gitignore b/.gitignore
@@ -6,6 +6,9 @@ help.sh
 docker.log
 docs/package-lock.json
 
+__pycache__
+*.pyc
+
 # auto-generated script
 build_wrapper.sh
 
@@ -14,6 +17,10 @@ logging/*
 debug
 debug/*
 
+# benchmark outputs
+benchmark/results
+benchmark/results/*
+
 compile_wrapper.sh
 
 *.tar.gz

diff --git a/Project.toml b/Project.toml
@@ -2,12 +2,8 @@ name = "cuNumeric"
 uuid = "0fd9ffd4-7e84-4cd0-b8f8-645bd8c73620"
 version = "0.1.1"
 
-[workspace]
-projects = ["test", "dev"]
-
 [deps]
 CNPreferences = "3e078157-ea10-49d5-bf32-908f777cd46f"
-CUDA_SDK_jll = "6cbf2f2e-7e60-5632-ac76-dca2274e0be0"
 CxxWrap = "1f15a43c-97ca-5a2a-ae31-89f07a497df4"
 JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899"
 Legate = "1238f2cf-6593-4d60-9aca-2f5364e49909"
@@ -33,7 +29,6 @@ CUDAExt = "CUDA"
 [compat]
 CNPreferences = "0.1.2"
 CUDA = "5.9"
-CUDA_SDK_jll = "13"
 CxxWrap = "0.17"
 JuliaFormatter = "2.3.0"
 Legate = "0.1.2"
@@ -47,3 +42,6 @@ StatsBase = "0.34"
 cunumeric_jl_wrapper_jll = "25.10.3"
 cupynumeric_jll = "25.10.3"
 julia = "1.10"
+
+[workspace]
+projects = ["test", "dev"]
diff --git a/benchmark/Project.toml b/benchmark/Project.toml
@@ -0,0 +1,12 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+CUDACore = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
+Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
+cuNumeric = "0fd9ffd4-7e84-4cd0-b8f8-645bd8c73620"
+
+[extras]
+CNPreferences = "3e078157-ea10-49d5-bf32-908f777cd46f"
+LegatePreferences = "8028f36a-2b64-49e9-aa04-2d0933fd2ed9"
diff --git a/benchmark/README.md b/benchmark/README.md
@@ -0,0 +1,78 @@
+# Benchmark configuration
+
+Benchmarks are declared in `benchmarks.toml`. `run.jl` parses it.
+
+## Running
+
+```bash
+julia --project run.jl     # runs whatever benchmarks.toml configures
+```
+
+`run.jl` runs each (benchmark, backend) pair in its own process via
+`run_benchmark.sh`, so backends never share a GPU/runtime within a measurement.
+cuNumeric always runs; extra comparison backends are toggled in `[Global]`:
+
+- `cuda = true` → also run under CUDA.jl (single-GPU configs only; CUDA.jl is
+  single-device).
+- `cupynumeric = true` → also run under cupynumeric (see below).
+
+### Comparing against cupynumeric
+
+cupynumeric runs in a conda env whose cupynumeric version matches the major.minor of
+this project's resolved `cupynumeric_jll`. Build it once:
+
+```bash
+./install_cupynumeric.sh   # creates env cupynumeric-bench-<major.minor>
+```
+
+`run.jl` derives the env name automatically; override it with `CUPYNUMERIC_ENV`.
+
+## Layout
+
+```toml
+[Global]
+n_warmup = 5
+n_iter   = 1000
+n_trial  = 5
+
+[[gemm]]            # name registered in src/benchmarks.jl
+T    = "Float32"     # element type
+gpus = 1
+cpus = 2
+N    = 150
+M    = 150           # optional, defaults to 1
+```
+
+Repeat a `[[name]]` block to add independent configs.
+
+## Lists
+
+Any of `T`, `gpus`, `cpus`, `N`, `M` may be a list. They expand along
+two axes:
+- **`T` multiplies.** The whole sweep runs once per type.
+- **`gpus`, `cpus`, `N`, `M` zip** into a single lockstep sweep — element `i`
+  of each is paired together.
+
+Each zipped field must be one of:
+
+- a scalar or single-element list (`cpus = 2` or `[2]`) -> broadcast to every config
+- a list whose length equals the sweep length
+
+Any other length mismatch is an error.
+
+```toml
+[[gemm]]
+T    = ["Float64", "Float32"]   # multiplies
+gpus = [1, 2, 4]                #
+cpus = 2                        # zip -> (1,2,150,150), (2,2,300,300), (4,2,600,600)
+N    = [150, 300, 600]          #
+M    = [150, 300, 600]          #
+```
+
+-> 2 types * 3 sweep points = **6 runs**.
+
+### Gotcha
+
+When `T = ["Float32", "Float64"]` is combined with a length-2 `N`/`M` sweep, you get
+all **4** combinations, not a paired `Float32 -> N[1], Float64 -> N[2]`. To pin a
+type to a specific size, use separate `[[name]]` blocks.
diff --git a/benchmark/benchmarks.toml b/benchmark/benchmarks.toml
@@ -0,0 +1,48 @@
+[Global]
+n_warmup = 5
+n_iter = 1000
+n_trial = 5
+cupynumeric = true # compare against cupynumeric (needs install_cupynumeric.sh)
+cuda = false # compare against CUDA.jl (single-GPU configs only)
+
+####################################
+#             GEMM                 #
+# Work ~ 2*N^2*M. Hold N, scale M. #
+####################################
+
+[[gemm]]
+T = ["Float32", "Float64"]
+gpus = [1, 2, 4, 8]
+cpus = 2
+N = 4096
+M = [4096, 8192, 16384, 32768]
+
+#################################
+#         Gray-Scott            #
+#  Work ~ N*M. Hold N, scale M. #
+#################################
+
+[[grayscott_baseline]]
+T = "Float32"
+gpus = [1, 2, 4, 8]
+cpus = 2
+N = 1024
+M = [1024, 2048, 4096, 8192]
+
+[[grayscott_lifetimes]]
+T = "Float32"
+gpus = [1, 2, 4, 8]
+cpus = 2
+N = 1024
+M = [1024, 2048, 4096, 8192]
+
+#################################
+#   Monte-Carlo Integration     #
+#  Work ~ N. Scale N linearly   #
+#################################
+
+[[montecarlo]]
+T = "Float32"
+gpus = [1, 2, 4, 8]
+cpus = 2
+N = [1_000_000, 2_000_000, 4_000_000, 8_000_000]
diff --git a/benchmark/install_cupynumeric.sh b/benchmark/install_cupynumeric.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+# Install a cupynumeric conda env matching the cupynumeric_jll our project resolves.
+# The conda package and the JLL share the calendar-versioning scheme (e.g. 25.10),
+# so we pin major.minor (patch ignored) and install from the legate channel.
+#
+# Usage:
+#   ./install_cupynumeric.sh                 # create a fresh env named cupynumeric-bench-<ver>
+#   ./install_cupynumeric.sh --name myenv    # override the env name
+#   ./install_cupynumeric.sh --into existing # install into an existing env instead of creating one
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+ENV_NAME=""
+INTO_ENV=""
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --name)
+            ENV_NAME=$2
+            shift 2
+            ;;
+        --into)
+            INTO_ENV=$2
+            shift 2
+            ;;
+        *)
+            echo "Unknown argument: $1"
+            echo "Usage: $0 [--name <env>] [--into <existing-env>]"
+            exit 1
+            ;;
+    esac
+done
+
+# Resolve the JLL version Julia actually instantiated for this project, then keep
+# major.minor only — conda packages are not published per patch.
+echo "Detecting cupynumeric_jll version from the benchmark project..."
+VER=$(cd "$SCRIPT_DIR" && julia --project -e '
+using Pkg
+for (_, info) in Pkg.dependencies()
+    info.name == "cupynumeric_jll" || continue
+    v = info.version
+    isnothing(v) && continue
+    println("$(v.major).$(v.minor)")
+end' | tail -1)
+
+if [[ -z "$VER" ]]; then
+    echo "Error: could not detect cupynumeric_jll version. Has the project been instantiated?"
+    exit 1
+fi
+
+echo "cupynumeric_jll major.minor: $VER"
+SPEC="cupynumeric=$VER.*"
+
+if [[ -n "$INTO_ENV" ]]; then
+    echo "Installing $SPEC into existing env '$INTO_ENV'..."
+    conda install -y -n "$INTO_ENV" -c conda-forge -c legate "$SPEC"
+    echo "Done. Activate with: conda activate $INTO_ENV"
+    exit 0
+fi
+
+[[ -z "$ENV_NAME" ]] && ENV_NAME="cupynumeric-bench-$VER"
+
+if conda env list | awk '{print $1}' | grep -qx "$ENV_NAME"; then
+    echo "Env '$ENV_NAME' already exists with $SPEC; nothing to do."
+    echo "Activate with: conda activate $ENV_NAME"
+    exit 0
+fi
+
+echo "Creating env '$ENV_NAME' with $SPEC..."
+conda create -y -n "$ENV_NAME" -c conda-forge -c legate "$SPEC"
+
+echo "Done. Activate with: conda activate $ENV_NAME"
diff --git a/benchmark/run.jl b/benchmark/run.jl
@@ -0,0 +1,103 @@
+# run.jl: orchestrator. Builds one run_benchmark.sh command per (benchmark config,
+# backend) pair and dispatches it; the script sets LEGATE_CONFIG (from --gpus/--cpus)
+# before launching the worker (single.jl) that actually runs the benchmark.
+#   no args   -> every config parsed from benchmarks.toml, on each enabled backend
+#   with args -> cuNumeric only: <gpus> <cpus> <name> <T> <N> <M> <iter> <warmup> <trial>
+
+# Orchestrator stays off the GPU: it only needs GlobalSettings + parse_config,
+# both cuNumeric-free. The worker (single.jl) loads cuNumeric and the kernels.
+
+using Pkg
+
+include("src/core.jl")
+include("src/parse_benchmarks.jl")
+
+const RUNNER = joinpath(@__DIR__, "run_benchmark.sh")
+const WORKER = joinpath(@__DIR__, "src/single.jl")
+const PY_WORKER = joinpath(@__DIR__, "src_py/single.py")
+
+banner(msg) = println("\n", "="^128, "\n", msg, "\n", "="^128)
+
+# `_lifetimes` is a cuNumeric-only code-path variant (@analyze_lifetimes)
+cunumeric_only(name) = endswith(name, "_lifetimes")
+
+# ensure the parent package is develop'd into the active project and deps are instantiated
+function ensure_project_ready()
+    Pkg.develop(; path=joinpath(@__DIR__, ".."))
+    Pkg.instantiate()
+end
+
+# default env name mirrors install_cupynumeric.sh: cupynumeric-bench-<major>.<minor>
+# CUPYNUMERIC_ENV overrides it.
+function cupynumeric_env_name()
+    haskey(ENV, "CUPYNUMERIC_ENV") && return ENV["CUPYNUMERIC_ENV"]
+    for (_, info) in Pkg.dependencies()
+        info.name == "cupynumeric_jll" || continue
+        info.version === nothing && continue
+        return "cupynumeric-bench-$(info.version.major).$(info.version.minor)"
+    end
+    error("could not resolve cupynumeric_jll version; set CUPYNUMERIC_ENV explicitly")
+end
+
+function dispatch(; gpus, cpus, name, T, N, M, n_iter, n_warmup, n_trial,
+    cupynumeric=false, cudajl=false)
+    banner(
+        "$(name): T=$(T) gpus=$(gpus) cpus=$(cpus) N=$(N) M=$(M) " *
+        "n_iter=$(n_iter) n_warmup=$(n_warmup) n_trial=$(n_trial)",
+    )
+
+    # each backend runs in its own worker process
+    args = `--gpus $gpus --cpus $cpus $name $T $N $M $n_iter $n_warmup $n_trial`
+    cmds = [`bash $RUNNER $WORKER $args cunumeric`]
+    # CUDA.jl is single-GPU only
+    if cudajl && gpus == 1 && !cunumeric_only(name)
+        push!(cmds, `bash $RUNNER $WORKER $args cudajl`)
+    end
+    if cupynumeric && !cunumeric_only(name)
+        push!(cmds, `bash $RUNNER $PY_WORKER --pyenv $(cupynumeric_env_name()) $args`)
+    end
+
+    for cmd in cmds
+        try
+            run(cmd)
+        catch e
+            @error "Benchmark '$(name)' failed; continuing." exception = e
+        end
+    end
+end
+
+function run_all_benchmarks(config="benchmarks.toml")
+    gs, specs = parse_config(joinpath(@__DIR__, config))
+    for spec in specs
+        N, M = spec.args
+        dispatch(;
+            gpus=spec.gpus,
+            cpus=spec.cpus,
+            name=spec.name,
+            T=spec.T,
+            N=N, M=M,
+            n_iter=gs.n_iter,
+            n_warmup=gs.n_warmup,
+            n_trial=gs.n_trial,
+            cupynumeric=gs.cupynumeric,
+            cudajl=gs.cuda,
+        )
+    end
+end
+
+ensure_project_ready()
+if isempty(ARGS)
+    run_all_benchmarks()
+else # dispatch on args
+    dispatch(;
+        gpus=parse(Int, ARGS[1]),
+        cpus=parse(Int, ARGS[2]),
+        name=ARGS[3],
+        T=ARGS[4],
+        N=parse(Int, ARGS[5]),
+        M=parse(Int, ARGS[6]),
+        n_iter=parse(Int, ARGS[7]),
+        n_warmup=parse(Int, ARGS[8]),
+        n_trial=parse(Int, ARGS[9]),
+    )
+end