From a8154943afd977787c174b558e218ebd4603529e Mon Sep 17 00:00:00 2001
From: ejmeitz <emeitz@andrew.cmu.edu>
Date: Mon, 1 Jun 2026 14:53:23 -0500
Subject: [PATCH 01/17] start benchmarking harness

---
 benchmark/Project.toml        | 10 ++++++
 benchmark/benchmarks.jl       | 50 +++++++++++++++++++++++++++
 benchmark/benchmarks.toml     | 22 ++++++++++++
 benchmark/parse_benchmarks.jl | 39 ++++++++++++++++++++++
 benchmark/run.jl              | 63 +++++++++++++++++++++++++++++++++++
 5 files changed, 184 insertions(+)
 create mode 100644 benchmark/Project.toml
 create mode 100644 benchmark/benchmarks.jl
 create mode 100644 benchmark/benchmarks.toml
 create mode 100644 benchmark/parse_benchmarks.jl
 create mode 100644 benchmark/run.jl

diff --git a/benchmark/Project.toml b/benchmark/Project.toml
new file mode 100644
index 00000000..a6989391
--- /dev/null
+++ b/benchmark/Project.toml
@@ -0,0 +1,10 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
+cuNumeric = "0fd9ffd4-7e84-4cd0-b8f8-645bd8c73620"
+
+[extras]
+CNPreferences = "3e078157-ea10-49d5-bf32-908f777cd46f"
+LegatePreferences = "8028f36a-2b64-49e9-aa04-2d0933fd2ed9"
diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl
new file mode 100644
index 00000000..dde1a6ce
--- /dev/null
+++ b/benchmark/benchmarks.jl
@@ -0,0 +1,50 @@
+abstract type AbstractBenchmark{T} end
+
+#########################################
+
+Base.@kwdef struct GEMM{T} <: AbstractBenchmark{T}
+    N::Int
+    M::Int
+end
+
+data(g::GEMM{T}) where {T} = "GEMM with T=$(T), N=$(g.N), M=$(g.M)"
+
+function allowed_types(::MonteCarloIntegration)
+    Union{cuNumeric.SUPPORTED_FLOAT_TYPES,cuNumeric.SUPPORTED_INT_TYPES}
+end
+
+total_flops(s::GEMM) = s.N * s.N * ((2*s.M) - 1)
+total_space(s::GEMM{T}) where {T} = 2 * ((s.N*s.M) * sizeof(T)) + ((s.N*s.N) * sizeof(T))
+
+function initialize_cpu(s::GEMM{T}) where {T}
+    A = rand(T, s.N, s.M)
+    B = rand(T, s.M, s.N)
+    C = zeros(T, s.N, s.N)
+    return A, B, C
+end
+
+run!(::GEMM, C, A, B) = mul!(C, A, B)
+
+#########################################
+
+Base.@kwdef struct MonteCarloIntegration{T} <: AbstractBenchmark{T}
+    n_samples::Int
+end
+
+function data(mci::MonteCarloIntegration{T}) where {T}
+    "Monte Carlo Integration with T=$(T), n_samples=$(mci.n_samples)"
+end
+
+allowed_types(::MonteCarloIntegration) = cuNumeric.SUPPORTED_FLOAT_TYPES
+
+total_space(s::MonteCarloIntegration{T}) where {T} = s.n_samples * sizeof(T)
+total_flops(s::MonteCarloIntegration) = missing # cannot estimate FLOPS for squaring or exp easily
+
+function initialize_cpu(s::MonteCarloIntegration{T}) where {T}
+    return T(10) .* rand(T, s.n_samples) .+ T(-5) # random samples in [-5, 5]
+end
+
+_domain_volume(mci::MonteCarloIntegration{T}) where {T} = T(10) / mci.n_samples
+run!(mci::MonteCarloIntegration, x) = _domain_volume(mci) * sum(exp.(-x .^ 2))
+
+#################
diff --git a/benchmark/benchmarks.toml b/benchmark/benchmarks.toml
new file mode 100644
index 00000000..acdeec45
--- /dev/null
+++ b/benchmark/benchmarks.toml
@@ -0,0 +1,22 @@
+[Global]
+N_WARMUP = 2
+N_ITER = 100
+N_GPU = 1
+
+[[GEMM]]
+T = "Float64"
+N = 100
+M = 100
+
+[[GEMM]]
+T = "Float32"
+N = 150
+M = 150
+
+[[MonteCarloIntegration]]
+T = "Float64"
+N_samples = 1_000_000
+
+[[MonteCarloIntegration]]
+T = "Float32"
+N_samples = 10_000_000
diff --git a/benchmark/parse_benchmarks.jl b/benchmark/parse_benchmarks.jl
new file mode 100644
index 00000000..11505f71
--- /dev/null
+++ b/benchmark/parse_benchmarks.jl
@@ -0,0 +1,39 @@
+using TOML
+
+Base.@kwdef struct GlobalSettings
+    N_warmup::Int
+    N_iter::Int
+    N_GPU::Int
+end
+
+function to_symbol_dict(d)
+    return Dict(Symbol(k) => v for (k, v) in d)
+end
+
+function parse_config(path)
+    raw = TOML.parsefile(path)
+
+    global_settings = GlobalSettings(; to_symbol_dict(raw["Global"])...)
+
+    benchmarks = AbstractBenchmark[]
+
+    for (name, entries) in raw
+        name == "Global" && continue
+
+        BenchmarkType = getproperty(Main, Symbol(name))
+
+        for entry in entries
+            T = getproperty(Main, Symbol(entry["T"]))
+
+            params = Dict{Symbol,Any}()
+            for (k, v) in entry
+                k == "T" && continue
+                params[Symbol(k)] = v
+            end
+
+            push!(benchmarks, BenchmarkType{T}(; params...))
+        end
+    end
+
+    return global_settings, benchmarks
+end
diff --git a/benchmark/run.jl b/benchmark/run.jl
new file mode 100644
index 00000000..3f69cfb5
--- /dev/null
+++ b/benchmark/run.jl
@@ -0,0 +1,63 @@
+import Random
+import cuNumeric
+
+include("benchmarks.jl")
+
+function work(b::AbstractBenchmark, N_GPU, arrs_cpu...)
+
+    run!(b, arrs_cunumeric...)
+
+    GC.gc(full = true)
+
+    if N_GPU == 1
+        arrs_cuda = ...
+        run!(b, arrs_cunumeric...)
+    end
+
+    # Reset state in between
+    GC.gc(full = true)
+end
+
+function run_all_benchmarks()
+
+    global_settings, benchmarks = parse_config("benchmarks.toml")
+
+    @show global_settings
+    @show benchmarks
+
+    for b in benchmarks
+        println("================================")
+        println(data(b))
+        println("================================")
+
+        arrs = init(benchmark)
+
+        #TODO FIX
+
+        arrs_cunumeric =
+        run!(b, arrs_cunumeric...)
+
+        # Reset state in between
+        GC.gc(full = true)
+
+        if N_GPU == 1
+            arrs_cuda = ...
+            run!(b, arrs_cunumeric...)
+        end
+
+        # Reset state in between
+        GC.gc(full = true)
+    end
+
+end
+
+
+function run_sgemm_benchmark(N)
+    include("sgemm.jl")
+    name = "SGEMM"
+end
+
+function run_monte_carlo_benchmark(N)
+    include("monte_carlo.jl")
+    name = "Monte_Carlo_Integration"
+end

From 0465d8c952b3bbe3bdb5d9bb5b07a1efceb5454c Mon Sep 17 00:00:00 2001
From: ejmeitz <emeitz@andrew.cmu.edu>
Date: Tue, 2 Jun 2026 13:16:15 -0500
Subject: [PATCH 02/17] clean some things up

---
 benchmark/Project.toml        |  1 +
 benchmark/benchmarks.jl       | 39 +++++++++++++++++--
 benchmark/parse_benchmarks.jl | 14 +++----
 benchmark/run.jl              | 72 +++++++++++++++++++----------------
 4 files changed, 83 insertions(+), 43 deletions(-)

diff --git a/benchmark/Project.toml b/benchmark/Project.toml
index a6989391..e6583a71 100644
--- a/benchmark/Project.toml
+++ b/benchmark/Project.toml
@@ -1,5 +1,6 @@
 [deps]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl
index dde1a6ce..f93293e6 100644
--- a/benchmark/benchmarks.jl
+++ b/benchmark/benchmarks.jl
@@ -1,3 +1,25 @@
+using CSV
+
+"""
+- `n_warmup::Int` : Number of warmup steps. These are not timed. Intended
+    to avoid pre-compilation cost being timed.
+- `n_iter::Int` : Number of iterations to run per trial. Should be large enough
+    to build up queue depth of tasks such that latency is hidden.
+- `n_trial::Int` : Number of independent trials to run. Timing and legate are
+    restarted in between each trial. Sets the number of datapoints used to estimate
+    standard deviations/errors.
+- `n_gpu::Int` : The number of GPUs used by legate. Set through the LEGATE_CONFIG,
+    this value is just bookkeeping.
+"""
+Base.@kwdef struct GlobalSettings
+    n_warmup::Int # Number of warmup steps, where timing is not done.
+    n_iter::Int # Number of iterations to run per trial
+    n_trial::Int # Number of independent trials to run. Benchmark
+    n_gpu::Int
+end
+
+#########################################
+
 abstract type AbstractBenchmark{T} end
 
 #########################################
@@ -9,7 +31,7 @@ end
 
 data(g::GEMM{T}) where {T} = "GEMM with T=$(T), N=$(g.N), M=$(g.M)"
 
-function allowed_types(::MonteCarloIntegration)
+function allowed_types(::Type{GEMM})
     Union{cuNumeric.SUPPORTED_FLOAT_TYPES,cuNumeric.SUPPORTED_INT_TYPES}
 end
 
@@ -35,7 +57,7 @@ function data(mci::MonteCarloIntegration{T}) where {T}
     "Monte Carlo Integration with T=$(T), n_samples=$(mci.n_samples)"
 end
 
-allowed_types(::MonteCarloIntegration) = cuNumeric.SUPPORTED_FLOAT_TYPES
+allowed_types(::Type{MonteCarloIntegration}) = cuNumeric.SUPPORTED_FLOAT_TYPES
 
 total_space(s::MonteCarloIntegration{T}) where {T} = s.n_samples * sizeof(T)
 total_flops(s::MonteCarloIntegration) = missing # cannot estimate FLOPS for squaring or exp easily
@@ -47,4 +69,15 @@ end
 _domain_volume(mci::MonteCarloIntegration{T}) where {T} = T(10) / mci.n_samples
 run!(mci::MonteCarloIntegration, x) = _domain_volume(mci) * sum(exp.(-x .^ 2))
 
-#################
+#########################################
+
+struct BenchmarkResult{T,B<:AbstractBenchmark}
+    times_ms::T
+    gflops::T
+    benchmark::B
+end
+
+function save(br::BenchmarkResult)
+    # Compute standard error and mean time and save to
+    # some kind of file.
+end
diff --git a/benchmark/parse_benchmarks.jl b/benchmark/parse_benchmarks.jl
index 11505f71..f25d4836 100644
--- a/benchmark/parse_benchmarks.jl
+++ b/benchmark/parse_benchmarks.jl
@@ -1,11 +1,5 @@
 using TOML
 
-Base.@kwdef struct GlobalSettings
-    N_warmup::Int
-    N_iter::Int
-    N_GPU::Int
-end
-
 function to_symbol_dict(d)
     return Dict(Symbol(k) => v for (k, v) in d)
 end
@@ -20,9 +14,11 @@ function parse_config(path)
     for (name, entries) in raw
         name == "Global" && continue
 
+        # Convert name parsed as String, to actual type
         BenchmarkType = getproperty(Main, Symbol(name))
 
         for entry in entries
+            # Convert type parsed as String, to actual type
             T = getproperty(Main, Symbol(entry["T"]))
 
             params = Dict{Symbol,Any}()
@@ -31,7 +27,11 @@ function parse_config(path)
                 params[Symbol(k)] = v
             end
 
-            push!(benchmarks, BenchmarkType{T}(; params...))
+            if T <: allowed_types(BenchmarkType)
+                push!(benchmarks, BenchmarkType{T}(; params...))
+            else
+                @warn "$(BenchmarkType) does not support benchmarking with type $(T). Skipping."
+            end
         end
     end
 
diff --git a/benchmark/run.jl b/benchmark/run.jl
index 3f69cfb5..c5efba93 100644
--- a/benchmark/run.jl
+++ b/benchmark/run.jl
@@ -1,63 +1,69 @@
-import Random
-import cuNumeric
+using Random: Random
+using cuNumeric: cuNumeric
 
 include("benchmarks.jl")
 
-function work(b::AbstractBenchmark, N_GPU, arrs_cpu...)
+function benchmark(b::AbstractBenchmark, gs::GlobalSettings, arrs)
+    GC.gc(; full=true)
 
-    run!(b, arrs_cunumeric...)
-
-    GC.gc(full = true)
+    for idx in range(1, gs.n_iter + gs.n_warmup)
+        if idx == gs.n_warmup + 1
+            start_time = get_time_microseconds()
+        end
 
-    if N_GPU == 1
-        arrs_cuda = ...
-        run!(b, arrs_cunumeric...)
+        run!(b, arrays...)
     end
+    total_time_μs = get_time_microseconds() - start_time
+    mean_time_ms = total_time_μs / (gs.n_iter * 1e3)
+    gflops = total_flops(N, M) / (mean_time_ms * 1e6)
+
+    GC.gc(; full=true)
 
-    # Reset state in between
-    GC.gc(full = true)
+    return mean_time_ms, gflops
 end
 
 function run_all_benchmarks()
-
     global_settings, benchmarks = parse_config("benchmarks.toml")
 
     @show global_settings
     @show benchmarks
 
+    cunumeric_results = BenchmarkResult[]
+    cuda_results = BenchmarkResult[]
+
     for b in benchmarks
         println("================================")
         println(data(b))
         println("================================")
 
-        arrs = init(benchmark)
+        cn_times_ms = Vector{Float64}(undef, global_settings.n_trial)
+        cn_gflops = Vector{Union{Missing,Float64}}(undef, global_settings.n_trial)
 
-        #TODO FIX
+        cuda_times_ms = Vector{Float64}(undef, global_settings.n_trial)
+        cuda_gflops = Vector{Union{Missing,Float64}}(undef, global_settings.n_trial)
 
-        arrs_cunumeric =
-        run!(b, arrs_cunumeric...)
+        for i in 1:global_settings.n_trial
+            arrs_julia = initialize_cpu(b)
 
-        # Reset state in between
-        GC.gc(full = true)
+            arrs_cunumeric = # TODO
+                cn_times_ms[i], cn_gflops[i] = benchmark(b, arrs_cunumeric...)
+            push
 
-        if N_GPU == 1
-            arrs_cuda = ...
-            run!(b, arrs_cunumeric...)
+            if gs.n_gpu == 1
+                arrs_cuda = # TODO
+                    cuda_times_ms[i], cuda_gflops[i] = benchmark(b, arrs_cuda...)
+                push!(cuda_results, res_cuda)
+            end
         end
 
-        # Reset state in between
-        GC.gc(full = true)
-    end
-
-end
+        cn_result = BenchmarkResult(cn_times_ms, cn_gflops, b)
+        cuda_result = BenchmarkResult(cuda_times_ms, cuda_gflops, b)
 
+        push!(cunumeric_results, cn_result)
+        push!(cuda_results, cuda_result)
+    end
 
-function run_sgemm_benchmark(N)
-    include("sgemm.jl")
-    name = "SGEMM"
-end
+    # Call the `save` function for the cuda_results
+    # This function is not implemeneted as I was not sure how to do it
 
-function run_monte_carlo_benchmark(N)
-    include("monte_carlo.jl")
-    name = "Monte_Carlo_Integration"
 end

From 12373c7f4747f7b6932bd6a4c0a41aacb709508e Mon Sep 17 00:00:00 2001
From: ejmeitz <emeitz@andrew.cmu.edu>
Date: Tue, 2 Jun 2026 13:25:26 -0500
Subject: [PATCH 03/17] add include

---
 benchmark/run.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmark/run.jl b/benchmark/run.jl
index c5efba93..b090c8ef 100644
--- a/benchmark/run.jl
+++ b/benchmark/run.jl
@@ -2,6 +2,7 @@ using Random: Random
 using cuNumeric: cuNumeric
 
 include("benchmarks.jl")
+include("parse_benchmarks.jl")
 
 function benchmark(b::AbstractBenchmark, gs::GlobalSettings, arrs)
     GC.gc(; full=true)

From 73fbfb223416fe8f4a054d5600c5a5e85c7443ce Mon Sep 17 00:00:00 2001
From: krasow <krasow@u.northwestern.edu>
Date: Tue, 2 Jun 2026 20:45:37 -0500
Subject: [PATCH 04/17] run benchmarks modifications

---
 .gitignore                    |   4 +
 benchmark/benchmarks.jl       | 197 ++++++++++++++++++++++++++++++----
 benchmark/benchmarks.toml     |  28 ++---
 benchmark/parse_benchmarks.jl |  42 +++-----
 benchmark/run.jl              | 106 +++++++++---------
 benchmark/run_benchmark.sh    |   8 +-
 benchmark/sgemm.jl            |  56 ----------
 7 files changed, 266 insertions(+), 175 deletions(-)
 delete mode 100644 benchmark/sgemm.jl

diff --git a/.gitignore b/.gitignore
index 3b09cfb4..c2af1d47 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,10 @@ logging/*
 debug
 debug/*
 
+# benchmark outputs
+benchmark/results
+benchmark/results/*
+
 compile_wrapper.sh
 
 *.tar.gz
diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl
index f93293e6..8a87dd38 100644
--- a/benchmark/benchmarks.jl
+++ b/benchmark/benchmarks.jl
@@ -1,4 +1,5 @@
-using CSV
+using Printf
+using Statistics
 
 """
 - `n_warmup::Int` : Number of warmup steps. These are not timed. Intended
@@ -14,8 +15,8 @@ using CSV
 Base.@kwdef struct GlobalSettings
     n_warmup::Int # Number of warmup steps, where timing is not done.
     n_iter::Int # Number of iterations to run per trial
-    n_trial::Int # Number of independent trials to run. Benchmark
-    n_gpu::Int
+    n_trial::Int = 1 # Number of independent trials to run.
+    n_gpu::Int = 0
 end
 
 #########################################
@@ -29,6 +30,8 @@ Base.@kwdef struct GEMM{T} <: AbstractBenchmark{T}
     M::Int
 end
 
+name(::GEMM) = "sgemm"
+dims(g::GEMM) = (g.N, g.M)
 data(g::GEMM{T}) where {T} = "GEMM with T=$(T), N=$(g.N), M=$(g.M)"
 
 function allowed_types(::Type{GEMM})
@@ -38,11 +41,12 @@ end
 total_flops(s::GEMM) = s.N * s.N * ((2*s.M) - 1)
 total_space(s::GEMM{T}) where {T} = 2 * ((s.N*s.M) * sizeof(T)) + ((s.N*s.N) * sizeof(T))
 
-function initialize_cpu(s::GEMM{T}) where {T}
-    A = rand(T, s.N, s.M)
-    B = rand(T, s.M, s.N)
-    C = zeros(T, s.N, s.N)
-    return A, B, C
+function initialize(s::GEMM{T}) where {T}
+    A = cuNumeric.rand(T, s.N, s.M)
+    B = cuNumeric.rand(T, s.M, s.N)
+    C = cuNumeric.zeros(T, s.N, s.N)
+    GC.gc()
+    return C, A, B
 end
 
 run!(::GEMM, C, A, B) = mul!(C, A, B)
@@ -60,24 +64,179 @@ end
 allowed_types(::Type{MonteCarloIntegration}) = cuNumeric.SUPPORTED_FLOAT_TYPES
 
 total_space(s::MonteCarloIntegration{T}) where {T} = s.n_samples * sizeof(T)
-total_flops(s::MonteCarloIntegration) = missing # cannot estimate FLOPS for squaring or exp easily
-
-function initialize_cpu(s::MonteCarloIntegration{T}) where {T}
-    return T(10) .* rand(T, s.n_samples) .+ T(-5) # random samples in [-5, 5]
-end
+total_flops(s::MonteCarloIntegration) = s.n_samples
 
 _domain_volume(mci::MonteCarloIntegration{T}) where {T} = T(10) / mci.n_samples
 run!(mci::MonteCarloIntegration, x) = _domain_volume(mci) * sum(exp.(-x .^ 2))
 
 #########################################
 
-struct BenchmarkResult{T,B<:AbstractBenchmark}
-    times_ms::T
-    gflops::T
+struct GSParams{T}
+    dx::T
+    dt::T
+    c_u::T
+    c_v::T
+    f::T
+    k::T
+end
+
+function GSParams{T}(; dx=1, c_u=1.0, c_v=0.3, f=0.03, k=0.06) where {T}
+    GSParams{T}(T(dx), T(dx / 5), T(c_u), T(c_v), T(f), T(k))
+end
+
+Base.@kwdef struct GrayScott{T} <: AbstractBenchmark{T}
+    N::Int
+    M::Int
+end
+
+name(::GrayScott) = "grayscott"
+dims(b::GrayScott) = (b.N, b.M)
+data(b::GrayScott{T}) where {T} = "GrayScott with T=$(T), N=$(b.N), M=$(b.M)"
+allowed_types(::Type{GrayScott}) = cuNumeric.SUPPORTED_FLOAT_TYPES
+total_flops(b::GrayScott) = b.N * b.M # grid points per step (proxy, not true FLOPs)
+
+mutable struct GrayScottState{A,P}
+    u::A
+    v::A
+    u_new::A
+    v_new::A
+    params::P
+end
+
+function initialize(b::GrayScott{T}) where {T}
+    d = (b.N, b.M)
+    u = cuNumeric.ones(T, d)
+    v = cuNumeric.zeros(T, d)
+    u_new = cuNumeric.zeros(T, d)
+    v_new = cuNumeric.zeros(T, d)
+
+    seed = min(150, b.N, b.M)
+    u[1:seed, 1:seed] = cuNumeric.rand(T, (seed, seed))
+    v[1:seed, 1:seed] = cuNumeric.rand(T, (seed, seed))
+
+    return (GrayScottState(u, v, u_new, v_new, GSParams{T}()),)
+end
+
+function _gs_step!(u, v, u_new, v_new, args::GSParams)
+    # NDArray^x is not supported yet, so v^2 is written as v .* v below.
+    F_u = (
+        (
+            -u[2:(end - 1), 2:(end - 1)] .*
+            (v[2:(end - 1), 2:(end - 1)] .* v[2:(end - 1), 2:(end - 1)])
+        ) + args.f * (1 .- u[2:(end - 1), 2:(end - 1)])
+    )
+    F_v = (
+        (
+            u[2:(end - 1), 2:(end - 1)] .*
+            (v[2:(end - 1), 2:(end - 1)] .* v[2:(end - 1), 2:(end - 1)])
+        ) - (args.f + args.k) * v[2:(end - 1), 2:(end - 1)]
+    )
+    # 2-D Laplacian via slicing, excluding boundaries
+    u_lap = (
+        (
+            u[3:end, 2:(end - 1)] - 2 * u[2:(end - 1), 2:(end - 1)] +
+            u[1:(end - 2), 2:(end - 1)]
+        ) ./ args.dx^2 +
+        (
+            u[2:(end - 1), 3:end] - 2 * u[2:(end - 1), 2:(end - 1)] +
+            u[2:(end - 1), 1:(end - 2)]
+        ) ./ args.dx^2
+    )
+    v_lap = (
+        (
+            v[3:end, 2:(end - 1)] - 2 * v[2:(end - 1), 2:(end - 1)] +
+            v[1:(end - 2), 2:(end - 1)]
+        ) ./ args.dx^2 +
+        (
+            v[2:(end - 1), 3:end] - 2 * v[2:(end - 1), 2:(end - 1)] +
+            v[2:(end - 1), 1:(end - 2)]
+        ) ./ args.dx^2
+    )
+
+    # Forward-Euler step for all interior points
+    u_new[2:(end - 1), 2:(end - 1)] =
+        ((args.c_u * u_lap) + F_u) * args.dt + u[2:(end - 1), 2:(end - 1)]
+    v_new[2:(end - 1), 2:(end - 1)] =
+        ((args.c_v * v_lap) + F_v) * args.dt + v[2:(end - 1), 2:(end - 1)]
+
+    # Periodic boundary conditions
+    u_new[:, 1] = u[:, end - 1]
+    u_new[:, end] = u[:, 2]
+    u_new[1, :] = u[end - 1, :]
+    u_new[end, :] = u[2, :]
+    v_new[:, 1] = v[:, end - 1]
+    v_new[:, end] = v[:, 2]
+    v_new[1, :] = v[end - 1, :]
+    v_new[end, :] = v[2, :]
+end
+
+function run!(::GrayScott, st::GrayScottState)
+    _gs_step!(st.u, st.v, st.u_new, st.v_new, st.params)
+    # swap references rather than copy
+    st.u, st.u_new = st.u_new, st.u
+    st.v, st.v_new = st.v_new, st.v
+    return nothing
+end
+
+#########################################
+
+# Maps the benchmarks.toml table name to its benchmark type. Add new benchmarks here.
+const BENCHMARKS = Dict{String,Type}(
+    "sgemm" => GEMM,
+    "grayscott" => GrayScott,
+)
+
+# Per-trial timings for one benchmark. `times_ms[i]`/`gflops[i]` are the mean
+# over `n_iter` iterations for trial `i`; the spread across trials gives stddev.
+struct BenchmarkResult{B<:AbstractBenchmark}
+    times_ms::Vector{Float64}
+    gflops::Vector{Float64}
     benchmark::B
 end
 
-function save(br::BenchmarkResult)
-    # Compute standard error and mean time and save to
-    # some kind of file.
+# One timed trial: warmup, then time `n_iter` iterations of `run!`.
+function _trial(b::AbstractBenchmark, gs::GlobalSettings)
+    GC.gc(true)
+    state = initialize(b)
+
+    start_time = zero(get_time_microseconds())
+    for idx in 1:(gs.n_warmup + gs.n_iter)
+        if idx == gs.n_warmup + 1
+            start_time = get_time_microseconds()
+        end
+        run!(b, state...)
+    end
+    total_time_μs = get_time_microseconds() - start_time
+
+    mean_time_ms = total_time_μs / (gs.n_iter * 1e3)
+    gflops = total_flops(b) / (mean_time_ms * 1e6)
+    return mean_time_ms, gflops
+end
+
+# Run `n_trial` independent trials and collect their per-trial measurements.
+function run_benchmark(b::AbstractBenchmark, gs::GlobalSettings)
+    times_ms = Float64[]
+    gflops = Float64[]
+    for _ in 1:gs.n_trial
+        t, g = _trial(b, gs)
+        push!(times_ms, t)
+        push!(gflops, g)
+    end
+    return BenchmarkResult(times_ms, gflops, b)
+end
+
+_std(x) = length(x) > 1 ? std(x) : 0.0
+
+function save_result(br::BenchmarkResult, gpus)
+    N, M = dims(br.benchmark)
+    path = joinpath(@__DIR__, "results", "$(name(br.benchmark)).csv")
+    mkpath(dirname(path))
+    open(path, "a") do io
+        for trial in eachindex(br.times_ms)
+            @printf(
+                io, "%s,%d,%d,%d,%d,%.6f,%.6f\n",
+                "cunumeric", gpus, N, M, trial, br.times_ms[trial], br.gflops[trial],
+            )
+        end
+    end
 end
diff --git a/benchmark/benchmarks.toml b/benchmark/benchmarks.toml
index acdeec45..472e13bc 100644
--- a/benchmark/benchmarks.toml
+++ b/benchmark/benchmarks.toml
@@ -1,22 +1,16 @@
 [Global]
-N_WARMUP = 2
-N_ITER = 100
-N_GPU = 1
+n_warmup = 5
+n_iter = 1000
+n_trial = 5
 
-[[GEMM]]
-T = "Float64"
-N = 100
-M = 100
+[[grayscott]]
+gpus = 1
+cpus = 2
+N = 1000
+M = 1000
 
-[[GEMM]]
-T = "Float32"
+[[sgemm]]
+gpus = 1
+cpus = 2
 N = 150
 M = 150
-
-[[MonteCarloIntegration]]
-T = "Float64"
-N_samples = 1_000_000
-
-[[MonteCarloIntegration]]
-T = "Float32"
-N_samples = 10_000_000
diff --git a/benchmark/parse_benchmarks.jl b/benchmark/parse_benchmarks.jl
index f25d4836..b11ef983 100644
--- a/benchmark/parse_benchmarks.jl
+++ b/benchmark/parse_benchmarks.jl
@@ -1,39 +1,31 @@
 using TOML
 
-function to_symbol_dict(d)
-    return Dict(Symbol(k) => v for (k, v) in d)
+"""
+One benchmark invocation parsed from `benchmarks.toml`. `name` selects the
+benchmark type from `BENCHMARKS`; `args` are the sizes (currently `N M`).
+"""
+struct BenchmarkSpec
+    name::String
+    gpus::Int
+    cpus::Int
+    args::Vector{Int}
 end
 
 function parse_config(path)
     raw = TOML.parsefile(path)
 
-    global_settings = GlobalSettings(; to_symbol_dict(raw["Global"])...)
-
-    benchmarks = AbstractBenchmark[]
+    g = raw["Global"]
+    global_settings = GlobalSettings(;
+        n_warmup=g["n_warmup"], n_iter=g["n_iter"], n_trial=get(g, "n_trial", 1)
+    )
 
+    specs = BenchmarkSpec[]
     for (name, entries) in raw
         name == "Global" && continue
-
-        # Convert name parsed as String, to actual type
-        BenchmarkType = getproperty(Main, Symbol(name))
-
-        for entry in entries
-            # Convert type parsed as String, to actual type
-            T = getproperty(Main, Symbol(entry["T"]))
-
-            params = Dict{Symbol,Any}()
-            for (k, v) in entry
-                k == "T" && continue
-                params[Symbol(k)] = v
-            end
-
-            if T <: allowed_types(BenchmarkType)
-                push!(benchmarks, BenchmarkType{T}(; params...))
-            else
-                @warn "$(BenchmarkType) does not support benchmarking with type $(T). Skipping."
-            end
+        for e in entries
+            push!(specs, BenchmarkSpec(name, e["gpus"], e["cpus"], [e["N"], e["M"]]))
         end
     end
 
-    return global_settings, benchmarks
+    return global_settings, specs
 end
diff --git a/benchmark/run.jl b/benchmark/run.jl
index b090c8ef..58990e14 100644
--- a/benchmark/run.jl
+++ b/benchmark/run.jl
@@ -1,70 +1,66 @@
-using Random: Random
-using cuNumeric: cuNumeric
+# run.jl: orchestrator, one child per benchmarks.toml entry. With args
+# (<gpus> <name> <N> <M> <iter> <warmup> <trial>) it runs one benchmark, e.g.
+# `julia run.jl 1 grayscott 1000 1000 100 5 5`
+# Separate child per benchmark since LEGATE_CONFIG must be set before julia starts.
 
 include("benchmarks.jl")
 include("parse_benchmarks.jl")
 
-function benchmark(b::AbstractBenchmark, gs::GlobalSettings, arrs)
-    GC.gc(; full=true)
+function run_all_benchmarks(config="benchmarks.toml")
+    gs, specs = parse_config(joinpath(@__DIR__, config))
 
-    for idx in range(1, gs.n_iter + gs.n_warmup)
-        if idx == gs.n_warmup + 1
-            start_time = get_time_microseconds()
-        end
-
-        run!(b, arrays...)
-    end
-    total_time_μs = get_time_microseconds() - start_time
-    mean_time_ms = total_time_μs / (gs.n_iter * 1e3)
-    gflops = total_flops(N, M) / (mean_time_ms * 1e6)
+    runner = joinpath(@__DIR__, "run_benchmark.sh")
+    self = @__FILE__
 
-    GC.gc(; full=true)
-
-    return mean_time_ms, gflops
-end
-
-function run_all_benchmarks()
-    global_settings, benchmarks = parse_config("benchmarks.toml")
-
-    @show global_settings
-    @show benchmarks
-
-    cunumeric_results = BenchmarkResult[]
-    cuda_results = BenchmarkResult[]
+    for spec in specs
+        if !haskey(BENCHMARKS, spec.name)
+            @warn "No benchmark registered for '$(spec.name)'; skipping."
+            continue
+        end
 
-    for b in benchmarks
+        N, M = spec.args
+        println("\n================================")
+        println(
+            "$(spec.name): gpus=$(spec.gpus) cpus=$(spec.cpus) N=$(N) M=$(M) " *
+            "n_iter=$(gs.n_iter) n_warmup=$(gs.n_warmup) n_trial=$(gs.n_trial)",
+        )
         println("================================")
-        println(data(b))
-        println("================================")
-
-        cn_times_ms = Vector{Float64}(undef, global_settings.n_trial)
-        cn_gflops = Vector{Union{Missing,Float64}}(undef, global_settings.n_trial)
-
-        cuda_times_ms = Vector{Float64}(undef, global_settings.n_trial)
-        cuda_gflops = Vector{Union{Missing,Float64}}(undef, global_settings.n_trial)
-
-        for i in 1:global_settings.n_trial
-            arrs_julia = initialize_cpu(b)
-
-            arrs_cunumeric = # TODO
-                cn_times_ms[i], cn_gflops[i] = benchmark(b, arrs_cunumeric...)
-            push
 
-            if gs.n_gpu == 1
-                arrs_cuda = # TODO
-                    cuda_times_ms[i], cuda_gflops[i] = benchmark(b, arrs_cuda...)
-                push!(cuda_results, res_cuda)
-            end
+        cmd = `bash $runner $self --gpus $(spec.gpus) --cpus $(spec.cpus) $(spec.name) $N $M $(gs.n_iter) $(gs.n_warmup) $(gs.n_trial)`
+        try
+            run(cmd)
+        catch e
+            @error "Benchmark '$(spec.name)' failed; continuing." exception = e
         end
+    end
+end
 
-        cn_result = BenchmarkResult(cn_times_ms, cn_gflops, b)
-        cuda_result = BenchmarkResult(cuda_times_ms, cuda_gflops, b)
+function run_single(gpus, name, N, M, n_iter, n_warmup, n_trial)
+    b = BENCHMARKS[name]{Float32}(; N=N, M=M)
+    gs = GlobalSettings(; n_warmup=n_warmup, n_iter=n_iter, n_trial=n_trial)
 
-        push!(cunumeric_results, cn_result)
-        push!(cuda_results, cuda_result)
-    end
+    println(
+        "[cuNumeric] $(name) benchmark on $(N)x$(M) for $(n_iter) iterations " *
+        "($(n_warmup) warmup) x $(n_trial) trials",
+    )
+    br = run_benchmark(b, gs)
+    @printf("[cuNumeric] Mean Run Time: %.5f ± %.5f ms\n", mean(br.times_ms), _std(br.times_ms))
+    @printf("[cuNumeric] FLOPS: %.5f ± %.5f GFLOPS\n", mean(br.gflops), _std(br.gflops))
 
-    # Call the `save` function for the cuda_results
-    # This function is not implemeneted as I was not sure how to do it
+    save_result(br, gpus)
+end
 
+if isempty(ARGS)
+    run_all_benchmarks()
+else
+    using cuNumeric
+    using LinearAlgebra
+    gpus = parse(Int, ARGS[1])
+    bench_name = ARGS[2]
+    N = parse(Int, ARGS[3])
+    M = parse(Int, ARGS[4])
+    n_iter = parse(Int, ARGS[5])
+    n_warmup = parse(Int, ARGS[6])
+    n_trial = parse(Int, ARGS[7])
+    run_single(gpus, bench_name, N, M, n_iter, n_warmup, n_trial)
 end
diff --git a/benchmark/run_benchmark.sh b/benchmark/run_benchmark.sh
index 07a97a05..ef54dfa5 100755
--- a/benchmark/run_benchmark.sh
+++ b/benchmark/run_benchmark.sh
@@ -43,14 +43,16 @@ if [[ $GPUS -lt 0 ]]; then
 fi
 
 if [[ $CPUS -lt 0 ]]; then
-    echo "CPUs ivnalid, using cpus = 1"
+    echo "CPUs invalid, using cpus = 1"
     exit
 fi
 
-export LEGATE_AUTO_CONFIG=0
-export LEGATE_CONFIG="--cpus=1 --gpus=$GPUS --omps=$CPUS --ompthreads=3 --utility=2 --sysmem=256 --numamem=19029 --fbmem=7569 --zcmem=128 --regmem=0"
+export LEGATE_AUTO_CONFIG=1
+export LEGATE_CONFIG="--cpus=$CPUS --gpus=$GPUS"
 export LEGATE_SHOW_CONFIG=1
 
+export LD_LIBRARY_PATH=""
+
 echo "Running $FILENAME with $CPUS CPUs and $GPUS GPUs"
 
 CMD="julia --project='..' $FILENAME $GPUS ${EXTRA_ARGS[@]}"
diff --git a/benchmark/sgemm.jl b/benchmark/sgemm.jl
deleted file mode 100644
index 28d9ad7c..00000000
--- a/benchmark/sgemm.jl
+++ /dev/null
@@ -1,56 +0,0 @@
-using cuNumeric
-using LinearAlgebra
-using Printf
-
-function initialize_cunumeric(N, M)
-    A = cuNumeric.as_type(cuNumeric.rand(NDArray, N, M), Float32)
-    B = cuNumeric.as_type(cuNumeric.rand(NDArray, M, N), Float32)
-    C = cuNumeric.zeros(Float32, N, N)
-    GC.gc() # remove the intermediate FP64 arrays
-    return A, B, C
-end
-
-function total_flops(N, M)
-    return N * N * ((2*M) - 1)
-end
-
-function total_space(N, M)
-    return 2 * (N*M) * sizeof(Float32) + (N*N) * sizeof(Float32)
-end
-
-function gemm_cunumeric(N, M, n_samples, n_warmup)
-    A, B, C = initialize_cunumeric(N, M)
-
-    start_time = nothing
-    for idx in range(1, n_samples + n_warmup)
-        if idx == n_warmup + 1
-            start_time = get_time_microseconds()
-        end
-
-        mul!(C, A, B)
-    end
-    total_time_μs = get_time_microseconds() - start_time
-    mean_time_ms = total_time_μs / (n_samples * 1e3)
-    gflops = total_flops(N, M) / (mean_time_ms * 1e6) # GFLOP is 1e9
-
-    return mean_time_ms, gflops
-end
-
-gpus = parse(Int, ARGS[1])
-N = parse(Int, ARGS[2])
-M = parse(Int, ARGS[3])
-n_samples = parse(Int, ARGS[4])
-n_warmup = parse(Int, ARGS[5])
-
-println(
-    "[cuNumeric]  MATMUL benchmark on $(N)x$(M) matricies for $(n_samples) iterations, $(n_warmup) warmups"
-)
-
-mean_time_ms, gflops = gemm_cunumeric(N, M, n_samples, n_warmup)
-
-println("[cuNumeric]  Mean Run Time: $(mean_time_ms) ms")
-println("[cuNumeric]  FLOPS: $(gflops) GFLOPS")
-
-open("./gemm.csv", "a") do io
-    @printf(io, "%s,%d,%d,%d,%.6f,%.6f\n", "cunumeric", gpus, N, M, mean_time_ms, gflops)
-end

From cbd9a3a3b449fa7f6f1a5091258d83af601e8217 Mon Sep 17 00:00:00 2001
From: krasow <krasow@u.northwestern.edu>
Date: Tue, 2 Jun 2026 21:10:41 -0500
Subject: [PATCH 05/17] bring back T config

---
 benchmark/benchmarks.toml     |  9 +++++++++
 benchmark/parse_benchmarks.jl |  9 +++++++--
 benchmark/run.jl              | 31 ++++++++++++++++++-------------
 3 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/benchmark/benchmarks.toml b/benchmark/benchmarks.toml
index 472e13bc..7d70c807 100644
--- a/benchmark/benchmarks.toml
+++ b/benchmark/benchmarks.toml
@@ -4,12 +4,21 @@ n_iter = 1000
 n_trial = 5
 
 [[grayscott]]
+T = "Float64"
 gpus = 1
 cpus = 2
 N = 1000
 M = 1000
 
 [[sgemm]]
+T = "Float32"
+gpus = 1
+cpus = 2
+N = 150
+M = 150
+
+[[sgemm]]
+T = "Float64"
 gpus = 1
 cpus = 2
 N = 150
diff --git a/benchmark/parse_benchmarks.jl b/benchmark/parse_benchmarks.jl
index b11ef983..cf79cf20 100644
--- a/benchmark/parse_benchmarks.jl
+++ b/benchmark/parse_benchmarks.jl
@@ -2,10 +2,12 @@ using TOML
 
 """
 One benchmark invocation parsed from `benchmarks.toml`. `name` selects the
-benchmark type from `BENCHMARKS`; `args` are the sizes (currently `N M`).
+benchmark type from `BENCHMARKS`; `T` is the element type (e.g. "Float32");
+`args` are the sizes (currently `N M`).
 """
 struct BenchmarkSpec
     name::String
+    T::String
     gpus::Int
     cpus::Int
     args::Vector{Int}
@@ -23,7 +25,10 @@ function parse_config(path)
     for (name, entries) in raw
         name == "Global" && continue
         for e in entries
-            push!(specs, BenchmarkSpec(name, e["gpus"], e["cpus"], [e["N"], e["M"]]))
+            push!(
+                specs,
+                BenchmarkSpec(name, get(e, "T", "Float32"), e["gpus"], e["cpus"], [e["N"], e["M"]]),
+            )
         end
     end
 
diff --git a/benchmark/run.jl b/benchmark/run.jl
index 58990e14..72d3036c 100644
--- a/benchmark/run.jl
+++ b/benchmark/run.jl
@@ -1,6 +1,6 @@
 # run.jl: orchestrator, one child per benchmarks.toml entry. With args
-# (<gpus> <name> <N> <M> <iter> <warmup> <trial>) it runs one benchmark, e.g.
-# `julia run.jl 1 grayscott 1000 1000 100 5 5`
+# (<gpus> <name> <T> <N> <M> <iter> <warmup> <trial>) it runs one benchmark, e.g.
+# `julia run.jl 1 grayscott Float32 1000 1000 100 5 5`
 # Separate child per benchmark since LEGATE_CONFIG must be set before julia starts.
 
 include("benchmarks.jl")
@@ -21,12 +21,12 @@ function run_all_benchmarks(config="benchmarks.toml")
         N, M = spec.args
         println("\n================================")
         println(
-            "$(spec.name): gpus=$(spec.gpus) cpus=$(spec.cpus) N=$(N) M=$(M) " *
+            "$(spec.name): T=$(spec.T) gpus=$(spec.gpus) cpus=$(spec.cpus) N=$(N) M=$(M) " *
             "n_iter=$(gs.n_iter) n_warmup=$(gs.n_warmup) n_trial=$(gs.n_trial)",
         )
         println("================================")
 
-        cmd = `bash $runner $self --gpus $(spec.gpus) --cpus $(spec.cpus) $(spec.name) $N $M $(gs.n_iter) $(gs.n_warmup) $(gs.n_trial)`
+        cmd = `bash $runner $self --gpus $(spec.gpus) --cpus $(spec.cpus) $(spec.name) $(spec.T) $N $M $(gs.n_iter) $(gs.n_warmup) $(gs.n_trial)`
         try
             run(cmd)
         catch e
@@ -35,12 +35,16 @@ function run_all_benchmarks(config="benchmarks.toml")
     end
 end
 
-function run_single(gpus, name, N, M, n_iter, n_warmup, n_trial)
-    b = BENCHMARKS[name]{Float32}(; N=N, M=M)
+# Resolve a TOML type string like "Float32" to the actual Julia type.
+parse_type(s) = getfield(Base, Symbol(s))::DataType
+
+function run_single(gpus, name, T_str, N, M, n_iter, n_warmup, n_trial)
+    T = parse_type(T_str)
+    b = BENCHMARKS[name]{T}(; N=N, M=M)
     gs = GlobalSettings(; n_warmup=n_warmup, n_iter=n_iter, n_trial=n_trial)
 
     println(
-        "[cuNumeric] $(name) benchmark on $(N)x$(M) for $(n_iter) iterations " *
+        "[cuNumeric] $(name) benchmark ($(T)) on $(N)x$(M) for $(n_iter) iterations " *
         "($(n_warmup) warmup) x $(n_trial) trials",
     )
     br = run_benchmark(b, gs)
@@ -57,10 +61,11 @@ else
     using LinearAlgebra
     gpus = parse(Int, ARGS[1])
     bench_name = ARGS[2]
-    N = parse(Int, ARGS[3])
-    M = parse(Int, ARGS[4])
-    n_iter = parse(Int, ARGS[5])
-    n_warmup = parse(Int, ARGS[6])
-    n_trial = parse(Int, ARGS[7])
-    run_single(gpus, bench_name, N, M, n_iter, n_warmup, n_trial)
+    T_str = ARGS[3]
+    N = parse(Int, ARGS[4])
+    M = parse(Int, ARGS[5])
+    n_iter = parse(Int, ARGS[6])
+    n_warmup = parse(Int, ARGS[7])
+    n_trial = parse(Int, ARGS[8])
+    run_single(gpus, bench_name, T_str, N, M, n_iter, n_warmup, n_trial)
 end

From fcfd749331dd29b97326329b9c9caf7ec7763888 Mon Sep 17 00:00:00 2001
From: krasow <krasow@u.northwestern.edu>
Date: Tue, 2 Jun 2026 22:16:38 -0500
Subject: [PATCH 06/17] modularize some stuff.  run.jl (does all) ->
 run_benchmark.sh (shell script to set legate config and call worker) ->
 benchmark/single.jl (worker to call benchmark)

---
 benchmark/benchmarks.toml               | 12 ++++
 benchmark/run.jl                        | 96 ++++++++++---------------
 benchmark/{ => src}/benchmarks.jl       | 19 ++++-
 benchmark/{ => src}/parse_benchmarks.jl |  5 +-
 benchmark/src/single.jl                 | 37 ++++++++++
 5 files changed, 110 insertions(+), 59 deletions(-)
 rename benchmark/{ => src}/benchmarks.jl (90%)
 rename benchmark/{ => src}/parse_benchmarks.jl (81%)
 create mode 100644 benchmark/src/single.jl

diff --git a/benchmark/benchmarks.toml b/benchmark/benchmarks.toml
index 7d70c807..deb58d53 100644
--- a/benchmark/benchmarks.toml
+++ b/benchmark/benchmarks.toml
@@ -23,3 +23,15 @@ gpus = 1
 cpus = 2
 N = 150
 M = 150
+
+[[montecarlo]]
+T = "Float64"
+gpus = 1
+cpus = 2
+N = 1_000_000
+
+[[montecarlo]]
+T = "Float32"
+gpus = 1
+cpus = 2
+N = 1_000_000
diff --git a/benchmark/run.jl b/benchmark/run.jl
index 72d3036c..0d22ad0d 100644
--- a/benchmark/run.jl
+++ b/benchmark/run.jl
@@ -1,71 +1,53 @@
-# run.jl: orchestrator, one child per benchmarks.toml entry. With args
-# (<gpus> <name> <T> <N> <M> <iter> <warmup> <trial>) it runs one benchmark, e.g.
-# `julia run.jl 1 grayscott Float32 1000 1000 100 5 5`
-# Separate child per benchmark since LEGATE_CONFIG must be set before julia starts.
+# run.jl: orchestrator. Builds one run_benchmark.sh command per benchmark and
+# dispatches it; the script sets LEGATE_CONFIG (from --gpus/--cpus) before
+# launching the worker (single.jl) that actually runs the benchmark.
+#   no args   -> one command per benchmarks.toml entry
+#   with args -> one command from <gpus> <cpus> <name> <T> <N> <M> <iter> <warmup> <trial>
 
-include("benchmarks.jl")
-include("parse_benchmarks.jl")
+include("src/benchmarks.jl")
+include("src/parse_benchmarks.jl")
 
-function run_all_benchmarks(config="benchmarks.toml")
-    gs, specs = parse_config(joinpath(@__DIR__, config))
+const RUNNER = joinpath(@__DIR__, "run_benchmark.sh")
+const WORKER = joinpath(@__DIR__, "src/single.jl")
 
-    runner = joinpath(@__DIR__, "run_benchmark.sh")
-    self = @__FILE__
+banner(msg) = println("\n", "="^128, "\n", msg, "\n", "="^128)
 
-    for spec in specs
-        if !haskey(BENCHMARKS, spec.name)
-            @warn "No benchmark registered for '$(spec.name)'; skipping."
-            continue
-        end
+function dispatch(; gpus, cpus, name, T, N, M, n_iter, n_warmup, n_trial)
+    if !haskey(BENCHMARKS, name)
+        @warn "No benchmark registered for '$(name)'; skipping."
+        return nothing
+    end
 
-        N, M = spec.args
-        println("\n================================")
-        println(
-            "$(spec.name): T=$(spec.T) gpus=$(spec.gpus) cpus=$(spec.cpus) N=$(N) M=$(M) " *
-            "n_iter=$(gs.n_iter) n_warmup=$(gs.n_warmup) n_trial=$(gs.n_trial)",
-        )
-        println("================================")
+    banner(
+        "$(name): T=$(T) gpus=$(gpus) cpus=$(cpus) N=$(N) M=$(M) " *
+        "n_iter=$(n_iter) n_warmup=$(n_warmup) n_trial=$(n_trial)",
+    )
 
-        cmd = `bash $runner $self --gpus $(spec.gpus) --cpus $(spec.cpus) $(spec.name) $(spec.T) $N $M $(gs.n_iter) $(gs.n_warmup) $(gs.n_trial)`
-        try
-            run(cmd)
-        catch e
-            @error "Benchmark '$(spec.name)' failed; continuing." exception = e
-        end
+    cmd = `bash $RUNNER $WORKER --gpus $gpus --cpus $cpus $name $T $N $M $n_iter $n_warmup $n_trial`
+    try
+        run(cmd)
+    catch e
+        @error "Benchmark '$(name)' failed; continuing." exception = e
     end
 end
 
-# Resolve a TOML type string like "Float32" to the actual Julia type.
-parse_type(s) = getfield(Base, Symbol(s))::DataType
-
-function run_single(gpus, name, T_str, N, M, n_iter, n_warmup, n_trial)
-    T = parse_type(T_str)
-    b = BENCHMARKS[name]{T}(; N=N, M=M)
-    gs = GlobalSettings(; n_warmup=n_warmup, n_iter=n_iter, n_trial=n_trial)
-
-    println(
-        "[cuNumeric] $(name) benchmark ($(T)) on $(N)x$(M) for $(n_iter) iterations " *
-        "($(n_warmup) warmup) x $(n_trial) trials",
-    )
-    br = run_benchmark(b, gs)
-    @printf("[cuNumeric] Mean Run Time: %.5f ± %.5f ms\n", mean(br.times_ms), _std(br.times_ms))
-    @printf("[cuNumeric] FLOPS: %.5f ± %.5f GFLOPS\n", mean(br.gflops), _std(br.gflops))
-
-    save_result(br, gpus)
+function run_all_benchmarks(config="benchmarks.toml")
+    gs, specs = parse_config(joinpath(@__DIR__, config))
+    for spec in specs
+        N, M = spec.args
+        dispatch(;
+            gpus=spec.gpus, cpus=spec.cpus, name=spec.name, T=spec.T, N=N, M=M,
+            n_iter=gs.n_iter, n_warmup=gs.n_warmup, n_trial=gs.n_trial,
+        )
+    end
 end
 
 if isempty(ARGS)
     run_all_benchmarks()
-else
-    using cuNumeric
-    using LinearAlgebra
-    gpus = parse(Int, ARGS[1])
-    bench_name = ARGS[2]
-    T_str = ARGS[3]
-    N = parse(Int, ARGS[4])
-    M = parse(Int, ARGS[5])
-    n_iter = parse(Int, ARGS[6])
-    n_warmup = parse(Int, ARGS[7])
-    n_trial = parse(Int, ARGS[8])
-    run_single(gpus, bench_name, T_str, N, M, n_iter, n_warmup, n_trial)
+else # dispatch on args
+    dispatch(;
+        gpus=parse(Int, ARGS[1]), cpus=parse(Int, ARGS[2]), name=ARGS[3], T=ARGS[4],
+        N=parse(Int, ARGS[5]), M=parse(Int, ARGS[6]),
+        n_iter=parse(Int, ARGS[7]), n_warmup=parse(Int, ARGS[8]), n_trial=parse(Int, ARGS[9]),
+    )
 end
diff --git a/benchmark/benchmarks.jl b/benchmark/src/benchmarks.jl
similarity index 90%
rename from benchmark/benchmarks.jl
rename to benchmark/src/benchmarks.jl
index 8a87dd38..2cd17cdf 100644
--- a/benchmark/benchmarks.jl
+++ b/benchmark/src/benchmarks.jl
@@ -57,6 +57,8 @@ Base.@kwdef struct MonteCarloIntegration{T} <: AbstractBenchmark{T}
     n_samples::Int
 end
 
+name(::MonteCarloIntegration) = "montecarlo"
+dims(mci::MonteCarloIntegration) = (mci.n_samples, 1)
 function data(mci::MonteCarloIntegration{T}) where {T}
     "Monte Carlo Integration with T=$(T), n_samples=$(mci.n_samples)"
 end
@@ -66,6 +68,13 @@ allowed_types(::Type{MonteCarloIntegration}) = cuNumeric.SUPPORTED_FLOAT_TYPES
 total_space(s::MonteCarloIntegration{T}) where {T} = s.n_samples * sizeof(T)
 total_flops(s::MonteCarloIntegration) = s.n_samples
 
+function initialize(mci::MonteCarloIntegration{T}) where {T}
+    # Uniform samples over the integration domain [0, 10].
+    x = T(10) .* cuNumeric.rand(T, mci.n_samples)
+    GC.gc()
+    return (x,)
+end
+
 _domain_volume(mci::MonteCarloIntegration{T}) where {T} = T(10) / mci.n_samples
 run!(mci::MonteCarloIntegration, x) = _domain_volume(mci) * sum(exp.(-x .^ 2))
 
@@ -184,8 +193,16 @@ end
 const BENCHMARKS = Dict{String,Type}(
     "sgemm" => GEMM,
     "grayscott" => GrayScott,
+    "montecarlo" => MonteCarloIntegration,
 )
 
+# Construct a benchmark from the orchestrator's positional sizes. Most benchmarks
+# use (N, M); MonteCarloIntegration uses N as its sample count and ignores M.
+build_benchmark(::Type{B}, ::Type{T}, N, M) where {B<:AbstractBenchmark,T} = B{T}(; N=N, M=M)
+function build_benchmark(::Type{MonteCarloIntegration}, ::Type{T}, N, M) where {T}
+    MonteCarloIntegration{T}(; n_samples=N)
+end
+
 # Per-trial timings for one benchmark. `times_ms[i]`/`gflops[i]` are the mean
 # over `n_iter` iterations for trial `i`; the spread across trials gives stddev.
 struct BenchmarkResult{B<:AbstractBenchmark}
@@ -229,7 +246,7 @@ _std(x) = length(x) > 1 ? std(x) : 0.0
 
 function save_result(br::BenchmarkResult, gpus)
     N, M = dims(br.benchmark)
-    path = joinpath(@__DIR__, "results", "$(name(br.benchmark)).csv")
+    path = joinpath(@__DIR__, "..", "results", "$(name(br.benchmark)).csv")
     mkpath(dirname(path))
     open(path, "a") do io
         for trial in eachindex(br.times_ms)
diff --git a/benchmark/parse_benchmarks.jl b/benchmark/src/parse_benchmarks.jl
similarity index 81%
rename from benchmark/parse_benchmarks.jl
rename to benchmark/src/parse_benchmarks.jl
index cf79cf20..8fecedb3 100644
--- a/benchmark/parse_benchmarks.jl
+++ b/benchmark/src/parse_benchmarks.jl
@@ -27,7 +27,10 @@ function parse_config(path)
         for e in entries
             push!(
                 specs,
-                BenchmarkSpec(name, get(e, "T", "Float32"), e["gpus"], e["cpus"], [e["N"], e["M"]]),
+                BenchmarkSpec(
+                    name, get(e, "T", "Float32"), e["gpus"], e["cpus"],
+                    [e["N"], get(e, "M", 1)],
+                ),
             )
         end
     end
diff --git a/benchmark/src/single.jl b/benchmark/src/single.jl
new file mode 100644
index 00000000..e86419bf
--- /dev/null
+++ b/benchmark/src/single.jl
@@ -0,0 +1,37 @@
+# single.jl: worker that runs exactly one benchmark. Launched by run_benchmark.sh
+# (dispatched from run.jl), which sets LEGATE_CONFIG in the env before julia starts.
+# Args: <gpus> <name> <T> <N> <M> <n_iter> <n_warmup> <n_trial>
+
+using cuNumeric
+using LinearAlgebra
+
+include("benchmarks.jl")
+
+# Resolve a TOML type string like "Float32" to the actual Julia type.
+parse_type(s) = getfield(Base, Symbol(s))::DataType
+
+function run_single(gpus, name, T_str, N, M, n_iter, n_warmup, n_trial)
+    T = parse_type(T_str)
+    b = build_benchmark(BENCHMARKS[name], T, N, M)
+    gs = GlobalSettings(; n_warmup=n_warmup, n_iter=n_iter, n_trial=n_trial)
+
+    println(
+        "[cuNumeric] $(name) benchmark ($(T)) on $(N)x$(M) for $(n_iter) iterations " *
+        "($(n_warmup) warmup) x $(n_trial) trials",
+    )
+    br = run_benchmark(b, gs)
+    @printf("[cuNumeric] Mean Run Time: %.5f ± %.5f ms\n", mean(br.times_ms), _std(br.times_ms))
+    @printf("[cuNumeric] FLOPS: %.5f ± %.5f GFLOPS\n", mean(br.gflops), _std(br.gflops))
+
+    save_result(br, gpus)
+end
+
+gpus = parse(Int, ARGS[1])
+bench_name = ARGS[2]
+T_str = ARGS[3]
+N = parse(Int, ARGS[4])
+M = parse(Int, ARGS[5])
+n_iter = parse(Int, ARGS[6])
+n_warmup = parse(Int, ARGS[7])
+n_trial = parse(Int, ARGS[8])
+run_single(gpus, bench_name, T_str, N, M, n_iter, n_warmup, n_trial)

From 87a9f908ad25c51cf201c54213fc8093bda40c26 Mon Sep 17 00:00:00 2001
From: krasow <krasow@u.northwestern.edu>
Date: Tue, 2 Jun 2026 22:55:06 -0500
Subject: [PATCH 07/17] support multi list zipping configs

---
 benchmark/README.md               | 53 +++++++++++++++++++++++++++++++
 benchmark/benchmarks.toml         | 38 ++++++++--------------
 benchmark/src/parse_benchmarks.jl | 42 ++++++++++++++++++++----
 3 files changed, 102 insertions(+), 31 deletions(-)
 create mode 100644 benchmark/README.md

diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 00000000..658ce3ca
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,53 @@
+# Benchmark configuration
+
+Benchmarks are declared in `benchmarks.toml`. `run.jl` parses it.
+
+## Layout
+
+```toml
+[Global]
+n_warmup = 5
+n_iter   = 1000
+n_trial  = 5
+
+[[sgemm]]            # name registered in src/benchmarks.jl
+T    = "Float32"     # element type; optional, defaults to "Float32"
+gpus = 1
+cpus = 2
+N    = 150
+M    = 150           # optional, defaults to 1
+```
+
+Repeat a `[[name]]` block to add independent configs.
+
+## Lists
+
+Any of `T`, `gpus`, `cpus`, `N`, `M` may be a list. They expand along two axes:
+
+- **`T` multiplies.** The whole sweep runs once per type.
+- **`gpus`, `cpus`, `N`, `M` zip** into a single lockstep sweep — element `i`
+  of each is paired together.
+
+Each zipped field must be one of:
+
+- a scalar or single-element list (`cpus = 2` or `[2]`) -> broadcast to every config
+- a list whose length equals the sweep length
+
+Any other length mismatch is an error.
+
+```toml
+[[sgemm]]
+T    = ["Float64", "Float32"]   # multiplies
+gpus = [1, 2, 4]                #
+cpus = 2                        # zip (gpus,cpus,N,M) -> (1,2,150,150), (2,2,300,300), (4,2,600,600)
+N    = [150, 300, 600]          #
+M    = [150, 300, 600]          #
+```
+
+-> 2 types * 3 sweep points = **6 runs**.
+
+### Gotcha
+
+With `T = ["Float32", "Float64"]` and a length-2 `N`/`M` sweep, you get all **4**
+combinations, not the pairing `Float32 -> N[1], Float64 -> N[2]`. To pin a type
+to a specific size, use separate `[[name]]` blocks.
diff --git a/benchmark/benchmarks.toml b/benchmark/benchmarks.toml
index deb58d53..16def1ef 100644
--- a/benchmark/benchmarks.toml
+++ b/benchmark/benchmarks.toml
@@ -3,35 +3,25 @@ n_warmup = 5
 n_iter = 1000
 n_trial = 5
 
-[[grayscott]]
-T = "Float64"
-gpus = 1
-cpus = 2
-N = 1000
-M = 1000
-
-[[sgemm]]
-T = "Float32"
-gpus = 1
-cpus = 2
-N = 150
-M = 150
-
+# GEMM: work ~ 2*N^2*M. Hold N, scale M.
 [[sgemm]]
-T = "Float64"
-gpus = 1
+T = ["Float32", "Float64"]
+gpus = [1, 2, 4, 8]
 cpus = 2
-N = 150
-M = 150
+N = 4096
+M = [4096, 8192, 16384, 32768]
 
-[[montecarlo]]
-T = "Float64"
-gpus = 1
+# Gray-Scott: work ~ N*M. Hold N, scale M.
+[[grayscott]]
+T = "Float32"
+gpus = [1, 2, 4, 8]
 cpus = 2
-N = 1_000_000
+N = 1024
+M = [1024, 2048, 4096, 8192]
 
+# Monte Carlo: work ~ N. Scale N linearly.
 [[montecarlo]]
 T = "Float32"
-gpus = 1
+gpus = [1, 2, 4, 8]
 cpus = 2
-N = 1_000_000
+N = [1_000_000, 2_000_000, 4_000_000, 8_000_000]
diff --git a/benchmark/src/parse_benchmarks.jl b/benchmark/src/parse_benchmarks.jl
index 8fecedb3..c44518df 100644
--- a/benchmark/src/parse_benchmarks.jl
+++ b/benchmark/src/parse_benchmarks.jl
@@ -13,6 +13,24 @@ struct BenchmarkSpec
     args::Vector{Int}
 end
 
+# A field may be a scalar or a list.
+aslist(x) = x isa AbstractVector ? collect(x) : [x]
+
+# Value of a zipped field at sweep position `i`; a length-1 field broadcasts to every position.
+sweep_value(field, i) = length(field) == 1 ? field[1] : field[i]
+
+# Number of positions in the sweep. Every multi-element field must agree on length;
+# length==1 fields broadcast and don't constrain it.
+function sweep_length(name, fields)
+    lengths = [length(field) for (_, field) in fields if length(field) > 1]
+    isempty(lengths) && return 1
+    allequal(lengths) || error(
+        "benchmark '$(name)': zipped fields gpus/cpus/N/M must share one length " *
+        "or be scalar; got " * join(("$k=$(length(v))" for (k, v) in fields), ", "),
+    )
+    return first(lengths)
+end
+
 function parse_config(path)
     raw = TOML.parsefile(path)
 
@@ -25,13 +43,23 @@ function parse_config(path)
     for (name, entries) in raw
         name == "Global" && continue
         for e in entries
-            push!(
-                specs,
-                BenchmarkSpec(
-                    name, get(e, "T", "Float32"), e["gpus"], e["cpus"],
-                    [e["N"], get(e, "M", 1)],
-                ),
-            )
+            types = aslist(get(e, "T", "Float32"))
+            gpus = aslist(e["gpus"])
+            cpus = aslist(e["cpus"])
+            N = aslist(e["N"])
+            M = aslist(get(e, "M", 1))
+
+            n = sweep_length(name, ["gpus" => gpus, "cpus" => cpus, "N" => N, "M" => M])
+
+            for T in types, i in 1:n
+                push!(
+                    specs,
+                    BenchmarkSpec(
+                        name, T, sweep_value(gpus, i), sweep_value(cpus, i),
+                        [sweep_value(N, i), sweep_value(M, i)],
+                    ),
+                )
+            end
         end
     end
 

From b312512a1b565dc8567749935d969cb58af86328 Mon Sep 17 00:00:00 2001
From: krasow <krasow@u.northwestern.edu>
Date: Tue, 2 Jun 2026 23:14:47 -0500
Subject: [PATCH 08/17] add benchmark registration separating different
 benchmarks into individual files in src/benchmarks

---
 benchmark/src/benchmarks.jl            | 261 +------------------------
 benchmark/src/benchmarks/gemm.jl       |  27 +++
 benchmark/src/benchmarks/grayscott.jl  | 108 ++++++++++
 benchmark/src/benchmarks/montecarlo.jl |  31 +++
 benchmark/src/core.jl                  |  98 ++++++++++
 5 files changed, 269 insertions(+), 256 deletions(-)
 create mode 100644 benchmark/src/benchmarks/gemm.jl
 create mode 100644 benchmark/src/benchmarks/grayscott.jl
 create mode 100644 benchmark/src/benchmarks/montecarlo.jl
 create mode 100644 benchmark/src/core.jl

diff --git a/benchmark/src/benchmarks.jl b/benchmark/src/benchmarks.jl
index 2cd17cdf..4072b959 100644
--- a/benchmark/src/benchmarks.jl
+++ b/benchmark/src/benchmarks.jl
@@ -1,259 +1,8 @@
 using Printf
 using Statistics
 
-"""
-- `n_warmup::Int` : Number of warmup steps. These are not timed. Intended
-    to avoid pre-compilation cost being timed.
-- `n_iter::Int` : Number of iterations to run per trial. Should be large enough
-    to build up queue depth of tasks such that latency is hidden.
-- `n_trial::Int` : Number of independent trials to run. Timing is restarted and
-    legate in between each trial. Sets number of datapoints used to estimated
-    standard deviations/errors.
-- `n_gpu::Int` : The number of GPUs used by legate. Set through the LEGATE_CONFIG,
-    this value is just bookkeeping.
-"""
-Base.@kwdef struct GlobalSettings
-    n_warmup::Int # Number of warmup steps, where timing is not done.
-    n_iter::Int # Number of iterations to run per trial
-    n_trial::Int = 1 # Number of independent trials to run. Benchmark
-    n_gpu::Int = 0
-end
-
-#########################################
-
-abstract type AbstractBenchmark{T} end
-
-#########################################
-
-Base.@kwdef struct GEMM{T} <: AbstractBenchmark{T}
-    N::Int
-    M::Int
-end
-
-name(::GEMM) = "sgemm"
-dims(g::GEMM) = (g.N, g.M)
-data(g::GEMM{T}) where {T} = "GEMM with T=$(T), N=$(g.N), M=$(g.M)"
-
-function allowed_types(::Type{GEMM})
-    Union{cuNumeric.SUPPORTED_FLOAT_TYPES,cuNumeric.SUPPORTED_INT_TYPES}
-end
-
-total_flops(s::GEMM) = s.N * s.N * ((2*s.M) - 1)
-total_space(s::GEMM{T}) where {T} = 2 * ((s.N*s.M) * sizeof(T)) + ((s.N*s.N) * sizeof(T))
-
-function initialize(s::GEMM{T}) where {T}
-    A = cuNumeric.rand(T, s.N, s.M)
-    B = cuNumeric.rand(T, s.M, s.N)
-    C = cuNumeric.zeros(T, s.N, s.N)
-    GC.gc()
-    return C, A, B
-end
-
-run!(::GEMM, C, A, B) = mul!(C, A, B)
-
-#########################################
-
-Base.@kwdef struct MonteCarloIntegration{T} <: AbstractBenchmark{T}
-    n_samples::Int
-end
-
-name(::MonteCarloIntegration) = "montecarlo"
-dims(mci::MonteCarloIntegration) = (mci.n_samples, 1)
-function data(mci::MonteCarloIntegration{T}) where {T}
-    "Monte Carlo Integration with T=$(T), n_samples=$(mci.n_samples)"
-end
-
-allowed_types(::Type{MonteCarloIntegration}) = cuNumeric.SUPPORTED_FLOAT_TYPES
-
-total_space(s::MonteCarloIntegration{T}) where {T} = s.n_samples * sizeof(T)
-total_flops(s::MonteCarloIntegration) = s.n_samples
-
-function initialize(mci::MonteCarloIntegration{T}) where {T}
-    # Uniform samples over the integration domain [0, 10].
-    x = T(10) .* cuNumeric.rand(T, mci.n_samples)
-    GC.gc()
-    return (x,)
-end
-
-_domain_volume(mci::MonteCarloIntegration{T}) where {T} = T(10) / mci.n_samples
-run!(mci::MonteCarloIntegration, x) = _domain_volume(mci) * sum(exp.(-x .^ 2))
-
-#########################################
-
-struct GSParams{T}
-    dx::T
-    dt::T
-    c_u::T
-    c_v::T
-    f::T
-    k::T
-end
-
-function GSParams{T}(; dx=1, c_u=1.0, c_v=0.3, f=0.03, k=0.06) where {T}
-    GSParams{T}(T(dx), T(dx / 5), T(c_u), T(c_v), T(f), T(k))
-end
-
-Base.@kwdef struct GrayScott{T} <: AbstractBenchmark{T}
-    N::Int
-    M::Int
-end
-
-name(::GrayScott) = "grayscott"
-dims(b::GrayScott) = (b.N, b.M)
-data(b::GrayScott{T}) where {T} = "GrayScott with T=$(T), N=$(b.N), M=$(b.M)"
-allowed_types(::Type{GrayScott}) = cuNumeric.SUPPORTED_FLOAT_TYPES
-total_flops(b::GrayScott) = b.N * b.M # grid points updated per step
-
-mutable struct GrayScottState{A,P}
-    u::A
-    v::A
-    u_new::A
-    v_new::A
-    params::P
-end
-
-function initialize(b::GrayScott{T}) where {T}
-    d = (b.N, b.M)
-    u = cuNumeric.ones(T, d)
-    v = cuNumeric.zeros(T, d)
-    u_new = cuNumeric.zeros(T, d)
-    v_new = cuNumeric.zeros(T, d)
-
-    seed = min(150, b.N, b.M)
-    u[1:seed, 1:seed] = cuNumeric.rand(T, (seed, seed))
-    v[1:seed, 1:seed] = cuNumeric.rand(T, (seed, seed))
-
-    return (GrayScottState(u, v, u_new, v_new, GSParams{T}()),)
-end
-
-function _gs_step!(u, v, u_new, v_new, args::GSParams)
-    # currently we don't have NDArray^x working yet.
-    F_u = (
-        (
-            -u[2:(end - 1), 2:(end - 1)] .*
-            (v[2:(end - 1), 2:(end - 1)] .* v[2:(end - 1), 2:(end - 1)])
-        ) + args.f * (1 .- u[2:(end - 1), 2:(end - 1)])
-    )
-    F_v = (
-        (
-            u[2:(end - 1), 2:(end - 1)] .*
-            (v[2:(end - 1), 2:(end - 1)] .* v[2:(end - 1), 2:(end - 1)])
-        ) - (args.f + args.k) * v[2:(end - 1), 2:(end - 1)]
-    )
-    # 2-D Laplacian via slicing, excluding boundaries
-    u_lap = (
-        (
-            u[3:end, 2:(end - 1)] - 2 * u[2:(end - 1), 2:(end - 1)] +
-            u[1:(end - 2), 2:(end - 1)]
-        ) ./ args.dx^2 +
-        (
-            u[2:(end - 1), 3:end] - 2 * u[2:(end - 1), 2:(end - 1)] +
-            u[2:(end - 1), 1:(end - 2)]
-        ) ./ args.dx^2
-    )
-    v_lap = (
-        (
-            v[3:end, 2:(end - 1)] - 2 * v[2:(end - 1), 2:(end - 1)] +
-            v[1:(end - 2), 2:(end - 1)]
-        ) ./ args.dx^2 +
-        (
-            v[2:(end - 1), 3:end] - 2 * v[2:(end - 1), 2:(end - 1)] +
-            v[2:(end - 1), 1:(end - 2)]
-        ) ./ args.dx^2
-    )
-
-    # Forward-Euler step for all interior points
-    u_new[2:(end - 1), 2:(end - 1)] =
-        ((args.c_u * u_lap) + F_u) * args.dt + u[2:(end - 1), 2:(end - 1)]
-    v_new[2:(end - 1), 2:(end - 1)] =
-        ((args.c_v * v_lap) + F_v) * args.dt + v[2:(end - 1), 2:(end - 1)]
-
-    # Periodic boundary conditions
-    u_new[:, 1] = u[:, end - 1]
-    u_new[:, end] = u[:, 2]
-    u_new[1, :] = u[end - 1, :]
-    u_new[end, :] = u[2, :]
-    v_new[:, 1] = v[:, end - 1]
-    v_new[:, end] = v[:, 2]
-    v_new[1, :] = v[end - 1, :]
-    v_new[end, :] = v[2, :]
-end
-
-function run!(::GrayScott, st::GrayScottState)
-    _gs_step!(st.u, st.v, st.u_new, st.v_new, st.params)
-    # swap references rather than copy
-    st.u, st.u_new = st.u_new, st.u
-    st.v, st.v_new = st.v_new, st.v
-    return nothing
-end
-
-#########################################
-
-# Maps the benchmarks.toml table name to its benchmark type. Add new benchmarks here.
-const BENCHMARKS = Dict{String,Type}(
-    "sgemm" => GEMM,
-    "grayscott" => GrayScott,
-    "montecarlo" => MonteCarloIntegration,
-)
-
-# Construct a benchmark from the orchestrator's positional sizes. Most benchmarks
-# use (N, M); MonteCarloIntegration uses N as its sample count and ignores M.
-build_benchmark(::Type{B}, ::Type{T}, N, M) where {B<:AbstractBenchmark,T} = B{T}(; N=N, M=M)
-function build_benchmark(::Type{MonteCarloIntegration}, ::Type{T}, N, M) where {T}
-    MonteCarloIntegration{T}(; n_samples=N)
-end
-
-# Per-trial timings for one benchmark. `times_ms[i]`/`gflops[i]` are the mean
-# over `n_iter` iterations for trial `i`; the spread across trials gives stddev.
-struct BenchmarkResult{B<:AbstractBenchmark}
-    times_ms::Vector{Float64}
-    gflops::Vector{Float64}
-    benchmark::B
-end
-
-# One timed trial: warmup, then time `n_iter` iterations of `run!`.
-function _trial(b::AbstractBenchmark, gs::GlobalSettings)
-    GC.gc(true)
-    state = initialize(b)
-
-    start_time = zero(get_time_microseconds())
-    for idx in 1:(gs.n_warmup + gs.n_iter)
-        if idx == gs.n_warmup + 1
-            start_time = get_time_microseconds()
-        end
-        run!(b, state...)
-    end
-    total_time_μs = get_time_microseconds() - start_time
-
-    mean_time_ms = total_time_μs / (gs.n_iter * 1e3)
-    gflops = total_flops(b) / (mean_time_ms * 1e6)
-    return mean_time_ms, gflops
-end
-
-# Run `n_trial` independent trials and collect their per-trial measurements.
-function run_benchmark(b::AbstractBenchmark, gs::GlobalSettings)
-    times_ms = Float64[]
-    gflops = Float64[]
-    for _ in 1:gs.n_trial
-        t, g = _trial(b, gs)
-        push!(times_ms, t)
-        push!(gflops, g)
-    end
-    return BenchmarkResult(times_ms, gflops, b)
-end
-
-_std(x) = length(x) > 1 ? std(x) : 0.0
-
-function save_result(br::BenchmarkResult, gpus)
-    N, M = dims(br.benchmark)
-    path = joinpath(@__DIR__, "..", "results", "$(name(br.benchmark)).csv")
-    mkpath(dirname(path))
-    open(path, "a") do io
-        for trial in eachindex(br.times_ms)
-            @printf(
-                io, "%s,%d,%d,%d,%d,%.6f,%.6f\n",
-                "cunumeric", gpus, N, M, trial, br.times_ms[trial], br.gflops[trial],
-            )
-        end
-    end
-end
+# To add a benchmark: drop a file in benchmarks/ that ends with a register_benchmark call, then include it below.
+include("core.jl")
+include("benchmarks/gemm.jl")
+include("benchmarks/grayscott.jl")
+include("benchmarks/montecarlo.jl")
diff --git a/benchmark/src/benchmarks/gemm.jl b/benchmark/src/benchmarks/gemm.jl
new file mode 100644
index 00000000..09ff0c57
--- /dev/null
+++ b/benchmark/src/benchmarks/gemm.jl
@@ -0,0 +1,27 @@
+Base.@kwdef struct GEMM{T} <: AbstractBenchmark{T}
+    N::Int
+    M::Int
+end
+
+name(::GEMM) = "sgemm"
+dims(g::GEMM) = (g.N, g.M)
+data(g::GEMM{T}) where {T} = "GEMM with T=$(T), N=$(g.N), M=$(g.M)"
+
+function allowed_types(::Type{GEMM})
+    Union{cuNumeric.SUPPORTED_FLOAT_TYPES,cuNumeric.SUPPORTED_INT_TYPES}
+end
+
+total_flops(s::GEMM) = s.N * s.N * ((2*s.M) - 1)
+total_space(s::GEMM{T}) where {T} = 2 * ((s.N*s.M) * sizeof(T)) + ((s.N*s.N) * sizeof(T))
+
+function initialize(s::GEMM{T}) where {T}
+    A = cuNumeric.rand(T, s.N, s.M)
+    B = cuNumeric.rand(T, s.M, s.N)
+    C = cuNumeric.zeros(T, s.N, s.N)
+    GC.gc()
+    return C, A, B
+end
+
+run!(::GEMM, C, A, B) = mul!(C, A, B)
+
+register_benchmark("sgemm", GEMM)
diff --git a/benchmark/src/benchmarks/grayscott.jl b/benchmark/src/benchmarks/grayscott.jl
new file mode 100644
index 00000000..89b763bf
--- /dev/null
+++ b/benchmark/src/benchmarks/grayscott.jl
@@ -0,0 +1,108 @@
+struct GSParams{T}
+    dx::T
+    dt::T
+    c_u::T
+    c_v::T
+    f::T
+    k::T
+end
+
+function GSParams{T}(; dx=1, c_u=1.0, c_v=0.3, f=0.03, k=0.06) where {T}
+    GSParams{T}(T(dx), T(dx / 5), T(c_u), T(c_v), T(f), T(k))
+end
+
+Base.@kwdef struct GrayScott{T} <: AbstractBenchmark{T}
+    N::Int
+    M::Int
+end
+
+name(::GrayScott) = "grayscott"
+dims(b::GrayScott) = (b.N, b.M)
+data(b::GrayScott{T}) where {T} = "GrayScott with T=$(T), N=$(b.N), M=$(b.M)"
+allowed_types(::Type{GrayScott}) = cuNumeric.SUPPORTED_FLOAT_TYPES
+total_flops(b::GrayScott) = b.N * b.M # grid points updated per step
+
+mutable struct GrayScottState{A,P}
+    u::A
+    v::A
+    u_new::A
+    v_new::A
+    params::P
+end
+
+function initialize(b::GrayScott{T}) where {T}
+    d = (b.N, b.M)
+    u = cuNumeric.ones(T, d)
+    v = cuNumeric.zeros(T, d)
+    u_new = cuNumeric.zeros(T, d)
+    v_new = cuNumeric.zeros(T, d)
+
+    seed = min(150, b.N, b.M)
+    u[1:seed, 1:seed] = cuNumeric.rand(T, (seed, seed))
+    v[1:seed, 1:seed] = cuNumeric.rand(T, (seed, seed))
+
+    return (GrayScottState(u, v, u_new, v_new, GSParams{T}()),)
+end
+
+function _gs_step!(u, v, u_new, v_new, args::GSParams)
+    # NDArray^x is not working yet, so v^2 is written as v .* v below.
+    F_u = (
+        (
+            -u[2:(end - 1), 2:(end - 1)] .*
+            (v[2:(end - 1), 2:(end - 1)] .* v[2:(end - 1), 2:(end - 1)])
+        ) + args.f * (1 .- u[2:(end - 1), 2:(end - 1)])
+    )
+    F_v = (
+        (
+            u[2:(end - 1), 2:(end - 1)] .*
+            (v[2:(end - 1), 2:(end - 1)] .* v[2:(end - 1), 2:(end - 1)])
+        ) - (args.f + args.k) * v[2:(end - 1), 2:(end - 1)]
+    )
+    # 2-D Laplacian via slicing, excluding boundaries
+    u_lap = (
+        (
+            u[3:end, 2:(end - 1)] - 2 * u[2:(end - 1), 2:(end - 1)] +
+            u[1:(end - 2), 2:(end - 1)]
+        ) ./ args.dx^2 +
+        (
+            u[2:(end - 1), 3:end] - 2 * u[2:(end - 1), 2:(end - 1)] +
+            u[2:(end - 1), 1:(end - 2)]
+        ) ./ args.dx^2
+    )
+    v_lap = (
+        (
+            v[3:end, 2:(end - 1)] - 2 * v[2:(end - 1), 2:(end - 1)] +
+            v[1:(end - 2), 2:(end - 1)]
+        ) ./ args.dx^2 +
+        (
+            v[2:(end - 1), 3:end] - 2 * v[2:(end - 1), 2:(end - 1)] +
+            v[2:(end - 1), 1:(end - 2)]
+        ) ./ args.dx^2
+    )
+
+    # Forward-Euler step for all interior points
+    u_new[2:(end - 1), 2:(end - 1)] =
+        ((args.c_u * u_lap) + F_u) * args.dt + u[2:(end - 1), 2:(end - 1)]
+    v_new[2:(end - 1), 2:(end - 1)] =
+        ((args.c_v * v_lap) + F_v) * args.dt + v[2:(end - 1), 2:(end - 1)]
+
+    # Periodic boundary conditions
+    u_new[:, 1] = u[:, end - 1]
+    u_new[:, end] = u[:, 2]
+    u_new[1, :] = u[end - 1, :]
+    u_new[end, :] = u[2, :]
+    v_new[:, 1] = v[:, end - 1]
+    v_new[:, end] = v[:, 2]
+    v_new[1, :] = v[end - 1, :]
+    v_new[end, :] = v[2, :]
+end
+
+function run!(::GrayScott, st::GrayScottState)
+    _gs_step!(st.u, st.v, st.u_new, st.v_new, st.params)
+    # swap references rather than copy
+    st.u, st.u_new = st.u_new, st.u
+    st.v, st.v_new = st.v_new, st.v
+    return nothing
+end
+
+register_benchmark("grayscott", GrayScott)
diff --git a/benchmark/src/benchmarks/montecarlo.jl b/benchmark/src/benchmarks/montecarlo.jl
new file mode 100644
index 00000000..ecbda4f9
--- /dev/null
+++ b/benchmark/src/benchmarks/montecarlo.jl
@@ -0,0 +1,31 @@
+Base.@kwdef struct MonteCarloIntegration{T} <: AbstractBenchmark{T}
+    n_samples::Int
+end
+
+name(::MonteCarloIntegration) = "montecarlo"
+dims(mci::MonteCarloIntegration) = (mci.n_samples, 1)
+function data(mci::MonteCarloIntegration{T}) where {T}
+    "Monte Carlo Integration with T=$(T), n_samples=$(mci.n_samples)"
+end
+
+allowed_types(::Type{MonteCarloIntegration}) = cuNumeric.SUPPORTED_FLOAT_TYPES
+
+total_space(s::MonteCarloIntegration{T}) where {T} = s.n_samples * sizeof(T)
+total_flops(s::MonteCarloIntegration) = s.n_samples
+
+function initialize(mci::MonteCarloIntegration{T}) where {T}
+    # Uniform samples over the integration domain [0, 10].
+    x = T(10) .* cuNumeric.rand(T, mci.n_samples)
+    GC.gc()
+    return (x,)
+end
+
+_domain_volume(mci::MonteCarloIntegration{T}) where {T} = T(10) / mci.n_samples
+run!(mci::MonteCarloIntegration, x) = _domain_volume(mci) * sum(exp.(-x .^ 2))
+
+# n_samples comes in as N; M is unused.
+function build_benchmark(::Type{MonteCarloIntegration}, ::Type{T}, N, M) where {T}
+    MonteCarloIntegration{T}(; n_samples=N)
+end
+
+register_benchmark("montecarlo", MonteCarloIntegration)
diff --git a/benchmark/src/core.jl b/benchmark/src/core.jl
new file mode 100644
index 00000000..63f6f898
--- /dev/null
+++ b/benchmark/src/core.jl
@@ -0,0 +1,98 @@
+"""
+- `n_warmup::Int` : Number of warmup steps. These are not timed. Intended
+    to avoid pre-compilation cost being timed.
+- `n_iter::Int` : Number of iterations to run per trial. Should be large enough
+    to build up queue depth of tasks such that latency is hidden.
+- `n_trial::Int` : Number of independent trials to run. Timing is restarted and
+    legate in between each trial. Sets number of datapoints used to estimate
+    standard deviations/errors.
+- `n_gpu::Int` : The number of GPUs used by legate. Set through the LEGATE_CONFIG,
+    this value is just bookkeeping.
+"""
+Base.@kwdef struct GlobalSettings
+    n_warmup::Int # Number of warmup steps, where timing is not done.
+    n_iter::Int # Number of iterations to run per trial
+    n_trial::Int = 1 # Number of independent trials to run.
+    n_gpu::Int = 0 # Number of GPUs used by legate (bookkeeping only).
+end
+
+#########################################
+
+abstract type AbstractBenchmark{T} end
+
+# Interface each benchmark implements (see benchmarks/gemm.jl for a template).
+function name end
+function dims end
+function data end
+function allowed_types end
+function total_flops end
+function initialize end
+function run! end
+
+# Maps a benchmarks.toml table name to its benchmark type. Each benchmark file
+# registers itself via `register_benchmark`.
+const BENCHMARKS = Dict{String,Type}()
+function register_benchmark(key::AbstractString, ::Type{B}) where {B<:AbstractBenchmark}
+    BENCHMARKS[key] = B
+end
+
+# Construct a benchmark from the orchestrator's positional sizes. Most benchmarks
+# use (N, M); a benchmark with different arity overrides this (see montecarlo.jl).
+build_benchmark(::Type{B}, ::Type{T}, N, M) where {B<:AbstractBenchmark,T} = B{T}(; N=N, M=M)
+
+#########################################
+
+# Per-trial timings for one benchmark. `times_ms[i]`/`gflops[i]` are the mean
+# over `n_iter` iterations for trial `i`; the spread across trials gives stddev.
+struct BenchmarkResult{B<:AbstractBenchmark}
+    times_ms::Vector{Float64}
+    gflops::Vector{Float64}
+    benchmark::B
+end
+
+# One timed trial: warmup, then time `n_iter` iterations of `run!`.
+function _trial(b::AbstractBenchmark, gs::GlobalSettings)
+    GC.gc(true)
+    state = initialize(b)
+
+    start_time = zero(get_time_microseconds())
+    for idx in 1:(gs.n_warmup + gs.n_iter)
+        if idx == gs.n_warmup + 1
+            start_time = get_time_microseconds()
+        end
+        run!(b, state...)
+    end
+    total_time_μs = get_time_microseconds() - start_time
+
+    mean_time_ms = total_time_μs / (gs.n_iter * 1e3)
+    gflops = total_flops(b) / (mean_time_ms * 1e6)
+    return mean_time_ms, gflops
+end
+
+# Run `n_trial` independent trials and collect their per-trial measurements.
+function run_benchmark(b::AbstractBenchmark, gs::GlobalSettings)
+    times_ms = Float64[]
+    gflops = Float64[]
+    for _ in 1:gs.n_trial
+        t, g = _trial(b, gs)
+        push!(times_ms, t)
+        push!(gflops, g)
+    end
+    return BenchmarkResult(times_ms, gflops, b)
+end
+
+_std(x) = length(x) > 1 ? std(x) : 0.0
+
+function save_result(br::BenchmarkResult, gpus)
+    N, M = dims(br.benchmark)
+    path = joinpath(@__DIR__, "..", "results", "$(name(br.benchmark)).csv")
+    mkpath(dirname(path))
+    open(path, "a") do io
+        for trial in eachindex(br.times_ms)
+            @printf(
+                io, "%s,%d,%d,%d,%d,%.6f,%.6f\n",
+                "cunumeric", gpus, N, M, trial, br.times_ms[trial], br.gflops[trial],
+            )
+        end
+    end
+end

From 871b5a560f30763831feed1a41f48a9c92adb140 Mon Sep 17 00:00:00 2001
From: krasow <krasow@u.northwestern.edu>
Date: Wed, 3 Jun 2026 00:03:54 -0500
Subject: [PATCH 09/17] add notion of variants

---
 benchmark/README.md                    |  33 ++++++-
 benchmark/benchmarks.toml              |   1 +
 benchmark/run.jl                       |  13 +--
 benchmark/src/benchmarks/grayscott.jl  | 119 ++++++++++++++-----------
 benchmark/src/benchmarks/montecarlo.jl |   4 +-
 benchmark/src/core.jl                  |  41 +++++++--
 benchmark/src/parse_benchmarks.jl      |  10 ++-
 benchmark/src/single.jl                |  16 ++--
 8 files changed, 161 insertions(+), 76 deletions(-)

diff --git a/benchmark/README.md b/benchmark/README.md
index 658ce3ca..cf2a248b 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -22,9 +22,11 @@ Repeat a `[[name]]` block to add independent configs.
 
 ## Lists
 
-Any of `T`, `gpus`, `cpus`, `N`, `M` may be a list. They expand along two axes:
+Any of `T`, `variants`, `gpus`, `cpus`, `N`, `M` may be a list. They expand along
+two axes:
 
-- **`T` multiplies.** The whole sweep runs once per type.
+- **`T` and `variants` multiply.** The whole sweep runs once per type and once
+  per variant.
 - **`gpus`, `cpus`, `N`, `M` zip** into a single lockstep sweep — element `i`
   of each is paired together.
 
@@ -51,3 +53,30 @@ M    = [150, 300, 600]          #
 When `T = ["Float32", "Float64"]` and a length-2 `N`/`M` sweep you get all **4**
 combinations, not a paired `Float32 -> N[1], Float64 -> N[2]`. To pin a type
 to a specific size, use separate `[[name]]` blocks.
+
+## Variants
+
+A variant is a named way of running a benchmark. List them per entry with
+`variants = [...]` (defaults to `["baseline"]`); they multiply like `T`, and the
+chosen variant is recorded as a column in the results CSV so runs can be compared.
+
+```toml
+[[grayscott]]
+T = "Float64"
+N = 1024
+M = [1024, 2048, 4096]
+gpus = [1, 2, 4]
+cpus = 2
+variants = ["baseline", "lifetimes"]   # 2 variants * 3 sweep points = 6 runs
+```
+
+There are two kinds, both flowing through the same `variant` string:
+
+- **Code-path variants** change what the worker runs. The benchmark's `run!`
+  dispatches on the variant. Example: grayscott's `lifetimes` wraps the step in
+  `@analyze_lifetimes` (see `src/benchmarks/grayscott.jl`). A benchmark that
+  doesn't recognize a variant just runs its baseline path.
+- **Process-level variants** flip a runtime setting before the run via a setup
+  thunk registered in `register_variant` (`src/core.jl`). The worker calls it at
+  startup. Broadcast fusion will plug in here once it lands, e.g.
+  `register_variant("fusion_off", cuNumeric.CNPreferences.disable_broadcast_fusion!)`.
diff --git a/benchmark/benchmarks.toml b/benchmark/benchmarks.toml
index 16def1ef..a3eb531d 100644
--- a/benchmark/benchmarks.toml
+++ b/benchmark/benchmarks.toml
@@ -18,6 +18,7 @@ gpus = [1, 2, 4, 8]
 cpus = 2
 N = 1024
 M = [1024, 2048, 4096, 8192]
+variants = ["baseline", "lifetimes"]
 
 # Monte Carlo: work ~ N. Scale N linearly.
 [[montecarlo]]
diff --git a/benchmark/run.jl b/benchmark/run.jl
index 0d22ad0d..167d35f5 100644
--- a/benchmark/run.jl
+++ b/benchmark/run.jl
@@ -2,8 +2,9 @@
 # dispatches it; the script sets LEGATE_CONFIG (from --gpus/--cpus) before
 # launching the worker (single.jl) that actually runs the benchmark.
 #   no args   -> one command per benchmarks.toml entry
-#   with args -> one command from <gpus> <cpus> <name> <T> <N> <M> <iter> <warmup> <trial>
+#   with args -> one command from <gpus> <cpus> <name> <T> <N> <M> <iter> <warmup> <trial> [variant]
 
+using cuNumeric
 include("src/benchmarks.jl")
 include("src/parse_benchmarks.jl")
 
@@ -12,18 +13,18 @@ const WORKER = joinpath(@__DIR__, "src/single.jl")
 
 banner(msg) = println("\n", "="^128, "\n", msg, "\n", "="^128)
 
-function dispatch(; gpus, cpus, name, T, N, M, n_iter, n_warmup, n_trial)
+function dispatch(; gpus, cpus, name, T, variant, N, M, n_iter, n_warmup, n_trial)
     if !haskey(BENCHMARKS, name)
         @warn "No benchmark registered for '$(name)'; skipping."
         return nothing
     end
 
     banner(
-        "$(name): T=$(T) gpus=$(gpus) cpus=$(cpus) N=$(N) M=$(M) " *
+        "$(name) [$(variant)]: T=$(T) gpus=$(gpus) cpus=$(cpus) N=$(N) M=$(M) " *
         "n_iter=$(n_iter) n_warmup=$(n_warmup) n_trial=$(n_trial)",
     )
 
-    cmd = `bash $RUNNER $WORKER --gpus $gpus --cpus $cpus $name $T $N $M $n_iter $n_warmup $n_trial`
+    cmd = `bash $RUNNER $WORKER --gpus $gpus --cpus $cpus $name $T $N $M $n_iter $n_warmup $n_trial $variant`
     try
         run(cmd)
     catch e
@@ -36,7 +37,8 @@ function run_all_benchmarks(config="benchmarks.toml")
     for spec in specs
         N, M = spec.args
         dispatch(;
-            gpus=spec.gpus, cpus=spec.cpus, name=spec.name, T=spec.T, N=N, M=M,
+            gpus=spec.gpus, cpus=spec.cpus, name=spec.name, T=spec.T,
+            variant=spec.variant, N=N, M=M,
             n_iter=gs.n_iter, n_warmup=gs.n_warmup, n_trial=gs.n_trial,
         )
     end
@@ -49,5 +51,6 @@ else # dispatch on args
         gpus=parse(Int, ARGS[1]), cpus=parse(Int, ARGS[2]), name=ARGS[3], T=ARGS[4],
         N=parse(Int, ARGS[5]), M=parse(Int, ARGS[6]),
         n_iter=parse(Int, ARGS[7]), n_warmup=parse(Int, ARGS[8]), n_trial=parse(Int, ARGS[9]),
+        variant=(length(ARGS) >= 10 ? ARGS[10] : "baseline"),
     )
 end
diff --git a/benchmark/src/benchmarks/grayscott.jl b/benchmark/src/benchmarks/grayscott.jl
index 89b763bf..0a7eb02d 100644
--- a/benchmark/src/benchmarks/grayscott.jl
+++ b/benchmark/src/benchmarks/grayscott.jl
@@ -14,6 +14,7 @@ end
 Base.@kwdef struct GrayScott{T} <: AbstractBenchmark{T}
     N::Int
     M::Int
+    variant::Symbol = :baseline
 end
 
 name(::GrayScott) = "grayscott"
@@ -22,6 +23,10 @@ data(b::GrayScott{T}) where {T} = "GrayScott with T=$(T), N=$(b.N), M=$(b.M)"
 allowed_types(::Type{GrayScott}) = cuNumeric.SUPPORTED_FLOAT_TYPES
 total_flops(b::GrayScott) = b.N * b.M # grid points updated per step
 
+function build_benchmark(::Type{GrayScott}, ::Type{T}, N, M, variant) where {T}
+    GrayScott{T}(; N=N, M=M, variant=Symbol(variant))
+end
+
 mutable struct GrayScottState{A,P}
     u::A
     v::A
@@ -44,65 +49,77 @@ function initialize(b::GrayScott{T}) where {T}
     return (GrayScottState(u, v, u_new, v_new, GSParams{T}()),)
 end
 
-function _gs_step!(u, v, u_new, v_new, args::GSParams)
-    # currently we don't have NDArray^x working yet.
-    F_u = (
-        (
-            -u[2:(end - 1), 2:(end - 1)] .*
-            (v[2:(end - 1), 2:(end - 1)] .* v[2:(end - 1), 2:(end - 1)])
-        ) + args.f * (1 .- u[2:(end - 1), 2:(end - 1)])
-    )
-    F_v = (
-        (
-            u[2:(end - 1), 2:(end - 1)] .*
-            (v[2:(end - 1), 2:(end - 1)] .* v[2:(end - 1), 2:(end - 1)])
-        ) - (args.f + args.k) * v[2:(end - 1), 2:(end - 1)]
-    )
-    # 2-D Laplacian via slicing, excluding boundaries
-    u_lap = (
-        (
-            u[3:end, 2:(end - 1)] - 2 * u[2:(end - 1), 2:(end - 1)] +
-            u[1:(end - 2), 2:(end - 1)]
-        ) ./ args.dx^2 +
-        (
-            u[2:(end - 1), 3:end] - 2 * u[2:(end - 1), 2:(end - 1)] +
-            u[2:(end - 1), 1:(end - 2)]
-        ) ./ args.dx^2
-    )
-    v_lap = (
-        (
-            v[3:end, 2:(end - 1)] - 2 * v[2:(end - 1), 2:(end - 1)] +
-            v[1:(end - 2), 2:(end - 1)]
-        ) ./ args.dx^2 +
-        (
-            v[2:(end - 1), 3:end] - 2 * v[2:(end - 1), 2:(end - 1)] +
-            v[2:(end - 1), 1:(end - 2)]
-        ) ./ args.dx^2
-    )
+# VARIANT DESCRIPTION
+# baseline: as written
+# lifetimes: step wrapped in @analyze_lifetimes
+let body = quote
+        # currently we don't have NDArray^x working yet.
+        F_u = (
+            (
+                -u[2:(end - 1), 2:(end - 1)] .*
+                (v[2:(end - 1), 2:(end - 1)] .* v[2:(end - 1), 2:(end - 1)])
+            ) + args.f * (1 .- u[2:(end - 1), 2:(end - 1)])
+        )
+        F_v = (
+            (
+                u[2:(end - 1), 2:(end - 1)] .*
+                (v[2:(end - 1), 2:(end - 1)] .* v[2:(end - 1), 2:(end - 1)])
+            ) - (args.f + args.k) * v[2:(end - 1), 2:(end - 1)]
+        )
+        # 2-D Laplacian via slicing, excluding boundaries
+        u_lap = (
+            (
+                u[3:end, 2:(end - 1)] - 2 * u[2:(end - 1), 2:(end - 1)] +
+                u[1:(end - 2), 2:(end - 1)]
+            ) ./ args.dx^2 +
+            (
+                u[2:(end - 1), 3:end] - 2 * u[2:(end - 1), 2:(end - 1)] +
+                u[2:(end - 1), 1:(end - 2)]
+            ) ./ args.dx^2
+        )
+        v_lap = (
+            (
+                v[3:end, 2:(end - 1)] - 2 * v[2:(end - 1), 2:(end - 1)] +
+                v[1:(end - 2), 2:(end - 1)]
+            ) ./ args.dx^2 +
+            (
+                v[2:(end - 1), 3:end] - 2 * v[2:(end - 1), 2:(end - 1)] +
+                v[2:(end - 1), 1:(end - 2)]
+            ) ./ args.dx^2
+        )
 
-    # Forward-Euler step for all interior points
-    u_new[2:(end - 1), 2:(end - 1)] =
-        ((args.c_u * u_lap) + F_u) * args.dt + u[2:(end - 1), 2:(end - 1)]
-    v_new[2:(end - 1), 2:(end - 1)] =
-        ((args.c_v * v_lap) + F_v) * args.dt + v[2:(end - 1), 2:(end - 1)]
+        # Forward-Euler step for all interior points
+        u_new[2:(end - 1), 2:(end - 1)] =
+            ((args.c_u * u_lap) + F_u) * args.dt + u[2:(end - 1), 2:(end - 1)]
+        v_new[2:(end - 1), 2:(end - 1)] =
+            ((args.c_v * v_lap) + F_v) * args.dt + v[2:(end - 1), 2:(end - 1)]
+
+        # Periodic boundary conditions
+        u_new[:, 1] = u[:, end - 1]
+        u_new[:, end] = u[:, 2]
+        u_new[1, :] = u[end - 1, :]
+        u_new[end, :] = u[2, :]
+        v_new[:, 1] = v[:, end - 1]
+        v_new[:, end] = v[:, 2]
+        v_new[1, :] = v[end - 1, :]
+        v_new[end, :] = v[2, :]
+    end
+    @eval _gs_step!(::Val{:baseline}, u, v, u_new, v_new, args::GSParams) = $body
+    @eval _gs_step!(::Val{:lifetimes}, u, v, u_new, v_new, args::GSParams) = @analyze_lifetimes $body
+end
 
-    # Periodic boundary conditions
-    u_new[:, 1] = u[:, end - 1]
-    u_new[:, end] = u[:, 2]
-    u_new[1, :] = u[end - 1, :]
-    u_new[end, :] = u[2, :]
-    v_new[:, 1] = v[:, end - 1]
-    v_new[:, end] = v[:, 2]
-    v_new[1, :] = v[end - 1, :]
-    v_new[end, :] = v[2, :]
+# Variants not special-cased (e.g. testing fusion) run the baseline path.
+function _gs_step!(::Val, u, v, u_new, v_new, args::GSParams)
+    _gs_step!(Val(:baseline), u, v, u_new, v_new, args)
 end
 
-function run!(::GrayScott, st::GrayScottState)
-    _gs_step!(st.u, st.v, st.u_new, st.v_new, st.params)
+function run!(b::GrayScott, st::GrayScottState)
+    _gs_step!(Val(b.variant), st.u, st.v, st.u_new, st.v_new, st.params)
     # swap references rather than copy
     st.u, st.u_new = st.u_new, st.u
     st.v, st.v_new = st.v_new, st.v
     return nothing
 end
 
+register_variant("lifetimes")
 register_benchmark("grayscott", GrayScott)
diff --git a/benchmark/src/benchmarks/montecarlo.jl b/benchmark/src/benchmarks/montecarlo.jl
index ecbda4f9..0b5175f6 100644
--- a/benchmark/src/benchmarks/montecarlo.jl
+++ b/benchmark/src/benchmarks/montecarlo.jl
@@ -23,8 +23,8 @@ end
 _domain_volume(mci::MonteCarloIntegration{T}) where {T} = T(10) / mci.n_samples
 run!(mci::MonteCarloIntegration, x) = _domain_volume(mci) * sum(exp.(-x .^ 2))
 
-# n_samples comes in as N; M is unused.
-function build_benchmark(::Type{MonteCarloIntegration}, ::Type{T}, N, M) where {T}
+# n_samples comes in as N; M and variant are unused.
+function build_benchmark(::Type{MonteCarloIntegration}, ::Type{T}, N, M, variant) where {T}
     MonteCarloIntegration{T}(; n_samples=N)
 end
 
diff --git a/benchmark/src/core.jl b/benchmark/src/core.jl
index 63f6f898..e1759ca0 100644
--- a/benchmark/src/core.jl
+++ b/benchmark/src/core.jl
@@ -36,9 +36,37 @@ function register_benchmark(key::AbstractString, ::Type{B}) where {B<:AbstractBe
     BENCHMARKS[key] = B
 end
 
-# Construct a benchmark from the orchestrator's positional sizes. Most benchmarks
-# use (N, M); a benchmark with different arity overrides this (see montecarlo.jl).
-build_benchmark(::Type{B}, ::Type{T}, N, M) where {B<:AbstractBenchmark,T} = B{T}(; N=N, M=M)
+# Default uses (N, M); benchmarks with a code-path variant or different arity
+# override this (see grayscott.jl / montecarlo.jl).
+function build_benchmark(::Type{B}, ::Type{T}, N, M, variant) where {B<:AbstractBenchmark,T}
+    B{T}(; N=N, M=M)
+end
+
+#########################################
+
+# `setup` runs in the worker before the benchmark is built (e.g. flip a runtime
+# preference); code-path variants leave it a no-op.
+struct Variant
+    name::String
+    setup::Function
+end
+
+const VARIANTS = Dict{String,Variant}()
+
+function register_variant(name, setup=() -> nothing)
+    VARIANTS[name] = Variant(name, setup)
+end
+
+function variant_setup(name)
+    if haskey(VARIANTS, name)
+        return VARIANTS[name].setup
+    end
+    return () -> nothing
+end
+
+register_variant("baseline")
+# register_variant("fusion_off", cuNumeric.CNPreferences.disable_broadcast_fusion!)
+# register_variant("fusion_on",  cuNumeric.CNPreferences.enable_broadcast_fusion!)
 
 #########################################
 
@@ -83,15 +111,16 @@ end
 
 _std(x) = length(x) > 1 ? std(x) : 0.0
 
-function save_result(br::BenchmarkResult, gpus)
+function save_result(br::BenchmarkResult, gpus, variant)
     N, M = dims(br.benchmark)
     path = joinpath(@__DIR__, "..", "results", "$(name(br.benchmark)).csv")
     mkpath(dirname(path))
     open(path, "a") do io
         for trial in eachindex(br.times_ms)
             @printf(
-                io, "%s,%d,%d,%d,%d,%.6f,%.6f\n",
-                "cunumeric", gpus, N, M, trial, br.times_ms[trial], br.gflops[trial],
+                io, "%s,%s,%d,%d,%d,%d,%.6f,%.6f\n",
+                "cunumeric", variant, gpus, N, M, trial,
+                br.times_ms[trial], br.gflops[trial],
             )
         end
     end
diff --git a/benchmark/src/parse_benchmarks.jl b/benchmark/src/parse_benchmarks.jl
index c44518df..7144097c 100644
--- a/benchmark/src/parse_benchmarks.jl
+++ b/benchmark/src/parse_benchmarks.jl
@@ -3,11 +3,13 @@ using TOML
 """
 One benchmark invocation parsed from `benchmarks.toml`. `name` selects the
 benchmark type from `BENCHMARKS`; `T` is the element type (e.g. "Float32");
-`args` are the sizes (currently `N M`).
+`variant` names the run variant (e.g. "baseline", "lifetimes"); `args` are the
+sizes (currently `N M`).
 """
 struct BenchmarkSpec
     name::String
     T::String
+    variant::String
     gpus::Int
     cpus::Int
     args::Vector{Int}
@@ -44,6 +46,7 @@ function parse_config(path)
         name == "Global" && continue
         for e in entries
             types = aslist(get(e, "T", "Float32"))
+            variants = aslist(get(e, "variants", "baseline"))
             gpus = aslist(e["gpus"])
             cpus = aslist(e["cpus"])
             N = aslist(e["N"])
@@ -51,11 +54,12 @@ function parse_config(path)
 
             n = sweep_length(name, ["gpus" => gpus, "cpus" => cpus, "N" => N, "M" => M])
 
-            for T in types, i in 1:n
+            # `T` and `variants` multiply; gpus/cpus/N/M zip into the sweep.
+            for T in types, variant in variants, i in 1:n
                 push!(
                     specs,
                     BenchmarkSpec(
-                        name, T, sweep_value(gpus, i), sweep_value(cpus, i),
+                        name, T, variant, sweep_value(gpus, i), sweep_value(cpus, i),
                         [sweep_value(N, i), sweep_value(M, i)],
                     ),
                 )
diff --git a/benchmark/src/single.jl b/benchmark/src/single.jl
index e86419bf..8e6e85ee 100644
--- a/benchmark/src/single.jl
+++ b/benchmark/src/single.jl
@@ -1,6 +1,6 @@
 # single.jl: worker that runs exactly one benchmark. Launched by run_benchmark.sh
 # (dispatched from run.jl), which sets LEGATE_CONFIG in the env before julia starts.
-# Args: <gpus> <name> <T> <N> <M> <n_iter> <n_warmup> <n_trial>
+# Args: <gpus> <name> <T> <N> <M> <n_iter> <n_warmup> <n_trial> <variant>
 
 using cuNumeric
 using LinearAlgebra
@@ -10,20 +10,21 @@ include("benchmarks.jl")
 # Resolve a TOML type string like "Float32" to the actual Julia type.
 parse_type(s) = getfield(Base, Symbol(s))::DataType
 
-function run_single(gpus, name, T_str, N, M, n_iter, n_warmup, n_trial)
+function run_single(gpus, name, T_str, N, M, n_iter, n_warmup, n_trial, variant)
     T = parse_type(T_str)
-    b = build_benchmark(BENCHMARKS[name], T, N, M)
+    variant_setup(variant)() # apply any pre-run setup (e.g. flip a runtime preference)
+    b = build_benchmark(BENCHMARKS[name], T, N, M, variant)
     gs = GlobalSettings(; n_warmup=n_warmup, n_iter=n_iter, n_trial=n_trial)
 
     println(
-        "[cuNumeric] $(name) benchmark ($(T)) on $(N)x$(M) for $(n_iter) iterations " *
-        "($(n_warmup) warmup) x $(n_trial) trials",
+        "[cuNumeric] $(name) [$(variant)] benchmark ($(T)) on $(N)x$(M) for $(n_iter) " *
+        "iterations ($(n_warmup) warmup) x $(n_trial) trials",
     )
     br = run_benchmark(b, gs)
     @printf("[cuNumeric] Mean Run Time: %.5f ± %.5f ms\n", mean(br.times_ms), _std(br.times_ms))
     @printf("[cuNumeric] FLOPS: %.5f ± %.5f GFLOPS\n", mean(br.gflops), _std(br.gflops))
 
-    save_result(br, gpus)
+    save_result(br, gpus, variant)
 end
 
 gpus = parse(Int, ARGS[1])
@@ -34,4 +35,5 @@ M = parse(Int, ARGS[5])
 n_iter = parse(Int, ARGS[6])
 n_warmup = parse(Int, ARGS[7])
 n_trial = parse(Int, ARGS[8])
-run_single(gpus, bench_name, T_str, N, M, n_iter, n_warmup, n_trial)
+variant = ARGS[9]
+run_single(gpus, bench_name, T_str, N, M, n_iter, n_warmup, n_trial, variant)

From 83b54674add8e0d722527eb4898dcad80985cdc2 Mon Sep 17 00:00:00 2001
From: krasow <krasow@u.northwestern.edu>
Date: Wed, 3 Jun 2026 00:25:26 -0500
Subject: [PATCH 10/17] adjust includes and using cuNumeric.

---
 benchmark/run.jl            | 11 ++++-------
 benchmark/src/benchmarks.jl |  3 ---
 benchmark/src/core.jl       |  3 +++
 benchmark/src/single.jl     |  3 +++
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/benchmark/run.jl b/benchmark/run.jl
index 167d35f5..039306a7 100644
--- a/benchmark/run.jl
+++ b/benchmark/run.jl
@@ -4,8 +4,9 @@
 #   no args   -> one command per benchmarks.toml entry
 #   with args -> one command from <gpus> <cpus> <name> <T> <N> <M> <iter> <warmup> <trial> [variant]
 
-using cuNumeric
-include("src/benchmarks.jl")
+# Orchestrator stays off the GPU: it only needs GlobalSettings + parse_config,
+# both cuNumeric-free. The worker (single.jl) loads cuNumeric and the kernels.
+include("src/core.jl")
 include("src/parse_benchmarks.jl")
 
 const RUNNER = joinpath(@__DIR__, "run_benchmark.sh")
@@ -14,11 +15,7 @@ const WORKER = joinpath(@__DIR__, "src/single.jl")
 banner(msg) = println("\n", "="^128, "\n", msg, "\n", "="^128)
 
 function dispatch(; gpus, cpus, name, T, variant, N, M, n_iter, n_warmup, n_trial)
-    if !haskey(BENCHMARKS, name)
-        @warn "No benchmark registered for '$(name)'; skipping."
-        return nothing
-    end
-
+    # Name validity is checked in the worker (single.jl), which owns the registry.
     banner(
         "$(name) [$(variant)]: T=$(T) gpus=$(gpus) cpus=$(cpus) N=$(N) M=$(M) " *
         "n_iter=$(n_iter) n_warmup=$(n_warmup) n_trial=$(n_trial)",
diff --git a/benchmark/src/benchmarks.jl b/benchmark/src/benchmarks.jl
index 4072b959..794068fa 100644
--- a/benchmark/src/benchmarks.jl
+++ b/benchmark/src/benchmarks.jl
@@ -1,6 +1,3 @@
-using Printf
-using Statistics
-
 # Adding a benchmark is: drop a file in benchmarks/ and include it below.
 include("core.jl")
 include("benchmarks/gemm.jl")
diff --git a/benchmark/src/core.jl b/benchmark/src/core.jl
index e1759ca0..b87c9fd0 100644
--- a/benchmark/src/core.jl
+++ b/benchmark/src/core.jl
@@ -1,3 +1,6 @@
+using Printf
+using Statistics
+
 """
 - `n_warmup::Int` : Number of warmup steps. These are not timed. Intended
     to avoid pre-compilation cost being timed.
diff --git a/benchmark/src/single.jl b/benchmark/src/single.jl
index 8e6e85ee..86b09f78 100644
--- a/benchmark/src/single.jl
+++ b/benchmark/src/single.jl
@@ -11,6 +11,9 @@ include("benchmarks.jl")
 parse_type(s) = getfield(Base, Symbol(s))::DataType
 
 function run_single(gpus, name, T_str, N, M, n_iter, n_warmup, n_trial, variant)
+    haskey(BENCHMARKS, name) || error(
+        "No benchmark registered for '$(name)'. Known: $(join(sort(collect(keys(BENCHMARKS))), ", "))"
+    )
     T = parse_type(T_str)
     variant_setup(variant)() # apply any pre-run setup (e.g. flip a runtime preference)
     b = build_benchmark(BENCHMARKS[name], T, N, M, variant)

From 63cdf9bb5e63696019d31c9603a87641b5be8931 Mon Sep 17 00:00:00 2001
From: ejmeitz <emeitz@andrew.cmu.edu>
Date: Wed, 3 Jun 2026 11:04:03 -0500
Subject: [PATCH 11/17] remove variants

---
 benchmark/Project.toml                |  1 +
 benchmark/README.md                   |  2 +-
 benchmark/benchmarks.toml             | 30 +++++++++++---
 benchmark/run.jl                      | 32 +++++++++------
 benchmark/src/benchmarks.jl           |  5 ---
 benchmark/src/benchmarks/gemm.jl      |  4 +-
 benchmark/src/benchmarks/grayscott.jl | 43 ++++++++++----------
 benchmark/src/core.jl                 | 57 +++++++++++++--------------
 benchmark/src/parse_benchmarks.jl     | 12 +++---
 benchmark/src/single.jl               | 16 ++++----
 10 files changed, 112 insertions(+), 90 deletions(-)
 delete mode 100644 benchmark/src/benchmarks.jl

diff --git a/benchmark/Project.toml b/benchmark/Project.toml
index e6583a71..71a488f9 100644
--- a/benchmark/Project.toml
+++ b/benchmark/Project.toml
@@ -1,6 +1,7 @@
 [deps]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+CUDACore = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
diff --git a/benchmark/README.md b/benchmark/README.md
index cf2a248b..762bffc6 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -10,7 +10,7 @@ n_warmup = 5
 n_iter   = 1000
 n_trial  = 5
 
-[[sgemm]]            # name registered in src/benchmarks.jl
+[[gemm]]             # name registered in src/benchmarks/gemm.jl
 T    = "Float32"     # element type
 gpus = 1
 cpus = 2
diff --git a/benchmark/benchmarks.toml b/benchmark/benchmarks.toml
index a3eb531d..855a3b8d 100644
--- a/benchmark/benchmarks.toml
+++ b/benchmark/benchmarks.toml
@@ -3,24 +3,42 @@ n_warmup = 5
 n_iter = 1000
 n_trial = 5
 
-# GEMM: work ~ 2*N^2*M. Hold N, scale M.
-[[sgemm]]
+####################################
+#             GEMM                 #
+# Work ~ 2*N^2*M. Hold N, scale M. #
+####################################
+
+[[gemm]]
 T = ["Float32", "Float64"]
 gpus = [1, 2, 4, 8]
 cpus = 2
 N = 4096
 M = [4096, 8192, 16384, 32768]
 
-# Gray-Scott: work ~ N*M. Hold N, scale M.
-[[grayscott]]
+#################################
+#         Gray-Scott            #
+#  Work ~ N*M. Hold N, scale M. #
+#################################
+
+[[grayscott_baseline]]
 T = "Float32"
 gpus = [1, 2, 4, 8]
 cpus = 2
 N = 1024
 M = [1024, 2048, 4096, 8192]
-variants = ["baseline", "lifetimes"]
 
-# Monte Carlo: work ~ N. Scale N linearly.
+[[grayscott_lifetimes]]
+T = "Float32"
+gpus = [1, 2, 4, 8]
+cpus = 2
+N = 1024
+M = [1024, 2048, 4096, 8192]
+
+#################################
+#   Monte-Carlo Integration     #
+#  Work ~ N. Scale N linearly   #
+#################################
+
 [[montecarlo]]
 T = "Float32"
 gpus = [1, 2, 4, 8]
diff --git a/benchmark/run.jl b/benchmark/run.jl
index 039306a7..89d6d28b 100644
--- a/benchmark/run.jl
+++ b/benchmark/run.jl
@@ -2,7 +2,7 @@
 # dispatches it; the script sets LEGATE_CONFIG (from --gpus/--cpus) before
 # launching the worker (single.jl) that actually runs the benchmark.
 #   no args   -> one command per benchmarks.toml entry
-#   with args -> one command from <gpus> <cpus> <name> <T> <N> <M> <iter> <warmup> <trial> [variant]
+#   with args -> one command from <gpus> <cpus> <name> <T> <N> <M> <iter> <warmup> <trial>
 
 # Orchestrator stays off the GPU: it only needs GlobalSettings + parse_config,
 # both cuNumeric-free. The worker (single.jl) loads cuNumeric and the kernels.
@@ -14,14 +14,14 @@ const WORKER = joinpath(@__DIR__, "src/single.jl")
 
 banner(msg) = println("\n", "="^128, "\n", msg, "\n", "="^128)
 
-function dispatch(; gpus, cpus, name, T, variant, N, M, n_iter, n_warmup, n_trial)
+function dispatch(; gpus, cpus, name, T, N, M, n_iter, n_warmup, n_trial)
     # Name validity is checked in the worker (single.jl), which owns the registry.
     banner(
-        "$(name) [$(variant)]: T=$(T) gpus=$(gpus) cpus=$(cpus) N=$(N) M=$(M) " *
+        "$(name): T=$(T) gpus=$(gpus) cpus=$(cpus) N=$(N) M=$(M) " *
         "n_iter=$(n_iter) n_warmup=$(n_warmup) n_trial=$(n_trial)",
     )
 
-    cmd = `bash $RUNNER $WORKER --gpus $gpus --cpus $cpus $name $T $N $M $n_iter $n_warmup $n_trial $variant`
+    cmd = `bash $RUNNER $WORKER --gpus $gpus --cpus $cpus $name $T $N $M $n_iter $n_warmup $n_trial`
     try
         run(cmd)
     catch e
@@ -34,9 +34,14 @@ function run_all_benchmarks(config="benchmarks.toml")
     for spec in specs
         N, M = spec.args
         dispatch(;
-            gpus=spec.gpus, cpus=spec.cpus, name=spec.name, T=spec.T,
-            variant=spec.variant, N=N, M=M,
-            n_iter=gs.n_iter, n_warmup=gs.n_warmup, n_trial=gs.n_trial,
+            gpus=spec.gpus,
+            cpus=spec.cpus,
+            name=spec.name,
+            T=spec.T,
+            N=N, M=M,
+            n_iter=gs.n_iter,
+            n_warmup=gs.n_warmup,
+            n_trial=gs.n_trial,
         )
     end
 end
@@ -45,9 +50,14 @@ if isempty(ARGS)
     run_all_benchmarks()
 else # dispatch on args
     dispatch(;
-        gpus=parse(Int, ARGS[1]), cpus=parse(Int, ARGS[2]), name=ARGS[3], T=ARGS[4],
-        N=parse(Int, ARGS[5]), M=parse(Int, ARGS[6]),
-        n_iter=parse(Int, ARGS[7]), n_warmup=parse(Int, ARGS[8]), n_trial=parse(Int, ARGS[9]),
-        variant=(length(ARGS) >= 10 ? ARGS[10] : "baseline"),
+        gpus=parse(Int, ARGS[1]),
+        cpus=parse(Int, ARGS[2]),
+        name=ARGS[3],
+        T=ARGS[4],
+        N=parse(Int, ARGS[5]),
+        M=parse(Int, ARGS[6]),
+        n_iter=parse(Int, ARGS[7]),
+        n_warmup=parse(Int, ARGS[8]),
+        n_trial=parse(Int, ARGS[9]),
     )
 end
diff --git a/benchmark/src/benchmarks.jl b/benchmark/src/benchmarks.jl
deleted file mode 100644
index 794068fa..00000000
--- a/benchmark/src/benchmarks.jl
+++ /dev/null
@@ -1,5 +0,0 @@
-# Adding a benchmark is: drop a file in benchmarks/ and include it below.
-include("core.jl")
-include("benchmarks/gemm.jl")
-include("benchmarks/grayscott.jl")
-include("benchmarks/montecarlo.jl")
diff --git a/benchmark/src/benchmarks/gemm.jl b/benchmark/src/benchmarks/gemm.jl
index 09ff0c57..e4939df8 100644
--- a/benchmark/src/benchmarks/gemm.jl
+++ b/benchmark/src/benchmarks/gemm.jl
@@ -3,7 +3,7 @@ Base.@kwdef struct GEMM{T} <: AbstractBenchmark{T}
     M::Int
 end
 
-name(::GEMM) = "sgemm"
+name(::GEMM) = "gemm"
 dims(g::GEMM) = (g.N, g.M)
 data(g::GEMM{T}) where {T} = "GEMM with T=$(T), N=$(g.N), M=$(g.M)"
 
@@ -24,4 +24,4 @@ end
 
 run!(::GEMM, C, A, B) = mul!(C, A, B)
 
-register_benchmark("sgemm", GEMM)
+register_benchmark("gemm", GEMM)
diff --git a/benchmark/src/benchmarks/grayscott.jl b/benchmark/src/benchmarks/grayscott.jl
index 0a7eb02d..b4c60306 100644
--- a/benchmark/src/benchmarks/grayscott.jl
+++ b/benchmark/src/benchmarks/grayscott.jl
@@ -11,20 +11,26 @@ function GSParams{T}(; dx=1, c_u=1.0, c_v=0.3, f=0.03, k=0.06) where {T}
     GSParams{T}(T(dx), T(dx / 5), T(c_u), T(c_v), T(f), T(k))
 end
 
-Base.@kwdef struct GrayScott{T} <: AbstractBenchmark{T}
+abstract type AbstractGrayScott{T} <: AbstractBenchmark{T} end
+
+Base.@kwdef struct GrayScottBaseline{T} <: AbstractGrayScott{T}
     N::Int
     M::Int
-    variant::Symbol = :baseline
 end
 
-name(::GrayScott) = "grayscott"
-dims(b::GrayScott) = (b.N, b.M)
-data(b::GrayScott{T}) where {T} = "GrayScott with T=$(T), N=$(b.N), M=$(b.M)"
-allowed_types(::Type{GrayScott}) = cuNumeric.SUPPORTED_FLOAT_TYPES
-total_flops(b::GrayScott) = b.N * b.M # grid points updated per step
+Base.@kwdef struct GrayScottLifetimes{T} <: AbstractGrayScott{T}
+    N::Int
+    M::Int
+end
 
-function build_benchmark(::Type{GrayScott}, ::Type{T}, N, M, variant) where {T}
-    GrayScott{T}(; N=N, M=M, variant=Symbol(variant))
+name(::AbstractGrayScott) = "grayscott"
+dims(b::AbstractGrayScott) = (b.N, b.M)
+data(b::AbstractGrayScott{T}) where {T} = "GrayScott with T=$(T), N=$(b.N), M=$(b.M)"
+allowed_types(::Type{AbstractGrayScott}) = cuNumeric.SUPPORTED_FLOAT_TYPES
+total_flops(b::AbstractGrayScott) = b.N * b.M # grid points updated per step
+
+function build_benchmark(::Type{A}, ::Type{T}, N, M) where {A<:AbstractGrayScott,T}
+    A{T}(; N=N, M=M)
 end
 
 mutable struct GrayScottState{A,P}
@@ -35,7 +41,7 @@ mutable struct GrayScottState{A,P}
     params::P
 end
 
-function initialize(b::GrayScott{T}) where {T}
+function initialize(b::AbstractGrayScott{T}) where {T}
     d = (b.N, b.M)
     u = cuNumeric.ones(T, d)
     v = cuNumeric.zeros(T, d)
@@ -104,22 +110,17 @@ let body = quote
         v_new[1, :] = v[end - 1, :]
         v_new[end, :] = v[2, :]
     end
-    @eval _gs_step!(::Val{:baseline}, u, v, u_new, v_new, args::GSParams) = $body
-    @eval _gs_step!(::Val{:lifetimes}, u, v, u_new, v_new, args::GSParams) = @analyze_lifetimes $body
-end
-
-# Variants not special-cased (e.g. testing fusion) run the baseline path.
-function _gs_step!(::Val, u, v, u_new, v_new, args::GSParams)
-    _gs_step!(Val(:baseline), u, v, u_new, v_new, args)
+    @eval _gs_step!(b::GrayScottBaseline, u, v, u_new, v_new, args::GSParams) = $body
+    @eval _gs_step!(b::GrayScottLifetimes, u, v, u_new, v_new, args::GSParams) = @analyze_lifetimes $body
 end
 
-function run!(b::GrayScott, st::GrayScottState)
-    _gs_step!(Val(b.variant), st.u, st.v, st.u_new, st.v_new, st.params)
+function run!(b::AbstractGrayScott, st::GrayScottState)
+    _gs_step!(b, st.u, st.v, st.u_new, st.v_new, st.params)
     # swap references rather than copy
     st.u, st.u_new = st.u_new, st.u
     st.v, st.v_new = st.v_new, st.v
     return nothing
 end
 
-register_variant("lifetimes")
-register_benchmark("grayscott", GrayScott)
+register_benchmark("grayscott_baseline", GrayScottBaseline)
+register_benchmark("grayscott_lifetimes", GrayScottLifetimes)
diff --git a/benchmark/src/core.jl b/benchmark/src/core.jl
index b87c9fd0..698596ac 100644
--- a/benchmark/src/core.jl
+++ b/benchmark/src/core.jl
@@ -47,32 +47,6 @@ end
 
 #########################################
 
-# `setup` runs in the worker before the benchmark is built (e.g. flip a runtime
-# preference); code-path variants leave it a no-op.
-struct Variant
-    name::String
-    setup::Function
-end
-
-const VARIANTS = Dict{String,Variant}()
-
-function register_variant(name, setup=() -> nothing)
-    VARIANTS[name] = Variant(name, setup)
-end
-
-function variant_setup(name)
-    if haskey(VARIANTS, name)
-        return VARIANTS[name].setup
-    end
-    return () -> nothing
-end
-
-register_variant("baseline")
-# register_variant("fusion_off", cuNumeric.CNPreferences.disable_broadcast_fusion!)
-# register_variant("fusion_on",  cuNumeric.CNPreferences.enable_broadcast_fusion!)
-
-#########################################
-
 # Per-trial timings for one benchmark. `times_ms[i]`/`gflops[i]` are the mean
 # over `n_iter` iterations for trial `i`; the spread across trials gives stddev.
 struct BenchmarkResult{B<:AbstractBenchmark}
@@ -86,7 +60,6 @@ function _trial(b::AbstractBenchmark, gs::GlobalSettings)
     GC.gc(true)
     state = initialize(b)
 
-    start_time = zero(get_time_microseconds())
     for idx in 1:(gs.n_warmup + gs.n_iter)
         if idx == gs.n_warmup + 1
             start_time = get_time_microseconds()
@@ -114,7 +87,7 @@ end
 
 _std(x) = length(x) > 1 ? std(x) : 0.0
 
-function save_result(br::BenchmarkResult, gpus, variant)
+function save_result(br::BenchmarkResult, gpus)
     N, M = dims(br.benchmark)
     path = joinpath(@__DIR__, "..", "results", "$(name(br.benchmark)).csv")
     mkpath(dirname(path))
@@ -122,9 +95,35 @@ function save_result(br::BenchmarkResult, gpus, variant)
         for trial in eachindex(br.times_ms)
             @printf(
                 io, "%s,%s,%d,%d,%d,%d,%.6f,%.6f\n",
-                "cunumeric", variant, gpus, N, M, trial,
+                "cunumeric", gpus, N, M, trial,
                 br.times_ms[trial], br.gflops[trial],
             )
         end
     end
 end
+
+#########################################
+
+# `setup` runs in the worker before the benchmark is built (e.g. flip a runtime
+# preference); code-path variants leave it a no-op.
+# struct Variant
+#     name::String
+#     setup::Function
+# end
+
+# const VARIANTS = Dict{String,Variant}()
+
+# function register_variant(name, setup=() -> nothing)
+#     VARIANTS[name] = Variant(name, setup)
+# end
+
+# function variant_setup(name)
+#     if haskey(VARIANTS, name)
+#         return VARIANTS[name].setup
+#     end
+#     return () -> nothing
+# end
+
+# register_variant("baseline")
+# register_variant("fusion_off", cuNumeric.CNPreferences.disable_broadcast_fusion!)
+# register_variant("fusion_on",  cuNumeric.CNPreferences.enable_broadcast_fusion!)
diff --git a/benchmark/src/parse_benchmarks.jl b/benchmark/src/parse_benchmarks.jl
index 7144097c..f5211b5a 100644
--- a/benchmark/src/parse_benchmarks.jl
+++ b/benchmark/src/parse_benchmarks.jl
@@ -3,15 +3,14 @@ using TOML
 """
 One benchmark invocation parsed from `benchmarks.toml`. `name` selects the
 benchmark type from `BENCHMARKS`; `T` is the element type (e.g. "Float32");
-`variant` names the run variant (e.g. "baseline", "lifetimes"); `args` are the
-sizes (currently `N M`).
+`args` are the sizes (currently `N M`).
 """
 struct BenchmarkSpec
     name::String
     T::String
-    variant::String
     gpus::Int
     cpus::Int
+    fusion::Bool
     args::Vector{Int}
 end
 
@@ -46,20 +45,19 @@ function parse_config(path)
         name == "Global" && continue
         for e in entries
             types = aslist(get(e, "T", "Float32"))
-            variants = aslist(get(e, "variants", "baseline"))
             gpus = aslist(e["gpus"])
             cpus = aslist(e["cpus"])
+            # fusion = get(e, "fusion", true)
             N = aslist(e["N"])
             M = aslist(get(e, "M", 1))
 
             n = sweep_length(name, ["gpus" => gpus, "cpus" => cpus, "N" => N, "M" => M])
 
-            # `T` and `variants` multiply; gpus/cpus/N/M zip into the sweep.
-            for T in types, variant in variants, i in 1:n
+            for T in types for i in 1:n
                 push!(
                     specs,
                     BenchmarkSpec(
-                        name, T, variant, sweep_value(gpus, i), sweep_value(cpus, i),
+                        name, T, sweep_value(gpus, i), sweep_value(cpus, i),
                         [sweep_value(N, i), sweep_value(M, i)],
                     ),
                 )
diff --git a/benchmark/src/single.jl b/benchmark/src/single.jl
index 86b09f78..e4ee7cdb 100644
--- a/benchmark/src/single.jl
+++ b/benchmark/src/single.jl
@@ -5,29 +5,30 @@
 using cuNumeric
 using LinearAlgebra
 
-include("benchmarks.jl")
+include("core.jl")
+const BENCHMARK_DIR = joinpath(@__DIR__, "benchmarks")
+include.(filter(contains(r".jl$"), readdir(BENCHMARK_DIR; join=true)))
 
 # Resolve a TOML type string like "Float32" to the actual Julia type.
 parse_type(s) = getfield(Base, Symbol(s))::DataType
 
-function run_single(gpus, name, T_str, N, M, n_iter, n_warmup, n_trial, variant)
+function run_single(gpus, name, T_str, N, M, n_iter, n_warmup, n_trial)
     haskey(BENCHMARKS, name) || error(
         "No benchmark registered for '$(name)'. Known: $(join(sort(collect(keys(BENCHMARKS))), ", "))"
     )
     T = parse_type(T_str)
-    variant_setup(variant)() # apply any pre-run setup (e.g. flip a runtime preference)
-    b = build_benchmark(BENCHMARKS[name], T, N, M, variant)
+    b = build_benchmark(BENCHMARKS[name], T, N, M)
     gs = GlobalSettings(; n_warmup=n_warmup, n_iter=n_iter, n_trial=n_trial)
 
     println(
-        "[cuNumeric] $(name) [$(variant)] benchmark ($(T)) on $(N)x$(M) for $(n_iter) " *
+        "[cuNumeric] $(name) benchmark ($(T)) on $(N)x$(M) for $(n_iter) " *
         "iterations ($(n_warmup) warmup) x $(n_trial) trials",
     )
     br = run_benchmark(b, gs)
     @printf("[cuNumeric] Mean Run Time: %.5f ± %.5f ms\n", mean(br.times_ms), _std(br.times_ms))
     @printf("[cuNumeric] FLOPS: %.5f ± %.5f GFLOPS\n", mean(br.gflops), _std(br.gflops))
 
-    save_result(br, gpus, variant)
+    save_result(br, gpus)
 end
 
 gpus = parse(Int, ARGS[1])
@@ -38,5 +39,4 @@ M = parse(Int, ARGS[5])
 n_iter = parse(Int, ARGS[6])
 n_warmup = parse(Int, ARGS[7])
 n_trial = parse(Int, ARGS[8])
-variant = ARGS[9]
-run_single(gpus, bench_name, T_str, N, M, n_iter, n_warmup, n_trial, variant)
+run_single(gpus, bench_name, T_str, N, M, n_iter, n_warmup, n_trial)

From 866892d35b4a42458db88af620eb5db34f148949 Mon Sep 17 00:00:00 2001
From: ejmeitz <emeitz@andrew.cmu.edu>
Date: Wed, 3 Jun 2026 11:43:24 -0500
Subject: [PATCH 12/17] setup things to use CUDACore too

---
 benchmark/run_benchmark.sh             |  2 ++
 benchmark/src/benchmarks/gemm.jl       |  8 ++++----
 benchmark/src/benchmarks/grayscott.jl  | 14 +++++++-------
 benchmark/src/benchmarks/montecarlo.jl |  4 ++--
 benchmark/src/core.jl                  | 18 +++++++++++-------
 benchmark/src/single.jl                | 16 ++++++++++++++++
 6 files changed, 42 insertions(+), 20 deletions(-)

diff --git a/benchmark/run_benchmark.sh b/benchmark/run_benchmark.sh
index ef54dfa5..752eecde 100755
--- a/benchmark/run_benchmark.sh
+++ b/benchmark/run_benchmark.sh
@@ -55,6 +55,8 @@ export LD_LIBRARY_PATH=""
 
 echo "Running $FILENAME with $CPUS CPUs and $GPUS GPUs"
 
+eval "julia --project -e 'using Pkg; Pkg.dev(\"..\"); Pkg.instantiate()'"
+
 CMD="julia --project='..' $FILENAME $GPUS ${EXTRA_ARGS[@]}"
 
 printf "Running: %s\n" "$CMD"
diff --git a/benchmark/src/benchmarks/gemm.jl b/benchmark/src/benchmarks/gemm.jl
index e4939df8..a4356792 100644
--- a/benchmark/src/benchmarks/gemm.jl
+++ b/benchmark/src/benchmarks/gemm.jl
@@ -14,10 +14,10 @@ end
 total_flops(s::GEMM) = s.N * s.N * ((2*s.M) - 1)
 total_space(s::GEMM{T}) where {T} = 2 * ((s.N*s.M) * sizeof(T)) + ((s.N*s.N) * sizeof(T))
 
-function initialize(s::GEMM{T}) where {T}
-    A = cuNumeric.rand(T, s.N, s.M)
-    B = cuNumeric.rand(T, s.M, s.N)
-    C = cuNumeric.zeros(T, s.N, s.N)
+function initialize(s::GEMM{T}; mod=cuNumeric) where {T}
+    A = mod.rand(T, s.N, s.M)
+    B = mod.rand(T, s.M, s.N)
+    C = mod.zeros(T, s.N, s.N)
     GC.gc()
     return C, A, B
 end
diff --git a/benchmark/src/benchmarks/grayscott.jl b/benchmark/src/benchmarks/grayscott.jl
index b4c60306..a2d51315 100644
--- a/benchmark/src/benchmarks/grayscott.jl
+++ b/benchmark/src/benchmarks/grayscott.jl
@@ -41,16 +41,16 @@ mutable struct GrayScottState{A,P}
     params::P
 end
 
-function initialize(b::AbstractGrayScott{T}) where {T}
+function initialize(b::AbstractGrayScott{T}; mod=cuNumeric) where {T}
     d = (b.N, b.M)
-    u = cuNumeric.ones(T, d)
-    v = cuNumeric.zeros(T, d)
-    u_new = cuNumeric.zeros(T, d)
-    v_new = cuNumeric.zeros(T, d)
+    u = mod.ones(T, d)
+    v = mod.zeros(T, d)
+    u_new = mod.zeros(T, d)
+    v_new = mod.zeros(T, d)
 
     seed = min(150, b.N, b.M)
-    u[1:seed, 1:seed] = cuNumeric.rand(T, (seed, seed))
-    v[1:seed, 1:seed] = cuNumeric.rand(T, (seed, seed))
+    u[1:seed, 1:seed] = mod.rand(T, (seed, seed))
+    v[1:seed, 1:seed] = mod.rand(T, (seed, seed))
 
     return (GrayScottState(u, v, u_new, v_new, GSParams{T}()),)
 end
diff --git a/benchmark/src/benchmarks/montecarlo.jl b/benchmark/src/benchmarks/montecarlo.jl
index 0b5175f6..f82ebba6 100644
--- a/benchmark/src/benchmarks/montecarlo.jl
+++ b/benchmark/src/benchmarks/montecarlo.jl
@@ -13,9 +13,9 @@ allowed_types(::Type{MonteCarloIntegration}) = cuNumeric.SUPPORTED_FLOAT_TYPES
 total_space(s::MonteCarloIntegration{T}) where {T} = s.n_samples * sizeof(T)
 total_flops(s::MonteCarloIntegration) = s.n_samples
 
-function initialize(mci::MonteCarloIntegration{T}) where {T}
+function initialize(mci::MonteCarloIntegration{T}; mod=cuNumeric) where {T}
     # Uniform samples over the integration domain [0, 10].
-    x = T(10) .* cuNumeric.rand(T, mci.n_samples)
+    x = T(10) .* mod.rand(T, mci.n_samples)
     GC.gc()
     return (x,)
 end
diff --git a/benchmark/src/core.jl b/benchmark/src/core.jl
index 698596ac..f8aab2c9 100644
--- a/benchmark/src/core.jl
+++ b/benchmark/src/core.jl
@@ -56,9 +56,9 @@ struct BenchmarkResult{B<:AbstractBenchmark}
 end
 
 # One timed trial: warmup, then time `n_iter` iterations of `run!`.
-function _trial(b::AbstractBenchmark, gs::GlobalSettings)
+function _trial(b::AbstractBenchmark, gs::GlobalSettings; mod=cuNumeric)
     GC.gc(true)
-    state = initialize(b)
+    state = initialize(b; mod=mod)
 
     for idx in 1:(gs.n_warmup + gs.n_iter)
         if idx == gs.n_warmup + 1
@@ -74,11 +74,15 @@ function _trial(b::AbstractBenchmark, gs::GlobalSettings)
 end
 
 # Run `n_trial` independent trials and collect their per-trial measurements.
-function run_benchmark(b::AbstractBenchmark, gs::GlobalSettings)
+function run_benchmark(b::AbstractBenchmark, gs::GlobalSettings; mod=cuNumeric)
+
+    # Can only test CUDA.jl performance with 1 GPU
+    (mod == CUDACore && gs.n_gpu == 1) || continue
+
     times_ms = Float64[]
     gflops = Float64[]
     for _ in 1:gs.n_trial
-        t, g = _trial(b, gs)
+        t, g = _trial(b, gs; mod=mod)
         push!(times_ms, t)
         push!(gflops, g)
     end
@@ -87,15 +91,15 @@ end
 
 _std(x) = length(x) > 1 ? std(x) : 0.0
 
-function save_result(br::BenchmarkResult, gpus)
+function save_result(br::BenchmarkResult, gpus; mod::String="cunumeric")
     N, M = dims(br.benchmark)
-    path = joinpath(@__DIR__, "..", "results", "$(name(br.benchmark)).csv")
+    path = joinpath(@__DIR__, "..", "results", "$(name(br.benchmark))_$(mod).csv")
     mkpath(dirname(path))
     open(path, "a") do io
         for trial in eachindex(br.times_ms)
             @printf(
                 io, "%s,%s,%d,%d,%d,%d,%.6f,%.6f\n",
-                "cunumeric", gpus, N, M, trial,
+                mod, gpus, N, M, trial,
                 br.times_ms[trial], br.gflops[trial],
             )
         end
diff --git a/benchmark/src/single.jl b/benchmark/src/single.jl
index e4ee7cdb..eb24ae38 100644
--- a/benchmark/src/single.jl
+++ b/benchmark/src/single.jl
@@ -3,6 +3,7 @@
 # Args: <gpus> <name> <T> <N> <M> <n_iter> <n_warmup> <n_trial> <variant>
 
 using cuNumeric
+using CUDACore
 using LinearAlgebra
 
 include("core.jl")
@@ -29,6 +30,21 @@ function run_single(gpus, name, T_str, N, M, n_iter, n_warmup, n_trial)
     @printf("[cuNumeric] FLOPS: %.5f ± %.5f GFLOPS\n", mean(br.gflops), _std(br.gflops))
 
     save_result(br, gpus)
+
+    # Run CUDA.jl benchmark
+    if gpus == 1
+        println(
+            "[CUDA.jl] $(name) benchmark ($(T)) on $(N)x$(M) for $(n_iter) " *
+            "iterations ($(n_warmup) warmup) x $(n_trial) trials",
+        )
+
+        br = run_benchmark(b, gs; mod=CUDACore)
+
+        @printf("[CUDA.jl] Mean Run Time: %.5f ± %.5f ms\n", mean(br.times_ms), _std(br.times_ms))
+        @printf("[CUDA.jl] FLOPS: %.5f ± %.5f GFLOPS\n", mean(br.gflops), _std(br.gflops))
+
+        save_result(br, gpus; mod="CUDA.jl")
+    end
 end
 
 gpus = parse(Int, ARGS[1])

From 00ed6bab794c80b3ad25df5df51df1d615ccb487 Mon Sep 17 00:00:00 2001
From: ejmeitz <emeitz@andrew.cmu.edu>
Date: Wed, 3 Jun 2026 12:13:45 -0500
Subject: [PATCH 13/17] kinda runs

---
 Project.toml                      |  8 +++-----
 benchmark/Project.toml            |  2 +-
 benchmark/README.md               | 31 +------------------------------
 benchmark/run_benchmark.sh        |  4 ++--
 benchmark/src/core.jl             | 11 +++--------
 benchmark/src/parse_benchmarks.jl |  8 +++++---
 6 files changed, 15 insertions(+), 49 deletions(-)

diff --git a/Project.toml b/Project.toml
index c898d6b4..feca11f0 100644
--- a/Project.toml
+++ b/Project.toml
@@ -2,12 +2,8 @@ name = "cuNumeric"
 uuid = "0fd9ffd4-7e84-4cd0-b8f8-645bd8c73620"
 version = "0.1.1"
 
-[workspace]
-projects = ["test", "dev"]
-
 [deps]
 CNPreferences = "3e078157-ea10-49d5-bf32-908f777cd46f"
-CUDA_SDK_jll = "6cbf2f2e-7e60-5632-ac76-dca2274e0be0"
 CxxWrap = "1f15a43c-97ca-5a2a-ae31-89f07a497df4"
 JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899"
 Legate = "1238f2cf-6593-4d60-9aca-2f5364e49909"
@@ -33,7 +29,6 @@ CUDAExt = "CUDA"
 [compat]
 CNPreferences = "0.1.2"
 CUDA = "5.9"
-CUDA_SDK_jll = "13"
 CxxWrap = "0.17"
 JuliaFormatter = "2.3.0"
 Legate = "0.1.2"
@@ -47,3 +42,6 @@ StatsBase = "0.34"
 cunumeric_jl_wrapper_jll = "25.10.3"
 cupynumeric_jll = "25.10.3"
 julia = "1.10"
+
+[workspace]
+projects = ["test", "dev"]
diff --git a/benchmark/Project.toml b/benchmark/Project.toml
index 71a488f9..62eb4c27 100644
--- a/benchmark/Project.toml
+++ b/benchmark/Project.toml
@@ -1,8 +1,8 @@
 [deps]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
-CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 CUDACore = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
+Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
 cuNumeric = "0fd9ffd4-7e84-4cd0-b8f8-645bd8c73620"
diff --git a/benchmark/README.md b/benchmark/README.md
index 762bffc6..c2345ad5 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -22,11 +22,9 @@ Repeat a `[[name]]` block to add independent configs.
 
 ## Lists
 
-Any of `T`, `variants`, `gpus`, `cpus`, `N`, `M` may be a list. They expand along
+Any of `T`, `gpus`, `cpus`, `N`, `M` may be a list. They expand along
 two axes:
 
-- **`T` and `variants` multiply.** The whole sweep runs once per type and once
-  per variant.
 - **`gpus`, `cpus`, `N`, `M` zip** into a single lockstep sweep — element `i`
   of each is paired together.
 
@@ -53,30 +51,3 @@ M    = [150, 300, 600]          #
 When `T = ["Float32", "Float64"]` and a length-2 `N`/`M` sweep you get all **4**
 combinations, not a paired `Float32 -> N[1], Float64 -> N[2]`. To pin a type
 to a specific size, use separate `[[name]]` blocks.
-
-## Variants
-
-A variant is a named way of running a benchmark. List them per entry with
-`variants = [...]` (defaults to `["baseline"]`); they multiply like `T`, and the
-chosen variant is recorded as a column in the results CSV so runs can be compared.
-
-```toml
-[[grayscott]]
-T = "Float64"
-N = 1024
-M = [1024, 2048, 4096]
-gpus = [1, 2, 4]
-cpus = 2
-variants = ["baseline", "lifetimes"]   # 2 variants * 3 sweep points = 6 runs
-```
-
-There are two kinds, both flowing through the same `variant` string:
-
-- **Code-path variants** change what the worker runs. The benchmark's `run!`
-  dispatches on the variant. Example: grayscott's `lifetimes` wraps the step in
-  `@analyze_lifetimes` (see `src/benchmarks/grayscott.jl`). A benchmark that
-  doesn't recognize a variant just runs its baseline path.
-- **Process-level variants** flip a runtime setting before the run via a setup
-  thunk registered in `register_variant` (`src/core.jl`). The worker calls it at
-  startup. Broadcast fusion will plug in here once it lands, e.g.
-  `register_variant("fusion_off", cuNumeric.CNPreferences.disable_broadcast_fusion!)`.
diff --git a/benchmark/run_benchmark.sh b/benchmark/run_benchmark.sh
index 752eecde..4e50f8d0 100755
--- a/benchmark/run_benchmark.sh
+++ b/benchmark/run_benchmark.sh
@@ -55,9 +55,9 @@ export LD_LIBRARY_PATH=""
 
 echo "Running $FILENAME with $CPUS CPUs and $GPUS GPUs"
 
-eval "julia --project -e 'using Pkg; Pkg.dev(\"..\"); Pkg.instantiate()'"
+eval "julia --project -e 'using Pkg; Pkg.develop(path=\"..\"); Pkg.instantiate()'"
 
-CMD="julia --project='..' $FILENAME $GPUS ${EXTRA_ARGS[@]}"
+CMD="julia --project $FILENAME $GPUS ${EXTRA_ARGS[@]}"
 
 printf "Running: %s\n" "$CMD"
 eval "$CMD"
diff --git a/benchmark/src/core.jl b/benchmark/src/core.jl
index f8aab2c9..33c4a4ef 100644
--- a/benchmark/src/core.jl
+++ b/benchmark/src/core.jl
@@ -39,9 +39,7 @@ function register_benchmark(key::AbstractString, ::Type{B}) where {B<:AbstractBe
     BENCHMARKS[key] = B
 end
 
-# Default uses (N, M); benchmarks with a code-path variant or different arity
-# override this (see grayscott.jl / montecarlo.jl).
-function build_benchmark(::Type{B}, ::Type{T}, N, M, variant) where {B<:AbstractBenchmark,T}
+function build_benchmark(::Type{B}, ::Type{T}, N, M) where {B<:AbstractBenchmark,T}
     B{T}(; N=N, M=M)
 end
 
@@ -60,6 +58,7 @@ function _trial(b::AbstractBenchmark, gs::GlobalSettings; mod=cuNumeric)
     GC.gc(true)
     state = initialize(b; mod=mod)
 
+    start_time = nothing
     for idx in 1:(gs.n_warmup + gs.n_iter)
         if idx == gs.n_warmup + 1
             start_time = get_time_microseconds()
@@ -75,10 +74,6 @@ end
 
 # Run `n_trial` independent trials and collect their per-trial measurements.
 function run_benchmark(b::AbstractBenchmark, gs::GlobalSettings; mod=cuNumeric)
-
-    # Can only test CUDA.jl performance with 1 GPU
-    (mod == CUDACore && gs.n_gpu == 1) || continue
-
     times_ms = Float64[]
     gflops = Float64[]
     for _ in 1:gs.n_trial
@@ -98,7 +93,7 @@ function save_result(br::BenchmarkResult, gpus; mod::String="cunumeric")
     open(path, "a") do io
         for trial in eachindex(br.times_ms)
             @printf(
-                io, "%s,%s,%d,%d,%d,%d,%.6f,%.6f\n",
+                io, "%s,%d,%d,%d,%d,%.6f,%.6f\n",
                 mod, gpus, N, M, trial,
                 br.times_ms[trial], br.gflops[trial],
             )
diff --git a/benchmark/src/parse_benchmarks.jl b/benchmark/src/parse_benchmarks.jl
index f5211b5a..3b7c21df 100644
--- a/benchmark/src/parse_benchmarks.jl
+++ b/benchmark/src/parse_benchmarks.jl
@@ -10,7 +10,6 @@ struct BenchmarkSpec
     T::String
     gpus::Int
     cpus::Int
-    fusion::Bool
     args::Vector{Int}
 end
 
@@ -53,11 +52,14 @@ function parse_config(path)
 
             n = sweep_length(name, ["gpus" => gpus, "cpus" => cpus, "N" => N, "M" => M])
 
-            for T in types for i in 1:n
+            for T in types, i in 1:n
                 push!(
                     specs,
                     BenchmarkSpec(
-                        name, T, sweep_value(gpus, i), sweep_value(cpus, i),
+                        name,
+                        T,
+                        sweep_value(gpus, i),
+                        sweep_value(cpus, i),
                         [sweep_value(N, i), sweep_value(M, i)],
                     ),
                 )

From 92d8ac367a8191b6a15af3799faa794e4a0fcdaa Mon Sep 17 00:00:00 2001
From: krasow <krasow@u.northwestern.edu>
Date: Wed, 3 Jun 2026 19:34:21 -0500
Subject: [PATCH 14/17] add python benchmarks, install_cupynumeric.sh, and a
 python benchmark runner. run.jl will construct and orchestrate the python
 runs as well.

---
 .gitignore                                |  3 +
 benchmark/README.md                       | 25 ++++++++
 benchmark/benchmarks.toml                 |  2 +
 benchmark/install_cupynumeric.sh          | 73 +++++++++++++++++++++++
 benchmark/run.jl                          | 52 +++++++++++++---
 benchmark/run_benchmark.sh                | 19 +++++-
 benchmark/src/benchmarks/montecarlo.jl    |  4 +-
 benchmark/src/core.jl                     |  2 +
 benchmark/src/parse_benchmarks.jl         | 21 ++++++-
 benchmark/src/single.jl                   | 48 +++++++--------
 benchmark/src_py/benchmarks/__init__.py   |  8 +++
 benchmark/src_py/benchmarks/gemm.py       | 29 +++++++++
 benchmark/src_py/benchmarks/grayscott.py  | 71 ++++++++++++++++++++++
 benchmark/src_py/benchmarks/montecarlo.py | 28 +++++++++
 benchmark/src_py/core.py                  | 57 ++++++++++++++++++
 benchmark/src_py/single.py                | 48 +++++++++++++++
 16 files changed, 449 insertions(+), 41 deletions(-)
 create mode 100755 benchmark/install_cupynumeric.sh
 create mode 100644 benchmark/src_py/benchmarks/__init__.py
 create mode 100644 benchmark/src_py/benchmarks/gemm.py
 create mode 100644 benchmark/src_py/benchmarks/grayscott.py
 create mode 100644 benchmark/src_py/benchmarks/montecarlo.py
 create mode 100644 benchmark/src_py/core.py
 create mode 100644 benchmark/src_py/single.py

diff --git a/.gitignore b/.gitignore
index c2af1d47..29f3d39d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,9 @@ help.sh
 docker.log
 docs/package-lock.json
 
+__pycache__
+*.pyc
+
 # auto-generated script
 build_wrapper.sh
 
diff --git a/benchmark/README.md b/benchmark/README.md
index c2345ad5..bd646666 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -2,6 +2,31 @@
 
 Benchmarks are declared in `benchmarks.toml`. `run.jl` parses it.
 
+## Running
+
+```bash
+julia --project run.jl     # runs whatever benchmarks.toml configures
+```
+
+`run.jl` runs each (benchmark, backend) pair in its own process via
+`run_benchmark.sh`, so backends never share a GPU/runtime within a measurement.
+cuNumeric always runs; extra comparison backends are toggled in `[Global]`:
+
+- `cuda = true` → also run under CUDA.jl (single-GPU configs only; CUDA.jl is
+  single-device).
+- `cupynumeric = true` → also run under cupynumeric (see below).
+
+### Comparing against cupynumeric
+
+cupynumeric runs in a conda env whose cupynumeric version matches the major.minor
+of this project's resolved `cupynumeric_jll`. Build it once:
+
+```bash
+./install_cupynumeric.sh   # creates env cupynumeric-bench-<major.minor>
+```
+
+`run.jl` derives the env name automatically; override it with `CUPYNUMERIC_ENV`.
+
 ## Layout
 
 ```toml
diff --git a/benchmark/benchmarks.toml b/benchmark/benchmarks.toml
index 855a3b8d..688ae9e2 100644
--- a/benchmark/benchmarks.toml
+++ b/benchmark/benchmarks.toml
@@ -2,6 +2,8 @@
 n_warmup = 5
 n_iter = 1000
 n_trial = 5
+cupynumeric = true # (needs install_cupynumeric.sh)
+cuda = false # compare against CUDA.jl (single-GPU configs only)
 
 ####################################
 #             GEMM                 #
diff --git a/benchmark/install_cupynumeric.sh b/benchmark/install_cupynumeric.sh
new file mode 100755
index 00000000..541a654c
--- /dev/null
+++ b/benchmark/install_cupynumeric.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+# Install a cupynumeric conda env matching the cupynumeric_jll our project resolves.
+# The conda package and the JLL share the calendar-versioning scheme (e.g. 25.10),
+# so we pin major.minor (patch ignored) and install from the legate channel.
+#
+# Usage:
+#   ./install_cupynumeric.sh                 # create a fresh env named cupynumeric-bench-<ver>
+#   ./install_cupynumeric.sh --name myenv    # override the env name
+#   ./install_cupynumeric.sh --into existing # install into an existing env instead of creating one
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+ENV_NAME=""
+INTO_ENV=""
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --name)
+            ENV_NAME=$2
+            shift 2
+            ;;
+        --into)
+            INTO_ENV=$2
+            shift 2
+            ;;
+        *)
+            echo "Unknown argument: $1"
+            echo "Usage: $0 [--name <env>] [--into <existing-env>]"
+            exit 1
+            ;;
+    esac
+done
+
+# Resolve the JLL version Julia actually instantiated for this project, then keep
+# major.minor only — conda packages are not published per patch.
+echo "Detecting cupynumeric_jll version from the benchmark project..."
+VER=$(cd "$SCRIPT_DIR" && julia --project -e '
+using Pkg
+for (_, info) in Pkg.dependencies()
+    info.name == "cupynumeric_jll" || continue
+    v = info.version
+    isnothing(v) && continue
+    println("$(v.major).$(v.minor)")
+end' | tail -1)
+
+if [[ -z "$VER" ]]; then
+    echo "Error: could not detect cupynumeric_jll version. Has the project been instantiated?"
+    exit 1
+fi
+
+echo "cupynumeric_jll major.minor: $VER"
+SPEC="cupynumeric=$VER.*"
+
+if [[ -n "$INTO_ENV" ]]; then
+    echo "Installing $SPEC into existing env '$INTO_ENV'..."
+    conda install -y -n "$INTO_ENV" -c conda-forge -c legate "$SPEC"
+    echo "Done. Activate with: conda activate $INTO_ENV"
+    exit 0
+fi
+
+[[ -z "$ENV_NAME" ]] && ENV_NAME="cupynumeric-bench-$VER"
+
+if conda env list | awk '{print $1}' | grep -qx "$ENV_NAME"; then
+    echo "Env '$ENV_NAME' already exists with $SPEC; nothing to do."
+    echo "Activate with: conda activate $ENV_NAME"
+    exit 0
+fi
+
+echo "Creating env '$ENV_NAME' with $SPEC..."
+conda create -y -n "$ENV_NAME" -c conda-forge -c legate "$SPEC"
+
+echo "Done. Activate with: conda activate $ENV_NAME"
diff --git a/benchmark/run.jl b/benchmark/run.jl
index 89d6d28b..ff40e21d 100644
--- a/benchmark/run.jl
+++ b/benchmark/run.jl
@@ -6,26 +6,61 @@
 
 # Orchestrator stays off the GPU: it only needs GlobalSettings + parse_config,
 # both cuNumeric-free. The worker (single.jl) loads cuNumeric and the kernels.
+
+using Pkg
+
 include("src/core.jl")
 include("src/parse_benchmarks.jl")
 
 const RUNNER = joinpath(@__DIR__, "run_benchmark.sh")
 const WORKER = joinpath(@__DIR__, "src/single.jl")
+const PY_WORKER = joinpath(@__DIR__, "src_py/single.py")
 
 banner(msg) = println("\n", "="^128, "\n", msg, "\n", "="^128)
 
-function dispatch(; gpus, cpus, name, T, N, M, n_iter, n_warmup, n_trial)
-    # Name validity is checked in the worker (single.jl), which owns the registry.
+# Ensure the parent package is develop'd and the project is instantiated.
+function ensure_project_ready()
+    Pkg.develop(; path=joinpath(@__DIR__, ".."))
+    Pkg.instantiate()
+end
+
+# default env name mirrors install_cupynumeric.sh: cupynumeric-bench-<major>.<minor>
+# CUPYNUMERIC_ENV overrides it.
+function cupynumeric_env_name()
+    haskey(ENV, "CUPYNUMERIC_ENV") && return ENV["CUPYNUMERIC_ENV"]
+    for (_, info) in Pkg.dependencies()
+        info.name == "cupynumeric_jll" || continue
+        info.version === nothing && continue
+        return "cupynumeric-bench-$(info.version.major).$(info.version.minor)"
+    end
+    error("could not resolve cupynumeric_jll version; set CUPYNUMERIC_ENV explicitly")
+end
+
+function dispatch(; gpus, cpus, name, T, N, M, n_iter, n_warmup, n_trial,
+    cupynumeric=false, cuda=false)
     banner(
         "$(name): T=$(T) gpus=$(gpus) cpus=$(cpus) N=$(N) M=$(M) " *
         "n_iter=$(n_iter) n_warmup=$(n_warmup) n_trial=$(n_trial)",
     )
 
-    cmd = `bash $RUNNER $WORKER --gpus $gpus --cpus $cpus $name $T $N $M $n_iter $n_warmup $n_trial`
-    try
-        run(cmd)
-    catch e
-        @error "Benchmark '$(name)' failed; continuing." exception = e
+    # each backend runs in its own worker process
+    args = `--gpus $gpus --cpus $cpus $name $T $N $M $n_iter $n_warmup $n_trial`
+    cmds = [`bash $RUNNER $WORKER $args cunumeric`]
+    # CUDA.jl is single-GPU only
+    if cuda && gpus == 1
+        push!(cmds, `bash $RUNNER $WORKER $args cudajl`)
+    end
+    # cupynumeric has no code-path variants; only baseline benchmarks compare against it
+    if cupynumeric && !endswith(name, "_lifetimes")
+        push!(cmds, `bash $RUNNER $PY_WORKER --pyenv $(cupynumeric_env_name()) $args`)
+    end
+
+    for cmd in cmds
+        try
+            run(cmd)
+        catch e
+            @error "Benchmark '$(name)' failed; continuing." exception = e
+        end
     end
 end
 
@@ -42,10 +77,13 @@ function run_all_benchmarks(config="benchmarks.toml")
             n_iter=gs.n_iter,
             n_warmup=gs.n_warmup,
             n_trial=gs.n_trial,
+            cupynumeric=gs.cupynumeric,
+            cuda=gs.cuda,
         )
     end
 end
 
+ensure_project_ready()
 if isempty(ARGS)
     run_all_benchmarks()
 else # dispatch on args
diff --git a/benchmark/run_benchmark.sh b/benchmark/run_benchmark.sh
index 4e50f8d0..b802f7bc 100755
--- a/benchmark/run_benchmark.sh
+++ b/benchmark/run_benchmark.sh
@@ -11,6 +11,7 @@ shift
 
 GPUS=0
 CPUS=1
+PYENV=""
 
 while [[ $# -gt 0 ]]; do
     case $1 in
@@ -22,6 +23,10 @@ while [[ $# -gt 0 ]]; do
             CPUS=$2
             shift 2
             ;;
+        --pyenv)
+            PYENV=$2
+            shift 2
+            ;;
         *)
             # Collect all other arguments as extra arguments
             EXTRA_ARGS+=("$1")
@@ -55,9 +60,17 @@ export LD_LIBRARY_PATH=""
 
 echo "Running $FILENAME with $CPUS CPUs and $GPUS GPUs"
 
-eval "julia --project -e 'using Pkg; Pkg.develop(path=\"..\"); Pkg.instantiate()'"
-
-CMD="julia --project $FILENAME $GPUS ${EXTRA_ARGS[@]}"
+# Python (cupynumeric) workers run in the conda env built by install_cupynumeric.sh;
+# Julia (cuNumeric) workers run against the local project.
+if [[ $FILENAME == *.py ]]; then
+    if [[ -z $PYENV ]]; then
+        echo "Error: running a .py worker requires --pyenv <conda-env> (run install_cupynumeric.sh first)."
+        exit 1
+    fi
+    CMD="conda run --no-capture-output -n $PYENV python $FILENAME $GPUS ${EXTRA_ARGS[@]}"
+else
+    CMD="julia --project $FILENAME $GPUS ${EXTRA_ARGS[@]}"
+fi
 
 printf "Running: %s\n" "$CMD"
 eval "$CMD"
diff --git a/benchmark/src/benchmarks/montecarlo.jl b/benchmark/src/benchmarks/montecarlo.jl
index f82ebba6..1df91c97 100644
--- a/benchmark/src/benchmarks/montecarlo.jl
+++ b/benchmark/src/benchmarks/montecarlo.jl
@@ -23,8 +23,8 @@ end
 _domain_volume(mci::MonteCarloIntegration{T}) where {T} = T(10) / mci.n_samples
 run!(mci::MonteCarloIntegration, x) = _domain_volume(mci) * sum(exp.(-x .^ 2))
 
-# n_samples comes in as N; M and variant are unused.
-function build_benchmark(::Type{MonteCarloIntegration}, ::Type{T}, N, M, variant) where {T}
+# n_samples comes in as N; M is unused.
+function build_benchmark(::Type{MonteCarloIntegration}, ::Type{T}, N, M) where {T}
     MonteCarloIntegration{T}(; n_samples=N)
 end
 
diff --git a/benchmark/src/core.jl b/benchmark/src/core.jl
index 33c4a4ef..db452c6e 100644
--- a/benchmark/src/core.jl
+++ b/benchmark/src/core.jl
@@ -17,6 +17,8 @@ Base.@kwdef struct GlobalSettings
     n_iter::Int # Number of iterations to run per trial
     n_trial::Int = 1 # Number of independent trials to run. Benchmark
     n_gpu::Int = 0
+    cupynumeric::Bool = false # also run baselines under cupynumeric for comparison
+    cuda::Bool = false # also run under CUDA.jl for comparison (single-GPU only)
 end
 
 #########################################
diff --git a/benchmark/src/parse_benchmarks.jl b/benchmark/src/parse_benchmarks.jl
index 3b7c21df..605c5002 100644
--- a/benchmark/src/parse_benchmarks.jl
+++ b/benchmark/src/parse_benchmarks.jl
@@ -31,17 +31,32 @@ function sweep_length(name, fields)
     return first(lengths)
 end
 
+# Names of the `[[name]]` blocks in file order (TOML.jl parses into an unordered
+# Dict, so we scan the source). A header may be indented or end in a `# comment`.
+function declared_order(path)
+    order = String[]
+    for line in eachline(path)
+        m = match(r"^\s*\[\[\s*([^\[\]#]+?)\s*\]\]\s*(?:#.*)?$", line)
+        m === nothing && continue # not an array-of-tables header
+        name = String(m[1])
+        name in order || push!(order, name) # repeated blocks keep their first position
+    end
+    return order
+end
+
 function parse_config(path)
     raw = TOML.parsefile(path)
 
     g = raw["Global"]
     global_settings = GlobalSettings(;
-        n_warmup=g["n_warmup"], n_iter=g["n_iter"], n_trial=get(g, "n_trial", 1)
+        n_warmup=g["n_warmup"], n_iter=g["n_iter"], n_trial=get(g, "n_trial", 1),
+        cupynumeric=get(g, "cupynumeric", false),
+        cuda=get(g, "cuda", false),
     )
 
     specs = BenchmarkSpec[]
-    for (name, entries) in raw
-        name == "Global" && continue
+    for name in declared_order(path)
+        entries = raw[name]
         for e in entries
             types = aslist(get(e, "T", "Float32"))
             gpus = aslist(e["gpus"])
diff --git a/benchmark/src/single.jl b/benchmark/src/single.jl
index eb24ae38..5b2fff54 100644
--- a/benchmark/src/single.jl
+++ b/benchmark/src/single.jl
@@ -1,6 +1,7 @@
-# single.jl: worker that runs exactly one benchmark. Launched by run_benchmark.sh
-# (dispatched from run.jl), which sets LEGATE_CONFIG in the env before julia starts.
-# Args: <gpus> <name> <T> <N> <M> <n_iter> <n_warmup> <n_trial> <variant>
+# single.jl: worker that runs exactly one benchmark under one backend. Launched by
+# run_benchmark.sh (dispatched from run.jl), which sets LEGATE_CONFIG before julia starts.
+# Args: <gpus> <name> <T> <N> <M> <n_iter> <n_warmup> <n_trial> <backend>
+# backend is "cunumeric" or "cudajl"; run.jl launches one worker per backend.
 
 using cuNumeric
 using CUDACore
@@ -13,38 +14,32 @@ include.(filter(contains(r".jl$"), readdir(BENCHMARK_DIR; join=true)))
 # Resolve a TOML type string like "Float32" to the actual Julia type.
 parse_type(s) = getfield(Base, Symbol(s))::DataType
 
-function run_single(gpus, name, T_str, N, M, n_iter, n_warmup, n_trial)
+# mod runs the kernels; label tags stdout; save_as names the results CSV.
+const BACKENDS = Dict(
+    "cunumeric" => (mod=cuNumeric, label="cuNumeric", save_as="cunumeric"),
+    "cudajl" => (mod=CUDACore, label="CUDA.jl", save_as="CUDA.jl"),
+)
+
+function run_single(gpus, name, T_str, N, M, n_iter, n_warmup, n_trial, backend)
     haskey(BENCHMARKS, name) || error(
         "No benchmark registered for '$(name)'. Known: $(join(sort(collect(keys(BENCHMARKS))), ", "))"
     )
+    haskey(BACKENDS, backend) || error(
+        "Unknown backend '$(backend)'. Known: $(join(sort(collect(keys(BACKENDS))), ", "))"
+    )
+    bk = BACKENDS[backend]
     T = parse_type(T_str)
     b = build_benchmark(BENCHMARKS[name], T, N, M)
     gs = GlobalSettings(; n_warmup=n_warmup, n_iter=n_iter, n_trial=n_trial)
 
     println(
-        "[cuNumeric] $(name) benchmark ($(T)) on $(N)x$(M) for $(n_iter) " *
+        "[$(bk.label)] $(name) benchmark ($(T)) on $(N)x$(M) for $(n_iter) " *
         "iterations ($(n_warmup) warmup) x $(n_trial) trials",
     )
-    br = run_benchmark(b, gs)
-    @printf("[cuNumeric] Mean Run Time: %.5f ± %.5f ms\n", mean(br.times_ms), _std(br.times_ms))
-    @printf("[cuNumeric] FLOPS: %.5f ± %.5f GFLOPS\n", mean(br.gflops), _std(br.gflops))
-
-    save_result(br, gpus)
-
-    # Run CUDA.jl benchmark
-    if gpus == 1
-        println(
-            "[CUDA.jl] $(name) benchmark ($(T)) on $(N)x$(M) for $(n_iter) " *
-            "iterations ($(n_warmup) warmup) x $(n_trial) trials",
-        )
-
-        br = run_benchmark(b, gs; mod=CUDACore)
-
-        @printf("[CUDA.jl] Mean Run Time: %.5f ± %.5f ms\n", mean(br.times_ms), _std(br.times_ms))
-        @printf("[CUDA.jl] FLOPS: %.5f ± %.5f GFLOPS\n", mean(br.gflops), _std(br.gflops))
-
-        save_result(br, gpus; mod="CUDA.jl")
-    end
+    br = run_benchmark(b, gs; mod=bk.mod)
+    @printf("[%s] Mean Run Time: %.5f ± %.5f ms\n", bk.label, mean(br.times_ms), _std(br.times_ms))
+    @printf("[%s] FLOPS: %.5f ± %.5f GFLOPS\n", bk.label, mean(br.gflops), _std(br.gflops))
+    save_result(br, gpus; mod=bk.save_as)
 end
 
 gpus = parse(Int, ARGS[1])
@@ -55,4 +50,5 @@ M = parse(Int, ARGS[5])
 n_iter = parse(Int, ARGS[6])
 n_warmup = parse(Int, ARGS[7])
 n_trial = parse(Int, ARGS[8])
-run_single(gpus, bench_name, T_str, N, M, n_iter, n_warmup, n_trial)
+backend = ARGS[9]
+run_single(gpus, bench_name, T_str, N, M, n_iter, n_warmup, n_trial, backend)
diff --git a/benchmark/src_py/benchmarks/__init__.py b/benchmark/src_py/benchmarks/__init__.py
new file mode 100644
index 00000000..2eee4477
--- /dev/null
+++ b/benchmark/src_py/benchmarks/__init__.py
@@ -0,0 +1,8 @@
+import importlib
+import pkgutil
+
+from core import BENCHMARKS
+
+# Import each module so it self-registers into BENCHMARKS.
+for _info in pkgutil.iter_modules(__path__):
+    importlib.import_module(f"{__name__}.{_info.name}")
diff --git a/benchmark/src_py/benchmarks/gemm.py b/benchmark/src_py/benchmarks/gemm.py
new file mode 100644
index 00000000..b5d1a4b3
--- /dev/null
+++ b/benchmark/src_py/benchmarks/gemm.py
@@ -0,0 +1,29 @@
+import cupynumeric as np
+
+from core import register_benchmark
+
+
+class GEMM:
+    name = "gemm"
+
+    def __init__(self, T, N, M):
+        self.T, self.N, self.M = T, N, M
+
+    def dims(self):
+        return self.N, self.M
+
+    def total_flops(self):
+        return self.N * self.N * (2 * self.M - 1)  # N*N outputs, M mul + (M - 1) add each
+
+    def initialize(self):
+        A = np.random.rand(self.N, self.M).astype(self.T)  # (N, M) operand
+        B = np.random.rand(self.M, self.N).astype(self.T)  # (M, N) operand
+        C = np.zeros((self.N, self.N), dtype=self.T)  # (N, N) output buffer
+        return (C, A, B)
+
+    def run(self, state):
+        C, A, B = state
+        np.matmul(A, B, out=C)  # product is written into the preallocated C
+
+
+register_benchmark("gemm", GEMM)
diff --git a/benchmark/src_py/benchmarks/grayscott.py b/benchmark/src_py/benchmarks/grayscott.py
new file mode 100644
index 00000000..a1a89e73
--- /dev/null
+++ b/benchmark/src_py/benchmarks/grayscott.py
@@ -0,0 +1,71 @@
+import cupynumeric as np
+
+from core import register_benchmark
+
+
+class GrayScott:
+    name = "grayscott"  # note: registered below under the key "grayscott_baseline"
+
+    # dt = dx/5; c_u, c_v, f, k as in grayscott.jl's GSParams defaults.
+    def __init__(self, T, N, M, dx=1.0, c_u=1.0, c_v=0.3, f=0.03, k=0.06):
+        self.T, self.N, self.M = T, N, M
+        self.dx = T(dx)
+        self.dt = T(dx / 5)
+        self.c_u, self.c_v, self.f, self.k = T(c_u), T(c_v), T(f), T(k)
+
+    def dims(self):
+        return self.N, self.M
+
+    def total_flops(self):
+        return self.N * self.M  # NOTE(review): one op per cell, but run() does more per cell - confirm
+
+    def initialize(self):
+        d = (self.N, self.M)
+        u = np.ones(d, dtype=self.T)
+        v = np.zeros(d, dtype=self.T)
+        u_new = np.zeros(d, dtype=self.T)
+        v_new = np.zeros(d, dtype=self.T)
+
+        seed = min(150, self.N, self.M)  # side of the randomized top-left square, at most 150
+        u[:seed, :seed] = np.random.rand(seed, seed).astype(self.T)
+        v[:seed, :seed] = np.random.rand(seed, seed).astype(self.T)
+        # a mutable list (not a tuple) so run() can swap the buffers by assigning into it
+        return [u, v, u_new, v_new]
+
+    def run(self, state):
+        u, v, u_new, v_new = state
+        ui = u[1:-1, 1:-1]  # interior cells: first/last row and column excluded
+        vi = v[1:-1, 1:-1]
+
+        F_u = (-ui * (vi * vi)) + self.f * (1 - ui)  # reaction term for u
+        F_v = (ui * (vi * vi)) - (self.f + self.k) * vi  # reaction term for v
+
+        dx2 = self.dx * self.dx
+        u_lap = (  # sum of second differences along both axes, on the interior
+            (u[2:, 1:-1] - 2 * ui + u[:-2, 1:-1]) / dx2
+            + (u[1:-1, 2:] - 2 * ui + u[1:-1, :-2]) / dx2
+        )
+        v_lap = (  # same stencil applied to v
+            (v[2:, 1:-1] - 2 * vi + v[:-2, 1:-1]) / dx2
+            + (v[1:-1, 2:] - 2 * vi + v[1:-1, :-2]) / dx2
+        )
+
+        u_new[1:-1, 1:-1] = (self.c_u * u_lap + F_u) * self.dt + ui
+        v_new[1:-1, 1:-1] = (self.c_v * v_lap + F_v) * self.dt + vi
+
+        # periodic boundary conditions, copied from the input arrays u and v (not u_new / v_new)
+        u_new[:, 0] = u[:, -2]
+        u_new[:, -1] = u[:, 1]
+        u_new[0, :] = u[-2, :]
+        u_new[-1, :] = u[1, :]
+        v_new[:, 0] = v[:, -2]
+        v_new[:, -1] = v[:, 1]
+        v_new[0, :] = v[-2, :]
+        v_new[-1, :] = v[1, :]
+
+        # swap references rather than copy: the next call reads what this call wrote
+        state[0], state[2] = u_new, u
+        state[1], state[3] = v_new, v
+
+
+register_benchmark("grayscott_baseline", GrayScott)
diff --git a/benchmark/src_py/benchmarks/montecarlo.py b/benchmark/src_py/benchmarks/montecarlo.py
new file mode 100644
index 00000000..370fc7b9
--- /dev/null
+++ b/benchmark/src_py/benchmarks/montecarlo.py
@@ -0,0 +1,28 @@
+import cupynumeric as np
+
+from core import register_benchmark
+
+
+class MonteCarlo:
+    name = "montecarlo"
+
+    def __init__(self, T, N, M):
+        self.T = T
+        self.n_samples = N  # N is the sample count; M is accepted but not used
+
+    def dims(self):
+        return self.n_samples, 1  # second dimension is fixed at 1
+
+    def total_flops(self):
+        return self.n_samples
+
+    def initialize(self):
+        x = (self.T(10) * np.random.rand(self.n_samples)).astype(self.T)  # rand() scaled by 10
+        return (x,)
+
+    def run(self, state):
+        (x,) = state
+        return (self.T(10) / self.n_samples) * np.sum(np.exp(-(x * x)))  # (10 / n) * sum(exp(-x^2))
+
+
+register_benchmark("montecarlo", MonteCarlo)
diff --git a/benchmark/src_py/core.py b/benchmark/src_py/core.py
new file mode 100644
index 00000000..f32a4fc3
--- /dev/null
+++ b/benchmark/src_py/core.py
@@ -0,0 +1,57 @@
+import os
+import math
+
+import cupynumeric as np
+from legate.timing import time  # blocks on preceding legate ops; returns microseconds
+
+MOD = "cupynumeric"
+RESULTS_DIR = os.path.join(os.path.dirname(__file__), "..", "results")
+
+DTYPES = {"Float32": np.float32, "Float64": np.float64}
+
+
+def parse_type(s):
+    if s not in DTYPES:
+        raise ValueError(f"Unsupported type '{s}'. Known: {', '.join(DTYPES)}")
+    return DTYPES[s]
+
+
+BENCHMARKS = {}
+
+
+def register_benchmark(key, cls):
+    BENCHMARKS[key] = cls
+
+
+def trial(bench, n_warmup, n_iter):
+    state = bench.initialize()
+    start = None
+    for idx in range(n_warmup + n_iter):
+        if idx == n_warmup:
+            start = time()
+        bench.run(state)
+    total_us = time() - start
+
+    mean_time_ms = total_us / (n_iter * 1e3)
+    gflops = bench.total_flops() / (mean_time_ms * 1e6)
+    return mean_time_ms, gflops
+
+
+def _mean(x):
+    return sum(x) / len(x)
+
+
+def _std(x):
+    if len(x) < 2:
+        return 0.0
+    m = _mean(x)
+    return math.sqrt(sum((v - m) ** 2 for v in x) / (len(x) - 1))
+
+
+def save_result(name, dims, gpus, times_ms, gflops):
+    os.makedirs(RESULTS_DIR, exist_ok=True)
+    N, M = dims
+    path = os.path.join(RESULTS_DIR, f"{name}_{MOD}.csv")
+    with open(path, "a") as io:
+        for i, (t, g) in enumerate(zip(times_ms, gflops), start=1):
+            io.write(f"{MOD},{gpus},{N},{M},{i},{t:.6f},{g:.6f}\n")
diff --git a/benchmark/src_py/single.py b/benchmark/src_py/single.py
new file mode 100644
index 00000000..005cda31
--- /dev/null
+++ b/benchmark/src_py/single.py
@@ -0,0 +1,48 @@
+# cupynumeric worker, run by run_benchmark.sh which sets LEGATE_CONFIG first.
+# Args: <gpus> <name> <T> <N> <M> <n_iter> <n_warmup> <n_trial>
+import os
+import sys
+
+# Make `core` and the `benchmarks` package importable when run as a script.
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from core import MOD, parse_type, trial, save_result, _mean, _std
+from benchmarks import BENCHMARKS  # import populates BENCHMARKS
+
+
+def main():
+    gpus = int(sys.argv[1])
+    name = sys.argv[2]
+    T_str = sys.argv[3]
+    N = int(sys.argv[4])
+    M = int(sys.argv[5])
+    n_iter = int(sys.argv[6])
+    n_warmup = int(sys.argv[7])
+    n_trial = int(sys.argv[8])
+
+    if name not in BENCHMARKS:
+        raise ValueError(
+            f"No benchmark registered for '{name}'. Known: {', '.join(sorted(BENCHMARKS))}"
+        )
+    T = parse_type(T_str)
+    bench = BENCHMARKS[name](T, N, M)
+
+    print(
+        f"[{MOD}] {name} benchmark ({T_str}) on {N}x{M} for {n_iter} "
+        f"iterations ({n_warmup} warmup) x {n_trial} trials"
+    )
+
+    times_ms, gflops = [], []
+    for _ in range(n_trial):
+        t, g = trial(bench, n_warmup, n_iter)
+        times_ms.append(t)
+        gflops.append(g)
+
+    print(f"[{MOD}] Mean Run Time: {_mean(times_ms):.5f} ± {_std(times_ms):.5f} ms")
+    print(f"[{MOD}] FLOPS: {_mean(gflops):.5f} ± {_std(gflops):.5f} GFLOPS")
+
+    save_result(bench.name, bench.dims(), gpus, times_ms, gflops)
+
+
+if __name__ == "__main__":
+    main()

From c068593fa67d617fbd3cfd90ef303ba8b23eb79b Mon Sep 17 00:00:00 2001
From: krasow <krasow@u.northwestern.edu>
Date: Wed, 3 Jun 2026 19:38:21 -0500
Subject: [PATCH 15/17] cuda -> cudajl as name

---
 benchmark/run.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmark/run.jl b/benchmark/run.jl
index ff40e21d..6db80516 100644
--- a/benchmark/run.jl
+++ b/benchmark/run.jl
@@ -37,7 +37,7 @@ function cupynumeric_env_name()
 end
 
 function dispatch(; gpus, cpus, name, T, N, M, n_iter, n_warmup, n_trial,
-    cupynumeric=false, cuda=false)
+    cupynumeric=false, cudajl=false)
     banner(
         "$(name): T=$(T) gpus=$(gpus) cpus=$(cpus) N=$(N) M=$(M) " *
         "n_iter=$(n_iter) n_warmup=$(n_warmup) n_trial=$(n_trial)",
@@ -47,7 +47,7 @@ function dispatch(; gpus, cpus, name, T, N, M, n_iter, n_warmup, n_trial,
     args = `--gpus $gpus --cpus $cpus $name $T $N $M $n_iter $n_warmup $n_trial`
     cmds = [`bash $RUNNER $WORKER $args cunumeric`]
     # CUDA.jl is single-GPU only
-    if cuda && gpus == 1
+    if cudajl && gpus == 1
         push!(cmds, `bash $RUNNER $WORKER $args cudajl`)
     end
     # cupynumeric has no code-path variants; only baseline benchmarks compare against it
@@ -78,7 +78,7 @@ function run_all_benchmarks(config="benchmarks.toml")
             n_warmup=gs.n_warmup,
             n_trial=gs.n_trial,
             cupynumeric=gs.cupynumeric,
-            cuda=gs.cuda,
+            cudajl=gs.cuda,
         )
     end
 end

From 3679fc92d28aca7633c45d0de8521c3278715e35 Mon Sep 17 00:00:00 2001
From: krasow <krasow@u.northwestern.edu>
Date: Wed, 3 Jun 2026 19:40:31 -0500
Subject: [PATCH 16/17] patch cudajl run path for cunumeric specific tests

---
 benchmark/run.jl | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/benchmark/run.jl b/benchmark/run.jl
index 6db80516..7d6c3bb4 100644
--- a/benchmark/run.jl
+++ b/benchmark/run.jl
@@ -18,6 +18,9 @@ const PY_WORKER = joinpath(@__DIR__, "src_py/single.py")
 
 banner(msg) = println("\n", "="^128, "\n", msg, "\n", "="^128)
 
+# `_lifetimes` is a cuNumeric-only code-path variant (@analyze_lifetimes)
+cunumeric_only(name) = endswith(name, "_lifetimes")
+
 # ensure things are resolved and devlop'd properly
 function ensure_project_ready()
     Pkg.develop(; path=joinpath(@__DIR__, ".."))
@@ -47,11 +50,10 @@ function dispatch(; gpus, cpus, name, T, N, M, n_iter, n_warmup, n_trial,
     args = `--gpus $gpus --cpus $cpus $name $T $N $M $n_iter $n_warmup $n_trial`
     cmds = [`bash $RUNNER $WORKER $args cunumeric`]
     # CUDA.jl is single-GPU only
-    if cudajl && gpus == 1
+    if cudajl && gpus == 1 && !cunumeric_only(name)
         push!(cmds, `bash $RUNNER $WORKER $args cudajl`)
     end
-    # cupynumeric has no code-path variants; only baseline benchmarks compare against it
-    if cupynumeric && !endswith(name, "_lifetimes")
+    if cupynumeric && !cunumeric_only(name)
         push!(cmds, `bash $RUNNER $PY_WORKER --pyenv $(cupynumeric_env_name()) $args`)
     end
 

From 943b1ec17d20fd685b124cde9819ed7bb75cde5a Mon Sep 17 00:00:00 2001
From: krasow <krasow@u.northwestern.edu>
Date: Wed, 3 Jun 2026 19:42:28 -0500
Subject: [PATCH 17/17] update readme

---
 benchmark/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/README.md b/benchmark/README.md
index bd646666..753a0416 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -49,7 +49,7 @@ Repeat a `[[name]]` block to add independent configs.
 
 Any of `T`, `gpus`, `cpus`, `N`, `M` may be a list. They expand along
 two axes:
-
+- **`T` multiplies.** The whole sweep runs once per type.
 - **`gpus`, `cpus`, `N`, `M` zip** into a single lockstep sweep — element `i`
   of each is paired together.