From a8154943afd977787c174b558e218ebd4603529e Mon Sep 17 00:00:00 2001 From: ejmeitz Date: Mon, 1 Jun 2026 14:53:23 -0500 Subject: [PATCH 01/17] start benchmarking harness --- benchmark/Project.toml | 10 ++++++ benchmark/benchmarks.jl | 50 +++++++++++++++++++++++++++ benchmark/benchmarks.toml | 22 ++++++++++++ benchmark/parse_benchmarks.jl | 39 ++++++++++++++++++++++ benchmark/run.jl | 63 +++++++++++++++++++++++++++++++++++ 5 files changed, 184 insertions(+) create mode 100644 benchmark/Project.toml create mode 100644 benchmark/benchmarks.jl create mode 100644 benchmark/benchmarks.toml create mode 100644 benchmark/parse_benchmarks.jl create mode 100644 benchmark/run.jl diff --git a/benchmark/Project.toml b/benchmark/Project.toml new file mode 100644 index 00000000..a6989391 --- /dev/null +++ b/benchmark/Project.toml @@ -0,0 +1,10 @@ +[deps] +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" +cuNumeric = "0fd9ffd4-7e84-4cd0-b8f8-645bd8c73620" + +[extras] +CNPreferences = "3e078157-ea10-49d5-bf32-908f777cd46f" +LegatePreferences = "8028f36a-2b64-49e9-aa04-2d0933fd2ed9" diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl new file mode 100644 index 00000000..dde1a6ce --- /dev/null +++ b/benchmark/benchmarks.jl @@ -0,0 +1,50 @@ +abstract type AbstractBenchmark{T} end + +######################################### + +Base.@kwdef struct GEMM{T} <: AbstractBenchmark{T} + N::Int + M::Int +end + +data(g::GEMM{T}) where {T} = "GEMM with T=$(T), N=$(g.N), M=$(g.M)" + +function allowed_types(::MonteCarloIntegration) + Union{cuNumeric.SUPPORTED_FLOAT_TYPES,cuNumeric.SUPPORTED_INT_TYPES} +end + +total_flops(s::GEMM) = s.N * s.N * ((2*s.M) - 1) +total_space(s::GEMM{T}) where {T} = 2 * ((s.N*s.M) * sizeof(T)) + ((s.N*s.N) * sizeof(T)) + +function initialize_cpu(s::GEMM{T}) where {T} + A = rand(T, s.N, s.M) + B = rand(T, s.M, 
s.N) + C = zeros(T, s.N, s.N) + return A, B, C +end + +run!(::GEMM, C, A, B) = mul!(C, A, B) + +######################################### + +Base.@kwdef struct MonteCarloIntegration{T} <: AbstractBenchmark{T} + n_samples::Int +end + +function data(mci::MonteCarloIntegration{T}) where {T} + "Monte Carlo Integration with T=$(T), n_samples=$(mci.n_samples)" +end + +allowed_types(::MonteCarloIntegration) = cuNumeric.SUPPORTED_FLOAT_TYPES + +total_space(s::MonteCarloIntegration{T}) where {T} = s.n_samples * sizeof(T) +total_flops(s::MonteCarloIntegration) = missing # cannot estimate FLOPS for squaring or exp easily + +function initialize_cpu(s::MonteCarloIntegration{T}) where {T} + return T(10) .* rand(T, s.n_samples) .+ T(-5) # random samples in [-5, 5] +end + +_domain_volume(mci::MonteCarloIntegration{T}) where {T} = T(10) / mci.n_samples +run!(mci::MonteCarloIntegration, x) = _domain_volume(mci) * sum(exp.(-x .^ 2)) + +################# diff --git a/benchmark/benchmarks.toml b/benchmark/benchmarks.toml new file mode 100644 index 00000000..acdeec45 --- /dev/null +++ b/benchmark/benchmarks.toml @@ -0,0 +1,22 @@ +[Global] +N_WARMUP = 2 +N_ITER = 100 +N_GPU = 1 + +[[GEMM]] +T = "Float64" +N = 100 +M = 100 + +[[GEMM]] +T = "Float32" +N = 150 +M = 150 + +[[MonteCarloIntegration]] +T = "Float64" +N_samples = 1_000_000 + +[[MonteCarloIntegration]] +T = "Float32" +N_samples = 10_000_000 diff --git a/benchmark/parse_benchmarks.jl b/benchmark/parse_benchmarks.jl new file mode 100644 index 00000000..11505f71 --- /dev/null +++ b/benchmark/parse_benchmarks.jl @@ -0,0 +1,39 @@ +using TOML + +Base.@kwdef struct GlobalSettings + N_warmup::Int + N_iter::Int + N_GPU::Int +end + +function to_symbol_dict(d) + return Dict(Symbol(k) => v for (k, v) in d) +end + +function parse_config(path) + raw = TOML.parsefile(path) + + global_settings = GlobalSettings(; to_symbol_dict(raw["Global"])...) 
+ + benchmarks = AbstractBenchmark[] + + for (name, entries) in raw + name == "Global" && continue + + BenchmarkType = getproperty(Main, Symbol(name)) + + for entry in entries + T = getproperty(Main, Symbol(entry["T"])) + + params = Dict{Symbol,Any}() + for (k, v) in entry + k == "T" && continue + params[Symbol(k)] = v + end + + push!(benchmarks, BenchmarkType{T}(; params...)) + end + end + + return global_settings, benchmarks +end diff --git a/benchmark/run.jl b/benchmark/run.jl new file mode 100644 index 00000000..3f69cfb5 --- /dev/null +++ b/benchmark/run.jl @@ -0,0 +1,63 @@ +import Random +import cuNumeric + +include("benchmarks.jl") + +function work(b::AbstractBenchmark, N_GPU, arrs_cpu...) + + run!(b, arrs_cunumeric...) + + GC.gc(full = true) + + if N_GPU == 1 + arrs_cuda = ... + run!(b, arrs_cunumeric...) + end + + # Reset state in between + GC.gc(full = true) +end + +function run_all_benchmarks() + + global_settings, benchmarks = parse_config("benchmarks.toml") + + @show global_settings + @show benchmarks + + for b in benchmarks + println("================================") + println(data(b)) + println("================================") + + arrs = init(benchmark) + + #TODO FIX + + arrs_cunumeric = + run!(b, arrs_cunumeric...) + + # Reset state in between + GC.gc(full = true) + + if N_GPU == 1 + arrs_cuda = ... + run!(b, arrs_cunumeric...) 
+ end + + # Reset state in between + GC.gc(full = true) + end + +end + + +function run_sgemm_benchmark(N) + include("sgemm.jl") + name = "SGEMM" +end + +function run_monte_carlo_benchmark(N) + include("monte_carlo.jl") + name = "Monte_Carlo_Integration" +end From 0465d8c952b3bbe3bdb5d9bb5b07a1efceb5454c Mon Sep 17 00:00:00 2001 From: ejmeitz Date: Tue, 2 Jun 2026 13:16:15 -0500 Subject: [PATCH 02/17] clean some thing sup --- benchmark/Project.toml | 1 + benchmark/benchmarks.jl | 39 +++++++++++++++++-- benchmark/parse_benchmarks.jl | 14 +++---- benchmark/run.jl | 72 +++++++++++++++++++---------------- 4 files changed, 83 insertions(+), 43 deletions(-) diff --git a/benchmark/Project.toml b/benchmark/Project.toml index a6989391..e6583a71 100644 --- a/benchmark/Project.toml +++ b/benchmark/Project.toml @@ -1,5 +1,6 @@ [deps] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index dde1a6ce..f93293e6 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -1,3 +1,25 @@ +using CSV + +""" +- `n_warmup::Int` : Number of warmup steps. These are not timed. Intended + to avoid pre-compilation cost being timed. +- `n_iter::Int` : Number of iterations to run per trial. Should be large enough + to build up queue depth of tasks such that latency is hidden. +- `n_trial::Int` : Number of independent trials to run. Timing is restarted and + legate in between each trial. Sets number of datapoints used to estimated + standard deviations/errors. +- `n_gpu::Int` : The number of GPUs used by legate. Set through the LEGATE_CONFIG, + this value is just bookkeeping. +""" +Base.@kwdef struct GlobalSettings + n_warmup::Int # Number of warmup steps, where timing is not done. 
+ n_iter::Int # Number of iterations to run per trial + n_trial::Int # Number of independent trials to run. Benchmark + n_gpu::Int +end + +######################################### + abstract type AbstractBenchmark{T} end ######################################### @@ -9,7 +31,7 @@ end data(g::GEMM{T}) where {T} = "GEMM with T=$(T), N=$(g.N), M=$(g.M)" -function allowed_types(::MonteCarloIntegration) +function allowed_types(::Type{GEMM}) Union{cuNumeric.SUPPORTED_FLOAT_TYPES,cuNumeric.SUPPORTED_INT_TYPES} end @@ -35,7 +57,7 @@ function data(mci::MonteCarloIntegration{T}) where {T} "Monte Carlo Integration with T=$(T), n_samples=$(mci.n_samples)" end -allowed_types(::MonteCarloIntegration) = cuNumeric.SUPPORTED_FLOAT_TYPES +allowed_types(::Type{MonteCarloIntegration}) = cuNumeric.SUPPORTED_FLOAT_TYPES total_space(s::MonteCarloIntegration{T}) where {T} = s.n_samples * sizeof(T) total_flops(s::MonteCarloIntegration) = missing # cannot estimate FLOPS for squaring or exp easily @@ -47,4 +69,15 @@ end _domain_volume(mci::MonteCarloIntegration{T}) where {T} = T(10) / mci.n_samples run!(mci::MonteCarloIntegration, x) = _domain_volume(mci) * sum(exp.(-x .^ 2)) -################# +######################################### + +struct BenchmarkResult{T,B<:AbstractBenchmark} + times_ms::T + gflops::T + benchmark::B +end + +function save(br::BenchmarkResult) + # Compute standard error and mean time and save to + # some kind of file. 
+end diff --git a/benchmark/parse_benchmarks.jl b/benchmark/parse_benchmarks.jl index 11505f71..f25d4836 100644 --- a/benchmark/parse_benchmarks.jl +++ b/benchmark/parse_benchmarks.jl @@ -1,11 +1,5 @@ using TOML -Base.@kwdef struct GlobalSettings - N_warmup::Int - N_iter::Int - N_GPU::Int -end - function to_symbol_dict(d) return Dict(Symbol(k) => v for (k, v) in d) end @@ -20,9 +14,11 @@ function parse_config(path) for (name, entries) in raw name == "Global" && continue + # Convert name parsed as String, to actual type BenchmarkType = getproperty(Main, Symbol(name)) for entry in entries + # Convert type parsed as String, to actual type T = getproperty(Main, Symbol(entry["T"])) params = Dict{Symbol,Any}() @@ -31,7 +27,11 @@ function parse_config(path) params[Symbol(k)] = v end - push!(benchmarks, BenchmarkType{T}(; params...)) + if T <: allowed_types(BenchmarkType) + push!(benchmarks, BenchmarkType{T}(; params...)) + else + @warn "$(BenchmarkType) does not support benchmarking with type $(T). Skipping." + end end end diff --git a/benchmark/run.jl b/benchmark/run.jl index 3f69cfb5..c5efba93 100644 --- a/benchmark/run.jl +++ b/benchmark/run.jl @@ -1,63 +1,69 @@ -import Random -import cuNumeric +using Random: Random +using cuNumeric: cuNumeric include("benchmarks.jl") -function work(b::AbstractBenchmark, N_GPU, arrs_cpu...) +function benchmark(b::AbstractBenchmark, gs::GlobalSettings, arrs) + GC.gc(; full=true) - run!(b, arrs_cunumeric...) - - GC.gc(full = true) + for idx in range(1, gs.n_iter + gs.n_warmup) + if idx == gs.n_warmup + 1 + start_time = get_time_microseconds() + end - if N_GPU == 1 - arrs_cuda = ... - run!(b, arrs_cunumeric...) + run!(b, arrays...) 
end + total_time_μs = get_time_microseconds() - start_time + mean_time_ms = total_time_μs / (gs.n_iter * 1e3) + gflops = total_flops(N, M) / (mean_time_ms * 1e6) + + GC.gc(; full=true) - # Reset state in between - GC.gc(full = true) + return mean_time_ms, gflops end function run_all_benchmarks() - global_settings, benchmarks = parse_config("benchmarks.toml") @show global_settings @show benchmarks + cunumeric_results = BenchmarkResult[] + cuda_results = BenchmarkResult[] + for b in benchmarks println("================================") println(data(b)) println("================================") - arrs = init(benchmark) + cn_times_ms = Vector{Float64}(undef, global_settings.n_trial) + cn_gflops = Vector{Union{Missing,Float64}}(undef, global_settings.n_trial) - #TODO FIX + cuda_times_ms = Vector{Float64}(undef, global_settings.n_trial) + cuda_gflops = Vector{Union{Missing,Float64}}(undef, global_settings.n_trial) - arrs_cunumeric = - run!(b, arrs_cunumeric...) + for i in 1:global_settings.n_trial + arrs_julia = initialize_cpu(b) - # Reset state in between - GC.gc(full = true) + arrs_cunumeric = # TODO + cn_times_ms[i], cn_gflops[i] = benchmark(b, arrs_cunumeric...) + push - if N_GPU == 1 - arrs_cuda = ... - run!(b, arrs_cunumeric...) + if gs.n_gpu == 1 + arrs_cuda = # TODO + cuda_times_ms[i], cuda_gflops[i] = benchmark(b, arrs_cuda...) 
+ push!(cuda_results, res_cuda) + end end - # Reset state in between - GC.gc(full = true) - end - -end + cn_result = BenchmarkResult(cn_times_ms, cn_gflops, b) + cuda_result = BenchmarkResult(cuda_times_ms, cuda_gflops, b) + push!(cunumeric_results, cn_result) + push!(cuda_results, cuda_result) + end -function run_sgemm_benchmark(N) - include("sgemm.jl") - name = "SGEMM" -end + # Call the `save` function for the cuda_results + # This function is not implemeneted as I was not sure how to do it -function run_monte_carlo_benchmark(N) - include("monte_carlo.jl") - name = "Monte_Carlo_Integration" end From 12373c7f4747f7b6932bd6a4c0a41aacb709508e Mon Sep 17 00:00:00 2001 From: ejmeitz Date: Tue, 2 Jun 2026 13:25:26 -0500 Subject: [PATCH 03/17] add include --- benchmark/run.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmark/run.jl b/benchmark/run.jl index c5efba93..b090c8ef 100644 --- a/benchmark/run.jl +++ b/benchmark/run.jl @@ -2,6 +2,7 @@ using Random: Random using cuNumeric: cuNumeric include("benchmarks.jl") +include("parse_benchmarks.jl") function benchmark(b::AbstractBenchmark, gs::GlobalSettings, arrs) GC.gc(; full=true) From 73fbfb223416fe8f4a054d5600c5a5e85c7443ce Mon Sep 17 00:00:00 2001 From: krasow Date: Tue, 2 Jun 2026 20:45:37 -0500 Subject: [PATCH 04/17] run benchmarks modifications --- .gitignore | 4 + benchmark/benchmarks.jl | 197 ++++++++++++++++++++++++++++++---- benchmark/benchmarks.toml | 28 ++--- benchmark/parse_benchmarks.jl | 42 +++----- benchmark/run.jl | 106 +++++++++--------- benchmark/run_benchmark.sh | 8 +- benchmark/sgemm.jl | 56 ---------- 7 files changed, 266 insertions(+), 175 deletions(-) delete mode 100644 benchmark/sgemm.jl diff --git a/.gitignore b/.gitignore index 3b09cfb4..c2af1d47 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,10 @@ logging/* debug debug/* +# benchmark outputs +benchmark/results +benchmark/results/* + compile_wrapper.sh *.tar.gz diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl 
index f93293e6..8a87dd38 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -1,4 +1,5 @@ -using CSV +using Printf +using Statistics """ - `n_warmup::Int` : Number of warmup steps. These are not timed. Intended @@ -14,8 +15,8 @@ using CSV Base.@kwdef struct GlobalSettings n_warmup::Int # Number of warmup steps, where timing is not done. n_iter::Int # Number of iterations to run per trial - n_trial::Int # Number of independent trials to run. Benchmark - n_gpu::Int + n_trial::Int = 1 # Number of independent trials to run. Benchmark + n_gpu::Int = 0 end ######################################### @@ -29,6 +30,8 @@ Base.@kwdef struct GEMM{T} <: AbstractBenchmark{T} M::Int end +name(::GEMM) = "sgemm" +dims(g::GEMM) = (g.N, g.M) data(g::GEMM{T}) where {T} = "GEMM with T=$(T), N=$(g.N), M=$(g.M)" function allowed_types(::Type{GEMM}) @@ -38,11 +41,12 @@ end total_flops(s::GEMM) = s.N * s.N * ((2*s.M) - 1) total_space(s::GEMM{T}) where {T} = 2 * ((s.N*s.M) * sizeof(T)) + ((s.N*s.N) * sizeof(T)) -function initialize_cpu(s::GEMM{T}) where {T} - A = rand(T, s.N, s.M) - B = rand(T, s.M, s.N) - C = zeros(T, s.N, s.N) - return A, B, C +function initialize(s::GEMM{T}) where {T} + A = cuNumeric.rand(T, s.N, s.M) + B = cuNumeric.rand(T, s.M, s.N) + C = cuNumeric.zeros(T, s.N, s.N) + GC.gc() + return C, A, B end run!(::GEMM, C, A, B) = mul!(C, A, B) @@ -60,24 +64,179 @@ end allowed_types(::Type{MonteCarloIntegration}) = cuNumeric.SUPPORTED_FLOAT_TYPES total_space(s::MonteCarloIntegration{T}) where {T} = s.n_samples * sizeof(T) -total_flops(s::MonteCarloIntegration) = missing # cannot estimate FLOPS for squaring or exp easily - -function initialize_cpu(s::MonteCarloIntegration{T}) where {T} - return T(10) .* rand(T, s.n_samples) .+ T(-5) # random samples in [-5, 5] -end +total_flops(s::MonteCarloIntegration) = s.n_samples _domain_volume(mci::MonteCarloIntegration{T}) where {T} = T(10) / mci.n_samples run!(mci::MonteCarloIntegration, x) = _domain_volume(mci) * 
sum(exp.(-x .^ 2)) ######################################### -struct BenchmarkResult{T,B<:AbstractBenchmark} - times_ms::T - gflops::T +struct GSParams{T} + dx::T + dt::T + c_u::T + c_v::T + f::T + k::T +end + +function GSParams{T}(; dx=1, c_u=1.0, c_v=0.3, f=0.03, k=0.06) where {T} + GSParams{T}(T(dx), T(dx / 5), T(c_u), T(c_v), T(f), T(k)) +end + +Base.@kwdef struct GrayScott{T} <: AbstractBenchmark{T} + N::Int + M::Int +end + +name(::GrayScott) = "grayscott" +dims(b::GrayScott) = (b.N, b.M) +data(b::GrayScott{T}) where {T} = "GrayScott with T=$(T), N=$(b.N), M=$(b.M)" +allowed_types(::Type{GrayScott}) = cuNumeric.SUPPORTED_FLOAT_TYPES +total_flops(b::GrayScott) = b.N * b.M # grid points updated per step + +mutable struct GrayScottState{A,P} + u::A + v::A + u_new::A + v_new::A + params::P +end + +function initialize(b::GrayScott{T}) where {T} + d = (b.N, b.M) + u = cuNumeric.ones(T, d) + v = cuNumeric.zeros(T, d) + u_new = cuNumeric.zeros(T, d) + v_new = cuNumeric.zeros(T, d) + + seed = min(150, b.N, b.M) + u[1:seed, 1:seed] = cuNumeric.rand(T, (seed, seed)) + v[1:seed, 1:seed] = cuNumeric.rand(T, (seed, seed)) + + return (GrayScottState(u, v, u_new, v_new, GSParams{T}()),) +end + +function _gs_step!(u, v, u_new, v_new, args::GSParams) + # currently we don't have NDArray^x working yet. 
+ F_u = ( + ( + -u[2:(end - 1), 2:(end - 1)] .* + (v[2:(end - 1), 2:(end - 1)] .* v[2:(end - 1), 2:(end - 1)]) + ) + args.f * (1 .- u[2:(end - 1), 2:(end - 1)]) + ) + F_v = ( + ( + u[2:(end - 1), 2:(end - 1)] .* + (v[2:(end - 1), 2:(end - 1)] .* v[2:(end - 1), 2:(end - 1)]) + ) - (args.f + args.k) * v[2:(end - 1), 2:(end - 1)] + ) + # 2-D Laplacian via slicing, excluding boundaries + u_lap = ( + ( + u[3:end, 2:(end - 1)] - 2 * u[2:(end - 1), 2:(end - 1)] + + u[1:(end - 2), 2:(end - 1)] + ) ./ args.dx^2 + + ( + u[2:(end - 1), 3:end] - 2 * u[2:(end - 1), 2:(end - 1)] + + u[2:(end - 1), 1:(end - 2)] + ) ./ args.dx^2 + ) + v_lap = ( + ( + v[3:end, 2:(end - 1)] - 2 * v[2:(end - 1), 2:(end - 1)] + + v[1:(end - 2), 2:(end - 1)] + ) ./ args.dx^2 + + ( + v[2:(end - 1), 3:end] - 2 * v[2:(end - 1), 2:(end - 1)] + + v[2:(end - 1), 1:(end - 2)] + ) ./ args.dx^2 + ) + + # Forward-Euler step for all interior points + u_new[2:(end - 1), 2:(end - 1)] = + ((args.c_u * u_lap) + F_u) * args.dt + u[2:(end - 1), 2:(end - 1)] + v_new[2:(end - 1), 2:(end - 1)] = + ((args.c_v * v_lap) + F_v) * args.dt + v[2:(end - 1), 2:(end - 1)] + + # Periodic boundary conditions + u_new[:, 1] = u[:, end - 1] + u_new[:, end] = u[:, 2] + u_new[1, :] = u[end - 1, :] + u_new[end, :] = u[2, :] + v_new[:, 1] = v[:, end - 1] + v_new[:, end] = v[:, 2] + v_new[1, :] = v[end - 1, :] + v_new[end, :] = v[2, :] +end + +function run!(::GrayScott, st::GrayScottState) + _gs_step!(st.u, st.v, st.u_new, st.v_new, st.params) + # swap references rather than copy + st.u, st.u_new = st.u_new, st.u + st.v, st.v_new = st.v_new, st.v + return nothing +end + +######################################### + +# Maps the benchmarks.toml table name to its benchmark type. Add new benchmarks here. +const BENCHMARKS = Dict{String,Type}( + "sgemm" => GEMM, + "grayscott" => GrayScott, +) + +# Per-trial timings for one benchmark. 
`times_ms[i]`/`gflops[i]` are the mean +# over `n_iter` iterations for trial `i`; the spread across trials gives stddev. +struct BenchmarkResult{B<:AbstractBenchmark} + times_ms::Vector{Float64} + gflops::Vector{Float64} benchmark::B end -function save(br::BenchmarkResult) - # Compute standard error and mean time and save to - # some kind of file. +# One timed trial: warmup, then time `n_iter` iterations of `run!`. +function _trial(b::AbstractBenchmark, gs::GlobalSettings) + GC.gc(true) + state = initialize(b) + + start_time = zero(get_time_microseconds()) + for idx in 1:(gs.n_warmup + gs.n_iter) + if idx == gs.n_warmup + 1 + start_time = get_time_microseconds() + end + run!(b, state...) + end + total_time_μs = get_time_microseconds() - start_time + + mean_time_ms = total_time_μs / (gs.n_iter * 1e3) + gflops = total_flops(b) / (mean_time_ms * 1e6) + return mean_time_ms, gflops +end + +# Run `n_trial` independent trials and collect their per-trial measurements. +function run_benchmark(b::AbstractBenchmark, gs::GlobalSettings) + times_ms = Float64[] + gflops = Float64[] + for _ in 1:gs.n_trial + t, g = _trial(b, gs) + push!(times_ms, t) + push!(gflops, g) + end + return BenchmarkResult(times_ms, gflops, b) +end + +_std(x) = length(x) > 1 ? 
std(x) : 0.0 + +function save_result(br::BenchmarkResult, gpus) + N, M = dims(br.benchmark) + path = joinpath(@__DIR__, "results", "$(name(br.benchmark)).csv") + mkpath(dirname(path)) + open(path, "a") do io + for trial in eachindex(br.times_ms) + @printf( + io, "%s,%d,%d,%d,%d,%.6f,%.6f\n", + "cunumeric", gpus, N, M, trial, br.times_ms[trial], br.gflops[trial], + ) + end + end end diff --git a/benchmark/benchmarks.toml b/benchmark/benchmarks.toml index acdeec45..472e13bc 100644 --- a/benchmark/benchmarks.toml +++ b/benchmark/benchmarks.toml @@ -1,22 +1,16 @@ [Global] -N_WARMUP = 2 -N_ITER = 100 -N_GPU = 1 +n_warmup = 5 +n_iter = 1000 +n_trial = 5 -[[GEMM]] -T = "Float64" -N = 100 -M = 100 +[[grayscott]] +gpus = 1 +cpus = 2 +N = 1000 +M = 1000 -[[GEMM]] -T = "Float32" +[[sgemm]] +gpus = 1 +cpus = 2 N = 150 M = 150 - -[[MonteCarloIntegration]] -T = "Float64" -N_samples = 1_000_000 - -[[MonteCarloIntegration]] -T = "Float32" -N_samples = 10_000_000 diff --git a/benchmark/parse_benchmarks.jl b/benchmark/parse_benchmarks.jl index f25d4836..b11ef983 100644 --- a/benchmark/parse_benchmarks.jl +++ b/benchmark/parse_benchmarks.jl @@ -1,39 +1,31 @@ using TOML -function to_symbol_dict(d) - return Dict(Symbol(k) => v for (k, v) in d) +""" +One benchmark invocation parsed from `benchmarks.toml`. `name` selects the +benchmark type from `BENCHMARKS`; `args` are the sizes (currently `N M`). +""" +struct BenchmarkSpec + name::String + gpus::Int + cpus::Int + args::Vector{Int} end function parse_config(path) raw = TOML.parsefile(path) - global_settings = GlobalSettings(; to_symbol_dict(raw["Global"])...) 
- - benchmarks = AbstractBenchmark[] + g = raw["Global"] + global_settings = GlobalSettings(; + n_warmup=g["n_warmup"], n_iter=g["n_iter"], n_trial=get(g, "n_trial", 1) + ) + specs = BenchmarkSpec[] for (name, entries) in raw name == "Global" && continue - - # Convert name parsed as String, to actual type - BenchmarkType = getproperty(Main, Symbol(name)) - - for entry in entries - # Convert type parsed as String, to actual type - T = getproperty(Main, Symbol(entry["T"])) - - params = Dict{Symbol,Any}() - for (k, v) in entry - k == "T" && continue - params[Symbol(k)] = v - end - - if T <: allowed_types(BenchmarkType) - push!(benchmarks, BenchmarkType{T}(; params...)) - else - @warn "$(BenchmarkType) does not support benchmarking with type $(T). Skipping." - end + for e in entries + push!(specs, BenchmarkSpec(name, e["gpus"], e["cpus"], [e["N"], e["M"]])) end end - return global_settings, benchmarks + return global_settings, specs end diff --git a/benchmark/run.jl b/benchmark/run.jl index b090c8ef..58990e14 100644 --- a/benchmark/run.jl +++ b/benchmark/run.jl @@ -1,70 +1,66 @@ -using Random: Random -using cuNumeric: cuNumeric +# run.jl: orchestrator, one child per benchmarks.toml entry. With args +# ( ) it runs one benchmark, e.g. +# `julia run.jl 1 grayscott 1000 1000 100 5 5` +# Separate child per benchmark since LEGATE_CONFIG must be set before julia starts. include("benchmarks.jl") include("parse_benchmarks.jl") -function benchmark(b::AbstractBenchmark, gs::GlobalSettings, arrs) - GC.gc(; full=true) +function run_all_benchmarks(config="benchmarks.toml") + gs, specs = parse_config(joinpath(@__DIR__, config)) - for idx in range(1, gs.n_iter + gs.n_warmup) - if idx == gs.n_warmup + 1 - start_time = get_time_microseconds() - end - - run!(b, arrays...) 
- end - total_time_μs = get_time_microseconds() - start_time - mean_time_ms = total_time_μs / (gs.n_iter * 1e3) - gflops = total_flops(N, M) / (mean_time_ms * 1e6) + runner = joinpath(@__DIR__, "run_benchmark.sh") + self = @__FILE__ - GC.gc(; full=true) - - return mean_time_ms, gflops -end - -function run_all_benchmarks() - global_settings, benchmarks = parse_config("benchmarks.toml") - - @show global_settings - @show benchmarks - - cunumeric_results = BenchmarkResult[] - cuda_results = BenchmarkResult[] + for spec in specs + if !haskey(BENCHMARKS, spec.name) + @warn "No benchmark registered for '$(spec.name)'; skipping." + continue + end - for b in benchmarks + N, M = spec.args + println("\n================================") + println( + "$(spec.name): gpus=$(spec.gpus) cpus=$(spec.cpus) N=$(N) M=$(M) " * + "n_iter=$(gs.n_iter) n_warmup=$(gs.n_warmup) n_trial=$(gs.n_trial)", + ) println("================================") - println(data(b)) - println("================================") - - cn_times_ms = Vector{Float64}(undef, global_settings.n_trial) - cn_gflops = Vector{Union{Missing,Float64}}(undef, global_settings.n_trial) - - cuda_times_ms = Vector{Float64}(undef, global_settings.n_trial) - cuda_gflops = Vector{Union{Missing,Float64}}(undef, global_settings.n_trial) - - for i in 1:global_settings.n_trial - arrs_julia = initialize_cpu(b) - - arrs_cunumeric = # TODO - cn_times_ms[i], cn_gflops[i] = benchmark(b, arrs_cunumeric...) - push - if gs.n_gpu == 1 - arrs_cuda = # TODO - cuda_times_ms[i], cuda_gflops[i] = benchmark(b, arrs_cuda...) - push!(cuda_results, res_cuda) - end + cmd = `bash $runner $self --gpus $(spec.gpus) --cpus $(spec.cpus) $(spec.name) $N $M $(gs.n_iter) $(gs.n_warmup) $(gs.n_trial)` + try + run(cmd) + catch e + @error "Benchmark '$(spec.name)' failed; continuing." 
exception = e end + end +end - cn_result = BenchmarkResult(cn_times_ms, cn_gflops, b) - cuda_result = BenchmarkResult(cuda_times_ms, cuda_gflops, b) +function run_single(gpus, name, N, M, n_iter, n_warmup, n_trial) + b = BENCHMARKS[name]{Float32}(; N=N, M=M) + gs = GlobalSettings(; n_warmup=n_warmup, n_iter=n_iter, n_trial=n_trial) - push!(cunumeric_results, cn_result) - push!(cuda_results, cuda_result) - end + println( + "[cuNumeric] $(name) benchmark on $(N)x$(M) for $(n_iter) iterations " * + "($(n_warmup) warmup) x $(n_trial) trials", + ) + br = run_benchmark(b, gs) + @printf("[cuNumeric] Mean Run Time: %.5f ± %.5f ms\n", mean(br.times_ms), _std(br.times_ms)) + @printf("[cuNumeric] FLOPS: %.5f ± %.5f GFLOPS\n", mean(br.gflops), _std(br.gflops)) - # Call the `save` function for the cuda_results - # This function is not implemeneted as I was not sure how to do it + save_result(br, gpus) +end +if isempty(ARGS) + run_all_benchmarks() +else + using cuNumeric + using LinearAlgebra + gpus = parse(Int, ARGS[1]) + bench_name = ARGS[2] + N = parse(Int, ARGS[3]) + M = parse(Int, ARGS[4]) + n_iter = parse(Int, ARGS[5]) + n_warmup = parse(Int, ARGS[6]) + n_trial = parse(Int, ARGS[7]) + run_single(gpus, bench_name, N, M, n_iter, n_warmup, n_trial) end diff --git a/benchmark/run_benchmark.sh b/benchmark/run_benchmark.sh index 07a97a05..ef54dfa5 100755 --- a/benchmark/run_benchmark.sh +++ b/benchmark/run_benchmark.sh @@ -43,14 +43,16 @@ if [[ $GPUS -lt 0 ]]; then fi if [[ $CPUS -lt 0 ]]; then - echo "CPUs ivnalid, using cpus = 1" + echo "CPUs invalid, using cpus = 1" exit fi -export LEGATE_AUTO_CONFIG=0 -export LEGATE_CONFIG="--cpus=1 --gpus=$GPUS --omps=$CPUS --ompthreads=3 --utility=2 --sysmem=256 --numamem=19029 --fbmem=7569 --zcmem=128 --regmem=0" +export LEGATE_AUTO_CONFIG=1 +export LEGATE_CONFIG="--cpus=$CPUS --gpus=$GPUS" export LEGATE_SHOW_CONFIG=1 +export LD_LIBRARY_PATH="" + echo "Running $FILENAME with $CPUS CPUs and $GPUS GPUs" CMD="julia --project='..' 
$FILENAME $GPUS ${EXTRA_ARGS[@]}" diff --git a/benchmark/sgemm.jl b/benchmark/sgemm.jl deleted file mode 100644 index 28d9ad7c..00000000 --- a/benchmark/sgemm.jl +++ /dev/null @@ -1,56 +0,0 @@ -using cuNumeric -using LinearAlgebra -using Printf - -function initialize_cunumeric(N, M) - A = cuNumeric.as_type(cuNumeric.rand(NDArray, N, M), Float32) - B = cuNumeric.as_type(cuNumeric.rand(NDArray, M, N), Float32) - C = cuNumeric.zeros(Float32, N, N) - GC.gc() # remove the intermediate FP64 arrays - return A, B, C -end - -function total_flops(N, M) - return N * N * ((2*M) - 1) -end - -function total_space(N, M) - return 2 * (N*M) * sizeof(Float32) + (N*N) * sizeof(Float32) -end - -function gemm_cunumeric(N, M, n_samples, n_warmup) - A, B, C = initialize_cunumeric(N, M) - - start_time = nothing - for idx in range(1, n_samples + n_warmup) - if idx == n_warmup + 1 - start_time = get_time_microseconds() - end - - mul!(C, A, B) - end - total_time_μs = get_time_microseconds() - start_time - mean_time_ms = total_time_μs / (n_samples * 1e3) - gflops = total_flops(N, M) / (mean_time_ms * 1e6) # GFLOP is 1e9 - - return mean_time_ms, gflops -end - -gpus = parse(Int, ARGS[1]) -N = parse(Int, ARGS[2]) -M = parse(Int, ARGS[3]) -n_samples = parse(Int, ARGS[4]) -n_warmup = parse(Int, ARGS[5]) - -println( - "[cuNumeric] MATMUL benchmark on $(N)x$(M) matricies for $(n_samples) iterations, $(n_warmup) warmups" -) - -mean_time_ms, gflops = gemm_cunumeric(N, M, n_samples, n_warmup) - -println("[cuNumeric] Mean Run Time: $(mean_time_ms) ms") -println("[cuNumeric] FLOPS: $(gflops) GFLOPS") - -open("./gemm.csv", "a") do io - @printf(io, "%s,%d,%d,%d,%.6f,%.6f\n", "cunumeric", gpus, N, M, mean_time_ms, gflops) -end From cbd9a3a3b449fa7f6f1a5091258d83af601e8217 Mon Sep 17 00:00:00 2001 From: krasow Date: Tue, 2 Jun 2026 21:10:41 -0500 Subject: [PATCH 05/17] bring back T config --- benchmark/benchmarks.toml | 9 +++++++++ benchmark/parse_benchmarks.jl | 9 +++++++-- benchmark/run.jl | 31 
++++++++++++++++++------------- 3 files changed, 34 insertions(+), 15 deletions(-) diff --git a/benchmark/benchmarks.toml b/benchmark/benchmarks.toml index 472e13bc..7d70c807 100644 --- a/benchmark/benchmarks.toml +++ b/benchmark/benchmarks.toml @@ -4,12 +4,21 @@ n_iter = 1000 n_trial = 5 [[grayscott]] +T = "Float64" gpus = 1 cpus = 2 N = 1000 M = 1000 [[sgemm]] +T = "Float32" +gpus = 1 +cpus = 2 +N = 150 +M = 150 + +[[sgemm]] +T = "Float64" gpus = 1 cpus = 2 N = 150 diff --git a/benchmark/parse_benchmarks.jl b/benchmark/parse_benchmarks.jl index b11ef983..cf79cf20 100644 --- a/benchmark/parse_benchmarks.jl +++ b/benchmark/parse_benchmarks.jl @@ -2,10 +2,12 @@ using TOML """ One benchmark invocation parsed from `benchmarks.toml`. `name` selects the -benchmark type from `BENCHMARKS`; `args` are the sizes (currently `N M`). +benchmark type from `BENCHMARKS`; `T` is the element type (e.g. "Float32"); +`args` are the sizes (currently `N M`). """ struct BenchmarkSpec name::String + T::String gpus::Int cpus::Int args::Vector{Int} @@ -23,7 +25,10 @@ function parse_config(path) for (name, entries) in raw name == "Global" && continue for e in entries - push!(specs, BenchmarkSpec(name, e["gpus"], e["cpus"], [e["N"], e["M"]])) + push!( + specs, + BenchmarkSpec(name, get(e, "T", "Float32"), e["gpus"], e["cpus"], [e["N"], e["M"]]), + ) end end diff --git a/benchmark/run.jl b/benchmark/run.jl index 58990e14..72d3036c 100644 --- a/benchmark/run.jl +++ b/benchmark/run.jl @@ -1,6 +1,6 @@ # run.jl: orchestrator, one child per benchmarks.toml entry. With args -# ( ) it runs one benchmark, e.g. -# `julia run.jl 1 grayscott 1000 1000 100 5 5` +# ( ) it runs one benchmark, e.g. +# `julia run.jl 1 grayscott Float32 1000 1000 100 5 5` # Separate child per benchmark since LEGATE_CONFIG must be set before julia starts. 
include("benchmarks.jl") @@ -21,12 +21,12 @@ function run_all_benchmarks(config="benchmarks.toml") N, M = spec.args println("\n================================") println( - "$(spec.name): gpus=$(spec.gpus) cpus=$(spec.cpus) N=$(N) M=$(M) " * + "$(spec.name): T=$(spec.T) gpus=$(spec.gpus) cpus=$(spec.cpus) N=$(N) M=$(M) " * "n_iter=$(gs.n_iter) n_warmup=$(gs.n_warmup) n_trial=$(gs.n_trial)", ) println("================================") - cmd = `bash $runner $self --gpus $(spec.gpus) --cpus $(spec.cpus) $(spec.name) $N $M $(gs.n_iter) $(gs.n_warmup) $(gs.n_trial)` + cmd = `bash $runner $self --gpus $(spec.gpus) --cpus $(spec.cpus) $(spec.name) $(spec.T) $N $M $(gs.n_iter) $(gs.n_warmup) $(gs.n_trial)` try run(cmd) catch e @@ -35,12 +35,16 @@ function run_all_benchmarks(config="benchmarks.toml") end end -function run_single(gpus, name, N, M, n_iter, n_warmup, n_trial) - b = BENCHMARKS[name]{Float32}(; N=N, M=M) +# Resolve a TOML type string like "Float32" to the actual Julia type. +parse_type(s) = getfield(Base, Symbol(s))::DataType + +function run_single(gpus, name, T_str, N, M, n_iter, n_warmup, n_trial) + T = parse_type(T_str) + b = BENCHMARKS[name]{T}(; N=N, M=M) gs = GlobalSettings(; n_warmup=n_warmup, n_iter=n_iter, n_trial=n_trial) println( - "[cuNumeric] $(name) benchmark on $(N)x$(M) for $(n_iter) iterations " * + "[cuNumeric] $(name) benchmark ($(T)) on $(N)x$(M) for $(n_iter) iterations " * "($(n_warmup) warmup) x $(n_trial) trials", ) br = run_benchmark(b, gs) @@ -57,10 +61,11 @@ else using LinearAlgebra gpus = parse(Int, ARGS[1]) bench_name = ARGS[2] - N = parse(Int, ARGS[3]) - M = parse(Int, ARGS[4]) - n_iter = parse(Int, ARGS[5]) - n_warmup = parse(Int, ARGS[6]) - n_trial = parse(Int, ARGS[7]) - run_single(gpus, bench_name, N, M, n_iter, n_warmup, n_trial) + T_str = ARGS[3] + N = parse(Int, ARGS[4]) + M = parse(Int, ARGS[5]) + n_iter = parse(Int, ARGS[6]) + n_warmup = parse(Int, ARGS[7]) + n_trial = parse(Int, ARGS[8]) + run_single(gpus, bench_name, 
T_str, N, M, n_iter, n_warmup, n_trial) end From fcfd749331dd29b97326329b9c9caf7ec7763888 Mon Sep 17 00:00:00 2001 From: krasow Date: Tue, 2 Jun 2026 22:16:38 -0500 Subject: [PATCH 06/17] modularize some stuff. run.jl (does all) -> run_benchmark.sh (shell script to set legate config and call worker) -> benchmark/single.jl (worker to call benchmark) --- benchmark/benchmarks.toml | 12 ++++ benchmark/run.jl | 96 ++++++++++--------------- benchmark/{ => src}/benchmarks.jl | 19 ++++- benchmark/{ => src}/parse_benchmarks.jl | 5 +- benchmark/src/single.jl | 37 ++++++++++ 5 files changed, 110 insertions(+), 59 deletions(-) rename benchmark/{ => src}/benchmarks.jl (90%) rename benchmark/{ => src}/parse_benchmarks.jl (81%) create mode 100644 benchmark/src/single.jl diff --git a/benchmark/benchmarks.toml b/benchmark/benchmarks.toml index 7d70c807..deb58d53 100644 --- a/benchmark/benchmarks.toml +++ b/benchmark/benchmarks.toml @@ -23,3 +23,15 @@ gpus = 1 cpus = 2 N = 150 M = 150 + +[[montecarlo]] +T = "Float64" +gpus = 1 +cpus = 2 +N = 1_000_000 + +[[montecarlo]] +T = "Float32" +gpus = 1 +cpus = 2 +N = 1_000_000 diff --git a/benchmark/run.jl b/benchmark/run.jl index 72d3036c..0d22ad0d 100644 --- a/benchmark/run.jl +++ b/benchmark/run.jl @@ -1,71 +1,53 @@ -# run.jl: orchestrator, one child per benchmarks.toml entry. With args -# ( ) it runs one benchmark, e.g. -# `julia run.jl 1 grayscott Float32 1000 1000 100 5 5` -# Separate child per benchmark since LEGATE_CONFIG must be set before julia starts. +# run.jl: orchestrator. Builds one run_benchmark.sh command per benchmark and +# dispatches it; the script sets LEGATE_CONFIG (from --gpus/--cpus) before +# launching the worker (single.jl) that actually runs the benchmark. 
+# no args -> one command per benchmarks.toml entry +# with args -> one command from -include("benchmarks.jl") -include("parse_benchmarks.jl") +include("src/benchmarks.jl") +include("src/parse_benchmarks.jl") -function run_all_benchmarks(config="benchmarks.toml") - gs, specs = parse_config(joinpath(@__DIR__, config)) +const RUNNER = joinpath(@__DIR__, "run_benchmark.sh") +const WORKER = joinpath(@__DIR__, "src/single.jl") - runner = joinpath(@__DIR__, "run_benchmark.sh") - self = @__FILE__ +banner(msg) = println("\n", "="^128, "\n", msg, "\n", "="^128) - for spec in specs - if !haskey(BENCHMARKS, spec.name) - @warn "No benchmark registered for '$(spec.name)'; skipping." - continue - end +function dispatch(; gpus, cpus, name, T, N, M, n_iter, n_warmup, n_trial) + if !haskey(BENCHMARKS, name) + @warn "No benchmark registered for '$(name)'; skipping." + return nothing + end - N, M = spec.args - println("\n================================") - println( - "$(spec.name): T=$(spec.T) gpus=$(spec.gpus) cpus=$(spec.cpus) N=$(N) M=$(M) " * - "n_iter=$(gs.n_iter) n_warmup=$(gs.n_warmup) n_trial=$(gs.n_trial)", - ) - println("================================") + banner( + "$(name): T=$(T) gpus=$(gpus) cpus=$(cpus) N=$(N) M=$(M) " * + "n_iter=$(n_iter) n_warmup=$(n_warmup) n_trial=$(n_trial)", + ) - cmd = `bash $runner $self --gpus $(spec.gpus) --cpus $(spec.cpus) $(spec.name) $(spec.T) $N $M $(gs.n_iter) $(gs.n_warmup) $(gs.n_trial)` - try - run(cmd) - catch e - @error "Benchmark '$(spec.name)' failed; continuing." exception = e - end + cmd = `bash $RUNNER $WORKER --gpus $gpus --cpus $cpus $name $T $N $M $n_iter $n_warmup $n_trial` + try + run(cmd) + catch e + @error "Benchmark '$(name)' failed; continuing." exception = e end end -# Resolve a TOML type string like "Float32" to the actual Julia type. 
-parse_type(s) = getfield(Base, Symbol(s))::DataType - -function run_single(gpus, name, T_str, N, M, n_iter, n_warmup, n_trial) - T = parse_type(T_str) - b = BENCHMARKS[name]{T}(; N=N, M=M) - gs = GlobalSettings(; n_warmup=n_warmup, n_iter=n_iter, n_trial=n_trial) - - println( - "[cuNumeric] $(name) benchmark ($(T)) on $(N)x$(M) for $(n_iter) iterations " * - "($(n_warmup) warmup) x $(n_trial) trials", - ) - br = run_benchmark(b, gs) - @printf("[cuNumeric] Mean Run Time: %.5f ± %.5f ms\n", mean(br.times_ms), _std(br.times_ms)) - @printf("[cuNumeric] FLOPS: %.5f ± %.5f GFLOPS\n", mean(br.gflops), _std(br.gflops)) - - save_result(br, gpus) +function run_all_benchmarks(config="benchmarks.toml") + gs, specs = parse_config(joinpath(@__DIR__, config)) + for spec in specs + N, M = spec.args + dispatch(; + gpus=spec.gpus, cpus=spec.cpus, name=spec.name, T=spec.T, N=N, M=M, + n_iter=gs.n_iter, n_warmup=gs.n_warmup, n_trial=gs.n_trial, + ) + end end if isempty(ARGS) run_all_benchmarks() -else - using cuNumeric - using LinearAlgebra - gpus = parse(Int, ARGS[1]) - bench_name = ARGS[2] - T_str = ARGS[3] - N = parse(Int, ARGS[4]) - M = parse(Int, ARGS[5]) - n_iter = parse(Int, ARGS[6]) - n_warmup = parse(Int, ARGS[7]) - n_trial = parse(Int, ARGS[8]) - run_single(gpus, bench_name, T_str, N, M, n_iter, n_warmup, n_trial) +else # dispatch on args + dispatch(; + gpus=parse(Int, ARGS[1]), cpus=parse(Int, ARGS[2]), name=ARGS[3], T=ARGS[4], + N=parse(Int, ARGS[5]), M=parse(Int, ARGS[6]), + n_iter=parse(Int, ARGS[7]), n_warmup=parse(Int, ARGS[8]), n_trial=parse(Int, ARGS[9]), + ) end diff --git a/benchmark/benchmarks.jl b/benchmark/src/benchmarks.jl similarity index 90% rename from benchmark/benchmarks.jl rename to benchmark/src/benchmarks.jl index 8a87dd38..2cd17cdf 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/src/benchmarks.jl @@ -57,6 +57,8 @@ Base.@kwdef struct MonteCarloIntegration{T} <: AbstractBenchmark{T} n_samples::Int end +name(::MonteCarloIntegration) = "montecarlo" 
+dims(mci::MonteCarloIntegration) = (mci.n_samples, 1) function data(mci::MonteCarloIntegration{T}) where {T} "Monte Carlo Integration with T=$(T), n_samples=$(mci.n_samples)" end @@ -66,6 +68,13 @@ allowed_types(::Type{MonteCarloIntegration}) = cuNumeric.SUPPORTED_FLOAT_TYPES total_space(s::MonteCarloIntegration{T}) where {T} = s.n_samples * sizeof(T) total_flops(s::MonteCarloIntegration) = s.n_samples +function initialize(mci::MonteCarloIntegration{T}) where {T} + # Uniform samples over the integration domain [0, 10]. + x = T(10) .* cuNumeric.rand(T, mci.n_samples) + GC.gc() + return (x,) +end + _domain_volume(mci::MonteCarloIntegration{T}) where {T} = T(10) / mci.n_samples run!(mci::MonteCarloIntegration, x) = _domain_volume(mci) * sum(exp.(-x .^ 2)) @@ -184,8 +193,16 @@ end const BENCHMARKS = Dict{String,Type}( "sgemm" => GEMM, "grayscott" => GrayScott, + "montecarlo" => MonteCarloIntegration, ) +# Construct a benchmark from the orchestrator's positional sizes. Most benchmarks +# use (N, M); MonteCarloIntegration uses N as its sample count and ignores M. +build_benchmark(::Type{B}, ::Type{T}, N, M) where {B<:AbstractBenchmark,T} = B{T}(; N=N, M=M) +function build_benchmark(::Type{MonteCarloIntegration}, ::Type{T}, N, M) where {T} + MonteCarloIntegration{T}(; n_samples=N) +end + # Per-trial timings for one benchmark. `times_ms[i]`/`gflops[i]` are the mean # over `n_iter` iterations for trial `i`; the spread across trials gives stddev. struct BenchmarkResult{B<:AbstractBenchmark} @@ -229,7 +246,7 @@ _std(x) = length(x) > 1 ? 
std(x) : 0.0 function save_result(br::BenchmarkResult, gpus) N, M = dims(br.benchmark) - path = joinpath(@__DIR__, "results", "$(name(br.benchmark)).csv") + path = joinpath(@__DIR__, "..", "results", "$(name(br.benchmark)).csv") mkpath(dirname(path)) open(path, "a") do io for trial in eachindex(br.times_ms) diff --git a/benchmark/parse_benchmarks.jl b/benchmark/src/parse_benchmarks.jl similarity index 81% rename from benchmark/parse_benchmarks.jl rename to benchmark/src/parse_benchmarks.jl index cf79cf20..8fecedb3 100644 --- a/benchmark/parse_benchmarks.jl +++ b/benchmark/src/parse_benchmarks.jl @@ -27,7 +27,10 @@ function parse_config(path) for e in entries push!( specs, - BenchmarkSpec(name, get(e, "T", "Float32"), e["gpus"], e["cpus"], [e["N"], e["M"]]), + BenchmarkSpec( + name, get(e, "T", "Float32"), e["gpus"], e["cpus"], + [e["N"], get(e, "M", 1)], + ), ) end end diff --git a/benchmark/src/single.jl b/benchmark/src/single.jl new file mode 100644 index 00000000..e86419bf --- /dev/null +++ b/benchmark/src/single.jl @@ -0,0 +1,37 @@ +# single.jl: worker that runs exactly one benchmark. Launched by run_benchmark.sh +# (dispatched from run.jl), which sets LEGATE_CONFIG in the env before julia starts. +# Args: + +using cuNumeric +using LinearAlgebra + +include("benchmarks.jl") + +# Resolve a TOML type string like "Float32" to the actual Julia type. 
+parse_type(s) = getfield(Base, Symbol(s))::DataType + +function run_single(gpus, name, T_str, N, M, n_iter, n_warmup, n_trial) + T = parse_type(T_str) + b = build_benchmark(BENCHMARKS[name], T, N, M) + gs = GlobalSettings(; n_warmup=n_warmup, n_iter=n_iter, n_trial=n_trial) + + println( + "[cuNumeric] $(name) benchmark ($(T)) on $(N)x$(M) for $(n_iter) iterations " * + "($(n_warmup) warmup) x $(n_trial) trials", + ) + br = run_benchmark(b, gs) + @printf("[cuNumeric] Mean Run Time: %.5f ± %.5f ms\n", mean(br.times_ms), _std(br.times_ms)) + @printf("[cuNumeric] FLOPS: %.5f ± %.5f GFLOPS\n", mean(br.gflops), _std(br.gflops)) + + save_result(br, gpus) +end + +gpus = parse(Int, ARGS[1]) +bench_name = ARGS[2] +T_str = ARGS[3] +N = parse(Int, ARGS[4]) +M = parse(Int, ARGS[5]) +n_iter = parse(Int, ARGS[6]) +n_warmup = parse(Int, ARGS[7]) +n_trial = parse(Int, ARGS[8]) +run_single(gpus, bench_name, T_str, N, M, n_iter, n_warmup, n_trial) From 87a9f908ad25c51cf201c54213fc8093bda40c26 Mon Sep 17 00:00:00 2001 From: krasow Date: Tue, 2 Jun 2026 22:55:06 -0500 Subject: [PATCH 07/17] support multi list zipping configs --- benchmark/README.md | 53 +++++++++++++++++++++++++++++++ benchmark/benchmarks.toml | 38 ++++++++-------------- benchmark/src/parse_benchmarks.jl | 42 ++++++++++++++++++++---- 3 files changed, 102 insertions(+), 31 deletions(-) create mode 100644 benchmark/README.md diff --git a/benchmark/README.md b/benchmark/README.md new file mode 100644 index 00000000..658ce3ca --- /dev/null +++ b/benchmark/README.md @@ -0,0 +1,53 @@ +# Benchmark configuration + +Benchmarks are declared in `benchmarks.toml`. `run.jl` parses it. + +## Layout + +```toml +[Global] +n_warmup = 5 +n_iter = 1000 +n_trial = 5 + +[[sgemm]] # name registered in src/benchmarks.jl +T = "Float32" # element type +gpus = 1 +cpus = 2 +N = 150 +M = 150 # optional, defaults to 1 +``` + +Repeat a `[[name]]` block to add independent configs. + +## Lists + +Any of `T`, `gpus`, `cpus`, `N`, `M` may be a list. 
They expand along two axes: + +- **`T` multiplies.** The whole sweep runs once per type. +- **`gpus`, `cpus`, `N`, `M` zip** into a single lockstep sweep — element `i` + of each is paired together. + +Each zipped field must be one of: + +- a scalar or single-element list (`cpus = 2` or `[2]`) -> broadcast to every config +- a list whose length equals the sweep length + +Any other length mismatch is an error. + +```toml +[[sgemm]] +T = ["Float64", "Float32"] # multiplies +gpus = [1, 2, 4] # +cpus = 2 # zip -> (1,2,150,150), (2,2,300,300), (4,2,600,600) +N = [150, 300, 600] # +M = [150, 300, 600] # +``` + +-> 2 types * 3 sweep points = **6 runs**. + +### Gotcha + +When `T = ["Float32", "Float64"]` and a length-2 `N`/`M` sweep you get all **4** +combinations, not a paired `Float32 -> N[1], Float64 -> N[2]`. To pin a type +to a specific size, use separate `[[name]]` blocks. diff --git a/benchmark/benchmarks.toml b/benchmark/benchmarks.toml index deb58d53..16def1ef 100644 --- a/benchmark/benchmarks.toml +++ b/benchmark/benchmarks.toml @@ -3,35 +3,25 @@ n_warmup = 5 n_iter = 1000 n_trial = 5 -[[grayscott]] -T = "Float64" -gpus = 1 -cpus = 2 -N = 1000 -M = 1000 - -[[sgemm]] -T = "Float32" -gpus = 1 -cpus = 2 -N = 150 -M = 150 - +# GEMM: work ~ 2*N^2*M. Hold N, scale M. [[sgemm]] -T = "Float64" -gpus = 1 +T = ["Float32", "Float64"] +gpus = [1, 2, 4, 8] cpus = 2 -N = 150 -M = 150 +N = 4096 +M = [4096, 8192, 16384, 32768] -[[montecarlo]] -T = "Float64" -gpus = 1 +# Gray-Scott: work ~ N*M. Hold N, scale M. +[[grayscott]] +T = "Float32" +gpus = [1, 2, 4, 8] cpus = 2 -N = 1_000_000 +N = 1024 +M = [1024, 2048, 4096, 8192] +# Monte Carlo: work ~ N. Scale N linearly. 
[[montecarlo]] T = "Float32" -gpus = 1 +gpus = [1, 2, 4, 8] cpus = 2 -N = 1_000_000 +N = [1_000_000, 2_000_000, 4_000_000, 8_000_000] diff --git a/benchmark/src/parse_benchmarks.jl b/benchmark/src/parse_benchmarks.jl index 8fecedb3..c44518df 100644 --- a/benchmark/src/parse_benchmarks.jl +++ b/benchmark/src/parse_benchmarks.jl @@ -13,6 +13,24 @@ struct BenchmarkSpec args::Vector{Int} end +# A field may be a scalar or a list. +aslist(x) = x isa AbstractVector ? collect(x) : [x] + +# Value of a zipped field for sweep position `i`. length==1 field broadcasts. +sweep_value(field, i) = length(field) == 1 ? field[1] : field[i] + +# Number of positions in the sweep. Every multi-element field must agree on length; +# length==1 fields broadcast and don't constrain it. +function sweep_length(name, fields) + lengths = [length(field) for (_, field) in fields if length(field) > 1] + isempty(lengths) && return 1 + allequal(lengths) || error( + "benchmark '$(name)': zipped fields gpus/cpus/N/M must share one length " * + "or be scalar; got " * join(("$k=$(length(v))" for (k, v) in fields), ", "), + ) + return first(lengths) +end + function parse_config(path) raw = TOML.parsefile(path) @@ -25,13 +43,23 @@ function parse_config(path) for (name, entries) in raw name == "Global" && continue for e in entries - push!( - specs, - BenchmarkSpec( - name, get(e, "T", "Float32"), e["gpus"], e["cpus"], - [e["N"], get(e, "M", 1)], - ), - ) + types = aslist(get(e, "T", "Float32")) + gpus = aslist(e["gpus"]) + cpus = aslist(e["cpus"]) + N = aslist(e["N"]) + M = aslist(get(e, "M", 1)) + + n = sweep_length(name, ["gpus" => gpus, "cpus" => cpus, "N" => N, "M" => M]) + + for T in types, i in 1:n + push!( + specs, + BenchmarkSpec( + name, T, sweep_value(gpus, i), sweep_value(cpus, i), + [sweep_value(N, i), sweep_value(M, i)], + ), + ) + end end end From b312512a1b565dc8567749935d969cb58af86328 Mon Sep 17 00:00:00 2001 From: krasow Date: Tue, 2 Jun 2026 23:14:47 -0500 Subject: [PATCH 08/17] add 
benchmark registration separating different benchmarks into individual files in src/benchmarks --- benchmark/src/benchmarks.jl | 261 +------------------------ benchmark/src/benchmarks/gemm.jl | 27 +++ benchmark/src/benchmarks/grayscott.jl | 108 ++++++++++ benchmark/src/benchmarks/montecarlo.jl | 31 +++ benchmark/src/core.jl | 98 ++++++++++ 5 files changed, 269 insertions(+), 256 deletions(-) create mode 100644 benchmark/src/benchmarks/gemm.jl create mode 100644 benchmark/src/benchmarks/grayscott.jl create mode 100644 benchmark/src/benchmarks/montecarlo.jl create mode 100644 benchmark/src/core.jl diff --git a/benchmark/src/benchmarks.jl b/benchmark/src/benchmarks.jl index 2cd17cdf..4072b959 100644 --- a/benchmark/src/benchmarks.jl +++ b/benchmark/src/benchmarks.jl @@ -1,259 +1,8 @@ using Printf using Statistics -""" -- `n_warmup::Int` : Number of warmup steps. These are not timed. Intended - to avoid pre-compilation cost being timed. -- `n_iter::Int` : Number of iterations to run per trial. Should be large enough - to build up queue depth of tasks such that latency is hidden. -- `n_trial::Int` : Number of independent trials to run. Timing is restarted and - legate in between each trial. Sets number of datapoints used to estimated - standard deviations/errors. -- `n_gpu::Int` : The number of GPUs used by legate. Set through the LEGATE_CONFIG, - this value is just bookkeeping. -""" -Base.@kwdef struct GlobalSettings - n_warmup::Int # Number of warmup steps, where timing is not done. - n_iter::Int # Number of iterations to run per trial - n_trial::Int = 1 # Number of independent trials to run.
Benchmark - n_gpu::Int = 0 -end - -######################################### - -abstract type AbstractBenchmark{T} end - -######################################### - -Base.@kwdef struct GEMM{T} <: AbstractBenchmark{T} - N::Int - M::Int -end - -name(::GEMM) = "sgemm" -dims(g::GEMM) = (g.N, g.M) -data(g::GEMM{T}) where {T} = "GEMM with T=$(T), N=$(g.N), M=$(g.M)" - -function allowed_types(::Type{GEMM}) - Union{cuNumeric.SUPPORTED_FLOAT_TYPES,cuNumeric.SUPPORTED_INT_TYPES} -end - -total_flops(s::GEMM) = s.N * s.N * ((2*s.M) - 1) -total_space(s::GEMM{T}) where {T} = 2 * ((s.N*s.M) * sizeof(T)) + ((s.N*s.N) * sizeof(T)) - -function initialize(s::GEMM{T}) where {T} - A = cuNumeric.rand(T, s.N, s.M) - B = cuNumeric.rand(T, s.M, s.N) - C = cuNumeric.zeros(T, s.N, s.N) - GC.gc() - return C, A, B -end - -run!(::GEMM, C, A, B) = mul!(C, A, B) - -######################################### - -Base.@kwdef struct MonteCarloIntegration{T} <: AbstractBenchmark{T} - n_samples::Int -end - -name(::MonteCarloIntegration) = "montecarlo" -dims(mci::MonteCarloIntegration) = (mci.n_samples, 1) -function data(mci::MonteCarloIntegration{T}) where {T} - "Monte Carlo Integration with T=$(T), n_samples=$(mci.n_samples)" -end - -allowed_types(::Type{MonteCarloIntegration}) = cuNumeric.SUPPORTED_FLOAT_TYPES - -total_space(s::MonteCarloIntegration{T}) where {T} = s.n_samples * sizeof(T) -total_flops(s::MonteCarloIntegration) = s.n_samples - -function initialize(mci::MonteCarloIntegration{T}) where {T} - # Uniform samples over the integration domain [0, 10]. 
- x = T(10) .* cuNumeric.rand(T, mci.n_samples) - GC.gc() - return (x,) -end - -_domain_volume(mci::MonteCarloIntegration{T}) where {T} = T(10) / mci.n_samples -run!(mci::MonteCarloIntegration, x) = _domain_volume(mci) * sum(exp.(-x .^ 2)) - -######################################### - -struct GSParams{T} - dx::T - dt::T - c_u::T - c_v::T - f::T - k::T -end - -function GSParams{T}(; dx=1, c_u=1.0, c_v=0.3, f=0.03, k=0.06) where {T} - GSParams{T}(T(dx), T(dx / 5), T(c_u), T(c_v), T(f), T(k)) -end - -Base.@kwdef struct GrayScott{T} <: AbstractBenchmark{T} - N::Int - M::Int -end - -name(::GrayScott) = "grayscott" -dims(b::GrayScott) = (b.N, b.M) -data(b::GrayScott{T}) where {T} = "GrayScott with T=$(T), N=$(b.N), M=$(b.M)" -allowed_types(::Type{GrayScott}) = cuNumeric.SUPPORTED_FLOAT_TYPES -total_flops(b::GrayScott) = b.N * b.M # grid points updated per step - -mutable struct GrayScottState{A,P} - u::A - v::A - u_new::A - v_new::A - params::P -end - -function initialize(b::GrayScott{T}) where {T} - d = (b.N, b.M) - u = cuNumeric.ones(T, d) - v = cuNumeric.zeros(T, d) - u_new = cuNumeric.zeros(T, d) - v_new = cuNumeric.zeros(T, d) - - seed = min(150, b.N, b.M) - u[1:seed, 1:seed] = cuNumeric.rand(T, (seed, seed)) - v[1:seed, 1:seed] = cuNumeric.rand(T, (seed, seed)) - - return (GrayScottState(u, v, u_new, v_new, GSParams{T}()),) -end - -function _gs_step!(u, v, u_new, v_new, args::GSParams) - # currently we don't have NDArray^x working yet. 
- F_u = ( - ( - -u[2:(end - 1), 2:(end - 1)] .* - (v[2:(end - 1), 2:(end - 1)] .* v[2:(end - 1), 2:(end - 1)]) - ) + args.f * (1 .- u[2:(end - 1), 2:(end - 1)]) - ) - F_v = ( - ( - u[2:(end - 1), 2:(end - 1)] .* - (v[2:(end - 1), 2:(end - 1)] .* v[2:(end - 1), 2:(end - 1)]) - ) - (args.f + args.k) * v[2:(end - 1), 2:(end - 1)] - ) - # 2-D Laplacian via slicing, excluding boundaries - u_lap = ( - ( - u[3:end, 2:(end - 1)] - 2 * u[2:(end - 1), 2:(end - 1)] + - u[1:(end - 2), 2:(end - 1)] - ) ./ args.dx^2 + - ( - u[2:(end - 1), 3:end] - 2 * u[2:(end - 1), 2:(end - 1)] + - u[2:(end - 1), 1:(end - 2)] - ) ./ args.dx^2 - ) - v_lap = ( - ( - v[3:end, 2:(end - 1)] - 2 * v[2:(end - 1), 2:(end - 1)] + - v[1:(end - 2), 2:(end - 1)] - ) ./ args.dx^2 + - ( - v[2:(end - 1), 3:end] - 2 * v[2:(end - 1), 2:(end - 1)] + - v[2:(end - 1), 1:(end - 2)] - ) ./ args.dx^2 - ) - - # Forward-Euler step for all interior points - u_new[2:(end - 1), 2:(end - 1)] = - ((args.c_u * u_lap) + F_u) * args.dt + u[2:(end - 1), 2:(end - 1)] - v_new[2:(end - 1), 2:(end - 1)] = - ((args.c_v * v_lap) + F_v) * args.dt + v[2:(end - 1), 2:(end - 1)] - - # Periodic boundary conditions - u_new[:, 1] = u[:, end - 1] - u_new[:, end] = u[:, 2] - u_new[1, :] = u[end - 1, :] - u_new[end, :] = u[2, :] - v_new[:, 1] = v[:, end - 1] - v_new[:, end] = v[:, 2] - v_new[1, :] = v[end - 1, :] - v_new[end, :] = v[2, :] -end - -function run!(::GrayScott, st::GrayScottState) - _gs_step!(st.u, st.v, st.u_new, st.v_new, st.params) - # swap references rather than copy - st.u, st.u_new = st.u_new, st.u - st.v, st.v_new = st.v_new, st.v - return nothing -end - -######################################### - -# Maps the benchmarks.toml table name to its benchmark type. Add new benchmarks here. -const BENCHMARKS = Dict{String,Type}( - "sgemm" => GEMM, - "grayscott" => GrayScott, - "montecarlo" => MonteCarloIntegration, -) - -# Construct a benchmark from the orchestrator's positional sizes. 
Most benchmarks -# use (N, M); MonteCarloIntegration uses N as its sample count and ignores M. -build_benchmark(::Type{B}, ::Type{T}, N, M) where {B<:AbstractBenchmark,T} = B{T}(; N=N, M=M) -function build_benchmark(::Type{MonteCarloIntegration}, ::Type{T}, N, M) where {T} - MonteCarloIntegration{T}(; n_samples=N) -end - -# Per-trial timings for one benchmark. `times_ms[i]`/`gflops[i]` are the mean -# over `n_iter` iterations for trial `i`; the spread across trials gives stddev. -struct BenchmarkResult{B<:AbstractBenchmark} - times_ms::Vector{Float64} - gflops::Vector{Float64} - benchmark::B -end - -# One timed trial: warmup, then time `n_iter` iterations of `run!`. -function _trial(b::AbstractBenchmark, gs::GlobalSettings) - GC.gc(true) - state = initialize(b) - - start_time = zero(get_time_microseconds()) - for idx in 1:(gs.n_warmup + gs.n_iter) - if idx == gs.n_warmup + 1 - start_time = get_time_microseconds() - end - run!(b, state...) - end - total_time_μs = get_time_microseconds() - start_time - - mean_time_ms = total_time_μs / (gs.n_iter * 1e3) - gflops = total_flops(b) / (mean_time_ms * 1e6) - return mean_time_ms, gflops -end - -# Run `n_trial` independent trials and collect their per-trial measurements. -function run_benchmark(b::AbstractBenchmark, gs::GlobalSettings) - times_ms = Float64[] - gflops = Float64[] - for _ in 1:gs.n_trial - t, g = _trial(b, gs) - push!(times_ms, t) - push!(gflops, g) - end - return BenchmarkResult(times_ms, gflops, b) -end - -_std(x) = length(x) > 1 ? std(x) : 0.0 - -function save_result(br::BenchmarkResult, gpus) - N, M = dims(br.benchmark) - path = joinpath(@__DIR__, "..", "results", "$(name(br.benchmark)).csv") - mkpath(dirname(path)) - open(path, "a") do io - for trial in eachindex(br.times_ms) - @printf( - io, "%s,%d,%d,%d,%d,%.6f,%.6f\n", - "cunumeric", gpus, N, M, trial, br.times_ms[trial], br.gflops[trial], - ) - end - end -end +# Adding a benchmark is: drop a file in benchmarks/ and include it below. 
+include("core.jl") +include("benchmarks/gemm.jl") +include("benchmarks/grayscott.jl") +include("benchmarks/montecarlo.jl") diff --git a/benchmark/src/benchmarks/gemm.jl b/benchmark/src/benchmarks/gemm.jl new file mode 100644 index 00000000..09ff0c57 --- /dev/null +++ b/benchmark/src/benchmarks/gemm.jl @@ -0,0 +1,27 @@ +Base.@kwdef struct GEMM{T} <: AbstractBenchmark{T} + N::Int + M::Int +end + +name(::GEMM) = "sgemm" +dims(g::GEMM) = (g.N, g.M) +data(g::GEMM{T}) where {T} = "GEMM with T=$(T), N=$(g.N), M=$(g.M)" + +function allowed_types(::Type{GEMM}) + Union{cuNumeric.SUPPORTED_FLOAT_TYPES,cuNumeric.SUPPORTED_INT_TYPES} +end + +total_flops(s::GEMM) = s.N * s.N * ((2*s.M) - 1) +total_space(s::GEMM{T}) where {T} = 2 * ((s.N*s.M) * sizeof(T)) + ((s.N*s.N) * sizeof(T)) + +function initialize(s::GEMM{T}) where {T} + A = cuNumeric.rand(T, s.N, s.M) + B = cuNumeric.rand(T, s.M, s.N) + C = cuNumeric.zeros(T, s.N, s.N) + GC.gc() + return C, A, B +end + +run!(::GEMM, C, A, B) = mul!(C, A, B) + +register_benchmark("sgemm", GEMM) diff --git a/benchmark/src/benchmarks/grayscott.jl b/benchmark/src/benchmarks/grayscott.jl new file mode 100644 index 00000000..89b763bf --- /dev/null +++ b/benchmark/src/benchmarks/grayscott.jl @@ -0,0 +1,108 @@ +struct GSParams{T} + dx::T + dt::T + c_u::T + c_v::T + f::T + k::T +end + +function GSParams{T}(; dx=1, c_u=1.0, c_v=0.3, f=0.03, k=0.06) where {T} + GSParams{T}(T(dx), T(dx / 5), T(c_u), T(c_v), T(f), T(k)) +end + +Base.@kwdef struct GrayScott{T} <: AbstractBenchmark{T} + N::Int + M::Int +end + +name(::GrayScott) = "grayscott" +dims(b::GrayScott) = (b.N, b.M) +data(b::GrayScott{T}) where {T} = "GrayScott with T=$(T), N=$(b.N), M=$(b.M)" +allowed_types(::Type{GrayScott}) = cuNumeric.SUPPORTED_FLOAT_TYPES +total_flops(b::GrayScott) = b.N * b.M # grid points updated per step + +mutable struct GrayScottState{A,P} + u::A + v::A + u_new::A + v_new::A + params::P +end + +function initialize(b::GrayScott{T}) where {T} + d = (b.N, b.M) + u = 
cuNumeric.ones(T, d) + v = cuNumeric.zeros(T, d) + u_new = cuNumeric.zeros(T, d) + v_new = cuNumeric.zeros(T, d) + + seed = min(150, b.N, b.M) + u[1:seed, 1:seed] = cuNumeric.rand(T, (seed, seed)) + v[1:seed, 1:seed] = cuNumeric.rand(T, (seed, seed)) + + return (GrayScottState(u, v, u_new, v_new, GSParams{T}()),) +end + +function _gs_step!(u, v, u_new, v_new, args::GSParams) + # currently we don't have NDArray^x working yet. + F_u = ( + ( + -u[2:(end - 1), 2:(end - 1)] .* + (v[2:(end - 1), 2:(end - 1)] .* v[2:(end - 1), 2:(end - 1)]) + ) + args.f * (1 .- u[2:(end - 1), 2:(end - 1)]) + ) + F_v = ( + ( + u[2:(end - 1), 2:(end - 1)] .* + (v[2:(end - 1), 2:(end - 1)] .* v[2:(end - 1), 2:(end - 1)]) + ) - (args.f + args.k) * v[2:(end - 1), 2:(end - 1)] + ) + # 2-D Laplacian via slicing, excluding boundaries + u_lap = ( + ( + u[3:end, 2:(end - 1)] - 2 * u[2:(end - 1), 2:(end - 1)] + + u[1:(end - 2), 2:(end - 1)] + ) ./ args.dx^2 + + ( + u[2:(end - 1), 3:end] - 2 * u[2:(end - 1), 2:(end - 1)] + + u[2:(end - 1), 1:(end - 2)] + ) ./ args.dx^2 + ) + v_lap = ( + ( + v[3:end, 2:(end - 1)] - 2 * v[2:(end - 1), 2:(end - 1)] + + v[1:(end - 2), 2:(end - 1)] + ) ./ args.dx^2 + + ( + v[2:(end - 1), 3:end] - 2 * v[2:(end - 1), 2:(end - 1)] + + v[2:(end - 1), 1:(end - 2)] + ) ./ args.dx^2 + ) + + # Forward-Euler step for all interior points + u_new[2:(end - 1), 2:(end - 1)] = + ((args.c_u * u_lap) + F_u) * args.dt + u[2:(end - 1), 2:(end - 1)] + v_new[2:(end - 1), 2:(end - 1)] = + ((args.c_v * v_lap) + F_v) * args.dt + v[2:(end - 1), 2:(end - 1)] + + # Periodic boundary conditions + u_new[:, 1] = u[:, end - 1] + u_new[:, end] = u[:, 2] + u_new[1, :] = u[end - 1, :] + u_new[end, :] = u[2, :] + v_new[:, 1] = v[:, end - 1] + v_new[:, end] = v[:, 2] + v_new[1, :] = v[end - 1, :] + v_new[end, :] = v[2, :] +end + +function run!(::GrayScott, st::GrayScottState) + _gs_step!(st.u, st.v, st.u_new, st.v_new, st.params) + # swap references rather than copy + st.u, st.u_new = st.u_new, st.u + 
st.v, st.v_new = st.v_new, st.v + return nothing +end + +register_benchmark("grayscott", GrayScott) diff --git a/benchmark/src/benchmarks/montecarlo.jl b/benchmark/src/benchmarks/montecarlo.jl new file mode 100644 index 00000000..ecbda4f9 --- /dev/null +++ b/benchmark/src/benchmarks/montecarlo.jl @@ -0,0 +1,31 @@ +Base.@kwdef struct MonteCarloIntegration{T} <: AbstractBenchmark{T} + n_samples::Int +end + +name(::MonteCarloIntegration) = "montecarlo" +dims(mci::MonteCarloIntegration) = (mci.n_samples, 1) +function data(mci::MonteCarloIntegration{T}) where {T} + "Monte Carlo Integration with T=$(T), n_samples=$(mci.n_samples)" +end + +allowed_types(::Type{MonteCarloIntegration}) = cuNumeric.SUPPORTED_FLOAT_TYPES + +total_space(s::MonteCarloIntegration{T}) where {T} = s.n_samples * sizeof(T) +total_flops(s::MonteCarloIntegration) = s.n_samples + +function initialize(mci::MonteCarloIntegration{T}) where {T} + # Uniform samples over the integration domain [0, 10]. + x = T(10) .* cuNumeric.rand(T, mci.n_samples) + GC.gc() + return (x,) +end + +_domain_volume(mci::MonteCarloIntegration{T}) where {T} = T(10) / mci.n_samples +run!(mci::MonteCarloIntegration, x) = _domain_volume(mci) * sum(exp.(-x .^ 2)) + +# n_samples comes in as N; M is unused. +function build_benchmark(::Type{MonteCarloIntegration}, ::Type{T}, N, M) where {T} + MonteCarloIntegration{T}(; n_samples=N) +end + +register_benchmark("montecarlo", MonteCarloIntegration) diff --git a/benchmark/src/core.jl b/benchmark/src/core.jl new file mode 100644 index 00000000..63f6f898 --- /dev/null +++ b/benchmark/src/core.jl @@ -0,0 +1,98 @@ +""" +- `n_warmup::Int` : Number of warmup steps. These are not timed. Intended + to avoid pre-compilation cost being timed. +- `n_iter::Int` : Number of iterations to run per trial. Should be large enough + to build up queue depth of tasks such that latency is hidden. +- `n_trial::Int` : Number of independent trials to run. Timing is restarted and + legate in between each trial. 
Sets number of datapoints used to estimated + standard deviations/errors. +- `n_gpu::Int` : The number of GPUs used by legate. Set through the LEGATE_CONFIG, + this value is just bookkeeping. +""" +Base.@kwdef struct GlobalSettings + n_warmup::Int # Number of warmup steps, where timing is not done. + n_iter::Int # Number of iterations to run per trial + n_trial::Int = 1 # Number of independent trials to run. Benchmark + n_gpu::Int = 0 +end + +######################################### + +abstract type AbstractBenchmark{T} end + +# Interface each benchmark implements (see benchmarks/gemm.jl for a template). +function name end +function dims end +function data end +function allowed_types end +function total_flops end +function initialize end +function run! end + +# Maps a benchmarks.toml table name to its benchmark type. Each benchmark file +# registers itself via `register_benchmark`. +const BENCHMARKS = Dict{String,Type}() +function register_benchmark(key::AbstractString, ::Type{B}) where {B<:AbstractBenchmark} + BENCHMARKS[key] = B +end + +# Construct a benchmark from the orchestrator's positional sizes. Most benchmarks +# use (N, M); a benchmark with different arity overrides this (see montecarlo.jl). +build_benchmark(::Type{B}, ::Type{T}, N, M) where {B<:AbstractBenchmark,T} = B{T}(; N=N, M=M) + +######################################### + +# Per-trial timings for one benchmark. `times_ms[i]`/`gflops[i]` are the mean +# over `n_iter` iterations for trial `i`; the spread across trials gives stddev. +struct BenchmarkResult{B<:AbstractBenchmark} + times_ms::Vector{Float64} + gflops::Vector{Float64} + benchmark::B +end + +# One timed trial: warmup, then time `n_iter` iterations of `run!`. +function _trial(b::AbstractBenchmark, gs::GlobalSettings) + GC.gc(true) + state = initialize(b) + + start_time = zero(get_time_microseconds()) + for idx in 1:(gs.n_warmup + gs.n_iter) + if idx == gs.n_warmup + 1 + start_time = get_time_microseconds() + end + run!(b, state...) 
+ end + total_time_μs = get_time_microseconds() - start_time + + mean_time_ms = total_time_μs / (gs.n_iter * 1e3) + gflops = total_flops(b) / (mean_time_ms * 1e6) + return mean_time_ms, gflops +end + +# Run `n_trial` independent trials and collect their per-trial measurements. +function run_benchmark(b::AbstractBenchmark, gs::GlobalSettings) + times_ms = Float64[] + gflops = Float64[] + for _ in 1:gs.n_trial + t, g = _trial(b, gs) + push!(times_ms, t) + push!(gflops, g) + end + return BenchmarkResult(times_ms, gflops, b) +end + +_std(x) = length(x) > 1 ? std(x) : 0.0 + +function save_result(br::BenchmarkResult, gpus) + N, M = dims(br.benchmark) + path = joinpath(@__DIR__, "..", "results", "$(name(br.benchmark)).csv") + mkpath(dirname(path)) + open(path, "a") do io + for trial in eachindex(br.times_ms) + @printf( + io, "%s,%d,%d,%d,%d,%.6f,%.6f\n", + "cunumeric", gpus, N, M, trial, br.times_ms[trial], br.gflops[trial], + ) + end + end +end From 871b5a560f30763831feed1a41f48a9c92adb140 Mon Sep 17 00:00:00 2001 From: krasow Date: Wed, 3 Jun 2026 00:03:54 -0500 Subject: [PATCH 09/17] add notion of variants --- benchmark/README.md | 33 ++++++- benchmark/benchmarks.toml | 1 + benchmark/run.jl | 13 +-- benchmark/src/benchmarks/grayscott.jl | 119 ++++++++++++++----------- benchmark/src/benchmarks/montecarlo.jl | 4 +- benchmark/src/core.jl | 41 +++++++-- benchmark/src/parse_benchmarks.jl | 10 ++- benchmark/src/single.jl | 16 ++-- 8 files changed, 161 insertions(+), 76 deletions(-) diff --git a/benchmark/README.md b/benchmark/README.md index 658ce3ca..cf2a248b 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -22,9 +22,11 @@ Repeat a `[[name]]` block to add independent configs. ## Lists -Any of `T`, `gpus`, `cpus`, `N`, `M` may be a list. They expand along two axes: +Any of `T`, `variants`, `gpus`, `cpus`, `N`, `M` may be a list. They expand along +two axes: -- **`T` multiplies.** The whole sweep runs once per type. 
+- **`T` and `variants` multiply.** The whole sweep runs once per type and once + per variant. - **`gpus`, `cpus`, `N`, `M` zip** into a single lockstep sweep — element `i` of each is paired together. @@ -51,3 +53,30 @@ M = [150, 300, 600] # When `T = ["Float32", "Float64"]` and a length-2 `N`/`M` sweep you get all **4** combinations, not a paired `Float32 -> N[1], Float64 -> N[2]`. To pin a type to a specific size, use separate `[[name]]` blocks. + +## Variants + +A variant is a named way of running a benchmark. List them per entry with +`variants = [...]` (defaults to `["baseline"]`); they multiply like `T`, and the +chosen variant is recorded as a column in the results CSV so runs can be compared. + +```toml +[[grayscott]] +T = "Float64" +N = 1024 +M = [1024, 2048, 4096] +gpus = [1, 2, 4] +cpus = 2 +variants = ["baseline", "lifetimes"] # 2 variants * 3 sweep points = 6 runs +``` + +There are two kinds, both flowing through the same `variant` string: + +- **Code-path variants** change what the worker runs. The benchmark's `run!` + dispatches on the variant. Example: grayscott's `lifetimes` wraps the step in + `@analyze_lifetimes` (see `src/benchmarks/grayscott.jl`). A benchmark that + doesn't recognize a variant just runs its baseline path. +- **Process-level variants** flip a runtime setting before the run via a setup + thunk registered in `register_variant` (`src/core.jl`). The worker calls it at + startup. Broadcast fusion will plug in here once it lands, e.g. + `register_variant("fusion_off", cuNumeric.CNPreferences.disable_broadcast_fusion!)`. diff --git a/benchmark/benchmarks.toml b/benchmark/benchmarks.toml index 16def1ef..a3eb531d 100644 --- a/benchmark/benchmarks.toml +++ b/benchmark/benchmarks.toml @@ -18,6 +18,7 @@ gpus = [1, 2, 4, 8] cpus = 2 N = 1024 M = [1024, 2048, 4096, 8192] +variants = ["baseline", "lifetimes"] # Monte Carlo: work ~ N. Scale N linearly. 
[[montecarlo]] diff --git a/benchmark/run.jl b/benchmark/run.jl index 0d22ad0d..167d35f5 100644 --- a/benchmark/run.jl +++ b/benchmark/run.jl @@ -2,8 +2,9 @@ # dispatches it; the script sets LEGATE_CONFIG (from --gpus/--cpus) before # launching the worker (single.jl) that actually runs the benchmark. # no args -> one command per benchmarks.toml entry -# with args -> one command from +# with args -> one command from [variant] +using cuNumeric include("src/benchmarks.jl") include("src/parse_benchmarks.jl") @@ -12,18 +13,18 @@ const WORKER = joinpath(@__DIR__, "src/single.jl") banner(msg) = println("\n", "="^128, "\n", msg, "\n", "="^128) -function dispatch(; gpus, cpus, name, T, N, M, n_iter, n_warmup, n_trial) +function dispatch(; gpus, cpus, name, T, variant, N, M, n_iter, n_warmup, n_trial) if !haskey(BENCHMARKS, name) @warn "No benchmark registered for '$(name)'; skipping." return nothing end banner( - "$(name): T=$(T) gpus=$(gpus) cpus=$(cpus) N=$(N) M=$(M) " * + "$(name) [$(variant)]: T=$(T) gpus=$(gpus) cpus=$(cpus) N=$(N) M=$(M) " * "n_iter=$(n_iter) n_warmup=$(n_warmup) n_trial=$(n_trial)", ) - cmd = `bash $RUNNER $WORKER --gpus $gpus --cpus $cpus $name $T $N $M $n_iter $n_warmup $n_trial` + cmd = `bash $RUNNER $WORKER --gpus $gpus --cpus $cpus $name $T $N $M $n_iter $n_warmup $n_trial $variant` try run(cmd) catch e @@ -36,7 +37,8 @@ function run_all_benchmarks(config="benchmarks.toml") for spec in specs N, M = spec.args dispatch(; - gpus=spec.gpus, cpus=spec.cpus, name=spec.name, T=spec.T, N=N, M=M, + gpus=spec.gpus, cpus=spec.cpus, name=spec.name, T=spec.T, + variant=spec.variant, N=N, M=M, n_iter=gs.n_iter, n_warmup=gs.n_warmup, n_trial=gs.n_trial, ) end @@ -49,5 +51,6 @@ else # dispatch on args gpus=parse(Int, ARGS[1]), cpus=parse(Int, ARGS[2]), name=ARGS[3], T=ARGS[4], N=parse(Int, ARGS[5]), M=parse(Int, ARGS[6]), n_iter=parse(Int, ARGS[7]), n_warmup=parse(Int, ARGS[8]), n_trial=parse(Int, ARGS[9]), + variant=(length(ARGS) >= 10 ? 
ARGS[10] : "baseline"), ) end diff --git a/benchmark/src/benchmarks/grayscott.jl b/benchmark/src/benchmarks/grayscott.jl index 89b763bf..0a7eb02d 100644 --- a/benchmark/src/benchmarks/grayscott.jl +++ b/benchmark/src/benchmarks/grayscott.jl @@ -14,6 +14,7 @@ end Base.@kwdef struct GrayScott{T} <: AbstractBenchmark{T} N::Int M::Int + variant::Symbol = :baseline end name(::GrayScott) = "grayscott" @@ -22,6 +23,10 @@ data(b::GrayScott{T}) where {T} = "GrayScott with T=$(T), N=$(b.N), M=$(b.M)" allowed_types(::Type{GrayScott}) = cuNumeric.SUPPORTED_FLOAT_TYPES total_flops(b::GrayScott) = b.N * b.M # grid points updated per step +function build_benchmark(::Type{GrayScott}, ::Type{T}, N, M, variant) where {T} + GrayScott{T}(; N=N, M=M, variant=Symbol(variant)) +end + mutable struct GrayScottState{A,P} u::A v::A @@ -44,65 +49,77 @@ function initialize(b::GrayScott{T}) where {T} return (GrayScottState(u, v, u_new, v_new, GSParams{T}()),) end -function _gs_step!(u, v, u_new, v_new, args::GSParams) - # currently we don't have NDArray^x working yet. 
- F_u = ( - ( - -u[2:(end - 1), 2:(end - 1)] .* - (v[2:(end - 1), 2:(end - 1)] .* v[2:(end - 1), 2:(end - 1)]) - ) + args.f * (1 .- u[2:(end - 1), 2:(end - 1)]) - ) - F_v = ( - ( - u[2:(end - 1), 2:(end - 1)] .* - (v[2:(end - 1), 2:(end - 1)] .* v[2:(end - 1), 2:(end - 1)]) - ) - (args.f + args.k) * v[2:(end - 1), 2:(end - 1)] - ) - # 2-D Laplacian via slicing, excluding boundaries - u_lap = ( - ( - u[3:end, 2:(end - 1)] - 2 * u[2:(end - 1), 2:(end - 1)] + - u[1:(end - 2), 2:(end - 1)] - ) ./ args.dx^2 + - ( - u[2:(end - 1), 3:end] - 2 * u[2:(end - 1), 2:(end - 1)] + - u[2:(end - 1), 1:(end - 2)] - ) ./ args.dx^2 - ) - v_lap = ( - ( - v[3:end, 2:(end - 1)] - 2 * v[2:(end - 1), 2:(end - 1)] + - v[1:(end - 2), 2:(end - 1)] - ) ./ args.dx^2 + - ( - v[2:(end - 1), 3:end] - 2 * v[2:(end - 1), 2:(end - 1)] + - v[2:(end - 1), 1:(end - 2)] - ) ./ args.dx^2 - ) +# VARIANT DESCRIPTION +# baseline: as written +# lifetimes: step wrapped in @analyze_lifetimes +let body = quote + # currently we don't have NDArray^x working yet. 
+ F_u = ( + ( + -u[2:(end - 1), 2:(end - 1)] .* + (v[2:(end - 1), 2:(end - 1)] .* v[2:(end - 1), 2:(end - 1)]) + ) + args.f * (1 .- u[2:(end - 1), 2:(end - 1)]) + ) + F_v = ( + ( + u[2:(end - 1), 2:(end - 1)] .* + (v[2:(end - 1), 2:(end - 1)] .* v[2:(end - 1), 2:(end - 1)]) + ) - (args.f + args.k) * v[2:(end - 1), 2:(end - 1)] + ) + # 2-D Laplacian via slicing, excluding boundaries + u_lap = ( + ( + u[3:end, 2:(end - 1)] - 2 * u[2:(end - 1), 2:(end - 1)] + + u[1:(end - 2), 2:(end - 1)] + ) ./ args.dx^2 + + ( + u[2:(end - 1), 3:end] - 2 * u[2:(end - 1), 2:(end - 1)] + + u[2:(end - 1), 1:(end - 2)] + ) ./ args.dx^2 + ) + v_lap = ( + ( + v[3:end, 2:(end - 1)] - 2 * v[2:(end - 1), 2:(end - 1)] + + v[1:(end - 2), 2:(end - 1)] + ) ./ args.dx^2 + + ( + v[2:(end - 1), 3:end] - 2 * v[2:(end - 1), 2:(end - 1)] + + v[2:(end - 1), 1:(end - 2)] + ) ./ args.dx^2 + ) - # Forward-Euler step for all interior points - u_new[2:(end - 1), 2:(end - 1)] = - ((args.c_u * u_lap) + F_u) * args.dt + u[2:(end - 1), 2:(end - 1)] - v_new[2:(end - 1), 2:(end - 1)] = - ((args.c_v * v_lap) + F_v) * args.dt + v[2:(end - 1), 2:(end - 1)] + # Forward-Euler step for all interior points + u_new[2:(end - 1), 2:(end - 1)] = + ((args.c_u * u_lap) + F_u) * args.dt + u[2:(end - 1), 2:(end - 1)] + v_new[2:(end - 1), 2:(end - 1)] = + ((args.c_v * v_lap) + F_v) * args.dt + v[2:(end - 1), 2:(end - 1)] + + # Periodic boundary conditions + u_new[:, 1] = u[:, end - 1] + u_new[:, end] = u[:, 2] + u_new[1, :] = u[end - 1, :] + u_new[end, :] = u[2, :] + v_new[:, 1] = v[:, end - 1] + v_new[:, end] = v[:, 2] + v_new[1, :] = v[end - 1, :] + v_new[end, :] = v[2, :] + end + @eval _gs_step!(::Val{:baseline}, u, v, u_new, v_new, args::GSParams) = $body + @eval _gs_step!(::Val{:lifetimes}, u, v, u_new, v_new, args::GSParams) = @analyze_lifetimes $body +end - # Periodic boundary conditions - u_new[:, 1] = u[:, end - 1] - u_new[:, end] = u[:, 2] - u_new[1, :] = u[end - 1, :] - u_new[end, :] = u[2, :] - v_new[:, 1] = v[:, end 
- 1] - v_new[:, end] = v[:, 2] - v_new[1, :] = v[end - 1, :] - v_new[end, :] = v[2, :] +# Variants not special-cased (e.g. testing fusion) run the baseline path. +function _gs_step!(::Val, u, v, u_new, v_new, args::GSParams) + _gs_step!(Val(:baseline), u, v, u_new, v_new, args) end -function run!(::GrayScott, st::GrayScottState) - _gs_step!(st.u, st.v, st.u_new, st.v_new, st.params) +function run!(b::GrayScott, st::GrayScottState) + _gs_step!(Val(b.variant), st.u, st.v, st.u_new, st.v_new, st.params) # swap references rather than copy st.u, st.u_new = st.u_new, st.u st.v, st.v_new = st.v_new, st.v return nothing end +register_variant("lifetimes") register_benchmark("grayscott", GrayScott) diff --git a/benchmark/src/benchmarks/montecarlo.jl b/benchmark/src/benchmarks/montecarlo.jl index ecbda4f9..0b5175f6 100644 --- a/benchmark/src/benchmarks/montecarlo.jl +++ b/benchmark/src/benchmarks/montecarlo.jl @@ -23,8 +23,8 @@ end _domain_volume(mci::MonteCarloIntegration{T}) where {T} = T(10) / mci.n_samples run!(mci::MonteCarloIntegration, x) = _domain_volume(mci) * sum(exp.(-x .^ 2)) -# n_samples comes in as N; M is unused. -function build_benchmark(::Type{MonteCarloIntegration}, ::Type{T}, N, M) where {T} +# n_samples comes in as N; M and variant are unused. +function build_benchmark(::Type{MonteCarloIntegration}, ::Type{T}, N, M, variant) where {T} MonteCarloIntegration{T}(; n_samples=N) end diff --git a/benchmark/src/core.jl b/benchmark/src/core.jl index 63f6f898..e1759ca0 100644 --- a/benchmark/src/core.jl +++ b/benchmark/src/core.jl @@ -36,9 +36,37 @@ function register_benchmark(key::AbstractString, ::Type{B}) where {B<:AbstractBe BENCHMARKS[key] = B end -# Construct a benchmark from the orchestrator's positional sizes. Most benchmarks -# use (N, M); a benchmark with different arity overrides this (see montecarlo.jl). 
-build_benchmark(::Type{B}, ::Type{T}, N, M) where {B<:AbstractBenchmark,T} = B{T}(; N=N, M=M) +# Default uses (N, M); benchmarks with a code-path variant or different arity +# override this (see grayscott.jl / montecarlo.jl). +function build_benchmark(::Type{B}, ::Type{T}, N, M, variant) where {B<:AbstractBenchmark,T} + B{T}(; N=N, M=M) +end + +######################################### + +# `setup` runs in the worker before the benchmark is built (e.g. flip a runtime +# preference); code-path variants leave it a no-op. +struct Variant + name::String + setup::Function +end + +const VARIANTS = Dict{String,Variant}() + +function register_variant(name, setup=() -> nothing) + VARIANTS[name] = Variant(name, setup) +end + +function variant_setup(name) + if haskey(VARIANTS, name) + return VARIANTS[name].setup + end + return () -> nothing +end + +register_variant("baseline") +# register_variant("fusion_off", cuNumeric.CNPreferences.disable_broadcast_fusion!) +# register_variant("fusion_on", cuNumeric.CNPreferences.enable_broadcast_fusion!) ######################################### @@ -83,15 +111,16 @@ end _std(x) = length(x) > 1 ? std(x) : 0.0 -function save_result(br::BenchmarkResult, gpus) +function save_result(br::BenchmarkResult, gpus, variant) N, M = dims(br.benchmark) path = joinpath(@__DIR__, "..", "results", "$(name(br.benchmark)).csv") mkpath(dirname(path)) open(path, "a") do io for trial in eachindex(br.times_ms) @printf( - io, "%s,%d,%d,%d,%d,%.6f,%.6f\n", - "cunumeric", gpus, N, M, trial, br.times_ms[trial], br.gflops[trial], + io, "%s,%s,%d,%d,%d,%d,%.6f,%.6f\n", + "cunumeric", variant, gpus, N, M, trial, + br.times_ms[trial], br.gflops[trial], ) end end diff --git a/benchmark/src/parse_benchmarks.jl b/benchmark/src/parse_benchmarks.jl index c44518df..7144097c 100644 --- a/benchmark/src/parse_benchmarks.jl +++ b/benchmark/src/parse_benchmarks.jl @@ -3,11 +3,13 @@ using TOML """ One benchmark invocation parsed from `benchmarks.toml`. 
`name` selects the benchmark type from `BENCHMARKS`; `T` is the element type (e.g. "Float32"); -`args` are the sizes (currently `N M`). +`variant` names the run variant (e.g. "baseline", "lifetimes"); `args` are the +sizes (currently `N M`). """ struct BenchmarkSpec name::String T::String + variant::String gpus::Int cpus::Int args::Vector{Int} @@ -44,6 +46,7 @@ function parse_config(path) name == "Global" && continue for e in entries types = aslist(get(e, "T", "Float32")) + variants = aslist(get(e, "variants", "baseline")) gpus = aslist(e["gpus"]) cpus = aslist(e["cpus"]) N = aslist(e["N"]) @@ -51,11 +54,12 @@ function parse_config(path) n = sweep_length(name, ["gpus" => gpus, "cpus" => cpus, "N" => N, "M" => M]) - for T in types, i in 1:n + # `T` and `variants` multiply; gpus/cpus/N/M zip into the sweep. + for T in types, variant in variants, i in 1:n push!( specs, BenchmarkSpec( - name, T, sweep_value(gpus, i), sweep_value(cpus, i), + name, T, variant, sweep_value(gpus, i), sweep_value(cpus, i), [sweep_value(N, i), sweep_value(M, i)], ), ) diff --git a/benchmark/src/single.jl b/benchmark/src/single.jl index e86419bf..8e6e85ee 100644 --- a/benchmark/src/single.jl +++ b/benchmark/src/single.jl @@ -1,6 +1,6 @@ # single.jl: worker that runs exactly one benchmark. Launched by run_benchmark.sh # (dispatched from run.jl), which sets LEGATE_CONFIG in the env before julia starts. -# Args: +# Args: using cuNumeric using LinearAlgebra @@ -10,20 +10,21 @@ include("benchmarks.jl") # Resolve a TOML type string like "Float32" to the actual Julia type. parse_type(s) = getfield(Base, Symbol(s))::DataType -function run_single(gpus, name, T_str, N, M, n_iter, n_warmup, n_trial) +function run_single(gpus, name, T_str, N, M, n_iter, n_warmup, n_trial, variant) T = parse_type(T_str) - b = build_benchmark(BENCHMARKS[name], T, N, M) + variant_setup(variant)() # apply any pre-run setup (e.g. 
flip a runtime preference) + b = build_benchmark(BENCHMARKS[name], T, N, M, variant) gs = GlobalSettings(; n_warmup=n_warmup, n_iter=n_iter, n_trial=n_trial) println( - "[cuNumeric] $(name) benchmark ($(T)) on $(N)x$(M) for $(n_iter) iterations " * - "($(n_warmup) warmup) x $(n_trial) trials", + "[cuNumeric] $(name) [$(variant)] benchmark ($(T)) on $(N)x$(M) for $(n_iter) " * + "iterations ($(n_warmup) warmup) x $(n_trial) trials", ) br = run_benchmark(b, gs) @printf("[cuNumeric] Mean Run Time: %.5f ± %.5f ms\n", mean(br.times_ms), _std(br.times_ms)) @printf("[cuNumeric] FLOPS: %.5f ± %.5f GFLOPS\n", mean(br.gflops), _std(br.gflops)) - save_result(br, gpus) + save_result(br, gpus, variant) end gpus = parse(Int, ARGS[1]) @@ -34,4 +35,5 @@ M = parse(Int, ARGS[5]) n_iter = parse(Int, ARGS[6]) n_warmup = parse(Int, ARGS[7]) n_trial = parse(Int, ARGS[8]) -run_single(gpus, bench_name, T_str, N, M, n_iter, n_warmup, n_trial) +variant = ARGS[9] +run_single(gpus, bench_name, T_str, N, M, n_iter, n_warmup, n_trial, variant) From 83b54674add8e0d722527eb4898dcad80985cdc2 Mon Sep 17 00:00:00 2001 From: krasow Date: Wed, 3 Jun 2026 00:25:26 -0500 Subject: [PATCH 10/17] adjust includes and using cuNumeric. --- benchmark/run.jl | 11 ++++------- benchmark/src/benchmarks.jl | 3 --- benchmark/src/core.jl | 3 +++ benchmark/src/single.jl | 3 +++ 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/benchmark/run.jl b/benchmark/run.jl index 167d35f5..039306a7 100644 --- a/benchmark/run.jl +++ b/benchmark/run.jl @@ -4,8 +4,9 @@ # no args -> one command per benchmarks.toml entry # with args -> one command from [variant] -using cuNumeric -include("src/benchmarks.jl") +# Orchestrator stays off the GPU: it only needs GlobalSettings + parse_config, +# both cuNumeric-free. The worker (single.jl) loads cuNumeric and the kernels. 
+include("src/core.jl") include("src/parse_benchmarks.jl") const RUNNER = joinpath(@__DIR__, "run_benchmark.sh") @@ -14,11 +15,7 @@ const WORKER = joinpath(@__DIR__, "src/single.jl") banner(msg) = println("\n", "="^128, "\n", msg, "\n", "="^128) function dispatch(; gpus, cpus, name, T, variant, N, M, n_iter, n_warmup, n_trial) - if !haskey(BENCHMARKS, name) - @warn "No benchmark registered for '$(name)'; skipping." - return nothing - end - + # Name validity is checked in the worker (single.jl), which owns the registry. banner( "$(name) [$(variant)]: T=$(T) gpus=$(gpus) cpus=$(cpus) N=$(N) M=$(M) " * "n_iter=$(n_iter) n_warmup=$(n_warmup) n_trial=$(n_trial)", diff --git a/benchmark/src/benchmarks.jl b/benchmark/src/benchmarks.jl index 4072b959..794068fa 100644 --- a/benchmark/src/benchmarks.jl +++ b/benchmark/src/benchmarks.jl @@ -1,6 +1,3 @@ -using Printf -using Statistics - # Adding a benchmark is: drop a file in benchmarks/ and include it below. include("core.jl") include("benchmarks/gemm.jl") diff --git a/benchmark/src/core.jl b/benchmark/src/core.jl index e1759ca0..b87c9fd0 100644 --- a/benchmark/src/core.jl +++ b/benchmark/src/core.jl @@ -1,3 +1,6 @@ +using Printf +using Statistics + """ - `n_warmup::Int` : Number of warmup steps. These are not timed. Intended to avoid pre-compilation cost being timed. diff --git a/benchmark/src/single.jl b/benchmark/src/single.jl index 8e6e85ee..86b09f78 100644 --- a/benchmark/src/single.jl +++ b/benchmark/src/single.jl @@ -11,6 +11,9 @@ include("benchmarks.jl") parse_type(s) = getfield(Base, Symbol(s))::DataType function run_single(gpus, name, T_str, N, M, n_iter, n_warmup, n_trial, variant) + haskey(BENCHMARKS, name) || error( + "No benchmark registered for '$(name)'. Known: $(join(sort(collect(keys(BENCHMARKS))), ", "))" + ) T = parse_type(T_str) variant_setup(variant)() # apply any pre-run setup (e.g. 
flip a runtime preference) b = build_benchmark(BENCHMARKS[name], T, N, M, variant) From 63cdf9bb5e63696019d31c9603a87641b5be8931 Mon Sep 17 00:00:00 2001 From: ejmeitz Date: Wed, 3 Jun 2026 11:04:03 -0500 Subject: [PATCH 11/17] remove variants --- benchmark/Project.toml | 1 + benchmark/README.md | 2 +- benchmark/benchmarks.toml | 30 +++++++++++--- benchmark/run.jl | 32 +++++++++------ benchmark/src/benchmarks.jl | 5 --- benchmark/src/benchmarks/gemm.jl | 4 +- benchmark/src/benchmarks/grayscott.jl | 43 ++++++++++---------- benchmark/src/core.jl | 57 +++++++++++++-------------- benchmark/src/parse_benchmarks.jl | 12 +++--- benchmark/src/single.jl | 16 ++++---- 10 files changed, 112 insertions(+), 90 deletions(-) delete mode 100644 benchmark/src/benchmarks.jl diff --git a/benchmark/Project.toml b/benchmark/Project.toml index e6583a71..71a488f9 100644 --- a/benchmark/Project.toml +++ b/benchmark/Project.toml @@ -1,6 +1,7 @@ [deps] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +CUDACore = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" diff --git a/benchmark/README.md b/benchmark/README.md index cf2a248b..762bffc6 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -10,7 +10,7 @@ n_warmup = 5 n_iter = 1000 n_trial = 5 -[[sgemm]] # name registered in src/benchmarks.jl +[[gemm]] # name registered in src/benchmarks.jl T = "Float32" # element type gpus = 1 cpus = 2 diff --git a/benchmark/benchmarks.toml b/benchmark/benchmarks.toml index a3eb531d..855a3b8d 100644 --- a/benchmark/benchmarks.toml +++ b/benchmark/benchmarks.toml @@ -3,24 +3,42 @@ n_warmup = 5 n_iter = 1000 n_trial = 5 -# GEMM: work ~ 2*N^2*M. Hold N, scale M. -[[sgemm]] +#################################### +# GEMM # +# Work ~ 2*N^2*M. Hold N, scale M. 
# +#################################### + +[[gemm]] T = ["Float32", "Float64"] gpus = [1, 2, 4, 8] cpus = 2 N = 4096 M = [4096, 8192, 16384, 32768] -# Gray-Scott: work ~ N*M. Hold N, scale M. -[[grayscott]] +################################# +# Gray-Scott # +# Work ~ N*M. Hold N, scale M. # +################################# + +[[grayscott_baseline]] T = "Float32" gpus = [1, 2, 4, 8] cpus = 2 N = 1024 M = [1024, 2048, 4096, 8192] -variants = ["baseline", "lifetimes"] -# Monte Carlo: work ~ N. Scale N linearly. +[[grayscott_lifetimes]] +T = "Float32" +gpus = [1, 2, 4, 8] +cpus = 2 +N = 1024 +M = [1024, 2048, 4096, 8192] + +################################# +# Monte-Carlo Integration # +# Work ~ N. Scale N linearly # +################################# + [[montecarlo]] T = "Float32" gpus = [1, 2, 4, 8] diff --git a/benchmark/run.jl b/benchmark/run.jl index 039306a7..89d6d28b 100644 --- a/benchmark/run.jl +++ b/benchmark/run.jl @@ -2,7 +2,7 @@ # dispatches it; the script sets LEGATE_CONFIG (from --gpus/--cpus) before # launching the worker (single.jl) that actually runs the benchmark. # no args -> one command per benchmarks.toml entry -# with args -> one command from [variant] +# with args -> one command from # Orchestrator stays off the GPU: it only needs GlobalSettings + parse_config, # both cuNumeric-free. The worker (single.jl) loads cuNumeric and the kernels. @@ -14,14 +14,14 @@ const WORKER = joinpath(@__DIR__, "src/single.jl") banner(msg) = println("\n", "="^128, "\n", msg, "\n", "="^128) -function dispatch(; gpus, cpus, name, T, variant, N, M, n_iter, n_warmup, n_trial) +function dispatch(; gpus, cpus, name, T, N, M, n_iter, n_warmup, n_trial) # Name validity is checked in the worker (single.jl), which owns the registry. 
banner( - "$(name) [$(variant)]: T=$(T) gpus=$(gpus) cpus=$(cpus) N=$(N) M=$(M) " * + "$(name): T=$(T) gpus=$(gpus) cpus=$(cpus) N=$(N) M=$(M) " * "n_iter=$(n_iter) n_warmup=$(n_warmup) n_trial=$(n_trial)", ) - cmd = `bash $RUNNER $WORKER --gpus $gpus --cpus $cpus $name $T $N $M $n_iter $n_warmup $n_trial $variant` + cmd = `bash $RUNNER $WORKER --gpus $gpus --cpus $cpus $name $T $N $M $n_iter $n_warmup $n_trial` try run(cmd) catch e @@ -34,9 +34,14 @@ function run_all_benchmarks(config="benchmarks.toml") for spec in specs N, M = spec.args dispatch(; - gpus=spec.gpus, cpus=spec.cpus, name=spec.name, T=spec.T, - variant=spec.variant, N=N, M=M, - n_iter=gs.n_iter, n_warmup=gs.n_warmup, n_trial=gs.n_trial, + gpus=spec.gpus, + cpus=spec.cpus, + name=spec.name, + T=spec.T, + N=N, M=M, + n_iter=gs.n_iter, + n_warmup=gs.n_warmup, + n_trial=gs.n_trial, ) end end @@ -45,9 +50,14 @@ if isempty(ARGS) run_all_benchmarks() else # dispatch on args dispatch(; - gpus=parse(Int, ARGS[1]), cpus=parse(Int, ARGS[2]), name=ARGS[3], T=ARGS[4], - N=parse(Int, ARGS[5]), M=parse(Int, ARGS[6]), - n_iter=parse(Int, ARGS[7]), n_warmup=parse(Int, ARGS[8]), n_trial=parse(Int, ARGS[9]), - variant=(length(ARGS) >= 10 ? ARGS[10] : "baseline"), + gpus=parse(Int, ARGS[1]), + cpus=parse(Int, ARGS[2]), + name=ARGS[3], + T=ARGS[4], + N=parse(Int, ARGS[5]), + M=parse(Int, ARGS[6]), + n_iter=parse(Int, ARGS[7]), + n_warmup=parse(Int, ARGS[8]), + n_trial=parse(Int, ARGS[9]), ) end diff --git a/benchmark/src/benchmarks.jl b/benchmark/src/benchmarks.jl deleted file mode 100644 index 794068fa..00000000 --- a/benchmark/src/benchmarks.jl +++ /dev/null @@ -1,5 +0,0 @@ -# Adding a benchmark is: drop a file in benchmarks/ and include it below. 
-include("core.jl") -include("benchmarks/gemm.jl") -include("benchmarks/grayscott.jl") -include("benchmarks/montecarlo.jl") diff --git a/benchmark/src/benchmarks/gemm.jl b/benchmark/src/benchmarks/gemm.jl index 09ff0c57..e4939df8 100644 --- a/benchmark/src/benchmarks/gemm.jl +++ b/benchmark/src/benchmarks/gemm.jl @@ -3,7 +3,7 @@ Base.@kwdef struct GEMM{T} <: AbstractBenchmark{T} M::Int end -name(::GEMM) = "sgemm" +name(::GEMM) = "gemm" dims(g::GEMM) = (g.N, g.M) data(g::GEMM{T}) where {T} = "GEMM with T=$(T), N=$(g.N), M=$(g.M)" @@ -24,4 +24,4 @@ end run!(::GEMM, C, A, B) = mul!(C, A, B) -register_benchmark("sgemm", GEMM) +register_benchmark("gemm", GEMM) diff --git a/benchmark/src/benchmarks/grayscott.jl b/benchmark/src/benchmarks/grayscott.jl index 0a7eb02d..b4c60306 100644 --- a/benchmark/src/benchmarks/grayscott.jl +++ b/benchmark/src/benchmarks/grayscott.jl @@ -11,20 +11,26 @@ function GSParams{T}(; dx=1, c_u=1.0, c_v=0.3, f=0.03, k=0.06) where {T} GSParams{T}(T(dx), T(dx / 5), T(c_u), T(c_v), T(f), T(k)) end -Base.@kwdef struct GrayScott{T} <: AbstractBenchmark{T} +abstract type AbstractGrayScott{T} <: AbstractBenchmark{T} end + +Base.@kwdef struct GrayScottBaseline{T} <: AbstractGrayScott{T} N::Int M::Int - variant::Symbol = :baseline end -name(::GrayScott) = "grayscott" -dims(b::GrayScott) = (b.N, b.M) -data(b::GrayScott{T}) where {T} = "GrayScott with T=$(T), N=$(b.N), M=$(b.M)" -allowed_types(::Type{GrayScott}) = cuNumeric.SUPPORTED_FLOAT_TYPES -total_flops(b::GrayScott) = b.N * b.M # grid points updated per step +Base.@kwdef struct GrayScottLifetimes{T} <: AbstractGrayScott{T} + N::Int + M::Int +end -function build_benchmark(::Type{GrayScott}, ::Type{T}, N, M, variant) where {T} - GrayScott{T}(; N=N, M=M, variant=Symbol(variant)) +name(::AbstractGrayScott) = "grayscott" +dims(b::AbstractGrayScott) = (b.N, b.M) +data(b::AbstractGrayScott{T}) where {T} = "GrayScott with T=$(T), N=$(b.N), M=$(b.M)" +allowed_types(::Type{AbstractGrayScott}) = 
cuNumeric.SUPPORTED_FLOAT_TYPES +total_flops(b::AbstractGrayScott) = b.N * b.M # grid points updated per step + +function build_benchmark(::Type{A}, ::Type{T}, N, M) where {A<:AbstractGrayScott,T} + A{T}(; N=N, M=M) end mutable struct GrayScottState{A,P} @@ -35,7 +41,7 @@ mutable struct GrayScottState{A,P} params::P end -function initialize(b::GrayScott{T}) where {T} +function initialize(b::AbstractGrayScott{T}) where {T} d = (b.N, b.M) u = cuNumeric.ones(T, d) v = cuNumeric.zeros(T, d) @@ -104,22 +110,17 @@ let body = quote v_new[1, :] = v[end - 1, :] v_new[end, :] = v[2, :] end - @eval _gs_step!(::Val{:baseline}, u, v, u_new, v_new, args::GSParams) = $body - @eval _gs_step!(::Val{:lifetimes}, u, v, u_new, v_new, args::GSParams) = @analyze_lifetimes $body -end - -# Variants not special-cased (e.g. testing fusion) run the baseline path. -function _gs_step!(::Val, u, v, u_new, v_new, args::GSParams) - _gs_step!(Val(:baseline), u, v, u_new, v_new, args) + @eval _gs_step!(b::GrayScottBaseline, u, v, u_new, v_new, args::GSParams) = $body + @eval _gs_step!(b::GrayScottLifetimes, u, v, u_new, v_new, args::GSParams) = @analyze_lifetimes $body end -function run!(b::GrayScott, st::GrayScottState) - _gs_step!(Val(b.variant), st.u, st.v, st.u_new, st.v_new, st.params) +function run!(b::AbstractGrayScott, st::GrayScottState) + _gs_step!(b, st.u, st.v, st.u_new, st.v_new, st.params) # swap references rather than copy st.u, st.u_new = st.u_new, st.u st.v, st.v_new = st.v_new, st.v return nothing end -register_variant("lifetimes") -register_benchmark("grayscott", GrayScott) +register_benchmark("grayscott_baseline", GrayScottBaseline) +register_benchmark("grayscott_lifetimes", GrayScottLifetimes) diff --git a/benchmark/src/core.jl b/benchmark/src/core.jl index b87c9fd0..698596ac 100644 --- a/benchmark/src/core.jl +++ b/benchmark/src/core.jl @@ -47,32 +47,6 @@ end ######################################### -# `setup` runs in the worker before the benchmark is built (e.g. 
flip a runtime -# preference); code-path variants leave it a no-op. -struct Variant - name::String - setup::Function -end - -const VARIANTS = Dict{String,Variant}() - -function register_variant(name, setup=() -> nothing) - VARIANTS[name] = Variant(name, setup) -end - -function variant_setup(name) - if haskey(VARIANTS, name) - return VARIANTS[name].setup - end - return () -> nothing -end - -register_variant("baseline") -# register_variant("fusion_off", cuNumeric.CNPreferences.disable_broadcast_fusion!) -# register_variant("fusion_on", cuNumeric.CNPreferences.enable_broadcast_fusion!) - -######################################### - # Per-trial timings for one benchmark. `times_ms[i]`/`gflops[i]` are the mean # over `n_iter` iterations for trial `i`; the spread across trials gives stddev. struct BenchmarkResult{B<:AbstractBenchmark} @@ -86,7 +60,6 @@ function _trial(b::AbstractBenchmark, gs::GlobalSettings) GC.gc(true) state = initialize(b) - start_time = zero(get_time_microseconds()) for idx in 1:(gs.n_warmup + gs.n_iter) if idx == gs.n_warmup + 1 start_time = get_time_microseconds() @@ -114,7 +87,7 @@ end _std(x) = length(x) > 1 ? std(x) : 0.0 -function save_result(br::BenchmarkResult, gpus, variant) +function save_result(br::BenchmarkResult, gpus) N, M = dims(br.benchmark) path = joinpath(@__DIR__, "..", "results", "$(name(br.benchmark)).csv") mkpath(dirname(path)) @@ -122,9 +95,35 @@ function save_result(br::BenchmarkResult, gpus, variant) for trial in eachindex(br.times_ms) @printf( io, "%s,%s,%d,%d,%d,%d,%.6f,%.6f\n", - "cunumeric", variant, gpus, N, M, trial, + "cunumeric", gpus, N, M, trial, br.times_ms[trial], br.gflops[trial], ) end end end + +######################################### + +# `setup` runs in the worker before the benchmark is built (e.g. flip a runtime +# preference); code-path variants leave it a no-op. 
+# struct Variant +# name::String +# setup::Function +# end + +# const VARIANTS = Dict{String,Variant}() + +# function register_variant(name, setup=() -> nothing) +# VARIANTS[name] = Variant(name, setup) +# end + +# function variant_setup(name) +# if haskey(VARIANTS, name) +# return VARIANTS[name].setup +# end +# return () -> nothing +# end + +# register_variant("baseline") +# register_variant("fusion_off", cuNumeric.CNPreferences.disable_broadcast_fusion!) +# register_variant("fusion_on", cuNumeric.CNPreferences.enable_broadcast_fusion!) diff --git a/benchmark/src/parse_benchmarks.jl b/benchmark/src/parse_benchmarks.jl index 7144097c..f5211b5a 100644 --- a/benchmark/src/parse_benchmarks.jl +++ b/benchmark/src/parse_benchmarks.jl @@ -3,15 +3,14 @@ using TOML """ One benchmark invocation parsed from `benchmarks.toml`. `name` selects the benchmark type from `BENCHMARKS`; `T` is the element type (e.g. "Float32"); -`variant` names the run variant (e.g. "baseline", "lifetimes"); `args` are the -sizes (currently `N M`). +`args` are the sizes (currently `N M`). """ struct BenchmarkSpec name::String T::String - variant::String gpus::Int cpus::Int + fusion::Bool args::Vector{Int} end @@ -46,20 +45,19 @@ function parse_config(path) name == "Global" && continue for e in entries types = aslist(get(e, "T", "Float32")) - variants = aslist(get(e, "variants", "baseline")) gpus = aslist(e["gpus"]) cpus = aslist(e["cpus"]) + # fusion = get(e, "fusion", true) N = aslist(e["N"]) M = aslist(get(e, "M", 1)) n = sweep_length(name, ["gpus" => gpus, "cpus" => cpus, "N" => N, "M" => M]) - # `T` and `variants` multiply; gpus/cpus/N/M zip into the sweep. 
- for T in types, variant in variants, i in 1:n + for T in types for i in 1:n push!( specs, BenchmarkSpec( - name, T, variant, sweep_value(gpus, i), sweep_value(cpus, i), + name, T, sweep_value(gpus, i), sweep_value(cpus, i), [sweep_value(N, i), sweep_value(M, i)], ), ) diff --git a/benchmark/src/single.jl b/benchmark/src/single.jl index 86b09f78..e4ee7cdb 100644 --- a/benchmark/src/single.jl +++ b/benchmark/src/single.jl @@ -5,29 +5,30 @@ using cuNumeric using LinearAlgebra -include("benchmarks.jl") +include("core.jl") +const BENCHMARK_DIR = joinpath(@__DIR__, "benchmarks") +include.(filter(contains(r".jl$"), readdir(BENCHMARK_DIR; join=true))) # Resolve a TOML type string like "Float32" to the actual Julia type. parse_type(s) = getfield(Base, Symbol(s))::DataType -function run_single(gpus, name, T_str, N, M, n_iter, n_warmup, n_trial, variant) +function run_single(gpus, name, T_str, N, M, n_iter, n_warmup, n_trial) haskey(BENCHMARKS, name) || error( "No benchmark registered for '$(name)'. Known: $(join(sort(collect(keys(BENCHMARKS))), ", "))" ) T = parse_type(T_str) - variant_setup(variant)() # apply any pre-run setup (e.g. 
flip a runtime preference) - b = build_benchmark(BENCHMARKS[name], T, N, M, variant) + b = build_benchmark(BENCHMARKS[name], T, N, M) gs = GlobalSettings(; n_warmup=n_warmup, n_iter=n_iter, n_trial=n_trial) println( - "[cuNumeric] $(name) [$(variant)] benchmark ($(T)) on $(N)x$(M) for $(n_iter) " * + "[cuNumeric] $(name) benchmark ($(T)) on $(N)x$(M) for $(n_iter) " * "iterations ($(n_warmup) warmup) x $(n_trial) trials", ) br = run_benchmark(b, gs) @printf("[cuNumeric] Mean Run Time: %.5f ± %.5f ms\n", mean(br.times_ms), _std(br.times_ms)) @printf("[cuNumeric] FLOPS: %.5f ± %.5f GFLOPS\n", mean(br.gflops), _std(br.gflops)) - save_result(br, gpus, variant) + save_result(br, gpus) end gpus = parse(Int, ARGS[1]) @@ -38,5 +39,4 @@ M = parse(Int, ARGS[5]) n_iter = parse(Int, ARGS[6]) n_warmup = parse(Int, ARGS[7]) n_trial = parse(Int, ARGS[8]) -variant = ARGS[9] -run_single(gpus, bench_name, T_str, N, M, n_iter, n_warmup, n_trial, variant) +run_single(gpus, bench_name, T_str, N, M, n_iter, n_warmup, n_trial) From 866892d35b4a42458db88af620eb5db34f148949 Mon Sep 17 00:00:00 2001 From: ejmeitz Date: Wed, 3 Jun 2026 11:43:24 -0500 Subject: [PATCH 12/17] setup things to use CUDACore too --- benchmark/run_benchmark.sh | 2 ++ benchmark/src/benchmarks/gemm.jl | 8 ++++---- benchmark/src/benchmarks/grayscott.jl | 14 +++++++------- benchmark/src/benchmarks/montecarlo.jl | 4 ++-- benchmark/src/core.jl | 18 +++++++++++------- benchmark/src/single.jl | 16 ++++++++++++++++ 6 files changed, 42 insertions(+), 20 deletions(-) diff --git a/benchmark/run_benchmark.sh b/benchmark/run_benchmark.sh index ef54dfa5..752eecde 100755 --- a/benchmark/run_benchmark.sh +++ b/benchmark/run_benchmark.sh @@ -55,6 +55,8 @@ export LD_LIBRARY_PATH="" echo "Running $FILENAME with $CPUS CPUs and $GPUS GPUs" +eval "julia --project -e 'using Pkg; Pkg.dev(\"..\"); Pkg.instantiate()'" + CMD="julia --project='..' 
$FILENAME $GPUS ${EXTRA_ARGS[@]}" printf "Running: %s\n" "$CMD" diff --git a/benchmark/src/benchmarks/gemm.jl b/benchmark/src/benchmarks/gemm.jl index e4939df8..a4356792 100644 --- a/benchmark/src/benchmarks/gemm.jl +++ b/benchmark/src/benchmarks/gemm.jl @@ -14,10 +14,10 @@ end total_flops(s::GEMM) = s.N * s.N * ((2*s.M) - 1) total_space(s::GEMM{T}) where {T} = 2 * ((s.N*s.M) * sizeof(T)) + ((s.N*s.N) * sizeof(T)) -function initialize(s::GEMM{T}) where {T} - A = cuNumeric.rand(T, s.N, s.M) - B = cuNumeric.rand(T, s.M, s.N) - C = cuNumeric.zeros(T, s.N, s.N) +function initialize(s::GEMM{T}; mod=cuNumeric) where {T} + A = mod.rand(T, s.N, s.M) + B = mod.rand(T, s.M, s.N) + C = mod.zeros(T, s.N, s.N) GC.gc() return C, A, B end diff --git a/benchmark/src/benchmarks/grayscott.jl b/benchmark/src/benchmarks/grayscott.jl index b4c60306..a2d51315 100644 --- a/benchmark/src/benchmarks/grayscott.jl +++ b/benchmark/src/benchmarks/grayscott.jl @@ -41,16 +41,16 @@ mutable struct GrayScottState{A,P} params::P end -function initialize(b::AbstractGrayScott{T}) where {T} +function initialize(b::AbstractGrayScott{T}; mod=cuNumeric) where {T} d = (b.N, b.M) - u = cuNumeric.ones(T, d) - v = cuNumeric.zeros(T, d) - u_new = cuNumeric.zeros(T, d) - v_new = cuNumeric.zeros(T, d) + u = mod.ones(T, d) + v = mod.zeros(T, d) + u_new = mod.zeros(T, d) + v_new = mod.zeros(T, d) seed = min(150, b.N, b.M) - u[1:seed, 1:seed] = cuNumeric.rand(T, (seed, seed)) - v[1:seed, 1:seed] = cuNumeric.rand(T, (seed, seed)) + u[1:seed, 1:seed] = mod.rand(T, (seed, seed)) + v[1:seed, 1:seed] = mod.rand(T, (seed, seed)) return (GrayScottState(u, v, u_new, v_new, GSParams{T}()),) end diff --git a/benchmark/src/benchmarks/montecarlo.jl b/benchmark/src/benchmarks/montecarlo.jl index 0b5175f6..f82ebba6 100644 --- a/benchmark/src/benchmarks/montecarlo.jl +++ b/benchmark/src/benchmarks/montecarlo.jl @@ -13,9 +13,9 @@ allowed_types(::Type{MonteCarloIntegration}) = cuNumeric.SUPPORTED_FLOAT_TYPES 
total_space(s::MonteCarloIntegration{T}) where {T} = s.n_samples * sizeof(T) total_flops(s::MonteCarloIntegration) = s.n_samples -function initialize(mci::MonteCarloIntegration{T}) where {T} +function initialize(mci::MonteCarloIntegration{T}; mod=cuNumeric) where {T} # Uniform samples over the integration domain [0, 10]. - x = T(10) .* cuNumeric.rand(T, mci.n_samples) + x = T(10) .* mod.rand(T, mci.n_samples) GC.gc() return (x,) end diff --git a/benchmark/src/core.jl b/benchmark/src/core.jl index 698596ac..f8aab2c9 100644 --- a/benchmark/src/core.jl +++ b/benchmark/src/core.jl @@ -56,9 +56,9 @@ struct BenchmarkResult{B<:AbstractBenchmark} end # One timed trial: warmup, then time `n_iter` iterations of `run!`. -function _trial(b::AbstractBenchmark, gs::GlobalSettings) +function _trial(b::AbstractBenchmark, gs::GlobalSettings; mod=cuNumeric) GC.gc(true) - state = initialize(b) + state = initialize(b; mod=mod) for idx in 1:(gs.n_warmup + gs.n_iter) if idx == gs.n_warmup + 1 @@ -74,11 +74,15 @@ function _trial(b::AbstractBenchmark, gs::GlobalSettings) end # Run `n_trial` independent trials and collect their per-trial measurements. -function run_benchmark(b::AbstractBenchmark, gs::GlobalSettings) +function run_benchmark(b::AbstractBenchmark, gs::GlobalSettings; mod=cuNumeric) + + # Can only test CUDA.jl performance with 1 GPU + (mod == CUDACore && gs.n_gpu == 1) || continue + times_ms = Float64[] gflops = Float64[] for _ in 1:gs.n_trial - t, g = _trial(b, gs) + t, g = _trial(b, gs; mod=mod) push!(times_ms, t) push!(gflops, g) end @@ -87,15 +91,15 @@ end _std(x) = length(x) > 1 ? 
std(x) : 0.0 -function save_result(br::BenchmarkResult, gpus) +function save_result(br::BenchmarkResult, gpus; mod::String="cunumeric") N, M = dims(br.benchmark) - path = joinpath(@__DIR__, "..", "results", "$(name(br.benchmark)).csv") + path = joinpath(@__DIR__, "..", "results", "$(name(br.benchmark))_$(mod).csv") mkpath(dirname(path)) open(path, "a") do io for trial in eachindex(br.times_ms) @printf( io, "%s,%s,%d,%d,%d,%d,%.6f,%.6f\n", - "cunumeric", gpus, N, M, trial, + mod, gpus, N, M, trial, br.times_ms[trial], br.gflops[trial], ) end diff --git a/benchmark/src/single.jl b/benchmark/src/single.jl index e4ee7cdb..eb24ae38 100644 --- a/benchmark/src/single.jl +++ b/benchmark/src/single.jl @@ -3,6 +3,7 @@ # Args: using cuNumeric +using CUDACore using LinearAlgebra include("core.jl") @@ -29,6 +30,21 @@ function run_single(gpus, name, T_str, N, M, n_iter, n_warmup, n_trial) @printf("[cuNumeric] FLOPS: %.5f ± %.5f GFLOPS\n", mean(br.gflops), _std(br.gflops)) save_result(br, gpus) + + # Run CUDA.jl benchmark + if gpus == 1 + println( + "[CUDA.jl] $(name) benchmark ($(T)) on $(N)x$(M) for $(n_iter) " * + "iterations ($(n_warmup) warmup) x $(n_trial) trials", + ) + + br = run_benchmark(b, gs; mod=CUDACore) + + @printf("[CUDA.jl] Mean Run Time: %.5f ± %.5f ms\n", mean(br.times_ms), _std(br.times_ms)) + @printf("[CUDA.jl] FLOPS: %.5f ± %.5f GFLOPS\n", mean(br.gflops), _std(br.gflops)) + + save_result(br, gpus; mod="CUDA.jl") + end end gpus = parse(Int, ARGS[1]) From 00ed6bab794c80b3ad25df5df51df1d615ccb487 Mon Sep 17 00:00:00 2001 From: ejmeitz Date: Wed, 3 Jun 2026 12:13:45 -0500 Subject: [PATCH 13/17] kinda runs --- Project.toml | 8 +++----- benchmark/Project.toml | 2 +- benchmark/README.md | 31 +------------------------------ benchmark/run_benchmark.sh | 4 ++-- benchmark/src/core.jl | 11 +++-------- benchmark/src/parse_benchmarks.jl | 8 +++++--- 6 files changed, 15 insertions(+), 49 deletions(-) diff --git a/Project.toml b/Project.toml index c898d6b4..feca11f0 100644 
--- a/Project.toml +++ b/Project.toml @@ -2,12 +2,8 @@ name = "cuNumeric" uuid = "0fd9ffd4-7e84-4cd0-b8f8-645bd8c73620" version = "0.1.1" -[workspace] -projects = ["test", "dev"] - [deps] CNPreferences = "3e078157-ea10-49d5-bf32-908f777cd46f" -CUDA_SDK_jll = "6cbf2f2e-7e60-5632-ac76-dca2274e0be0" CxxWrap = "1f15a43c-97ca-5a2a-ae31-89f07a497df4" JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899" Legate = "1238f2cf-6593-4d60-9aca-2f5364e49909" @@ -33,7 +29,6 @@ CUDAExt = "CUDA" [compat] CNPreferences = "0.1.2" CUDA = "5.9" -CUDA_SDK_jll = "13" CxxWrap = "0.17" JuliaFormatter = "2.3.0" Legate = "0.1.2" @@ -47,3 +42,6 @@ StatsBase = "0.34" cunumeric_jl_wrapper_jll = "25.10.3" cupynumeric_jll = "25.10.3" julia = "1.10" + +[workspace] +projects = ["test", "dev"] diff --git a/benchmark/Project.toml b/benchmark/Project.toml index 71a488f9..62eb4c27 100644 --- a/benchmark/Project.toml +++ b/benchmark/Project.toml @@ -1,8 +1,8 @@ [deps] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" -CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" CUDACore = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" +Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" cuNumeric = "0fd9ffd4-7e84-4cd0-b8f8-645bd8c73620" diff --git a/benchmark/README.md b/benchmark/README.md index 762bffc6..c2345ad5 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -22,11 +22,9 @@ Repeat a `[[name]]` block to add independent configs. ## Lists -Any of `T`, `variants`, `gpus`, `cpus`, `N`, `M` may be a list. They expand along +Any of `T`, `gpus`, `cpus`, `N`, `M` may be a list. They expand along two axes: -- **`T` and `variants` multiply.** The whole sweep runs once per type and once - per variant. - **`gpus`, `cpus`, `N`, `M` zip** into a single lockstep sweep — element `i` of each is paired together. 
@@ -53,30 +51,3 @@ M = [150, 300, 600] # When `T = ["Float32", "Float64"]` and a length-2 `N`/`M` sweep you get all **4** combinations, not a paired `Float32 -> N[1], Float64 -> N[2]`. To pin a type to a specific size, use separate `[[name]]` blocks. - -## Variants - -A variant is a named way of running a benchmark. List them per entry with -`variants = [...]` (defaults to `["baseline"]`); they multiply like `T`, and the -chosen variant is recorded as a column in the results CSV so runs can be compared. - -```toml -[[grayscott]] -T = "Float64" -N = 1024 -M = [1024, 2048, 4096] -gpus = [1, 2, 4] -cpus = 2 -variants = ["baseline", "lifetimes"] # 2 variants * 3 sweep points = 6 runs -``` - -There are two kinds, both flowing through the same `variant` string: - -- **Code-path variants** change what the worker runs. The benchmark's `run!` - dispatches on the variant. Example: grayscott's `lifetimes` wraps the step in - `@analyze_lifetimes` (see `src/benchmarks/grayscott.jl`). A benchmark that - doesn't recognize a variant just runs its baseline path. -- **Process-level variants** flip a runtime setting before the run via a setup - thunk registered in `register_variant` (`src/core.jl`). The worker calls it at - startup. Broadcast fusion will plug in here once it lands, e.g. - `register_variant("fusion_off", cuNumeric.CNPreferences.disable_broadcast_fusion!)`. diff --git a/benchmark/run_benchmark.sh b/benchmark/run_benchmark.sh index 752eecde..4e50f8d0 100755 --- a/benchmark/run_benchmark.sh +++ b/benchmark/run_benchmark.sh @@ -55,9 +55,9 @@ export LD_LIBRARY_PATH="" echo "Running $FILENAME with $CPUS CPUs and $GPUS GPUs" -eval "julia --project -e 'using Pkg; Pkg.dev(\"..\"); Pkg.instantiate()'" +eval "julia --project -e 'using Pkg; Pkg.develop(path=\"..\"); Pkg.instantiate()'" -CMD="julia --project='..' 
$FILENAME $GPUS ${EXTRA_ARGS[@]}" +CMD="julia --project $FILENAME $GPUS ${EXTRA_ARGS[@]}" printf "Running: %s\n" "$CMD" eval "$CMD" diff --git a/benchmark/src/core.jl b/benchmark/src/core.jl index f8aab2c9..33c4a4ef 100644 --- a/benchmark/src/core.jl +++ b/benchmark/src/core.jl @@ -39,9 +39,7 @@ function register_benchmark(key::AbstractString, ::Type{B}) where {B<:AbstractBe BENCHMARKS[key] = B end -# Default uses (N, M); benchmarks with a code-path variant or different arity -# override this (see grayscott.jl / montecarlo.jl). -function build_benchmark(::Type{B}, ::Type{T}, N, M, variant) where {B<:AbstractBenchmark,T} +function build_benchmark(::Type{B}, ::Type{T}, N, M) where {B<:AbstractBenchmark,T} B{T}(; N=N, M=M) end @@ -60,6 +58,7 @@ function _trial(b::AbstractBenchmark, gs::GlobalSettings; mod=cuNumeric) GC.gc(true) state = initialize(b; mod=mod) + start_time = nothing for idx in 1:(gs.n_warmup + gs.n_iter) if idx == gs.n_warmup + 1 start_time = get_time_microseconds() @@ -75,10 +74,6 @@ end # Run `n_trial` independent trials and collect their per-trial measurements. 
function run_benchmark(b::AbstractBenchmark, gs::GlobalSettings; mod=cuNumeric) - - # Can only test CUDA.jl performance with 1 GPU - (mod == CUDACore && gs.n_gpu == 1) || continue - times_ms = Float64[] gflops = Float64[] for _ in 1:gs.n_trial @@ -98,7 +93,7 @@ function save_result(br::BenchmarkResult, gpus; mod::String="cunumeric") open(path, "a") do io for trial in eachindex(br.times_ms) @printf( - io, "%s,%s,%d,%d,%d,%d,%.6f,%.6f\n", + io, "%s,%d,%d,%d,%d,%.6f,%.6f\n", mod, gpus, N, M, trial, br.times_ms[trial], br.gflops[trial], ) diff --git a/benchmark/src/parse_benchmarks.jl b/benchmark/src/parse_benchmarks.jl index f5211b5a..3b7c21df 100644 --- a/benchmark/src/parse_benchmarks.jl +++ b/benchmark/src/parse_benchmarks.jl @@ -10,7 +10,6 @@ struct BenchmarkSpec T::String gpus::Int cpus::Int - fusion::Bool args::Vector{Int} end @@ -53,11 +52,14 @@ function parse_config(path) n = sweep_length(name, ["gpus" => gpus, "cpus" => cpus, "N" => N, "M" => M]) - for T in types for i in 1:n + for T in types, i in 1:n push!( specs, BenchmarkSpec( - name, T, sweep_value(gpus, i), sweep_value(cpus, i), + name, + T, + sweep_value(gpus, i), + sweep_value(cpus, i), [sweep_value(N, i), sweep_value(M, i)], ), ) From 92d8ac367a8191b6a15af3799faa794e4a0fcdaa Mon Sep 17 00:00:00 2001 From: krasow Date: Wed, 3 Jun 2026 19:34:21 -0500 Subject: [PATCH 14/17] add python benchmarks, install_cupynumeric.sh, and python benchmark runner. run.jl will construct and orchestrate the python runs as well. 
--- .gitignore | 3 + benchmark/README.md | 25 ++++++++ benchmark/benchmarks.toml | 2 + benchmark/install_cupynumeric.sh | 73 +++++++++++++++++++++++ benchmark/run.jl | 52 +++++++++++++--- benchmark/run_benchmark.sh | 19 +++++- benchmark/src/benchmarks/montecarlo.jl | 4 +- benchmark/src/core.jl | 2 + benchmark/src/parse_benchmarks.jl | 21 ++++++- benchmark/src/single.jl | 48 +++++++-------- benchmark/src_py/benchmarks/__init__.py | 8 +++ benchmark/src_py/benchmarks/gemm.py | 29 +++++++++ benchmark/src_py/benchmarks/grayscott.py | 71 ++++++++++++++++++++++ benchmark/src_py/benchmarks/montecarlo.py | 28 +++++++++ benchmark/src_py/core.py | 57 ++++++++++++++++++ benchmark/src_py/single.py | 48 +++++++++++++++ 16 files changed, 449 insertions(+), 41 deletions(-) create mode 100755 benchmark/install_cupynumeric.sh create mode 100644 benchmark/src_py/benchmarks/__init__.py create mode 100644 benchmark/src_py/benchmarks/gemm.py create mode 100644 benchmark/src_py/benchmarks/grayscott.py create mode 100644 benchmark/src_py/benchmarks/montecarlo.py create mode 100644 benchmark/src_py/core.py create mode 100644 benchmark/src_py/single.py diff --git a/.gitignore b/.gitignore index c2af1d47..29f3d39d 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,9 @@ help.sh docker.log docs/package-lock.json +__pycache__ +*.pyc + # auto-generated script build_wrapper.sh diff --git a/benchmark/README.md b/benchmark/README.md index c2345ad5..bd646666 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -2,6 +2,31 @@ Benchmarks are declared in `benchmarks.toml`. `run.jl` parses it. +## Running + +```bash +julia --project run.jl # runs whatever benchmarks.toml configures +``` + +`run.jl` runs each (benchmark, backend) pair in its own process via +`run_benchmark.sh`, so backends never share a GPU/runtime within a measurement. 
+cuNumeric always runs; extra comparison backends are toggled in `[Global]`: + +- `cuda = true` → also run under CUDA.jl (single-GPU configs only; CUDA.jl is + single-device). +- `cupynumeric = true` → also run under cupynumeric (see below). + +### Comparing against cupynumeric + +cupynumeric runs in a conda env whose major.minor matches this project's +resolved `cupynumeric_jll`. Build it once: + +```bash +./install_cupynumeric.sh # creates env cupynumeric-bench-<major.minor> +``` + +`run.jl` derives the env name automatically; override it with `CUPYNUMERIC_ENV`. + ## Layout ```toml diff --git a/benchmark/benchmarks.toml b/benchmark/benchmarks.toml index 855a3b8d..688ae9e2 100644 --- a/benchmark/benchmarks.toml +++ b/benchmark/benchmarks.toml @@ -2,6 +2,8 @@ n_warmup = 5 n_iter = 1000 n_trial = 5 +cupynumeric = true # (needs install_cupynumeric.sh) +cuda = false # compare against CUDA.jl (single-GPU configs only) #################################### # GEMM # diff --git a/benchmark/install_cupynumeric.sh b/benchmark/install_cupynumeric.sh new file mode 100755 index 00000000..541a654c --- /dev/null +++ b/benchmark/install_cupynumeric.sh @@ -0,0 +1,73 @@ +#!/bin/bash +# Install a cupynumeric conda env matching the cupynumeric_jll our project resolves. +# The conda package and the JLL share the calendar-versioning scheme (e.g. 25.10), +# so we pin major.minor (patch ignored) and install from the legate channel. 
+# +# Usage: +# ./install_cupynumeric.sh # create a fresh env named cupynumeric-bench- +# ./install_cupynumeric.sh --name myenv # override the env name +# ./install_cupynumeric.sh --into existing # install into an existing env instead of creating one +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +ENV_NAME="" +INTO_ENV="" + +while [[ $# -gt 0 ]]; do + case $1 in + --name) + ENV_NAME=$2 + shift 2 + ;; + --into) + INTO_ENV=$2 + shift 2 + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 [--name ] [--into ]" + exit 1 + ;; + esac +done + +# Resolve the JLL version Julia actually instantiated for this project, then keep +# major.minor only — conda packages are not published per patch. +echo "Detecting cupynumeric_jll version from the benchmark project..." +VER=$(cd "$SCRIPT_DIR" && julia --project -e ' +using Pkg +for (_, info) in Pkg.dependencies() + info.name == "cupynumeric_jll" || continue + v = info.version + isnothing(v) && continue + println("$(v.major).$(v.minor)") +end' | tail -1) + +if [[ -z "$VER" ]]; then + echo "Error: could not detect cupynumeric_jll version. Has the project been instantiated?" + exit 1 +fi + +echo "cupynumeric_jll major.minor: $VER" +SPEC="cupynumeric=$VER.*" + +if [[ -n "$INTO_ENV" ]]; then + echo "Installing $SPEC into existing env '$INTO_ENV'..." + conda install -y -n "$INTO_ENV" -c conda-forge -c legate "$SPEC" + echo "Done. Activate with: conda activate $INTO_ENV" + exit 0 +fi + +[[ -z "$ENV_NAME" ]] && ENV_NAME="cupynumeric-bench-$VER" + +if conda env list | awk '{print $1}' | grep -qx "$ENV_NAME"; then + echo "Env '$ENV_NAME' already exists with $SPEC; nothing to do." + echo "Activate with: conda activate $ENV_NAME" + exit 0 +fi + +echo "Creating env '$ENV_NAME' with $SPEC..." +conda create -y -n "$ENV_NAME" -c conda-forge -c legate "$SPEC" + +echo "Done. 
Activate with: conda activate $ENV_NAME" diff --git a/benchmark/run.jl b/benchmark/run.jl index 89d6d28b..ff40e21d 100644 --- a/benchmark/run.jl +++ b/benchmark/run.jl @@ -6,26 +6,61 @@ # Orchestrator stays off the GPU: it only needs GlobalSettings + parse_config, # both cuNumeric-free. The worker (single.jl) loads cuNumeric and the kernels. + +using Pkg + include("src/core.jl") include("src/parse_benchmarks.jl") const RUNNER = joinpath(@__DIR__, "run_benchmark.sh") const WORKER = joinpath(@__DIR__, "src/single.jl") +const PY_WORKER = joinpath(@__DIR__, "src_py/single.py") banner(msg) = println("\n", "="^128, "\n", msg, "\n", "="^128) -function dispatch(; gpus, cpus, name, T, N, M, n_iter, n_warmup, n_trial) - # Name validity is checked in the worker (single.jl), which owns the registry. +# ensure things are resolved and devlop'd properly +function ensure_project_ready() + Pkg.develop(; path=joinpath(@__DIR__, "..")) + Pkg.instantiate() +end + +# default env name mirrors install_cupynumeric.sh: cupynumeric-bench-. +# CUPYNUMERIC_ENV overrides it. +function cupynumeric_env_name() + haskey(ENV, "CUPYNUMERIC_ENV") && return ENV["CUPYNUMERIC_ENV"] + for (_, info) in Pkg.dependencies() + info.name == "cupynumeric_jll" || continue + info.version === nothing && continue + return "cupynumeric-bench-$(info.version.major).$(info.version.minor)" + end + error("could not resolve cupynumeric_jll version; set CUPYNUMERIC_ENV explicitly") +end + +function dispatch(; gpus, cpus, name, T, N, M, n_iter, n_warmup, n_trial, + cupynumeric=false, cuda=false) banner( "$(name): T=$(T) gpus=$(gpus) cpus=$(cpus) N=$(N) M=$(M) " * "n_iter=$(n_iter) n_warmup=$(n_warmup) n_trial=$(n_trial)", ) - cmd = `bash $RUNNER $WORKER --gpus $gpus --cpus $cpus $name $T $N $M $n_iter $n_warmup $n_trial` - try - run(cmd) - catch e - @error "Benchmark '$(name)' failed; continuing." 
exception = e + # each backend runs in its own worker process + args = `--gpus $gpus --cpus $cpus $name $T $N $M $n_iter $n_warmup $n_trial` + cmds = [`bash $RUNNER $WORKER $args cunumeric`] + # CUDA.jl is single-GPU only + if cuda && gpus == 1 + push!(cmds, `bash $RUNNER $WORKER $args cudajl`) + end + # cupynumeric has no code-path variants; only baseline benchmarks compare against it + if cupynumeric && !endswith(name, "_lifetimes") + push!(cmds, `bash $RUNNER $PY_WORKER --pyenv $(cupynumeric_env_name()) $args`) + end + + for cmd in cmds + try + run(cmd) + catch e + @error "Benchmark '$(name)' failed; continuing." exception = e + end end end @@ -42,10 +77,13 @@ function run_all_benchmarks(config="benchmarks.toml") n_iter=gs.n_iter, n_warmup=gs.n_warmup, n_trial=gs.n_trial, + cupynumeric=gs.cupynumeric, + cuda=gs.cuda, ) end end +ensure_project_ready() if isempty(ARGS) run_all_benchmarks() else # dispatch on args diff --git a/benchmark/run_benchmark.sh b/benchmark/run_benchmark.sh index 4e50f8d0..b802f7bc 100755 --- a/benchmark/run_benchmark.sh +++ b/benchmark/run_benchmark.sh @@ -11,6 +11,7 @@ shift GPUS=0 CPUS=1 +PYENV="" while [[ $# -gt 0 ]]; do case $1 in @@ -22,6 +23,10 @@ while [[ $# -gt 0 ]]; do CPUS=$2 shift 2 ;; + --pyenv) + PYENV=$2 + shift 2 + ;; *) # Collect all other arguments as extra arguments EXTRA_ARGS+=("$1") @@ -55,9 +60,17 @@ export LD_LIBRARY_PATH="" echo "Running $FILENAME with $CPUS CPUs and $GPUS GPUs" -eval "julia --project -e 'using Pkg; Pkg.develop(path=\"..\"); Pkg.instantiate()'" - -CMD="julia --project $FILENAME $GPUS ${EXTRA_ARGS[@]}" +# Python (cupynumeric) workers run in the conda env built by install_cupynumeric.sh; +# Julia (cuNumeric) workers run against the local project. +if [[ $FILENAME == *.py ]]; then + if [[ -z $PYENV ]]; then + echo "Error: running a .py worker requires --pyenv (run install_cupynumeric.sh first)." 
+ exit 1 + fi + CMD="conda run --no-capture-output -n $PYENV python $FILENAME $GPUS ${EXTRA_ARGS[@]}" +else + CMD="julia --project $FILENAME $GPUS ${EXTRA_ARGS[@]}" +fi printf "Running: %s\n" "$CMD" eval "$CMD" diff --git a/benchmark/src/benchmarks/montecarlo.jl b/benchmark/src/benchmarks/montecarlo.jl index f82ebba6..1df91c97 100644 --- a/benchmark/src/benchmarks/montecarlo.jl +++ b/benchmark/src/benchmarks/montecarlo.jl @@ -23,8 +23,8 @@ end _domain_volume(mci::MonteCarloIntegration{T}) where {T} = T(10) / mci.n_samples run!(mci::MonteCarloIntegration, x) = _domain_volume(mci) * sum(exp.(-x .^ 2)) -# n_samples comes in as N; M and variant are unused. -function build_benchmark(::Type{MonteCarloIntegration}, ::Type{T}, N, M, variant) where {T} +# n_samples comes in as N; M is unused. +function build_benchmark(::Type{MonteCarloIntegration}, ::Type{T}, N, M) where {T} MonteCarloIntegration{T}(; n_samples=N) end diff --git a/benchmark/src/core.jl b/benchmark/src/core.jl index 33c4a4ef..db452c6e 100644 --- a/benchmark/src/core.jl +++ b/benchmark/src/core.jl @@ -17,6 +17,8 @@ Base.@kwdef struct GlobalSettings n_iter::Int # Number of iterations to run per trial n_trial::Int = 1 # Number of independent trials to run. Benchmark n_gpu::Int = 0 + cupynumeric::Bool = false # also run baselines under cupynumeric for comparison + cuda::Bool = false # also run under CUDA.jl for comparison (single-GPU only) end ######################################### diff --git a/benchmark/src/parse_benchmarks.jl b/benchmark/src/parse_benchmarks.jl index 3b7c21df..605c5002 100644 --- a/benchmark/src/parse_benchmarks.jl +++ b/benchmark/src/parse_benchmarks.jl @@ -31,17 +31,32 @@ function sweep_length(name, fields) return first(lengths) end +# Names of the `[[name]]` blocks in the order they appear in the file. TOML.jl +# parses into an unordered Dict, so we scan the source to preserve run order. 
+function declared_order(path) + order = String[] + for line in eachline(path) + header = strip(line) + startswith(header, "[[") && endswith(header, "]]") || continue + name = strip(header[3:(end - 2)]) + name in order || push!(order, name) # if not in list, push to ordered list + end + return order +end + function parse_config(path) raw = TOML.parsefile(path) g = raw["Global"] global_settings = GlobalSettings(; - n_warmup=g["n_warmup"], n_iter=g["n_iter"], n_trial=get(g, "n_trial", 1) + n_warmup=g["n_warmup"], n_iter=g["n_iter"], n_trial=get(g, "n_trial", 1), + cupynumeric=get(g, "cupynumeric", false), + cuda=get(g, "cuda", false), ) specs = BenchmarkSpec[] - for (name, entries) in raw - name == "Global" && continue + for name in declared_order(path) + entries = raw[name] for e in entries types = aslist(get(e, "T", "Float32")) gpus = aslist(e["gpus"]) diff --git a/benchmark/src/single.jl b/benchmark/src/single.jl index eb24ae38..5b2fff54 100644 --- a/benchmark/src/single.jl +++ b/benchmark/src/single.jl @@ -1,6 +1,7 @@ -# single.jl: worker that runs exactly one benchmark. Launched by run_benchmark.sh -# (dispatched from run.jl), which sets LEGATE_CONFIG in the env before julia starts. -# Args: +# single.jl: worker that runs exactly one benchmark under one backend. Launched by +# run_benchmark.sh (dispatched from run.jl), which sets LEGATE_CONFIG before julia starts. +# Args: +# backend is "cunumeric" or "cudajl"; run.jl launches one worker per backend. using cuNumeric using CUDACore @@ -13,38 +14,32 @@ include.(filter(contains(r".jl$"), readdir(BENCHMARK_DIR; join=true))) # Resolve a TOML type string like "Float32" to the actual Julia type. parse_type(s) = getfield(Base, Symbol(s))::DataType -function run_single(gpus, name, T_str, N, M, n_iter, n_warmup, n_trial) +# mod runs the kernels; label tags stdout; save_as names the results CSV. 
+const BACKENDS = Dict( + "cunumeric" => (mod=cuNumeric, label="cuNumeric", save_as="cunumeric"), + "cudajl" => (mod=CUDACore, label="CUDA.jl", save_as="CUDA.jl"), +) + +function run_single(gpus, name, T_str, N, M, n_iter, n_warmup, n_trial, backend) haskey(BENCHMARKS, name) || error( "No benchmark registered for '$(name)'. Known: $(join(sort(collect(keys(BENCHMARKS))), ", "))" ) + haskey(BACKENDS, backend) || error( + "Unknown backend '$(backend)'. Known: $(join(sort(collect(keys(BACKENDS))), ", "))" + ) + bk = BACKENDS[backend] T = parse_type(T_str) b = build_benchmark(BENCHMARKS[name], T, N, M) gs = GlobalSettings(; n_warmup=n_warmup, n_iter=n_iter, n_trial=n_trial) println( - "[cuNumeric] $(name) benchmark ($(T)) on $(N)x$(M) for $(n_iter) " * + "[$(bk.label)] $(name) benchmark ($(T)) on $(N)x$(M) for $(n_iter) " * "iterations ($(n_warmup) warmup) x $(n_trial) trials", ) - br = run_benchmark(b, gs) - @printf("[cuNumeric] Mean Run Time: %.5f ± %.5f ms\n", mean(br.times_ms), _std(br.times_ms)) - @printf("[cuNumeric] FLOPS: %.5f ± %.5f GFLOPS\n", mean(br.gflops), _std(br.gflops)) - - save_result(br, gpus) - - # Run CUDA.jl benchmark - if gpus == 1 - println( - "[CUDA.jl] $(name) benchmark ($(T)) on $(N)x$(M) for $(n_iter) " * - "iterations ($(n_warmup) warmup) x $(n_trial) trials", - ) - - br = run_benchmark(b, gs; mod=CUDACore) - - @printf("[CUDA.jl] Mean Run Time: %.5f ± %.5f ms\n", mean(br.times_ms), _std(br.times_ms)) - @printf("[CUDA.jl] FLOPS: %.5f ± %.5f GFLOPS\n", mean(br.gflops), _std(br.gflops)) - - save_result(br, gpus; mod="CUDA.jl") - end + br = run_benchmark(b, gs; mod=bk.mod) + @printf("[%s] Mean Run Time: %.5f ± %.5f ms\n", bk.label, mean(br.times_ms), _std(br.times_ms)) + @printf("[%s] FLOPS: %.5f ± %.5f GFLOPS\n", bk.label, mean(br.gflops), _std(br.gflops)) + save_result(br, gpus; mod=bk.save_as) end gpus = parse(Int, ARGS[1]) @@ -55,4 +50,5 @@ M = parse(Int, ARGS[5]) n_iter = parse(Int, ARGS[6]) n_warmup = parse(Int, ARGS[7]) n_trial = 
parse(Int, ARGS[8]) -run_single(gpus, bench_name, T_str, N, M, n_iter, n_warmup, n_trial) +backend = ARGS[9] +run_single(gpus, bench_name, T_str, N, M, n_iter, n_warmup, n_trial, backend) diff --git a/benchmark/src_py/benchmarks/__init__.py b/benchmark/src_py/benchmarks/__init__.py new file mode 100644 index 00000000..2eee4477 --- /dev/null +++ b/benchmark/src_py/benchmarks/__init__.py @@ -0,0 +1,8 @@ +import importlib +import pkgutil + +from core import BENCHMARKS + +# Import each module so it self-registers into BENCHMARKS. +for _info in pkgutil.iter_modules(__path__): + importlib.import_module(f"{__name__}.{_info.name}") diff --git a/benchmark/src_py/benchmarks/gemm.py b/benchmark/src_py/benchmarks/gemm.py new file mode 100644 index 00000000..b5d1a4b3 --- /dev/null +++ b/benchmark/src_py/benchmarks/gemm.py @@ -0,0 +1,29 @@ +import cupynumeric as np + +from core import register_benchmark + + +class GEMM: + name = "gemm" + + def __init__(self, T, N, M): + self.T, self.N, self.M = T, N, M + + def dims(self): + return self.N, self.M + + def total_flops(self): + return self.N * self.N * (2 * self.M - 1) + + def initialize(self): + A = np.random.rand(self.N, self.M).astype(self.T) + B = np.random.rand(self.M, self.N).astype(self.T) + C = np.zeros((self.N, self.N), dtype=self.T) + return (C, A, B) + + def run(self, state): + C, A, B = state + np.matmul(A, B, out=C) + + +register_benchmark("gemm", GEMM) diff --git a/benchmark/src_py/benchmarks/grayscott.py b/benchmark/src_py/benchmarks/grayscott.py new file mode 100644 index 00000000..a1a89e73 --- /dev/null +++ b/benchmark/src_py/benchmarks/grayscott.py @@ -0,0 +1,71 @@ +import cupynumeric as np + +from core import register_benchmark + + +class GrayScott: + name = "grayscott" + + # dt = dx/5; c_u, c_v, f, k as in grayscott.jl's GSParams defaults. 
+ def __init__(self, T, N, M, dx=1.0, c_u=1.0, c_v=0.3, f=0.03, k=0.06): + self.T, self.N, self.M = T, N, M + self.dx = T(dx) + self.dt = T(dx / 5) + self.c_u, self.c_v, self.f, self.k = T(c_u), T(c_v), T(f), T(k) + + def dims(self): + return self.N, self.M + + def total_flops(self): + return self.N * self.M + + def initialize(self): + d = (self.N, self.M) + u = np.ones(d, dtype=self.T) + v = np.zeros(d, dtype=self.T) + u_new = np.zeros(d, dtype=self.T) + v_new = np.zeros(d, dtype=self.T) + + seed = min(150, self.N, self.M) + u[:seed, :seed] = np.random.rand(seed, seed).astype(self.T) + v[:seed, :seed] = np.random.rand(seed, seed).astype(self.T) + # mutable list so run() can swap buffers in place + return [u, v, u_new, v_new] + + def run(self, state): + u, v, u_new, v_new = state + ui = u[1:-1, 1:-1] + vi = v[1:-1, 1:-1] + + F_u = (-ui * (vi * vi)) + self.f * (1 - ui) + F_v = (ui * (vi * vi)) - (self.f + self.k) * vi + + dx2 = self.dx * self.dx + u_lap = ( + (u[2:, 1:-1] - 2 * ui + u[:-2, 1:-1]) / dx2 + + (u[1:-1, 2:] - 2 * ui + u[1:-1, :-2]) / dx2 + ) + v_lap = ( + (v[2:, 1:-1] - 2 * vi + v[:-2, 1:-1]) / dx2 + + (v[1:-1, 2:] - 2 * vi + v[1:-1, :-2]) / dx2 + ) + + u_new[1:-1, 1:-1] = (self.c_u * u_lap + F_u) * self.dt + ui + v_new[1:-1, 1:-1] = (self.c_v * v_lap + F_v) * self.dt + vi + + # periodic boundary conditions + u_new[:, 0] = u[:, -2] + u_new[:, -1] = u[:, 1] + u_new[0, :] = u[-2, :] + u_new[-1, :] = u[1, :] + v_new[:, 0] = v[:, -2] + v_new[:, -1] = v[:, 1] + v_new[0, :] = v[-2, :] + v_new[-1, :] = v[1, :] + + # swap references rather than copy + state[0], state[2] = u_new, u + state[1], state[3] = v_new, v + + +register_benchmark("grayscott_baseline", GrayScott) diff --git a/benchmark/src_py/benchmarks/montecarlo.py b/benchmark/src_py/benchmarks/montecarlo.py new file mode 100644 index 00000000..370fc7b9 --- /dev/null +++ b/benchmark/src_py/benchmarks/montecarlo.py @@ -0,0 +1,28 @@ +import cupynumeric as np + +from core import register_benchmark + + +class 
MonteCarlo: + name = "montecarlo" + + def __init__(self, T, N, M): + self.T = T + self.n_samples = N + + def dims(self): + return self.n_samples, 1 + + def total_flops(self): + return self.n_samples + + def initialize(self): + x = (self.T(10) * np.random.rand(self.n_samples)).astype(self.T) + return (x,) + + def run(self, state): + (x,) = state + return (self.T(10) / self.n_samples) * np.sum(np.exp(-(x * x))) + + +register_benchmark("montecarlo", MonteCarlo) diff --git a/benchmark/src_py/core.py b/benchmark/src_py/core.py new file mode 100644 index 00000000..f32a4fc3 --- /dev/null +++ b/benchmark/src_py/core.py @@ -0,0 +1,57 @@ +import os +import math + +import cupynumeric as np +from legate.timing import time # blocks on preceding legate ops; returns microseconds + +MOD = "cupynumeric" +RESULTS_DIR = os.path.join(os.path.dirname(__file__), "..", "results") + +DTYPES = {"Float32": np.float32, "Float64": np.float64} + + +def parse_type(s): + if s not in DTYPES: + raise ValueError(f"Unsupported type '{s}'. 
Known: {', '.join(DTYPES)}") + return DTYPES[s] + + +BENCHMARKS = {} + + +def register_benchmark(key, cls): + BENCHMARKS[key] = cls + + +def trial(bench, n_warmup, n_iter): + state = bench.initialize() + start = None + for idx in range(n_warmup + n_iter): + if idx == n_warmup: + start = time() + bench.run(state) + total_us = time() - start + + mean_time_ms = total_us / (n_iter * 1e3) + gflops = bench.total_flops() / (mean_time_ms * 1e6) + return mean_time_ms, gflops + + +def _mean(x): + return sum(x) / len(x) + + +def _std(x): + if len(x) < 2: + return 0.0 + m = _mean(x) + return math.sqrt(sum((v - m) ** 2 for v in x) / (len(x) - 1)) + + +def save_result(name, dims, gpus, times_ms, gflops): + os.makedirs(RESULTS_DIR, exist_ok=True) + N, M = dims + path = os.path.join(RESULTS_DIR, f"{name}_{MOD}.csv") + with open(path, "a") as io: + for i, (t, g) in enumerate(zip(times_ms, gflops), start=1): + io.write(f"{MOD},{gpus},{N},{M},{i},{t:.6f},{g:.6f}\n") diff --git a/benchmark/src_py/single.py b/benchmark/src_py/single.py new file mode 100644 index 00000000..005cda31 --- /dev/null +++ b/benchmark/src_py/single.py @@ -0,0 +1,48 @@ +# cupynumeric worker, run by run_benchmark.sh which sets LEGATE_CONFIG first. +# Args: +import os +import sys + +# Make `core` and the `benchmarks` package importable when run as a script. +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +from core import MOD, parse_type, trial, save_result, _mean, _std +from benchmarks import BENCHMARKS # import populates BENCHMARKS + + +def main(): + gpus = int(sys.argv[1]) + name = sys.argv[2] + T_str = sys.argv[3] + N = int(sys.argv[4]) + M = int(sys.argv[5]) + n_iter = int(sys.argv[6]) + n_warmup = int(sys.argv[7]) + n_trial = int(sys.argv[8]) + + if name not in BENCHMARKS: + raise ValueError( + f"No benchmark registered for '{name}'. 
Known: {', '.join(sorted(BENCHMARKS))}" + ) + T = parse_type(T_str) + bench = BENCHMARKS[name](T, N, M) + + print( + f"[{MOD}] {name} benchmark ({T_str}) on {N}x{M} for {n_iter} " + f"iterations ({n_warmup} warmup) x {n_trial} trials" + ) + + times_ms, gflops = [], [] + for _ in range(n_trial): + t, g = trial(bench, n_warmup, n_iter) + times_ms.append(t) + gflops.append(g) + + print(f"[{MOD}] Mean Run Time: {_mean(times_ms):.5f} ± {_std(times_ms):.5f} ms") + print(f"[{MOD}] FLOPS: {_mean(gflops):.5f} ± {_std(gflops):.5f} GFLOPS") + + save_result(bench.name, bench.dims(), gpus, times_ms, gflops) + + +if __name__ == "__main__": + main() From c068593fa67d617fbd3cfd90ef303ba8b23eb79b Mon Sep 17 00:00:00 2001 From: krasow Date: Wed, 3 Jun 2026 19:38:21 -0500 Subject: [PATCH 15/17] cuda -> cudajl as name --- benchmark/run.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmark/run.jl b/benchmark/run.jl index ff40e21d..6db80516 100644 --- a/benchmark/run.jl +++ b/benchmark/run.jl @@ -37,7 +37,7 @@ function cupynumeric_env_name() end function dispatch(; gpus, cpus, name, T, N, M, n_iter, n_warmup, n_trial, - cupynumeric=false, cuda=false) + cupynumeric=false, cudajl=false) banner( "$(name): T=$(T) gpus=$(gpus) cpus=$(cpus) N=$(N) M=$(M) " * "n_iter=$(n_iter) n_warmup=$(n_warmup) n_trial=$(n_trial)", @@ -47,7 +47,7 @@ function dispatch(; gpus, cpus, name, T, N, M, n_iter, n_warmup, n_trial, args = `--gpus $gpus --cpus $cpus $name $T $N $M $n_iter $n_warmup $n_trial` cmds = [`bash $RUNNER $WORKER $args cunumeric`] # CUDA.jl is single-GPU only - if cuda && gpus == 1 + if cudajl && gpus == 1 push!(cmds, `bash $RUNNER $WORKER $args cudajl`) end # cupynumeric has no code-path variants; only baseline benchmarks compare against it @@ -78,7 +78,7 @@ function run_all_benchmarks(config="benchmarks.toml") n_warmup=gs.n_warmup, n_trial=gs.n_trial, cupynumeric=gs.cupynumeric, - cuda=gs.cuda, + cudajl=gs.cuda, ) end end From 
3679fc92d28aca7633c45d0de8521c3278715e35 Mon Sep 17 00:00:00 2001 From: krasow Date: Wed, 3 Jun 2026 19:40:31 -0500 Subject: [PATCH 16/17] patch cudajl run path for cunumeric specific tests --- benchmark/run.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/benchmark/run.jl b/benchmark/run.jl index 6db80516..7d6c3bb4 100644 --- a/benchmark/run.jl +++ b/benchmark/run.jl @@ -18,6 +18,9 @@ const PY_WORKER = joinpath(@__DIR__, "src_py/single.py") banner(msg) = println("\n", "="^128, "\n", msg, "\n", "="^128) +# `_lifetimes` is a cuNumeric-only code-path variant (@analyze_lifetimes) +cunumeric_only(name) = endswith(name, "_lifetimes") + # ensure things are resolved and devlop'd properly function ensure_project_ready() Pkg.develop(; path=joinpath(@__DIR__, "..")) @@ -47,11 +50,10 @@ function dispatch(; gpus, cpus, name, T, N, M, n_iter, n_warmup, n_trial, args = `--gpus $gpus --cpus $cpus $name $T $N $M $n_iter $n_warmup $n_trial` cmds = [`bash $RUNNER $WORKER $args cunumeric`] # CUDA.jl is single-GPU only - if cudajl && gpus == 1 + if cudajl && gpus == 1 && !cunumeric_only(name) push!(cmds, `bash $RUNNER $WORKER $args cudajl`) end - # cupynumeric has no code-path variants; only baseline benchmarks compare against it - if cupynumeric && !endswith(name, "_lifetimes") + if cupynumeric && !cunumeric_only(name) push!(cmds, `bash $RUNNER $PY_WORKER --pyenv $(cupynumeric_env_name()) $args`) end From 943b1ec17d20fd685b124cde9819ed7bb75cde5a Mon Sep 17 00:00:00 2001 From: krasow Date: Wed, 3 Jun 2026 19:42:28 -0500 Subject: [PATCH 17/17] update readme --- benchmark/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/README.md b/benchmark/README.md index bd646666..753a0416 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -49,7 +49,7 @@ Repeat a `[[name]]` block to add independent configs. Any of `T`, `gpus`, `cpus`, `N`, `M` may be a list. 
They expand along two axes: - +- **`T` multiplies.** The whole sweep runs once per type. - **`gpus`, `cpus`, `N`, `M` zip** into a single lockstep sweep — element `i` of each is paired together.