From f1431e6fd4f655dcc24867d3844d067efbfbc198 Mon Sep 17 00:00:00 2001 From: Theo Heimel Date: Fri, 15 May 2026 12:26:34 +0200 Subject: [PATCH 01/10] single mallocAsync call in umami on GPU --- .../iolibs/template_files/madmatrix/umami.cc | 65 ++++++++++--------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/madgraph/iolibs/template_files/madmatrix/umami.cc b/madgraph/iolibs/template_files/madmatrix/umami.cc index 05731edf8..2a65060d5 100644 --- a/madgraph/iolibs/template_files/madmatrix/umami.cc +++ b/madgraph/iolibs/template_files/madmatrix/umami.cc @@ -336,22 +336,38 @@ extern "C" unsigned int *flavor_indices, *diagram_index; std::size_t n_coup = mg5amcGpu::Parameters_dependentCouplings::ndcoup; - gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); - gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); - gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); - gpuMallocAsync( &flavor_indices, rounded_count * sizeof( unsigned int ), gpu_stream ); - gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); - gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); - gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); - gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); - gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); - gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); - gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); - gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); - gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); - gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); - gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); - gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + std::array, 16> ptrs_and_sizes = {{ + {&momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype )}, + {&couplings, rounded_count * n_coup * 2 * sizeof( fptype )}, + {&g_s, rounded_count * sizeof( fptype )}, + {&flavor_indices, rounded_count * sizeof( unsigned int )}, + {&helicity_random, rounded_count * sizeof( fptype )}, + {&color_random, rounded_count * sizeof( fptype )}, + {&diagram_random, rounded_count * sizeof( fptype )}, + {&matrix_elements, rounded_count * sizeof( fptype )}, + {&diagram_index, rounded_count * sizeof( unsigned int )}, + {&color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype )}, + {&numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype )}, + {&denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype )}, + {&helicity_index, rounded_count * sizeof( int )}, + {&color_index, rounded_count * sizeof( int )}, + {&ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype )}, + {&ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype )}, + }}; + std::size_t total_size = 0; + for (auto [ptr, size] : ptrs_and_sizes) { + std::size_t aligned_size = (size + 7) / 8 * 8; + total_size += aligned_size; + } + uint8_t* buffer; + // we can consider caching this between matrix element calls + gpuMallocAsync( &buffer, total_size, gpu_stream ); + std::size_t offset = 0; + for (auto [ptr, size] : ptrs_and_sizes) { + std::size_t aligned_size = (size + 7) / 8 * 8; + *ptr = buffer + offset; + offset += aligned_size; + } copy_inputs<<>>( momenta_in, @@ -424,22 +440,7 @@ extern "C" offset ); checkGpu( gpuPeekAtLastError() ); - gpuFreeAsync( momenta, gpu_stream ); - gpuFreeAsync( couplings, gpu_stream ); - gpuFreeAsync( flavor_indices, gpu_stream ); - gpuFreeAsync( g_s, gpu_stream ); - gpuFreeAsync( helicity_random, gpu_stream ); - gpuFreeAsync( color_random, gpu_stream ); - gpuFreeAsync( diagram_random, gpu_stream ); - gpuFreeAsync( matrix_elements, gpu_stream ); - gpuFreeAsync( diagram_index, gpu_stream ); - gpuFreeAsync( color_jamps, gpu_stream ); - gpuFreeAsync( numerators, gpu_stream ); - gpuFreeAsync( denominators, gpu_stream ); - gpuFreeAsync( helicity_index, gpu_stream ); - gpuFreeAsync( color_index, gpu_stream ); - gpuFreeAsync( ghel_matrix_elements, gpu_stream ); - gpuFreeAsync( ghel_jamps, gpu_stream ); + gpuFreeAsync( buffer, gpu_stream ); #else // MGONGPUCPP_GPUIMPL // need to round to round to double page size for some reason std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; From 082c05391548dba853826507b4c3e5ab4b010217 Mon Sep 17 00:00:00 2001 From: Theo Heimel Date: Fri, 15 May 2026 12:40:46 +0200 Subject: [PATCH 02/10] fix common memory allocation --- .../iolibs/template_files/madmatrix/umami.cc | 58 ++++++++++--------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/madgraph/iolibs/template_files/madmatrix/umami.cc b/madgraph/iolibs/template_files/madmatrix/umami.cc index 2a65060d5..0ccf38b9a 100644 --- a/madgraph/iolibs/template_files/madmatrix/umami.cc +++ b/madgraph/iolibs/template_files/madmatrix/umami.cc @@ -12,6 +12,8 @@ #include "MemoryBuffers.h" #include +#include +#include #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; @@ -216,19 +218,19 @@ extern "C" } UmamiStatus umami_set_parameter( - UmamiHandle handle, - char const* name, - double parameter_real, - double parameter_imag ) + [[maybe_unused]] UmamiHandle handle, + [[maybe_unused]] char const* name, + [[maybe_unused]] double parameter_real, + [[maybe_unused]] double parameter_imag ) { return UMAMI_ERROR_NOT_IMPLEMENTED; } UmamiStatus umami_get_parameter( - UmamiHandle handle, - char const* name, - double* parameter_real, - double* parameter_imag ) + [[maybe_unused]] UmamiHandle handle, + [[maybe_unused]] char const* name, + [[maybe_unused]] double* parameter_real, + [[maybe_unused]] double* parameter_imag ) { return UMAMI_ERROR_NOT_IMPLEMENTED; } @@ -251,7 +253,7 @@ extern "C" const double* random_color_in = nullptr; const double* random_helicity_in = nullptr; const double* random_diagram_in = nullptr; - const int* diagram_in = nullptr; // TODO: unused + //const int* diagram_in = nullptr; // TODO: unused for( std::size_t i = 0; i < input_count; ++i ) { @@ -337,22 +339,22 @@ extern "C" std::size_t n_coup = mg5amcGpu::Parameters_dependentCouplings::ndcoup; std::array, 16> ptrs_and_sizes = {{ - {&momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype )}, - {&couplings, rounded_count * n_coup * 2 * sizeof( fptype )}, - {&g_s, rounded_count * sizeof( fptype )}, - {&flavor_indices, rounded_count * sizeof( unsigned int )}, - {&helicity_random, rounded_count * sizeof( fptype )}, - {&color_random, rounded_count * sizeof( fptype )}, - {&diagram_random, rounded_count * sizeof( fptype )}, - {&matrix_elements, rounded_count * sizeof( fptype )}, - {&diagram_index, rounded_count * sizeof( unsigned int )}, - {&color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype )}, - {&numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype )}, - {&denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype )}, - {&helicity_index, rounded_count * sizeof( int )}, - {&color_index, rounded_count * sizeof( int )}, - {&ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype )}, - {&ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype )}, + {reinterpret_cast(&momenta), rounded_count * CPPProcess::npar * 4 * sizeof( fptype )}, + {reinterpret_cast(&couplings), rounded_count * n_coup * 2 * sizeof( fptype )}, + {reinterpret_cast(&g_s), rounded_count * sizeof( fptype )}, + {reinterpret_cast(&flavor_indices), rounded_count * sizeof( unsigned int )}, + {reinterpret_cast(&helicity_random), rounded_count * sizeof( fptype )}, + {reinterpret_cast(&color_random), rounded_count * sizeof( fptype )}, + {reinterpret_cast(&diagram_random), rounded_count * sizeof( fptype )}, + {reinterpret_cast(&matrix_elements), rounded_count * sizeof( fptype )}, + {reinterpret_cast(&diagram_index), rounded_count * sizeof( unsigned int )}, + {reinterpret_cast(&color_jamps), rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype )}, + {reinterpret_cast(&numerators), rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype )}, + {reinterpret_cast(&denominators), rounded_count * CPPProcess::ncomb * sizeof( fptype )}, + {reinterpret_cast(&helicity_index), rounded_count * sizeof( int )}, + {reinterpret_cast(&color_index), rounded_count * sizeof( int )}, + {reinterpret_cast(&ghel_matrix_elements), rounded_count * CPPProcess::ncomb * sizeof( fptype )}, + {reinterpret_cast(&ghel_jamps), rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype )}, }}; std::size_t total_size = 0; for (auto [ptr, size] : ptrs_and_sizes) { @@ -362,11 +364,11 @@ extern "C" uint8_t* buffer; // we can consider caching this between matrix element calls gpuMallocAsync( &buffer, total_size, gpu_stream ); - std::size_t offset = 0; + std::size_t buf_offset = 0; for (auto [ptr, size] : ptrs_and_sizes) { std::size_t aligned_size = (size + 7) / 8 * 8; - *ptr = buffer + offset; - offset += aligned_size; + *ptr = buffer + buf_offset; + buf_offset += aligned_size; } copy_inputs<<>>( From b61c32cb75881d392770e627a04b16ef6a2ad4f2 Mon Sep 17 00:00:00 2001 From: Theo Heimel Date: Fri, 15 May 2026 12:42:08 +0200 Subject: [PATCH 03/10] update run_card.toml --- madgraph/iolibs/template_files/mg7/run_card.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/madgraph/iolibs/template_files/mg7/run_card.toml b/madgraph/iolibs/template_files/mg7/run_card.toml index f965b2707..de6cacb8b 100644 --- a/madgraph/iolibs/template_files/mg7/run_card.toml +++ b/madgraph/iolibs/template_files/mg7/run_card.toml @@ -15,6 +15,7 @@ output_format = "compact_npy" # options: compact_npy, lhe_npy, lhe verbosity = "pretty" # options: silent, pretty, log dummy_matrix_element = false save_gridpack = false +gridpack_include_source = false [beam] e_cm = 13000.0 From 4f11d133cccc5e3b9848437c3eef815f7bd452e8 Mon Sep 17 00:00:00 2001 From: Theo Heimel Date: Fri, 15 May 2026 12:47:01 +0200 Subject: [PATCH 04/10] fix unused warning, fix madnis.enable setting --- madgraph/iolibs/template_files/madmatrix/umami.cc | 2 +- madgraph/iolibs/template_files/mg7/madevent.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/madgraph/iolibs/template_files/madmatrix/umami.cc b/madgraph/iolibs/template_files/madmatrix/umami.cc index 0ccf38b9a..932a8a7b3 100644 --- a/madgraph/iolibs/template_files/madmatrix/umami.cc +++ b/madgraph/iolibs/template_files/madmatrix/umami.cc @@ -253,7 +253,7 @@ extern "C" const double* random_color_in = nullptr; const double* random_helicity_in = nullptr; const double* random_diagram_in = nullptr; - //const int* diagram_in = nullptr; // TODO: unused + [[maybe_unused]] const int* diagram_in = nullptr; // TODO: unused for( std::size_t i = 0; i < input_count; ++i ) { diff --git a/madgraph/iolibs/template_files/mg7/madevent.py b/madgraph/iolibs/template_files/mg7/madevent.py index 47e7e680d..c2120e892 100644 --- a/madgraph/iolibs/template_files/mg7/madevent.py +++ b/madgraph/iolibs/template_files/mg7/madevent.py @@ -393,12 +393,15 @@ def survey(self) -> None: def train_madnis(self) -> None: madnis_args = self.run_card["madnis"] - gen_args = self.run_card["generation"] - run_args = self.run_card["run"] + if not madnis_args["enable"]: + return if madnis_args.get("old", False): self.train_madnis_old() return + gen_args = self.run_card["generation"] + run_args = self.run_card["run"] + config = ms.MadnisConfig() config.verbosity = run_args["verbosity"] config.learning_rate = madnis_args["lr"] From fb610e9a7ec2d28bf3ff253d9871c5609597a1f9 Mon Sep 17 00:00:00 2001 From: Theo Heimel Date: Thu, 21 May 2026 15:06:26 +0200 Subject: [PATCH 05/10] fully async gpu matrix elements --- .../template_files/madmatrix/GpuAbstraction.h | 2 + .../madmatrix/MatrixElementKernels.cc | 2 +- .../template_files/madmatrix/color_sum.cc | 40 +++++++++++-------- .../template_files/madmatrix/color_sum.h | 24 ++++++----- .../process_function_definitions.inc | 3 +- .../template_files/madmatrix/process_h.inc | 1 + .../madmatrix/process_sigmaKin_function.inc | 28 +++++++------ .../iolibs/template_files/madmatrix/umami.cc | 15 +------ madmatrix/model_handling.py | 10 ++++- 9 files changed, 70 insertions(+), 55 deletions(-) diff --git a/madgraph/iolibs/template_files/madmatrix/GpuAbstraction.h b/madgraph/iolibs/template_files/madmatrix/GpuAbstraction.h index ac2cbc5a1..6f709dfe9 100644 --- a/madgraph/iolibs/template_files/madmatrix/GpuAbstraction.h +++ b/madgraph/iolibs/template_files/madmatrix/GpuAbstraction.h @@ -46,6 +46,7 @@ #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) //#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> #define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernel2D( kernel, blocks_x, blocks_y, threads, stream, ... ) kernel<<>>( __VA_ARGS__ ) #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) @@ -113,6 +114,7 @@ #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) //#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> #define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernel2D( kernel, blocks_x, blocks_y, threads, stream, ... ) kernel<<>>( __VA_ARGS__ ) #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) diff --git a/madgraph/iolibs/template_files/madmatrix/MatrixElementKernels.cc b/madgraph/iolibs/template_files/madmatrix/MatrixElementKernels.cc index a019296df..872e4795e 100644 --- a/madgraph/iolibs/template_files/madmatrix/MatrixElementKernels.cc +++ b/madgraph/iolibs/template_files/madmatrix/MatrixElementKernels.cc @@ -490,7 +490,7 @@ namespace mg5amcGpu gpuBlasHandle_t* pBlasHandle = nullptr; #endif const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, false, m_gpublocks, m_gputhreads ); #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; copyHostFromDevice( m_hstChannelIds, m_channelIds ); // FIXME?! diff --git a/madgraph/iolibs/template_files/madmatrix/color_sum.cc b/madgraph/iolibs/template_files/madmatrix/color_sum.cc index fcfa4ee51..75202e963 100644 --- a/madgraph/iolibs/template_files/madmatrix/color_sum.cc +++ b/madgraph/iolibs/template_files/madmatrix/color_sum.cc @@ -377,21 +377,25 @@ namespace mg5amcCpu gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) const int nGoodHel, // input: number of good helicities - const int gpublocks, // input: cuda gpublocks - const int gputhreads ) // input: cuda gputhreads + const int gputhreads, // input: cuda gputhreads + const bool async ) // input: if true, run everything asynchronously in first stream in ghelStreams { const int nevt = gpublocks * gputhreads; // CASE 1: KERNEL if( !pBlasHandle ) { assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set - // Loop over helicities - for( int ighel = 0; ighel < nGoodHel; ighel++ ) - { - fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel - const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel - gpuStream_t hStream = ghelStreams[ighel]; - gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + if (async) { + gpuLaunchKernel2D( calculate_jamps, gpublocks, nGoodHel, gputhreads, ghelStreams[0], ghelAllMEs, ghelAllJamps, nGoodHel, nevt ); + } else { + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel, 0 ); + } } } // CASE 2: BLAS @@ -400,15 +404,19 @@ namespace mg5amcCpu #ifdef MGONGPU_HAS_NO_BLAS assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas #else - checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed - // Reset the tmp buffer + if (async) { + assert( false ); // BLAS in async mode not supported for now + } else { + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); #else - gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); -#endif - // Delegate the color sum to BLAS for - color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); + } #endif } } diff --git a/madgraph/iolibs/template_files/madmatrix/color_sum.h b/madgraph/iolibs/template_files/madmatrix/color_sum.h index b64febcdb..347184c4e 100644 --- a/madgraph/iolibs/template_files/madmatrix/color_sum.h +++ b/madgraph/iolibs/template_files/madmatrix/color_sum.h @@ -78,23 +78,25 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL void - color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) - const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities - fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) - gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle - gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) - const int nGoodHel, // input: number of good helicities - const int gpublocks, // input: cuda gpublocks - const int gputhreads ); // input: cuda gputhreads + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads, // input: cuda gputhreads + const bool processAllHelicities); // input: if true, use blockIdx.y to index helicities #endif //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL __global__ void - color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity - const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity - const int nGoodHel ); // input: number of good helicities + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel, // input: number of good helicities + const int nevtIfAllHelicities); // input: zero in single-helicity mode, number of events in multi-helicity mode #endif //-------------------------------------------------------------------------- diff --git a/madgraph/iolibs/template_files/madmatrix/process_function_definitions.inc b/madgraph/iolibs/template_files/madmatrix/process_function_definitions.inc index c0f130b95..ee36fb590 100644 --- a/madgraph/iolibs/template_files/madmatrix/process_function_definitions.inc +++ b/madgraph/iolibs/template_files/madmatrix/process_function_definitions.inc @@ -462,7 +462,7 @@ namespace mg5amcCpu // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, iflavorVec, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); - gpuLaunchKernel( color_sum_kernel, gpublocks, gputhreads, allMEs, allJamps, nOneHel ); + gpuLaunchKernel( color_sum_kernel, gpublocks, gputhreads, allMEs, allJamps, nOneHel, 0 ); gpuMemcpy( hstMEs, allMEs, maxtry * sizeof( fptype ), gpuMemcpyDeviceToHost ); //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << std::endl; for( int ievt = 0; ievt < maxtry; ++ievt ) @@ -806,6 +806,7 @@ namespace mg5amcCpu fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const bool async, const int gpublocks, // input: cuda gpublocks const int gputhreads // input: cuda gputhreads #else diff --git a/madgraph/iolibs/template_files/madmatrix/process_h.inc b/madgraph/iolibs/template_files/madmatrix/process_h.inc index 3fb87b052..fb715a877 100644 --- a/madgraph/iolibs/template_files/madmatrix/process_h.inc +++ b/madgraph/iolibs/template_files/madmatrix/process_h.inc @@ -99,6 +99,7 @@ namespace mg5amcCpu fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const bool async, // input: if true, run everything asynchronously in first stream in ghelStreams const int gpublocks, // input: cuda gpublocks const int gputhreads ); // input: cuda gputhreads #else diff --git a/madgraph/iolibs/template_files/madmatrix/process_sigmaKin_function.inc b/madgraph/iolibs/template_files/madmatrix/process_sigmaKin_function.inc index 6db9d8079..eafb0eece 100644 --- a/madgraph/iolibs/template_files/madmatrix/process_sigmaKin_function.inc +++ b/madgraph/iolibs/template_files/madmatrix/process_sigmaKin_function.inc @@ -51,19 +51,23 @@ // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - const int ihel = cGoodHel[ighel]; - fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; - fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, iflavorVec, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + if (async) { + gpuLaunchKernel2D( calculate_jamps, gpublocks, cNGoodHel, gputhreads, ghelStreams[0], allmomenta, allcouplings, ghelAllJamps, allChannelIds, ghelAllNumerators, ghelAllDenominators, colAllJamp2s, nevt, true ); + } else { + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) + { + const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, iflavorVec, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt, false ); + } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color } - // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps - color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); - checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed - // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); diff --git a/madgraph/iolibs/template_files/madmatrix/umami.cc b/madgraph/iolibs/template_files/madmatrix/umami.cc index 932a8a7b3..05ad57498 100644 --- a/madgraph/iolibs/template_files/madmatrix/umami.cc +++ b/madgraph/iolibs/template_files/madmatrix/umami.cc @@ -158,9 +158,6 @@ namespace struct InterfaceInstance { bool initialized = false; -#ifdef MGONGPUCPP_GPUIMPL - gpuStream_t hel_streams[CPPProcess::ncomb]; -#endif }; } @@ -208,12 +205,6 @@ extern "C" process.initProc( param_card_path ); auto instance = new InterfaceInstance(); *handle = instance; -#ifdef MGONGPUCPP_GPUIMPL - for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) - { - gpuStreamCreate( &instance->hel_streams[ihel] ); - } -#endif return UMAMI_SUCCESS; } @@ -389,9 +380,6 @@ extern "C" offset ); computeDependentCouplings<<>>( g_s, couplings ); checkGpu( gpuPeekAtLastError() ); - // TODO: make things fully async (requires using events instead of synchronize in - // the sigmaKin implementation) - gpuStreamSynchronize( gpu_stream ); InterfaceInstance* instance = static_cast( handle ); if( !instance->initialized ) @@ -421,7 +409,8 @@ extern "C" ghel_jamps, nullptr, nullptr, - instance->hel_streams, + &gpu_stream, + true, n_blocks, n_threads ); diff --git a/madmatrix/model_handling.py b/madmatrix/model_handling.py index 94215209b..e1e007fab 100644 --- a/madmatrix/model_handling.py +++ b/madmatrix/model_handling.py @@ -1725,7 +1725,8 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) - const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + const int nevt, // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + const bool processAllHelicities // input: if true, use blockIdx.y to index helicities #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity bool storeChannelWeights, @@ -1765,6 +1766,13 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): //if( debug ) printf( \"calculate_jamps: ievt00=%d ihel=%2d\\n\", ievt00, ihel ); #else //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + if (processAllHelicities) { + int ighel = blockIdx.y; + ihel = dcGoodHel[ighel]; + allJamps = allJamps + ighel * nevt; + allNumerators = allNumerators + ighel * nevt; + allDenominators = allDenominators + ighel * nevt; + } //debug = ( ievt == 0 ); //if( debug ) printf( \"calculate_jamps: ievt=%6d ihel=%2d\\n\", ievt, ihel ); #endif /* clang-format on */""") From a85f8ca3f5d7b749e25975c841eba29909607c28 Mon Sep 17 00:00:00 2001 From: Theo Heimel Date: Fri, 22 May 2026 12:23:58 +0200 Subject: [PATCH 06/10] reorder events in simd mode such that the flavor index is the same for all events in a vector --- .../iolibs/template_files/madmatrix/umami.cc | 82 ++++++++++++++----- 1 file changed, 63 insertions(+), 19 deletions(-) diff --git a/madgraph/iolibs/template_files/madmatrix/umami.cc b/madgraph/iolibs/template_files/madmatrix/umami.cc index 05ad57498..bc3e74c90 100644 --- a/madgraph/iolibs/template_files/madmatrix/umami.cc +++ b/madgraph/iolibs/template_files/madmatrix/umami.cc @@ -12,6 +12,7 @@ #include "MemoryBuffers.h" #include +#include #include #include @@ -74,18 +75,18 @@ namespace __device__ #endif void - transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event_in, std::size_t i_event_out, std::size_t stride ) { std::size_t page_size = MemoryAccessMomentaBase::neppM; - std::size_t i_page = i_event / page_size; - std::size_t i_vector = i_event % page_size; + std::size_t i_page = i_event_out / page_size; + std::size_t i_vector = i_event_out % page_size; for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) { for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) { momenta_out[i_page * CPPProcess::npar * 4 * page_size + - i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event_in]; } } } @@ -112,7 +113,7 @@ namespace std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; if( i_event >= count ) return; - transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + transpose_momenta( &momenta_in[offset], momenta, i_event, i_event, stride ); diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; @@ -433,14 +434,45 @@ extern "C" gpuFreeAsync( buffer, gpu_stream ); #else // MGONGPUCPP_GPUIMPL + constexpr std::size_t vector_size = MemoryAccessMomentaBase::neppM; // need to round to round to double page size for some reason - std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; - std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + constexpr std::size_t page_size2 = 2 * vector_size; + std::vector permutation; + std::size_t rounded_count; + + constexpr std::size_t flavor_count = CPPProcess::nmaxflavor; + HostBufferBase flavor_indices( ((count + page_size2 - 1) / page_size2 + flavor_count) * page_size2 ); + bool sort_flavors = vector_size > 1 && flavor_count > 1 && flavor_indices_in; + if ( sort_flavors ) { + permutation.resize(count); + std::size_t voffset = 0; + std::size_t vector_indices[flavor_count] = {}; + std::size_t vector_counts[flavor_count] = {}; + // determine permutation of inputs such that all entries in a SIMD vector + // have the same flavor index + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + unsigned int flav = flavor_indices_in[i_event + offset]; + auto& vcount = vector_counts[flav]; + auto& vindex = vector_indices[flav]; + if (vcount == 0) { + vindex = voffset * page_size2; + for ( std::size_t i = 0; i < page_size2; ++i) { + flavor_indices[voffset * page_size2 + i] = flav; + } + voffset += 1; + } + permutation[i_event] = vindex + vcount; + vcount = (vcount + 1) % page_size2; + } + rounded_count = voffset * page_size2; + } else { + rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + } HostBufferBase momenta( rounded_count * CPPProcess::npar * 4 ); HostBufferBase couplings( rounded_count * mg5amcCpu::Parameters_dependentCouplings::ndcoup * 2 ); HostBufferBase g_s( rounded_count ); - HostBufferBase flavor_indices( rounded_count ); HostBufferBase helicity_random( rounded_count ); HostBufferBase color_random( rounded_count ); HostBufferBase diagram_random( rounded_count ); @@ -450,17 +482,29 @@ extern "C" HostBufferBase denominators( rounded_count ); HostBufferBase helicity_index( rounded_count ); HostBufferBase color_index( rounded_count ); - for( std::size_t i_event = 0; i_event < count; ++i_event ) - { - transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); - helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; - color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; - diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; - g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; - flavor_indices[i_event] = flavor_indices_in ? flavor_indices_in[i_event + offset] : 0; - } - for ( std::size_t i_event = count; i_event < rounded_count; ++i_event ) { - flavor_indices[i_event] = 0; + if ( sort_flavors ) { + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_sorted = permutation[i_event]; + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, i_sorted, stride ); + helicity_random[i_sorted] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_sorted] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_sorted] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_sorted] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + } else { + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + flavor_indices[i_event] = flavor_indices_in ? flavor_indices_in[i_event + offset] : 0; + } + for ( std::size_t i_event = count; i_event < rounded_count; ++i_event ) { + flavor_indices[i_event] = 0; + } } computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); From cfd6aed3188546ab16a49df4f26286625dbdec56 Mon Sep 17 00:00:00 2001 From: Theo Heimel Date: Fri, 22 May 2026 14:16:56 +0200 Subject: [PATCH 07/10] apply flavor ordering to matrix element outputs --- .../iolibs/template_files/madmatrix/umami.cc | 88 +++++++++++++------ 1 file changed, 63 insertions(+), 25 deletions(-) diff --git a/madgraph/iolibs/template_files/madmatrix/umami.cc b/madgraph/iolibs/template_files/madmatrix/umami.cc index bc3e74c90..aa457c7a6 100644 --- a/madgraph/iolibs/template_files/madmatrix/umami.cc +++ b/madgraph/iolibs/template_files/madmatrix/umami.cc @@ -443,7 +443,8 @@ extern "C" constexpr std::size_t flavor_count = CPPProcess::nmaxflavor; HostBufferBase flavor_indices( ((count + page_size2 - 1) / page_size2 + flavor_count) * page_size2 ); bool sort_flavors = vector_size > 1 && flavor_count > 1 && flavor_indices_in; - if ( sort_flavors ) { + if ( sort_flavors ) + { permutation.resize(count); std::size_t voffset = 0; std::size_t vector_indices[flavor_count] = {}; @@ -455,7 +456,8 @@ extern "C" unsigned int flav = flavor_indices_in[i_event + offset]; auto& vcount = vector_counts[flav]; auto& vindex = vector_indices[flav]; - if (vcount == 0) { + if ( vcount == 0 ) + { vindex = voffset * page_size2; for ( std::size_t i = 0; i < page_size2; ++i) { flavor_indices[voffset * page_size2 + i] = flav; @@ -539,35 +541,71 @@ extern "C" false, rounded_count ); - std::size_t page_size = MemoryAccessMomentaBase::neppM; - for( std::size_t i_event = 0; i_event < count; ++i_event ) + if ( sort_flavors ) { - std::size_t i_page = i_event / page_size; - std::size_t i_vector = i_event % page_size; - - double denominator = denominators[i_event]; - if( m2_out != nullptr ) - { - m2_out[i_event + offset] = matrix_elements[i_event]; - } - if( amp2_out != nullptr ) + for( std::size_t i_event = 0; i_event < count; ++i_event ) { - for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + std::size_t i_sorted = permutation[i_event]; + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_sorted / page_size; + std::size_t i_vector = i_sorted % page_size; + + double denominator = denominators[i_sorted]; + if( m2_out != nullptr ) { - amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + m2_out[i_event + offset] = matrix_elements[i_sorted]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_sorted] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_sorted] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_sorted] - 1; } } - if( diagram_out != nullptr ) - { - diagram_out[i_event + offset] = diagram_index[i_event] - 1; - } - if( color_out != nullptr ) - { - color_out[i_event + offset] = color_index[i_event] - 1; - } - if( helicity_out != nullptr ) + } else { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) { - helicity_out[i_event + offset] = helicity_index[i_event] - 1; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } } } #endif // MGONGPUCPP_GPUIMPL From d147d36ea0ae0803207663890bcb381c69505902 Mon Sep 17 00:00:00 2001 From: Theo Heimel Date: Wed, 27 May 2026 16:37:14 +0200 Subject: [PATCH 08/10] fix SIMD flavor selection --- madmatrix/model_handling.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/madmatrix/model_handling.py b/madmatrix/model_handling.py index e1e007fab..95a16eee8 100644 --- a/madmatrix/model_handling.py +++ b/madmatrix/model_handling.py @@ -2308,10 +2308,11 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi // Scalar iflavor for the current event // for GPU it is an int // for SIMD it is also an int, since it is constant across the SIMD vector - const uint_sv iflavor_sv = F_ACCESS::kernelAccessConst( iflavorVec ); #ifdef MGONGPUCPP_GPUIMPL - const unsigned int iflavor = iflavor_sv; + const unsigned int iflavor = F_ACCESS::kernelAccessConst( iflavorVec ); #else + const unsigned int* iflavor_rec = F_ACCESS::ieventAccessRecordConst( iflavorVec, ievt0 ); + const uint_sv iflavor_sv = F_ACCESS::kernelAccessConst( iflavor_rec ); const unsigned int iflavor = reinterpret_cast(&iflavor_sv)[0]; #endif """) From 952ede5bdae2aed43b7b47c178a44e813a8db41b Mon Sep 17 00:00:00 2001 From: Theo Heimel Date: Thu, 28 May 2026 09:42:36 +0200 Subject: [PATCH 09/10] fix flavor sampling and limitation of open files --- madgraph/iolibs/template_files/mg7/madevent.py | 7 ++++++- madspace/src/phasespace/integrand.cpp | 12 +++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/madgraph/iolibs/template_files/mg7/madevent.py b/madgraph/iolibs/template_files/mg7/madevent.py index c2120e892..55bc70bdb 100644 --- a/madgraph/iolibs/template_files/mg7/madevent.py +++ b/madgraph/iolibs/template_files/mg7/madevent.py @@ -10,6 +10,7 @@ from dataclasses import dataclass from typing import Literal, NamedTuple import tomllib +import resource if "LHAPDF_DATA_PATH" in os.environ: PDF_PATH = os.environ["LHAPDF_DATA_PATH"] @@ -1167,7 +1168,7 @@ def build_vegas(self, mapping: ms.PhaseSpaceMapping, prefix: str) -> ms.VegasMap def build_discrete( self, permutation_count: int, flavor_count: int, prefix: str ) -> tuple[ms.DiscreteSampler | None, ms.DiscreteSampler | None]: - return None, None + #return None, None discrete_before = None #if permutation_count > 1: # discrete_before = ms.DiscreteSampler( @@ -1347,6 +1348,10 @@ def main() -> None: if args.ask_edit_cards: ask_edit_cards() + # Remove soft limit on number of open files as it can be quite low on some systems + soft_lim, hard_lim = resource.getrlimit(resource.RLIMIT_NOFILE) + resource.setrlimit(resource.RLIMIT_NOFILE, (hard_lim, hard_lim)) + process = MadgraphProcess() process.survey() process.train_madnis() diff --git a/madspace/src/phasespace/integrand.cpp b/madspace/src/phasespace/integrand.cpp index fa5453a41..ef86b23b0 100644 --- a/madspace/src/phasespace/integrand.cpp +++ b/madspace/src/phasespace/integrand.cpp @@ -129,14 +129,14 @@ Integrand::Integrand( } if (flav_count > 1 && !std::holds_alternative(discrete_after)) { - ret_types.push_back("flavor_index", batch_int); + ret_types.push_back("discrete_flavor_index", batch_int); } } if (flags & return_discrete_latent) { ret_types.push_back("channel_index_in_group", batch_int); if (flav_count > 1 && !std::holds_alternative(discrete_after)) { - ret_types.push_back("flavor_index", batch_int); + ret_types.push_back("discrete_flavor_index", batch_int); if (pdf_grid && energy_scale) { ret_types.push_back("pdf_prior", batch_float_array(flav_count)); } @@ -640,7 +640,8 @@ NamedVector Integrand::build_common_part( !std::holds_alternative(_discrete_after)) { auto zeros = fb.full({static_cast(0), args.batch_size}); outputs.push_back( - "flavor_index", scatter_or_drop(fb, result, zeros, result.flavor_id()) + "discrete_flavor_index", + scatter_or_drop(fb, result, zeros, result.flavor_id()) ); } } @@ -653,7 +654,8 @@ NamedVector Integrand::build_common_part( !std::holds_alternative(_discrete_after)) { auto zeros = fb.full({static_cast(0), args.batch_size}); outputs.push_back( - "flavor_index", scatter_or_drop(fb, result, zeros, result.flavor_id()) + "discrete_flavor_index", + scatter_or_drop(fb, result, zeros, result.flavor_id()) ); if (args.has_pdf_prior) { auto flav_count = _diff_xs.pid_options().size(); @@ -795,7 +797,7 @@ IntegrandProbability::IntegrandProbability(const Integrand& integrand) : auto flavor_count = integrand._diff_xs.pid_options().size(); if (flavor_count > 1 && !std::holds_alternative(integrand._discrete_after)) { - arg_types.push_back("flavor_index", batch_int); + arg_types.push_back("discrete_flavor_index", batch_int); if ((integrand._pdfs.at(0) || integrand._pdfs.at(1)) && integrand._energy_scale) { arg_types.push_back("pdf_prior", batch_float_array(flavor_count)); From 412df9d75e7e65710f1beec728caf5f00220a9ea Mon Sep 17 00:00:00 2001 From: Theo Heimel Date: Thu, 28 May 2026 12:35:00 +0200 Subject: [PATCH 10/10] fix async gpu matrix element --- .../template_files/madmatrix/color_sum.cc | 21 ++++++++++++------- .../process_function_definitions.inc | 2 +- .../madmatrix/process_sigmaKin_function.inc | 8 +++---- .../iolibs/template_files/madmatrix/umami.cc | 6 ------ madmatrix/model_handling.py | 6 +++--- 5 files changed, 22 insertions(+), 21 deletions(-) diff --git a/madgraph/iolibs/template_files/madmatrix/color_sum.cc b/madgraph/iolibs/template_files/madmatrix/color_sum.cc index 75202e963..30c679993 100644 --- a/madgraph/iolibs/template_files/madmatrix/color_sum.cc +++ b/madgraph/iolibs/template_files/madmatrix/color_sum.cc @@ -156,10 +156,16 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL __global__ void - color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity - const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity - const int nGoodHel ) // input: number of good helicities + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel, // input: number of good helicities + const int nevtIfAllHelicities ) // input: zero in single-helicity mode, number of events in multi-helicity mode { + if (nevtIfAllHelicities) { + int ighel = blockIdx.y; + allMEs = allMEs + ighel * nevtIfAllHelicities; // MEs for one specific helicity ighel + allJamps = allJamps + ighel * nevtIfAllHelicities; // Jamps for one specific helicity ighel + } using J_ACCESS = DeviceAccessJamp; fptype jampR[ncolor]; fptype jampI[ncolor]; @@ -377,16 +383,17 @@ namespace mg5amcCpu gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks const int gputhreads, // input: cuda gputhreads - const bool async ) // input: if true, run everything asynchronously in first stream in ghelStreams + const bool processAllHelicities ) // input: if true, use blockIdx.y to index helicities { const int nevt = gpublocks * gputhreads; // CASE 1: KERNEL if( !pBlasHandle ) { assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set - if (async) { - gpuLaunchKernel2D( calculate_jamps, gpublocks, nGoodHel, gputhreads, ghelStreams[0], ghelAllMEs, ghelAllJamps, nGoodHel, nevt ); + if (processAllHelicities) { + gpuLaunchKernel2D( color_sum_kernel, gpublocks, nGoodHel, gputhreads, ghelStreams[0], ghelAllMEs, ghelAllJamps, nGoodHel, nevt ); } else { // Loop over helicities for( int ighel = 0; ighel < nGoodHel; ighel++ ) @@ -404,7 +411,7 @@ namespace mg5amcCpu #ifdef MGONGPU_HAS_NO_BLAS assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas #else - if (async) { + if (processAllHelicities) { assert( false ); // BLAS in async mode not supported for now } else { checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed diff --git a/madgraph/iolibs/template_files/madmatrix/process_function_definitions.inc b/madgraph/iolibs/template_files/madmatrix/process_function_definitions.inc index ee36fb590..815fe5e40 100644 --- a/madgraph/iolibs/template_files/madmatrix/process_function_definitions.inc +++ b/madgraph/iolibs/template_files/madmatrix/process_function_definitions.inc @@ -461,7 +461,7 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, iflavorVec, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, iflavorVec, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads, false ); gpuLaunchKernel( color_sum_kernel, gpublocks, gputhreads, allMEs, allJamps, nOneHel, 0 ); gpuMemcpy( hstMEs, allMEs, maxtry * sizeof( fptype ), gpuMemcpyDeviceToHost ); //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << std::endl; diff --git a/madgraph/iolibs/template_files/madmatrix/process_sigmaKin_function.inc b/madgraph/iolibs/template_files/madmatrix/process_sigmaKin_function.inc index eafb0eece..227301a6e 100644 --- a/madgraph/iolibs/template_files/madmatrix/process_sigmaKin_function.inc +++ b/madgraph/iolibs/template_files/madmatrix/process_sigmaKin_function.inc @@ -51,8 +51,10 @@ // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; if (async) { - gpuLaunchKernel2D( calculate_jamps, gpublocks, cNGoodHel, gputhreads, ghelStreams[0], allmomenta, allcouplings, ghelAllJamps, allChannelIds, ghelAllNumerators, ghelAllDenominators, colAllJamp2s, nevt, true ); + gpuLaunchKernel2D( calculate_jamps, gpublocks, cNGoodHel, gputhreads, ghelStreams[0], 0, allmomenta, allcouplings, iflavorVec, ghelAllJamps, storeChannelWeights, ghelAllNumerators, ghelAllDenominators, colAllJamp2s, nevt, true ); + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads, true ); } else { for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { @@ -60,18 +62,16 @@ fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, iflavorVec, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt, false ); } // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps - color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads, false ); checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color } // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); - bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, iflavorVec, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); // Event-by-event random choice of color and diagram #402 diff --git a/madgraph/iolibs/template_files/madmatrix/umami.cc b/madgraph/iolibs/template_files/madmatrix/umami.cc index aa457c7a6..c3c90e983 100644 --- a/madgraph/iolibs/template_files/madmatrix/umami.cc +++ b/madgraph/iolibs/template_files/madmatrix/umami.cc @@ -615,12 +615,6 @@ extern "C" UmamiStatus umami_free( UmamiHandle handle ) { InterfaceInstance* instance = static_cast( handle ); -#ifdef MGONGPUCPP_GPUIMPL - for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) - { - if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); - } -#endif delete instance; return UMAMI_SUCCESS; } diff --git a/madmatrix/model_handling.py b/madmatrix/model_handling.py index e1e007fab..00d94df4f 100644 --- a/madmatrix/model_handling.py +++ b/madmatrix/model_handling.py @@ -1766,15 +1766,15 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): //if( debug ) printf( \"calculate_jamps: ievt00=%d ihel=%2d\\n\", ievt00, ihel ); #else //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( \"calculate_jamps: ievt=%6d ihel=%2d\\n\", ievt, ihel ); if (processAllHelicities) { int ighel = blockIdx.y; ihel = dcGoodHel[ighel]; allJamps = allJamps + ighel * nevt; - allNumerators = allNumerators + ighel * nevt; + allNumerators = allNumerators + ighel * nevt * processConfig::ndiagrams; allDenominators = allDenominators + ighel * nevt; } - //debug = ( ievt == 0 ); - //if( debug ) printf( \"calculate_jamps: ievt=%6d ihel=%2d\\n\", ievt, ihel ); #endif /* clang-format on */""") nwavefuncs = self.matrix_elements[0].get_number_of_wavefunctions() ret_lines.append("""