diff --git a/madgraph/iolibs/template_files/madmatrix/GpuAbstraction.h b/madgraph/iolibs/template_files/madmatrix/GpuAbstraction.h index ac2cbc5a1..6f709dfe9 100644 --- a/madgraph/iolibs/template_files/madmatrix/GpuAbstraction.h +++ b/madgraph/iolibs/template_files/madmatrix/GpuAbstraction.h @@ -46,6 +46,7 @@ #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) //#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> #define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernel2D( kernel, blocks_x, blocks_y, threads, stream, ... ) kernel<<>>( __VA_ARGS__ ) #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) @@ -113,6 +114,7 @@ #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) //#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> #define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernel2D( kernel, blocks_x, blocks_y, threads, stream, ... ) kernel<<>>( __VA_ARGS__ ) #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) diff --git a/madgraph/iolibs/template_files/madmatrix/MatrixElementKernels.cc b/madgraph/iolibs/template_files/madmatrix/MatrixElementKernels.cc index a019296df..872e4795e 100644 --- a/madgraph/iolibs/template_files/madmatrix/MatrixElementKernels.cc +++ b/madgraph/iolibs/template_files/madmatrix/MatrixElementKernels.cc @@ -490,7 +490,7 @@ namespace mg5amcGpu gpuBlasHandle_t* pBlasHandle = nullptr; #endif const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, false, m_gpublocks, m_gputhreads ); #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; copyHostFromDevice( m_hstChannelIds, m_channelIds ); // FIXME?! diff --git a/madgraph/iolibs/template_files/madmatrix/color_sum.cc b/madgraph/iolibs/template_files/madmatrix/color_sum.cc index fcfa4ee51..30c679993 100644 --- a/madgraph/iolibs/template_files/madmatrix/color_sum.cc +++ b/madgraph/iolibs/template_files/madmatrix/color_sum.cc @@ -156,10 +156,16 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL __global__ void - color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity - const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity - const int nGoodHel ) // input: number of good helicities + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel, // input: number of good helicities + const int nevtIfAllHelicities ) // input: zero in single-helicity mode, number of events in multi-helicity mode { + if (nevtIfAllHelicities) { + int ighel = blockIdx.y; + allMEs = allMEs + ighel * nevtIfAllHelicities; // MEs for one specific helicity ighel + allJamps = allJamps + ighel * nevtIfAllHelicities; // Jamps for one specific helicity ighel + } using J_ACCESS = DeviceAccessJamp; fptype jampR[ncolor]; fptype jampI[ncolor]; @@ -378,20 +384,25 @@ namespace mg5amcCpu gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) const int nGoodHel, // input: number of good helicities const int gpublocks, // input: cuda gpublocks - const int gputhreads ) // input: cuda gputhreads + const int gputhreads, // input: cuda gputhreads + const bool processAllHelicities ) // input: if true, use blockIdx.y to index helicities { const int nevt = gpublocks * gputhreads; // CASE 1: KERNEL if( !pBlasHandle ) { assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set - // Loop over helicities - for( int ighel = 0; ighel < nGoodHel; ighel++ ) - { - fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel - const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel - gpuStream_t hStream = ghelStreams[ighel]; - gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + if (processAllHelicities) { + gpuLaunchKernel2D( color_sum_kernel, gpublocks, nGoodHel, gputhreads, ghelStreams[0], ghelAllMEs, ghelAllJamps, nGoodHel, nevt ); + } else { + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel, 0 ); + } } } // CASE 2: BLAS @@ -400,15 +411,19 @@ namespace mg5amcCpu #ifdef MGONGPU_HAS_NO_BLAS assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas #else - checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed - // Reset the tmp buffer + if (processAllHelicities) { + assert( false ); // BLAS in async mode not supported for now + } else { + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); #else - gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); -#endif - // Delegate the color sum to BLAS for - color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); + } #endif } } diff --git a/madgraph/iolibs/template_files/madmatrix/color_sum.h b/madgraph/iolibs/template_files/madmatrix/color_sum.h index b64febcdb..347184c4e 100644 --- a/madgraph/iolibs/template_files/madmatrix/color_sum.h +++ b/madgraph/iolibs/template_files/madmatrix/color_sum.h @@ -78,23 +78,25 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL void - color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) - const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities - fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) - gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle - gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) - const int nGoodHel, // input: number of good helicities - const int gpublocks, // input: cuda gpublocks - const int gputhreads ); // input: cuda gputhreads + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads, // input: cuda gputhreads + const bool processAllHelicities); // input: if true, use blockIdx.y to index helicities #endif //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL __global__ void - color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity - const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity - const int nGoodHel ); // input: number of good helicities + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel, // input: number of good helicities + const int nevtIfAllHelicities); // input: zero in single-helicity mode, number of events in multi-helicity mode #endif //-------------------------------------------------------------------------- diff --git a/madgraph/iolibs/template_files/madmatrix/process_function_definitions.inc b/madgraph/iolibs/template_files/madmatrix/process_function_definitions.inc index c0f130b95..815fe5e40 100644 --- a/madgraph/iolibs/template_files/madmatrix/process_function_definitions.inc +++ b/madgraph/iolibs/template_files/madmatrix/process_function_definitions.inc @@ -461,8 +461,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, iflavorVec, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); - gpuLaunchKernel( color_sum_kernel, gpublocks, gputhreads, allMEs, allJamps, nOneHel ); + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, iflavorVec, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads, false ); + gpuLaunchKernel( color_sum_kernel, gpublocks, gputhreads, allMEs, allJamps, nOneHel, 0 ); gpuMemcpy( hstMEs, allMEs, maxtry * sizeof( fptype ), gpuMemcpyDeviceToHost ); //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << std::endl; for( int ievt = 0; ievt < maxtry; ++ievt ) @@ -806,6 +806,7 @@ namespace mg5amcCpu fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const bool async, const int gpublocks, // input: cuda gpublocks const int gputhreads // input: cuda gputhreads #else diff --git a/madgraph/iolibs/template_files/madmatrix/process_h.inc b/madgraph/iolibs/template_files/madmatrix/process_h.inc index 3fb87b052..fb715a877 100644 --- a/madgraph/iolibs/template_files/madmatrix/process_h.inc +++ b/madgraph/iolibs/template_files/madmatrix/process_h.inc @@ -99,6 +99,7 @@ namespace mg5amcCpu fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const bool async, // input: if true, run everything asynchronously in first stream in ghelStreams const int gpublocks, // input: cuda gpublocks const int gputhreads ); // input: cuda gputhreads #else diff --git a/madgraph/iolibs/template_files/madmatrix/process_sigmaKin_function.inc b/madgraph/iolibs/template_files/madmatrix/process_sigmaKin_function.inc index 6db9d8079..227301a6e 100644 --- a/madgraph/iolibs/template_files/madmatrix/process_sigmaKin_function.inc +++ b/madgraph/iolibs/template_files/madmatrix/process_sigmaKin_function.inc @@ -51,23 +51,27 @@ // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - const int ihel = cGoodHel[ighel]; - fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; - fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, iflavorVec, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + if (async) { + gpuLaunchKernel2D( calculate_jamps, gpublocks, cNGoodHel, gputhreads, ghelStreams[0], 0, allmomenta, allcouplings, iflavorVec, ghelAllJamps, storeChannelWeights, ghelAllNumerators, ghelAllDenominators, colAllJamp2s, nevt, true ); + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads, true ); + } else { + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) + { + const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, iflavorVec, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt, false ); + } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads, false ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color } - // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps - color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); - checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed - // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); - bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, iflavorVec, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); // Event-by-event random choice of color and diagram #402 diff --git a/madgraph/iolibs/template_files/madmatrix/umami.cc b/madgraph/iolibs/template_files/madmatrix/umami.cc index 05731edf8..c3c90e983 100644 --- a/madgraph/iolibs/template_files/madmatrix/umami.cc +++ b/madgraph/iolibs/template_files/madmatrix/umami.cc @@ -12,6 +12,9 @@ #include "MemoryBuffers.h" #include +#include +#include +#include #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; @@ -72,18 +75,18 @@ namespace __device__ #endif void - transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event_in, std::size_t i_event_out, std::size_t stride ) { std::size_t page_size = MemoryAccessMomentaBase::neppM; - std::size_t i_page = i_event / page_size; - std::size_t i_vector = i_event % page_size; + std::size_t i_page = i_event_out / page_size; + std::size_t i_vector = i_event_out % page_size; for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) { for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) { momenta_out[i_page * CPPProcess::npar * 4 * page_size + - i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event_in]; } } } @@ -110,7 +113,7 @@ namespace std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; if( i_event >= count ) return; - transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + transpose_momenta( &momenta_in[offset], momenta, i_event, i_event, stride ); diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; @@ -156,9 +159,6 @@ namespace struct InterfaceInstance { bool initialized = false; -#ifdef MGONGPUCPP_GPUIMPL - gpuStream_t hel_streams[CPPProcess::ncomb]; -#endif }; } @@ -206,29 +206,23 @@ extern "C" process.initProc( param_card_path ); auto instance = new InterfaceInstance(); *handle = instance; -#ifdef MGONGPUCPP_GPUIMPL - for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) - { - gpuStreamCreate( &instance->hel_streams[ihel] ); - } -#endif return UMAMI_SUCCESS; } UmamiStatus umami_set_parameter( - UmamiHandle handle, - char const* name, - double parameter_real, - double parameter_imag ) + [[maybe_unused]] UmamiHandle handle, + [[maybe_unused]] char const* name, + [[maybe_unused]] double parameter_real, + [[maybe_unused]] double parameter_imag ) { return UMAMI_ERROR_NOT_IMPLEMENTED; } UmamiStatus umami_get_parameter( - UmamiHandle handle, - char const* name, - double* parameter_real, - double* parameter_imag ) + [[maybe_unused]] UmamiHandle handle, + [[maybe_unused]] char const* name, + [[maybe_unused]] double* parameter_real, + [[maybe_unused]] double* parameter_imag ) { return UMAMI_ERROR_NOT_IMPLEMENTED; } @@ -251,7 +245,7 @@ extern "C" const double* random_color_in = nullptr; const double* random_helicity_in = nullptr; const double* random_diagram_in = nullptr; - const int* diagram_in = nullptr; // TODO: unused + [[maybe_unused]] const int* diagram_in = nullptr; // TODO: unused for( std::size_t i = 0; i < input_count; ++i ) { @@ -336,22 +330,38 @@ extern "C" unsigned int *flavor_indices, *diagram_index; std::size_t n_coup = mg5amcGpu::Parameters_dependentCouplings::ndcoup; - gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); - gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); - gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); - gpuMallocAsync( &flavor_indices, rounded_count * sizeof( unsigned int ), gpu_stream ); - gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); - gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); - gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); - gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); - gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); - gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); - gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); - gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); - gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); - gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); - gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); - gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + std::array, 16> ptrs_and_sizes = {{ + {reinterpret_cast(&momenta), rounded_count * CPPProcess::npar * 4 * sizeof( fptype )}, + {reinterpret_cast(&couplings), rounded_count * n_coup * 2 * sizeof( fptype )}, + {reinterpret_cast(&g_s), rounded_count * sizeof( fptype )}, + {reinterpret_cast(&flavor_indices), rounded_count * sizeof( unsigned int )}, + {reinterpret_cast(&helicity_random), rounded_count * sizeof( fptype )}, + {reinterpret_cast(&color_random), rounded_count * sizeof( fptype )}, + {reinterpret_cast(&diagram_random), rounded_count * sizeof( fptype )}, + {reinterpret_cast(&matrix_elements), rounded_count * sizeof( fptype )}, + {reinterpret_cast(&diagram_index), rounded_count * sizeof( unsigned int )}, + {reinterpret_cast(&color_jamps), rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype )}, + {reinterpret_cast(&numerators), rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype )}, + {reinterpret_cast(&denominators), rounded_count * CPPProcess::ncomb * sizeof( fptype )}, + {reinterpret_cast(&helicity_index), rounded_count * sizeof( int )}, + {reinterpret_cast(&color_index), rounded_count * sizeof( int )}, + {reinterpret_cast(&ghel_matrix_elements), rounded_count * CPPProcess::ncomb * sizeof( fptype )}, + {reinterpret_cast(&ghel_jamps), rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype )}, + }}; + std::size_t total_size = 0; + for (auto [ptr, size] : ptrs_and_sizes) { + std::size_t aligned_size = (size + 7) / 8 * 8; + total_size += aligned_size; + } + uint8_t* buffer; + // we can consider caching this between matrix element calls + gpuMallocAsync( &buffer, total_size, gpu_stream ); + std::size_t buf_offset = 0; + for (auto [ptr, size] : ptrs_and_sizes) { + std::size_t aligned_size = (size + 7) / 8 * 8; + *ptr = buffer + buf_offset; + buf_offset += aligned_size; + } copy_inputs<<>>( momenta_in, @@ -371,9 +381,6 @@ extern "C" offset ); computeDependentCouplings<<>>( g_s, couplings ); checkGpu( gpuPeekAtLastError() ); - // TODO: make things fully async (requires using events instead of synchronize in - // the sigmaKin implementation) - gpuStreamSynchronize( gpu_stream ); InterfaceInstance* instance = static_cast( handle ); if( !instance->initialized ) @@ -403,7 +410,8 @@ extern "C" ghel_jamps, nullptr, nullptr, - instance->hel_streams, + &gpu_stream, + true, n_blocks, n_threads ); @@ -424,31 +432,49 @@ extern "C" offset ); checkGpu( gpuPeekAtLastError() ); - gpuFreeAsync( momenta, gpu_stream ); - gpuFreeAsync( couplings, gpu_stream ); - gpuFreeAsync( flavor_indices, gpu_stream ); - gpuFreeAsync( g_s, gpu_stream ); - gpuFreeAsync( helicity_random, gpu_stream ); - gpuFreeAsync( color_random, gpu_stream ); - gpuFreeAsync( diagram_random, gpu_stream ); - gpuFreeAsync( matrix_elements, gpu_stream ); - gpuFreeAsync( diagram_index, gpu_stream ); - gpuFreeAsync( color_jamps, gpu_stream ); - gpuFreeAsync( numerators, gpu_stream ); - gpuFreeAsync( denominators, gpu_stream ); - gpuFreeAsync( helicity_index, gpu_stream ); - gpuFreeAsync( color_index, gpu_stream ); - gpuFreeAsync( ghel_matrix_elements, gpu_stream ); - gpuFreeAsync( ghel_jamps, gpu_stream ); + gpuFreeAsync( buffer, gpu_stream ); #else // MGONGPUCPP_GPUIMPL + constexpr std::size_t vector_size = MemoryAccessMomentaBase::neppM; // need to round to round to double page size for some reason - std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; - std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + constexpr std::size_t page_size2 = 2 * vector_size; + std::vector permutation; + std::size_t rounded_count; + + constexpr std::size_t flavor_count = CPPProcess::nmaxflavor; + HostBufferBase flavor_indices( ((count + page_size2 - 1) / page_size2 + flavor_count) * page_size2 ); + bool sort_flavors = vector_size > 1 && flavor_count > 1 && flavor_indices_in; + if ( sort_flavors ) + { + permutation.resize(count); + std::size_t voffset = 0; + std::size_t vector_indices[flavor_count] = {}; + std::size_t vector_counts[flavor_count] = {}; + // determine permutation of inputs such that all entries in a SIMD vector + // have the same flavor index + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + unsigned int flav = flavor_indices_in[i_event + offset]; + auto& vcount = vector_counts[flav]; + auto& vindex = vector_indices[flav]; + if ( vcount == 0 ) + { + vindex = voffset * page_size2; + for ( std::size_t i = 0; i < page_size2; ++i) { + flavor_indices[voffset * page_size2 + i] = flav; + } + voffset += 1; + } + permutation[i_event] = vindex + vcount; + vcount = (vcount + 1) % page_size2; + } + rounded_count = voffset * page_size2; + } else { + rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + } HostBufferBase momenta( rounded_count * CPPProcess::npar * 4 ); HostBufferBase couplings( rounded_count * mg5amcCpu::Parameters_dependentCouplings::ndcoup * 2 ); HostBufferBase g_s( rounded_count ); - HostBufferBase flavor_indices( rounded_count ); HostBufferBase helicity_random( rounded_count ); HostBufferBase color_random( rounded_count ); HostBufferBase diagram_random( rounded_count ); @@ -458,17 +484,29 @@ extern "C" HostBufferBase denominators( rounded_count ); HostBufferBase helicity_index( rounded_count ); HostBufferBase color_index( rounded_count ); - for( std::size_t i_event = 0; i_event < count; ++i_event ) - { - transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); - helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; - color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; - diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; - g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; - flavor_indices[i_event] = flavor_indices_in ? flavor_indices_in[i_event + offset] : 0; - } - for ( std::size_t i_event = count; i_event < rounded_count; ++i_event ) { - flavor_indices[i_event] = 0; + if ( sort_flavors ) { + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_sorted = permutation[i_event]; + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, i_sorted, stride ); + helicity_random[i_sorted] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_sorted] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_sorted] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_sorted] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + } else { + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + flavor_indices[i_event] = flavor_indices_in ? flavor_indices_in[i_event + offset] : 0; + } + for ( std::size_t i_event = count; i_event < rounded_count; ++i_event ) { + flavor_indices[i_event] = 0; + } } computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); @@ -503,35 +541,71 @@ extern "C" false, rounded_count ); - std::size_t page_size = MemoryAccessMomentaBase::neppM; - for( std::size_t i_event = 0; i_event < count; ++i_event ) + if ( sort_flavors ) { - std::size_t i_page = i_event / page_size; - std::size_t i_vector = i_event % page_size; - - double denominator = denominators[i_event]; - if( m2_out != nullptr ) - { - m2_out[i_event + offset] = matrix_elements[i_event]; - } - if( amp2_out != nullptr ) + for( std::size_t i_event = 0; i_event < count; ++i_event ) { - for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + std::size_t i_sorted = permutation[i_event]; + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_sorted / page_size; + std::size_t i_vector = i_sorted % page_size; + + double denominator = denominators[i_sorted]; + if( m2_out != nullptr ) { - amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + m2_out[i_event + offset] = matrix_elements[i_sorted]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_sorted] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_sorted] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_sorted] - 1; } } - if( diagram_out != nullptr ) - { - diagram_out[i_event + offset] = diagram_index[i_event] - 1; - } - if( color_out != nullptr ) - { - color_out[i_event + offset] = color_index[i_event] - 1; - } - if( helicity_out != nullptr ) + } else { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) { - helicity_out[i_event + offset] = helicity_index[i_event] - 1; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } } } #endif // MGONGPUCPP_GPUIMPL @@ -541,12 +615,6 @@ extern "C" UmamiStatus umami_free( UmamiHandle handle ) { InterfaceInstance* instance = static_cast( handle ); -#ifdef MGONGPUCPP_GPUIMPL - for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) - { - if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); - } -#endif delete instance; return UMAMI_SUCCESS; } diff --git a/madgraph/iolibs/template_files/mg7/madevent.py b/madgraph/iolibs/template_files/mg7/madevent.py index 47e7e680d..55bc70bdb 100644 --- a/madgraph/iolibs/template_files/mg7/madevent.py +++ b/madgraph/iolibs/template_files/mg7/madevent.py @@ -10,6 +10,7 @@ from dataclasses import dataclass from typing import Literal, NamedTuple import tomllib +import resource if "LHAPDF_DATA_PATH" in os.environ: PDF_PATH = os.environ["LHAPDF_DATA_PATH"] @@ -393,12 +394,15 @@ def survey(self) -> None: def train_madnis(self) -> None: madnis_args = self.run_card["madnis"] - gen_args = self.run_card["generation"] - run_args = self.run_card["run"] + if not madnis_args["enable"]: + return if madnis_args.get("old", False): self.train_madnis_old() return + gen_args = self.run_card["generation"] + run_args = self.run_card["run"] + config = ms.MadnisConfig() config.verbosity = run_args["verbosity"] config.learning_rate = madnis_args["lr"] @@ -1164,7 +1168,7 @@ def build_vegas(self, mapping: ms.PhaseSpaceMapping, prefix: str) -> ms.VegasMap def build_discrete( self, permutation_count: int, flavor_count: int, prefix: str ) -> tuple[ms.DiscreteSampler | None, ms.DiscreteSampler | None]: - return None, None + #return None, None discrete_before = None #if permutation_count > 1: # discrete_before = ms.DiscreteSampler( @@ -1344,6 +1348,10 @@ def main() -> None: if args.ask_edit_cards: ask_edit_cards() + # Remove soft limit on number of open files as it can be quite low on some systems + soft_lim, hard_lim = resource.getrlimit(resource.RLIMIT_NOFILE) + resource.setrlimit(resource.RLIMIT_NOFILE, (hard_lim, hard_lim)) + process = MadgraphProcess() process.survey() process.train_madnis() diff --git a/madgraph/iolibs/template_files/mg7/run_card.toml b/madgraph/iolibs/template_files/mg7/run_card.toml index f965b2707..de6cacb8b 100644 --- a/madgraph/iolibs/template_files/mg7/run_card.toml +++ b/madgraph/iolibs/template_files/mg7/run_card.toml @@ -15,6 +15,7 @@ output_format = "compact_npy" # options: compact_npy, lhe_npy, lhe verbosity = "pretty" # options: silent, pretty, log dummy_matrix_element = false save_gridpack = false +gridpack_include_source = false [beam] e_cm = 13000.0 diff --git a/madmatrix/model_handling.py b/madmatrix/model_handling.py index 94215209b..ca9abf2bd 100644 --- a/madmatrix/model_handling.py +++ b/madmatrix/model_handling.py @@ -1725,7 +1725,8 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) - const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + const int nevt, // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + const bool processAllHelicities // input: if true, use blockIdx.y to index helicities #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity bool storeChannelWeights, @@ -1767,6 +1768,13 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; //debug = ( ievt == 0 ); //if( debug ) printf( \"calculate_jamps: ievt=%6d ihel=%2d\\n\", ievt, ihel ); + if (processAllHelicities) { + int ighel = blockIdx.y; + ihel = dcGoodHel[ighel]; + allJamps = allJamps + ighel * nevt; + allNumerators = allNumerators + ighel * nevt * processConfig::ndiagrams; + allDenominators = allDenominators + ighel * nevt; + } #endif /* clang-format on */""") nwavefuncs = self.matrix_elements[0].get_number_of_wavefunctions() ret_lines.append(""" @@ -2300,10 +2308,11 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi // Scalar iflavor for the current event // for GPU it is an int // for SIMD it is also an int, since it is constant across the SIMD vector - const uint_sv iflavor_sv = F_ACCESS::kernelAccessConst( iflavorVec ); #ifdef MGONGPUCPP_GPUIMPL - const unsigned int iflavor = iflavor_sv; + const unsigned int iflavor = F_ACCESS::kernelAccessConst( iflavorVec ); #else + const unsigned int* iflavor_rec = F_ACCESS::ieventAccessRecordConst( iflavorVec, ievt0 ); + const uint_sv iflavor_sv = F_ACCESS::kernelAccessConst( iflavor_rec ); const unsigned int iflavor = reinterpret_cast(&iflavor_sv)[0]; #endif """) diff --git a/madspace/src/phasespace/integrand.cpp b/madspace/src/phasespace/integrand.cpp index fa5453a41..ef86b23b0 100644 --- a/madspace/src/phasespace/integrand.cpp +++ b/madspace/src/phasespace/integrand.cpp @@ -129,14 +129,14 @@ Integrand::Integrand( } if (flav_count > 1 && !std::holds_alternative(discrete_after)) { - ret_types.push_back("flavor_index", batch_int); + ret_types.push_back("discrete_flavor_index", batch_int); } } if (flags & return_discrete_latent) { ret_types.push_back("channel_index_in_group", batch_int); if (flav_count > 1 && !std::holds_alternative(discrete_after)) { - ret_types.push_back("flavor_index", batch_int); + ret_types.push_back("discrete_flavor_index", batch_int); if (pdf_grid && energy_scale) { ret_types.push_back("pdf_prior", batch_float_array(flav_count)); } @@ -640,7 +640,8 @@ NamedVector Integrand::build_common_part( !std::holds_alternative(_discrete_after)) { auto zeros = fb.full({static_cast(0), args.batch_size}); outputs.push_back( - "flavor_index", scatter_or_drop(fb, result, zeros, result.flavor_id()) + "discrete_flavor_index", + scatter_or_drop(fb, result, zeros, result.flavor_id()) ); } } @@ -653,7 +654,8 @@ NamedVector Integrand::build_common_part( !std::holds_alternative(_discrete_after)) { auto zeros = fb.full({static_cast(0), args.batch_size}); outputs.push_back( - "flavor_index", scatter_or_drop(fb, result, zeros, result.flavor_id()) + "discrete_flavor_index", + scatter_or_drop(fb, result, zeros, result.flavor_id()) ); if (args.has_pdf_prior) { auto flav_count = _diff_xs.pid_options().size(); @@ -795,7 +797,7 @@ IntegrandProbability::IntegrandProbability(const Integrand& integrand) : auto flavor_count = integrand._diff_xs.pid_options().size(); if (flavor_count > 1 && !std::holds_alternative(integrand._discrete_after)) { - arg_types.push_back("flavor_index", batch_int); + arg_types.push_back("discrete_flavor_index", batch_int); if ((integrand._pdfs.at(0) || integrand._pdfs.at(1)) && integrand._energy_scale) { arg_types.push_back("pdf_prior", batch_float_array(flavor_count));