Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions madgraph/iolibs/template_files/madmatrix/GpuAbstraction.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_>
#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
#define gpuLaunchKernel2D( kernel, blocks_x, blocks_y, threads, stream, ... ) kernel<<<dim3(blocks_x, blocks_y, 1), dim3(threads, 1, 1), 0, stream>>>( __VA_ARGS__ )

#define gpuStream_t cudaStream_t
#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) )
Expand Down Expand Up @@ -113,6 +114,7 @@
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_>
#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
#define gpuLaunchKernel2D( kernel, blocks_x, blocks_y, threads, stream, ... ) kernel<<<dim3(blocks_x, blocks_y, 1), dim3(threads, 1, 1), 0, stream>>>( __VA_ARGS__ )

#define gpuStream_t hipStream_t
#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) )
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -490,7 +490,7 @@ namespace mg5amcGpu
gpuBlasHandle_t* pBlasHandle = nullptr;
#endif
const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
sigmaKin( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
sigmaKin( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, false, m_gpublocks, m_gputhreads );
#ifdef MGONGPU_CHANNELID_DEBUG
//std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl;
copyHostFromDevice( m_hstChannelIds, m_channelIds ); // FIXME?!
Expand Down
51 changes: 33 additions & 18 deletions madgraph/iolibs/template_files/madmatrix/color_sum.cc
Original file line number Diff line number Diff line change
Expand Up @@ -156,10 +156,16 @@ namespace mg5amcCpu

#ifdef MGONGPUCPP_GPUIMPL
__global__ void
color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity
const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
const int nGoodHel ) // input: number of good helicities
color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity
const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
const int nGoodHel, // input: number of good helicities
const int nevtIfAllHelicities ) // input: zero in single-helicity mode, number of events in multi-helicity mode
{
if (nevtIfAllHelicities) {
int ighel = blockIdx.y;
allMEs = allMEs + ighel * nevtIfAllHelicities; // MEs for one specific helicity ighel
allJamps = allJamps + ighel * nevtIfAllHelicities; // Jamps for one specific helicity ighel
}
using J_ACCESS = DeviceAccessJamp;
fptype jampR[ncolor];
fptype jampI[ncolor];
Expand Down Expand Up @@ -378,20 +384,25 @@ namespace mg5amcCpu
gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
const int nGoodHel, // input: number of good helicities
const int gpublocks, // input: cuda gpublocks
const int gputhreads ) // input: cuda gputhreads
const int gputhreads, // input: cuda gputhreads
const bool processAllHelicities ) // input: if true, use blockIdx.y to index helicities
{
const int nevt = gpublocks * gputhreads;
// CASE 1: KERNEL
if( !pBlasHandle )
{
assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
// Loop over helicities
for( int ighel = 0; ighel < nGoodHel; ighel++ )
{
fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel
const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel
gpuStream_t hStream = ghelStreams[ighel];
gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel );
if (processAllHelicities) {
gpuLaunchKernel2D( color_sum_kernel, gpublocks, nGoodHel, gputhreads, ghelStreams[0], ghelAllMEs, ghelAllJamps, nGoodHel, nevt );
} else {
// Loop over helicities
for( int ighel = 0; ighel < nGoodHel; ighel++ )
{
fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel
const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel
gpuStream_t hStream = ghelStreams[ighel];
gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel, 0 );
}
}
}
// CASE 2: BLAS
Expand All @@ -400,15 +411,19 @@ namespace mg5amcCpu
#ifdef MGONGPU_HAS_NO_BLAS
assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
#else
checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
// Reset the tmp buffer
if (processAllHelicities) {
assert( false ); // BLAS in async mode not supported for now
} else {
checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
// Reset the tmp buffer
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
#else
gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
#endif
// Delegate the color sum to BLAS for
color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
#endif
// Delegate the color sum to BLAS for
color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
}
#endif
}
}
Expand Down
24 changes: 13 additions & 11 deletions madgraph/iolibs/template_files/madmatrix/color_sum.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,23 +78,25 @@ namespace mg5amcCpu

#ifdef MGONGPUCPP_GPUIMPL
void
color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
const int nGoodHel, // input: number of good helicities
const int gpublocks, // input: cuda gpublocks
const int gputhreads ); // input: cuda gputhreads
color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
const int nGoodHel, // input: number of good helicities
const int gpublocks, // input: cuda gpublocks
const int gputhreads, // input: cuda gputhreads
const bool processAllHelicities); // input: if true, use blockIdx.y to index helicities
#endif

//--------------------------------------------------------------------------

#ifdef MGONGPUCPP_GPUIMPL
__global__ void
color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity
const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
const int nGoodHel ); // input: number of good helicities
color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity
const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
const int nGoodHel, // input: number of good helicities
const int nevtIfAllHelicities); // input: zero in single-helicity mode, number of events in multi-helicity mode
#endif

//--------------------------------------------------------------------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -461,8 +461,8 @@ namespace mg5amcCpu
gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
// NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, iflavorVec, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
gpuLaunchKernel( color_sum_kernel, gpublocks, gputhreads, allMEs, allJamps, nOneHel );
gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, iflavorVec, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads, false );
gpuLaunchKernel( color_sum_kernel, gpublocks, gputhreads, allMEs, allJamps, nOneHel, 0 );
gpuMemcpy( hstMEs, allMEs, maxtry * sizeof( fptype ), gpuMemcpyDeviceToHost );
//std::cout << "sigmaKin_getGoodHel ihel=" << ihel << std::endl;
for( int ievt = 0; ievt < maxtry; ++ievt )
Expand Down Expand Up @@ -806,6 +806,7 @@ namespace mg5amcCpu
fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
const bool async,
const int gpublocks, // input: cuda gpublocks
const int gputhreads // input: cuda gputhreads
#else
Expand Down
1 change: 1 addition & 0 deletions madgraph/iolibs/template_files/madmatrix/process_h.inc
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ namespace mg5amcCpu
fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
const bool async, // input: if true, run everything asynchronously in first stream in ghelStreams
const int gpublocks, // input: cuda gpublocks
const int gputhreads ); // input: cuda gputhreads
#else
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,23 +51,27 @@
// Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
// (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
// In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
for( int ighel = 0; ighel < cNGoodHel; ighel++ )
{
const int ihel = cGoodHel[ighel];
fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, iflavorVec, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
if (async) {
gpuLaunchKernel2D( calculate_jamps, gpublocks, cNGoodHel, gputhreads, ghelStreams[0], 0, allmomenta, allcouplings, iflavorVec, ghelAllJamps, storeChannelWeights, ghelAllNumerators, ghelAllDenominators, colAllJamp2s, nevt, true );
color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads, true );
} else {
for( int ighel = 0; ighel < cNGoodHel; ighel++ )
{
const int ihel = cGoodHel[ighel];
fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, iflavorVec, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt, false );
}
// (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps
color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads, false );
checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed
// (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
}
// (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps
color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads );
checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed
// (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
// Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );

bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, iflavorVec, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );

// Event-by-event random choice of color and diagram #402
Expand Down
Loading
Loading