diff --git a/madgraph/iolibs/template_files/madmatrix/GpuAbstraction.h b/madgraph/iolibs/template_files/madmatrix/GpuAbstraction.h
index ac2cbc5a1..6f709dfe9 100644
--- a/madgraph/iolibs/template_files/madmatrix/GpuAbstraction.h
+++ b/madgraph/iolibs/template_files/madmatrix/GpuAbstraction.h
@@ -46,6 +46,7 @@
 #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
 //#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_>
 #define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+#define gpuLaunchKernel2D( kernel, blocks_x, blocks_y, threads, stream, ... ) kernel<<<dim3(blocks_x, blocks_y, 1), dim3(threads, 1, 1), 0, stream>>>( __VA_ARGS__ )
 
 #define gpuStream_t cudaStream_t
 #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) )
@@ -113,6 +114,7 @@
 #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
 //#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_>
 #define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+#define gpuLaunchKernel2D( kernel, blocks_x, blocks_y, threads, stream, ... ) kernel<<<dim3(blocks_x, blocks_y, 1), dim3(threads, 1, 1), 0, stream>>>( __VA_ARGS__ )
 
 #define gpuStream_t hipStream_t
 #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) )
diff --git a/madgraph/iolibs/template_files/madmatrix/MatrixElementKernels.cc b/madgraph/iolibs/template_files/madmatrix/MatrixElementKernels.cc
index a019296df..872e4795e 100644
--- a/madgraph/iolibs/template_files/madmatrix/MatrixElementKernels.cc
+++ b/madgraph/iolibs/template_files/madmatrix/MatrixElementKernels.cc
@@ -490,7 +490,7 @@ namespace mg5amcGpu
     gpuBlasHandle_t* pBlasHandle = nullptr;
 #endif
     const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    sigmaKin( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, false, m_gpublocks, m_gputhreads );
 #ifdef MGONGPU_CHANNELID_DEBUG
     //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl;
     copyHostFromDevice( m_hstChannelIds, m_channelIds ); // FIXME?!
diff --git a/madgraph/iolibs/template_files/madmatrix/color_sum.cc b/madgraph/iolibs/template_files/madmatrix/color_sum.cc
index fcfa4ee51..30c679993 100644
--- a/madgraph/iolibs/template_files/madmatrix/color_sum.cc
+++ b/madgraph/iolibs/template_files/madmatrix/color_sum.cc
@@ -156,10 +156,16 @@ namespace mg5amcCpu
 
 #ifdef MGONGPUCPP_GPUIMPL
   __global__ void
-  color_sum_kernel( fptype* allMEs,         // output: allMEs[nevt], add |M|^2 for one specific helicity
-                    const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
-                    const int nGoodHel )    // input: number of good helicities
+  color_sum_kernel( fptype* allMEs,                 // output: allMEs[nevt], add |M|^2 for one specific helicity
+                    const fptype* allJamps,         // input: jamp[ncolor*2*nevt] for one specific helicity
+                    const int nGoodHel,             // input: number of good helicities
+                    const int nevtIfAllHelicities ) // input: zero in single-helicity mode, number of events in multi-helicity mode
   {
+    if (nevtIfAllHelicities) {
+      int ighel = blockIdx.y;
+      allMEs = allMEs + ighel * nevtIfAllHelicities; // MEs for one specific helicity ighel
+      allJamps = allJamps + ighel * nevtIfAllHelicities; // Jamps for one specific helicity ighel
+    }
     using J_ACCESS = DeviceAccessJamp;
     fptype jampR[ncolor];
     fptype jampI[ncolor];
@@ -378,20 +384,25 @@ namespace mg5amcCpu
                  gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
                  const int nGoodHel,               // input: number of good helicities
                  const int gpublocks,              // input: cuda gpublocks
-                 const int gputhreads )            // input: cuda gputhreads
+                 const int gputhreads,             // input: cuda gputhreads
+                 const bool processAllHelicities ) // input: if true, use blockIdx.y to index helicities
   {
     const int nevt = gpublocks * gputhreads;
     // CASE 1: KERNEL
     if( !pBlasHandle )
     {
       assert( ghelAllBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
-      // Loop over helicities
-      for( int ighel = 0; ighel < nGoodHel; ighel++ )
-      {
-        fptype* hAllMEs = ghelAllMEs + ighel * nevt;           // MEs for one specific helicity ighel
-        const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel
-        gpuStream_t hStream = ghelStreams[ighel];
-        gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel );
+      if (processAllHelicities) {
+        gpuLaunchKernel2D( color_sum_kernel, gpublocks, nGoodHel, gputhreads, ghelStreams[0], ghelAllMEs, ghelAllJamps, nGoodHel, nevt );
+      } else {
+        // Loop over helicities
+        for( int ighel = 0; ighel < nGoodHel; ighel++ )
+        {
+          fptype* hAllMEs = ghelAllMEs + ighel * nevt;           // MEs for one specific helicity ighel
+          const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel
+          gpuStream_t hStream = ghelStreams[ighel];
+          gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel, 0 );
+        }
       }
     }
     // CASE 2: BLAS
@@ -400,15 +411,19 @@ namespace mg5amcCpu
 #ifdef MGONGPU_HAS_NO_BLAS
       assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
 #else
-      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
-      // Reset the tmp buffer
+      if (processAllHelicities) {
+        assert( false ); // BLAS in async mode not supported for now
+      } else {
+        checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
+        // Reset the tmp buffer
 #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
+        gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
 #else
-      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
-#endif
-      // Delegate the color sum to BLAS for 
-      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
+        gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
+#endif  
+        // Delegate the color sum to BLAS for 
+        color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
+      }
 #endif
     }
   }
diff --git a/madgraph/iolibs/template_files/madmatrix/color_sum.h b/madgraph/iolibs/template_files/madmatrix/color_sum.h
index b64febcdb..347184c4e 100644
--- a/madgraph/iolibs/template_files/madmatrix/color_sum.h
+++ b/madgraph/iolibs/template_files/madmatrix/color_sum.h
@@ -78,23 +78,25 @@ namespace mg5amcCpu
 
 #ifdef MGONGPUCPP_GPUIMPL
   void
-  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
-                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
-                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
-                 const int nGoodHel,           // input: number of good helicities
-                 const int gpublocks,          // input: cuda gpublocks
-                 const int gputhreads );       // input: cuda gputhreads
+  color_sum_gpu( fptype* ghelAllMEs,               // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,       // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,          // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,               // input: number of good helicities
+                 const int gpublocks,              // input: cuda gpublocks
+                 const int gputhreads,             // input: cuda gputhreads
+                 const bool processAllHelicities); // input: if true, use blockIdx.y to index helicities
 #endif
 
   //--------------------------------------------------------------------------
 
 #ifdef MGONGPUCPP_GPUIMPL
   __global__ void
-  color_sum_kernel( fptype* allMEs,         // output: allMEs[nevt], add |M|^2 for one specific helicity
-                    const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
-                    const int nGoodHel );   // input: number of good helicities
+  color_sum_kernel( fptype* allMEs,                 // output: allMEs[nevt], add |M|^2 for one specific helicity
+                    const fptype* allJamps,         // input: jamp[ncolor*2*nevt] for one specific helicity
+                    const int nGoodHel,             // input: number of good helicities
+                    const int nevtIfAllHelicities); // input: zero in single-helicity mode, number of events in multi-helicity mode
 #endif
 
   //--------------------------------------------------------------------------
diff --git a/madgraph/iolibs/template_files/madmatrix/process_function_definitions.inc b/madgraph/iolibs/template_files/madmatrix/process_function_definitions.inc
index c0f130b95..815fe5e40 100644
--- a/madgraph/iolibs/template_files/madmatrix/process_function_definitions.inc
+++ b/madgraph/iolibs/template_files/madmatrix/process_function_definitions.inc
@@ -461,8 +461,8 @@ namespace mg5amcCpu
       gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
       // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
       constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
-      gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, iflavorVec, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
-      gpuLaunchKernel( color_sum_kernel, gpublocks, gputhreads, allMEs, allJamps, nOneHel );
+      gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, iflavorVec, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads, false );
+      gpuLaunchKernel( color_sum_kernel, gpublocks, gputhreads, allMEs, allJamps, nOneHel, 0 );
       gpuMemcpy( hstMEs, allMEs, maxtry * sizeof( fptype ), gpuMemcpyDeviceToHost );
       //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << std::endl;
       for( int ievt = 0; ievt < maxtry; ++ievt )
@@ -806,6 +806,7 @@ namespace mg5amcCpu
             fptype2* ghelAllBlasTmp,            // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             gpuBlasHandle_t* pBlasHandle,       // input: cuBLAS/hipBLAS handle
             gpuStream_t* ghelStreams,           // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+            const bool async,
             const int gpublocks,                // input: cuda gpublocks
             const int gputhreads                // input: cuda gputhreads
 #else
diff --git a/madgraph/iolibs/template_files/madmatrix/process_h.inc b/madgraph/iolibs/template_files/madmatrix/process_h.inc
index 3fb87b052..fb715a877 100644
--- a/madgraph/iolibs/template_files/madmatrix/process_h.inc
+++ b/madgraph/iolibs/template_files/madmatrix/process_h.inc
@@ -99,6 +99,7 @@ namespace mg5amcCpu
             fptype2* ghelAllBlasTmp,            // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
             gpuBlasHandle_t* pBlasHandle,       // input: cuBLAS/hipBLAS handle
             gpuStream_t* ghelStreams,           // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+            const bool async,                   // input: if true, run everything asynchronously in first stream in ghelStreams
             const int gpublocks,                // input: cuda gpublocks
             const int gputhreads );             // input: cuda gputhreads
 #else
diff --git a/madgraph/iolibs/template_files/madmatrix/process_sigmaKin_function.inc b/madgraph/iolibs/template_files/madmatrix/process_sigmaKin_function.inc
index 6db9d8079..227301a6e 100644
--- a/madgraph/iolibs/template_files/madmatrix/process_sigmaKin_function.inc
+++ b/madgraph/iolibs/template_files/madmatrix/process_sigmaKin_function.inc
@@ -51,23 +51,27 @@
     // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
     // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
     // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
-    for( int ighel = 0; ighel < cNGoodHel; ighel++ )
-    {
-      const int ihel = cGoodHel[ighel];
-      fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
-      fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
-      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, iflavorVec, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    if (async) {
+      gpuLaunchKernel2D( calculate_jamps, gpublocks, cNGoodHel, gputhreads, ghelStreams[0], 0, allmomenta, allcouplings, iflavorVec, ghelAllJamps, storeChannelWeights, ghelAllNumerators, ghelAllDenominators, colAllJamp2s, nevt, true );
+      color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads, true );
+    } else {
+      for( int ighel = 0; ighel < cNGoodHel; ighel++ )
+      {
+        const int ihel = cGoodHel[ighel];
+        fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
+        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
+        fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
+        gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, iflavorVec, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt, false );
+      }
+      // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps
+      color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads, false );
+      checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed
+      // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
     }
-    // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps
-    color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads );
-    checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed
-    // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
     // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
     gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
 
-    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
     gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, iflavorVec, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
 
     // Event-by-event random choice of color and diagram #402
diff --git a/madgraph/iolibs/template_files/madmatrix/umami.cc b/madgraph/iolibs/template_files/madmatrix/umami.cc
index 05731edf8..c3c90e983 100644
--- a/madgraph/iolibs/template_files/madmatrix/umami.cc
+++ b/madgraph/iolibs/template_files/madmatrix/umami.cc
@@ -12,6 +12,9 @@
 #include "MemoryBuffers.h"
 
 #include <cmath>
+#include <vector>
+#include <array>
+#include <utility>
 
 #ifdef MGONGPUCPP_GPUIMPL
 using namespace mg5amcGpu;
@@ -72,18 +75,18 @@ namespace
   __device__
 #endif
     void
-    transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride )
+    transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event_in, std::size_t i_event_out, std::size_t stride )
   {
     std::size_t page_size = MemoryAccessMomentaBase::neppM;
-    std::size_t i_page = i_event / page_size;
-    std::size_t i_vector = i_event % page_size;
+    std::size_t i_page = i_event_out / page_size;
+    std::size_t i_vector = i_event_out % page_size;
 
     for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part )
     {
       for( std::size_t i_mom = 0; i_mom < 4; ++i_mom )
       {
         momenta_out[i_page * CPPProcess::npar * 4 * page_size +
-                    i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event];
+                    i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event_in];
       }
     }
   }
@@ -110,7 +113,7 @@ namespace
     std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
     if( i_event >= count ) return;
 
-    transpose_momenta( &momenta_in[offset], momenta, i_event, stride );
+    transpose_momenta( &momenta_in[offset], momenta, i_event, i_event, stride );
     diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5;
     helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5;
     color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5;
@@ -156,9 +159,6 @@ namespace
   struct InterfaceInstance
   {
     bool initialized = false;
-#ifdef MGONGPUCPP_GPUIMPL
-    gpuStream_t hel_streams[CPPProcess::ncomb];
-#endif
   };
 
 }
@@ -206,29 +206,23 @@ extern "C"
     process.initProc( param_card_path );
     auto instance = new InterfaceInstance();
     *handle = instance;
-#ifdef MGONGPUCPP_GPUIMPL
-    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
-    {
-      gpuStreamCreate( &instance->hel_streams[ihel] );
-    }
-#endif
     return UMAMI_SUCCESS;
   }
 
   UmamiStatus umami_set_parameter(
-    UmamiHandle handle,
-    char const* name,
-    double parameter_real,
-    double parameter_imag )
+    [[maybe_unused]] UmamiHandle handle,
+    [[maybe_unused]] char const* name,
+    [[maybe_unused]] double parameter_real,
+    [[maybe_unused]] double parameter_imag )
   {
     return UMAMI_ERROR_NOT_IMPLEMENTED;
   }
 
   UmamiStatus umami_get_parameter(
-    UmamiHandle handle,
-    char const* name,
-    double* parameter_real,
-    double* parameter_imag )
+    [[maybe_unused]] UmamiHandle handle,
+    [[maybe_unused]] char const* name,
+    [[maybe_unused]] double* parameter_real,
+    [[maybe_unused]] double* parameter_imag )
   {
     return UMAMI_ERROR_NOT_IMPLEMENTED;
   }
@@ -251,7 +245,7 @@ extern "C"
     const double* random_color_in = nullptr;
     const double* random_helicity_in = nullptr;
     const double* random_diagram_in = nullptr;
-    const int* diagram_in = nullptr; // TODO: unused
+    [[maybe_unused]] const int* diagram_in = nullptr; // TODO: unused
 
     for( std::size_t i = 0; i < input_count; ++i )
     {
@@ -336,22 +330,38 @@ extern "C"
     unsigned int *flavor_indices, *diagram_index;
 
     std::size_t n_coup = mg5amcGpu::Parameters_dependentCouplings::ndcoup;
-    gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream );
-    gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream );
-    gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream );
-    gpuMallocAsync( &flavor_indices, rounded_count * sizeof( unsigned int ), gpu_stream );
-    gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream );
-    gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream );
-    gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream );
-    gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream );
-    gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream );
-    gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
-    gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
-    gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
-    gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream );
-    gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream );
-    gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
-    gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+    std::array<std::pair<void**, std::size_t>, 16> ptrs_and_sizes = {{
+        {reinterpret_cast<void**>(&momenta), rounded_count * CPPProcess::npar * 4 * sizeof( fptype )},
+        {reinterpret_cast<void**>(&couplings), rounded_count * n_coup * 2 * sizeof( fptype )},
+        {reinterpret_cast<void**>(&g_s), rounded_count * sizeof( fptype )},
+        {reinterpret_cast<void**>(&flavor_indices), rounded_count * sizeof( unsigned int )},
+        {reinterpret_cast<void**>(&helicity_random), rounded_count * sizeof( fptype )},
+        {reinterpret_cast<void**>(&color_random), rounded_count * sizeof( fptype )},
+        {reinterpret_cast<void**>(&diagram_random), rounded_count * sizeof( fptype )},
+        {reinterpret_cast<void**>(&matrix_elements), rounded_count * sizeof( fptype )},
+        {reinterpret_cast<void**>(&diagram_index), rounded_count * sizeof( unsigned int )},
+        {reinterpret_cast<void**>(&color_jamps), rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype )},
+        {reinterpret_cast<void**>(&numerators), rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype )},
+        {reinterpret_cast<void**>(&denominators), rounded_count * CPPProcess::ncomb * sizeof( fptype )},
+        {reinterpret_cast<void**>(&helicity_index), rounded_count * sizeof( int )},
+        {reinterpret_cast<void**>(&color_index), rounded_count * sizeof( int )},
+        {reinterpret_cast<void**>(&ghel_matrix_elements), rounded_count * CPPProcess::ncomb * sizeof( fptype )},
+        {reinterpret_cast<void**>(&ghel_jamps), rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype )},
+    }};
+    std::size_t total_size = 0;
+    for (auto [ptr, size] : ptrs_and_sizes) {
+        std::size_t aligned_size = (size + 7) / 8 * 8;
+        total_size += aligned_size;
+    }
+    uint8_t* buffer;
+    // we can consider caching this between matrix element calls
+    gpuMallocAsync( &buffer, total_size, gpu_stream );
+    std::size_t buf_offset = 0;
+    for (auto [ptr, size] : ptrs_and_sizes) {
+        std::size_t aligned_size = (size + 7) / 8 * 8;
+        *ptr = buffer + buf_offset;
+        buf_offset += aligned_size;
+    }
 
     copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
       momenta_in,
@@ -371,9 +381,6 @@ extern "C"
       offset );
     computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings );
     checkGpu( gpuPeekAtLastError() );
-    // TODO: make things fully async (requires using events instead of synchronize in
-    //       the sigmaKin implementation)
-    gpuStreamSynchronize( gpu_stream );
 
     InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
     if( !instance->initialized )
@@ -403,7 +410,8 @@ extern "C"
       ghel_jamps,
       nullptr,
       nullptr,
-      instance->hel_streams,
+      &gpu_stream,
+      true,
       n_blocks,
       n_threads );
 
@@ -424,31 +432,49 @@ extern "C"
       offset );
     checkGpu( gpuPeekAtLastError() );
 
-    gpuFreeAsync( momenta, gpu_stream );
-    gpuFreeAsync( couplings, gpu_stream );
-    gpuFreeAsync( flavor_indices, gpu_stream );
-    gpuFreeAsync( g_s, gpu_stream );
-    gpuFreeAsync( helicity_random, gpu_stream );
-    gpuFreeAsync( color_random, gpu_stream );
-    gpuFreeAsync( diagram_random, gpu_stream );
-    gpuFreeAsync( matrix_elements, gpu_stream );
-    gpuFreeAsync( diagram_index, gpu_stream );
-    gpuFreeAsync( color_jamps, gpu_stream );
-    gpuFreeAsync( numerators, gpu_stream );
-    gpuFreeAsync( denominators, gpu_stream );
-    gpuFreeAsync( helicity_index, gpu_stream );
-    gpuFreeAsync( color_index, gpu_stream );
-    gpuFreeAsync( ghel_matrix_elements, gpu_stream );
-    gpuFreeAsync( ghel_jamps, gpu_stream );
+    gpuFreeAsync( buffer, gpu_stream );
 #else  // MGONGPUCPP_GPUIMPL
+    constexpr std::size_t vector_size = MemoryAccessMomentaBase::neppM;
     // need to round to round to double page size for some reason
-    std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
-    std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+    constexpr std::size_t page_size2 = 2 * vector_size;
+    std::vector<std::size_t> permutation;
+    std::size_t rounded_count;
+
+    constexpr std::size_t flavor_count = CPPProcess::nmaxflavor;
+    HostBufferBase<unsigned int, false> flavor_indices( ((count + page_size2 - 1) / page_size2 + flavor_count) * page_size2 );
+    bool sort_flavors = vector_size > 1 && flavor_count > 1 && flavor_indices_in;
+    if ( sort_flavors ) 
+    {
+      permutation.resize(count);
+      std::size_t voffset = 0;
+      std::size_t vector_indices[flavor_count] = {};
+      std::size_t vector_counts[flavor_count] = {};
+      // determine permutation of inputs such that all entries in a SIMD vector
+      // have the same flavor index
+      for( std::size_t i_event = 0; i_event < count; ++i_event )
+      {
+        unsigned int flav = flavor_indices_in[i_event + offset];
+        auto& vcount = vector_counts[flav];
+        auto& vindex = vector_indices[flav];
+        if ( vcount == 0 )
+        {
+          vindex = voffset * page_size2;
+          for ( std::size_t i = 0; i < page_size2; ++i) {
+            flavor_indices[voffset * page_size2 + i] = flav;
+          }
+          voffset += 1;
+        }
+        permutation[i_event] = vindex + vcount;
+        vcount = (vcount + 1) % page_size2;
+      }
+      rounded_count = voffset * page_size2;
+    } else {
+      rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+    }
 
     HostBufferBase<fptype, false> momenta( rounded_count * CPPProcess::npar * 4 );
     HostBufferBase<fptype, false> couplings( rounded_count * mg5amcCpu::Parameters_dependentCouplings::ndcoup * 2 );
     HostBufferBase<fptype, false> g_s( rounded_count );
-    HostBufferBase<unsigned int, false> flavor_indices( rounded_count );
     HostBufferBase<fptype, false> helicity_random( rounded_count );
     HostBufferBase<fptype, false> color_random( rounded_count );
     HostBufferBase<fptype, false> diagram_random( rounded_count );
@@ -458,17 +484,29 @@ extern "C"
     HostBufferBase<fptype, false> denominators( rounded_count );
     HostBufferBase<int, false> helicity_index( rounded_count );
     HostBufferBase<int, false> color_index( rounded_count );
-    for( std::size_t i_event = 0; i_event < count; ++i_event )
-    {
-      transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride );
-      helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
-      color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
-      diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
-      g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195;
-      flavor_indices[i_event] = flavor_indices_in ? flavor_indices_in[i_event + offset] : 0;
-    }
-    for ( std::size_t i_event = count; i_event < rounded_count; ++i_event ) {
-      flavor_indices[i_event] = 0;
+    if ( sort_flavors ) {
+      for( std::size_t i_event = 0; i_event < count; ++i_event )
+      {
+        std::size_t i_sorted = permutation[i_event];
+        transpose_momenta( &momenta_in[offset], momenta.data(), i_event, i_sorted, stride );
+        helicity_random[i_sorted] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+        color_random[i_sorted] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+        diagram_random[i_sorted] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+        g_s[i_sorted] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195;
+      }
+    } else {
+      for( std::size_t i_event = 0; i_event < count; ++i_event )
+      {
+        transpose_momenta( &momenta_in[offset], momenta.data(), i_event, i_event, stride );
+        helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+        color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+        diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+        g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195;
+        flavor_indices[i_event] = flavor_indices_in ? flavor_indices_in[i_event + offset] : 0;
+      }
+      for ( std::size_t i_event = count; i_event < rounded_count; ++i_event ) {
+        flavor_indices[i_event] = 0;
+      }
     }
     computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
 
@@ -503,35 +541,71 @@ extern "C"
       false,
       rounded_count );
 
-    std::size_t page_size = MemoryAccessMomentaBase::neppM;
-    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    if ( sort_flavors )
     {
-      std::size_t i_page = i_event / page_size;
-      std::size_t i_vector = i_event % page_size;
-
-      double denominator = denominators[i_event];
-      if( m2_out != nullptr )
-      {
-        m2_out[i_event + offset] = matrix_elements[i_event];
-      }
-      if( amp2_out != nullptr )
+      for( std::size_t i_event = 0; i_event < count; ++i_event )
       {
-        for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+        std::size_t i_sorted = permutation[i_event];
+        std::size_t page_size = MemoryAccessMomentaBase::neppM;
+        std::size_t i_page = i_sorted / page_size;
+        std::size_t i_vector = i_sorted % page_size;
+
+        double denominator = denominators[i_sorted];
+        if( m2_out != nullptr )
         {
-          amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+          m2_out[i_event + offset] = matrix_elements[i_sorted];
+        }
+        if( amp2_out != nullptr )
+        {
+          for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+          {
+            amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+          }
+        }
+        if( diagram_out != nullptr )
+        {
+          diagram_out[i_event + offset] = diagram_index[i_sorted] - 1;
+        }
+        if( color_out != nullptr )
+        {
+          color_out[i_event + offset] = color_index[i_sorted] - 1;
+        }
+        if( helicity_out != nullptr )
+        {
+          helicity_out[i_event + offset] = helicity_index[i_sorted] - 1;
         }
       }
-      if( diagram_out != nullptr )
-      {
-        diagram_out[i_event + offset] = diagram_index[i_event] - 1;
-      }
-      if( color_out != nullptr )
-      {
-        color_out[i_event + offset] = color_index[i_event] - 1;
-      }
-      if( helicity_out != nullptr )
+    } else {
+      std::size_t page_size = MemoryAccessMomentaBase::neppM;
+      for( std::size_t i_event = 0; i_event < count; ++i_event )
       {
-        helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+        std::size_t i_page = i_event / page_size;
+        std::size_t i_vector = i_event % page_size;
+
+        double denominator = denominators[i_event];
+        if( m2_out != nullptr )
+        {
+          m2_out[i_event + offset] = matrix_elements[i_event];
+        }
+        if( amp2_out != nullptr )
+        {
+          for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+          {
+            amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+          }
+        }
+        if( diagram_out != nullptr )
+        {
+          diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+        }
+        if( color_out != nullptr )
+        {
+          color_out[i_event + offset] = color_index[i_event] - 1;
+        }
+        if( helicity_out != nullptr )
+        {
+          helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+        }
       }
     }
 #endif // MGONGPUCPP_GPUIMPL
@@ -541,12 +615,6 @@ extern "C"
   UmamiStatus umami_free( UmamiHandle handle )
   {
     InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
-#ifdef MGONGPUCPP_GPUIMPL
-    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
-    {
-      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
-    }
-#endif
     delete instance;
     return UMAMI_SUCCESS;
   }
diff --git a/madgraph/iolibs/template_files/mg7/madevent.py b/madgraph/iolibs/template_files/mg7/madevent.py
index 47e7e680d..55bc70bdb 100644
--- a/madgraph/iolibs/template_files/mg7/madevent.py
+++ b/madgraph/iolibs/template_files/mg7/madevent.py
@@ -10,6 +10,7 @@
 from dataclasses import dataclass
 from typing import Literal, NamedTuple
 import tomllib
+import resource
 
 if "LHAPDF_DATA_PATH" in os.environ:
     PDF_PATH = os.environ["LHAPDF_DATA_PATH"]
@@ -393,12 +394,15 @@ def survey(self) -> None:
 
     def train_madnis(self) -> None:
         madnis_args = self.run_card["madnis"]
-        gen_args = self.run_card["generation"]
-        run_args = self.run_card["run"]
+        if not madnis_args["enable"]:
+            return
         if madnis_args.get("old", False):
             self.train_madnis_old()
             return
 
+        gen_args = self.run_card["generation"]
+        run_args = self.run_card["run"]
+
         config = ms.MadnisConfig()
         config.verbosity = run_args["verbosity"]
         config.learning_rate = madnis_args["lr"]
@@ -1164,7 +1168,7 @@ def build_vegas(self, mapping: ms.PhaseSpaceMapping, prefix: str) -> ms.VegasMap
     def build_discrete(
         self, permutation_count: int, flavor_count: int, prefix: str
     ) -> tuple[ms.DiscreteSampler | None, ms.DiscreteSampler | None]:
-        return None, None
+        #return None, None
         discrete_before = None
         #if permutation_count > 1:
         #    discrete_before = ms.DiscreteSampler(
@@ -1344,6 +1348,10 @@ def main() -> None:
     if args.ask_edit_cards:
         ask_edit_cards()
 
+    # Remove soft limit on number of open files as it can be quite low on some systems
+    soft_lim, hard_lim = resource.getrlimit(resource.RLIMIT_NOFILE)
+    resource.setrlimit(resource.RLIMIT_NOFILE, (hard_lim, hard_lim))
+
     process = MadgraphProcess()
     process.survey()
     process.train_madnis()
diff --git a/madgraph/iolibs/template_files/mg7/run_card.toml b/madgraph/iolibs/template_files/mg7/run_card.toml
index f965b2707..de6cacb8b 100644
--- a/madgraph/iolibs/template_files/mg7/run_card.toml
+++ b/madgraph/iolibs/template_files/mg7/run_card.toml
@@ -15,6 +15,7 @@ output_format = "compact_npy" # options: compact_npy, lhe_npy, lhe
 verbosity = "pretty" # options: silent, pretty, log
 dummy_matrix_element = false
 save_gridpack = false
+gridpack_include_source = false
 
 [beam]
 e_cm = 13000.0
diff --git a/madmatrix/model_handling.py b/madmatrix/model_handling.py
index 94215209b..ca9abf2bd 100644
--- a/madmatrix/model_handling.py
+++ b/madmatrix/model_handling.py
@@ -1725,7 +1725,8 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name):
                    fptype* allNumerators,             // input/output: multichannel numerators[nevt], add helicity ihel
                    fptype* allDenominators,           // input/output: multichannel denominators[nevt], add helicity ihel
                    fptype* colAllJamp2s,              // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
-                   const int nevt                     // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                   const int nevt,                    // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                   const bool processAllHelicities    // input: if true, use blockIdx.y to index helicities
 #else
                    cxtype_sv* allJamp_sv,             // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
                    bool storeChannelWeights,
@@ -1767,6 +1768,13 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name):
     //const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
     //debug = ( ievt == 0 );
     //if( debug ) printf( \"calculate_jamps: ievt=%6d ihel=%2d\\n\", ievt, ihel );
+    if (processAllHelicities) {
+      int ighel = blockIdx.y;
+      ihel = dcGoodHel[ighel];
+      allJamps = allJamps + ighel * nevt;
+      allNumerators = allNumerators + ighel * nevt * processConfig::ndiagrams;
+      allDenominators = allDenominators + ighel * nevt;
+    }
 #endif /* clang-format on */""")
             nwavefuncs = self.matrix_elements[0].get_number_of_wavefunctions()
             ret_lines.append("""
@@ -2300,10 +2308,11 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi
       // Scalar iflavor for the current event
       // for GPU it is an int
       // for SIMD it is also an int, since it is constant across the SIMD vector
-      const uint_sv iflavor_sv = F_ACCESS::kernelAccessConst( iflavorVec );
 #ifdef MGONGPUCPP_GPUIMPL
-      const unsigned int iflavor = iflavor_sv;
+      const unsigned int iflavor = F_ACCESS::kernelAccessConst( iflavorVec );
 #else
+      const unsigned int* iflavor_rec = F_ACCESS::ieventAccessRecordConst( iflavorVec, ievt0 );
+      const uint_sv iflavor_sv = F_ACCESS::kernelAccessConst( iflavor_rec );
       const unsigned int iflavor = reinterpret_cast<const unsigned int*>(&iflavor_sv)[0];
 #endif
 """)
diff --git a/madspace/src/phasespace/integrand.cpp b/madspace/src/phasespace/integrand.cpp
index fa5453a41..ef86b23b0 100644
--- a/madspace/src/phasespace/integrand.cpp
+++ b/madspace/src/phasespace/integrand.cpp
@@ -129,14 +129,14 @@ Integrand::Integrand(
                 }
                 if (flav_count > 1 &&
                     !std::holds_alternative<std::monostate>(discrete_after)) {
-                    ret_types.push_back("flavor_index", batch_int);
+                    ret_types.push_back("discrete_flavor_index", batch_int);
                 }
             }
             if (flags & return_discrete_latent) {
                 ret_types.push_back("channel_index_in_group", batch_int);
                 if (flav_count > 1 &&
                     !std::holds_alternative<std::monostate>(discrete_after)) {
-                    ret_types.push_back("flavor_index", batch_int);
+                    ret_types.push_back("discrete_flavor_index", batch_int);
                     if (pdf_grid && energy_scale) {
                         ret_types.push_back("pdf_prior", batch_float_array(flav_count));
                     }
@@ -640,7 +640,8 @@ NamedVector<Value> Integrand::build_common_part(
             !std::holds_alternative<std::monostate>(_discrete_after)) {
             auto zeros = fb.full({static_cast<me_int_t>(0), args.batch_size});
             outputs.push_back(
-                "flavor_index", scatter_or_drop(fb, result, zeros, result.flavor_id())
+                "discrete_flavor_index",
+                scatter_or_drop(fb, result, zeros, result.flavor_id())
             );
         }
     }
@@ -653,7 +654,8 @@ NamedVector<Value> Integrand::build_common_part(
             !std::holds_alternative<std::monostate>(_discrete_after)) {
             auto zeros = fb.full({static_cast<me_int_t>(0), args.batch_size});
             outputs.push_back(
-                "flavor_index", scatter_or_drop(fb, result, zeros, result.flavor_id())
+                "discrete_flavor_index",
+                scatter_or_drop(fb, result, zeros, result.flavor_id())
             );
             if (args.has_pdf_prior) {
                 auto flav_count = _diff_xs.pid_options().size();
@@ -795,7 +797,7 @@ IntegrandProbability::IntegrandProbability(const Integrand& integrand) :
             auto flavor_count = integrand._diff_xs.pid_options().size();
             if (flavor_count > 1 &&
                 !std::holds_alternative<std::monostate>(integrand._discrete_after)) {
-                arg_types.push_back("flavor_index", batch_int);
+                arg_types.push_back("discrete_flavor_index", batch_int);
                 if ((integrand._pdfs.at(0) || integrand._pdfs.at(1)) &&
                     integrand._energy_scale) {
                     arg_types.push_back("pdf_prior", batch_float_array(flavor_count));