From f1431e6fd4f655dcc24867d3844d067efbfbc198 Mon Sep 17 00:00:00 2001
From: Theo Heimel <info@theovention.de>
Date: Fri, 15 May 2026 12:26:34 +0200
Subject: [PATCH 01/10] single mallocAsync call in umami on GPU

---
 .../iolibs/template_files/madmatrix/umami.cc  | 65 ++++++++++---------
 1 file changed, 33 insertions(+), 32 deletions(-)

diff --git a/madgraph/iolibs/template_files/madmatrix/umami.cc b/madgraph/iolibs/template_files/madmatrix/umami.cc
index 05731edf8..2a65060d5 100644
--- a/madgraph/iolibs/template_files/madmatrix/umami.cc
+++ b/madgraph/iolibs/template_files/madmatrix/umami.cc
@@ -336,22 +336,38 @@ extern "C"
     unsigned int *flavor_indices, *diagram_index;
 
     std::size_t n_coup = mg5amcGpu::Parameters_dependentCouplings::ndcoup;
-    gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream );
-    gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream );
-    gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream );
-    gpuMallocAsync( &flavor_indices, rounded_count * sizeof( unsigned int ), gpu_stream );
-    gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream );
-    gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream );
-    gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream );
-    gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream );
-    gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream );
-    gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
-    gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
-    gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
-    gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream );
-    gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream );
-    gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
-    gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+    std::array<std::pair<void**, std::size_t>, 16> ptrs_and_sizes = {{
+        {&momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype )},
+        {&couplings, rounded_count * n_coup * 2 * sizeof( fptype )},
+        {&g_s, rounded_count * sizeof( fptype )},
+        {&flavor_indices, rounded_count * sizeof( unsigned int )},
+        {&helicity_random, rounded_count * sizeof( fptype )},
+        {&color_random, rounded_count * sizeof( fptype )},
+        {&diagram_random, rounded_count * sizeof( fptype )},
+        {&matrix_elements, rounded_count * sizeof( fptype )},
+        {&diagram_index, rounded_count * sizeof( unsigned int )},
+        {&color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype )},
+        {&numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype )},
+        {&denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype )},
+        {&helicity_index, rounded_count * sizeof( int )},
+        {&color_index, rounded_count * sizeof( int )},
+        {&ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype )},
+        {&ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype )},
+    }};
+    std::size_t total_size = 0;
+    for (auto [ptr, size] : ptrs_and_sizes) {
+        std::size_t aligned_size = (size + 7) / 8 * 8;
+        total_size += aligned_size;
+    }
+    uint8_t* buffer;
+    // we can consider caching this between matrix element calls
+    gpuMallocAsync( &buffer, total_size, gpu_stream );
+    std::size_t offset = 0;
+    for (auto [ptr, size] : ptrs_and_sizes) {
+        std::size_t aligned_size = (size + 7) / 8 * 8;
+        *ptr = buffer + offset;
+        offset += aligned_size;
+    }
 
     copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
       momenta_in,
@@ -424,22 +440,7 @@ extern "C"
       offset );
     checkGpu( gpuPeekAtLastError() );
 
-    gpuFreeAsync( momenta, gpu_stream );
-    gpuFreeAsync( couplings, gpu_stream );
-    gpuFreeAsync( flavor_indices, gpu_stream );
-    gpuFreeAsync( g_s, gpu_stream );
-    gpuFreeAsync( helicity_random, gpu_stream );
-    gpuFreeAsync( color_random, gpu_stream );
-    gpuFreeAsync( diagram_random, gpu_stream );
-    gpuFreeAsync( matrix_elements, gpu_stream );
-    gpuFreeAsync( diagram_index, gpu_stream );
-    gpuFreeAsync( color_jamps, gpu_stream );
-    gpuFreeAsync( numerators, gpu_stream );
-    gpuFreeAsync( denominators, gpu_stream );
-    gpuFreeAsync( helicity_index, gpu_stream );
-    gpuFreeAsync( color_index, gpu_stream );
-    gpuFreeAsync( ghel_matrix_elements, gpu_stream );
-    gpuFreeAsync( ghel_jamps, gpu_stream );
+    gpuFreeAsync( buffer, gpu_stream );
 #else  // MGONGPUCPP_GPUIMPL
     // need to round to round to double page size for some reason
     std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;

From 082c05391548dba853826507b4c3e5ab4b010217 Mon Sep 17 00:00:00 2001
From: Theo Heimel <info@theovention.de>
Date: Fri, 15 May 2026 12:40:46 +0200
Subject: [PATCH 02/10] fix common memory allocation

---
 .../iolibs/template_files/madmatrix/umami.cc  | 58 ++++++++++---------
 1 file changed, 30 insertions(+), 28 deletions(-)

diff --git a/madgraph/iolibs/template_files/madmatrix/umami.cc b/madgraph/iolibs/template_files/madmatrix/umami.cc
index 2a65060d5..0ccf38b9a 100644
--- a/madgraph/iolibs/template_files/madmatrix/umami.cc
+++ b/madgraph/iolibs/template_files/madmatrix/umami.cc
@@ -12,6 +12,8 @@
 #include "MemoryBuffers.h"
 
 #include <cmath>
+#include <array>
+#include <utility>
 
 #ifdef MGONGPUCPP_GPUIMPL
 using namespace mg5amcGpu;
@@ -216,19 +218,19 @@ extern "C"
   }
 
   UmamiStatus umami_set_parameter(
-    UmamiHandle handle,
-    char const* name,
-    double parameter_real,
-    double parameter_imag )
+    [[maybe_unused]] UmamiHandle handle,
+    [[maybe_unused]] char const* name,
+    [[maybe_unused]] double parameter_real,
+    [[maybe_unused]] double parameter_imag )
   {
     return UMAMI_ERROR_NOT_IMPLEMENTED;
   }
 
   UmamiStatus umami_get_parameter(
-    UmamiHandle handle,
-    char const* name,
-    double* parameter_real,
-    double* parameter_imag )
+    [[maybe_unused]] UmamiHandle handle,
+    [[maybe_unused]] char const* name,
+    [[maybe_unused]] double* parameter_real,
+    [[maybe_unused]] double* parameter_imag )
   {
     return UMAMI_ERROR_NOT_IMPLEMENTED;
   }
@@ -251,7 +253,7 @@ extern "C"
     const double* random_color_in = nullptr;
     const double* random_helicity_in = nullptr;
     const double* random_diagram_in = nullptr;
-    const int* diagram_in = nullptr; // TODO: unused
+    //const int* diagram_in = nullptr; // TODO: unused
 
     for( std::size_t i = 0; i < input_count; ++i )
     {
@@ -337,22 +339,22 @@ extern "C"
 
     std::size_t n_coup = mg5amcGpu::Parameters_dependentCouplings::ndcoup;
     std::array<std::pair<void**, std::size_t>, 16> ptrs_and_sizes = {{
-        {&momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype )},
-        {&couplings, rounded_count * n_coup * 2 * sizeof( fptype )},
-        {&g_s, rounded_count * sizeof( fptype )},
-        {&flavor_indices, rounded_count * sizeof( unsigned int )},
-        {&helicity_random, rounded_count * sizeof( fptype )},
-        {&color_random, rounded_count * sizeof( fptype )},
-        {&diagram_random, rounded_count * sizeof( fptype )},
-        {&matrix_elements, rounded_count * sizeof( fptype )},
-        {&diagram_index, rounded_count * sizeof( unsigned int )},
-        {&color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype )},
-        {&numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype )},
-        {&denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype )},
-        {&helicity_index, rounded_count * sizeof( int )},
-        {&color_index, rounded_count * sizeof( int )},
-        {&ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype )},
-        {&ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype )},
+        {reinterpret_cast<void**>(&momenta), rounded_count * CPPProcess::npar * 4 * sizeof( fptype )},
+        {reinterpret_cast<void**>(&couplings), rounded_count * n_coup * 2 * sizeof( fptype )},
+        {reinterpret_cast<void**>(&g_s), rounded_count * sizeof( fptype )},
+        {reinterpret_cast<void**>(&flavor_indices), rounded_count * sizeof( unsigned int )},
+        {reinterpret_cast<void**>(&helicity_random), rounded_count * sizeof( fptype )},
+        {reinterpret_cast<void**>(&color_random), rounded_count * sizeof( fptype )},
+        {reinterpret_cast<void**>(&diagram_random), rounded_count * sizeof( fptype )},
+        {reinterpret_cast<void**>(&matrix_elements), rounded_count * sizeof( fptype )},
+        {reinterpret_cast<void**>(&diagram_index), rounded_count * sizeof( unsigned int )},
+        {reinterpret_cast<void**>(&color_jamps), rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype )},
+        {reinterpret_cast<void**>(&numerators), rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype )},
+        {reinterpret_cast<void**>(&denominators), rounded_count * CPPProcess::ncomb * sizeof( fptype )},
+        {reinterpret_cast<void**>(&helicity_index), rounded_count * sizeof( int )},
+        {reinterpret_cast<void**>(&color_index), rounded_count * sizeof( int )},
+        {reinterpret_cast<void**>(&ghel_matrix_elements), rounded_count * CPPProcess::ncomb * sizeof( fptype )},
+        {reinterpret_cast<void**>(&ghel_jamps), rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype )},
     }};
     std::size_t total_size = 0;
     for (auto [ptr, size] : ptrs_and_sizes) {
@@ -362,11 +364,11 @@ extern "C"
     uint8_t* buffer;
     // we can consider caching this between matrix element calls
     gpuMallocAsync( &buffer, total_size, gpu_stream );
-    std::size_t offset = 0;
+    std::size_t buf_offset = 0;
     for (auto [ptr, size] : ptrs_and_sizes) {
         std::size_t aligned_size = (size + 7) / 8 * 8;
-        *ptr = buffer + offset;
-        offset += aligned_size;
+        *ptr = buffer + buf_offset;
+        buf_offset += aligned_size;
     }
 
     copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(

From b61c32cb75881d392770e627a04b16ef6a2ad4f2 Mon Sep 17 00:00:00 2001
From: Theo Heimel <info@theovention.de>
Date: Fri, 15 May 2026 12:42:08 +0200
Subject: [PATCH 03/10] update run_card.toml

---
 madgraph/iolibs/template_files/mg7/run_card.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/madgraph/iolibs/template_files/mg7/run_card.toml b/madgraph/iolibs/template_files/mg7/run_card.toml
index f965b2707..de6cacb8b 100644
--- a/madgraph/iolibs/template_files/mg7/run_card.toml
+++ b/madgraph/iolibs/template_files/mg7/run_card.toml
@@ -15,6 +15,7 @@ output_format = "compact_npy" # options: compact_npy, lhe_npy, lhe
 verbosity = "pretty" # options: silent, pretty, log
 dummy_matrix_element = false
 save_gridpack = false
+gridpack_include_source = false
 
 [beam]
 e_cm = 13000.0

From 4f11d133cccc5e3b9848437c3eef815f7bd452e8 Mon Sep 17 00:00:00 2001
From: Theo Heimel <info@theovention.de>
Date: Fri, 15 May 2026 12:47:01 +0200
Subject: [PATCH 04/10] fix unused warning, fix madnis.enable setting

---
 madgraph/iolibs/template_files/madmatrix/umami.cc | 2 +-
 madgraph/iolibs/template_files/mg7/madevent.py    | 7 +++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/madgraph/iolibs/template_files/madmatrix/umami.cc b/madgraph/iolibs/template_files/madmatrix/umami.cc
index 0ccf38b9a..932a8a7b3 100644
--- a/madgraph/iolibs/template_files/madmatrix/umami.cc
+++ b/madgraph/iolibs/template_files/madmatrix/umami.cc
@@ -253,7 +253,7 @@ extern "C"
     const double* random_color_in = nullptr;
     const double* random_helicity_in = nullptr;
     const double* random_diagram_in = nullptr;
-    //const int* diagram_in = nullptr; // TODO: unused
+    [[maybe_unused]] const int* diagram_in = nullptr; // TODO: unused
 
     for( std::size_t i = 0; i < input_count; ++i )
     {
diff --git a/madgraph/iolibs/template_files/mg7/madevent.py b/madgraph/iolibs/template_files/mg7/madevent.py
index 47e7e680d..c2120e892 100644
--- a/madgraph/iolibs/template_files/mg7/madevent.py
+++ b/madgraph/iolibs/template_files/mg7/madevent.py
@@ -393,12 +393,15 @@ def survey(self) -> None:
 
     def train_madnis(self) -> None:
         madnis_args = self.run_card["madnis"]
-        gen_args = self.run_card["generation"]
-        run_args = self.run_card["run"]
+        if not madnis_args["enable"]:
+            return
         if madnis_args.get("old", False):
             self.train_madnis_old()
             return
 
+        gen_args = self.run_card["generation"]
+        run_args = self.run_card["run"]
+
         config = ms.MadnisConfig()
         config.verbosity = run_args["verbosity"]
         config.learning_rate = madnis_args["lr"]

From fb610e9a7ec2d28bf3ff253d9871c5609597a1f9 Mon Sep 17 00:00:00 2001
From: Theo Heimel <info@theovention.de>
Date: Thu, 21 May 2026 15:06:26 +0200
Subject: [PATCH 05/10] fully async gpu matrix elements

---
 .../template_files/madmatrix/GpuAbstraction.h |  2 +
 .../madmatrix/MatrixElementKernels.cc         |  2 +-
 .../template_files/madmatrix/color_sum.cc     | 40 +++++++++++--------
 .../template_files/madmatrix/color_sum.h      | 24 ++++++-----
 .../process_function_definitions.inc          |  3 +-
 .../template_files/madmatrix/process_h.inc    |  1 +
 .../madmatrix/process_sigmaKin_function.inc   | 28 +++++++------
 .../iolibs/template_files/madmatrix/umami.cc  | 15 +------
 madmatrix/model_handling.py                   | 10 ++++-
 9 files changed, 70 insertions(+), 55 deletions(-)

diff --git a/madgraph/iolibs/template_files/madmatrix/GpuAbstraction.h b/madgraph/iolibs/template_files/madmatrix/GpuAbstraction.h
index ac2cbc5a1..6f709dfe9 100644
--- a/madgraph/iolibs/template_files/madmatrix/GpuAbstraction.h
+++ b/madgraph/iolibs/template_files/madmatrix/GpuAbstraction.h
@@ -46,6 +46,7 @@
 #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
 //#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_>
 #define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+#define gpuLaunchKernel2D( kernel, blocks_x, blocks_y, threads, stream, ... ) kernel<<<dim3(blocks_x, blocks_y, 1), dim3(threads, 1, 1), 0, stream>>>( __VA_ARGS__ )
 
 #define gpuStream_t cudaStream_t
 #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) )
@@ -113,6 +114,7 @@
 #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
 //#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_>
 #define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+#define gpuLaunchKernel2D( kernel, blocks_x, blocks_y, threads, stream, ... ) kernel<<<dim3(blocks_x, blocks_y, 1), dim3(threads, 1, 1), 0, stream>>>( __VA_ARGS__ )
 
 #define gpuStream_t hipStream_t
 #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) )
diff --git a/madgraph/iolibs/template_files/madmatrix/MatrixElementKernels.cc b/madgraph/iolibs/template_files/madmatrix/MatrixElementKernels.cc
index a019296df..872e4795e 100644
--- a/madgraph/iolibs/template_files/madmatrix/MatrixElementKernels.cc
+++ b/madgraph/iolibs/template_files/madmatrix/MatrixElementKernels.cc
@@ -490,7 +490,7 @@ namespace mg5amcGpu
     gpuBlasHandle_t* pBlasHandle = nullptr;
 #endif
     const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    sigmaKin( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, false, m_gpublocks, m_gputhreads );
 #ifdef MGONGPU_CHANNELID_DEBUG
     //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl;
     copyHostFromDevice( m_hstChannelIds, m_channelIds ); // FIXME?!
diff --git a/madgraph/iolibs/template_files/madmatrix/color_sum.cc b/madgraph/iolibs/template_files/madmatrix/color_sum.cc
index fcfa4ee51..75202e963 100644
--- a/madgraph/iolibs/template_files/madmatrix/color_sum.cc
+++ b/madgraph/iolibs/template_files/madmatrix/color_sum.cc
@@ -377,21 +377,25 @@ namespace mg5amcCpu
                  gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
                  gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
                  const int nGoodHel,               // input: number of good helicities
-                 const int gpublocks,              // input: cuda gpublocks
-                 const int gputhreads )            // input: cuda gputhreads
+                 const int gputhreads,             // input: cuda gputhreads
+                 const bool async )                // input: if true, run everything asynchronously in first stream in ghelStreams
   {
     const int nevt = gpublocks * gputhreads;
     // CASE 1: KERNEL
     if( !pBlasHandle )
     {
       assert( ghelAllBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
-      // Loop over helicities
-      for( int ighel = 0; ighel < nGoodHel; ighel++ )
-      {
-        fptype* hAllMEs = ghelAllMEs + ighel * nevt;           // MEs for one specific helicity ighel
-        const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel
-        gpuStream_t hStream = ghelStreams[ighel];
-        gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel );
+      if (async) {
+        gpuLaunchKernel2D( calculate_jamps, gpublocks, nGoodHel, gputhreads, ghelStreams[0], ghelAllMEs, ghelAllJamps, nGoodHel, nevt );
+      } else {
+        // Loop over helicities
+        for( int ighel = 0; ighel < nGoodHel; ighel++ )
+        {
+          fptype* hAllMEs = ghelAllMEs + ighel * nevt;           // MEs for one specific helicity ighel
+          const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel
+          gpuStream_t hStream = ghelStreams[ighel];
+          gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel, 0 );
+        }
       }
     }
     // CASE 2: BLAS
@@ -400,15 +404,19 @@ namespace mg5amcCpu
 #ifdef MGONGPU_HAS_NO_BLAS
       assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
 #else
-      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
-      // Reset the tmp buffer
+      if (async) {
+        assert( false ); // BLAS in async mode not supported for now
+      } else {
+        checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
+        // Reset the tmp buffer
 #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
+        gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
 #else
-      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
-#endif
-      // Delegate the color sum to BLAS for 
-      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
+        gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
+#endif  
+        // Delegate the color sum to BLAS for 
+        color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
+      }
 #endif
     }
   }
diff --git a/madgraph/iolibs/template_files/madmatrix/color_sum.h b/madgraph/iolibs/template_files/madmatrix/color_sum.h
index b64febcdb..347184c4e 100644
--- a/madgraph/iolibs/template_files/madmatrix/color_sum.h
+++ b/madgraph/iolibs/template_files/madmatrix/color_sum.h
@@ -78,23 +78,25 @@ namespace mg5amcCpu
 
 #ifdef MGONGPUCPP_GPUIMPL
   void
-  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
-                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
-                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
-                 const int nGoodHel,           // input: number of good helicities
-                 const int gpublocks,          // input: cuda gpublocks
-                 const int gputhreads );       // input: cuda gputhreads
+  color_sum_gpu( fptype* ghelAllMEs,               // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,       // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,          // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,               // input: number of good helicities
+                 const int gpublocks,              // input: cuda gpublocks
+                 const int gputhreads,             // input: cuda gputhreads
+                 const bool processAllHelicities); // input: if true, use blockIdx.y to index helicities
 #endif
 
   //--------------------------------------------------------------------------
 
 #ifdef MGONGPUCPP_GPUIMPL
   __global__ void
-  color_sum_kernel( fptype* allMEs,         // output: allMEs[nevt], add |M|^2 for one specific helicity
-                    const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
-                    const int nGoodHel );   // input: number of good helicities
+  color_sum_kernel( fptype* allMEs,                 // output: allMEs[nevt], add |M|^2 for one specific helicity
+                    const fptype* allJamps,         // input: jamp[ncolor*2*nevt] for one specific helicity
+                    const int nGoodHel,             // input: number of good helicities
+                    const int nevtIfAllHelicities); // input: zero in single-helicity mode, number of events in multi-helicity mode
 #endif
 
   //--------------------------------------------------------------------------
diff --git a/madgraph/iolibs/template_files/madmatrix/process_function_definitions.inc b/madgraph/iolibs/template_files/madmatrix/process_function_definitions.inc
index c0f130b95..ee36fb590 100644
--- a/madgraph/iolibs/template_files/madmatrix/process_function_definitions.inc
+++ b/madgraph/iolibs/template_files/madmatrix/process_function_definitions.inc
@@ -462,7 +462,7 @@ namespace mg5amcCpu
       // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
       constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
       gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, iflavorVec, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
-      gpuLaunchKernel( color_sum_kernel, gpublocks, gputhreads, allMEs, allJamps, nOneHel );
+      gpuLaunchKernel( color_sum_kernel, gpublocks, gputhreads, allMEs, allJamps, nOneHel, 0 );
       gpuMemcpy( hstMEs, allMEs, maxtry * sizeof( fptype ), gpuMemcpyDeviceToHost );
       //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << std::endl;
       for( int ievt = 0; ievt < maxtry; ++ievt )
@@ -806,6 +806,7 @@ namespace mg5amcCpu
             fptype2* ghelAllBlasTmp,            // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             gpuBlasHandle_t* pBlasHandle,       // input: cuBLAS/hipBLAS handle
             gpuStream_t* ghelStreams,           // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+            const bool async,
             const int gpublocks,                // input: cuda gpublocks
             const int gputhreads                // input: cuda gputhreads
 #else
diff --git a/madgraph/iolibs/template_files/madmatrix/process_h.inc b/madgraph/iolibs/template_files/madmatrix/process_h.inc
index 3fb87b052..fb715a877 100644
--- a/madgraph/iolibs/template_files/madmatrix/process_h.inc
+++ b/madgraph/iolibs/template_files/madmatrix/process_h.inc
@@ -99,6 +99,7 @@ namespace mg5amcCpu
             fptype2* ghelAllBlasTmp,            // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
             gpuBlasHandle_t* pBlasHandle,       // input: cuBLAS/hipBLAS handle
             gpuStream_t* ghelStreams,           // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+            const bool async,                   // input: if true, run everything asynchronously in first stream in ghelStreams
             const int gpublocks,                // input: cuda gpublocks
             const int gputhreads );             // input: cuda gputhreads
 #else
diff --git a/madgraph/iolibs/template_files/madmatrix/process_sigmaKin_function.inc b/madgraph/iolibs/template_files/madmatrix/process_sigmaKin_function.inc
index 6db9d8079..eafb0eece 100644
--- a/madgraph/iolibs/template_files/madmatrix/process_sigmaKin_function.inc
+++ b/madgraph/iolibs/template_files/madmatrix/process_sigmaKin_function.inc
@@ -51,19 +51,23 @@
     // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
     // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
     // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
-    for( int ighel = 0; ighel < cNGoodHel; ighel++ )
-    {
-      const int ihel = cGoodHel[ighel];
-      fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
-      fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
-      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, iflavorVec, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+    if (async) {
+      gpuLaunchKernel2D( calculate_jamps, gpublocks, cNGoodHel, gputhreads, ghelStreams[0], allmomenta, allcouplings, ghelAllJamps, allChannelIds, ghelAllNumerators, ghelAllDenominators, colAllJamp2s, nevt, true );
+    } else {
+      for( int ighel = 0; ighel < cNGoodHel; ighel++ )
+      {
+        const int ihel = cGoodHel[ighel];
+        fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
+        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
+        fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
+        bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+        gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, iflavorVec, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt, false );
+      }
+      // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps
+      color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads );
+      checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed
+      // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
     }
-    // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps
-    color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads );
-    checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed
-    // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
     // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
     gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
 
diff --git a/madgraph/iolibs/template_files/madmatrix/umami.cc b/madgraph/iolibs/template_files/madmatrix/umami.cc
index 932a8a7b3..05ad57498 100644
--- a/madgraph/iolibs/template_files/madmatrix/umami.cc
+++ b/madgraph/iolibs/template_files/madmatrix/umami.cc
@@ -158,9 +158,6 @@ namespace
   struct InterfaceInstance
   {
     bool initialized = false;
-#ifdef MGONGPUCPP_GPUIMPL
-    gpuStream_t hel_streams[CPPProcess::ncomb];
-#endif
   };
 
 }
@@ -208,12 +205,6 @@ extern "C"
     process.initProc( param_card_path );
     auto instance = new InterfaceInstance();
     *handle = instance;
-#ifdef MGONGPUCPP_GPUIMPL
-    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
-    {
-      gpuStreamCreate( &instance->hel_streams[ihel] );
-    }
-#endif
     return UMAMI_SUCCESS;
   }
 
@@ -389,9 +380,6 @@ extern "C"
       offset );
     computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings );
     checkGpu( gpuPeekAtLastError() );
-    // TODO: make things fully async (requires using events instead of synchronize in
-    //       the sigmaKin implementation)
-    gpuStreamSynchronize( gpu_stream );
 
     InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
     if( !instance->initialized )
@@ -421,7 +409,8 @@ extern "C"
       ghel_jamps,
       nullptr,
       nullptr,
-      instance->hel_streams,
+      &gpu_stream,
+      true,
       n_blocks,
       n_threads );
 
diff --git a/madmatrix/model_handling.py b/madmatrix/model_handling.py
index 94215209b..e1e007fab 100644
--- a/madmatrix/model_handling.py
+++ b/madmatrix/model_handling.py
@@ -1725,7 +1725,8 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name):
                    fptype* allNumerators,             // input/output: multichannel numerators[nevt], add helicity ihel
                    fptype* allDenominators,           // input/output: multichannel denominators[nevt], add helicity ihel
                    fptype* colAllJamp2s,              // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
-                   const int nevt                     // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                   const int nevt,                    // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                   const bool processAllHelicities    // input: if true, use blockIdx.y to index helicities
 #else
                    cxtype_sv* allJamp_sv,             // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
                    bool storeChannelWeights,
@@ -1765,6 +1766,13 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name):
     //if( debug ) printf( \"calculate_jamps: ievt00=%d ihel=%2d\\n\", ievt00, ihel );
 #else
     //const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+    if (processAllHelicities) {
+      int ighel = blockIdx.y;
+      ihel = dcGoodHel[ighel];
+      allJamps = allJamps + ighel * nevt;
+      allNumerators = allNumerators + ighel * nevt;
+      allDenominators = allDenominators + ighel * nevt;
+    }
     //debug = ( ievt == 0 );
     //if( debug ) printf( \"calculate_jamps: ievt=%6d ihel=%2d\\n\", ievt, ihel );
 #endif /* clang-format on */""")

From a85f8ca3f5d7b749e25975c841eba29909607c28 Mon Sep 17 00:00:00 2001
From: Theo Heimel <info@theovention.de>
Date: Fri, 22 May 2026 12:23:58 +0200
Subject: [PATCH 06/10] reorder events in simd mode such that the flavor index
 is the same for all events in a vector

---
 .../iolibs/template_files/madmatrix/umami.cc  | 82 ++++++++++++++-----
 1 file changed, 63 insertions(+), 19 deletions(-)

diff --git a/madgraph/iolibs/template_files/madmatrix/umami.cc b/madgraph/iolibs/template_files/madmatrix/umami.cc
index 05ad57498..bc3e74c90 100644
--- a/madgraph/iolibs/template_files/madmatrix/umami.cc
+++ b/madgraph/iolibs/template_files/madmatrix/umami.cc
@@ -12,6 +12,7 @@
 #include "MemoryBuffers.h"
 
 #include <cmath>
+#include <vector>
 #include <array>
 #include <utility>
 
@@ -74,18 +75,18 @@ namespace
   __device__
 #endif
     void
-    transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride )
+    transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event_in, std::size_t i_event_out, std::size_t stride )
   {
     std::size_t page_size = MemoryAccessMomentaBase::neppM;
-    std::size_t i_page = i_event / page_size;
-    std::size_t i_vector = i_event % page_size;
+    std::size_t i_page = i_event_out / page_size;
+    std::size_t i_vector = i_event_out % page_size;
 
     for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part )
     {
       for( std::size_t i_mom = 0; i_mom < 4; ++i_mom )
       {
         momenta_out[i_page * CPPProcess::npar * 4 * page_size +
-                    i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event];
+                    i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event_in];
       }
     }
   }
@@ -112,7 +113,7 @@ namespace
     std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
     if( i_event >= count ) return;
 
-    transpose_momenta( &momenta_in[offset], momenta, i_event, stride );
+    transpose_momenta( &momenta_in[offset], momenta, i_event, i_event, stride );
     diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5;
     helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5;
     color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5;
@@ -433,14 +434,45 @@ extern "C"
 
     gpuFreeAsync( buffer, gpu_stream );
 #else  // MGONGPUCPP_GPUIMPL
+    constexpr std::size_t vector_size = MemoryAccessMomentaBase::neppM;
     // need to round to round to double page size for some reason
-    std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
-    std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+    constexpr std::size_t page_size2 = 2 * vector_size;
+    std::vector<std::size_t> permutation;
+    std::size_t rounded_count;
+
+    constexpr std::size_t flavor_count = CPPProcess::nmaxflavor;
+    HostBufferBase<unsigned int, false> flavor_indices( ((count + page_size2 - 1) / page_size2 + flavor_count) * page_size2 );
+    bool sort_flavors = vector_size > 1 && flavor_count > 1 && flavor_indices_in;
+    if ( sort_flavors ) {
+      permutation.resize(count);
+      std::size_t voffset = 0;
+      std::size_t vector_indices[flavor_count] = {};
+      std::size_t vector_counts[flavor_count] = {};
+      // determine permutation of inputs such that all entries in a SIMD vector
+      // have the same flavor index
+      for( std::size_t i_event = 0; i_event < count; ++i_event )
+      {
+        unsigned int flav = flavor_indices_in[i_event + offset];
+        auto& vcount = vector_counts[flav];
+        auto& vindex = vector_indices[flav];
+        if (vcount == 0) {
+          vindex = voffset * page_size2;
+          for ( std::size_t i = 0; i < page_size2; ++i) {
+            flavor_indices[voffset * page_size2 + i] = flav;
+          }
+          voffset += 1;
+        }
+        permutation[i_event] = vindex + vcount;
+        vcount = (vcount + 1) % page_size2;
+      }
+      rounded_count = voffset * page_size2;
+    } else {
+      rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+    }
 
     HostBufferBase<fptype, false> momenta( rounded_count * CPPProcess::npar * 4 );
     HostBufferBase<fptype, false> couplings( rounded_count * mg5amcCpu::Parameters_dependentCouplings::ndcoup * 2 );
     HostBufferBase<fptype, false> g_s( rounded_count );
-    HostBufferBase<unsigned int, false> flavor_indices( rounded_count );
     HostBufferBase<fptype, false> helicity_random( rounded_count );
     HostBufferBase<fptype, false> color_random( rounded_count );
     HostBufferBase<fptype, false> diagram_random( rounded_count );
@@ -450,17 +482,29 @@ extern "C"
     HostBufferBase<fptype, false> denominators( rounded_count );
     HostBufferBase<int, false> helicity_index( rounded_count );
     HostBufferBase<int, false> color_index( rounded_count );
-    for( std::size_t i_event = 0; i_event < count; ++i_event )
-    {
-      transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride );
-      helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
-      color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
-      diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
-      g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195;
-      flavor_indices[i_event] = flavor_indices_in ? flavor_indices_in[i_event + offset] : 0;
-    }
-    for ( std::size_t i_event = count; i_event < rounded_count; ++i_event ) {
-      flavor_indices[i_event] = 0;
+    if ( sort_flavors ) {
+      for( std::size_t i_event = 0; i_event < count; ++i_event )
+      {
+        std::size_t i_sorted = permutation[i_event];
+        transpose_momenta( &momenta_in[offset], momenta.data(), i_event, i_sorted, stride );
+        helicity_random[i_sorted] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+        color_random[i_sorted] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+        diagram_random[i_sorted] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+        g_s[i_sorted] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195;
+      }
+    } else {
+      for( std::size_t i_event = 0; i_event < count; ++i_event )
+      {
+        transpose_momenta( &momenta_in[offset], momenta.data(), i_event, i_event, stride );
+        helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+        color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+        diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+        g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195;
+        flavor_indices[i_event] = flavor_indices_in ? flavor_indices_in[i_event + offset] : 0;
+      }
+      for ( std::size_t i_event = count; i_event < rounded_count; ++i_event ) {
+        flavor_indices[i_event] = 0;
+      }
     }
     computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
 

From cfd6aed3188546ab16a49df4f26286625dbdec56 Mon Sep 17 00:00:00 2001
From: Theo Heimel <info@theovention.de>
Date: Fri, 22 May 2026 14:16:56 +0200
Subject: [PATCH 07/10] apply flavor ordering to matrix element outputs

---
 .../iolibs/template_files/madmatrix/umami.cc  | 88 +++++++++++++------
 1 file changed, 63 insertions(+), 25 deletions(-)

diff --git a/madgraph/iolibs/template_files/madmatrix/umami.cc b/madgraph/iolibs/template_files/madmatrix/umami.cc
index bc3e74c90..aa457c7a6 100644
--- a/madgraph/iolibs/template_files/madmatrix/umami.cc
+++ b/madgraph/iolibs/template_files/madmatrix/umami.cc
@@ -443,7 +443,8 @@ extern "C"
     constexpr std::size_t flavor_count = CPPProcess::nmaxflavor;
     HostBufferBase<unsigned int, false> flavor_indices( ((count + page_size2 - 1) / page_size2 + flavor_count) * page_size2 );
     bool sort_flavors = vector_size > 1 && flavor_count > 1 && flavor_indices_in;
-    if ( sort_flavors ) {
+    if ( sort_flavors ) 
+    {
       permutation.resize(count);
       std::size_t voffset = 0;
       std::size_t vector_indices[flavor_count] = {};
@@ -455,7 +456,8 @@ extern "C"
         unsigned int flav = flavor_indices_in[i_event + offset];
         auto& vcount = vector_counts[flav];
         auto& vindex = vector_indices[flav];
-        if (vcount == 0) {
+        if ( vcount == 0 )
+        {
           vindex = voffset * page_size2;
           for ( std::size_t i = 0; i < page_size2; ++i) {
             flavor_indices[voffset * page_size2 + i] = flav;
@@ -539,35 +541,71 @@ extern "C"
       false,
       rounded_count );
 
-    std::size_t page_size = MemoryAccessMomentaBase::neppM;
-    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    if ( sort_flavors )
     {
-      std::size_t i_page = i_event / page_size;
-      std::size_t i_vector = i_event % page_size;
-
-      double denominator = denominators[i_event];
-      if( m2_out != nullptr )
-      {
-        m2_out[i_event + offset] = matrix_elements[i_event];
-      }
-      if( amp2_out != nullptr )
+      for( std::size_t i_event = 0; i_event < count; ++i_event )
       {
-        for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+        std::size_t i_sorted = permutation[i_event];
+        std::size_t page_size = MemoryAccessMomentaBase::neppM;
+        std::size_t i_page = i_sorted / page_size;
+        std::size_t i_vector = i_sorted % page_size;
+
+        double denominator = denominators[i_sorted];
+        if( m2_out != nullptr )
         {
-          amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+          m2_out[i_event + offset] = matrix_elements[i_sorted];
+        }
+        if( amp2_out != nullptr )
+        {
+          for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+          {
+            amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+          }
+        }
+        if( diagram_out != nullptr )
+        {
+          diagram_out[i_event + offset] = diagram_index[i_sorted] - 1;
+        }
+        if( color_out != nullptr )
+        {
+          color_out[i_event + offset] = color_index[i_sorted] - 1;
+        }
+        if( helicity_out != nullptr )
+        {
+          helicity_out[i_event + offset] = helicity_index[i_sorted] - 1;
         }
       }
-      if( diagram_out != nullptr )
-      {
-        diagram_out[i_event + offset] = diagram_index[i_event] - 1;
-      }
-      if( color_out != nullptr )
-      {
-        color_out[i_event + offset] = color_index[i_event] - 1;
-      }
-      if( helicity_out != nullptr )
+    } else {
+      std::size_t page_size = MemoryAccessMomentaBase::neppM;
+      for( std::size_t i_event = 0; i_event < count; ++i_event )
       {
-        helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+        std::size_t i_page = i_event / page_size;
+        std::size_t i_vector = i_event % page_size;
+
+        double denominator = denominators[i_event];
+        if( m2_out != nullptr )
+        {
+          m2_out[i_event + offset] = matrix_elements[i_event];
+        }
+        if( amp2_out != nullptr )
+        {
+          for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+          {
+            amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+          }
+        }
+        if( diagram_out != nullptr )
+        {
+          diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+        }
+        if( color_out != nullptr )
+        {
+          color_out[i_event + offset] = color_index[i_event] - 1;
+        }
+        if( helicity_out != nullptr )
+        {
+          helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+        }
       }
     }
 #endif // MGONGPUCPP_GPUIMPL

From d147d36ea0ae0803207663890bcb381c69505902 Mon Sep 17 00:00:00 2001
From: Theo Heimel <info@theovention.de>
Date: Wed, 27 May 2026 16:37:14 +0200
Subject: [PATCH 08/10] fix SIMD flavor selection

---
 madmatrix/model_handling.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/madmatrix/model_handling.py b/madmatrix/model_handling.py
index e1e007fab..95a16eee8 100644
--- a/madmatrix/model_handling.py
+++ b/madmatrix/model_handling.py
@@ -2308,10 +2308,11 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi
       // Scalar iflavor for the current event
       // for GPU it is an int
       // for SIMD it is also an int, since it is constant across the SIMD vector
-      const uint_sv iflavor_sv = F_ACCESS::kernelAccessConst( iflavorVec );
 #ifdef MGONGPUCPP_GPUIMPL
-      const unsigned int iflavor = iflavor_sv;
+      const unsigned int iflavor = F_ACCESS::kernelAccessConst( iflavorVec );
 #else
+      const unsigned int* iflavor_rec = F_ACCESS::ieventAccessRecordConst( iflavorVec, ievt0 );
+      const uint_sv iflavor_sv = F_ACCESS::kernelAccessConst( iflavor_rec );
       const unsigned int iflavor = reinterpret_cast<const unsigned int*>(&iflavor_sv)[0];
 #endif
 """)

From 952ede5bdae2aed43b7b47c178a44e813a8db41b Mon Sep 17 00:00:00 2001
From: Theo Heimel <info@theovention.de>
Date: Thu, 28 May 2026 09:42:36 +0200
Subject: [PATCH 09/10] fix flavor sampling and limitation of open files

---
 madgraph/iolibs/template_files/mg7/madevent.py |  7 ++++++-
 madspace/src/phasespace/integrand.cpp          | 12 +++++++-----
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/madgraph/iolibs/template_files/mg7/madevent.py b/madgraph/iolibs/template_files/mg7/madevent.py
index c2120e892..55bc70bdb 100644
--- a/madgraph/iolibs/template_files/mg7/madevent.py
+++ b/madgraph/iolibs/template_files/mg7/madevent.py
@@ -10,6 +10,7 @@
 from dataclasses import dataclass
 from typing import Literal, NamedTuple
 import tomllib
+import resource
 
 if "LHAPDF_DATA_PATH" in os.environ:
     PDF_PATH = os.environ["LHAPDF_DATA_PATH"]
@@ -1167,7 +1168,7 @@ def build_vegas(self, mapping: ms.PhaseSpaceMapping, prefix: str) -> ms.VegasMap
     def build_discrete(
         self, permutation_count: int, flavor_count: int, prefix: str
     ) -> tuple[ms.DiscreteSampler | None, ms.DiscreteSampler | None]:
-        return None, None
+        #return None, None
         discrete_before = None
         #if permutation_count > 1:
         #    discrete_before = ms.DiscreteSampler(
@@ -1347,6 +1348,10 @@ def main() -> None:
     if args.ask_edit_cards:
         ask_edit_cards()
 
+    # Remove soft limit on number of open files as it can be quite low on some systems
+    soft_lim, hard_lim = resource.getrlimit(resource.RLIMIT_NOFILE)
+    resource.setrlimit(resource.RLIMIT_NOFILE, (hard_lim, hard_lim))
+
     process = MadgraphProcess()
     process.survey()
     process.train_madnis()
diff --git a/madspace/src/phasespace/integrand.cpp b/madspace/src/phasespace/integrand.cpp
index fa5453a41..ef86b23b0 100644
--- a/madspace/src/phasespace/integrand.cpp
+++ b/madspace/src/phasespace/integrand.cpp
@@ -129,14 +129,14 @@ Integrand::Integrand(
                 }
                 if (flav_count > 1 &&
                     !std::holds_alternative<std::monostate>(discrete_after)) {
-                    ret_types.push_back("flavor_index", batch_int);
+                    ret_types.push_back("discrete_flavor_index", batch_int);
                 }
             }
             if (flags & return_discrete_latent) {
                 ret_types.push_back("channel_index_in_group", batch_int);
                 if (flav_count > 1 &&
                     !std::holds_alternative<std::monostate>(discrete_after)) {
-                    ret_types.push_back("flavor_index", batch_int);
+                    ret_types.push_back("discrete_flavor_index", batch_int);
                     if (pdf_grid && energy_scale) {
                         ret_types.push_back("pdf_prior", batch_float_array(flav_count));
                     }
@@ -640,7 +640,8 @@ NamedVector<Value> Integrand::build_common_part(
             !std::holds_alternative<std::monostate>(_discrete_after)) {
             auto zeros = fb.full({static_cast<me_int_t>(0), args.batch_size});
             outputs.push_back(
-                "flavor_index", scatter_or_drop(fb, result, zeros, result.flavor_id())
+                "discrete_flavor_index",
+                scatter_or_drop(fb, result, zeros, result.flavor_id())
             );
         }
     }
@@ -653,7 +654,8 @@ NamedVector<Value> Integrand::build_common_part(
             !std::holds_alternative<std::monostate>(_discrete_after)) {
             auto zeros = fb.full({static_cast<me_int_t>(0), args.batch_size});
             outputs.push_back(
-                "flavor_index", scatter_or_drop(fb, result, zeros, result.flavor_id())
+                "discrete_flavor_index",
+                scatter_or_drop(fb, result, zeros, result.flavor_id())
             );
             if (args.has_pdf_prior) {
                 auto flav_count = _diff_xs.pid_options().size();
@@ -795,7 +797,7 @@ IntegrandProbability::IntegrandProbability(const Integrand& integrand) :
             auto flavor_count = integrand._diff_xs.pid_options().size();
             if (flavor_count > 1 &&
                 !std::holds_alternative<std::monostate>(integrand._discrete_after)) {
-                arg_types.push_back("flavor_index", batch_int);
+                arg_types.push_back("discrete_flavor_index", batch_int);
                 if ((integrand._pdfs.at(0) || integrand._pdfs.at(1)) &&
                     integrand._energy_scale) {
                     arg_types.push_back("pdf_prior", batch_float_array(flavor_count));

From 412df9d75e7e65710f1beec728caf5f00220a9ea Mon Sep 17 00:00:00 2001
From: Theo Heimel <info@theovention.de>
Date: Thu, 28 May 2026 12:35:00 +0200
Subject: [PATCH 10/10] fix async gpu matrix element

---
 .../template_files/madmatrix/color_sum.cc     | 21 ++++++++++++-------
 .../process_function_definitions.inc          |  2 +-
 .../madmatrix/process_sigmaKin_function.inc   |  8 +++----
 .../iolibs/template_files/madmatrix/umami.cc  |  6 ------
 madmatrix/model_handling.py                   |  6 +++---
 5 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/madgraph/iolibs/template_files/madmatrix/color_sum.cc b/madgraph/iolibs/template_files/madmatrix/color_sum.cc
index 75202e963..30c679993 100644
--- a/madgraph/iolibs/template_files/madmatrix/color_sum.cc
+++ b/madgraph/iolibs/template_files/madmatrix/color_sum.cc
@@ -156,10 +156,16 @@ namespace mg5amcCpu
 
 #ifdef MGONGPUCPP_GPUIMPL
   __global__ void
-  color_sum_kernel( fptype* allMEs,         // output: allMEs[nevt], add |M|^2 for one specific helicity
-                    const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
-                    const int nGoodHel )    // input: number of good helicities
+  color_sum_kernel( fptype* allMEs,                 // output: allMEs[nevt], add |M|^2 for one specific helicity
+                    const fptype* allJamps,         // input: jamp[ncolor*2*nevt] for one specific helicity
+                    const int nGoodHel,             // input: number of good helicities
+                    const int nevtIfAllHelicities ) // input: zero in single-helicity mode, number of events in multi-helicity mode
   {
+    if (nevtIfAllHelicities) {
+      int ighel = blockIdx.y;
+      allMEs = allMEs + ighel * nevtIfAllHelicities; // MEs for one specific helicity ighel
+      allJamps = allJamps + ighel * nevtIfAllHelicities; // Jamps for one specific helicity ighel
+    }
     using J_ACCESS = DeviceAccessJamp;
     fptype jampR[ncolor];
     fptype jampI[ncolor];
@@ -377,16 +383,17 @@ namespace mg5amcCpu
                  gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
                  gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
                  const int nGoodHel,               // input: number of good helicities
+                 const int gpublocks,              // input: cuda gpublocks
                  const int gputhreads,             // input: cuda gputhreads
-                 const bool async )                // input: if true, run everything asynchronously in first stream in ghelStreams
+                 const bool processAllHelicities ) // input: if true, use blockIdx.y to index helicities
   {
     const int nevt = gpublocks * gputhreads;
     // CASE 1: KERNEL
     if( !pBlasHandle )
     {
       assert( ghelAllBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
-      if (async) {
-        gpuLaunchKernel2D( calculate_jamps, gpublocks, nGoodHel, gputhreads, ghelStreams[0], ghelAllMEs, ghelAllJamps, nGoodHel, nevt );
+      if (processAllHelicities) {
+        gpuLaunchKernel2D( color_sum_kernel, gpublocks, nGoodHel, gputhreads, ghelStreams[0], ghelAllMEs, ghelAllJamps, nGoodHel, nevt );
       } else {
         // Loop over helicities
         for( int ighel = 0; ighel < nGoodHel; ighel++ )
@@ -404,7 +411,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_HAS_NO_BLAS
       assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
 #else
-      if (async) {
+      if (processAllHelicities) {
         assert( false ); // BLAS in async mode not supported for now
       } else {
         checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
diff --git a/madgraph/iolibs/template_files/madmatrix/process_function_definitions.inc b/madgraph/iolibs/template_files/madmatrix/process_function_definitions.inc
index ee36fb590..815fe5e40 100644
--- a/madgraph/iolibs/template_files/madmatrix/process_function_definitions.inc
+++ b/madgraph/iolibs/template_files/madmatrix/process_function_definitions.inc
@@ -461,7 +461,7 @@ namespace mg5amcCpu
       gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
       // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
       constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
-      gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, iflavorVec, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+      gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, iflavorVec, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads, false );
       gpuLaunchKernel( color_sum_kernel, gpublocks, gputhreads, allMEs, allJamps, nOneHel, 0 );
       gpuMemcpy( hstMEs, allMEs, maxtry * sizeof( fptype ), gpuMemcpyDeviceToHost );
       //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << std::endl;
diff --git a/madgraph/iolibs/template_files/madmatrix/process_sigmaKin_function.inc b/madgraph/iolibs/template_files/madmatrix/process_sigmaKin_function.inc
index eafb0eece..227301a6e 100644
--- a/madgraph/iolibs/template_files/madmatrix/process_sigmaKin_function.inc
+++ b/madgraph/iolibs/template_files/madmatrix/process_sigmaKin_function.inc
@@ -51,8 +51,10 @@
     // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
     // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
     // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
     if (async) {
-      gpuLaunchKernel2D( calculate_jamps, gpublocks, cNGoodHel, gputhreads, ghelStreams[0], allmomenta, allcouplings, ghelAllJamps, allChannelIds, ghelAllNumerators, ghelAllDenominators, colAllJamp2s, nevt, true );
+      gpuLaunchKernel2D( calculate_jamps, gpublocks, cNGoodHel, gputhreads, ghelStreams[0], 0, allmomenta, allcouplings, iflavorVec, ghelAllJamps, storeChannelWeights, ghelAllNumerators, ghelAllDenominators, colAllJamp2s, nevt, true );
+      color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads, true );
     } else {
       for( int ighel = 0; ighel < cNGoodHel; ighel++ )
       {
@@ -60,18 +62,16 @@
         fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
         fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
         gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, iflavorVec, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt, false );
       }
       // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps
-      color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads );
+      color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads, false );
       checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed
       // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
     }
     // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
     gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
 
-    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
     gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, iflavorVec, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
 
     // Event-by-event random choice of color and diagram #402
diff --git a/madgraph/iolibs/template_files/madmatrix/umami.cc b/madgraph/iolibs/template_files/madmatrix/umami.cc
index aa457c7a6..c3c90e983 100644
--- a/madgraph/iolibs/template_files/madmatrix/umami.cc
+++ b/madgraph/iolibs/template_files/madmatrix/umami.cc
@@ -615,12 +615,6 @@ extern "C"
   UmamiStatus umami_free( UmamiHandle handle )
   {
     InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
-#ifdef MGONGPUCPP_GPUIMPL
-    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
-    {
-      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
-    }
-#endif
     delete instance;
     return UMAMI_SUCCESS;
   }
diff --git a/madmatrix/model_handling.py b/madmatrix/model_handling.py
index e1e007fab..00d94df4f 100644
--- a/madmatrix/model_handling.py
+++ b/madmatrix/model_handling.py
@@ -1766,15 +1766,15 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name):
     //if( debug ) printf( \"calculate_jamps: ievt00=%d ihel=%2d\\n\", ievt00, ihel );
 #else
     //const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+    //debug = ( ievt == 0 );
+    //if( debug ) printf( \"calculate_jamps: ievt=%6d ihel=%2d\\n\", ievt, ihel );
     if (processAllHelicities) {
       int ighel = blockIdx.y;
       ihel = dcGoodHel[ighel];
       allJamps = allJamps + ighel * nevt;
-      allNumerators = allNumerators + ighel * nevt;
+      allNumerators = allNumerators + ighel * nevt * processConfig::ndiagrams;
       allDenominators = allDenominators + ighel * nevt;
     }
-    //debug = ( ievt == 0 );
-    //if( debug ) printf( \"calculate_jamps: ievt=%6d ihel=%2d\\n\", ievt, ihel );
 #endif /* clang-format on */""")
             nwavefuncs = self.matrix_elements[0].get_number_of_wavefunctions()
             ret_lines.append("""