From d860feaf388b6a951ca3b6c97708775b6dc9216e Mon Sep 17 00:00:00 2001
From: George Bosilca <gbosilca@nvidia.com>
Date: Wed, 6 Nov 2024 14:21:23 -0800
Subject: [PATCH] Derive the device_task_t from a parsec_object_t

Introduce the parsec_gpu_flow_info_s info structure to combine the flow
information needed by the GPU code.

Allow the standard device tasks (aka. parsec_gpu_dsl_task_t) to contain
the flow_info array inside the task, while allowing other DSL to have
their own type of device task (derived from parsec_gpu_task_t)

Enhance the mechanism to release the device tasks via the release_device_task
function pointer. The device code will call this function to let the DSL
decide how to device task release should be handled. Some DSL (PTG and
DTD as of now) will call OBJ_RELEASE on it (and the free is automatic),
while others (TTG as an example) will have its own handling.

Signed-off-by: George Bosilca <gbosilca@nvidia.com>
---
 parsec/arena.c                                |   2 +-
 parsec/data.c                                 |   6 +-
 parsec/data_dist/matrix/broadcast.jdf         |   2 +-
 parsec/data_internal.h                        |   2 +-
 parsec/interfaces/dtd/insert_function.c       |  12 +-
 parsec/interfaces/ptg/ptg-compiler/jdf2c.c    |  51 +++--
 parsec/mca/device/device_gpu.c                | 177 +++++++++++-------
 parsec/mca/device/device_gpu.h                |  56 ++++--
 parsec/mca/device/transfer_gpu.c              |   9 +-
 .../redistribute/testing_redistribute.c       |  10 +-
 tests/dsl/ptg/choice/choice_data.c            |   2 +-
 tests/runtime/cuda/nvlink_wrapper.c           |   2 +-
 tests/runtime/cuda/stage_custom.jdf           |  17 +-
 13 files changed, 200 insertions(+), 148 deletions(-)

diff --git a/parsec/arena.c b/parsec/arena.c
index ca7def080..321c94971 100644
--- a/parsec/arena.c
+++ b/parsec/arena.c
@@ -223,7 +223,7 @@ int  parsec_arena_allocate_device_private(parsec_data_copy_t *copy,
     assert(0 == (((ptrdiff_t)chunk->data) % arena->alignment));
     assert((arena->elem_size + (ptrdiff_t)chunk->data)  <= (size + (ptrdiff_t)chunk));
 
-    data->nb_elts = count * arena->elem_size;
+    data->span = count * arena->elem_size;
 
     copy->flags = PARSEC_DATA_FLAG_ARENA |
                   PARSEC_DATA_FLAG_PARSEC_OWNED |
diff --git a/parsec/data.c b/parsec/data.c
index a90b87f7d..5fa7decd9 100644
--- a/parsec/data.c
+++ b/parsec/data.c
@@ -70,7 +70,7 @@ static void parsec_data_construct(parsec_data_t* obj )
     obj->owner_device     = -1;
     obj->preferred_device = -1;
     obj->key              = 0;
-    obj->nb_elts          = 0;
+    obj->span          = 0;
     for( uint32_t i = 0; i < parsec_nb_devices;
          obj->device_copies[i] = NULL, i++ );
     obj->dc               = NULL;
@@ -511,7 +511,7 @@ parsec_data_create( parsec_data_t **holder,
         data->owner_device = 0;
         data->key = key;
         data->dc = desc;
-        data->nb_elts = size;
+        data->span = size;
         parsec_data_copy_attach(data, data_copy, 0);
 
         if( !parsec_atomic_cas_ptr(holder, NULL, data) ) {
@@ -548,7 +548,7 @@ parsec_data_create_with_type( parsec_data_collection_t *desc,
     clone->owner_device = 0;
     clone->key = key;
     clone->dc = desc;
-    clone->nb_elts = size;
+    clone->span = size;
     parsec_data_copy_attach(clone, data_copy, 0);
 
     return clone;
diff --git a/parsec/data_dist/matrix/broadcast.jdf b/parsec/data_dist/matrix/broadcast.jdf
index ff9e78074..ff2562480 100644
--- a/parsec/data_dist/matrix/broadcast.jdf
+++ b/parsec/data_dist/matrix/broadcast.jdf
@@ -59,7 +59,7 @@ static parsec_data_t* data_of(parsec_data_collection_t *desc, ...)
             data->owner_device = 0;
             data->key = k;
             data->dc = (parsec_data_collection_t*)desc;
-            data->nb_elts = 1;
+            data->span = 1;
             parsec_data_copy_t* data_copy = (parsec_data_copy_t*)PARSEC_OBJ_NEW(parsec_data_copy_t);
             parsec_data_copy_attach(data, data_copy, 0);
             data_copy->device_private = NULL;
diff --git a/parsec/data_internal.h b/parsec/data_internal.h
index 4c1567668..2010909db 100644
--- a/parsec/data_internal.h
+++ b/parsec/data_internal.h
@@ -36,7 +36,7 @@ struct parsec_data_s {
                                                   * which device this data should be modified RW when there
                                                   * are multiple choices. -1 means no preference. */
     struct parsec_data_collection_s*     dc;
-    size_t                     nb_elts;          /* size in bytes of the memory layout */
+    size_t                     span;          /* size in bytes of the memory layout */
     struct parsec_data_copy_s *device_copies[];  /* this array allocated according to the number of devices
                                                   * (parsec_nb_devices). It points to the most recent
                                                   * version of the data.
diff --git a/parsec/interfaces/dtd/insert_function.c b/parsec/interfaces/dtd/insert_function.c
index e9f1c6f09..828a9bb43 100644
--- a/parsec/interfaces/dtd/insert_function.c
+++ b/parsec/interfaces/dtd/insert_function.c
@@ -2398,20 +2398,18 @@ static parsec_hook_return_t parsec_dtd_gpu_task_submit(parsec_execution_stream_t
 #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) || defined(PARSEC_HAVE_DEV_HIP_SUPPORT) || defined(PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT)
     parsec_dtd_task_t *dtd_task = (parsec_dtd_task_t *)this_task;
     parsec_dtd_task_class_t *dtd_tc = (parsec_dtd_task_class_t*)this_task->task_class;
-    parsec_gpu_task_t *gpu_task = (parsec_gpu_task_t *) calloc(1, sizeof(parsec_gpu_task_t));
-    PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t);
-    gpu_task->release_device_task = free;  /* by default free the device task */
+    parsec_gpu_task_t *gpu_task = (parsec_gpu_task_t*)PARSEC_OBJ_NEW(parsec_gpu_dsl_task_t);
     gpu_task->ec = (parsec_task_t *) this_task;
     gpu_task->submit = dtd_tc->gpu_func_ptr;
-    gpu_task->task_type = 0;
-    gpu_task->last_data_check_epoch = -1;       /* force at least one validation for the task */
+    gpu_task->task_type = PARSEC_GPU_TASK_TYPE_KERNEL;
     gpu_task->pushout = 0;
+    gpu_task->nb_flows = dtd_tc->super.nb_flows;  /* inherit the flows from the task class */
     for(int i = 0; i < dtd_tc->super.nb_flows; i++) {
         parsec_dtd_flow_info_t *flow = FLOW_OF(dtd_task, i);
         if(flow->op_type & PARSEC_PUSHOUT)
             gpu_task->pushout |= 1<<i;
-        gpu_task->flow[i] = dtd_tc->super.in[i];
-        gpu_task->flow_nb_elts[i] = this_task->data[i].data_in->original->nb_elts;
+        gpu_task->flow_info[i].flow = dtd_tc->super.in[i];
+        gpu_task->flow_info[i].flow_span = this_task->data[i].data_in->original->span;
     }
 
     parsec_device_module_t *device = this_task->selected_device;
diff --git a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c
index 4369a78d9..33c76f76d 100644
--- a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c
+++ b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2009-2024 The University of Tennessee and The University
+ * Copyright (c) 2009-2025 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
  * Copyright (c) 2024-2026 NVIDIA Corporation.  All rights reserved.
@@ -6828,43 +6828,33 @@ static void jdf_generate_code_hook_gpu(const jdf_t *jdf,
             "  assert(NULL != dev);\n"
             "  assert(PARSEC_DEV_IS_GPU(dev->type));\n"
             "\n"
-            "  gpu_task = (parsec_gpu_task_t*)calloc(1, sizeof(parsec_gpu_task_t));\n"
-            "  PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t);\n"
-            "  gpu_task->release_device_task = free;  /* by default free the device task */\n"
+            "  gpu_task = (parsec_gpu_task_t*)PARSEC_OBJ_NEW(parsec_gpu_dsl_task_t);"
             "  gpu_task->ec = (parsec_task_t*)this_task;\n"
             "  gpu_task->submit = &%s_kernel_submit_%s_%s;\n"
-            "  gpu_task->task_type = 0;\n"
-            "  gpu_task->last_data_check_epoch = -1;  /* force at least one validation for the task */\n",
+            "  gpu_task->task_type = PARSEC_GPU_TASK_TYPE_KERNEL;\n",
             dev_lower, jdf_basename, f->fname);
 
     /* Set up stage in/out callbacks */
     jdf_find_property(body->properties, "stage_in", &stage_in_property);
-    jdf_find_property(body->properties, "stage_out", &stage_out_property);
-
-    if(stage_in_property == NULL) {
-        coutput("  gpu_task->stage_in  = parsec_default_gpu_stage_in;\n");
-    }else{
-        coutput("  gpu_task->stage_in  = %s;\n", dump_expr((void**)stage_in_property->expr, &info));
-    }
+    coutput("  gpu_task->stage_in  = %s;\n", (NULL == stage_in_property) ? "parsec_default_gpu_stage_in"
+                                                                         : dump_expr((void **)stage_in_property->expr, &info));
 
-    if(stage_out_property == NULL) {
-        coutput("  gpu_task->stage_out = parsec_default_gpu_stage_out;\n");
-    }else{
-        coutput("  gpu_task->stage_out = %s;\n", dump_expr((void**)stage_out_property->expr, &info));
-    }
+    jdf_find_property(body->properties, "stage_out", &stage_out_property);
+    coutput("  gpu_task->stage_out = %s;\n", (NULL == stage_out_property) ? "parsec_default_gpu_stage_out"
+                                                                          : dump_expr((void **)stage_out_property->expr, &info));
 
     /* Dump the dataflow */
     coutput("  gpu_task->pushout = 0;\n");
     for(fl = f->dataflow, di = 0; fl != NULL; fl = fl->next, di++) {
-        coutput("  gpu_task->flow[%d]         = &%s;\n",
+        coutput("  gpu_task->flow_info[%d].flow    = &%s;\n",
                 di, JDF_OBJECT_ONAME( fl ));
 
         sprintf(sa->ptr, "%s.dc", fl->varname);
         jdf_find_property(body->properties, sa->ptr, &desc_property);
-        if(desc_property == NULL){
-            coutput("  gpu_task->flow_dc[%d] = NULL;\n", di);
+        if(desc_property == NULL) {
+            coutput("  gpu_task->flow_info[%d].flow_dc = NULL;\n", di);
         }else{
-            coutput("  gpu_task->flow_dc[%d] = (parsec_data_collection_t *)%s;\n", di,
+            coutput("  gpu_task->flow_info[%d].flow_dc = (parsec_data_collection_t *)%s;\n", di,
                         dump_expr((void**)desc_property->expr, &info));
         }
 
@@ -6872,22 +6862,22 @@ static void jdf_generate_code_hook_gpu(const jdf_t *jdf,
         jdf_find_property(body->properties, sa->ptr, &size_property);
 
         if(fl->flow_flags & JDF_FLOW_TYPE_CTL) {
-            if(size_property != NULL){
+            if(size_property != NULL) {
                 fprintf(stderr, "Error: specifying GPU buffer size for CTL flow %s at line %d\n",
                         fl->varname, JDF_OBJECT_LINENO(fl));
                 exit(-1);
             }
-            coutput("  gpu_task->flow_nb_elts[%d] = 0;\n", di);
-        }else{
+            coutput("  gpu_task->flow_info[%d].flow_span = 0;\n", di);
+        } else {
             coutput("  // A shortcut to check if the flow exists\n");
             coutput("  if (gpu_task->ec->data[%d].data_in != NULL) {\n", di);
             if(size_property == NULL){
-                coutput("  gpu_task->flow_nb_elts[%d] = gpu_task->ec->data[%d].data_in->original->nb_elts;\n", di, di);
-            }else{
-                coutput("  gpu_task->flow_nb_elts[%d] = %s;\n",
-                        di, dump_expr((void**)size_property->expr, &info));
+                coutput("  gpu_task->flow_info[%d].flow_span = gpu_task->ec->data[%d].data_in->original->span;\n", di, di);
+            } else {
+                coutput("  gpu_task->flow_info[%d].flow_span = %s;\n",
+                        di, dump_expr((void **)size_property->expr, &info));
                 if( (stage_in_property == NULL) || ( stage_out_property == NULL )){
-                    coutput("  assert(gpu_task->ec->data[%d].data_in->original->nb_elts <= %s);\n",
+                    coutput("  assert(gpu_task->ec->data[%d].data_in->original->span <= %s);\n",
                             di, dump_expr((void**)size_property->expr, &info));
                 }
 
@@ -6955,6 +6945,7 @@ static void jdf_generate_code_hook_gpu(const jdf_t *jdf,
         }
     }
     string_arena_free(info.sa);
+    coutput("  gpu_task->nb_flows = %d;  /* inherit the flows from the task_class */\n", di);
 
     coutput("\n"
             "  return dev->kernel_scheduler(dev, es, gpu_task);\n"
diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c
index 201735f20..aa93d2c50 100644
--- a/parsec/mca/device/device_gpu.c
+++ b/parsec/mca/device/device_gpu.c
@@ -17,6 +17,7 @@
 #include "parsec/scheduling.h"
 
 #include <limits.h>
+#include <string.h>
 
 #define PARSEC_DEVICE_DATA_COPY_ATOMIC_SENTINEL 1024
 
@@ -44,23 +45,67 @@ int parsec_gpu_verbosity;
 typedef int(*parsec_gpu_step_function_t)(parsec_device_gpu_module_t  *gpu_device,
                                          parsec_gpu_task_t           *gpu_task,
                                          parsec_gpu_exec_stream_t    *gpu_stream);
+/**
+ * This is a special function to release standard device tasks instead of calling
+ * PARSEC_OBJ_RELEASE on them. If we use the PARSEC_OBJ_RELEASE route the memory pointed
+ * by the task will be free. In some cases, we don't want that to happen, but we still
+ * want to inform the DSL that the task has been done with.
+ */
+static void parsec_device_release_gpu_task(parsec_gpu_task_t *gpu_task)
+{
+    PARSEC_OBJ_RELEASE(gpu_task);
+}
+
+static void parsec_device_task_t_constructor(parsec_gpu_task_t *gpu_task)
+{
+    gpu_task->task_type = PARSEC_GPU_TASK_TYPE_INVALID; /* need to be set later */
+    gpu_task->pushout = 0;
+    gpu_task->last_status = 0;
+    gpu_task->submit = NULL;
+    gpu_task->complete_stage = NULL;
+    gpu_task->stage_in = NULL;
+    gpu_task->stage_out = NULL;
+    gpu_task->release_device_task = NULL;
+#if defined(PARSEC_PROF_TRACE)
+    gpu_task->prof_key_end = 0;
+    gpu_task->prof_event_id = 0;
+    gpu_task->prof_tp_id = 0;
+#endif
+    gpu_task->ec = NULL;
+    gpu_task->last_data_check_epoch = UINT64_MAX; /* force at least one validation for the task */
+    gpu_task->nb_flows = 0;
+    gpu_task->flow_info = NULL;
+    /* Default release mechanism, can be replaced by the DSL */
+    gpu_task->release_device_task = parsec_device_release_gpu_task;
+}
+PARSEC_OBJ_CLASS_INSTANCE(parsec_gpu_task_t, parsec_list_item_t,
+                          parsec_device_task_t_constructor, NULL);
+
+static void parsec_device_dsl_task_t_constructor(parsec_gpu_dsl_task_t *gpu_dsl_task)
+{
+    memset(gpu_dsl_task->flows, 0, sizeof(gpu_dsl_task->flows));
+    gpu_dsl_task->super.flow_info = gpu_dsl_task->flows;
+}
+
+PARSEC_OBJ_CLASS_INSTANCE(parsec_gpu_dsl_task_t, parsec_gpu_task_t,
+                          parsec_device_dsl_task_t_constructor, NULL);
 
 static inline int
 parsec_device_check_space_needed(parsec_device_gpu_module_t *gpu_device,
                                  parsec_gpu_task_t *gpu_task)
 {
-    int i;
     int space_needed = 0;
     parsec_task_t *this_task = gpu_task->ec;
     parsec_data_t *original;
     parsec_data_copy_t *data;
     const parsec_flow_t *flow;
 
-    for( i = 0; i < this_task->task_class->nb_flows; i++ ) {
+    /* would have been this_task->task_class->nb_flows for classical DSL */
+    for( uint32_t i = 0; i < gpu_task->nb_flows; i++ ) {
         /* Make sure data_in is not NULL */
         if( NULL == this_task->data[i].data_in ) continue;
 
-        flow = gpu_task->flow[i];
+        flow = gpu_task->flow_info[i].flow;
         if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) continue;
 
         data = this_task->data[i].data_in;
@@ -472,15 +517,14 @@ parsec_device_data_advise(parsec_device_module_t *dev, parsec_data_t *data, int
                                 gpu_device->super.device_index, gpu_device->super.name, __func__, __LINE__);
                 return PARSEC_ERROR;
             }
-            parsec_gpu_task_t* gpu_task = NULL;
-            gpu_task = (parsec_gpu_task_t*)calloc(1, sizeof(parsec_gpu_task_t));
+            parsec_gpu_task_t* gpu_task = (parsec_gpu_task_t*)PARSEC_OBJ_NEW(parsec_gpu_dsl_task_t);
             gpu_task->task_type = PARSEC_GPU_TASK_TYPE_PREFETCH;
-            gpu_task->release_device_task = free;  /* by default free the device task */
             gpu_task->ec = calloc(1, sizeof(parsec_task_t));
             PARSEC_OBJ_CONSTRUCT(gpu_task->ec, parsec_task_t);
             gpu_task->ec->task_class = &parsec_device_data_prefetch_tc;
-            gpu_task->flow[0] = &parsec_device_data_prefetch_flow;
-            gpu_task->flow_nb_elts[0] = data->device_copies[ data->owner_device ]->original->nb_elts;
+            gpu_task->nb_flows = 1;
+            gpu_task->flow_info[0].flow = &parsec_device_data_prefetch_flow;
+            gpu_task->flow_info[0].flow_span = data->device_copies[ data->owner_device ]->original->span;
             gpu_task->stage_in  = parsec_default_gpu_stage_in;
             gpu_task->stage_out = parsec_default_gpu_stage_out;
             PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Retain data copy %p [ref_count %d]",
@@ -492,9 +536,9 @@ parsec_device_data_advise(parsec_device_module_t *dev, parsec_data_t *data, int
             gpu_task->ec->data[0].source_repo_entry = NULL;
             gpu_task->ec->data[0].source_repo = NULL;
             PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
-                                 "GPU[%d:%s]: data copy %p [ref_count %d] linked to prefetch gpu task %p on GPU copy %p [ref_count %d]",
+                                 "GPU[%d:%s]: data copy %p [ref_count %d] linked to prefetch gpu task %p on GPU copy %p",
                                  gpu_device->super.device_index, gpu_device->super.name, gpu_task->ec->data[0].data_in, gpu_task->ec->data[0].data_in->super.super.obj_reference_count,
-                                 gpu_task, gpu_task->ec->data[0].data_out, gpu_task->ec->data[0].data_out->super.super.obj_reference_count);
+                                 gpu_task, gpu_task->ec->data[0].data_out);
             parsec_fifo_push( &(gpu_device->pending), (parsec_list_item_t*)gpu_task );
             return PARSEC_SUCCESS;
         }
@@ -850,8 +894,9 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device,
     parsec_gpu_data_copy_t* temp_loc[MAX_PARAM_COUNT], *gpu_elem, *lru_gpu_elem;
     parsec_data_t* master, *oldmaster;
     const parsec_flow_t *flow;
-    int i, j, data_avail_epoch = 0, copy_readers_update = 0;
+    int data_avail_epoch = 0, copy_readers_update = 0;
     parsec_gpu_data_copy_t *gpu_mem_lru_cycling = NULL;
+    uint32_t i, j;
 
 #if defined(PARSEC_DEBUG_NOISIER)
     char task_name[MAX_TASK_STRLEN];
@@ -864,8 +909,8 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device,
      * Parse all the input and output flows of data and ensure all have
      * corresponding data on the GPU available.
      */
-    for( i = 0; i < this_task->task_class->nb_flows; i++ ) {
-        flow = gpu_task->flow[i];
+    for (i = 0; i < gpu_task->nb_flows /* not this_task->task_class->nb_flows */; i++) {
+        flow = gpu_task->flow_info[i].flow;
         assert( flow && (flow->flow_index == i) );
 
         /* Skip CTL flows only */
@@ -926,12 +971,12 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device,
         PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
                              "GPU[%d:%s]:%s: Allocate GPU copy %p sz %zu [ref_count %d] for data %p",
                              gpu_device->super.device_index, gpu_device->super.name, task_name,
-                             gpu_elem, gpu_task->flow_nb_elts[i], gpu_elem->super.super.obj_reference_count, master);
+                             gpu_elem, gpu_task->flow_info[i].flow_span, gpu_elem->super.super.obj_reference_count, master);
         gpu_elem->flags = PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED;
     malloc_data:
         copy_readers_update = 0;
         assert(0 != (gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) );
-        gpu_elem->device_private = zone_malloc(gpu_device->memory, gpu_task->flow_nb_elts[i]);
+        gpu_elem->device_private = zone_malloc(gpu_device->memory, gpu_task->flow_info[i].flow_span);
         if( NULL == gpu_elem->device_private ) {
 #endif
 
@@ -948,7 +993,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device,
                 PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,
                                      "GPU[%d:%s]:%s:\tRequest space on GPU failed for flow %s index %d/%d for task %s",
                                      gpu_device->super.device_index, gpu_device->super.name, task_name,
-                                     flow->name, i, this_task->task_class->nb_flows, task_name );
+                                     flow->name, i, gpu_task->nb_flows, task_name );
 #endif  /* defined(PARSEC_DEBUG_NOISIER) */
                 for( j = 0; j <= i; j++ ) {
                     /* This flow could be a control flow */
@@ -1136,7 +1181,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device,
             parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling,
                                          parsec_gpu_allocate_memory_key, (int64_t)gpu_elem->device_private,
                                          gpu_device->super.device_index,
-                                         &gpu_task->flow_nb_elts[i], PARSEC_PROFILING_EVENT_COUNTER|PARSEC_PROFILING_EVENT_HAS_INFO);
+                                         &gpu_task->flow_info[i].flow_span, PARSEC_PROFILING_EVENT_COUNTER|PARSEC_PROFILING_EVENT_HAS_INFO);
         }
 #endif
 #else
@@ -1201,9 +1246,9 @@ parsec_default_gpu_stage_in(parsec_gpu_task_t        *gtask,
     size_t count;
     parsec_device_transfer_direction_t dir;
 
-    for(int i = 0; i < task->task_class->nb_flows; i++) {
+    for(uint32_t i = 0; i < gtask->nb_flows  /* not task->task_class->nb_flows */; i++) {
         if( !(flow_mask & (1U << i)) ) continue;
-        source = gtask->sources[i];
+        source = gtask->flow_info[i].source;
         assert(source->device_private != NULL);
         dest = task->data[i].data_out;
         src_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(source->device_index);
@@ -1216,8 +1261,8 @@ parsec_default_gpu_stage_in(parsec_gpu_task_t        *gtask,
             dir = parsec_device_gpu_transfer_direction_h2d;
         }
 
-        count = (source->original->nb_elts <= dest->original->nb_elts) ?
-            source->original->nb_elts : dest->original->nb_elts;
+        count = (source->original->span <= dest->original->span) ?
+            source->original->span : dest->original->span;
         ret = dst_dev->memcpy_async( dst_dev, gpu_stream,
                                      dest->device_private,
                                      source->device_private,
@@ -1250,16 +1295,16 @@ parsec_default_gpu_stage_out(parsec_gpu_task_t        *gtask,
     parsec_task_t *task = gtask->ec;
     size_t count;
     parsec_device_transfer_direction_t dir;
-    int i;
-    for(i = 0; i < task->task_class->nb_flows; i++){
+
+    for(uint32_t i = 0; i < gtask->nb_flows  /* not task->task_class->nb_flows */; i++){
         if(flow_mask & (1U << i)){
             source = task->data[i].data_out;
             dest = source->original->device_copies[0];
             dst_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(dest->device_index);
             src_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(source->device_index);
 
-            count = (source->original->nb_elts <= dest->original->nb_elts) ? source->original->nb_elts :
-                        dest->original->nb_elts;
+            count = (source->original->span <= dest->original->span) ? source->original->span :
+                        dest->original->span;
             if( (src_dev->super.type & PARSEC_DEV_ANY_TYPE) == (dst_dev->super.type & PARSEC_DEV_ANY_TYPE) ) {
                 assert( src_dev->peer_access_mask & (1 << dst_dev->super.device_index) );
                 dir = parsec_device_gpu_transfer_direction_d2d;
@@ -1345,7 +1390,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device,
     parsec_data_copy_t *candidate = task_data->data_in;  /* best candidate for now */
     parsec_data_t* original = candidate->original;
     parsec_gpu_data_copy_t* gpu_elem = task_data->data_out;
-    size_t nb_elts = gpu_task->flow_nb_elts[flow->flow_index];
+    size_t span = gpu_task->flow_info[flow->flow_index].flow_span;
     int transfer_from = -1;
     /* True once a GPU source copy has a readers reference held for this transfer. */
     int source_acquired = 0;
@@ -1358,7 +1403,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device,
 
     parsec_atomic_lock( &original->lock );
 
-    gpu_task->sources[flow->flow_index] = candidate;  /* default source for the transfer */
+    gpu_task->flow_info[flow->flow_index].source = candidate; /* default source for the transfer */
     /**
      * If the data will be accessed in write mode, remove it from any GPU data management
      * lists until the task is completed.
@@ -1389,11 +1434,11 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device,
     if( (PARSEC_FLOW_ACCESS_READ & type) &&
         (gpu_elem->data_transfer_status == PARSEC_DATA_STATUS_UNDER_TRANSFER) ) {
         transfer_from = parsec_data_start_transfer_ownership_to_copy(original, gpu_device->super.device_index, (uint8_t)type);
-        gpu_device->super.required_data_in += original->nb_elts;
+        gpu_device->super.required_data_in += original->span;
         PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
                              "GPU[%d:%s]:\t\tMove data copy %p [ref_count %d, key %x] of %zu bytes: data copy is already under transfer, ignoring double request",
                              gpu_device->super.device_index, gpu_device->super.name,
-                             gpu_elem, gpu_elem->super.super.obj_reference_count, original->key, nb_elts);
+                             gpu_elem, gpu_elem->super.super.obj_reference_count, original->key, span);
         parsec_atomic_unlock( &original->lock );
         return 1;  /* positive returns have special meaning and are used for optimizations */
     }
@@ -1550,7 +1595,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device,
         transfer_from = -1;
 
     /* Update the transferred required_data_in size */
-    gpu_device->super.required_data_in += original->nb_elts;
+    gpu_device->super.required_data_in += original->span;
 
     if( -1 == transfer_from ) {  /* Do not need to be transferred */
         /* No transfer completion callback will run on the source in this path,
@@ -1581,7 +1626,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device,
                          "GPU[%d:%s]:\t\tMove %s data copy %p [ref_count %d, key %x] of %zu bytes\t(src dev: %d, v:%d, ptr:%p, copy:%p [ref_count %d, under_transfer: %d, coherency_state: %d] / dst dev: %d, v:%d, ptr:%p)",
                          gpu_device->super.device_index, gpu_device->super.name,
                          PARSEC_DEV_IS_GPU(candidate_dev->super.type) ? "D2D": "H2D",
-                         gpu_elem, gpu_elem->super.super.obj_reference_count, original->key, nb_elts,
+                         gpu_elem, gpu_elem->super.super.obj_reference_count, original->key, span,
                          candidate_dev->super.device_index, candidate->version, (void*)candidate->device_private,
                          candidate, candidate->super.super.obj_reference_count, candidate->data_transfer_status, candidate->coherency_state,
                          gpu_device->super.device_index, gpu_elem->version, (void*)gpu_elem->device_private);
@@ -1619,7 +1664,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device,
         }
         if(gpu_device->trackable_events & PARSEC_PROFILE_GPU_TRACK_MEM_USE) {
             parsec_device_gpu_memory_prof_info_t _info;
-            _info.size = (uint64_t)nb_elts;
+            _info.size = (uint64_t)span;
             _info.data_key = gpu_elem->original->key;
             _info.dc_id = (uint64_t)(gpu_elem->original->dc);
             parsec_profiling_trace_flags(gpu_stream->profiling,
@@ -1630,7 +1675,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device,
         }
     }
 #endif
-    gpu_task->sources[flow->flow_index] = candidate;  /* save the candidate for release on transfer completion */
+    gpu_task->flow_info[flow->flow_index].source = candidate;  /* save the candidate for release on transfer completion */
     /* Push data into the GPU from the source device */
     int rc = gpu_task->stage_in ? gpu_task->stage_in(gpu_task, (1U << flow->flow_index), gpu_stream): PARSEC_SUCCESS;
     if(PARSEC_SUCCESS != rc) {
@@ -1639,7 +1684,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device,
                         gpu_device->super.device_index, gpu_device->super.name, rc, __func__, __LINE__,
                         candidate->device_private, candidate_dev->super.device_index, candidate_dev->super.name,
                         gpu_elem->device_private, gpu_device->super.device_index, gpu_device->super.name,
-                        nb_elts,
+                        span,
                         (candidate_dev->super.type & gpu_device->super.type & PARSEC_DEV_ANY_TYPE)? "D2D": "H2D");
         if( source_acquired ) {
             int readers = parsec_gpu_data_copy_release_reader(candidate_dev, candidate, 1);
@@ -1650,9 +1695,9 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device,
         return PARSEC_HOOK_RETURN_ERROR;
     }
     assert(candidate_dev->super.device_index < gpu_device->super.data_in_array_size);
-    gpu_device->super.data_in_from_device[candidate_dev->super.device_index] += nb_elts;
+    gpu_device->super.data_in_from_device[candidate_dev->super.device_index] += span;
     if( PARSEC_GPU_TASK_TYPE_KERNEL == gpu_task->task_type )
-        gpu_device->super.nb_data_faults += nb_elts;
+        gpu_device->super.nb_data_faults += span;
 
     /* We assign the version of the data preemptively (i.e. before the task is executing)
      * For read-only data, the GPU copy will get the same version as the source
@@ -1836,15 +1881,14 @@ parsec_device_send_transfercomplete_cmd_to_device(parsec_data_copy_t *copy,
                                                   parsec_device_module_t *current_dev,
                                                   parsec_device_module_t *dst_dev)
 {
-    parsec_gpu_task_t* gpu_task = NULL;
-    gpu_task = (parsec_gpu_task_t*)calloc(1, sizeof(parsec_gpu_task_t));
+    parsec_gpu_task_t *gpu_task = (parsec_gpu_task_t *)PARSEC_OBJ_NEW(parsec_gpu_dsl_task_t);
     gpu_task->task_type = PARSEC_GPU_TASK_TYPE_D2D_COMPLETE;
-    gpu_task->release_device_task = free;  /* by default free the device task */
     gpu_task->ec = calloc(1, sizeof(parsec_task_t));
     PARSEC_OBJ_CONSTRUCT(gpu_task->ec, parsec_task_t);
     gpu_task->ec->task_class = &parsec_device_d2d_complete_tc;
-    gpu_task->flow[0] = &parsec_device_d2d_complete_flow;
-    gpu_task->flow_nb_elts[0] = copy->original->nb_elts;
+    gpu_task->nb_flows = 1;
+    gpu_task->flow_info[0].flow = &parsec_device_d2d_complete_flow;
+    gpu_task->flow_info[0].flow_span = copy->original->span;
     gpu_task->stage_in  = parsec_default_gpu_stage_in;
     gpu_task->stage_out = parsec_default_gpu_stage_out;
     gpu_task->ec->data[0].data_in = copy;  /* We need to set not-null in data_in, so that the fake flow is
@@ -1874,7 +1918,7 @@ parsec_device_callback_complete_push(parsec_device_gpu_module_t   *gpu_device,
 
     parsec_gpu_task_t *gtask = *gpu_task;
     parsec_task_t *task;
-    int32_t i;
+    uint32_t i;
 #if defined(PARSEC_DEBUG_NOISIER)
     char task_str[MAX_TASK_STRLEN];
 #endif
@@ -1891,14 +1935,14 @@ parsec_device_callback_complete_push(parsec_device_gpu_module_t   *gpu_device,
                          "GPU[%d:%s]: parsec_device_callback_complete_push, PUSH of %s",
                          gpu_device->super.device_index, gpu_device->super.name, parsec_task_snprintf(task_str, MAX_TASK_STRLEN, task));
 
-    for( i = 0; i < task->task_class->nb_flows; i++ ) {
+    for (i = 0; i < gtask->nb_flows /* not task->task_class->nb_flows */; i++){
         /* Make sure data_in is not NULL */
         if( NULL == task->data[i].data_in ) continue;
         /* We also don't push back non-parsec-owned copies */
         if(NULL != task->data[i].data_out &&
            0 == (task->data[i].data_out->flags & PARSEC_DATA_FLAG_PARSEC_OWNED)) continue;
 
-        flow = gtask->flow[i];
+        flow = gtask->flow_info[i].flow;
         assert( flow );
         assert( flow->flow_index == i );
         if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) continue;
@@ -1910,7 +1954,7 @@ parsec_device_callback_complete_push(parsec_device_gpu_module_t   *gpu_device,
                                                        gpu_device->super.device_index,
                                                        flow->flow_flags);
 
-            parsec_data_copy_t* source = gtask->sources[i];
+            parsec_data_copy_t* source = gtask->flow_info[i].source;
             parsec_device_gpu_module_t *src_device =
                     (parsec_device_gpu_module_t*)parsec_mca_device_get( source->device_index );
             if (task->data[i].data_in->flags & PARSEC_DATA_FLAG_EVICTED) {
@@ -2253,7 +2297,7 @@ parsec_device_kernel_push( parsec_device_gpu_module_t      *gpu_device,
 {
     parsec_task_t *this_task = gpu_task->ec;
     const parsec_flow_t *flow;
-    int i, ret = 0, how_many = 0;
+    int ret = 0, how_many = 0;
 #if defined(PARSEC_DEBUG_NOISIER)
     char tmp[MAX_TASK_STRLEN];
 #endif
@@ -2297,9 +2341,9 @@ parsec_device_kernel_push( parsec_device_gpu_module_t      *gpu_device,
         return ret;
     }
 
-    for( i = 0; i < this_task->task_class->nb_flows; i++ ) {
+    for( uint32_t i = 0; i < gpu_task->nb_flows  /* not this_task->task_class->nb_flows */; i++ ) {
 
-        flow = gpu_task->flow[i];
+        flow = gpu_task->flow_info[i].flow;
         /* Skip CTL flows */
         if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) continue;
 
@@ -2377,11 +2421,11 @@ parsec_device_kernel_exec( parsec_device_gpu_module_t      *gpu_device,
 
 #if defined(PARSEC_DEBUG_PARANOID)
     const parsec_flow_t *flow;
-    for( int i = 0; i < this_task->task_class->nb_flows; i++ ) {
+    for( uint i = 0; i < gpu_task->nb_flows  /* this_task->task_class->nb_flows */; i++ ) {
         /* Make sure data_in is not NULL */
         if( NULL == this_task->data[i].data_in ) continue;
 
-        flow = gpu_task->flow[i];
+        flow = gpu_task->flow_info[i].flow;
         if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) continue;
         if( 0 == (this_task->data[i].data_out->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue;
         assert(this_task->data[i].data_out->data_transfer_status != PARSEC_DATA_STATUS_UNDER_TRANSFER);
@@ -2421,15 +2465,15 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t   *gpu_device,
     parsec_task_t *this_task = gpu_task->ec;
     parsec_gpu_data_copy_t     *gpu_copy;
     parsec_data_t              *original;
-    size_t                      nb_elts;
+    size_t                      span;
     const parsec_flow_t        *flow;
-    int return_code = 0, rc, how_many = 0, i, update_data_epoch = 0;
+    int return_code = 0, rc, how_many = 0, update_data_epoch = 0;
 #if defined(PARSEC_DEBUG_NOISIER)
     char tmp[MAX_TASK_STRLEN];
 #endif
 
     if (gpu_task->task_type == PARSEC_GPU_TASK_TYPE_D2HTRANSFER) {
-        for( i = 0; i < this_task->locals[0].value; i++ ) {
+        for( int i = 0; i < this_task->locals[0].value; i++ ) {
             gpu_copy = this_task->data[i].data_out;
             /* If the gpu copy is not owned by parsec, we don't manage it at all */
             if( 0 == (gpu_copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue;
@@ -2454,13 +2498,13 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t   *gpu_device,
                         gpu_device->super.device_index, gpu_device->super.name,
                         parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task) );
 
-    for( i = 0; i < this_task->task_class->nb_flows; i++ ) {
+    for( uint32_t i = 0; i < gpu_task->nb_flows  /* not this_task->task_class->nb_flows */; i++ ) {
         /* We need to manage all data that has been used as input, even if they were read only */
 
         /* Make sure data_in is not NULL */
         if( NULL == this_task->data[i].data_in ) continue;
 
-        flow = gpu_task->flow[i];
+        flow = gpu_task->flow_info[i].flow;
         if( PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags) )  continue;  /* control flow */
 
         gpu_copy = this_task->data[i].data_out;
@@ -2469,7 +2513,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t   *gpu_device,
         if( 0 == (gpu_copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue;
 
         original = gpu_copy->original;
-        nb_elts = gpu_task->flow_nb_elts[i];
+        span = gpu_task->flow_info[i].flow_span;
 
         assert( this_task->data[i].data_in == NULL || original == this_task->data[i].data_in->original );
 
@@ -2517,7 +2561,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t   *gpu_device,
                                 gpu_device->super.device_index, gpu_device->super.name, gpu_copy, gpu_copy->super.super.obj_reference_count, flow->name);
 
             /* Stage the transfer of the data back to main memory */
-            gpu_device->super.required_data_out += nb_elts;
+            gpu_device->super.required_data_out += span;
             assert( ((parsec_list_item_t*)gpu_copy)->list_next == (parsec_list_item_t*)gpu_copy );
             assert( ((parsec_list_item_t*)gpu_copy)->list_prev == (parsec_list_item_t*)gpu_copy );
 
@@ -2565,7 +2609,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t   *gpu_device,
                     parsec_atomic_unlock(&original->lock);
                     goto release_and_return_error;
                 }
-                gpu_device->super.data_out_to_host += nb_elts; /* TODO: not hardcoded, use datatype size */
+                gpu_device->super.data_out_to_host += span; /* TODO: not hardcoded, use datatype size */
                 how_many++;
             } else {
                 assert( 0 == gpu_copy->readers );
@@ -2596,7 +2640,6 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device,
     parsec_task_t *this_task = gpu_task->ec;
     parsec_gpu_data_copy_t     *gpu_copy, *cpu_copy;
     parsec_data_t              *original;
-    int i;
 
 #if defined(PARSEC_DEBUG_NOISIER)
     char tmp[MAX_TASK_STRLEN];
@@ -2606,7 +2649,7 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device,
                          parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task) );
 #endif
 
-    for( i = 0; i < this_task->task_class->nb_flows; i++ ) {
+    for( uint32_t i = 0; i < gpu_task->nb_flows  /* not this_task->task_class->nb_flows */; i++ ) {
         /* Make sure data_in is not NULL */
         if( NULL == this_task->data[i].data_in ) continue;
 
@@ -2614,7 +2657,7 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device,
         if(NULL == this_task->data[i].data_out) continue;
 
 
-        if( !(gpu_task->flow[i]->flow_flags & PARSEC_FLOW_ACCESS_WRITE) ) {
+        if( !(gpu_task->flow_info[i].flow->flow_flags & PARSEC_FLOW_ACCESS_WRITE) ) {
             /* Warning data_out for read only flows has been overwritten in pop */
             continue;
         }
@@ -2699,7 +2742,7 @@ parsec_device_kernel_cleanout( parsec_device_gpu_module_t *gpu_device,
     parsec_task_t *this_task = gpu_task->ec;
     parsec_gpu_data_copy_t     *gpu_copy, *cpu_copy;
     parsec_data_t              *original;
-    int i, data_avail_epoch = 0;
+    int data_avail_epoch = 0;
 
 #if defined(PARSEC_DEBUG_NOISIER)
     char tmp[MAX_TASK_STRLEN];
@@ -2709,13 +2752,13 @@ parsec_device_kernel_cleanout( parsec_device_gpu_module_t *gpu_device,
                          parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task) );
 #endif
 
-    for( i = 0; i < this_task->task_class->nb_flows; i++ ) {
+    for( uint32_t i = 0; i < gpu_task->nb_flows  /* not this_task->task_class->nb_flows */; i++ ) {
         /* Make sure data_in is not NULL */
         if( NULL == this_task->data[i].data_in ) continue;
 
         /* Don't bother if there is no real data (aka. CTL or no output) */
         if(NULL == this_task->data[i].data_out) continue;
-        if( !(gpu_task->flow[i]->flow_flags & PARSEC_FLOW_ACCESS_WRITE) ) {
+        if( !(gpu_task->flow_info[i].flow->flow_flags & PARSEC_FLOW_ACCESS_WRITE) ) {
             /* Warning data_out for read only flows has been overwritten in pop */
             continue;
         }
@@ -2963,9 +3006,9 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module,
     PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%d:%s]: gpu_task %p freed",
                          gpu_device->super.device_index, gpu_device->super.name,
                          gpu_task);
-    if (NULL != gpu_task->release_device_task) {
-        gpu_task->release_device_task(gpu_task);
-    }
+    /* Release the GPU task */
+    gpu_task->release_device_task(gpu_task);
+
     rc = parsec_atomic_fetch_dec_int32( &(gpu_device->mutex) );
     if( 1 == rc ) {  /* I was the last one */
 #if defined(PARSEC_PROF_TRACE)
diff --git a/parsec/mca/device/device_gpu.h b/parsec/mca/device/device_gpu.h
index 653095421..2dca174d2 100644
--- a/parsec/mca/device/device_gpu.h
+++ b/parsec/mca/device/device_gpu.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024 The University of Tennessee and The University
+ * Copyright (c) 2021-2025 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
  * Copyright (c) 2024-2026 NVIDIA Corporation.  All rights reserved.
@@ -94,7 +94,25 @@ typedef int (parsec_stage_out_function_t)(parsec_gpu_task_t        *gtask,
  * and this function allows the device engine to delegate the release of such tasks back into
  * the DSL. Once this task called, the device task should not be accessed by the device.
  */
-typedef void (*parsec_release_device_task_function_t)(void*);
+typedef void (*parsec_release_device_task_function_t)(parsec_gpu_task_t*);
+typedef struct parsec_gpu_flow_info_s {
+    const parsec_flow_t      *flow;         /* Some DSL might not have a task_class, but they still need to provide a flow. */
+    size_t                    flow_span;    /* the span of the data on the device. For contiguous layout this is equal to the
+                                             * size of the data, for all the other copies this should be the amount of memory
+                                             * needed on the device.
+                                             */
+    parsec_data_collection_t *flow_dc; /* the data collection from which the data originates. When the data copy is local, the data
+                                        * collection can be accessed via the data_t, but for all copies coming from the network there
+                                        * is no known data collection. Thus, for such cases the DSL need to provide a reference to the
+                                        * local data collection to be used for all transfers. This data collection is needed to get
+                                        * access to the mtype, to get the memory layout of the copy.
+                                        */
+    /* The is private to the device code and should not be used outside the device driver */
+    parsec_data_copy_t       *source; /* If the driver decides to acquire the data from a different
+                                        * source, it will temporary store the best candidate here.
+                                        */
+
+} parsec_gpu_flow_info_t;
 
 struct parsec_gpu_task_s {
     parsec_list_item_t                     list_item;
@@ -115,23 +133,8 @@ struct parsec_gpu_task_s {
         struct {
             parsec_task_t                 *ec;
             uint64_t                       last_data_check_epoch;
-            const parsec_flow_t           *flow[MAX_PARAM_COUNT];  /* There is no consistent way to access the flows from the task_class,
-                                                                    * so the DSL need to provide these flows here.
-                                                                    */
-            size_t                         flow_nb_elts[MAX_PARAM_COUNT]; /* for each flow, size of the data to be allocated
-                                                                           * on the GPU.
-                                                                           */
-            parsec_data_collection_t      *flow_dc[MAX_PARAM_COUNT];     /* for each flow, data collection from which the data
-                                                                          * to be transferred logically belongs to.
-                                                                          * This gives the user the chance to indicate on the JDF
-                                                                          * a data collection to inspect during GPU transfer.
-                                                                          * User may want info from the DC (e.g. mtype),
-                                                                          * & otherwise remote copies don't have any info.
-                                                                          */
-            /* These are private and should not be used outside the device driver */
-            parsec_data_copy_t            *sources[MAX_PARAM_COUNT];  /* If the driver decides to acquire the data from a different
-                                                                       * source, it will temporary store the best candidate here.
-                                                                       */
+            uint32_t                       nb_flows;
+            parsec_gpu_flow_info_t        *flow_info;
         };
         struct {
             parsec_data_copy_t            *copy;
@@ -139,6 +142,20 @@ struct parsec_gpu_task_s {
     };
 };
 
+PARSEC_DECLSPEC PARSEC_OBJ_CLASS_DECLARATION(parsec_gpu_task_t);
+
+/**
+ * Specialized GPU tasks for the PTG and DTD DSL. The maximum number of flows being MAX_PARAM_COUNT we
+ * can make the gpu_flow_info array part of the struct, to allocate the gpu_task as a single,
+ * contiguous block of memory.
+ */
+typedef struct parsec_gpu_dsl_task_s {
+    parsec_gpu_task_t                      super;
+    parsec_gpu_flow_info_t                 flows[MAX_PARAM_COUNT];  /* All the flow info necessary for the PTG and DTD DSL */
+} parsec_gpu_dsl_task_t;
+
+PARSEC_DECLSPEC PARSEC_OBJ_CLASS_DECLARATION(parsec_gpu_dsl_task_t);
+
 typedef enum parsec_device_transfer_direction_e {
     parsec_device_gpu_transfer_direction_h2d,
     parsec_device_gpu_transfer_direction_d2h,
@@ -354,6 +371,7 @@ char *parsec_device_describe_gpu_task( char *tmp, size_t len, parsec_gpu_task_t
 #define PARSEC_GPU_TASK_TYPE_PREFETCH     0x2000
 #define PARSEC_GPU_TASK_TYPE_WARMUP       0x4000
 #define PARSEC_GPU_TASK_TYPE_D2D_COMPLETE 0x8000
+#define PARSEC_GPU_TASK_TYPE_INVALID      0xf000
 
 #if defined(PARSEC_PROF_TRACE)
 #define PARSEC_PROFILE_GPU_TRACK_DATA_IN  0x0001
diff --git a/parsec/mca/device/transfer_gpu.c b/parsec/mca/device/transfer_gpu.c
index 50b0d886e..8d001e228 100644
--- a/parsec/mca/device/transfer_gpu.c
+++ b/parsec/mca/device/transfer_gpu.c
@@ -276,12 +276,11 @@ parsec_gpu_create_w2r_task(parsec_device_gpu_module_t *gpu_device,
     d2h_task->taskpool        = NULL;
     d2h_task->locals[0].value = nb_cleaned;
 
-    w2r_task = (parsec_gpu_task_t *)malloc(sizeof(parsec_gpu_task_t));
-    PARSEC_OBJ_CONSTRUCT(w2r_task, parsec_list_item_t);
-    w2r_task->release_device_task   = free;  /* by default free the device task */
+    w2r_task = (parsec_gpu_task_t *)PARSEC_OBJ_NEW(parsec_gpu_dsl_task_t);
     w2r_task->ec                    = (parsec_task_t*)d2h_task;
     w2r_task->task_type             = PARSEC_GPU_TASK_TYPE_D2HTRANSFER;
     w2r_task->last_data_check_epoch = gpu_device->data_avail_epoch - 1;
+    w2r_task->nb_flows              = nb_cleaned;
     w2r_task->stage_in              = NULL;
     w2r_task->stage_out             = &parsec_default_gpu_stage_out;
     w2r_task->complete_stage        = NULL;
@@ -309,7 +308,7 @@ int parsec_gpu_complete_w2r_task(parsec_device_gpu_module_t *gpu_device,
         parsec_atomic_lock(&gpu_copy->original->lock);
         gpu_copy->readers--;
         gpu_copy->data_transfer_status = PARSEC_DATA_STATUS_COMPLETE_TRANSFER;
-        gpu_device->super.data_out_to_host += gpu_copy->original->nb_elts; /* TODO: not hardcoded, use datatype size */
+        gpu_device->super.data_out_to_host += gpu_copy->original->span; /* TODO: not hardcoded, use datatype size */
         assert(gpu_copy->readers >= 0);
 
         original = gpu_copy->original;
@@ -343,7 +342,7 @@ int parsec_gpu_complete_w2r_task(parsec_device_gpu_module_t *gpu_device,
         parsec_atomic_unlock(&gpu_copy->original->lock);
     }
     parsec_thread_mempool_free(es->context_mempool, task);
-    free(gpu_task);
+    PARSEC_OBJ_RELEASE(gpu_task); /* no need to call release_device_task, just release the task */
     gpu_device->data_avail_epoch++;
     return 0;
 }
diff --git a/tests/collections/redistribute/testing_redistribute.c b/tests/collections/redistribute/testing_redistribute.c
index 21e2396c4..92462757e 100644
--- a/tests/collections/redistribute/testing_redistribute.c
+++ b/tests/collections/redistribute/testing_redistribute.c
@@ -295,6 +295,7 @@ testing_redistribute_make_device_resident(testing_redistribute_matrix_t *matrix)
             parsec_data_t *data;
             parsec_data_copy_t *cpu_copy;
             parsec_data_copy_t *gpu_copy;
+            size_t data_size;
             uint32_t owner = dc->rank_of(dc, m, n);
 
             if( owner != dc->myrank ) {
@@ -302,6 +303,7 @@ testing_redistribute_make_device_resident(testing_redistribute_matrix_t *matrix)
             }
 
             data = dc->data_of(dc, m, n);
+            data_size = data->span;
             cpu_copy = parsec_data_get_copy(data, 0);
             gpu_copy = parsec_data_get_copy(data, device_index);
 
@@ -309,11 +311,11 @@ testing_redistribute_make_device_resident(testing_redistribute_matrix_t *matrix)
                 gpu_copy = PARSEC_OBJ_NEW(parsec_data_copy_t);
                 gpu_copy->dtt = dc->default_dtt;
                 gpu_copy->release_cb = testing_redistribute_cuda_release;
-                status = cudaMalloc(&gpu_copy->device_private, data->nb_elts);
+                status = cudaMalloc(&gpu_copy->device_private, data_size);
                 if( cudaSuccess != status ) {
                     PARSEC_OBJ_RELEASE(gpu_copy);
                     fprintf(stderr, "ERROR: cudaMalloc(%zu) failed: %s\n",
-                            data->nb_elts, cudaGetErrorString(status));
+                            data_size, cudaGetErrorString(status));
                     return PARSEC_ERROR;
                 }
                 parsec_data_copy_attach(data, gpu_copy, device_index);
@@ -321,11 +323,11 @@ testing_redistribute_make_device_resident(testing_redistribute_matrix_t *matrix)
 
             status = cudaMemcpy(gpu_copy->device_private,
                                 cpu_copy->device_private,
-                                data->nb_elts,
+                                data_size,
                                 cudaMemcpyHostToDevice);
             if( cudaSuccess != status ) {
                 fprintf(stderr, "ERROR: cudaMemcpy(host to device, %zu) failed: %s\n",
-                        data->nb_elts, cudaGetErrorString(status));
+                        data_size, cudaGetErrorString(status));
                 return PARSEC_ERROR;
             }
             parsec_data_transfer_ownership_to_copy(data, device_index, PARSEC_FLOW_ACCESS_RW);
diff --git a/tests/dsl/ptg/choice/choice_data.c b/tests/dsl/ptg/choice/choice_data.c
index a6c7dcd77..70281f3ae 100644
--- a/tests/dsl/ptg/choice/choice_data.c
+++ b/tests/dsl/ptg/choice/choice_data.c
@@ -50,7 +50,7 @@ get_or_create_data(my_datatype_t* dat, uint32_t pos)
 
         data->owner_device = 0;
         data->key = pos;
-        data->nb_elts = 1;
+        data->span = 1;
         data->device_copies[0] = data_copy;
 
         if( !parsec_atomic_cas_ptr(&dat->data_map[pos], NULL, data) ) {
diff --git a/tests/runtime/cuda/nvlink_wrapper.c b/tests/runtime/cuda/nvlink_wrapper.c
index 2d97b7749..b0b2f7f0b 100644
--- a/tests/runtime/cuda/nvlink_wrapper.c
+++ b/tests/runtime/cuda/nvlink_wrapper.c
@@ -190,7 +190,7 @@ parsec_taskpool_t* testing_nvlink_New( parsec_context_t *ctx, int depth, int mb
             /* And copy the tile from CPU to GPU */
             status = (cudaError_t)cudaMemcpy( gpu_copy->device_private,
                                               cpu_copy->device_private,
-                                              dta->nb_elts,
+                                              dta->span,
                                               cudaMemcpyHostToDevice );
             PARSEC_CUDA_CHECK_ERROR( "(nvlink_wrapper) cudaMemcpy", status, {return NULL;} );
             g++;
diff --git a/tests/runtime/cuda/stage_custom.jdf b/tests/runtime/cuda/stage_custom.jdf
index f4d02ad9b..71b8fe2ee 100644
--- a/tests/runtime/cuda/stage_custom.jdf
+++ b/tests/runtime/cuda/stage_custom.jdf
@@ -37,14 +37,15 @@ stage_stride_in(parsec_gpu_task_t *gtask,
     parsec_device_gpu_module_t *in_elem_dev;
     parsec_tiled_matrix_t * dc;
     int elem_sz;
-    int i;
-    for(i = 0; i < task->task_class->nb_flows; i++){
+
+    assert(gtask->nb_flows == task->task_class->nb_flows);
+    for(uint32_t i = 0; i < gtask->nb_flows; i++){
         if(flow_mask & (1U << i)){
-            copy_in = gtask->sources[i];  /* task->data[i].data_in can be the CPU version,
-                                             but  parsec_device_data_stage_in already selected the best
-                                             source for this input, so use it instead. */
+            copy_in = gtask->flow_info[i].source;  /* task->data[i].data_in can be the CPU version,
+                                                      but parsec_device_data_stage_in already selected the best
+                                                      source for this input, so use it instead. */
             copy_out = task->data[i].data_out;
-            dc = (parsec_tiled_matrix_t*)gtask->flow_dc[i];
+            dc = (parsec_tiled_matrix_t*)gtask->flow_info[i].flow_dc;
             elem_sz = parsec_datadist_getsizeoftype(dc->mtype);
             in_elem_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get( copy_in->device_index);
             if( !(in_elem_dev->super.type & PARSEC_DEV_CUDA) ) {
@@ -64,7 +65,7 @@ stage_stride_in(parsec_gpu_task_t *gtask,
             } else {
                 ret = (cudaError_t)cudaMemcpyAsync( copy_out->device_private,
                                                     copy_in->device_private,
-                                                    gtask->flow_nb_elts[i],
+                                                    gtask->flow_info[i].flow_span,
                                                     cudaMemcpyDeviceToDevice,
                                                     cuda_stream->cuda_stream );
                 PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync", ret, { return PARSEC_ERROR; } );
@@ -91,7 +92,7 @@ stage_stride_out(parsec_gpu_task_t *gtask,
         if(flow_mask & (1U << i)){
             copy_in = task->data[i].data_out;
             copy_out = copy_in->original->device_copies[0];
-            dc = (parsec_tiled_matrix_t*)gtask->flow_dc[i];
+            dc = (parsec_tiled_matrix_t*)gtask->flow_info[i].flow_dc;
             elem_sz = parsec_datadist_getsizeoftype(dc->mtype);
             /* copy width bytes height times, skipping pitch - width bytes every time */
             size_t dpitch = dc->llm * elem_sz;