From 927d1918e61fed60de039487a3733e5fe388d1dc Mon Sep 17 00:00:00 2001
From: Joseph John <joseph.john@anu.edu.au>
Date: Sat, 1 Apr 2023 20:52:10 -0400
Subject: [PATCH 1/2] Updated parsec_device_gpu_module_s to incorporate
 co-manager.

complete_mutex - tracks the number of tasks to be completed by the co-manager
to_complete - list of tasks to be completed by the co-manager
co_manager_mutex - ensures that there is only one co-manager per device
---
 parsec/mca/device/cuda/device_cuda_component.c | 5 +++++
 parsec/mca/device/cuda/device_cuda_module.c    | 8 ++++++++
 parsec/mca/device/device_gpu.h                 | 5 +++++
 3 files changed, 18 insertions(+)

diff --git a/parsec/mca/device/cuda/device_cuda_component.c b/parsec/mca/device/cuda/device_cuda_component.c
index b2cb9ec4b..6e109f3dc 100644
--- a/parsec/mca/device/cuda/device_cuda_component.c
+++ b/parsec/mca/device/cuda/device_cuda_component.c
@@ -41,6 +41,7 @@ char* parsec_cuda_lib_path = NULL;
 
 static int cuda_mask, cuda_nvlink_mask;
 
+int parsec_cuda_delegate_task_completion = 0;
 
 /*
  * Instantiate the public struct with all of our public information
@@ -201,6 +202,10 @@ static int device_cuda_component_register(void)
                                         false, false, 0, &parsec_device_gpu_one_profiling_stream_per_gpu_stream);
 #endif
 
+    (void)parsec_mca_param_reg_int_name("device_cuda", "delegate_task_completion",
+                                        "Integer to choose the whether task completion should be delegated to a co-manager thread (default is no)",
+                                        false, false, 0, &parsec_cuda_delegate_task_completion);
+
     /* If CUDA was not requested avoid initializing the devices */
     return (0 == parsec_device_cuda_enabled ? MCA_ERROR : MCA_SUCCESS);
 }
diff --git a/parsec/mca/device/cuda/device_cuda_module.c b/parsec/mca/device/cuda/device_cuda_module.c
index f269ba9a8..2a685b3d7 100644
--- a/parsec/mca/device/cuda/device_cuda_module.c
+++ b/parsec/mca/device/cuda/device_cuda_module.c
@@ -45,6 +45,9 @@ parsec_cuda_memory_reserve( parsec_device_cuda_module_t* gpu_device,
 static int parsec_cuda_memory_release( parsec_device_cuda_module_t* gpu_device );
 static int parsec_cuda_flush_lru( parsec_device_module_t *device );
 
+/** MCA parameter that decides task delegation */
+extern int parsec_cuda_delegate_task_completion;
+
 /* look up how many FMA per cycle in single/double, per cuda MP
  * precision.
  * The following table provides updated values for future archs
@@ -365,6 +368,9 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module )
     len = asprintf(&gpu_device->super.name, "%s: cuda(%d)", szName, dev_id);
     if(-1 == len) { gpu_device->super.name = NULL; goto release_device; }
     gpu_device->data_avail_epoch = 0;
+    gpu_device->mutex = 0;
+    gpu_device->complete_mutex = 0;
+    gpu_device->co_manager_mutex = 0;
 
     gpu_device->max_exec_streams = parsec_cuda_max_streams;
     gpu_device->exec_stream =
@@ -471,6 +477,7 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module )
     PARSEC_OBJ_CONSTRUCT(&gpu_device->gpu_mem_lru,       parsec_list_t);
     PARSEC_OBJ_CONSTRUCT(&gpu_device->gpu_mem_owned_lru, parsec_list_t);
     PARSEC_OBJ_CONSTRUCT(&gpu_device->pending,           parsec_fifo_t);
+    PARSEC_OBJ_CONSTRUCT(&gpu_device->to_complete,       parsec_fifo_t);
 
     gpu_device->sort_starting_p = NULL;
     gpu_device->peer_access_mask = 0;  /* No GPU to GPU direct transfer by default */
@@ -563,6 +570,7 @@ parsec_cuda_module_fini(parsec_device_module_t* device)
 
     /* Release pending queue */
     PARSEC_OBJ_DESTRUCT(&gpu_device->pending);
+    PARSEC_OBJ_DESTRUCT(&gpu_device->to_complete);
 
     /* Release all streams */
     for( j = 0; j < gpu_device->num_exec_streams; j++ ) {
diff --git a/parsec/mca/device/device_gpu.h b/parsec/mca/device/device_gpu.h
index 129bd2b03..4307f730f 100644
--- a/parsec/mca/device/device_gpu.h
+++ b/parsec/mca/device/device_gpu.h
@@ -121,6 +121,9 @@ struct parsec_device_gpu_module_s {
                                                    *   the index of the set bit device.
                                                    */
     volatile int32_t           mutex;
+    volatile int32_t           complete_mutex;     /** tracks the number of tasks to be completed by the 
+                                                    * co-manager
+                                                    */ 
     uint64_t                   data_avail_epoch;  /**< Identifies the epoch of the data status on the device. It
                                                    *   is increased every time a new data is made available, so
                                                    *   that we know which tasks can be evaluated for submission.
@@ -128,11 +131,13 @@ struct parsec_device_gpu_module_s {
     parsec_list_t              gpu_mem_lru;   /* Read-only blocks, and fresh blocks */
     parsec_list_t              gpu_mem_owned_lru;  /* Dirty blocks */
     parsec_fifo_t              pending;
+    parsec_fifo_t              to_complete;         /** list of tasks to be completed by the co-manager*/
     struct zone_malloc_s      *memory;
     parsec_list_item_t        *sort_starting_p;
     parsec_gpu_exec_stream_t **exec_stream;
     size_t                     mem_block_size;
     int64_t                    mem_nb_blocks;
+    volatile int32_t           co_manager_mutex;  /** ensures that there is only one co-manager per device */
 };
 
 struct parsec_gpu_exec_stream_s {

From 4067cfa728660999c38e50e4b7fb5a7d95a57431 Mon Sep 17 00:00:00 2001
From: Joseph John <joseph.john@anu.edu.au>
Date: Sat, 1 Apr 2023 22:04:13 -0400
Subject: [PATCH 2/2] co-manager implemented.

The second thread that submits a task to the GPU device transitions to a co-manager.
While no co-manager has been set, the manager completes the task itself.
A task is freed by whichever thread completes it: the manager or the
co-manager.
---
 parsec/mca/device/cuda/device_cuda_module.c | 121 +++++++++++++++++++-
 1 file changed, 117 insertions(+), 4 deletions(-)

diff --git a/parsec/mca/device/cuda/device_cuda_module.c b/parsec/mca/device/cuda/device_cuda_module.c
index 2a685b3d7..6c6ba4649 100644
--- a/parsec/mca/device/cuda/device_cuda_module.c
+++ b/parsec/mca/device/cuda/device_cuda_module.c
@@ -47,6 +47,8 @@ static int parsec_cuda_flush_lru( parsec_device_module_t *device );
 
 /** MCA parameter that decides task delegation */
 extern int parsec_cuda_delegate_task_completion;
+parsec_hook_return_t
+parsec_cuda_co_manager( parsec_execution_stream_t *es, parsec_device_gpu_module_t* gpu_device );
 
 /* look up how many FMA per cycle in single/double, per cuda MP
  * precision.
@@ -2549,8 +2551,9 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es,
     parsec_device_gpu_module_t* gpu_device;
     parsec_device_cuda_module_t *cuda_device;
     cudaError_t status;
-    int rc, exec_stream = 0;
+    int rc, rc1, exec_stream = 0;
     parsec_gpu_task_t *progress_task, *out_task_submit = NULL, *out_task_pop = NULL;
+    int manager_completing_task  = 0;
 #if defined(PARSEC_DEBUG_NOISIER)
     char tmp[MAX_TASK_STRLEN];
 #endif
@@ -2579,7 +2582,7 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es,
      *             not in the queue yet.
      */
     while(1) {
-        rc = gpu_device->mutex;
+        rc = rc1 = gpu_device->mutex;
         struct timespec delay;
         if( rc >= 0 ) {
             if( parsec_atomic_cas_int32( &gpu_device->mutex, rc, rc+1 ) ) {
@@ -2593,6 +2596,24 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es,
     }
     if( 0 < rc ) {
         parsec_fifo_push( &(gpu_device->pending), (parsec_list_item_t*)gpu_task );
+
+        if( 1 == parsec_cuda_delegate_task_completion ) 
+        {
+            /**
+             * @brief
+             * The second thread that pushes a task to the device transitions
+             * to a co-manager.
+             *
+             * 'rc1 == 1' is required; without it the manager thread itself
+             * would transition to co-manager. 'co_manager_mutex == 0' is meant
+             * to keep a single co-manager per device.
+             */
+            if( rc1 == 1 && gpu_device->co_manager_mutex == 0 )
+            {
+                parsec_cuda_co_manager(es, gpu_device);
+            }
+        }
+
         return PARSEC_HOOK_RETURN_ASYNC;
     }
     PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,"GPU[%s]: Entering GPU management at %s:%d",
@@ -2660,6 +2681,7 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es,
                 __parsec_reschedule(es, progress_task->ec);
                 gpu_task = progress_task;
                 progress_task = NULL;
+                manager_completing_task = 1;
                 goto remove_gpu_task;
             }
             gpu_task = NULL;
@@ -2736,17 +2758,44 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es,
     if (gpu_task->task_type == PARSEC_GPU_TASK_TYPE_D2D_COMPLETE) {
         free( gpu_task->ec );
         gpu_task->ec = NULL;
+        manager_completing_task = 1;
         goto remove_gpu_task;
     }
     parsec_cuda_kernel_epilog( gpu_device, gpu_task );
-    __parsec_complete_execution( es, gpu_task->ec );
     gpu_device->super.executed_tasks++;
+
+    /** The manager will complete the tasks */
+    if( parsec_cuda_delegate_task_completion == 0 )
+    {
+        __parsec_complete_execution( es, gpu_task->ec );
+        manager_completing_task = 1;
+    }
+    /** The co-manager will complete the task. But first check if such a manager is active */
+    else if ( gpu_device->co_manager_mutex > 0 ) 
+    {
+        parsec_atomic_fetch_inc_int32( &(gpu_device->complete_mutex) );
+        parsec_fifo_push( &(gpu_device->to_complete), (parsec_list_item_t*)gpu_task );
+        manager_completing_task = 0;
+    }
+    /** If the co-manager is not yet ready */
+    else
+    {
+        __parsec_complete_execution( es, gpu_task->ec );
+        manager_completing_task = 1;
+    }
+
  remove_gpu_task:
     // Load problem: was parsec_device_load[gpu_device->super.device_index] -= gpu_task->load;
     parsec_device_load[gpu_device->super.device_index] -= parsec_device_sweight[gpu_device->super.device_index];
     PARSEC_DEBUG_VERBOSE(3, parsec_gpu_output_stream,"GPU[%s]: gpu_task %p freed at %s:%d", gpu_device->super.name, 
                          gpu_task, __FILE__, __LINE__);
-    free( gpu_task );
+
+    /* free the task here only if the manager is completing the task*/
+    if(manager_completing_task == 1)
+    {
+        free( gpu_task );
+    }
+
     rc = parsec_atomic_fetch_dec_int32( &(gpu_device->mutex) );
     if( 1 == rc ) {  /* I was the last one */
 #if defined(PARSEC_PROF_TRACE)
@@ -2770,4 +2819,68 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es,
     return PARSEC_HOOK_RETURN_DISABLE;
 }
 
+
+
+/**
+ * Co-manager loop for one GPU device: completes the tasks that the manager
+ * queued on gpu_device->to_complete.
+ *
+ * @param es          execution stream passed to __parsec_complete_execution
+ * @param gpu_device  device whose to_complete queue is drained
+ * @return PARSEC_HOOK_RETURN_ASYNC in all cases, including when another
+ *         thread already holds the co-manager role.
+ *
+ * NOTE(review): the loop spins without yielding, and a task pushed after the
+ * loop exits but before co_manager_mutex is released looks like it would stay
+ * queued until a later co-manager runs -- confirm against the manager side.
+ */
+parsec_hook_return_t
+parsec_cuda_co_manager( parsec_execution_stream_t *es, parsec_device_gpu_module_t* gpu_device )
+{
+    parsec_gpu_task_t *gpu_task = NULL;
+    parsec_list_t *gpu_tasks_to_free = NULL;
+
+    /* Claim the role with one 0 -> 1 CAS. Re-reading the counter and swapping
+     * rc -> rc+1 would let a racing thread move it 1 -> 2 and also become a
+     * co-manager, breaking the one-co-manager-per-device guarantee. */
+    if( gpu_device->co_manager_mutex > 0 ||
+        !parsec_atomic_cas_int32( &gpu_device->co_manager_mutex, 0, 1 ) )
+    {
+        return PARSEC_HOOK_RETURN_ASYNC;
+    }
+
+    gpu_tasks_to_free = PARSEC_OBJ_NEW(parsec_list_t);
+
+    /* Stay active while a manager owns the device (mutex > 0) or delegated
+     * completions are still outstanding (complete_mutex > 0). */
+    while( gpu_device->mutex > 0 || gpu_device->complete_mutex > 0 )
+    {
+        if( gpu_device->complete_mutex <= 0 )
+        {
+            continue;
+        }
+
+        /* The counter is incremented before the push on the manager side, so
+         * the pop may legitimately return NULL; just retry. */
+        gpu_task = (parsec_gpu_task_t*)parsec_fifo_pop( &(gpu_device->to_complete) );
+        if( NULL != gpu_task )
+        {
+            __parsec_complete_execution( es, gpu_task->ec );
+            parsec_atomic_fetch_dec_int32( &(gpu_device->complete_mutex) );
+            parsec_list_push_back(gpu_tasks_to_free, (parsec_list_item_t*)gpu_task);
+        }
+    }
+
+    parsec_atomic_fetch_dec_int32( &(gpu_device->co_manager_mutex) );
+
+    /* Free the delegated tasks only at the end; freeing them earlier may interfere with operations in the manager. */
+    while( NULL != (gpu_task = (parsec_gpu_task_t*)parsec_list_pop_front(gpu_tasks_to_free)) )
+    {
+        free(gpu_task);
+    }
+    PARSEC_OBJ_RELEASE(gpu_tasks_to_free);
+
+    return PARSEC_HOOK_RETURN_ASYNC;
+}
+
 #endif /* PARSEC_HAVE_CUDA */