diff --git a/parsec/mca/device/cuda/device_cuda_component.c b/parsec/mca/device/cuda/device_cuda_component.c
index b2cb9ec4b..6e109f3dc 100644
--- a/parsec/mca/device/cuda/device_cuda_component.c
+++ b/parsec/mca/device/cuda/device_cuda_component.c
@@ -41,6 +41,7 @@ char* parsec_cuda_lib_path = NULL;
 
 static int cuda_mask, cuda_nvlink_mask;
 
+int parsec_cuda_delegate_task_completion = 0; /* 0: the manager completes tasks itself; 1: completion may be handed to a co-manager thread */
 
 /*
  * Instantiate the public struct with all of our public information
@@ -201,6 +202,10 @@ static int device_cuda_component_register(void)
                                         false, false, 0, &parsec_device_gpu_one_profiling_stream_per_gpu_stream);
 #endif
 
+    (void)parsec_mca_param_reg_int_name("device_cuda", "delegate_task_completion",
+                                        "Integer to choose the whether task completion should be delegated to a co-manager thread (default is no)",
+                                        false, false, 0, &parsec_cuda_delegate_task_completion);
+
     /* If CUDA was not requested avoid initializing the devices */
     return (0 == parsec_device_cuda_enabled ? MCA_ERROR : MCA_SUCCESS);
 }
diff --git a/parsec/mca/device/cuda/device_cuda_module.c b/parsec/mca/device/cuda/device_cuda_module.c
index f269ba9a8..6c6ba4649 100644
--- a/parsec/mca/device/cuda/device_cuda_module.c
+++ b/parsec/mca/device/cuda/device_cuda_module.c
@@ -45,6 +45,11 @@ parsec_cuda_memory_reserve( parsec_device_cuda_module_t* gpu_device,
 static int parsec_cuda_memory_release( parsec_device_cuda_module_t* gpu_device );
 static int parsec_cuda_flush_lru( parsec_device_module_t *device );
 
+/** Task-completion delegation switch, registered as MCA parameter ("device_cuda", "delegate_task_completion"); co-manager entry point below */
+extern int parsec_cuda_delegate_task_completion;
+parsec_hook_return_t
+parsec_cuda_co_manager( parsec_execution_stream_t *es, parsec_device_gpu_module_t* gpu_device );
+
 /* look up how many FMA per cycle in single/double, per cuda MP
  * precision.
  * The following table provides updated values for future archs
@@ -365,6 +370,9 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module )
     len = asprintf(&gpu_device->super.name, "%s: cuda(%d)", szName, dev_id);
     if(-1 == len) { gpu_device->super.name = NULL; goto release_device; }
     gpu_device->data_avail_epoch = 0;
+    gpu_device->mutex = 0;
+    gpu_device->complete_mutex = 0;
+    gpu_device->co_manager_mutex = 0;
 
     gpu_device->max_exec_streams = parsec_cuda_max_streams;
     gpu_device->exec_stream =
@@ -471,6 +479,7 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module )
     PARSEC_OBJ_CONSTRUCT(&gpu_device->gpu_mem_lru,       parsec_list_t);
     PARSEC_OBJ_CONSTRUCT(&gpu_device->gpu_mem_owned_lru, parsec_list_t);
     PARSEC_OBJ_CONSTRUCT(&gpu_device->pending,           parsec_fifo_t);
+    PARSEC_OBJ_CONSTRUCT(&gpu_device->to_complete,       parsec_fifo_t);
 
     gpu_device->sort_starting_p = NULL;
     gpu_device->peer_access_mask = 0;  /* No GPU to GPU direct transfer by default */
@@ -563,6 +572,7 @@ parsec_cuda_module_fini(parsec_device_module_t* device)
 
     /* Release pending queue */
     PARSEC_OBJ_DESTRUCT(&gpu_device->pending);
+    PARSEC_OBJ_DESTRUCT(&gpu_device->to_complete);
 
     /* Release all streams */
     for( j = 0; j < gpu_device->num_exec_streams; j++ ) {
@@ -2541,8 +2551,9 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es,
     parsec_device_gpu_module_t* gpu_device;
     parsec_device_cuda_module_t *cuda_device;
     cudaError_t status;
-    int rc, exec_stream = 0;
+    int rc, rc1, exec_stream = 0;
     parsec_gpu_task_t *progress_task, *out_task_submit = NULL, *out_task_pop = NULL;
+    int manager_completing_task  = 0;
 #if defined(PARSEC_DEBUG_NOISIER)
     char tmp[MAX_TASK_STRLEN];
 #endif
@@ -2571,7 +2582,7 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es,
      *             not in the queue yet.
      */
     while(1) {
-        rc = gpu_device->mutex;
+        rc = rc1 = gpu_device->mutex;
         struct timespec delay;
         if( rc >= 0 ) {
             if( parsec_atomic_cas_int32( &gpu_device->mutex, rc, rc+1 ) ) {
@@ -2585,6 +2596,24 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es,
     }
     if( 0 < rc ) {
         parsec_fifo_push( &(gpu_device->pending), (parsec_list_item_t*)gpu_task );
+
+        if( 1 == parsec_cuda_delegate_task_completion ) 
+        {
+            /**
+             * @brief
+             * The second thread that pushes a task to the device transitions
+             * to co-manager.
+             *
+             * 'rc1 == 1' is important; otherwise the manager thread would
+             * transition to co-manager. 'co_manager_mutex == 0' ensures that
+             * there is only one co-manager per device.
+             */
+            if( rc1 == 1 && gpu_device->co_manager_mutex == 0 )
+            {
+                parsec_cuda_co_manager(es, gpu_device);
+            }
+        }
+
         return PARSEC_HOOK_RETURN_ASYNC;
     }
     PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,"GPU[%s]: Entering GPU management at %s:%d",
@@ -2652,6 +2681,7 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es,
                 __parsec_reschedule(es, progress_task->ec);
                 gpu_task = progress_task;
                 progress_task = NULL;
+                manager_completing_task = 1;
                 goto remove_gpu_task;
             }
             gpu_task = NULL;
@@ -2728,17 +2758,44 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es,
     if (gpu_task->task_type == PARSEC_GPU_TASK_TYPE_D2D_COMPLETE) {
         free( gpu_task->ec );
         gpu_task->ec = NULL;
+        manager_completing_task = 1;
         goto remove_gpu_task;
     }
     parsec_cuda_kernel_epilog( gpu_device, gpu_task );
-    __parsec_complete_execution( es, gpu_task->ec );
     gpu_device->super.executed_tasks++;
+
+    /** The manager will complete the tasks */
+    if( parsec_cuda_delegate_task_completion == 0 )
+    {
+        __parsec_complete_execution( es, gpu_task->ec );
+        manager_completing_task = 1;
+    }
+    /** The co-manager will complete the task. But first check if such a manager is active */
+    else if ( gpu_device->co_manager_mutex > 0 ) 
+    {
+        parsec_atomic_fetch_inc_int32( &(gpu_device->complete_mutex) );
+        parsec_fifo_push( &(gpu_device->to_complete), (parsec_list_item_t*)gpu_task );
+        manager_completing_task = 0;
+    }
+    /** If the co-manager is not yet ready */
+    else
+    {
+        __parsec_complete_execution( es, gpu_task->ec );
+        manager_completing_task = 1;
+    }
+
  remove_gpu_task:
     // Load problem: was parsec_device_load[gpu_device->super.device_index] -= gpu_task->load;
     parsec_device_load[gpu_device->super.device_index] -= parsec_device_sweight[gpu_device->super.device_index];
     PARSEC_DEBUG_VERBOSE(3, parsec_gpu_output_stream,"GPU[%s]: gpu_task %p freed at %s:%d", gpu_device->super.name, 
                          gpu_task, __FILE__, __LINE__);
-    free( gpu_task );
+
+    /* free the task here only if the manager is completing the task*/
+    if(manager_completing_task == 1)
+    {
+        free( gpu_task );
+    }
+
     rc = parsec_atomic_fetch_dec_int32( &(gpu_device->mutex) );
     if( 1 == rc ) {  /* I was the last one */
 #if defined(PARSEC_PROF_TRACE)
@@ -2762,4 +2819,68 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es,
     return PARSEC_HOOK_RETURN_DISABLE;
 }
 
+
+
+/**
+ * Co-manager loop: complete, on behalf of the manager thread, every task the
+ * manager queued in gpu_device->to_complete, by calling
+ * __parsec_complete_execution() on the task's ec.
+ *
+ * Only one thread per device may hold the role; it is claimed by moving
+ * co_manager_mutex from 0 to 1 and dropped when the loop ends.
+ *
+ * @param[in] es         execution stream passed to __parsec_complete_execution()
+ * @param[in] gpu_device device whose to_complete queue is drained
+ * @return PARSEC_HOOK_RETURN_ASYNC in all cases, including when another thread
+ *         already holds the co-manager role and nothing was done.
+ */
+parsec_hook_return_t
+parsec_cuda_co_manager( parsec_execution_stream_t *es, parsec_device_gpu_module_t* gpu_device )
+{
+    parsec_gpu_task_t *gpu_task = NULL;
+    parsec_list_t *gpu_tasks_to_free = NULL;
+
+    /* Claim the role with one atomic 0 -> 1 transition. Reading the counter and
+     * then doing CAS(rc, rc+1) could observe rc == 1 if another thread claimed the
+     * role in between, and would then admit a second co-manager (1 -> 2). */
+    if( !parsec_atomic_cas_int32( &gpu_device->co_manager_mutex, 0, 1 ) )
+    {
+        return PARSEC_HOOK_RETURN_ASYNC;
+    }
+
+    gpu_tasks_to_free = PARSEC_OBJ_NEW(parsec_list_t);
+
+    /* Spin while the device counter (mutex) is still positive or delegated
+     * tasks remain to be completed (complete_mutex > 0). */
+    while( gpu_device->mutex > 0 || gpu_device->complete_mutex > 0 )
+    {
+        if( gpu_device->complete_mutex <= 0 )
+        {
+            continue;
+        }
+
+        /* The manager increments complete_mutex before pushing, so the pop may
+         * still return NULL here; simply retry on the next iteration. */
+        gpu_task = (parsec_gpu_task_t*)parsec_fifo_pop( &(gpu_device->to_complete) );
+        if( NULL == gpu_task )
+        {
+            continue;
+        }
+        __parsec_complete_execution( es, gpu_task->ec );
+        parsec_atomic_fetch_dec_int32( &(gpu_device->complete_mutex) );
+        parsec_list_push_back(gpu_tasks_to_free, (parsec_list_item_t*)gpu_task);
+    }
+
+    (void)parsec_atomic_fetch_dec_int32( &(gpu_device->co_manager_mutex) );
+
+    /* The delegated gpu_tasks are freed only now, after the loop, so that the
+     * free cannot interfere with operations still running in the manager. */
+    while(NULL != (gpu_task = (parsec_gpu_task_t*)parsec_list_pop_front(gpu_tasks_to_free)) )
+    {
+        free(gpu_task);
+    }
+    PARSEC_OBJ_RELEASE(gpu_tasks_to_free);
+    return PARSEC_HOOK_RETURN_ASYNC;
+}
+
 #endif /* PARSEC_HAVE_CUDA */
diff --git a/parsec/mca/device/device_gpu.h b/parsec/mca/device/device_gpu.h
index 129bd2b03..4307f730f 100644
--- a/parsec/mca/device/device_gpu.h
+++ b/parsec/mca/device/device_gpu.h
@@ -121,6 +121,9 @@ struct parsec_device_gpu_module_s {
                                                    *   the index of the set bit device.
                                                    */
     volatile int32_t           mutex;
+    volatile int32_t           complete_mutex;     /** tracks the number of tasks to be completed by the
+                                                    * co-manager
+                                                    */
     uint64_t                   data_avail_epoch;  /**< Identifies the epoch of the data status on the device. It
                                                    *   is increased every time a new data is made available, so
                                                    *   that we know which tasks can be evaluated for submission.
@@ -128,11 +131,13 @@ struct parsec_device_gpu_module_s {
     parsec_list_t              gpu_mem_lru;   /* Read-only blocks, and fresh blocks */
     parsec_list_t              gpu_mem_owned_lru;  /* Dirty blocks */
     parsec_fifo_t              pending;
+    parsec_fifo_t              to_complete;         /** list of tasks to be completed by the co-manager*/
     struct zone_malloc_s      *memory;
     parsec_list_item_t        *sort_starting_p;
     parsec_gpu_exec_stream_t **exec_stream;
     size_t                     mem_block_size;
     int64_t                    mem_nb_blocks;
+    volatile int32_t           co_manager_mutex;  /** ensures that there is only one co-manager per device */
 };
 
 struct parsec_gpu_exec_stream_s {