diff --git a/parsec/mca/device/cuda/device_cuda_component.c b/parsec/mca/device/cuda/device_cuda_component.c index b2cb9ec4b..6e109f3dc 100644 --- a/parsec/mca/device/cuda/device_cuda_component.c +++ b/parsec/mca/device/cuda/device_cuda_component.c @@ -41,6 +41,7 @@ char* parsec_cuda_lib_path = NULL; static int cuda_mask, cuda_nvlink_mask; +int parsec_cuda_delegate_task_completion = 0; /* * Instantiate the public struct with all of our public information @@ -201,6 +202,10 @@ static int device_cuda_component_register(void) false, false, 0, &parsec_device_gpu_one_profiling_stream_per_gpu_stream); #endif + (void)parsec_mca_param_reg_int_name("device_cuda", "delegate_task_completion", + "Integer to choose the whether task completion should be delegated to a co-manager thread (default is no)", + false, false, 0, &parsec_cuda_delegate_task_completion); + /* If CUDA was not requested avoid initializing the devices */ return (0 == parsec_device_cuda_enabled ? MCA_ERROR : MCA_SUCCESS); } diff --git a/parsec/mca/device/cuda/device_cuda_module.c b/parsec/mca/device/cuda/device_cuda_module.c index f269ba9a8..6c6ba4649 100644 --- a/parsec/mca/device/cuda/device_cuda_module.c +++ b/parsec/mca/device/cuda/device_cuda_module.c @@ -45,6 +45,11 @@ parsec_cuda_memory_reserve( parsec_device_cuda_module_t* gpu_device, static int parsec_cuda_memory_release( parsec_device_cuda_module_t* gpu_device ); static int parsec_cuda_flush_lru( parsec_device_module_t *device ); +/** MCA parameter that decides task delegation */ +extern int parsec_cuda_delegate_task_completion; +parsec_hook_return_t +parsec_cuda_co_manager( parsec_execution_stream_t *es, parsec_device_gpu_module_t* gpu_device ); + /* look up how many FMA per cycle in single/double, per cuda MP * precision. 
* The following table provides updated values for future archs @@ -365,6 +370,9 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module ) len = asprintf(&gpu_device->super.name, "%s: cuda(%d)", szName, dev_id); if(-1 == len) { gpu_device->super.name = NULL; goto release_device; } gpu_device->data_avail_epoch = 0; + gpu_device->mutex = 0; + gpu_device->complete_mutex = 0; + gpu_device->co_manager_mutex = 0; gpu_device->max_exec_streams = parsec_cuda_max_streams; gpu_device->exec_stream = @@ -471,6 +479,7 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module ) PARSEC_OBJ_CONSTRUCT(&gpu_device->gpu_mem_lru, parsec_list_t); PARSEC_OBJ_CONSTRUCT(&gpu_device->gpu_mem_owned_lru, parsec_list_t); PARSEC_OBJ_CONSTRUCT(&gpu_device->pending, parsec_fifo_t); + PARSEC_OBJ_CONSTRUCT(&gpu_device->to_complete, parsec_fifo_t); gpu_device->sort_starting_p = NULL; gpu_device->peer_access_mask = 0; /* No GPU to GPU direct transfer by default */ @@ -563,6 +572,7 @@ parsec_cuda_module_fini(parsec_device_module_t* device) /* Release pending queue */ PARSEC_OBJ_DESTRUCT(&gpu_device->pending); + PARSEC_OBJ_DESTRUCT(&gpu_device->to_complete); /* Release all streams */ for( j = 0; j < gpu_device->num_exec_streams; j++ ) { @@ -2541,8 +2551,9 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es, parsec_device_gpu_module_t* gpu_device; parsec_device_cuda_module_t *cuda_device; cudaError_t status; - int rc, exec_stream = 0; + int rc, rc1, exec_stream = 0; parsec_gpu_task_t *progress_task, *out_task_submit = NULL, *out_task_pop = NULL; + int manager_completing_task = 0; #if defined(PARSEC_DEBUG_NOISIER) char tmp[MAX_TASK_STRLEN]; #endif @@ -2571,7 +2582,7 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es, * not in the queue yet. 
*/ while(1) { - rc = gpu_device->mutex; + rc = rc1 = gpu_device->mutex; struct timespec delay; if( rc >= 0 ) { if( parsec_atomic_cas_int32( &gpu_device->mutex, rc, rc+1 ) ) { @@ -2585,6 +2596,24 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es, } if( 0 < rc ) { parsec_fifo_push( &(gpu_device->pending), (parsec_list_item_t*)gpu_task ); + + if( 1 == parsec_cuda_delegate_task_completion ) + { + /** + * @brief + * The second thread that push the task to device transitions to + * a co-manager. + * + * 'rc1 == 1' is important or the manager thread will transition + * to co-manager. 'co_manager_mutex == 0' will ensure that there + * is only one co-manager per device. + */ + if( rc1 == 1 && gpu_device->co_manager_mutex == 0 ) + { + parsec_cuda_co_manager(es, gpu_device); + } + } + return PARSEC_HOOK_RETURN_ASYNC; } PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,"GPU[%s]: Entering GPU management at %s:%d", @@ -2652,6 +2681,7 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es, __parsec_reschedule(es, progress_task->ec); gpu_task = progress_task; progress_task = NULL; + manager_completing_task = 1; goto remove_gpu_task; } gpu_task = NULL; @@ -2728,17 +2758,44 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es, if (gpu_task->task_type == PARSEC_GPU_TASK_TYPE_D2D_COMPLETE) { free( gpu_task->ec ); gpu_task->ec = NULL; + manager_completing_task = 1; goto remove_gpu_task; } parsec_cuda_kernel_epilog( gpu_device, gpu_task ); - __parsec_complete_execution( es, gpu_task->ec ); gpu_device->super.executed_tasks++; + + /** The manager will complete the tasks */ + if( parsec_cuda_delegate_task_completion == 0 ) + { + __parsec_complete_execution( es, gpu_task->ec ); + manager_completing_task = 1; + } + /** The co-manager will complete the task. 
But first check if such a manager is active */ + else if ( gpu_device->co_manager_mutex > 0 ) + { + parsec_atomic_fetch_inc_int32( &(gpu_device->complete_mutex) ); + parsec_fifo_push( &(gpu_device->to_complete), (parsec_list_item_t*)gpu_task ); + manager_completing_task = 0; + } + /** If the co-manager is not yet ready */ + else + { + __parsec_complete_execution( es, gpu_task->ec ); + manager_completing_task = 1; + } + remove_gpu_task: // Load problem: was parsec_device_load[gpu_device->super.device_index] -= gpu_task->load; parsec_device_load[gpu_device->super.device_index] -= parsec_device_sweight[gpu_device->super.device_index]; PARSEC_DEBUG_VERBOSE(3, parsec_gpu_output_stream,"GPU[%s]: gpu_task %p freed at %s:%d", gpu_device->super.name, gpu_task, __FILE__, __LINE__); - free( gpu_task ); + + /* free the task here only if the manager is completing the task*/ + if(manager_completing_task == 1) + { + free( gpu_task ); + } + rc = parsec_atomic_fetch_dec_int32( &(gpu_device->mutex) ); if( 1 == rc ) { /* I was the last one */ #if defined(PARSEC_PROF_TRACE) @@ -2762,4 +2819,68 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es, return PARSEC_HOOK_RETURN_DISABLE; }
+
+/**
+ * Co-manager loop: complete, on behalf of the device manager, the tasks the
+ * manager queued in gpu_device->to_complete.
+ *
+ * At most one thread per device runs this loop at a time: the caller is
+ * elected only if it moves co_manager_mutex from 0 to 1, otherwise it returns
+ * at once. The elected thread polls (without sleeping) until no manager is
+ * active (mutex == 0) and no delegated task is left (complete_mutex == 0).
+ *
+ * NOTE(review): a manager that reads co_manager_mutex > 0 just before this
+ * loop exits can still push a task into to_complete; that task stays queued
+ * until another co-manager starts. Confirm the scheduler side covers this.
+ *
+ * @param[in] es          execution stream passed to __parsec_complete_execution
+ * @param[in] gpu_device  device whose to_complete queue is drained
+ * @return PARSEC_HOOK_RETURN_ASYNC, whether or not the caller was elected
+ */
+parsec_hook_return_t
+parsec_cuda_co_manager( parsec_execution_stream_t *es, parsec_device_gpu_module_t* gpu_device )
+{
+    parsec_gpu_task_t *gpu_task = NULL;
+    parsec_list_t *gpu_tasks_to_free = NULL;
+
+    /* Elect the co-manager with a single 0 -> 1 CAS. Testing the counter and
+     * then CAS-ing on a freshly re-read value would let a late thread win
+     * with a 1 -> 2 transition and run as a second co-manager. */
+    if( !parsec_atomic_cas_int32( &gpu_device->co_manager_mutex, 0, 1 ) )
+    {
+        return PARSEC_HOOK_RETURN_ASYNC;
+    }
+
+    gpu_tasks_to_free = PARSEC_OBJ_NEW(parsec_list_t);
+
+    /* Keep polling while a manager is active or delegated tasks remain. */
+    while( gpu_device->mutex > 0 || gpu_device->complete_mutex > 0 )
+    {
+        /* complete_mutex is incremented before the push, so the pop may
+         * transiently find the queue empty: just poll again. */
+        gpu_task = ( gpu_device->complete_mutex > 0 )
+                 ? (parsec_gpu_task_t*)parsec_fifo_pop( &(gpu_device->to_complete) )
+                 : NULL;
+        if( NULL == gpu_task )
+        {
+            continue;
+        }
+        __parsec_complete_execution( es, gpu_task->ec );
+        parsec_atomic_fetch_dec_int32( &(gpu_device->complete_mutex) );
+        parsec_list_push_back(gpu_tasks_to_free, (parsec_list_item_t*)gpu_task);
+    }
+
+    (void)parsec_atomic_fetch_dec_int32( &(gpu_device->co_manager_mutex) );
+
+    /* The delegated gpu tasks are freed only once the loop is over; freeing
+     * them earlier may interfere with some operations in the manager. */
+    while( NULL != (gpu_task = (parsec_gpu_task_t*)parsec_list_pop_front(gpu_tasks_to_free)) )
+    {
+        free(gpu_task);
+    }
+    PARSEC_OBJ_RELEASE(gpu_tasks_to_free);
+
+    return PARSEC_HOOK_RETURN_ASYNC;
+}
+
 #endif /* PARSEC_HAVE_CUDA */ diff --git a/parsec/mca/device/device_gpu.h b/parsec/mca/device/device_gpu.h index 129bd2b03..4307f730f 100644 --- a/parsec/mca/device/device_gpu.h +++ b/parsec/mca/device/device_gpu.h @@ -121,6 +121,9 @@ struct parsec_device_gpu_module_s { * the index of the set bit device. */ volatile int32_t mutex; + volatile int32_t complete_mutex; /** tracks the number of tasks to be completed by the + * co-manager + */ uint64_t data_avail_epoch; /**< Identifies the epoch of the data status on the device. It * is increased every time a new data is made available, so * that we know which tasks can be evaluated for submission.
@@ -128,11 +131,13 @@ struct parsec_device_gpu_module_s { parsec_list_t gpu_mem_lru; /* Read-only blocks, and fresh blocks */ parsec_list_t gpu_mem_owned_lru; /* Dirty blocks */ parsec_fifo_t pending; + parsec_fifo_t to_complete; /** list of tasks to be completed by the co-manager*/ struct zone_malloc_s *memory; parsec_list_item_t *sort_starting_p; parsec_gpu_exec_stream_t **exec_stream; size_t mem_block_size; int64_t mem_nb_blocks; + volatile int32_t co_manager_mutex; /** ensures that there is only one co-manager per device */ }; struct parsec_gpu_exec_stream_s {