From a3203c99d26492a1cc95b2fbbd456b1e1bf986c3 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 29 Oct 2024 19:04:26 -0400 Subject: [PATCH] Offload device task release to worker threads Add a LIFO for activities that are high-priority to the context. These activities are picked up by worker threads. With GPU execution, worker threads are mostly idle so they can spare cycles handling the release of successor tasks, including potential communication. A similar mechanism could apply to incoming communication to relieve the communication thread and offload task release upon completion of a remote dep receive. Signed-off-by: Joseph Schuchart --- parsec/include/parsec/execution_stream.h | 2 ++ parsec/mca/device/device_gpu.c | 8 +++++++- parsec/parsec.c | 9 ++++++++- parsec/scheduling.c | 12 ++++++++++++ parsec/scheduling.h | 3 +++ 5 files changed, 32 insertions(+), 2 deletions(-) diff --git a/parsec/include/parsec/execution_stream.h b/parsec/include/parsec/execution_stream.h index 279f65d81..b0652ad67 100644 --- a/parsec/include/parsec/execution_stream.h +++ b/parsec/include/parsec/execution_stream.h @@ -141,6 +141,8 @@ struct parsec_context_s { parsec_hash_table_t dtd_arena_datatypes_hash_table; /**< Hash table that stores the arena datatypes used by DTD */ int dtd_arena_datatypes_next_id; /**< Next ID to use for the next Arena Datatype by DTD */ + parsec_lifo_t activities; /**< list of tasks with outstanding activities, high-priority */ + #if defined(PARSEC_SIM) int largest_simulation_date; #endif diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index 30e34789f..99ba0369c 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -2623,14 +2623,20 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module, gpu_task->ec = NULL; goto remove_gpu_task; } + parsec_device_kernel_epilog( gpu_device, gpu_task ); - __parsec_complete_execution( es, gpu_task->ec ); + __parsec_schedule_activity( es, 
gpu_task->ec ); gpu_device->super.executed_tasks++; + remove_gpu_task: PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%d:%s]: gpu_task %p freed", gpu_device->super.device_index, gpu_device->super.name, gpu_task); + + // TODO: this should only be done for internal gpu tasks + // and the DSL should be responsible for freeing the memory it allocated free( gpu_task ); + rc = parsec_atomic_fetch_dec_int32( &(gpu_device->mutex) ); if( 1 == rc ) { /* I was the last one */ #if defined(PARSEC_PROF_TRACE) diff --git a/parsec/parsec.c b/parsec/parsec.c index 095fcad27..d1db9bf9a 100644 --- a/parsec/parsec.c +++ b/parsec/parsec.c @@ -618,6 +618,8 @@ parsec_context_t* parsec_init( int nb_cores, int* pargc, char** pargv[] ) context->comm_th_core = -1; #endif /* defined(PARSEC_HAVE_HWLOC) */ + PARSEC_OBJ_CONSTRUCT(&context->activities, parsec_lifo_t); + /* TODO: nb_cores should depend on the vp_id */ nb_total_comp_threads = 0; for(p = 0; p < nb_vp; p++) { @@ -1225,7 +1227,7 @@ int parsec_fini( parsec_context_t** pcontext ) #endif /* PARSEC_PROF_TRACE */ /* PAPI SDE needs to process the shutdown before resources exposed to it are freed. 
- * This includes scheduling resources, so SDE needs to be finalized before the + * This includes scheduling resources, so SDE needs to be finalized before the * computation threads leave */ PARSEC_PAPI_SDE_FINI(); @@ -1265,6 +1267,11 @@ int parsec_fini( parsec_context_t** pcontext ) parsec_hwloc_fini(); #endif /* PARSEC_HAVE_HWLOC_BITMAP */ + if (!parsec_lifo_is_empty(&context->activities)) { + parsec_warning("/!\\ Warning: not all activities were executed before shutdown!\n"); + } + PARSEC_OBJ_DESTRUCT(&context->activities); + if (parsec_app_name != NULL ) { free(parsec_app_name); parsec_app_name = NULL; diff --git a/parsec/scheduling.c b/parsec/scheduling.c index 55657cc36..aac4b43db 100644 --- a/parsec/scheduling.c +++ b/parsec/scheduling.c @@ -418,6 +418,11 @@ int __parsec_reschedule(parsec_execution_stream_t* es, parsec_task_t* task) return __parsec_schedule(es, task, 0); } +int __parsec_schedule_activity(parsec_execution_stream_t *es, parsec_task_t *task) { + parsec_lifo_push(&es->virtual_process->parsec_context->activities, &task->super); + return PARSEC_SUCCESS; +} + int __parsec_complete_execution( parsec_execution_stream_t *es, parsec_task_t *task ) { @@ -532,6 +537,13 @@ __parsec_get_next_task( parsec_execution_stream_t *es, { parsec_task_t* task; + if (!parsec_lifo_is_empty(&es->virtual_process->parsec_context->activities)) { + task = (parsec_task_t*)parsec_lifo_pop(&es->virtual_process->parsec_context->activities); + if (NULL != task) { + return task; + } + } + if( NULL == (task = es->next_task) ) { task = parsec_current_scheduler->module.select(es, distance); } else { diff --git a/parsec/scheduling.h b/parsec/scheduling.h index fff857d0c..7d83ef262 100644 --- a/parsec/scheduling.h +++ b/parsec/scheduling.h @@ -68,6 +68,9 @@ int __parsec_schedule_vp( parsec_execution_stream_t*, parsec_task_t**, int32_t distance); +int __parsec_schedule_activity( parsec_execution_stream_t *es, + parsec_task_t *task); + /** * @brief Reschedule a task on the most 
appropriate resource. *