diff --git a/parsec/interfaces/dtd/insert_function.c b/parsec/interfaces/dtd/insert_function.c index af84dba73..abe5d73df 100644 --- a/parsec/interfaces/dtd/insert_function.c +++ b/parsec/interfaces/dtd/insert_function.c @@ -2371,6 +2371,7 @@ int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp, } incarnations[i].type = device_type; + incarnations[i].flags = PARSEC_CHORE_FLAG_NONE; if(PARSEC_DEV_CUDA == device_type) { incarnations[i].hook = parsec_dtd_gpu_task_submit; dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)function; diff --git a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c index 37f7e5f10..e1b3dbf65 100644 --- a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c +++ b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c @@ -3954,6 +3954,7 @@ jdf_generate_function_incarnation_list( const jdf_t *jdf, } string_arena_add_string(sa, "#if defined(PARSEC_HAVE_DEV_%s_SUPPORT)\n", dev_upper); string_arena_add_string(sa, " { .type = PARSEC_DEV_%s,\n", dev_upper); + string_arena_add_string(sa, " .flags = PARSEC_CHORE_FLAG_NONE,\n"); if( NULL == dyld_property ) { string_arena_add_string(sa, " .dyld = NULL,\n"); } else { @@ -3986,7 +3987,7 @@ jdf_generate_function_incarnation_list( const jdf_t *jdf, } while (NULL != body); string_arena_add_string(sa, " { .type = PARSEC_DEV_NONE,\n" - " .evaluate = NULL,\n" + " .flags = PARSEC_CHORE_FLAG_NONE,\n" " .hook = (parsec_hook_t*)NULL }, /* End marker */\n" "};\n\n"); } @@ -4497,6 +4498,7 @@ static void jdf_generate_startup_hook( const jdf_t *jdf ) " idx++;\n" " }\n" " chores[idx].type = PARSEC_DEV_NONE;\n" + " chores[idx].flags = PARSEC_CHORE_FLAG_NONE;\n" " chores[idx].evaluate = NULL;\n" " chores[idx].hook = NULL;\n" " /* Create the initialization tasks for each taskclass */\n" diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index d57ac904e..77533b324 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -4,6 +4,7 @@ * of 
Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2026 Stony Brook University. All rights reserved. */ #include "parsec/parsec_config.h" @@ -2498,6 +2499,16 @@ parsec_device_kernel_cleanout( parsec_device_gpu_module_t *gpu_device, return 0; } +/** + * Returns true if the task's completion should be shifted to worker threads, i.e. when the incarnation selected for the task (task_class->incarnations[selected_chore]) has PARSEC_CHORE_FLAG_SHIFT_COMPLETION set in its flags; gpu_device is not read by this function. + */ +static bool shift_completed_task(parsec_device_gpu_module_t* gpu_device, parsec_gpu_task_t* gpu_task) +{ + parsec_task_t* this_task = gpu_task->ec; + const __parsec_chore_t *chore = &this_task->task_class->incarnations[this_task->selected_chore]; + return (bool)(chore->flags & PARSEC_CHORE_FLAG_SHIFT_COMPLETION); +} + /** * This version is based on 4 streams: one for transfers from the memory to * the GPU, 2 for kernel executions and one for transfers from the GPU into @@ -2703,8 +2714,27 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module, goto remove_gpu_task; } parsec_device_kernel_epilog( gpu_device, gpu_task ); - __parsec_complete_execution( es, gpu_task->ec ); + +#if defined(PARSEC_DEBUG_PARANOID) + /** + * Batched submissions should have been split again for completion but + * in case this ever changes we will catch that here. 
+ */ + assert(parsec_gpu_task_is_singleton(gpu_task)); +#endif + if (shift_completed_task(gpu_device, gpu_task)) { + // ship the task to other threads to complete its execution + gpu_task->ec->status = PARSEC_TASK_STATUS_COMPLETE; + PARSEC_LIST_ITEM_SINGLETON(gpu_task->ec); + __parsec_schedule(es, gpu_task->ec, 1); + PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%d:%s]: task %p of gpu_task %p scheduled for completion", + gpu_device->super.device_index, gpu_device->super.name, + gpu_task->ec, gpu_task); + } else { + __parsec_complete_execution( es, gpu_task->ec ); + } gpu_device->super.executed_tasks++; + remove_gpu_task: PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%d:%s]: gpu_task %p freed", gpu_device->super.device_index, gpu_device->super.name, diff --git a/parsec/parsec_internal.h b/parsec/parsec_internal.h index 214d18c75..ab747edce 100644 --- a/parsec/parsec_internal.h +++ b/parsec/parsec_internal.h @@ -392,9 +392,12 @@ int parsec_update_deps_with_counter_count_task(parsec_taskpool_t *tp, const parsec_task_t* PARSEC_RESTRICT origin, const parsec_flow_t* PARSEC_RESTRICT origin_flow, const parsec_flow_t* PARSEC_RESTRICT dest_flow); - + +#define PARSEC_CHORE_FLAG_NONE 0x00 +#define PARSEC_CHORE_FLAG_SHIFT_COMPLETION 0x01 typedef struct __parsec_internal_incarnation_s { int32_t type; + int32_t flags; parsec_evaluate_function_t *evaluate; parsec_hook_t *hook; char *dyld;