From ac0e0ed4fcb162ec98625c00d5450b19018ed3ef Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 26 Mar 2025 14:24:12 -0400 Subject: [PATCH 1/6] GPU: Thread-shift completion of tasks to worker threads Completion is potentially costly (discovering new tasks) and might hit the network, so we had better move it away from the device management thread. Signed-off-by: Joseph Schuchart --- parsec/mca/device/device_gpu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index d57ac904e..c94569fc2 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -2703,7 +2703,10 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module, goto remove_gpu_task; } parsec_device_kernel_epilog( gpu_device, gpu_task ); - __parsec_complete_execution( es, gpu_task->ec ); + // ship the task to other threads to complete its execution + gpu_task->ec->status = PARSEC_TASK_STATUS_COMPLETE; + PARSEC_LIST_ITEM_SINGLETON(gpu_task->ec); + __parsec_schedule(es, gpu_task->ec, 1); gpu_device->super.executed_tasks++; remove_gpu_task: PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%d:%s]: gpu_task %p freed", From da86325888782d6e535368ce5a17c523fa695a71 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 14 May 2026 15:29:49 -0400 Subject: [PATCH 2/6] Decide thread-shifting completion based on chore flag The DSL can decide whether the completion of the task should be shifted to a worker thread by setting the PARSEC_CHORE_FLAG_SHIFT_COMPLETION flag on the chore. DTD and PTG will not currently thread-shift completions. 
Signed-off-by: Joseph Schuchart --- parsec/interfaces/ptg/ptg-compiler/jdf2c.c | 1 + parsec/mca/device/device_gpu.c | 24 ++++++++++++++++++---- parsec/parsec_internal.h | 5 ++++- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c index 37f7e5f10..98630107e 100644 --- a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c +++ b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c @@ -4497,6 +4497,7 @@ static void jdf_generate_startup_hook( const jdf_t *jdf ) " idx++;\n" " }\n" " chores[idx].type = PARSEC_DEV_NONE;\n" + " chores[idx].flags = PARSEC_CHORE_FLAG_NONE;\n" " chores[idx].evaluate = NULL;\n" " chores[idx].hook = NULL;\n" " /* Create the initialization tasks for each taskclass */\n" diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index c94569fc2..b3b38f72d 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -2498,6 +2498,16 @@ parsec_device_kernel_cleanout( parsec_device_gpu_module_t *gpu_device, return 0; } +/** + * Returns true if the task's completion should be shifted to worker threads. 
+ */ +static bool shift_completed_task(parsec_device_gpu_module_t* gpu_device, parsec_gpu_task_t* gpu_task) +{ + parsec_task_t* this_task = gpu_task->ec; + const __parsec_chore_t *chore = &this_task->task_class->incarnations[gpu_device->super.device_index]; + return (bool)(chore->flags & PARSEC_CHORE_FLAG_SHIFT_COMPLETION); +} + /** * This version is based on 4 streams: one for transfers from the memory to * the GPU, 2 for kernel executions and one for transfers from the GPU into @@ -2703,11 +2713,17 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module, goto remove_gpu_task; } parsec_device_kernel_epilog( gpu_device, gpu_task ); - // ship the task to other threads to complete its execution - gpu_task->ec->status = PARSEC_TASK_STATUS_COMPLETE; - PARSEC_LIST_ITEM_SINGLETON(gpu_task->ec); - __parsec_schedule(es, gpu_task->ec, 1); + + if (shift_completed_task(gpu_device, gpu_task)) { + // ship the task to other threads to complete its execution + gpu_task->ec->status = PARSEC_TASK_STATUS_COMPLETE; + PARSEC_LIST_ITEM_SINGLETON(gpu_task->ec); + __parsec_schedule(es, gpu_task->ec, 1); + } else { + __parsec_complete_execution( es, gpu_task->ec ); + } gpu_device->super.executed_tasks++; + remove_gpu_task: PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%d:%s]: gpu_task %p freed", gpu_device->super.device_index, gpu_device->super.name, diff --git a/parsec/parsec_internal.h b/parsec/parsec_internal.h index 214d18c75..ab747edce 100644 --- a/parsec/parsec_internal.h +++ b/parsec/parsec_internal.h @@ -392,9 +392,12 @@ int parsec_update_deps_with_counter_count_task(parsec_taskpool_t *tp, const parsec_task_t* PARSEC_RESTRICT origin, const parsec_flow_t* PARSEC_RESTRICT origin_flow, const parsec_flow_t* PARSEC_RESTRICT dest_flow); - + +#define PARSEC_CHORE_FLAG_NONE 0x00 +#define PARSEC_CHORE_FLAG_SHIFT_COMPLETION 0x01 typedef struct __parsec_internal_incarnation_s { int32_t type; + int32_t flags; parsec_evaluate_function_t *evaluate; parsec_hook_t *hook; char 
*dyld; From c3d31c2b744c74c892288b44ef90aadaacfeb8ad Mon Sep 17 00:00:00 2001 From: bosilca Date: Mon, 18 May 2026 16:15:46 -0400 Subject: [PATCH 3/6] Apply suggestion from @bosilca --- parsec/mca/device/device_gpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index b3b38f72d..d74fe30c8 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -2504,7 +2504,7 @@ parsec_device_kernel_cleanout( parsec_device_gpu_module_t *gpu_device, static bool shift_completed_task(parsec_device_gpu_module_t* gpu_device, parsec_gpu_task_t* gpu_task) { parsec_task_t* this_task = gpu_task->ec; - const __parsec_chore_t *chore = &this_task->task_class->incarnations[gpu_device->super.device_index]; + const __parsec_chore_t *chore = &this_task->task_class->incarnations[this_task->selected_chore]; return (bool)(chore->flags & PARSEC_CHORE_FLAG_SHIFT_COMPLETION); } From b43f4e16494ffa34e0fd6d61c52653c4fab1743b Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 8 Jun 2026 19:09:33 -0400 Subject: [PATCH 4/6] Add PARSEC_CHORE_FLAG_NONE to JDF Make sure we set the flag on the chore in JDF Co-authored-by: bosilca --- parsec/interfaces/ptg/ptg-compiler/jdf2c.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c index 98630107e..e1b3dbf65 100644 --- a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c +++ b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c @@ -3954,6 +3954,7 @@ jdf_generate_function_incarnation_list( const jdf_t *jdf, } string_arena_add_string(sa, "#if defined(PARSEC_HAVE_DEV_%s_SUPPORT)\n", dev_upper); string_arena_add_string(sa, " { .type = PARSEC_DEV_%s,\n", dev_upper); + string_arena_add_string(sa, " .flags = PARSEC_CHORE_FLAG_NONE,\n"); if( NULL == dyld_property ) { string_arena_add_string(sa, " .dyld = NULL,\n"); } else { @@ -3986,7 +3987,7 @@ 
jdf_generate_function_incarnation_list( const jdf_t *jdf, } while (NULL != body); string_arena_add_string(sa, " { .type = PARSEC_DEV_NONE,\n" - " .evaluate = NULL,\n" + " .flags = PARSEC_CHORE_FLAG_NONE,\n" " .hook = (parsec_hook_t*)NULL }, /* End marker */\n" "};\n\n"); } From c990f1822589fc7ad524785586478e9c5ede0b9d Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 8 Jun 2026 19:13:03 -0400 Subject: [PATCH 5/6] Assert that the GPU task is a singleton before completing This catches the case where we stop picking apart a batch before completing its tasks. Also, add debug output to signal that the task was sent for completion. Signed-off-by: Joseph Schuchart --- parsec/mca/device/device_gpu.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index d74fe30c8..77533b324 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -4,6 +4,7 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2026 Stony Brook University. All rights reserved. */ #include "parsec/parsec_config.h" @@ -2714,11 +2715,21 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module, } parsec_device_kernel_epilog( gpu_device, gpu_task ); +#if defined(PARSEC_DEBUG_PARANOID) + /** + * Batched submissions should have been split again for completion but + * in case this ever changes we will catch that here. 
+ */ + assert(parsec_gpu_task_is_singleton(gpu_task)); +#endif if (shift_completed_task(gpu_device, gpu_task)) { // ship the task to other threads to complete its execution gpu_task->ec->status = PARSEC_TASK_STATUS_COMPLETE; PARSEC_LIST_ITEM_SINGLETON(gpu_task->ec); __parsec_schedule(es, gpu_task->ec, 1); + PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%d:%s]: task %p of gpu_task %p scheduled for completion", + gpu_device->super.device_index, gpu_device->super.name, + gpu_task->ec, gpu_task); } else { __parsec_complete_execution( es, gpu_task->ec ); } From ac0008c89f7d38b9dd0a54a9b2896aa600b673b5 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 8 Jun 2026 19:13:32 -0400 Subject: [PATCH 6/6] DTD: set flags of chore to PARSEC_CHORE_FLAG_NONE Signed-off-by: Joseph Schuchart --- parsec/interfaces/dtd/insert_function.c | 1 + 1 file changed, 1 insertion(+) diff --git a/parsec/interfaces/dtd/insert_function.c b/parsec/interfaces/dtd/insert_function.c index af84dba73..abe5d73df 100644 --- a/parsec/interfaces/dtd/insert_function.c +++ b/parsec/interfaces/dtd/insert_function.c @@ -2371,6 +2371,7 @@ int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp, } incarnations[i].type = device_type; + incarnations[i].flags = PARSEC_CHORE_FLAG_NONE; if(PARSEC_DEV_CUDA == device_type) { incarnations[i].hook = parsec_dtd_gpu_task_submit; dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)function;