From 075891084fcd7074e784ae1eeea199530fdb726c Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 4 Jun 2026 17:07:58 -0400 Subject: [PATCH 1/2] Sort incoming device tasks using heap Currently, sorting of device tasks is broken (they don't actually contain a priority field) and linear over a slice of the input list. Instead, we can use a max-heap to keep a sorted array of device tasks, push all new tasks into the heap, and take out the highest priority task. We can detach the full lifo at once to avoid repeated atomic operations by the management thread. Also, use nolock variants on stream-local lists and change to lifo for the shared queue to avoid the lock of the fifo. They are only modified by the device management thread. Inter-thread communication happens through the pending queue. Signed-off-by: Joseph Schuchart --- parsec/CMakeLists.txt | 1 + parsec/class/lifo.h | 76 ++++ parsec/class/parsec_heap.c | 204 +++++++++ parsec/class/parsec_heap.h | 87 ++++ parsec/data.c | 2 +- parsec/hbbuffer.c | 4 +- parsec/maxheap.c | 422 +++++------------- parsec/maxheap.h | 55 ++- .../mca/device/cuda/device_cuda_component.c | 8 +- parsec/mca/device/cuda/device_cuda_module.c | 22 +- parsec/mca/device/device.h | 8 +- parsec/mca/device/device_gpu.c | 103 +---- parsec/mca/device/device_gpu.h | 54 +-- .../level_zero/device_level_zero_component.c | 8 +- .../level_zero/device_level_zero_module.c | 12 +- parsec/mca/sched/ltq/sched_ltq_module.c | 20 +- 16 files changed, 596 insertions(+), 490 deletions(-) create mode 100644 parsec/class/parsec_heap.c create mode 100644 parsec/class/parsec_heap.h diff --git a/parsec/CMakeLists.txt b/parsec/CMakeLists.txt index df3dfe05c..5a51b2344 100644 --- a/parsec/CMakeLists.txt +++ b/parsec/CMakeLists.txt @@ -17,6 +17,7 @@ set(BASE_SOURCES class/parsec_value_array.c class/parsec_hash_table.c class/parsec_rwlock.c + class/parsec_heap.c class/parsec_rbtree.c class/parsec_future.c class/parsec_datacopy_future.c diff --git a/parsec/class/lifo.h 
b/parsec/class/lifo.h index 69bacc2db..526541b89 100644 --- a/parsec/class/lifo.h +++ b/parsec/class/lifo.h @@ -2,6 +2,7 @@ * Copyright (c) 2009-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 Stony Brook University. All rights reserved. */ #ifndef LIFO_H_HAS_BEEN_INCLUDED @@ -232,6 +233,20 @@ LIFO_STATIC_INLINE int parsec_lifo_nolock_is_empty( parsec_lifo_t* lifo ) { return (NULL == lifo->lifo_head.data.item); } +/* Detach every element of a list_next-linked chain (no-op unless PARSEC_DEBUG_PARANOID); both variants expand to one statement */ +#if defined(PARSEC_DEBUG_PARANOID) +#define PARSEC_CHAIN_DETACH(item) do { \ + parsec_list_item_t *_item = (item); \ + while (_item != NULL) { \ + parsec_list_item_t *_next = (parsec_list_item_t *) _item->list_next; \ + PARSEC_ITEM_DETACH(_item); \ + _item = _next; \ + } \ +} while(0) +#else +#define PARSEC_CHAIN_DETACH(item) do { (void)(item); } while(0) +#endif + #if defined(PARSEC_ATOMIC_HAS_ATOMIC_CAS_INT128) /* Add one element to the FIFO. Returns true if successful, false otherwise. 
*/ @@ -356,6 +371,29 @@ LIFO_STATIC_INLINE parsec_list_item_t* parsec_lifo_try_pop( parsec_lifo_t* lifo return NULL; } +/** + * Atomically detach every element currently in the LIFO. + * + * @return head of the detached chain (elements linked via list_next), or NULL if the LIFO was empty. + */ +LIFO_STATIC_INLINE parsec_list_item_t * +parsec_lifo_detach_chain(parsec_lifo_t *lifo) +{ + parsec_counted_pointer_t old_head; + do { + old_head.data.guard.counter = lifo->lifo_head.data.guard.counter; + parsec_atomic_rmb(); + old_head.data.item = lifo->lifo_head.data.item; + if (NULL == old_head.data.item) return NULL; + } while (!parsec_update_counted_pointer(&lifo->lifo_head, old_head, NULL)); + parsec_atomic_wmb(); + parsec_list_item_t *item = old_head.data.item; + /* PARSEC_CHAIN_DETACH iterates on its own copy of the pointer, so 'item' + * still designates the chain head when it is returned. */ + PARSEC_CHAIN_DETACH(item); + return item; +} + #elif defined(PARSEC_ATOMIC_HAS_ATOMIC_LLSC_PTR) LIFO_STATIC_INLINE void _parsec_lifo_release_cpu (void) @@ -468,6 +506,23 @@ LIFO_STATIC_INLINE parsec_list_item_t* parsec_lifo_try_pop( parsec_lifo_t* lifo return item; } +LIFO_STATIC_INLINE parsec_list_item_t * +parsec_lifo_detach_chain(parsec_lifo_t *lifo) +{ + parsec_list_item_t *item; + int attempt = 0; + do { + if (++attempt == 5) { + _parsec_lifo_release_cpu(); + attempt = 0; + } + item = (parsec_list_item_t *)parsec_atomic_ll_ptr((long *)&lifo->lifo_head.data.item); + if (NULL == item) return NULL; + } while (!parsec_atomic_sc_ptr((long *)&lifo->lifo_head.data.item, (intptr_t)NULL)); + parsec_atomic_wmb(); + PARSEC_CHAIN_DETACH(item); + return item; +} #else /* defined(PARSEC_ATOMIC_HAS_ATOMIC_CAS_INT128) || defined(PARSEC_ATOMIC_HAS_ATOMIC_LLSC_PTR) */ @@ -547,6 +602,18 @@ LIFO_STATIC_INLINE parsec_list_item_t *parsec_lifo_try_pop(parsec_lifo_t* lifo) return item; } +LIFO_STATIC_INLINE parsec_list_item_t * +parsec_lifo_detach_chain(parsec_lifo_t *lifo) +{ + parsec_list_item_t *item; + parsec_atomic_lock(&lifo->lifo_head.data.guard.lock); + item = lifo->lifo_head.data.item; + 
lifo->lifo_head.data.item = NULL; + parsec_atomic_unlock(&lifo->lifo_head.data.guard.lock); + PARSEC_CHAIN_DETACH(item); + return item; +} + #endif /* defined(PARSEC_ATOMIC_HAS_ATOMIC_CAS_INT128) || defined(PARSEC_ATOMIC_HAS_ATOMIC_LLSC_PTR) */ LIFO_STATIC_INLINE void parsec_lifo_nolock_push( parsec_lifo_t* lifo, @@ -583,6 +650,15 @@ LIFO_STATIC_INLINE parsec_list_item_t* parsec_lifo_nolock_pop( parsec_lifo_t* li return item; } +LIFO_STATIC_INLINE parsec_list_item_t * +parsec_lifo_nolock_detach_chain(parsec_lifo_t *lifo) +{ + parsec_list_item_t *item = lifo->lifo_head.data.item; + lifo->lifo_head.data.item = NULL; + PARSEC_CHAIN_DETACH(item); + return item; +} + /** * @brief Allocate a lifo item. * diff --git a/parsec/class/parsec_heap.c b/parsec/class/parsec_heap.c new file mode 100644 index 000000000..bd92aa799 --- /dev/null +++ b/parsec/class/parsec_heap.c @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2026 Stony Brook University. All rights reserved. + */ + +#include "parsec/parsec_config.h" +#include "parsec/class/parsec_heap.h" +#include "parsec/constants.h" + +#include <assert.h> + +/* COMPARISON_VAL(ptr, offset) — reads *(int*) at byte offset inside ptr. + * Defined in parsec_config_bottom.h, included transitively via parsec_config.h. */ + +/* left child = list_prev + * right child = list_next + * Same convention as parsec/maxheap.c and parsec_rbtree.c. */ +#define HLEFT(item) ((parsec_list_item_t *)(item)->list_prev) +#define HRIGHT(item) ((parsec_list_item_t *)(item)->list_next) +#define HSET_LEFT(it, v) ((it)->list_prev = (volatile struct parsec_list_item_s *)(v)) +#define HSET_RIGHT(it, v) ((it)->list_next = (volatile struct parsec_list_item_s *)(v)) + +static inline int heap_cmp(const parsec_heap_t *h, + const parsec_list_item_t *a, + const parsec_list_item_t *b) +{ + int va = COMPARISON_VAL(a, h->comp_offset); + int vb = COMPARISON_VAL(b, h->comp_offset); + return (va > vb) - (va < vb); /* three-way compare: -1, 0 or +1 without overflow */ +} + +/* Maximum depth of the path array. 64 supports heaps of up to 2^64 elements. 
*/ +#define HEAP_MAX_DEPTH 64 + +int parsec_heap_push(parsec_heap_t *heap, parsec_list_item_t *item) +{ + HSET_LEFT(item, NULL); + HSET_RIGHT(item, NULL); + heap->size++; + + if (heap->size == 1) { + heap->top = item; + return PARSEC_SUCCESS; + } + + /* Find the insertion point by following the bit path of 'size'. + * After the leading 1-bit, each subsequent bit chooses right (1) or left (0). + * Save ancestors for the sift-up pass. + * (Same bit-navigation used by heap_insert in parsec/maxheap.c.) */ + parsec_list_item_t *path[HEAP_MAX_DEPTH]; + int depth = 0; + + size_t size = heap->size; + size_t bitmask = 1; + while (bitmask <= size) bitmask <<= 1; + bitmask >>= 2; /* position at bit just below the leading 1 */ + + parsec_list_item_t *node = heap->top; + path[depth++] = node; + while (bitmask > 1) { + node = (bitmask & size) ? HRIGHT(node) : HLEFT(node); + path[depth++] = node; + bitmask >>= 1; + } + /* Attach item as left (0) or right (1) child */ + if (bitmask & size) HSET_RIGHT(node, item); + else HSET_LEFT(node, item); + + /* Sift up: walk from immediate parent (path[depth-1]) toward root */ + int level = depth - 1; + while (level >= 0) { + parsec_list_item_t *parent = path[level]; + if (heap_cmp(heap, item, parent) <= 0) break; + + /* Fix grandparent to point to item instead of parent */ + if (level > 0) { + parsec_list_item_t *gp = path[level - 1]; + if (HLEFT(gp) == parent) HSET_LEFT(gp, item); + else HSET_RIGHT(gp, item); + } else { + heap->top = item; + } + + /* Swap item and parent: item takes parent's position, parent takes item's */ + parsec_list_item_t *pl = HLEFT(parent); + parsec_list_item_t *pr = HRIGHT(parent); + HSET_LEFT(parent, HLEFT(item)); + HSET_RIGHT(parent, HRIGHT(item)); + if (pl == item) { + HSET_LEFT(item, parent); + HSET_RIGHT(item, pr); + } else { + HSET_LEFT(item, pl); + HSET_RIGHT(item, parent); + } + level--; + } + return PARSEC_SUCCESS; +} + +parsec_list_item_t *parsec_heap_pop(parsec_heap_t *heap) +{ + if (0 == heap->size) 
return NULL; + + parsec_list_item_t *root = heap->top; + + if (heap->size == 1) { + heap->top = NULL; + heap->size = 0; + PARSEC_LIST_ITEM_SINGLETON(root); + return root; + } + + /* Navigate to the parent of the 'last' node (rightmost node in the + * bottom level), then detach it. Track which side it was on so we + * can correctly wire it into root's position even after clearing the + * pointer. */ + size_t size = heap->size; + size_t bitmask = 1; + while (bitmask <= size) bitmask <<= 1; + bitmask >>= 2; + + parsec_list_item_t *parent = heap->top; + while (bitmask > 1) { + parent = (bitmask & size) ? HRIGHT(parent) : HLEFT(parent); + bitmask >>= 1; + } + + parsec_list_item_t *last; + int last_was_right = (int)(bitmask & size); + if (last_was_right) { + last = HRIGHT(parent); + HSET_RIGHT(parent, NULL); + } else { + last = HLEFT(parent); + HSET_LEFT(parent, NULL); + } + assert(last != NULL); + + /* Wire 'last' into root's place, inheriting root's children. */ + if (parent != root) { + HSET_LEFT(last, HLEFT(root)); + HSET_RIGHT(last, HRIGHT(root)); + } else { + /* last was a direct child of root; one pointer was already cleared above */ + if (last_was_right) { + HSET_LEFT(last, HLEFT(root)); /* root's left is intact */ + HSET_RIGHT(last, NULL); /* last is a leaf */ + } else { + HSET_LEFT(last, NULL); /* last is a leaf */ + HSET_RIGHT(last, HRIGHT(root)); /* root's right is intact */ + } + } + heap->top = last; + heap->size--; + + /* Sift down: swap last with the larger child until heap order is restored. + * Track parent and which side we came from (no extra allocation needed). 
*/ + parsec_list_item_t *bubbler = last; + parsec_list_item_t *par = NULL; + int from_right = 0; + while (1) { + parsec_list_item_t *left = HLEFT(bubbler); + parsec_list_item_t *right = HRIGHT(bubbler); + int go_left = (left && heap_cmp(heap, left, bubbler) > 0 && + (!right || heap_cmp(heap, left, right) >= 0)); + int go_right = (!go_left && right && heap_cmp(heap, right, bubbler) > 0); + if (!go_left && !go_right) break; + + parsec_list_item_t *swap = go_left ? left : right; + if (par) { + if (from_right) HSET_RIGHT(par, swap); + else HSET_LEFT(par, swap); + } else { + heap->top = swap; + } + HSET_LEFT(bubbler, HLEFT(swap)); + HSET_RIGHT(bubbler, HRIGHT(swap)); + if (go_left) { + HSET_LEFT(swap, bubbler); + HSET_RIGHT(swap, right); + from_right = 0; + } else { + HSET_LEFT(swap, left); + HSET_RIGHT(swap, bubbler); + from_right = 1; + } + par = swap; + } + + PARSEC_LIST_ITEM_SINGLETON(root); + return root; +} + +int parsec_heap_push_chain(parsec_heap_t *heap, parsec_list_item_t *chain) +{ + parsec_list_item_t *item = chain; + while (item != NULL) { /* a NULL chain (nothing was detached) is a no-op */ + /* Capture list_next before parsec_heap_push repurposes it as right-child */ + parsec_list_item_t *next = (parsec_list_item_t *)item->list_next; + parsec_heap_push(heap, item); + item = (next != chain) ? next : NULL; /* a ring ends when it wraps back to its head */ + } + return PARSEC_SUCCESS; +} diff --git a/parsec/class/parsec_heap.h b/parsec/class/parsec_heap.h new file mode 100644 index 000000000..6409fc397 --- /dev/null +++ b/parsec/class/parsec_heap.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2026 Stony Brook University. All rights reserved. + */ + +#ifndef PARSEC_HEAP_H_HAS_BEEN_INCLUDED +#define PARSEC_HEAP_H_HAS_BEEN_INCLUDED + +#include "parsec/parsec_config.h" +#include "parsec/class/list_item.h" +#include "parsec/constants.h" + +BEGIN_C_DECLS + +/** + * @brief Intrusive pointer-based max-heap with an int32_t priority key. 
+ * + * @details Elements are linked directly through their parsec_list_item_t + * list_prev (left child) and list_next (right child) pointers, forming a + * complete binary tree — the same technique used by parsec/maxheap.c for + * CPU scheduler task heaps. No separate backing array is allocated. + * + * The priority of each element is read as *(int32_t*)((char*)element + + * comp_offset), matching the COMPARISON_VAL macro convention used by the + * rbtree and list sort. + * + * All operations are O(log N). Sift-up uses a small on-stack path array + * (max 64 entries; supports up to 2^64 elements). + * + * An element's list_prev/list_next are used as tree child pointers while it + * is in the heap. parsec_heap_pop() restores them to singleton state before + * returning, so the caller can pass the result directly to + * parsec_list_push_back() / parsec_gpu_stream_push_pending() etc. + */ +typedef struct parsec_heap_s { + parsec_list_item_t *top; /**< root of the complete binary tree */ + size_t size; /**< current element count */ + size_t comp_offset; /**< byte offset of int32_t priority key */ +} parsec_heap_t; + +/** Initialize an empty heap with the given priority-key offset. */ +static inline void parsec_heap_init(parsec_heap_t *heap, size_t comp_offset) { + heap->top = NULL; + heap->size = 0; + heap->comp_offset = comp_offset; +} + +/** Finalize heap (no-op: no allocation to free). */ +static inline void parsec_heap_fini(parsec_heap_t *heap) { + (void)heap; +} + +/** Return non-zero if the heap is empty. */ +static inline int parsec_heap_is_empty(const parsec_heap_t *heap) { + return (heap->size == 0); +} + +/** Return the number of elements. */ +static inline size_t parsec_heap_size(const parsec_heap_t *heap) { + return heap->size; +} + +/** View the maximum element without removing it. O(1). */ +static inline parsec_list_item_t *parsec_heap_peek(const parsec_heap_t *heap) { + return heap->top; +} + +/** + * Insert one element. O(log N). 
+ * @return PARSEC_SUCCESS (cannot fail; no allocation is performed). + */ +int parsec_heap_push(parsec_heap_t *heap, parsec_list_item_t *item); + +/** + * Remove and return the maximum element, or NULL if empty. O(log N). + * The returned item's list_prev and list_next are reset to singleton state. + */ +parsec_list_item_t *parsec_heap_pop(parsec_heap_t *heap); + +/** + * Batch-insert all elements from a chain or ring. + * @return PARSEC_SUCCESS (cannot fail). + */ +int parsec_heap_push_chain(parsec_heap_t *heap, parsec_list_item_t *chain); + +END_C_DECLS + +#endif /* PARSEC_HEAP_H_HAS_BEEN_INCLUDED */ diff --git a/parsec/data.c b/parsec/data.c index 306a6c507..29770d593 100644 --- a/parsec/data.c +++ b/parsec/data.c @@ -635,7 +635,7 @@ static void parsec_arena_datatype_construct(parsec_object_t *obj) { adt->ht_item.next_item = NULL; /* keep Coverity happy */ adt->ht_item.hash64 = 0; /* keep Coverity happy */ adt->ht_item.key = 0; /* keep Coverity happy */ - adt->opaque_dtt = NULL; + adt->opaque_dtt = PARSEC_DATATYPE_NULL; } static void parsec_arena_datatype_destruct(parsec_object_t *obj) { diff --git a/parsec/hbbuffer.c b/parsec/hbbuffer.c index 9dcb30ae3..64a16a095 100644 --- a/parsec/hbbuffer.c +++ b/parsec/hbbuffer.c @@ -250,9 +250,9 @@ parsec_hbbuffer_pop_best(parsec_hbbuffer_t *b, off_t priority_offset) #if defined(PARSEC_DEBUG_NOISIER) if( best_elt != NULL ) { char tmp[MAX_TASK_STRLEN]; - if (priority_offset == offsetof(parsec_heap_t, priority)) { + if (priority_offset == offsetof(parsec_task_heap_t, priority)) { PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "HBB:\tFound best element %s in heap %p in local queue %p at position %d", - parsec_task_snprintf(tmp, MAX_TASK_STRLEN, (parsec_task_t*)((parsec_heap_t*)best_elt)->top), best_elt, + parsec_task_snprintf(tmp, MAX_TASK_STRLEN, (parsec_task_t*)((parsec_task_heap_t*)best_elt)->heap.top), best_elt, b, best_idx); } else { PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "HBB:\tFound best element %s in local 
queue %p at position %d", diff --git a/parsec/maxheap.c b/parsec/maxheap.c index bba41d9b2..8d5d4c490 100644 --- a/parsec/maxheap.c +++ b/parsec/maxheap.c @@ -3,6 +3,7 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2026 Stony Brook University. All rights reserved. */ #include "parsec/parsec_config.h" @@ -13,8 +14,14 @@ #include "parsec/maxheap.h" #include +#include -static inline int hiBit(unsigned int n) +/* list_prev = left child, list_next = right child (same as parsec_heap.c) */ +#define HLEFT(item) ((parsec_list_item_t *)(item)->list_prev) +#define HRIGHT(item) ((parsec_list_item_t *)(item)->list_next) + +/* Highest set bit: used to compute sub-heap sizes in heap_split_and_steal. */ +static inline unsigned int hiBit(unsigned int n) { n |= (n >> 1); n |= (n >> 2); @@ -24,362 +31,153 @@ static inline int hiBit(unsigned int n) return n - (n >> 1); } -parsec_heap_t* heap_create(void) +parsec_task_heap_t* heap_create(void) { - parsec_heap_t* heap = calloc(1, sizeof(parsec_heap_t)); - /* Point back to the parent structure */ - heap->list_item.list_next = (parsec_list_item_t*)heap; - heap->list_item.list_prev = (parsec_list_item_t*)heap; - return heap; + parsec_task_heap_t *h = calloc(1, sizeof(parsec_task_heap_t)); + h->list_item.list_next = (parsec_list_item_t*)h; + h->list_item.list_prev = (parsec_list_item_t*)h; + h->priority = 0; + parsec_heap_init(&h->heap, offsetof(parsec_task_t, priority)); + return h; } -void heap_destroy(parsec_heap_t** heap) +void heap_destroy(parsec_task_heap_t **heap) { - assert((*heap)->top == NULL); + assert(parsec_heap_is_empty(&(*heap)->heap)); + parsec_heap_fini(&(*heap)->heap); free(*heap); - (*heap) = NULL; + *heap = NULL; } -/* - * Insertion is O(lg n), as we know exactly how to get to the next insertion point, - * and the tree is manually balanced. - * Overall build is O(n lg n) - * - * Destroys elem->list_item next and prev. 
- */ -void heap_insert(parsec_heap_t * heap, parsec_task_t * elem) +void heap_insert(parsec_task_heap_t *heap, parsec_task_t *elem) { assert(heap != NULL); assert(elem != NULL); - heap->size++; - elem->super.list_next = NULL; - elem->super.list_prev = NULL; - - if (heap->size == 1) { - heap->top = elem; - } else { - parsec_task_t * parent = heap->top; - unsigned int bitmask = 1, size = heap->size; - // prime the bitmask - int level_counter = 0, parents_size = 0; - while (bitmask <= size) { - bitmask = bitmask << 1; - level_counter++; - } - parents_size = level_counter; - - parsec_task_t ** parents = calloc(level_counter, sizeof(parsec_task_t *)); - // now the bitmask is two places farther than we want it, so back down - bitmask = bitmask >> 2; - - parents[--level_counter] = heap->top; - // now move through tree - while (bitmask > 1) { - parent = (parsec_task_t*)((bitmask & size) ? parent->super.list_next : parent->super.list_prev); - parents[--level_counter] = parent; // save parent - bitmask = bitmask >> 1; - } - if (bitmask & size) - parent->super.list_next = (parsec_list_item_t*)elem; - else - parent->super.list_prev = (parsec_list_item_t*)elem; - - // now bubble up to preserve max heap org. - while( (level_counter < parents_size) && - (parents[level_counter] != NULL) && - (elem->priority > parents[level_counter]->priority) ) { - parent = parents[level_counter]; - PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "MH:\tswapping parent %p and elem %p (priorities: %d and %d)", - parent, elem, parent->priority, elem->priority); - /* first, fix our grandparent, if necessary */ - if (level_counter + 1 < parents_size && parents[level_counter + 1] != NULL) { - parsec_task_t * grandparent = parents[level_counter + 1]; - // i.e. 
our parent has a parent - if (grandparent->super.list_prev /* left */ == (parsec_list_item_t*)parent) - grandparent->super.list_prev = (parsec_list_item_t*)elem; - else /* our grandparent's right child is our parent*/ - grandparent->super.list_next = (parsec_list_item_t*)elem; - } - - /* next, fix our parent */ - parsec_list_item_t * parent_left = (parsec_list_item_t*)parent->super.list_prev; - parsec_list_item_t * parent_right = (parsec_list_item_t*)parent->super.list_next; - parent->super.list_prev = elem->super.list_prev; - parent->super.list_next = elem->super.list_next; - - /* lastly, fix ourselves */ - if (parent_left == (parsec_list_item_t*)elem) { - /* we're our parent's left child */ - elem->super.list_prev = (parsec_list_item_t*)parent; - elem->super.list_next = (parsec_list_item_t*)parent_right; - } else { - /* we're out parent's right child */ - elem->super.list_prev = (parsec_list_item_t*)parent_left; - elem->super.list_next = (parsec_list_item_t*)parent; - } - - if (parent == heap->top) - heap->top = elem; - - level_counter++; - } - free(parents); - } - - /* set priority to top priority */ - heap->priority = heap->top->priority; + parsec_heap_push(&heap->heap, &elem->super); + heap->priority = (unsigned int)COMPARISON_VAL(heap->heap.top, heap->heap.comp_offset); #if defined(PARSEC_DEBUG_NOISIER) char tmp[MAX_TASK_STRLEN]; - PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "MH:\tInserted exec C %s (%p) into maxheap %p of size %u", - parsec_task_snprintf(tmp, MAX_TASK_STRLEN, elem), elem, heap, heap->size); + PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, + "MH:\tInserted exec C %s (%p) into maxheap %p of size %zu", + parsec_task_snprintf(tmp, MAX_TASK_STRLEN, elem), elem, + heap, heap->heap.size); #endif } -/* - * split-and-steal (remove) is O(1), although the preceding - * list search is probably O(n), technically, since eventually we - * end up with a list of n/2 trees with single nodes - * - * This function expects one valid heap (heap that has at least 
one element) - * and another pointer to a NULL heap pointer. - * If you pass a NULL heap, the function will simply return NULL. - * This function WILL destroy your heap if it empties it. - * It will also MODIFY your stack appropriately. If both of your heap pointers - * are NULL after it returns, there was only one element in the heap you passed. - * If only the new_heap pointer is NULL, then you still have one (and ONLY ONE) - * valid heap. - * If your valid heap had at least 3 nodes, then the heap will actually be split, - * a new heap pointer created and put on your stack. - * No matter what happens, an execution_context is returned unless the heap was NULL. - */ -parsec_task_t* -heap_split_and_steal(parsec_heap_t ** heap_ptr, - parsec_heap_t ** new_heap_ptr) +parsec_task_t* heap_remove(parsec_task_heap_t **heap_ptr) { - // if tree is empty, return NULL - // if tree has only one node (top), return new heap with single node - // moved into to_use slot - // if tree has left child but not right child, put left child in new tree + parsec_task_heap_t *heap = *heap_ptr; + if (NULL == heap) return NULL; - parsec_heap_t * heap = *heap_ptr; // shortcut to doing a bunch of (*heap_ptr)s - parsec_task_t * to_use = NULL; - (*new_heap_ptr) = NULL; // this should already be NULL, but if it's not, we'll fix that. 
+ parsec_list_item_t *item = parsec_heap_pop(&heap->heap); + if (NULL == item) return NULL; - if( NULL == heap ) return NULL; + parsec_task_t *task = (parsec_task_t*)item; - assert(heap->top != NULL); // this heap should have been destroyed - to_use = heap->top; // this will always be what we return, even if it's NULL, if a valid heap was passed - if( NULL == heap->top->super.list_prev ) { - /* no left child, so 'top' is the only node */ - PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "MH:\tDestroying heap %p", heap->top, heap->top->super.list_next, heap); - heap->top = NULL; + if (parsec_heap_is_empty(&heap->heap)) { heap_destroy(heap_ptr); - assert(*heap_ptr == NULL); - goto prepare_for_return; - } /* otherwise we do have left child */ - if( NULL == heap->top->super.list_next /* right */ ) { - assert(heap->size == 2); - /* but doesn't have right child, so still not splitting */ - heap->top = (parsec_task_t*)heap->top->super.list_prev; // left - assert(heap->top->super.list_next == NULL); - assert(heap->top->super.list_prev == NULL); - heap->priority = heap->top->priority; - heap->size--; // should equal 1 - /* set up doubly-linked singleton list in here, as DEFAULT scenario */ - // PETER TODO this comment needs to be better, b/c I don't understand it anymore + } else { + heap->priority = (unsigned int)COMPARISON_VAL(heap->heap.top, heap->heap.comp_offset); + /* Restore singleton list links so the wrapper can be re-inserted into a scheduler list */ heap->list_item.list_prev = (parsec_list_item_t*)*heap_ptr; heap->list_item.list_next = (parsec_list_item_t*)*heap_ptr; } - else { // heap has at least 3 nodes, so we should be actually splitting - unsigned int size = heap->size; - unsigned int highBit = hiBit(heap->size); - unsigned int twoBit = highBit >> 1; - assert(heap->size >= 3); - (*new_heap_ptr) = heap_create(); - (*new_heap_ptr)->top = (parsec_task_t*)heap->top->super.list_prev; // left - (*new_heap_ptr)->priority = (*new_heap_ptr)->top->priority; - heap->top 
= (parsec_task_t*)heap->top->super.list_next; - heap->priority = heap->top->priority; - if (twoBit & size) { // last item is on right side - heap->size = ~highBit & size; - (*new_heap_ptr)->size = size - heap->size - 1; - } - else { // last item is on left side - (*new_heap_ptr)->size = (size & ~highBit) + twoBit; - heap->size = size - (*new_heap_ptr)->size - 1; - } - /* set up doubly-linked two-element list in here, as DEFAULT scenario */ - heap->list_item.list_prev = (parsec_list_item_t*)(*new_heap_ptr); - heap->list_item.list_next = (parsec_list_item_t*)(*new_heap_ptr); - (*new_heap_ptr)->list_item.list_prev = (parsec_list_item_t*)heap; - (*new_heap_ptr)->list_item.list_next = (parsec_list_item_t*)heap; - PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "MH:\tSplit heap %p into itself and heap %p", heap, *new_heap_ptr); - } - prepare_for_return: - PARSEC_LIST_ITEM_SINGLETON(to_use); + + task->super.list_next = (parsec_list_item_t*)task; /* safety */ + task->super.list_prev = (parsec_list_item_t*)task; #if defined(PARSEC_DEBUG_NOISIER) - { + if (task != NULL) { char tmp[MAX_TASK_STRLEN]; - PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "MH:\tStole exec C %s (%p) from heap %p", - parsec_task_snprintf(tmp, MAX_TASK_STRLEN, to_use), to_use, heap); + PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, + "MH:\tStole exec C %s (%p) from heap %p", + parsec_task_snprintf(tmp, MAX_TASK_STRLEN, task), task, *heap_ptr); } #endif - return to_use; + return task; } -// cannot be made thread-safe with atomics -parsec_task_t* heap_remove(parsec_heap_t ** heap_ptr) +parsec_task_t* +heap_split_and_steal(parsec_task_heap_t **heap_ptr, + parsec_task_heap_t **new_heap_ptr) { - parsec_task_t * to_use = NULL; - parsec_heap_t * heap = *heap_ptr; + parsec_task_heap_t *heap = *heap_ptr; + *new_heap_ptr = NULL; + if (NULL == heap) return NULL; - if (heap != NULL) { - assert(heap->top != NULL); // this heap should have been destroyed - to_use = heap->top; // this will always be what we return, even 
if it's NULL, if a valid heap was passed - if (heap->top->super.list_prev == NULL) { - /* no left child, so 'top' is the only node */ - PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "MH:\tDestroying heap %p", heap->top, heap->top->super.list_next, heap); - assert(heap->size == 1); - heap->top = NULL; - heap_destroy(heap_ptr); - assert(*heap_ptr == NULL); - } - else { /* does have left child */ - if (heap->top->super.list_next /* right */ == NULL) { - assert(heap->size == 2); - /* but doesn't have right child, so still not splitting */ - heap->top = (parsec_task_t*)heap->top->super.list_prev; // left - /* set up doubly-linked singleton list in here, as DEFAULT scenario */ - heap->list_item.list_prev = (parsec_list_item_t*)*heap_ptr; - heap->list_item.list_next = (parsec_list_item_t*)*heap_ptr; - } - else { // heap has at least 3 nodes, so we do fancy removal - assert(heap->size >= 3); - /* - the strategy here is to find the 'last' node in the 'complete' heap - and swap it up to replace the top node (which is being removed), because - it is the only node that can be moved without making the heap 'incomplete'. - Once the swap is made, in order to preserve priority order, we then - 'bubble down' in the direction of the higher of any higher children. - */ - parsec_task_t * parent = heap->top; - unsigned int bitmask = 1; - unsigned int size = heap->size; - // this allows us to count the number of layers in the heap - while (bitmask <= size) - bitmask = bitmask << 1; - /* at this point, the ith bit in bitmask tells us that we have i - 1 layers... - * ...so we shift down one to get rid of the 'extra' layer, - * and another to prepare for the following logic, which only 'moves' - * through the heap until the second-to-last layer. - */ - bitmask = bitmask >> 2; - while (bitmask > 1) { - /* the "bitmask & size" operation is a simple way of moving - * through the heap one layer at a time in the direction of the - * 'last' element in the 'complete' heap. 
- */ - parent = (parsec_task_t*)( - (bitmask & size) ? parent->super.list_next : parent->super.list_prev); - bitmask = bitmask >> 1; - } + parsec_heap_t *h = &heap->heap; + assert(h->top != NULL); + + parsec_task_t *to_use = (parsec_task_t*)h->top; + + if (NULL == HLEFT(h->top)) { + /* Only root — no children */ + PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, + "MH:\tDestroying heap %p (single node)", heap); + h->top = NULL; + h->size = 0; + heap_destroy(heap_ptr); + goto prepare_for_return; + } + + if (NULL == HRIGHT(h->top)) { + /* Root has only a left child (size == 2) */ + assert(h->size == 2); + parsec_list_item_t *left = HLEFT(h->top); + h->top = left; + h->size = 1; + heap->priority = (unsigned int)COMPARISON_VAL(left, h->comp_offset); + heap->list_item.list_prev = (parsec_list_item_t*)*heap_ptr; + heap->list_item.list_next = (parsec_list_item_t*)*heap_ptr; + goto prepare_for_return; + } - if (bitmask & size) { // LAST NODE IS A 'NEXT' NODE - heap->top = (parsec_task_t*)parent->super.list_next; - // should ALWAYS be a leaf node - assert(heap->top != NULL); - assert(heap->top->super.list_next == NULL); - assert(heap->top->super.list_prev == NULL); - if (parent != to_use) { // if not a second-level-from-the-top node... 
- heap->top->super.list_next = to_use->super.list_next; - parent->super.list_next = NULL; - } - else - heap->top->super.list_next = NULL; - heap->top->super.list_prev = to_use->super.list_prev; - } - else { // LAST NODE IS A 'PREV' NODE - heap->top = (parsec_task_t*)parent->super.list_prev; - // should ALWAYS be a leaf node - assert(heap->top != NULL); - assert(heap->top->super.list_next == NULL); - assert(heap->top->super.list_prev == NULL); - /* a prev node isn't on the second level from the top - * (because otherwise size == 2), so we safely assume it has a parent - */ - heap->top->super.list_next = to_use->super.list_next; - heap->top->super.list_prev = to_use->super.list_prev; - parent->super.list_prev = NULL; - } + /* >= 3 nodes: split into left (new_heap) and right (heap) subtrees */ + { + unsigned int size = (unsigned int)h->size; + unsigned int highBit = hiBit(size); + unsigned int twoBit = highBit >> 1; - // now bubble down - parsec_task_t * bubbler = heap->top; - int is_next = -1; /* flag keeps track of whether we are 'prev' or 'next' to our current PARENT. - * the initial value doesn't matter since we're at the top and have no parent. 
*/ - parent = NULL; - while (1) { - parsec_task_t * next = (parsec_task_t*)bubbler->super.list_next; - parsec_task_t * prev = (parsec_task_t*)bubbler->super.list_prev; - // first, compare all three priorities to see which way to bubble, if any - if (prev != NULL && prev->priority > bubbler->priority && - (next == NULL || prev->priority >= next->priority)) { - // bubble toward (swap with) prev - if (parent) { - if (is_next) - parent->super.list_next = (parsec_list_item_t *)prev; - else - parent->super.list_prev = (parsec_list_item_t *)prev; - } - else - heap->top = prev; + *new_heap_ptr = heap_create(); + (*new_heap_ptr)->heap.comp_offset = h->comp_offset; - bubbler->super.list_prev = prev->super.list_prev; - bubbler->super.list_next = prev->super.list_next; - prev->super.list_prev = (parsec_list_item_t *)bubbler; - prev->super.list_next = (parsec_list_item_t *)next; + parsec_list_item_t *left_top = HLEFT(h->top); + parsec_list_item_t *right_top = HRIGHT(h->top); - is_next = 0; // b/c we will be our parent's PREV in the next round - parent = prev; - } - else if (next != NULL && next->priority > bubbler->priority && - (prev == NULL || next->priority > prev->priority)) { - // bubble toward next - if (parent) { - if (is_next) - parent->super.list_next = (parsec_list_item_t *)next; - else - parent->super.list_prev = (parsec_list_item_t *)next; - } - else - heap->top = next; + (*new_heap_ptr)->heap.top = left_top; + (*new_heap_ptr)->priority = (unsigned int)COMPARISON_VAL(left_top, h->comp_offset); - bubbler->super.list_prev = next->super.list_prev; - bubbler->super.list_next = next->super.list_next; - next->super.list_prev = (parsec_list_item_t *)prev; - next->super.list_next = (parsec_list_item_t *)bubbler; + h->top = right_top; + heap->priority = (unsigned int)COMPARISON_VAL(right_top, h->comp_offset); - is_next = 1; // b/c we will be our parent's NEXT in the next round - parent = next; - } - else // either both next and prev are NULL, or neither has a higher priority 
than bubbler - break; - } - } - heap->size--; - heap->priority = heap->top->priority; + if (twoBit & size) { /* last node is in the right subtree */ + h->size = (size_t)(~highBit & size); + (*new_heap_ptr)->heap.size = (size_t)(size - (unsigned int)h->size - 1); + } else { /* last node is in the left subtree */ + (*new_heap_ptr)->heap.size = (size_t)((size & ~highBit) + twoBit); + h->size = (size_t)(size - (unsigned int)(*new_heap_ptr)->heap.size - 1); } - to_use->super.list_next = (parsec_list_item_t*)to_use; // safety's - to_use->super.list_prev = (parsec_list_item_t*)to_use; // sake + + /* Form a two-element ring so the caller can re-singleton each side */ + heap->list_item.list_prev = (parsec_list_item_t*)(*new_heap_ptr); + heap->list_item.list_next = (parsec_list_item_t*)(*new_heap_ptr); + (*new_heap_ptr)->list_item.list_prev = (parsec_list_item_t*)heap; + (*new_heap_ptr)->list_item.list_next = (parsec_list_item_t*)heap; + PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, + "MH:\tSplit heap %p into itself and heap %p", heap, *new_heap_ptr); } + prepare_for_return: + PARSEC_LIST_ITEM_SINGLETON(to_use); + #if defined(PARSEC_DEBUG_NOISIER) - if (to_use != NULL) { + { char tmp[MAX_TASK_STRLEN]; - PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "MH:\tStole exec C %s (%p) from heap %p", parsec_task_snprintf(tmp, MAX_TASK_STRLEN, to_use), to_use, heap); + PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, + "MH:\tStole exec C %s (%p) from heap %p", + parsec_task_snprintf(tmp, MAX_TASK_STRLEN, to_use), to_use, *heap_ptr); } #endif return to_use; } - diff --git a/parsec/maxheap.h b/parsec/maxheap.h index 780b773b0..6c46911f1 100644 --- a/parsec/maxheap.h +++ b/parsec/maxheap.h @@ -2,42 +2,55 @@ * Copyright (c) 2009-2017 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 Stony Brook University. All rights reserved. 
*/ #ifndef MAXHEAP_H_HAS_BEEN_INCLUDED #define MAXHEAP_H_HAS_BEEN_INCLUDED #include "parsec/parsec_config.h" -#include "parsec/class/list_item.h" +#include "parsec/class/parsec_heap.h" +#include "parsec/runtime.h" BEGIN_C_DECLS /** - * The structure implemented here is not thread safe. All concurrent - * accesses should be protected by the upper level. + * Wrapper around parsec_heap_t that adds the list_item field (so the heap + * can be stored in scheduler lists) and an explicit 'priority' field (the + * max priority of any task in the heap, used by parsec_hbbuffer_pop_best + * to pick the best heap to steal from without traversing the tree). + * + * Not thread-safe; all concurrent accesses must be protected by the caller. */ +typedef struct parsec_task_heap_s { + parsec_list_item_t list_item; /**< for compatibility with scheduler lists */ + unsigned int priority; /**< max priority of any task in this heap */ + parsec_heap_t heap; /**< pointer-based max-heap storage */ +} parsec_task_heap_t; -/* main struct holding size info and ID */ -typedef struct parsec_heap_s { - parsec_list_item_t list_item; /* to be compatible with the lists */ - unsigned int size; - unsigned int priority; - parsec_task_t * top; -} parsec_heap_t; +/** Allocate an empty heap as a singleton list item with zero priority. */ +parsec_task_heap_t* heap_create(void); -/* - allocates an empty heap as a correctly doubly-linked singleton list - with the lowest possible priority - */ -parsec_heap_t* heap_create(void); +/** Free an empty heap. Asserts that the heap is empty. */ +void heap_destroy(parsec_task_heap_t** heap); -void heap_destroy(parsec_heap_t** heap); +/** Insert a task into the heap, updating the stored max priority. 
*/ +void heap_insert(parsec_task_heap_t *heap, parsec_task_t *elem); -void heap_insert(parsec_heap_t * heap, parsec_task_t * elem); -parsec_task_t* -heap_split_and_steal(parsec_heap_t ** heap_ptr, - parsec_heap_t ** new_heap_ptr); -parsec_task_t * heap_remove(parsec_heap_t ** heap_ptr); +/** + * Remove the maximum-priority task from the heap and, if the heap has at + * least 3 nodes, split it into two sub-heaps for work stealing. + * On return, *heap_ptr and *new_heap_ptr are the two sub-heaps (either + * may be NULL if the original heap had fewer than 3 nodes). + */ +parsec_task_t* heap_split_and_steal(parsec_task_heap_t **heap_ptr, + parsec_task_heap_t **new_heap_ptr); + +/** + * Remove the maximum-priority task from the heap. + * If the heap becomes empty it is destroyed and *heap_ptr is set to NULL. + */ +parsec_task_t* heap_remove(parsec_task_heap_t **heap_ptr); END_C_DECLS diff --git a/parsec/mca/device/cuda/device_cuda_component.c b/parsec/mca/device/cuda/device_cuda_component.c index e29f144e9..91c87accb 100644 --- a/parsec/mca/device/cuda/device_cuda_component.c +++ b/parsec/mca/device/cuda/device_cuda_component.c @@ -3,6 +3,7 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2026 Stony Brook University. All rights reserved. 
*/ #include "parsec/parsec_config.h" @@ -41,7 +42,6 @@ int parsec_cuda_memory_block_size, parsec_cuda_memory_percentage, parsec_cuda_me char* parsec_cuda_lib_path = NULL; static int cuda_mask; -static int parsec_cuda_sort_pending; #if defined(PARSEC_PROF_TRACE) int parsec_device_cuda_one_profiling_stream_per_gpu_stream = 0; @@ -114,9 +114,6 @@ static int device_cuda_component_query(mca_base_module_t **module, int *priority assert( NULL == parsec_device_cuda_component.modules[j] ); continue; } - if(parsec_cuda_sort_pending) { - parsec_device_cuda_component.modules[j]->sort_pending_list = parsec_device_sort_pending_list; - } parsec_device_cuda_component.modules[j]->component = &parsec_device_cuda_component; j++; /* next available spot */ parsec_device_cuda_component.modules[j] = NULL; @@ -164,9 +161,6 @@ static int device_cuda_component_register(void) (void)parsec_mca_param_reg_int_name("device_cuda", "max_streams", "Maximum number of Streams to use for the GPU engine; 2 streams are used for communication between host and device, so the minimum is 3", false, false, PARSEC_GPU_MAX_STREAMS, &parsec_cuda_max_streams); - (void)parsec_mca_param_reg_int_name("device_cuda", "sort_pending_tasks", - "Boolean to let the GPU engine sort the first pending tasks stored in the list", - false, false, 0, &parsec_cuda_sort_pending); #if defined(PARSEC_PROF_TRACE) (void)parsec_mca_param_reg_int_name("device_cuda", "one_profiling_stream_per_cuda_stream", "Boolean to separate the profiling of each cuda stream into a single profiling stream", diff --git a/parsec/mca/device/cuda/device_cuda_module.c b/parsec/mca/device/cuda/device_cuda_module.c index 932d968e8..fe07c19ad 100644 --- a/parsec/mca/device/cuda/device_cuda_module.c +++ b/parsec/mca/device/cuda/device_cuda_module.c @@ -3,6 +3,7 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2024-2026 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2026 Stony Brook University. All rights reserved. 
*/ #include "parsec/parsec_config.h" @@ -23,7 +24,9 @@ #include "parsec/utils/debug.h" #include "parsec/utils/argv.h" #include "parsec/utils/zone_malloc.h" -#include "parsec/class/fifo.h" +#include "parsec/class/lifo.h" +#include "parsec/class/parsec_heap.h" +#include #include #include @@ -161,7 +164,7 @@ static int parsec_cuda_all_devices_attached(parsec_device_module_t *device) for( int j = 0; NULL != (target_gpu = (parsec_device_cuda_module_t*)parsec_device_cuda_component.modules[j]); j++ ) { if( target_gpu == source_gpu ) { /* always set bit for self-access */ - source_gpu->super.peer_access_mask = (int16_t)(source_gpu->super.peer_access_mask | + source_gpu->super.peer_access_mask = (int16_t)(source_gpu->super.peer_access_mask | (int16_t)(1 << target_gpu->super.super.device_index)); continue; } @@ -173,7 +176,7 @@ static int parsec_cuda_all_devices_attached(parsec_device_module_t *device) cudastatus = cudaDeviceEnablePeerAccess( target_gpu->cuda_index, 0 ); PARSEC_CUDA_CHECK_ERROR( "(parsec_device_cuda_component_query) cuCtxEnablePeerAccess", cudastatus, {continue;} ); - source_gpu->super.peer_access_mask = (int16_t)(source_gpu->super.peer_access_mask | + source_gpu->super.peer_access_mask = (int16_t)(source_gpu->super.peer_access_mask | (int16_t)(1 << target_gpu->super.super.device_index)); } } @@ -416,7 +419,7 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module ) double fp16, fp32, fp64, tf32; struct cudaDeviceProp prop; - show_caps_index = parsec_mca_param_find("device", NULL, "show_capabilities"); + show_caps_index = parsec_mca_param_find("device", NULL, "show_capabilities"); if(0 < show_caps_index) { parsec_mca_param_lookup_int(show_caps_index, &show_caps); } @@ -510,7 +513,7 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module ) /* Each 'exec' stream gets its own profiling stream, except IN and OUT stream that share it. 
* It's good to separate the exec streams to know what was submitted to what stream * We don't have this issue for the IN and OUT streams because types of event discriminate - * what happens where, and separating them consumes memory and increases the number of + * what happens where, and separating them consumes memory and increases the number of * events that needs to be matched between streams because we cannot differentiate some * ends between IN or OUT, so they are all logged on the same stream. */ gpu_device->trackable_events = PARSEC_PROFILE_GPU_TRACK_EXEC | PARSEC_PROFILE_GPU_TRACK_DATA_OUT @@ -567,9 +570,9 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module ) /* Initialize internal lists */ PARSEC_OBJ_CONSTRUCT(&gpu_device->gpu_mem_lru, parsec_list_t); PARSEC_OBJ_CONSTRUCT(&gpu_device->gpu_mem_owned_lru, parsec_list_t); - PARSEC_OBJ_CONSTRUCT(&gpu_device->pending, parsec_fifo_t); + PARSEC_OBJ_CONSTRUCT(&gpu_device->pending, parsec_lifo_t); + parsec_heap_init(&gpu_device->pending_heap, offsetof(parsec_gpu_task_t, priority)); - gpu_device->sort_starting_p = NULL; gpu_device->peer_access_mask = 0; /* No GPU to GPU direct transfer by default */ device->memory_register = parsec_cuda_memory_register; @@ -641,7 +644,7 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module ) #if defined(PARSEC_PROF_TRACE) if( NULL != exec_stream->profiling ) { /* No function to clean the profiling stream. 
If one is introduced - * some day, remember that exec streams 0 and 1 always share the same + * some day, remember that exec streams 0 and 1 always share the same * ->profiling stream, and that all of them share the same * ->profiling stream if parsec_device_cuda_one_profiling_stream_per_cuda_stream == 0 */ } @@ -671,8 +674,9 @@ parsec_cuda_module_fini(parsec_device_module_t* device) /* Release the registered memory */ parsec_device_memory_release(gpu_device); - /* Release pending queue */ + /* Release pending queue and heap */ PARSEC_OBJ_DESTRUCT(&gpu_device->pending); + parsec_heap_fini(&gpu_device->pending_heap); /* Release all streams */ for( j = 0; j < gpu_device->num_exec_streams; j++ ) { diff --git a/parsec/mca/device/device.h b/parsec/mca/device/device.h index a08bd7659..c84cef393 100644 --- a/parsec/mca/device/device.h +++ b/parsec/mca/device/device.h @@ -3,6 +3,7 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2024-2026 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2026 Stony Brook University. All rights reserved. */ /** @addtogroup parsec_device @@ -117,12 +118,6 @@ typedef int (*parsec_device_memory_release_f)(parsec_device_module_t*); typedef int (*parsec_device_data_advise_f)(parsec_device_module_t*, parsec_data_t*, int); typedef void* (*parsec_device_find_function_f)(parsec_device_module_t*, char*); -/** - * Reorders the list of pending tasks on the current device based on the - * current heuristic implemented by the device - */ -typedef int (*parsec_device_sort_pending_list_function_f)(parsec_device_module_t*); - /** * Schedules some kernel represented by @p task on the device @p module, * from the execution stream @p es. 
@@ -155,7 +150,6 @@ struct parsec_device_module_s { parsec_device_memory_release_f memory_release; parsec_device_data_advise_f data_advise; parsec_device_find_function_f find_function; - parsec_device_sort_pending_list_function_f sort_pending_list; parsec_device_kernel_scheduler_function_t kernel_scheduler; parsec_device_all_devices_attached_f all_devices_attached; diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index 67e7ca6c3..ff653c3cb 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -3,6 +3,7 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2024-2026 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2026 Stony Brook University. All rights reserved. */ #include "parsec/parsec_config.h" @@ -189,6 +190,7 @@ static void parsec_device_task_t_constructor(parsec_gpu_task_t *gpu_task) gpu_task->last_data_check_epoch = UINT64_MAX; /* force at least one validation for the task */ gpu_task->nb_flows = 0; gpu_task->flow_info = NULL; + gpu_task->priority = -1; // priority is inherited from the task /* Default release mechanism, can be replaced by the DSL */ gpu_task->release_device_task = parsec_device_release_gpu_task; } @@ -392,60 +394,6 @@ void parsec_device_enable_debug(void) } } -int parsec_device_sort_pending_list(parsec_device_module_t *device) -{ - if( !PARSEC_DEV_IS_GPU(device->type) ) - return 0; - - parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t *)device; - parsec_list_t *sort_list = gpu_device->exec_stream[0]->fifo_pending; - - if (parsec_list_is_empty(sort_list) ) { /* list is empty */ - return 0; - } - - if (gpu_device->sort_starting_p == NULL || !parsec_list_nolock_contains(sort_list, gpu_device->sort_starting_p) ) { - gpu_device->sort_starting_p = (parsec_list_item_t*)sort_list->ghost_element.list_next; - } - - /* p is head */ - parsec_list_item_t *p = gpu_device->sort_starting_p; - int i, j, NB_SORT = 10, space_q, space_min; - - 
parsec_list_item_t *q, *prev_p, *min_p; - for (i = 0; i < NB_SORT; i++) { - if ( p == &(sort_list->ghost_element) ) { - break; - } - min_p = p; /* assume the minimum one is the first one p */ - q = (parsec_list_item_t*)min_p->list_next; - space_min = parsec_device_check_space_needed(gpu_device, (parsec_gpu_task_t*)min_p); - for (j = i+1; j < NB_SORT; j++) { - if ( q == &(sort_list->ghost_element) ) { - break; - } - space_q = parsec_device_check_space_needed(gpu_device, (parsec_gpu_task_t*)q); - if ( space_min > space_q ) { - min_p = q; - space_min = space_q; - } - q = (parsec_list_item_t*)q->list_next; - - } - if (min_p != p) { /* minimum is not the first one, let's insert min_p before p */ - /* take min_p out */ - parsec_list_item_ring_chop(min_p); - PARSEC_LIST_ITEM_SINGLETON(min_p); - prev_p = (parsec_list_item_t*)p->list_prev; - - /* insert min_p after prev_p */ - parsec_list_add_after( sort_list, prev_p, min_p); - } - p = (parsec_list_item_t*)min_p->list_next; - } - - return 0; -} void* parsec_device_pop_workspace(parsec_device_gpu_module_t* gpu_device, parsec_gpu_exec_stream_t* gpu_stream, size_t size) @@ -762,7 +710,7 @@ parsec_device_data_advise(parsec_device_module_t *dev, parsec_data_t *data, int "GPU[%d:%s]: data copy %p [ref_count %d] linked to prefetch gpu task %p on GPU copy %p", gpu_device->super.device_index, gpu_device->super.name, gpu_task->ec->data[0].data_in, gpu_task->ec->data[0].data_in->super.super.obj_reference_count, gpu_task, gpu_task->ec->data[0].data_out); - parsec_fifo_push( &(gpu_device->pending), (parsec_list_item_t*)gpu_task ); + parsec_lifo_push( &(gpu_device->pending), (parsec_list_item_t*)gpu_task ); return PARSEC_SUCCESS; } break; @@ -2164,19 +2112,6 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, return 1; /* positive returns have special meaning and are used for optimizations */ } -#if PARSEC_GPU_USE_PRIORITIES - -static inline parsec_list_item_t* parsec_device_push_task_ordered( parsec_list_t* list, - 
parsec_list_item_t* elem ) -{ - parsec_list_push_sorted(list, elem, parsec_execution_context_priority_comparator); - return elem; -} -#define PARSEC_PUSH_TASK parsec_device_push_task_ordered -#else -#define PARSEC_PUSH_TASK parsec_list_push_back -#endif - static inline int parsec_gpu_task_is_singleton(parsec_gpu_task_t *task) { @@ -2199,10 +2134,10 @@ parsec_gpu_stream_push_pending(parsec_gpu_exec_stream_t *stream, * order when feeding the tasks to the next stream. */ if( !parsec_gpu_task_is_singleton(task) ) { - parsec_list_chain_back(stream->fifo_pending, &task->list_item); + parsec_list_nolock_chain_back(stream->fifo_pending, &task->list_item); return; } - PARSEC_PUSH_TASK(stream->fifo_pending, &task->list_item); + parsec_list_nolock_push_back(stream->fifo_pending, &task->list_item); } static inline int @@ -2257,7 +2192,6 @@ parsec_gpu_task_collect_batch(parsec_gpu_exec_stream_t *gpu_stream, fifo_pending = gpu_stream->fifo_pending; assert(NULL != fifo_pending); - parsec_list_lock(fifo_pending); for(item = (parsec_list_item_t *)fifo_pending->ghost_element.list_next; item != &fifo_pending->ghost_element; item = next) { @@ -2270,7 +2204,6 @@ parsec_gpu_task_collect_batch(parsec_gpu_exec_stream_t *gpu_stream, } rc = callback(candidate, batch_head, callback_data); if( rc < 0 ) { - parsec_list_unlock(fifo_pending); return rc; } if( 0 == rc ) { @@ -2279,7 +2212,6 @@ parsec_gpu_task_collect_batch(parsec_gpu_exec_stream_t *gpu_stream, nb_tasks++; } } - parsec_list_unlock(fifo_pending); return nb_tasks; } @@ -2332,6 +2264,7 @@ parsec_device_send_transfercomplete_cmd_to_device(parsec_data_copy_t *copy, gpu_task->ec = calloc(1, sizeof(parsec_task_t)); PARSEC_OBJ_CONSTRUCT(gpu_task->ec, parsec_task_t); gpu_task->ec->task_class = &parsec_device_d2d_complete_tc; + gpu_task->ec->priority = INT32_MAX; /* This task should be executed as soon as possible */ gpu_task->nb_flows = 1; gpu_task->flow_info[0].flow = &parsec_device_d2d_complete_flow; gpu_task->flow_info[0].flow_span = 
copy->original->span; @@ -2352,7 +2285,7 @@ parsec_device_send_transfercomplete_cmd_to_device(parsec_data_copy_t *copy, current_dev->device_index, current_dev->name, gpu_task->ec->data[0].data_out, gpu_task->ec->data[0].data_out->super.super.obj_reference_count, dst_dev->device_index, dst_dev->name); - parsec_fifo_push( &(((parsec_device_gpu_module_t*)dst_dev)->pending), (parsec_list_item_t*)gpu_task ); + parsec_lifo_push( &(((parsec_device_gpu_module_t*)dst_dev)->pending), (parsec_list_item_t*)gpu_task ); } static int @@ -2654,7 +2587,7 @@ parsec_device_progress_stream( parsec_device_gpu_module_t* gpu_device, grab_a_task: assert(NULL == task); if( NULL == stream->tasks[stream->start] ) { /* there is room on the stream */ - task = (parsec_gpu_task_t*)parsec_list_pop_front(stream->fifo_pending); /* get the best task */ + task = (parsec_gpu_task_t*)parsec_list_nolock_pop_front(stream->fifo_pending); /* get the next task */ } if( NULL == task ) { /* No tasks, we're done */ return PARSEC_HOOK_RETURN_DONE; @@ -3395,6 +3328,10 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module, PARSEC_PROFILING_EVENT_RESCHEDULED ); #endif /* defined(PARSEC_PROF_TRACE) */ + if (gpu_task != NULL && gpu_task->priority < 0) { + gpu_task->priority = (gpu_task->ec != NULL) ? gpu_task->ec->priority : 0; + } + /* Check the GPU status -- three kinds of values for rc: * - rc < 0: somebody is doing a short atomic operation while there is no manager, * so wait. 
@@ -3419,7 +3356,7 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module, } } if( 0 < rc ) { - parsec_fifo_push( &(gpu_device->pending), (parsec_list_item_t*)gpu_task ); + parsec_lifo_push( &(gpu_device->pending), (parsec_list_item_t*)gpu_task ); return PARSEC_HOOK_RETURN_ASYNC; } PARSEC_DEBUG_VERBOSE(5, parsec_gpu_output_stream, "GPU[%d:%s]: Entering GPU management", @@ -3536,16 +3473,16 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module, fetch_task_from_shared_queue: assert( NULL == gpu_task ); - if (NULL != gpu_device->super.sort_pending_list && out_task_submit == NULL && out_task_pop == NULL) { - gpu_device->super.sort_pending_list(&gpu_device->super); + { + parsec_list_item_t *chain = parsec_lifo_detach_chain(&gpu_device->pending); + if (NULL != chain) { + parsec_heap_push_chain(&gpu_device->pending_heap, chain); + } } - gpu_task = (parsec_gpu_task_t*)parsec_fifo_try_pop( &(gpu_device->pending) ); + gpu_task = (parsec_gpu_task_t*)parsec_heap_pop(&gpu_device->pending_heap); if( NULL != gpu_task ) { pop_null = 0; - /* parsec_fifo_try_pop() detaches the task but does not reset list links - * in release builds; normalize before the stream FIFO inspects them. - */ - PARSEC_LIST_ITEM_SINGLETON((parsec_list_item_t*)gpu_task); + /* parsec_heap_push_chain() singletonizes each item; the popped task is already a singleton. */ gpu_task->last_data_check_epoch = gpu_device->data_avail_epoch - 1; /* force at least one tour */ PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%d:%s]:\tGet from shared queue %s", gpu_device->super.device_index, gpu_device->super.name, parsec_device_describe_gpu_task(tmp, MAX_TASK_STRLEN, gpu_task)); diff --git a/parsec/mca/device/device_gpu.h b/parsec/mca/device/device_gpu.h index b36a40718..5b5ec18ba 100644 --- a/parsec/mca/device/device_gpu.h +++ b/parsec/mca/device/device_gpu.h @@ -3,6 +3,7 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2024-2026 NVIDIA Corporation.
All rights reserved. + * Copyright (c) 2026 Stony Brook University. All rights reserved. */ #ifndef PARSEC_DEVICE_GPU_H @@ -14,11 +15,11 @@ #include "parsec/class/list_item.h" #include "parsec/class/list.h" -#include "parsec/class/fifo.h" +#include "parsec/class/lifo.h" +#include "parsec/class/parsec_heap.h" BEGIN_C_DECLS -#define PARSEC_GPU_USE_PRIORITIES 1 #define PARSEC_GPU_MAX_STREAMS 6 #define PARSEC_MAX_EVENTS_PER_STREAM 4 #define PARSEC_GPU_MAX_WORKSPACE 2 @@ -116,6 +117,7 @@ typedef struct parsec_gpu_flow_info_s { struct parsec_gpu_task_s { parsec_list_item_t list_item; + int32_t priority; /**< device task priority, inherited from task if < 0 */ uint16_t task_type; uint16_t pushout; int32_t last_status; @@ -164,20 +166,20 @@ typedef enum parsec_device_transfer_direction_e { /** * @brief Set the device for the calling thread. - * + * * @details typically maps to cudaSetDevice or equivalent - * + * * @return PARSEC_SUCCESS or a PARSEC error */ typedef int (*parsec_device_set_device_fn_t)(struct parsec_device_gpu_module_s *gpu); /** * @brief Schedules the asynchronous copy of @p bytes bytes from @p source onto @p dest - * on the GPU stream of @p gpu_stream. @p direction must reflect the memory space of + * on the GPU stream of @p gpu_stream. @p direction must reflect the memory space of * @p source and @p dest. - * + * * @details typically maps to cudaMemcpyAsync or equivalent - * + * * @return PARSEC_SUCCESS or a PARSEC error */ typedef int (*parsec_device_memcpy_async_fn_t)(struct parsec_device_gpu_module_s *gpu, struct parsec_gpu_exec_stream_s *gpu_stream, @@ -185,20 +187,20 @@ typedef int (*parsec_device_memcpy_async_fn_t)(struct parsec_device_gpu_module_s /** * @brief Record an event on the GPU @p gpu_stream of GPU @p gpu, with index @p idx. - * + * * @details typically maps to cudaRecordEvent or equivalent. The GPU device must have allocated - * @p gpu_stream->super.max_events previously (@p 0 <= event_idx < gpu_stream->super.max_events). 
- * + * @p gpu_stream->super.max_events previously (@p 0 <= event_idx < gpu_stream->super.max_events). + * * @return PARSEC_SUCCESS or a PARSEC error */ typedef int (*parsec_device_event_record_fn_t)(struct parsec_device_gpu_module_s *gpu, struct parsec_gpu_exec_stream_s *gpu_stream, int32_t event_idx); /** * @brief Record an event on the GPU @p gpu_stream of GPU @p gpu, with index @p idx. - * + * * @details typically maps to cudaRecordEvent or equivalent. The GPU device must have allocated - * @p gpu_stream->super.max_events previously (@p 0 <= event_idx < gpu_stream->super.max_events). - * + * @p gpu_stream->super.max_events previously (@p 0 <= event_idx < gpu_stream->super.max_events). + * * @return 0 if the event recorded at @p event_idx in @p gpu_stream is not ready yet * 1 if the event recorded at @p event_idx in @p gpu_stream is ready/completed * a negative value which is a PARSEC error otherwise @@ -209,34 +211,34 @@ typedef int (*parsec_device_event_query_fn_t)(struct parsec_device_gpu_module_s * @brief Computes how much memory is available on the GPU. Returns two values: * @p free_mem is the amount of memory available for this process * @p total_mem is the amount of memory on the device (including memory allocated by other processes) - * - * @details typically maps to cudaMemGetInfo or equivalent. - * + * + * @details typically maps to cudaMemGetInfo or equivalent. + * * @return PARSEC_SUCCESS if successful, a PARSEC error otherwise (in which case the parameters are undefined) */ typedef int (*parsec_device_memory_info_fn_t)(struct parsec_device_gpu_module_s *gpu, size_t *free_mem, size_t *total_mem); /** * @brief Allocates @p bytes bytes on GPU @p gpu, and returns the address of the allocated memory in @p addr. - * - * @details typically maps to cudaMalloc or equivalent. - * + * + * @details typically maps to cudaMalloc or equivalent. 
+ * * @return PARSEC_SUCCESS if successful, a PARSEC error otherwise (in which case @p addr is undefined) */ typedef int (*parsec_device_memory_allocate_fn_t)(struct parsec_device_gpu_module_s *gpu, size_t bytes, void **addr); /** * @brief Frees memory @p addr allocated by @fn parsec_device_memory_allocate_fn_t on the same GPU @p gpu. - * - * @details typically maps to cudaFree or equivalent. - * + * + * @details typically maps to cudaFree or equivalent. + * * @return PARSEC_SUCCESS if successful, a PARSEC error otherwise */ typedef int (*parsec_device_memory_free_fn_t)(struct parsec_device_gpu_module_s *gpu, void *addr); /** * @brief Find a function incarnation for the given function name - * + * * @param gpu_device the target GPU * @param fname the function name to look for * @return address of the symbol that implements this function @@ -269,9 +271,9 @@ struct parsec_device_gpu_module_s { */ parsec_list_t gpu_mem_lru; /* Read-only blocks, and fresh blocks */ parsec_list_t gpu_mem_owned_lru; /* Dirty blocks */ - parsec_fifo_t pending; + parsec_lifo_t pending; /**< lock-free LIFO: CPU threads push here */ + parsec_heap_t pending_heap; /**< manager-private max-heap for priority ordering */ struct zone_malloc_s *memory; - parsec_list_item_t *sort_starting_p; parsec_gpu_exec_stream_t **exec_stream; size_t mem_block_size; int64_t mem_nb_blocks; @@ -332,8 +334,6 @@ int parsec_device_push_workspace(parsec_device_gpu_module_t* gpu_device, parsec_ void* parsec_device_pop_workspace(parsec_device_gpu_module_t* gpu_device, parsec_gpu_exec_stream_t* gpu_stream, size_t size); int parsec_device_free_workspace(parsec_device_gpu_module_t * gpu_device); -/* sort pending task list by number of spaces needed */ -int parsec_device_sort_pending_list(parsec_device_module_t *gpu_device); parsec_gpu_task_t* parsec_gpu_create_w2r_task(parsec_device_gpu_module_t *gpu_device, parsec_execution_stream_t *es); int parsec_gpu_complete_w2r_task(parsec_device_gpu_module_t *gpu_device, 
parsec_gpu_task_t *w2r_task, parsec_execution_stream_t *es); diff --git a/parsec/mca/device/level_zero/device_level_zero_component.c b/parsec/mca/device/level_zero/device_level_zero_component.c index fec2bff88..c281360f9 100644 --- a/parsec/mca/device/level_zero/device_level_zero_component.c +++ b/parsec/mca/device/level_zero/device_level_zero_component.c @@ -3,6 +3,7 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2026 Stony Brook University. All rights reserved. */ #include "parsec.h" @@ -40,7 +41,6 @@ int parsec_level_zero_memory_block_size, parsec_level_zero_memory_percentage, pa char* parsec_level_zero_lib_path = NULL; static int level_zero_mask, level_zero_nvlink_mask; -static int parsec_level_zero_sort_pending; #if defined(PARSEC_PROF_TRACE) int parsec_device_level_zero_one_profiling_stream_per_gpu_stream = 0; @@ -167,9 +167,6 @@ static int device_level_zero_component_query(mca_base_module_t **module, int *pr } driver->ref_count++; parsec_device_level_zero_component.modules[j]->component = &parsec_device_level_zero_component; - if(parsec_level_zero_sort_pending) { - parsec_device_level_zero_component.modules[j]->sort_pending_list = parsec_device_sort_pending_list; - } j++; /* next available spot */ parsec_device_level_zero_component.modules[j] = NULL; i++; @@ -275,9 +272,6 @@ static int device_level_zero_component_register(void) (void)parsec_mca_param_reg_int_name("device_level_zero", "max_streams", "Maximum number of Streams to use for the GPU engine; 2 streams are used for communication between host and device, so the minimum is 3", false, false, PARSEC_GPU_MAX_STREAMS, &parsec_level_zero_max_streams); - (void)parsec_mca_param_reg_int_name("device_level_zero", "sort_pending_tasks", - "Boolean to let the GPU engine sort the first pending tasks stored in the list", - false, false, 0, &parsec_level_zero_sort_pending); #if defined(PARSEC_PROF_TRACE) 
(void)parsec_mca_param_reg_int_name("device_level_zero", "one_profiling_stream_per_level_zero_stream", "Boolean to separate the profiling of each level_zero stream into a single profiling stream", diff --git a/parsec/mca/device/level_zero/device_level_zero_module.c b/parsec/mca/device/level_zero/device_level_zero_module.c index 4b2ef0799..a88bb314a 100644 --- a/parsec/mca/device/level_zero/device_level_zero_module.c +++ b/parsec/mca/device/level_zero/device_level_zero_module.c @@ -2,6 +2,7 @@ * Copyright (c) 2023-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 Stony Brook University. All rights reserved. */ #include "parsec/parsec_config.h" @@ -22,7 +23,9 @@ #include "parsec/utils/debug.h" #include "parsec/utils/argv.h" #include "parsec/utils/zone_malloc.h" -#include "parsec/class/fifo.h" +#include "parsec/class/lifo.h" +#include "parsec/class/parsec_heap.h" +#include #include "parsec/mca/device/level_zero/device_level_zero_dpcpp.h" #include @@ -413,9 +416,9 @@ int parsec_level_zero_module_init( int dev_id, parsec_device_level_zero_driver_t /* Initialize internal lists */ PARSEC_OBJ_CONSTRUCT(&gpu_device->gpu_mem_lru, parsec_list_t); PARSEC_OBJ_CONSTRUCT(&gpu_device->gpu_mem_owned_lru, parsec_list_t); - PARSEC_OBJ_CONSTRUCT(&gpu_device->pending, parsec_fifo_t); + PARSEC_OBJ_CONSTRUCT(&gpu_device->pending, parsec_lifo_t); + parsec_heap_init(&gpu_device->pending_heap, offsetof(parsec_gpu_task_t, priority)); - gpu_device->sort_starting_p = NULL; gpu_device->peer_access_mask = 0; /* No GPU to GPU direct transfer by default */ device->memory_register = NULL; // TODO there seem to be no memory pinning in level_zero? 
@@ -501,8 +504,9 @@ parsec_level_zero_module_fini(parsec_device_module_t* device) /* Release the registered memory */ parsec_device_memory_release(gpu_device); - /* Release pending queue */ + /* Release pending queue and heap */ PARSEC_OBJ_DESTRUCT(&gpu_device->pending); + parsec_heap_fini(&gpu_device->pending_heap); /* Release all streams */ for( j = 0; j < gpu_device->num_exec_streams; j++ ) { diff --git a/parsec/mca/sched/ltq/sched_ltq_module.c b/parsec/mca/sched/ltq/sched_ltq_module.c index 48948ac53..e502b3777 100644 --- a/parsec/mca/sched/ltq/sched_ltq_module.c +++ b/parsec/mca/sched/ltq/sched_ltq_module.c @@ -25,7 +25,7 @@ #include "parsec/parsec_hwloc.h" #include "parsec/papi_sde.h" -#define parsec_heap_priority_comparator (offsetof(parsec_heap_t, priority)) +#define parsec_heap_priority_comparator (offsetof(parsec_task_heap_t, priority)) /** * Module functions @@ -163,8 +163,8 @@ static parsec_task_t* sched_ltq_select(parsec_execution_stream_t *es, int32_t* distance) { - parsec_heap_t* heap = NULL; - parsec_heap_t* new_heap = NULL; + parsec_task_heap_t* heap = NULL; + parsec_task_heap_t* new_heap = NULL; parsec_task_t * task = NULL; int i = 0; /* @@ -173,7 +173,7 @@ sched_ltq_select(parsec_execution_stream_t *es, and choose a tree that has the highest value then take that task from that tree. 
*/ - heap = (parsec_heap_t*)parsec_hbbuffer_pop_best(PARSEC_MCA_SCHED_LOCAL_QUEUES_OBJECT(es)->task_queue, + heap = (parsec_task_heap_t*)parsec_hbbuffer_pop_best(PARSEC_MCA_SCHED_LOCAL_QUEUES_OBJECT(es)->task_queue, parsec_heap_priority_comparator); task = heap_remove(&heap); if( NULL != heap ) { @@ -187,7 +187,7 @@ sched_ltq_select(parsec_execution_stream_t *es, // if we failed to find one in our queue for(i = 1; i < PARSEC_MCA_SCHED_LOCAL_QUEUES_OBJECT(es)->nb_hierarch_queues; i++ ) { - heap = (parsec_heap_t*)parsec_hbbuffer_pop_best(PARSEC_MCA_SCHED_LOCAL_QUEUES_OBJECT(es)->hierarch_queues[i], parsec_heap_priority_comparator); + heap = (parsec_task_heap_t*)parsec_hbbuffer_pop_best(PARSEC_MCA_SCHED_LOCAL_QUEUES_OBJECT(es)->hierarch_queues[i], parsec_heap_priority_comparator); task = heap_split_and_steal(&heap, &new_heap); if( NULL != heap ) { if (NULL != new_heap) { @@ -215,7 +215,7 @@ sched_ltq_select(parsec_execution_stream_t *es, } // if nothing yet, then go to system queue - heap = (parsec_heap_t *)parsec_dequeue_pop_front(PARSEC_MCA_SCHED_LOCAL_QUEUES_OBJECT(es)->system_queue); + heap = (parsec_task_heap_t *)parsec_dequeue_pop_front(PARSEC_MCA_SCHED_LOCAL_QUEUES_OBJECT(es)->system_queue); task = heap_split_and_steal(&heap, &new_heap); #if defined(PARSEC_PAPI_SDE) if( NULL != task ) { @@ -224,7 +224,7 @@ sched_ltq_select(parsec_execution_stream_t *es, #endif if (heap != NULL) { #if defined(PARSEC_PAPI_SDE) - PARSEC_MCA_SCHED_LOCAL_QUEUES_OBJECT(es)->local_system_queue_balance-= heap->size; + PARSEC_MCA_SCHED_LOCAL_QUEUES_OBJECT(es)->local_system_queue_balance -= (int32_t)heap->heap.size; #endif parsec_hbbuffer_push_all(PARSEC_MCA_SCHED_LOCAL_QUEUES_OBJECT(es)->task_queue, (parsec_list_item_t*)heap, 0); @@ -239,8 +239,8 @@ static int sched_ltq_schedule(parsec_execution_stream_t* es, { parsec_task_t * cur = new_context; parsec_task_t * next; - parsec_heap_t* heap = heap_create(); - parsec_heap_t* first_h = heap; + parsec_task_heap_t* heap = heap_create(); + 
parsec_task_heap_t* first_h = heap; int matches = 0; int i, j; @@ -272,7 +272,7 @@ static int sched_ltq_schedule(parsec_execution_stream_t* es, if (!matches) { // make new heap - parsec_heap_t * new_heap = heap_create(); + parsec_task_heap_t * new_heap = heap_create(); heap->list_item.list_next->list_prev = (parsec_list_item_t*)new_heap; new_heap->list_item.list_prev = (parsec_list_item_t*)heap; new_heap->list_item.list_next = (parsec_list_item_t*)heap->list_item.list_next; From ef37d593db19f4a16c4fb694323045ee0cc90d2e Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 8 Jun 2026 14:17:59 -0400 Subject: [PATCH 2/2] Remove unused variables Spotted in CI. Likely introduced and not removed. Signed-off-by: Joseph Schuchart --- parsec/mca/device/device_gpu.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index ff653c3cb..3dd873acd 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -3312,7 +3312,7 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module, { parsec_device_gpu_module_t* gpu_device = (parsec_device_gpu_module_t *)module; int rc, exec_stream = 0; - parsec_gpu_task_t *progress_task, *out_task_submit = NULL, *out_task_pop = NULL; + parsec_gpu_task_t *progress_task = NULL; parsec_gpu_task_t *gpu_task = (parsec_gpu_task_t*)_gpu_task; #if defined(PARSEC_DEBUG_NOISIER) char tmp[MAX_TASK_STRLEN]; @@ -3439,7 +3439,6 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module, goto remove_gpu_task; } gpu_task = progress_task; - out_task_submit = progress_task; get_data_out_of_device: if( (NULL != gpu_task) && (PARSEC_GPU_TASK_TYPE_KERNEL == gpu_task->task_type) ) { @@ -3469,7 +3468,6 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module, goto complete_task; } gpu_task = progress_task; - out_task_pop = progress_task; fetch_task_from_shared_queue: assert( NULL == gpu_task );