diff --git a/parsec/CMakeLists.txt b/parsec/CMakeLists.txt index df3dfe05c..5a51b2344 100644 --- a/parsec/CMakeLists.txt +++ b/parsec/CMakeLists.txt @@ -17,6 +17,7 @@ set(BASE_SOURCES class/parsec_value_array.c class/parsec_hash_table.c class/parsec_rwlock.c + class/parsec_heap.c class/parsec_rbtree.c class/parsec_future.c class/parsec_datacopy_future.c diff --git a/parsec/class/lifo.h b/parsec/class/lifo.h index 69bacc2db..526541b89 100644 --- a/parsec/class/lifo.h +++ b/parsec/class/lifo.h @@ -2,6 +2,7 @@ * Copyright (c) 2009-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 Stony Brook University. All rights reserved. */ #ifndef LIFO_H_HAS_BEEN_INCLUDED @@ -232,6 +233,20 @@ LIFO_STATIC_INLINE int parsec_lifo_nolock_is_empty( parsec_lifo_t* lifo ) { return (NULL == lifo->lifo_head.data.item); } +/* Detach all elements in the chain */ +#if defined(PARSEC_DEBUG_PARANOID) +#define PARSEC_CHAIN_DETACH(item) { \ + parsec_list_item_t *_item = (item); \ + while (_item != NULL) { \ + parsec_list_item_t *next = (parsec_list_item_t *) _item->list_next; \ + PARSEC_ITEM_DETACH(_item); \ + _item = next; \ + } \ +} +#else +#define PARSEC_CHAIN_DETACH(item) do { (void)(item); } while(0) +#endif + #if defined(PARSEC_ATOMIC_HAS_ATOMIC_CAS_INT128) /* Add one element to the FIFO. Returns true if successful, false otherwise. 
*/ @@ -356,6 +371,29 @@ LIFO_STATIC_INLINE parsec_list_item_t* parsec_lifo_try_pop( parsec_lifo_t* lifo return NULL; } +LIFO_STATIC_INLINE parsec_list_item_t * +parsec_lifo_detach_chain(parsec_lifo_t *lifo) +{ + parsec_counted_pointer_t old_head; + do { + old_head.data.guard.counter = lifo->lifo_head.data.guard.counter; + parsec_atomic_rmb(); + old_head.data.item = lifo->lifo_head.data.item; + if (NULL == old_head.data.item) return NULL; + } while (!parsec_update_counted_pointer(&lifo->lifo_head, old_head, NULL)); + parsec_atomic_wmb(); + parsec_list_item_t *item = old_head.data.item; +#if defined(PARSEC_DEBUG_PARANOID) + while (item != NULL) { + parsec_list_item_t *next = (parsec_list_item_t *) item->list_next; + PARSEC_ITEM_DETACH(item); + item = next; + } +#endif + PARSEC_CHAIN_DETACH(item); + return item; +} + #elif defined(PARSEC_ATOMIC_HAS_ATOMIC_LLSC_PTR) LIFO_STATIC_INLINE void _parsec_lifo_release_cpu (void) @@ -468,6 +506,23 @@ LIFO_STATIC_INLINE parsec_list_item_t* parsec_lifo_try_pop( parsec_lifo_t* lifo return item; } +LIFO_STATIC_INLINE parsec_list_item_t * +parsec_lifo_detach_chain(parsec_lifo_t *lifo) +{ + parsec_list_item_t *item; + int attempt = 0; + do { + if (++attempt == 5) { + _parsec_lifo_release_cpu(); + attempt = 0; + } + item = (parsec_list_item_t *)parsec_atomic_ll_ptr((long *)&lifo->lifo_head.data.item); + if (NULL == item) return NULL; + } while (!parsec_atomic_sc_ptr((long *)&lifo->lifo_head.data.item, (intptr_t)NULL)); + parsec_atomic_wmb(); + PARSEC_CHAIN_DETACH(item); + return item; +} #else /* defined(PARSEC_ATOMIC_HAS_ATOMIC_CAS_INT128) || defined(PARSEC_ATOMIC_HAS_ATOMIC_LLSC_PTR) */ @@ -547,6 +602,18 @@ LIFO_STATIC_INLINE parsec_list_item_t *parsec_lifo_try_pop(parsec_lifo_t* lifo) return item; } +LIFO_STATIC_INLINE parsec_list_item_t * +parsec_lifo_detach_chain(parsec_lifo_t *lifo) +{ + parsec_list_item_t *item; + parsec_atomic_lock(&lifo->lifo_head.data.guard.lock); + item = lifo->lifo_head.data.item; + 
lifo->lifo_head.data.item = NULL; + parsec_atomic_unlock(&lifo->lifo_head.data.guard.lock); + PARSEC_CHAIN_DETACH(item); + return item; +} + #endif /* defined(PARSEC_ATOMIC_HAS_ATOMIC_CAS_INT128) || defined(PARSEC_ATOMIC_HAS_ATOMIC_LLSC_PTR) */ LIFO_STATIC_INLINE void parsec_lifo_nolock_push( parsec_lifo_t* lifo, @@ -583,6 +650,15 @@ LIFO_STATIC_INLINE parsec_list_item_t* parsec_lifo_nolock_pop( parsec_lifo_t* li return item; } +LIFO_STATIC_INLINE parsec_list_item_t * +parsec_lifo_nolock_detach_chain(parsec_lifo_t *lifo) +{ + parsec_list_item_t *item = lifo->lifo_head.data.item; + lifo->lifo_head.data.item = NULL; + PARSEC_CHAIN_DETACH(item); + return item; +} + /** * @brief Allocate a lifo item. * diff --git a/parsec/class/parsec_heap.c b/parsec/class/parsec_heap.c new file mode 100644 index 000000000..bd92aa799 --- /dev/null +++ b/parsec/class/parsec_heap.c @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2026 Stony Brook University. All rights reserved. + */ + +#include "parsec/parsec_config.h" +#include "parsec/class/parsec_heap.h" +#include "parsec/constants.h" + +#include + +/* COMPARISON_VAL(ptr, offset) — reads *(int*) at byte offset inside ptr. + * Defined in parsec_config_bottom.h, included transitively via parsec_config.h. */ + +/* left child = list_prev + * right child = list_next + * Same convention as parsec/maxheap.c and parsec_rbtree.c. */ +#define HLEFT(item) ((parsec_list_item_t *)(item)->list_prev) +#define HRIGHT(item) ((parsec_list_item_t *)(item)->list_next) +#define HSET_LEFT(it, v) ((it)->list_prev = (volatile struct parsec_list_item_s *)(v)) +#define HSET_RIGHT(it, v) ((it)->list_next = (volatile struct parsec_list_item_s *)(v)) + +static inline int heap_cmp(const parsec_heap_t *h, + const parsec_list_item_t *a, + const parsec_list_item_t *b) +{ + int va = COMPARISON_VAL(a, h->comp_offset); + int vb = COMPARISON_VAL(b, h->comp_offset); + return (va > vb) - (va < vb); +} + +/* Maximum depth of the path array. 64 supports heaps of up to 2^64 elements. 
*/ +#define HEAP_MAX_DEPTH 64 + +int parsec_heap_push(parsec_heap_t *heap, parsec_list_item_t *item) +{ + HSET_LEFT(item, NULL); + HSET_RIGHT(item, NULL); + heap->size++; + + if (heap->size == 1) { + heap->top = item; + return PARSEC_SUCCESS; + } + + /* Find the insertion point by following the bit path of 'size'. + * After the leading 1-bit, each subsequent bit chooses right (1) or left (0). + * Save ancestors for the sift-up pass. + * (Same bit-navigation used by heap_insert in parsec/maxheap.c.) */ + parsec_list_item_t *path[HEAP_MAX_DEPTH]; + int depth = 0; + + size_t size = heap->size; + size_t bitmask = 1; + while (bitmask <= size) bitmask <<= 1; + bitmask >>= 2; /* position at bit just below the leading 1 */ + + parsec_list_item_t *node = heap->top; + path[depth++] = node; + while (bitmask > 1) { + node = (bitmask & size) ? HRIGHT(node) : HLEFT(node); + path[depth++] = node; + bitmask >>= 1; + } + /* Attach item as left (0) or right (1) child */ + if (bitmask & size) HSET_RIGHT(node, item); + else HSET_LEFT(node, item); + + /* Sift up: walk from immediate parent (path[depth-1]) toward root */ + int level = depth - 1; + while (level >= 0) { + parsec_list_item_t *parent = path[level]; + if (heap_cmp(heap, item, parent) <= 0) break; + + /* Fix grandparent to point to item instead of parent */ + if (level > 0) { + parsec_list_item_t *gp = path[level - 1]; + if (HLEFT(gp) == parent) HSET_LEFT(gp, item); + else HSET_RIGHT(gp, item); + } else { + heap->top = item; + } + + /* Swap item and parent: item takes parent's position, parent takes item's */ + parsec_list_item_t *pl = HLEFT(parent); + parsec_list_item_t *pr = HRIGHT(parent); + HSET_LEFT(parent, HLEFT(item)); + HSET_RIGHT(parent, HRIGHT(item)); + if (pl == item) { + HSET_LEFT(item, parent); + HSET_RIGHT(item, pr); + } else { + HSET_LEFT(item, pl); + HSET_RIGHT(item, parent); + } + level--; + } + return PARSEC_SUCCESS; +} + +parsec_list_item_t *parsec_heap_pop(parsec_heap_t *heap) +{ + if (0 == heap->size) 
return NULL; + + parsec_list_item_t *root = heap->top; + + if (heap->size == 1) { + heap->top = NULL; + heap->size = 0; + PARSEC_LIST_ITEM_SINGLETON(root); + return root; + } + + /* Navigate to the parent of the 'last' node (rightmost node in the + * bottom level), then detach it. Track which side it was on so we + * can correctly wire it into root's position even after clearing the + * pointer. */ + size_t size = heap->size; + size_t bitmask = 1; + while (bitmask <= size) bitmask <<= 1; + bitmask >>= 2; + + parsec_list_item_t *parent = heap->top; + while (bitmask > 1) { + parent = (bitmask & size) ? HRIGHT(parent) : HLEFT(parent); + bitmask >>= 1; + } + + parsec_list_item_t *last; + int last_was_right = (int)(bitmask & size); + if (last_was_right) { + last = HRIGHT(parent); + HSET_RIGHT(parent, NULL); + } else { + last = HLEFT(parent); + HSET_LEFT(parent, NULL); + } + assert(last != NULL); + + /* Wire 'last' into root's place, inheriting root's children. */ + if (parent != root) { + HSET_LEFT(last, HLEFT(root)); + HSET_RIGHT(last, HRIGHT(root)); + } else { + /* last was a direct child of root; one pointer was already cleared above */ + if (last_was_right) { + HSET_LEFT(last, HLEFT(root)); /* root's left is intact */ + HSET_RIGHT(last, NULL); /* last is a leaf */ + } else { + HSET_LEFT(last, NULL); /* last is a leaf */ + HSET_RIGHT(last, HRIGHT(root)); /* root's right is intact */ + } + } + heap->top = last; + heap->size--; + + /* Sift down: swap last with the larger child until heap order is restored. + * Track parent and which side we came from (no extra allocation needed). 
*/ + parsec_list_item_t *bubbler = last; + parsec_list_item_t *par = NULL; + int from_right = 0; + while (1) { + parsec_list_item_t *left = HLEFT(bubbler); + parsec_list_item_t *right = HRIGHT(bubbler); + int go_left = (left && heap_cmp(heap, left, bubbler) > 0 && + (!right || heap_cmp(heap, left, right) >= 0)); + int go_right = (!go_left && right && heap_cmp(heap, right, bubbler) > 0); + if (!go_left && !go_right) break; + + parsec_list_item_t *swap = go_left ? left : right; + if (par) { + if (from_right) HSET_RIGHT(par, swap); + else HSET_LEFT(par, swap); + } else { + heap->top = swap; + } + HSET_LEFT(bubbler, HLEFT(swap)); + HSET_RIGHT(bubbler, HRIGHT(swap)); + if (go_left) { + HSET_LEFT(swap, bubbler); + HSET_RIGHT(swap, right); + from_right = 0; + } else { + HSET_LEFT(swap, left); + HSET_RIGHT(swap, bubbler); + from_right = 1; + } + par = swap; + } + + PARSEC_LIST_ITEM_SINGLETON(root); + return root; +} + +int parsec_heap_push_chain(parsec_heap_t *heap, parsec_list_item_t *chain) +{ + parsec_list_item_t *item = chain; + do { + /* Capture list_next before parsec_heap_push repurposes it as right-child */ + parsec_list_item_t *next = (parsec_list_item_t *)item->list_next; + parsec_heap_push(heap, item); + item = next; + } while (item != chain && item != NULL); + return PARSEC_SUCCESS; +} diff --git a/parsec/class/parsec_heap.h b/parsec/class/parsec_heap.h new file mode 100644 index 000000000..6409fc397 --- /dev/null +++ b/parsec/class/parsec_heap.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2026 Stony Brook University. All rights reserved. + */ + +#ifndef PARSEC_HEAP_H_HAS_BEEN_INCLUDED +#define PARSEC_HEAP_H_HAS_BEEN_INCLUDED + +#include "parsec/parsec_config.h" +#include "parsec/class/list_item.h" +#include "parsec/constants.h" + +BEGIN_C_DECLS + +/** + * @brief Intrusive pointer-based max-heap with an int32_t priority key. 
+ * + * @details Elements are linked directly through their parsec_list_item_t + * list_prev (left child) and list_next (right child) pointers, forming a + * complete binary tree — the same technique used by parsec/maxheap.c for + * CPU scheduler task heaps. No separate backing array is allocated. + * + * The priority of each element is read as *(int32_t*)((char*)element + + * comp_offset), matching the COMPARISON_VAL macro convention used by the + * rbtree and list sort. + * + * All operations are O(log N). Sift-up uses a small on-stack path array + * (max 64 entries; supports up to 2^64 elements). + * + * An element's list_prev/list_next are used as tree child pointers while it + * is in the heap. parsec_heap_pop() restores them to singleton state before + * returning, so the caller can pass the result directly to + * parsec_list_push_back() / parsec_gpu_stream_push_pending() etc. + */ +typedef struct parsec_heap_s { + parsec_list_item_t *top; /**< root of the complete binary tree */ + size_t size; /**< current element count */ + size_t comp_offset; /**< byte offset of int32_t priority key */ +} parsec_heap_t; + +/** Initialize an empty heap with the given priority-key offset. */ +static inline void parsec_heap_init(parsec_heap_t *heap, size_t comp_offset) { + heap->top = NULL; + heap->size = 0; + heap->comp_offset = comp_offset; +} + +/** Finalize heap (no-op: no allocation to free). */ +static inline void parsec_heap_fini(parsec_heap_t *heap) { + (void)heap; +} + +/** Return non-zero if the heap is empty. */ +static inline int parsec_heap_is_empty(const parsec_heap_t *heap) { + return (heap->size == 0); +} + +/** Return the number of elements. */ +static inline size_t parsec_heap_size(const parsec_heap_t *heap) { + return heap->size; +} + +/** View the maximum element without removing it. O(1). */ +static inline parsec_list_item_t *parsec_heap_peek(const parsec_heap_t *heap) { + return heap->top; +} + +/** + * Insert one element. O(log N). 
+ * @return PARSEC_SUCCESS (cannot fail; no allocation is performed). + */ +int parsec_heap_push(parsec_heap_t *heap, parsec_list_item_t *item); + +/** + * Remove and return the maximum element, or NULL if empty. O(log N). + * The returned item's list_prev and list_next are reset to singleton state. + */ +parsec_list_item_t *parsec_heap_pop(parsec_heap_t *heap); + +/** + * Batch-insert all elements from a chain or ring. + * @return PARSEC_SUCCESS (cannot fail). + */ +int parsec_heap_push_chain(parsec_heap_t *heap, parsec_list_item_t *chain); + +END_C_DECLS + +#endif /* PARSEC_HEAP_H_HAS_BEEN_INCLUDED */ diff --git a/parsec/data.c b/parsec/data.c index 306a6c507..29770d593 100644 --- a/parsec/data.c +++ b/parsec/data.c @@ -635,7 +635,7 @@ static void parsec_arena_datatype_construct(parsec_object_t *obj) { adt->ht_item.next_item = NULL; /* keep Coverity happy */ adt->ht_item.hash64 = 0; /* keep Coverity happy */ adt->ht_item.key = 0; /* keep Coverity happy */ - adt->opaque_dtt = NULL; + adt->opaque_dtt = PARSEC_DATATYPE_NULL; } static void parsec_arena_datatype_destruct(parsec_object_t *obj) { diff --git a/parsec/hbbuffer.c b/parsec/hbbuffer.c index 9dcb30ae3..64a16a095 100644 --- a/parsec/hbbuffer.c +++ b/parsec/hbbuffer.c @@ -250,9 +250,9 @@ parsec_hbbuffer_pop_best(parsec_hbbuffer_t *b, off_t priority_offset) #if defined(PARSEC_DEBUG_NOISIER) if( best_elt != NULL ) { char tmp[MAX_TASK_STRLEN]; - if (priority_offset == offsetof(parsec_heap_t, priority)) { + if (priority_offset == offsetof(parsec_task_heap_t, priority)) { PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "HBB:\tFound best element %s in heap %p in local queue %p at position %d", - parsec_task_snprintf(tmp, MAX_TASK_STRLEN, (parsec_task_t*)((parsec_heap_t*)best_elt)->top), best_elt, + parsec_task_snprintf(tmp, MAX_TASK_STRLEN, (parsec_task_t*)((parsec_task_heap_t*)best_elt)->heap.top), best_elt, b, best_idx); } else { PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "HBB:\tFound best element %s in local 
queue %p at position %d", diff --git a/parsec/maxheap.c b/parsec/maxheap.c index bba41d9b2..8d5d4c490 100644 --- a/parsec/maxheap.c +++ b/parsec/maxheap.c @@ -3,6 +3,7 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2026 Stony Brook University. All rights reserved. */ #include "parsec/parsec_config.h" @@ -13,8 +14,14 @@ #include "parsec/maxheap.h" #include +#include -static inline int hiBit(unsigned int n) +/* list_prev = left child, list_next = right child (same as parsec_heap.c) */ +#define HLEFT(item) ((parsec_list_item_t *)(item)->list_prev) +#define HRIGHT(item) ((parsec_list_item_t *)(item)->list_next) + +/* Highest set bit: used to compute sub-heap sizes in heap_split_and_steal. */ +static inline unsigned int hiBit(unsigned int n) { n |= (n >> 1); n |= (n >> 2); @@ -24,362 +31,153 @@ static inline int hiBit(unsigned int n) return n - (n >> 1); } -parsec_heap_t* heap_create(void) +parsec_task_heap_t* heap_create(void) { - parsec_heap_t* heap = calloc(1, sizeof(parsec_heap_t)); - /* Point back to the parent structure */ - heap->list_item.list_next = (parsec_list_item_t*)heap; - heap->list_item.list_prev = (parsec_list_item_t*)heap; - return heap; + parsec_task_heap_t *h = calloc(1, sizeof(parsec_task_heap_t)); + h->list_item.list_next = (parsec_list_item_t*)h; + h->list_item.list_prev = (parsec_list_item_t*)h; + h->priority = 0; + parsec_heap_init(&h->heap, offsetof(parsec_task_t, priority)); + return h; } -void heap_destroy(parsec_heap_t** heap) +void heap_destroy(parsec_task_heap_t **heap) { - assert((*heap)->top == NULL); + assert(parsec_heap_is_empty(&(*heap)->heap)); + parsec_heap_fini(&(*heap)->heap); free(*heap); - (*heap) = NULL; + *heap = NULL; } -/* - * Insertion is O(lg n), as we know exactly how to get to the next insertion point, - * and the tree is manually balanced. - * Overall build is O(n lg n) - * - * Destroys elem->list_item next and prev. 
- */ -void heap_insert(parsec_heap_t * heap, parsec_task_t * elem) +void heap_insert(parsec_task_heap_t *heap, parsec_task_t *elem) { assert(heap != NULL); assert(elem != NULL); - heap->size++; - elem->super.list_next = NULL; - elem->super.list_prev = NULL; - - if (heap->size == 1) { - heap->top = elem; - } else { - parsec_task_t * parent = heap->top; - unsigned int bitmask = 1, size = heap->size; - // prime the bitmask - int level_counter = 0, parents_size = 0; - while (bitmask <= size) { - bitmask = bitmask << 1; - level_counter++; - } - parents_size = level_counter; - - parsec_task_t ** parents = calloc(level_counter, sizeof(parsec_task_t *)); - // now the bitmask is two places farther than we want it, so back down - bitmask = bitmask >> 2; - - parents[--level_counter] = heap->top; - // now move through tree - while (bitmask > 1) { - parent = (parsec_task_t*)((bitmask & size) ? parent->super.list_next : parent->super.list_prev); - parents[--level_counter] = parent; // save parent - bitmask = bitmask >> 1; - } - if (bitmask & size) - parent->super.list_next = (parsec_list_item_t*)elem; - else - parent->super.list_prev = (parsec_list_item_t*)elem; - - // now bubble up to preserve max heap org. - while( (level_counter < parents_size) && - (parents[level_counter] != NULL) && - (elem->priority > parents[level_counter]->priority) ) { - parent = parents[level_counter]; - PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "MH:\tswapping parent %p and elem %p (priorities: %d and %d)", - parent, elem, parent->priority, elem->priority); - /* first, fix our grandparent, if necessary */ - if (level_counter + 1 < parents_size && parents[level_counter + 1] != NULL) { - parsec_task_t * grandparent = parents[level_counter + 1]; - // i.e. 
our parent has a parent - if (grandparent->super.list_prev /* left */ == (parsec_list_item_t*)parent) - grandparent->super.list_prev = (parsec_list_item_t*)elem; - else /* our grandparent's right child is our parent*/ - grandparent->super.list_next = (parsec_list_item_t*)elem; - } - - /* next, fix our parent */ - parsec_list_item_t * parent_left = (parsec_list_item_t*)parent->super.list_prev; - parsec_list_item_t * parent_right = (parsec_list_item_t*)parent->super.list_next; - parent->super.list_prev = elem->super.list_prev; - parent->super.list_next = elem->super.list_next; - - /* lastly, fix ourselves */ - if (parent_left == (parsec_list_item_t*)elem) { - /* we're our parent's left child */ - elem->super.list_prev = (parsec_list_item_t*)parent; - elem->super.list_next = (parsec_list_item_t*)parent_right; - } else { - /* we're out parent's right child */ - elem->super.list_prev = (parsec_list_item_t*)parent_left; - elem->super.list_next = (parsec_list_item_t*)parent; - } - - if (parent == heap->top) - heap->top = elem; - - level_counter++; - } - free(parents); - } - - /* set priority to top priority */ - heap->priority = heap->top->priority; + parsec_heap_push(&heap->heap, &elem->super); + heap->priority = (unsigned int)COMPARISON_VAL(heap->heap.top, heap->heap.comp_offset); #if defined(PARSEC_DEBUG_NOISIER) char tmp[MAX_TASK_STRLEN]; - PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "MH:\tInserted exec C %s (%p) into maxheap %p of size %u", - parsec_task_snprintf(tmp, MAX_TASK_STRLEN, elem), elem, heap, heap->size); + PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, + "MH:\tInserted exec C %s (%p) into maxheap %p of size %zu", + parsec_task_snprintf(tmp, MAX_TASK_STRLEN, elem), elem, + heap, heap->heap.size); #endif } -/* - * split-and-steal (remove) is O(1), although the preceding - * list search is probably O(n), technically, since eventually we - * end up with a list of n/2 trees with single nodes - * - * This function expects one valid heap (heap that has at least 
one element) - * and another pointer to a NULL heap pointer. - * If you pass a NULL heap, the function will simply return NULL. - * This function WILL destroy your heap if it empties it. - * It will also MODIFY your stack appropriately. If both of your heap pointers - * are NULL after it returns, there was only one element in the heap you passed. - * If only the new_heap pointer is NULL, then you still have one (and ONLY ONE) - * valid heap. - * If your valid heap had at least 3 nodes, then the heap will actually be split, - * a new heap pointer created and put on your stack. - * No matter what happens, an execution_context is returned unless the heap was NULL. - */ -parsec_task_t* -heap_split_and_steal(parsec_heap_t ** heap_ptr, - parsec_heap_t ** new_heap_ptr) +parsec_task_t* heap_remove(parsec_task_heap_t **heap_ptr) { - // if tree is empty, return NULL - // if tree has only one node (top), return new heap with single node - // moved into to_use slot - // if tree has left child but not right child, put left child in new tree + parsec_task_heap_t *heap = *heap_ptr; + if (NULL == heap) return NULL; - parsec_heap_t * heap = *heap_ptr; // shortcut to doing a bunch of (*heap_ptr)s - parsec_task_t * to_use = NULL; - (*new_heap_ptr) = NULL; // this should already be NULL, but if it's not, we'll fix that. 
+ parsec_list_item_t *item = parsec_heap_pop(&heap->heap); + if (NULL == item) return NULL; - if( NULL == heap ) return NULL; + parsec_task_t *task = (parsec_task_t*)item; - assert(heap->top != NULL); // this heap should have been destroyed - to_use = heap->top; // this will always be what we return, even if it's NULL, if a valid heap was passed - if( NULL == heap->top->super.list_prev ) { - /* no left child, so 'top' is the only node */ - PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "MH:\tDestroying heap %p", heap->top, heap->top->super.list_next, heap); - heap->top = NULL; + if (parsec_heap_is_empty(&heap->heap)) { heap_destroy(heap_ptr); - assert(*heap_ptr == NULL); - goto prepare_for_return; - } /* otherwise we do have left child */ - if( NULL == heap->top->super.list_next /* right */ ) { - assert(heap->size == 2); - /* but doesn't have right child, so still not splitting */ - heap->top = (parsec_task_t*)heap->top->super.list_prev; // left - assert(heap->top->super.list_next == NULL); - assert(heap->top->super.list_prev == NULL); - heap->priority = heap->top->priority; - heap->size--; // should equal 1 - /* set up doubly-linked singleton list in here, as DEFAULT scenario */ - // PETER TODO this comment needs to be better, b/c I don't understand it anymore + } else { + heap->priority = (unsigned int)COMPARISON_VAL(heap->heap.top, heap->heap.comp_offset); + /* Restore singleton list links so the wrapper can be re-inserted into a scheduler list */ heap->list_item.list_prev = (parsec_list_item_t*)*heap_ptr; heap->list_item.list_next = (parsec_list_item_t*)*heap_ptr; } - else { // heap has at least 3 nodes, so we should be actually splitting - unsigned int size = heap->size; - unsigned int highBit = hiBit(heap->size); - unsigned int twoBit = highBit >> 1; - assert(heap->size >= 3); - (*new_heap_ptr) = heap_create(); - (*new_heap_ptr)->top = (parsec_task_t*)heap->top->super.list_prev; // left - (*new_heap_ptr)->priority = (*new_heap_ptr)->top->priority; - heap->top 
= (parsec_task_t*)heap->top->super.list_next; - heap->priority = heap->top->priority; - if (twoBit & size) { // last item is on right side - heap->size = ~highBit & size; - (*new_heap_ptr)->size = size - heap->size - 1; - } - else { // last item is on left side - (*new_heap_ptr)->size = (size & ~highBit) + twoBit; - heap->size = size - (*new_heap_ptr)->size - 1; - } - /* set up doubly-linked two-element list in here, as DEFAULT scenario */ - heap->list_item.list_prev = (parsec_list_item_t*)(*new_heap_ptr); - heap->list_item.list_next = (parsec_list_item_t*)(*new_heap_ptr); - (*new_heap_ptr)->list_item.list_prev = (parsec_list_item_t*)heap; - (*new_heap_ptr)->list_item.list_next = (parsec_list_item_t*)heap; - PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "MH:\tSplit heap %p into itself and heap %p", heap, *new_heap_ptr); - } - prepare_for_return: - PARSEC_LIST_ITEM_SINGLETON(to_use); + + task->super.list_next = (parsec_list_item_t*)task; /* safety */ + task->super.list_prev = (parsec_list_item_t*)task; #if defined(PARSEC_DEBUG_NOISIER) - { + if (task != NULL) { char tmp[MAX_TASK_STRLEN]; - PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "MH:\tStole exec C %s (%p) from heap %p", - parsec_task_snprintf(tmp, MAX_TASK_STRLEN, to_use), to_use, heap); + PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, + "MH:\tStole exec C %s (%p) from heap %p", + parsec_task_snprintf(tmp, MAX_TASK_STRLEN, task), task, *heap_ptr); } #endif - return to_use; + return task; } -// cannot be made thread-safe with atomics -parsec_task_t* heap_remove(parsec_heap_t ** heap_ptr) +parsec_task_t* +heap_split_and_steal(parsec_task_heap_t **heap_ptr, + parsec_task_heap_t **new_heap_ptr) { - parsec_task_t * to_use = NULL; - parsec_heap_t * heap = *heap_ptr; + parsec_task_heap_t *heap = *heap_ptr; + *new_heap_ptr = NULL; + if (NULL == heap) return NULL; - if (heap != NULL) { - assert(heap->top != NULL); // this heap should have been destroyed - to_use = heap->top; // this will always be what we return, even 
if it's NULL, if a valid heap was passed - if (heap->top->super.list_prev == NULL) { - /* no left child, so 'top' is the only node */ - PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "MH:\tDestroying heap %p", heap->top, heap->top->super.list_next, heap); - assert(heap->size == 1); - heap->top = NULL; - heap_destroy(heap_ptr); - assert(*heap_ptr == NULL); - } - else { /* does have left child */ - if (heap->top->super.list_next /* right */ == NULL) { - assert(heap->size == 2); - /* but doesn't have right child, so still not splitting */ - heap->top = (parsec_task_t*)heap->top->super.list_prev; // left - /* set up doubly-linked singleton list in here, as DEFAULT scenario */ - heap->list_item.list_prev = (parsec_list_item_t*)*heap_ptr; - heap->list_item.list_next = (parsec_list_item_t*)*heap_ptr; - } - else { // heap has at least 3 nodes, so we do fancy removal - assert(heap->size >= 3); - /* - the strategy here is to find the 'last' node in the 'complete' heap - and swap it up to replace the top node (which is being removed), because - it is the only node that can be moved without making the heap 'incomplete'. - Once the swap is made, in order to preserve priority order, we then - 'bubble down' in the direction of the higher of any higher children. - */ - parsec_task_t * parent = heap->top; - unsigned int bitmask = 1; - unsigned int size = heap->size; - // this allows us to count the number of layers in the heap - while (bitmask <= size) - bitmask = bitmask << 1; - /* at this point, the ith bit in bitmask tells us that we have i - 1 layers... - * ...so we shift down one to get rid of the 'extra' layer, - * and another to prepare for the following logic, which only 'moves' - * through the heap until the second-to-last layer. - */ - bitmask = bitmask >> 2; - while (bitmask > 1) { - /* the "bitmask & size" operation is a simple way of moving - * through the heap one layer at a time in the direction of the - * 'last' element in the 'complete' heap. 
- */ - parent = (parsec_task_t*)( - (bitmask & size) ? parent->super.list_next : parent->super.list_prev); - bitmask = bitmask >> 1; - } + parsec_heap_t *h = &heap->heap; + assert(h->top != NULL); + + parsec_task_t *to_use = (parsec_task_t*)h->top; + + if (NULL == HLEFT(h->top)) { + /* Only root — no children */ + PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, + "MH:\tDestroying heap %p (single node)", heap); + h->top = NULL; + h->size = 0; + heap_destroy(heap_ptr); + goto prepare_for_return; + } + + if (NULL == HRIGHT(h->top)) { + /* Root has only a left child (size == 2) */ + assert(h->size == 2); + parsec_list_item_t *left = HLEFT(h->top); + h->top = left; + h->size = 1; + heap->priority = (unsigned int)COMPARISON_VAL(left, h->comp_offset); + heap->list_item.list_prev = (parsec_list_item_t*)*heap_ptr; + heap->list_item.list_next = (parsec_list_item_t*)*heap_ptr; + goto prepare_for_return; + } - if (bitmask & size) { // LAST NODE IS A 'NEXT' NODE - heap->top = (parsec_task_t*)parent->super.list_next; - // should ALWAYS be a leaf node - assert(heap->top != NULL); - assert(heap->top->super.list_next == NULL); - assert(heap->top->super.list_prev == NULL); - if (parent != to_use) { // if not a second-level-from-the-top node... 
- heap->top->super.list_next = to_use->super.list_next; - parent->super.list_next = NULL; - } - else - heap->top->super.list_next = NULL; - heap->top->super.list_prev = to_use->super.list_prev; - } - else { // LAST NODE IS A 'PREV' NODE - heap->top = (parsec_task_t*)parent->super.list_prev; - // should ALWAYS be a leaf node - assert(heap->top != NULL); - assert(heap->top->super.list_next == NULL); - assert(heap->top->super.list_prev == NULL); - /* a prev node isn't on the second level from the top - * (because otherwise size == 2), so we safely assume it has a parent - */ - heap->top->super.list_next = to_use->super.list_next; - heap->top->super.list_prev = to_use->super.list_prev; - parent->super.list_prev = NULL; - } + /* >= 3 nodes: split into left (new_heap) and right (heap) subtrees */ + { + unsigned int size = (unsigned int)h->size; + unsigned int highBit = hiBit(size); + unsigned int twoBit = highBit >> 1; - // now bubble down - parsec_task_t * bubbler = heap->top; - int is_next = -1; /* flag keeps track of whether we are 'prev' or 'next' to our current PARENT. - * the initial value doesn't matter since we're at the top and have no parent. 
*/ - parent = NULL; - while (1) { - parsec_task_t * next = (parsec_task_t*)bubbler->super.list_next; - parsec_task_t * prev = (parsec_task_t*)bubbler->super.list_prev; - // first, compare all three priorities to see which way to bubble, if any - if (prev != NULL && prev->priority > bubbler->priority && - (next == NULL || prev->priority >= next->priority)) { - // bubble toward (swap with) prev - if (parent) { - if (is_next) - parent->super.list_next = (parsec_list_item_t *)prev; - else - parent->super.list_prev = (parsec_list_item_t *)prev; - } - else - heap->top = prev; + *new_heap_ptr = heap_create(); + (*new_heap_ptr)->heap.comp_offset = h->comp_offset; - bubbler->super.list_prev = prev->super.list_prev; - bubbler->super.list_next = prev->super.list_next; - prev->super.list_prev = (parsec_list_item_t *)bubbler; - prev->super.list_next = (parsec_list_item_t *)next; + parsec_list_item_t *left_top = HLEFT(h->top); + parsec_list_item_t *right_top = HRIGHT(h->top); - is_next = 0; // b/c we will be our parent's PREV in the next round - parent = prev; - } - else if (next != NULL && next->priority > bubbler->priority && - (prev == NULL || next->priority > prev->priority)) { - // bubble toward next - if (parent) { - if (is_next) - parent->super.list_next = (parsec_list_item_t *)next; - else - parent->super.list_prev = (parsec_list_item_t *)next; - } - else - heap->top = next; + (*new_heap_ptr)->heap.top = left_top; + (*new_heap_ptr)->priority = (unsigned int)COMPARISON_VAL(left_top, h->comp_offset); - bubbler->super.list_prev = next->super.list_prev; - bubbler->super.list_next = next->super.list_next; - next->super.list_prev = (parsec_list_item_t *)prev; - next->super.list_next = (parsec_list_item_t *)bubbler; + h->top = right_top; + heap->priority = (unsigned int)COMPARISON_VAL(right_top, h->comp_offset); - is_next = 1; // b/c we will be our parent's NEXT in the next round - parent = next; - } - else // either both next and prev are NULL, or neither has a higher priority 
than bubbler - break; - } - } - heap->size--; - heap->priority = heap->top->priority; + if (twoBit & size) { /* last node is in the right subtree */ + h->size = (size_t)(~highBit & size); + (*new_heap_ptr)->heap.size = (size_t)(size - (unsigned int)h->size - 1); + } else { /* last node is in the left subtree */ + (*new_heap_ptr)->heap.size = (size_t)((size & ~highBit) + twoBit); + h->size = (size_t)(size - (unsigned int)(*new_heap_ptr)->heap.size - 1); } - to_use->super.list_next = (parsec_list_item_t*)to_use; // safety's - to_use->super.list_prev = (parsec_list_item_t*)to_use; // sake + + /* Form a two-element ring so the caller can re-singleton each side */ + heap->list_item.list_prev = (parsec_list_item_t*)(*new_heap_ptr); + heap->list_item.list_next = (parsec_list_item_t*)(*new_heap_ptr); + (*new_heap_ptr)->list_item.list_prev = (parsec_list_item_t*)heap; + (*new_heap_ptr)->list_item.list_next = (parsec_list_item_t*)heap; + PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, + "MH:\tSplit heap %p into itself and heap %p", heap, *new_heap_ptr); } + prepare_for_return: + PARSEC_LIST_ITEM_SINGLETON(to_use); + #if defined(PARSEC_DEBUG_NOISIER) - if (to_use != NULL) { + { char tmp[MAX_TASK_STRLEN]; - PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "MH:\tStole exec C %s (%p) from heap %p", parsec_task_snprintf(tmp, MAX_TASK_STRLEN, to_use), to_use, heap); + PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, + "MH:\tStole exec C %s (%p) from heap %p", + parsec_task_snprintf(tmp, MAX_TASK_STRLEN, to_use), to_use, *heap_ptr); } #endif return to_use; } - diff --git a/parsec/maxheap.h b/parsec/maxheap.h index 780b773b0..6c46911f1 100644 --- a/parsec/maxheap.h +++ b/parsec/maxheap.h @@ -2,42 +2,55 @@ * Copyright (c) 2009-2017 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 Stony Brook University. All rights reserved. 
*/ #ifndef MAXHEAP_H_HAS_BEEN_INCLUDED #define MAXHEAP_H_HAS_BEEN_INCLUDED #include "parsec/parsec_config.h" -#include "parsec/class/list_item.h" +#include "parsec/class/parsec_heap.h" +#include "parsec/runtime.h" BEGIN_C_DECLS /** - * The structure implemented here is not thread safe. All concurrent - * accesses should be protected by the upper level. + * Wrapper around parsec_heap_t that adds the list_item field (so the heap + * can be stored in scheduler lists) and an explicit 'priority' field (the + * max priority of any task in the heap, used by parsec_hbbuffer_pop_best + * to pick the best heap to steal from without traversing the tree). + * + * Not thread-safe; all concurrent accesses must be protected by the caller. */ +typedef struct parsec_task_heap_s { + parsec_list_item_t list_item; /**< for compatibility with scheduler lists */ + unsigned int priority; /**< max priority of any task in this heap */ + parsec_heap_t heap; /**< pointer-based max-heap storage */ +} parsec_task_heap_t; -/* main struct holding size info and ID */ -typedef struct parsec_heap_s { - parsec_list_item_t list_item; /* to be compatible with the lists */ - unsigned int size; - unsigned int priority; - parsec_task_t * top; -} parsec_heap_t; +/** Allocate an empty heap as a singleton list item with zero priority. */ +parsec_task_heap_t* heap_create(void); -/* - allocates an empty heap as a correctly doubly-linked singleton list - with the lowest possible priority - */ -parsec_heap_t* heap_create(void); +/** Free an empty heap. Asserts that the heap is empty. */ +void heap_destroy(parsec_task_heap_t** heap); -void heap_destroy(parsec_heap_t** heap); +/** Insert a task into the heap, updating the stored max priority. 
*/ +void heap_insert(parsec_task_heap_t *heap, parsec_task_t *elem); -void heap_insert(parsec_heap_t * heap, parsec_task_t * elem); -parsec_task_t* -heap_split_and_steal(parsec_heap_t ** heap_ptr, - parsec_heap_t ** new_heap_ptr); -parsec_task_t * heap_remove(parsec_heap_t ** heap_ptr); +/** + * Remove the maximum-priority task from the heap and, if the heap has at + * least 3 nodes, split it into two sub-heaps for work stealing. + * On return, *heap_ptr and *new_heap_ptr are the two sub-heaps (either + * may be NULL if the original heap had fewer than 3 nodes). + */ +parsec_task_t* heap_split_and_steal(parsec_task_heap_t **heap_ptr, + parsec_task_heap_t **new_heap_ptr); + +/** + * Remove the maximum-priority task from the heap. + * If the heap becomes empty it is destroyed and *heap_ptr is set to NULL. + */ +parsec_task_t* heap_remove(parsec_task_heap_t **heap_ptr); END_C_DECLS diff --git a/parsec/mca/device/cuda/device_cuda_component.c b/parsec/mca/device/cuda/device_cuda_component.c index e29f144e9..91c87accb 100644 --- a/parsec/mca/device/cuda/device_cuda_component.c +++ b/parsec/mca/device/cuda/device_cuda_component.c @@ -3,6 +3,7 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2026 Stony Brook University. All rights reserved. 
*/ #include "parsec/parsec_config.h" @@ -41,7 +42,6 @@ int parsec_cuda_memory_block_size, parsec_cuda_memory_percentage, parsec_cuda_me char* parsec_cuda_lib_path = NULL; static int cuda_mask; -static int parsec_cuda_sort_pending; #if defined(PARSEC_PROF_TRACE) int parsec_device_cuda_one_profiling_stream_per_gpu_stream = 0; @@ -114,9 +114,6 @@ static int device_cuda_component_query(mca_base_module_t **module, int *priority assert( NULL == parsec_device_cuda_component.modules[j] ); continue; } - if(parsec_cuda_sort_pending) { - parsec_device_cuda_component.modules[j]->sort_pending_list = parsec_device_sort_pending_list; - } parsec_device_cuda_component.modules[j]->component = &parsec_device_cuda_component; j++; /* next available spot */ parsec_device_cuda_component.modules[j] = NULL; @@ -164,9 +161,6 @@ static int device_cuda_component_register(void) (void)parsec_mca_param_reg_int_name("device_cuda", "max_streams", "Maximum number of Streams to use for the GPU engine; 2 streams are used for communication between host and device, so the minimum is 3", false, false, PARSEC_GPU_MAX_STREAMS, &parsec_cuda_max_streams); - (void)parsec_mca_param_reg_int_name("device_cuda", "sort_pending_tasks", - "Boolean to let the GPU engine sort the first pending tasks stored in the list", - false, false, 0, &parsec_cuda_sort_pending); #if defined(PARSEC_PROF_TRACE) (void)parsec_mca_param_reg_int_name("device_cuda", "one_profiling_stream_per_cuda_stream", "Boolean to separate the profiling of each cuda stream into a single profiling stream", diff --git a/parsec/mca/device/cuda/device_cuda_module.c b/parsec/mca/device/cuda/device_cuda_module.c index 932d968e8..fe07c19ad 100644 --- a/parsec/mca/device/cuda/device_cuda_module.c +++ b/parsec/mca/device/cuda/device_cuda_module.c @@ -3,6 +3,7 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2024-2026 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2026 Stony Brook University. All rights reserved. 
*/ #include "parsec/parsec_config.h" @@ -23,7 +24,9 @@ #include "parsec/utils/debug.h" #include "parsec/utils/argv.h" #include "parsec/utils/zone_malloc.h" -#include "parsec/class/fifo.h" +#include "parsec/class/lifo.h" +#include "parsec/class/parsec_heap.h" +#include #include #include @@ -161,7 +164,7 @@ static int parsec_cuda_all_devices_attached(parsec_device_module_t *device) for( int j = 0; NULL != (target_gpu = (parsec_device_cuda_module_t*)parsec_device_cuda_component.modules[j]); j++ ) { if( target_gpu == source_gpu ) { /* always set bit for self-access */ - source_gpu->super.peer_access_mask = (int16_t)(source_gpu->super.peer_access_mask | + source_gpu->super.peer_access_mask = (int16_t)(source_gpu->super.peer_access_mask | (int16_t)(1 << target_gpu->super.super.device_index)); continue; } @@ -173,7 +176,7 @@ static int parsec_cuda_all_devices_attached(parsec_device_module_t *device) cudastatus = cudaDeviceEnablePeerAccess( target_gpu->cuda_index, 0 ); PARSEC_CUDA_CHECK_ERROR( "(parsec_device_cuda_component_query) cuCtxEnablePeerAccess", cudastatus, {continue;} ); - source_gpu->super.peer_access_mask = (int16_t)(source_gpu->super.peer_access_mask | + source_gpu->super.peer_access_mask = (int16_t)(source_gpu->super.peer_access_mask | (int16_t)(1 << target_gpu->super.super.device_index)); } } @@ -416,7 +419,7 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module ) double fp16, fp32, fp64, tf32; struct cudaDeviceProp prop; - show_caps_index = parsec_mca_param_find("device", NULL, "show_capabilities"); + show_caps_index = parsec_mca_param_find("device", NULL, "show_capabilities"); if(0 < show_caps_index) { parsec_mca_param_lookup_int(show_caps_index, &show_caps); } @@ -510,7 +513,7 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module ) /* Each 'exec' stream gets its own profiling stream, except IN and OUT stream that share it. 
* It's good to separate the exec streams to know what was submitted to what stream * We don't have this issue for the IN and OUT streams because types of event discriminate - * what happens where, and separating them consumes memory and increases the number of + * what happens where, and separating them consumes memory and increases the number of * events that needs to be matched between streams because we cannot differentiate some * ends between IN or OUT, so they are all logged on the same stream. */ gpu_device->trackable_events = PARSEC_PROFILE_GPU_TRACK_EXEC | PARSEC_PROFILE_GPU_TRACK_DATA_OUT @@ -567,9 +570,9 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module ) /* Initialize internal lists */ PARSEC_OBJ_CONSTRUCT(&gpu_device->gpu_mem_lru, parsec_list_t); PARSEC_OBJ_CONSTRUCT(&gpu_device->gpu_mem_owned_lru, parsec_list_t); - PARSEC_OBJ_CONSTRUCT(&gpu_device->pending, parsec_fifo_t); + PARSEC_OBJ_CONSTRUCT(&gpu_device->pending, parsec_lifo_t); + parsec_heap_init(&gpu_device->pending_heap, offsetof(parsec_gpu_task_t, priority)); - gpu_device->sort_starting_p = NULL; gpu_device->peer_access_mask = 0; /* No GPU to GPU direct transfer by default */ device->memory_register = parsec_cuda_memory_register; @@ -641,7 +644,7 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module ) #if defined(PARSEC_PROF_TRACE) if( NULL != exec_stream->profiling ) { /* No function to clean the profiling stream. 
If one is introduced - * some day, remember that exec streams 0 and 1 always share the same + * some day, remember that exec streams 0 and 1 always share the same * ->profiling stream, and that all of them share the same * ->profiling stream if parsec_device_cuda_one_profiling_stream_per_cuda_stream == 0 */ } @@ -671,8 +674,9 @@ parsec_cuda_module_fini(parsec_device_module_t* device) /* Release the registered memory */ parsec_device_memory_release(gpu_device); - /* Release pending queue */ + /* Release pending queue and heap */ PARSEC_OBJ_DESTRUCT(&gpu_device->pending); + parsec_heap_fini(&gpu_device->pending_heap); /* Release all streams */ for( j = 0; j < gpu_device->num_exec_streams; j++ ) { diff --git a/parsec/mca/device/device.h b/parsec/mca/device/device.h index a08bd7659..c84cef393 100644 --- a/parsec/mca/device/device.h +++ b/parsec/mca/device/device.h @@ -3,6 +3,7 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2024-2026 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2026 Stony Brook University. All rights reserved. */ /** @addtogroup parsec_device @@ -117,12 +118,6 @@ typedef int (*parsec_device_memory_release_f)(parsec_device_module_t*); typedef int (*parsec_device_data_advise_f)(parsec_device_module_t*, parsec_data_t*, int); typedef void* (*parsec_device_find_function_f)(parsec_device_module_t*, char*); -/** - * Reorders the list of pending tasks on the current device based on the - * current heuristic implemented by the device - */ -typedef int (*parsec_device_sort_pending_list_function_f)(parsec_device_module_t*); - /** * Schedules some kernel represented by @p task on the device @p module, * from the execution stream @p es. 
@@ -155,7 +150,6 @@ struct parsec_device_module_s { parsec_device_memory_release_f memory_release; parsec_device_data_advise_f data_advise; parsec_device_find_function_f find_function; - parsec_device_sort_pending_list_function_f sort_pending_list; parsec_device_kernel_scheduler_function_t kernel_scheduler; parsec_device_all_devices_attached_f all_devices_attached; diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index 67e7ca6c3..3dd873acd 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -3,6 +3,7 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2024-2026 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2026 Stony Brook University. All rights reserved. */ #include "parsec/parsec_config.h" @@ -189,6 +190,7 @@ static void parsec_device_task_t_constructor(parsec_gpu_task_t *gpu_task) gpu_task->last_data_check_epoch = UINT64_MAX; /* force at least one validation for the task */ gpu_task->nb_flows = 0; gpu_task->flow_info = NULL; + gpu_task->priority = -1; // priority is inherited from the task /* Default release mechanism, can be replaced by the DSL */ gpu_task->release_device_task = parsec_device_release_gpu_task; } @@ -392,60 +394,6 @@ void parsec_device_enable_debug(void) } } -int parsec_device_sort_pending_list(parsec_device_module_t *device) -{ - if( !PARSEC_DEV_IS_GPU(device->type) ) - return 0; - - parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t *)device; - parsec_list_t *sort_list = gpu_device->exec_stream[0]->fifo_pending; - - if (parsec_list_is_empty(sort_list) ) { /* list is empty */ - return 0; - } - - if (gpu_device->sort_starting_p == NULL || !parsec_list_nolock_contains(sort_list, gpu_device->sort_starting_p) ) { - gpu_device->sort_starting_p = (parsec_list_item_t*)sort_list->ghost_element.list_next; - } - - /* p is head */ - parsec_list_item_t *p = gpu_device->sort_starting_p; - int i, j, NB_SORT = 10, space_q, space_min; - - 
parsec_list_item_t *q, *prev_p, *min_p; - for (i = 0; i < NB_SORT; i++) { - if ( p == &(sort_list->ghost_element) ) { - break; - } - min_p = p; /* assume the minimum one is the first one p */ - q = (parsec_list_item_t*)min_p->list_next; - space_min = parsec_device_check_space_needed(gpu_device, (parsec_gpu_task_t*)min_p); - for (j = i+1; j < NB_SORT; j++) { - if ( q == &(sort_list->ghost_element) ) { - break; - } - space_q = parsec_device_check_space_needed(gpu_device, (parsec_gpu_task_t*)q); - if ( space_min > space_q ) { - min_p = q; - space_min = space_q; - } - q = (parsec_list_item_t*)q->list_next; - - } - if (min_p != p) { /* minimum is not the first one, let's insert min_p before p */ - /* take min_p out */ - parsec_list_item_ring_chop(min_p); - PARSEC_LIST_ITEM_SINGLETON(min_p); - prev_p = (parsec_list_item_t*)p->list_prev; - - /* insert min_p after prev_p */ - parsec_list_add_after( sort_list, prev_p, min_p); - } - p = (parsec_list_item_t*)min_p->list_next; - } - - return 0; -} void* parsec_device_pop_workspace(parsec_device_gpu_module_t* gpu_device, parsec_gpu_exec_stream_t* gpu_stream, size_t size) @@ -762,7 +710,7 @@ parsec_device_data_advise(parsec_device_module_t *dev, parsec_data_t *data, int "GPU[%d:%s]: data copy %p [ref_count %d] linked to prefetch gpu task %p on GPU copy %p", gpu_device->super.device_index, gpu_device->super.name, gpu_task->ec->data[0].data_in, gpu_task->ec->data[0].data_in->super.super.obj_reference_count, gpu_task, gpu_task->ec->data[0].data_out); - parsec_fifo_push( &(gpu_device->pending), (parsec_list_item_t*)gpu_task ); + parsec_lifo_push( &(gpu_device->pending), (parsec_list_item_t*)gpu_task ); return PARSEC_SUCCESS; } break; @@ -2164,19 +2112,6 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, return 1; /* positive returns have special meaning and are used for optimizations */ } -#if PARSEC_GPU_USE_PRIORITIES - -static inline parsec_list_item_t* parsec_device_push_task_ordered( parsec_list_t* list, - 
parsec_list_item_t* elem ) -{ - parsec_list_push_sorted(list, elem, parsec_execution_context_priority_comparator); - return elem; -} -#define PARSEC_PUSH_TASK parsec_device_push_task_ordered -#else -#define PARSEC_PUSH_TASK parsec_list_push_back -#endif - static inline int parsec_gpu_task_is_singleton(parsec_gpu_task_t *task) { @@ -2199,10 +2134,10 @@ parsec_gpu_stream_push_pending(parsec_gpu_exec_stream_t *stream, * order when feeding the tasks to the next stream. */ if( !parsec_gpu_task_is_singleton(task) ) { - parsec_list_chain_back(stream->fifo_pending, &task->list_item); + parsec_list_nolock_chain_back(stream->fifo_pending, &task->list_item); return; } - PARSEC_PUSH_TASK(stream->fifo_pending, &task->list_item); + parsec_list_nolock_push_back(stream->fifo_pending, &task->list_item); } static inline int @@ -2257,7 +2192,6 @@ parsec_gpu_task_collect_batch(parsec_gpu_exec_stream_t *gpu_stream, fifo_pending = gpu_stream->fifo_pending; assert(NULL != fifo_pending); - parsec_list_lock(fifo_pending); for(item = (parsec_list_item_t *)fifo_pending->ghost_element.list_next; item != &fifo_pending->ghost_element; item = next) { @@ -2270,7 +2204,6 @@ parsec_gpu_task_collect_batch(parsec_gpu_exec_stream_t *gpu_stream, } rc = callback(candidate, batch_head, callback_data); if( rc < 0 ) { - parsec_list_unlock(fifo_pending); return rc; } if( 0 == rc ) { @@ -2279,7 +2212,6 @@ parsec_gpu_task_collect_batch(parsec_gpu_exec_stream_t *gpu_stream, nb_tasks++; } } - parsec_list_unlock(fifo_pending); return nb_tasks; } @@ -2332,6 +2264,7 @@ parsec_device_send_transfercomplete_cmd_to_device(parsec_data_copy_t *copy, gpu_task->ec = calloc(1, sizeof(parsec_task_t)); PARSEC_OBJ_CONSTRUCT(gpu_task->ec, parsec_task_t); gpu_task->ec->task_class = &parsec_device_d2d_complete_tc; + gpu_task->ec->priority = INT32_MAX; /* This task should be executed as soon as possible */ gpu_task->nb_flows = 1; gpu_task->flow_info[0].flow = &parsec_device_d2d_complete_flow; gpu_task->flow_info[0].flow_span = 
copy->original->span; @@ -2352,7 +2285,7 @@ parsec_device_send_transfercomplete_cmd_to_device(parsec_data_copy_t *copy, current_dev->device_index, current_dev->name, gpu_task->ec->data[0].data_out, gpu_task->ec->data[0].data_out->super.super.obj_reference_count, dst_dev->device_index, dst_dev->name); - parsec_fifo_push( &(((parsec_device_gpu_module_t*)dst_dev)->pending), (parsec_list_item_t*)gpu_task ); + parsec_lifo_push( &(((parsec_device_gpu_module_t*)dst_dev)->pending), (parsec_list_item_t*)gpu_task ); } static int @@ -2654,7 +2587,7 @@ parsec_device_progress_stream( parsec_device_gpu_module_t* gpu_device, grab_a_task: assert(NULL == task); if( NULL == stream->tasks[stream->start] ) { /* there is room on the stream */ - task = (parsec_gpu_task_t*)parsec_list_pop_front(stream->fifo_pending); /* get the best task */ + task = (parsec_gpu_task_t*)parsec_list_nolock_pop_front(stream->fifo_pending); /* get the next task */ } if( NULL == task ) { /* No tasks, we're done */ return PARSEC_HOOK_RETURN_DONE; @@ -3379,7 +3312,7 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module, { parsec_device_gpu_module_t* gpu_device = (parsec_device_gpu_module_t *)module; int rc, exec_stream = 0; - parsec_gpu_task_t *progress_task, *out_task_submit = NULL, *out_task_pop = NULL; + parsec_gpu_task_t *progress_task = NULL; parsec_gpu_task_t *gpu_task = (parsec_gpu_task_t*)_gpu_task; #if defined(PARSEC_DEBUG_NOISIER) char tmp[MAX_TASK_STRLEN]; @@ -3395,6 +3328,10 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module, PARSEC_PROFILING_EVENT_RESCHEDULED ); #endif /* defined(PARSEC_PROF_TRACE) */ + if (gpu_task != NULL && gpu_task->priority < 0) { + gpu_task->priority = (gpu_task->ec != NULL) ? gpu_task->ec->priority : 0; + } + /* Check the GPU status -- three kinds of values for rc: * - rc < 0: somebody is doing a short atomic operation while there is no manager, * so wait. 
@@ -3419,7 +3356,7 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module, } } if( 0 < rc ) { - parsec_fifo_push( &(gpu_device->pending), (parsec_list_item_t*)gpu_task ); + parsec_lifo_push( &(gpu_device->pending), (parsec_list_item_t*)gpu_task ); return PARSEC_HOOK_RETURN_ASYNC; } PARSEC_DEBUG_VERBOSE(5, parsec_gpu_output_stream, "GPU[%d:%s]: Entering GPU management", @@ -3502,7 +3439,6 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module, goto remove_gpu_task; } gpu_task = progress_task; - out_task_submit = progress_task; get_data_out_of_device: if( (NULL != gpu_task) && (PARSEC_GPU_TASK_TYPE_KERNEL == gpu_task->task_type) ) { @@ -3532,20 +3468,19 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module, goto complete_task; } gpu_task = progress_task; - out_task_pop = progress_task; fetch_task_from_shared_queue: assert( NULL == gpu_task ); - if (NULL != gpu_device->super.sort_pending_list && out_task_submit == NULL && out_task_pop == NULL) { - gpu_device->super.sort_pending_list(&gpu_device->super); + { + parsec_list_item_t *chain = parsec_lifo_detach_chain(&gpu_device->pending); + if (NULL != chain) { + parsec_heap_push_chain(&gpu_device->pending_heap, chain); + } } - gpu_task = (parsec_gpu_task_t*)parsec_fifo_try_pop( &(gpu_device->pending) ); + gpu_task = (parsec_gpu_task_t*)parsec_heap_pop(&gpu_device->pending_heap); if( NULL != gpu_task ) { pop_null = 0; - /* parsec_fifo_try_pop() detaches the task but does not reset list links - * in release builds; normalize before the stream FIFO inspects them. - */ - PARSEC_LIST_ITEM_SINGLETON((parsec_list_item_t*)gpu_task); + /* parsec_heap_push_chain() singletonizes each item; the popped task is already a singleton.
*/ gpu_task->last_data_check_epoch = gpu_device->data_avail_epoch - 1; /* force at least one tour */ PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%d:%s]:\tGet from shared queue %s", gpu_device->super.device_index, gpu_device->super.name, parsec_device_describe_gpu_task(tmp, MAX_TASK_STRLEN, gpu_task)); diff --git a/parsec/mca/device/device_gpu.h b/parsec/mca/device/device_gpu.h index b36a40718..5b5ec18ba 100644 --- a/parsec/mca/device/device_gpu.h +++ b/parsec/mca/device/device_gpu.h @@ -3,6 +3,7 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2024-2026 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2026 Stony Brook University. All rights reserved. */ #ifndef PARSEC_DEVICE_GPU_H @@ -14,11 +15,11 @@ #include "parsec/class/list_item.h" #include "parsec/class/list.h" -#include "parsec/class/fifo.h" +#include "parsec/class/lifo.h" +#include "parsec/class/parsec_heap.h" BEGIN_C_DECLS -#define PARSEC_GPU_USE_PRIORITIES 1 #define PARSEC_GPU_MAX_STREAMS 6 #define PARSEC_MAX_EVENTS_PER_STREAM 4 #define PARSEC_GPU_MAX_WORKSPACE 2 @@ -116,6 +117,7 @@ typedef struct parsec_gpu_flow_info_s { struct parsec_gpu_task_s { parsec_list_item_t list_item; + int32_t priority; /**< device task priority, inherited from task if < 0 */ uint16_t task_type; uint16_t pushout; int32_t last_status; @@ -164,20 +166,20 @@ typedef enum parsec_device_transfer_direction_e { /** * @brief Set the device for the calling thread. - * + * * @details typically maps to cudaSetDevice or equivalent - * + * * @return PARSEC_SUCCESS or a PARSEC error */ typedef int (*parsec_device_set_device_fn_t)(struct parsec_device_gpu_module_s *gpu); /** * @brief Schedules the asynchronous copy of @p bytes bytes from @p source onto @p dest - * on the GPU stream of @p gpu_stream. @p direction must reflect the memory space of + * on the GPU stream of @p gpu_stream. @p direction must reflect the memory space of * @p source and @p dest. 
- * + * * @details typically maps to cudaMemcpyAsync or equivalent - * + * * @return PARSEC_SUCCESS or a PARSEC error */ typedef int (*parsec_device_memcpy_async_fn_t)(struct parsec_device_gpu_module_s *gpu, struct parsec_gpu_exec_stream_s *gpu_stream, @@ -185,20 +187,20 @@ typedef int (*parsec_device_memcpy_async_fn_t)(struct parsec_device_gpu_module_s /** * @brief Record an event on the GPU @p gpu_stream of GPU @p gpu, with index @p idx. - * + * * @details typically maps to cudaRecordEvent or equivalent. The GPU device must have allocated - * @p gpu_stream->super.max_events previously (@p 0 <= event_idx < gpu_stream->super.max_events). - * + * @p gpu_stream->super.max_events previously (@p 0 <= event_idx < gpu_stream->super.max_events). + * * @return PARSEC_SUCCESS or a PARSEC error */ typedef int (*parsec_device_event_record_fn_t)(struct parsec_device_gpu_module_s *gpu, struct parsec_gpu_exec_stream_s *gpu_stream, int32_t event_idx); /** * @brief Record an event on the GPU @p gpu_stream of GPU @p gpu, with index @p idx. - * + * * @details typically maps to cudaRecordEvent or equivalent. The GPU device must have allocated - * @p gpu_stream->super.max_events previously (@p 0 <= event_idx < gpu_stream->super.max_events). - * + * @p gpu_stream->super.max_events previously (@p 0 <= event_idx < gpu_stream->super.max_events). + * * @return 0 if the event recorded at @p event_idx in @p gpu_stream is not ready yet * 1 if the event recorded at @p event_idx in @p gpu_stream is ready/completed * a negative value which is a PARSEC error otherwise @@ -209,34 +211,34 @@ typedef int (*parsec_device_event_query_fn_t)(struct parsec_device_gpu_module_s * @brief Computes how much memory is available on the GPU. Returns two values: * @p free_mem is the amount of memory available for this process * @p total_mem is the amount of memory on the device (including memory allocated by other processes) - * - * @details typically maps to cudaMemGetInfo or equivalent. 
- * + * + * @details typically maps to cudaMemGetInfo or equivalent. + * * @return PARSEC_SUCCESS if successful, a PARSEC error otherwise (in which case the parameters are undefined) */ typedef int (*parsec_device_memory_info_fn_t)(struct parsec_device_gpu_module_s *gpu, size_t *free_mem, size_t *total_mem); /** * @brief Allocates @p bytes bytes on GPU @p gpu, and returns the address of the allocated memory in @p addr. - * - * @details typically maps to cudaMalloc or equivalent. - * + * + * @details typically maps to cudaMalloc or equivalent. + * * @return PARSEC_SUCCESS if successful, a PARSEC error otherwise (in which case @p addr is undefined) */ typedef int (*parsec_device_memory_allocate_fn_t)(struct parsec_device_gpu_module_s *gpu, size_t bytes, void **addr); /** * @brief Frees memory @p addr allocated by @fn parsec_device_memory_allocate_fn_t on the same GPU @p gpu. - * - * @details typically maps to cudaFree or equivalent. - * + * + * @details typically maps to cudaFree or equivalent. 
+ * * @return PARSEC_SUCCESS if successful, a PARSEC error otherwise */ typedef int (*parsec_device_memory_free_fn_t)(struct parsec_device_gpu_module_s *gpu, void *addr); /** * @brief Find a function incarnation for the given function name - * + * * @param gpu_device the target GPU * @param fname the function name to look for * @return address of the symbol that implements this function @@ -269,9 +271,9 @@ struct parsec_device_gpu_module_s { */ parsec_list_t gpu_mem_lru; /* Read-only blocks, and fresh blocks */ parsec_list_t gpu_mem_owned_lru; /* Dirty blocks */ - parsec_fifo_t pending; + parsec_lifo_t pending; /**< lock-free LIFO: CPU threads push here */ + parsec_heap_t pending_heap; /**< manager-private max-heap for priority ordering */ struct zone_malloc_s *memory; - parsec_list_item_t *sort_starting_p; parsec_gpu_exec_stream_t **exec_stream; size_t mem_block_size; int64_t mem_nb_blocks; @@ -332,8 +334,6 @@ int parsec_device_push_workspace(parsec_device_gpu_module_t* gpu_device, parsec_ void* parsec_device_pop_workspace(parsec_device_gpu_module_t* gpu_device, parsec_gpu_exec_stream_t* gpu_stream, size_t size); int parsec_device_free_workspace(parsec_device_gpu_module_t * gpu_device); -/* sort pending task list by number of spaces needed */ -int parsec_device_sort_pending_list(parsec_device_module_t *gpu_device); parsec_gpu_task_t* parsec_gpu_create_w2r_task(parsec_device_gpu_module_t *gpu_device, parsec_execution_stream_t *es); int parsec_gpu_complete_w2r_task(parsec_device_gpu_module_t *gpu_device, parsec_gpu_task_t *w2r_task, parsec_execution_stream_t *es); diff --git a/parsec/mca/device/level_zero/device_level_zero_component.c b/parsec/mca/device/level_zero/device_level_zero_component.c index fec2bff88..c281360f9 100644 --- a/parsec/mca/device/level_zero/device_level_zero_component.c +++ b/parsec/mca/device/level_zero/device_level_zero_component.c @@ -3,6 +3,7 @@ * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2024 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2026 Stony Brook University. All rights reserved. */ #include "parsec.h" @@ -40,7 +41,6 @@ int parsec_level_zero_memory_block_size, parsec_level_zero_memory_percentage, pa char* parsec_level_zero_lib_path = NULL; static int level_zero_mask, level_zero_nvlink_mask; -static int parsec_level_zero_sort_pending; #if defined(PARSEC_PROF_TRACE) int parsec_device_level_zero_one_profiling_stream_per_gpu_stream = 0; @@ -167,9 +167,6 @@ static int device_level_zero_component_query(mca_base_module_t **module, int *pr } driver->ref_count++; parsec_device_level_zero_component.modules[j]->component = &parsec_device_level_zero_component; - if(parsec_level_zero_sort_pending) { - parsec_device_level_zero_component.modules[j]->sort_pending_list = parsec_device_sort_pending_list; - } j++; /* next available spot */ parsec_device_level_zero_component.modules[j] = NULL; i++; @@ -275,9 +272,6 @@ static int device_level_zero_component_register(void) (void)parsec_mca_param_reg_int_name("device_level_zero", "max_streams", "Maximum number of Streams to use for the GPU engine; 2 streams are used for communication between host and device, so the minimum is 3", false, false, PARSEC_GPU_MAX_STREAMS, &parsec_level_zero_max_streams); - (void)parsec_mca_param_reg_int_name("device_level_zero", "sort_pending_tasks", - "Boolean to let the GPU engine sort the first pending tasks stored in the list", - false, false, 0, &parsec_level_zero_sort_pending); #if defined(PARSEC_PROF_TRACE) (void)parsec_mca_param_reg_int_name("device_level_zero", "one_profiling_stream_per_level_zero_stream", "Boolean to separate the profiling of each level_zero stream into a single profiling stream", diff --git a/parsec/mca/device/level_zero/device_level_zero_module.c b/parsec/mca/device/level_zero/device_level_zero_module.c index 4b2ef0799..a88bb314a 100644 --- a/parsec/mca/device/level_zero/device_level_zero_module.c +++ 
b/parsec/mca/device/level_zero/device_level_zero_module.c @@ -2,6 +2,7 @@ * Copyright (c) 2023-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 Stony Brook University. All rights reserved. */ #include "parsec/parsec_config.h" @@ -22,7 +23,9 @@ #include "parsec/utils/debug.h" #include "parsec/utils/argv.h" #include "parsec/utils/zone_malloc.h" -#include "parsec/class/fifo.h" +#include "parsec/class/lifo.h" +#include "parsec/class/parsec_heap.h" +#include #include "parsec/mca/device/level_zero/device_level_zero_dpcpp.h" #include @@ -413,9 +416,9 @@ int parsec_level_zero_module_init( int dev_id, parsec_device_level_zero_driver_t /* Initialize internal lists */ PARSEC_OBJ_CONSTRUCT(&gpu_device->gpu_mem_lru, parsec_list_t); PARSEC_OBJ_CONSTRUCT(&gpu_device->gpu_mem_owned_lru, parsec_list_t); - PARSEC_OBJ_CONSTRUCT(&gpu_device->pending, parsec_fifo_t); + PARSEC_OBJ_CONSTRUCT(&gpu_device->pending, parsec_lifo_t); + parsec_heap_init(&gpu_device->pending_heap, offsetof(parsec_gpu_task_t, priority)); - gpu_device->sort_starting_p = NULL; gpu_device->peer_access_mask = 0; /* No GPU to GPU direct transfer by default */ device->memory_register = NULL; // TODO there seem to be no memory pinning in level_zero? 
@@ -501,8 +504,9 @@ parsec_level_zero_module_fini(parsec_device_module_t* device) /* Release the registered memory */ parsec_device_memory_release(gpu_device); - /* Release pending queue */ + /* Release pending queue and heap */ PARSEC_OBJ_DESTRUCT(&gpu_device->pending); + parsec_heap_fini(&gpu_device->pending_heap); /* Release all streams */ for( j = 0; j < gpu_device->num_exec_streams; j++ ) { diff --git a/parsec/mca/sched/ltq/sched_ltq_module.c b/parsec/mca/sched/ltq/sched_ltq_module.c index 48948ac53..e502b3777 100644 --- a/parsec/mca/sched/ltq/sched_ltq_module.c +++ b/parsec/mca/sched/ltq/sched_ltq_module.c @@ -25,7 +25,7 @@ #include "parsec/parsec_hwloc.h" #include "parsec/papi_sde.h" -#define parsec_heap_priority_comparator (offsetof(parsec_heap_t, priority)) +#define parsec_heap_priority_comparator (offsetof(parsec_task_heap_t, priority)) /** * Module functions @@ -163,8 +163,8 @@ static parsec_task_t* sched_ltq_select(parsec_execution_stream_t *es, int32_t* distance) { - parsec_heap_t* heap = NULL; - parsec_heap_t* new_heap = NULL; + parsec_task_heap_t* heap = NULL; + parsec_task_heap_t* new_heap = NULL; parsec_task_t * task = NULL; int i = 0; /* @@ -173,7 +173,7 @@ sched_ltq_select(parsec_execution_stream_t *es, and choose a tree that has the highest value then take that task from that tree. 
*/ - heap = (parsec_heap_t*)parsec_hbbuffer_pop_best(PARSEC_MCA_SCHED_LOCAL_QUEUES_OBJECT(es)->task_queue, + heap = (parsec_task_heap_t*)parsec_hbbuffer_pop_best(PARSEC_MCA_SCHED_LOCAL_QUEUES_OBJECT(es)->task_queue, parsec_heap_priority_comparator); task = heap_remove(&heap); if( NULL != heap ) { @@ -187,7 +187,7 @@ sched_ltq_select(parsec_execution_stream_t *es, // if we failed to find one in our queue for(i = 1; i < PARSEC_MCA_SCHED_LOCAL_QUEUES_OBJECT(es)->nb_hierarch_queues; i++ ) { - heap = (parsec_heap_t*)parsec_hbbuffer_pop_best(PARSEC_MCA_SCHED_LOCAL_QUEUES_OBJECT(es)->hierarch_queues[i], parsec_heap_priority_comparator); + heap = (parsec_task_heap_t*)parsec_hbbuffer_pop_best(PARSEC_MCA_SCHED_LOCAL_QUEUES_OBJECT(es)->hierarch_queues[i], parsec_heap_priority_comparator); task = heap_split_and_steal(&heap, &new_heap); if( NULL != heap ) { if (NULL != new_heap) { @@ -215,7 +215,7 @@ sched_ltq_select(parsec_execution_stream_t *es, } // if nothing yet, then go to system queue - heap = (parsec_heap_t *)parsec_dequeue_pop_front(PARSEC_MCA_SCHED_LOCAL_QUEUES_OBJECT(es)->system_queue); + heap = (parsec_task_heap_t *)parsec_dequeue_pop_front(PARSEC_MCA_SCHED_LOCAL_QUEUES_OBJECT(es)->system_queue); task = heap_split_and_steal(&heap, &new_heap); #if defined(PARSEC_PAPI_SDE) if( NULL != task ) { @@ -224,7 +224,7 @@ sched_ltq_select(parsec_execution_stream_t *es, #endif if (heap != NULL) { #if defined(PARSEC_PAPI_SDE) - PARSEC_MCA_SCHED_LOCAL_QUEUES_OBJECT(es)->local_system_queue_balance-= heap->size; + PARSEC_MCA_SCHED_LOCAL_QUEUES_OBJECT(es)->local_system_queue_balance -= (int32_t)heap->heap.size; #endif parsec_hbbuffer_push_all(PARSEC_MCA_SCHED_LOCAL_QUEUES_OBJECT(es)->task_queue, (parsec_list_item_t*)heap, 0); @@ -239,8 +239,8 @@ static int sched_ltq_schedule(parsec_execution_stream_t* es, { parsec_task_t * cur = new_context; parsec_task_t * next; - parsec_heap_t* heap = heap_create(); - parsec_heap_t* first_h = heap; + parsec_task_heap_t* heap = heap_create(); + 
parsec_task_heap_t* first_h = heap; int matches = 0; int i, j; @@ -272,7 +272,7 @@ static int sched_ltq_schedule(parsec_execution_stream_t* es, if (!matches) { // make new heap - parsec_heap_t * new_heap = heap_create(); + parsec_task_heap_t * new_heap = heap_create(); heap->list_item.list_next->list_prev = (parsec_list_item_t*)new_heap; new_heap->list_item.list_prev = (parsec_list_item_t*)heap; new_heap->list_item.list_next = (parsec_list_item_t*)heap->list_item.list_next;