From 3c92ede8eb78ca341ec0eb3d2d35428416aaf302 Mon Sep 17 00:00:00 2001 From: yu-pei Date: Thu, 23 Sep 2021 13:17:15 -0400 Subject: [PATCH 01/41] create bcast key task class --- CMakeLists.txt | 2 +- parsec/interfaces/superscalar/CMakeLists.txt | 3 +- parsec/interfaces/superscalar/collectives.c | 267 ++++++++++++++++++ .../interfaces/superscalar/insert_function.c | 123 +++++++- .../superscalar/insert_function_internal.h | 15 + .../superscalar/overlap_strategies.c | 19 +- parsec/remote_dep.c | 5 + parsec/remote_dep.h | 2 + parsec/remote_dep_mpi.c | 2 +- tests/interfaces/superscalar/CMakeLists.txt | 1 + 10 files changed, 424 insertions(+), 15 deletions(-) create mode 100644 parsec/interfaces/superscalar/collectives.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 0c0fb8fbe..d55940945 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -144,7 +144,7 @@ if(PARSEC_DIST_WITH_MPI AND 0) message(FATAL_ERROR "PARSEC_DIST_WITH_MPI and PARSEC_DIST_WITH_OTHER are mutually exclusive, please select only one") endif() option(PARSEC_DIST_THREAD - "Use an extra thread to progress the data movements" ON) + "Use an extra thread to progress the data movements" OFF) option(PARSEC_DIST_PRIORITIES "Favor the communications that unlock the most prioritary tasks" ON) option(PARSEC_DIST_COLLECTIVES diff --git a/parsec/interfaces/superscalar/CMakeLists.txt b/parsec/interfaces/superscalar/CMakeLists.txt index e0f55d879..3186c48e2 100644 --- a/parsec/interfaces/superscalar/CMakeLists.txt +++ b/parsec/interfaces/superscalar/CMakeLists.txt @@ -2,7 +2,8 @@ if( BUILD_PARSEC ) LIST(APPEND EXTRA_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/interfaces/superscalar/parsec_dtd_data_flush.c ${CMAKE_CURRENT_SOURCE_DIR}/interfaces/superscalar/overlap_strategies.c - ${CMAKE_CURRENT_SOURCE_DIR}/interfaces/superscalar/insert_function.c) + ${CMAKE_CURRENT_SOURCE_DIR}/interfaces/superscalar/insert_function.c + ${CMAKE_CURRENT_SOURCE_DIR}/interfaces/superscalar/collectives.c) INSTALL(FILES 
${CMAKE_CURRENT_SOURCE_DIR}/interfaces/superscalar/insert_function.h diff --git a/parsec/interfaces/superscalar/collectives.c b/parsec/interfaces/superscalar/collectives.c new file mode 100644 index 000000000..f3aa9b5f8 --- /dev/null +++ b/parsec/interfaces/superscalar/collectives.c @@ -0,0 +1,267 @@ +/** + * Copyright (c) 2013-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + */ + +#include "parsec/class/lifo.h" +#include "parsec/parsec_config.h" +#include "parsec/interfaces/superscalar/insert_function_internal.h" + +#ifdef PARSEC_DTD_DIST_COLLECTIVES + +/* static parsec_lifo_t parsec_dep_lifo; */ + +/** + * Create and return `parsec_remote_deps_t` structure associated with + * the broadcast of the a data to all the nodes set in the + * `dest_ranks` array. + **/ +parsec_remote_deps_t* parsec_dtd_create_remote_deps( + int myrank, int root, parsec_data_copy_t *data_copy, + parsec_arena_datatype_t *arenas_datatype, + int* dest_ranks, int num_dest_ranks) { + + parsec_remote_deps_t *deps = (parsec_remote_deps_t*)remote_deps_allocate(&parsec_remote_dep_context.freelist); + + assert(NULL != deps); + assert(NULL == deps->taskpool); + + deps->root = root; + deps->outgoing_mask |= (1 << 0); /* only 1 flow */ + deps->max_priority = 0; + + struct remote_dep_output_param_s* output = &deps->output[0]; + output->data.data = NULL; + output->data.arena = arenas_datatype->arena; + output->data.layout = arenas_datatype->opaque_dtt; + output->data.count = 1; + output->data.displ = 0; + output->priority = 0; + + if (myrank == root) { + // if my rank corresponds to the root for this broadcast then we + // add `data_copy` to the remote deps information + // `data.data`. Otherwise, we leave it to NULL. 
+ output->data.data = data_copy; + } + + int _array_pos, _array_mask; + uint32_t dest_rank_idx; + if(myrank == root) { + // Loop through destination ranks in `dest_rank` array + for (dest_rank_idx = 0; dest_rank_idx < (uint32_t)num_dest_ranks; ++dest_rank_idx) { + + // Get rank from `dest_rank` array + uint32_t dest_rank = dest_ranks[dest_rank_idx]; + + // Skip if we are root + if(deps->root == dest_rank) continue; + + _array_pos = dest_rank / (8 * sizeof(uint32_t)); + _array_mask = 1 << (dest_rank % (8 * sizeof(uint32_t))); + + if( !(output->rank_bits[_array_pos] & _array_mask) ) { + output->rank_bits[_array_pos] |= _array_mask; + output->deps_mask |= (1 << 0); /* not used by DTD? */ + output->count_bits++; + } + } + } else{ + _array_pos = myrank / (8 * sizeof(uint32_t)); + _array_mask = 1 << (myrank % (8 * sizeof(uint32_t))); + + if( !(output->rank_bits[_array_pos] & _array_mask) ) { + output->rank_bits[_array_pos] |= _array_mask; + output->deps_mask |= (1 << 0); /* not used by DTD? */ + output->count_bits++; + } + } + + return deps; +} + +/** + * Free remote deps if it does not involve any participants. + **/ +static +int remote_deps_free_if_empty(parsec_remote_deps_t* deps) { + + // Return 1 if the remote_deps has no participants, 0 otherwise. 
+ int ret = 0; + + struct remote_dep_output_param_s* output = &deps->output[0]; + + // TODO: loop through the whole output array are use max_priority + // instead + if (output->count_bits <= 0) { + // No participants + + deps->pending_ack = 0; + deps->incoming_mask = 0; + deps->outgoing_mask = 0; + remote_deps_free(deps); + + // Indicate that remote deps is empty + ret = 1; + } + + return ret; +} + +/* +static +int parsec_dtd_bcast_task_fn( + parsec_execution_stream_t *es, + parsec_task_t *this_task) { + (void)es; + + // parsec_dtd_task_t *this_dtd_task = (parsec_dtd_task_t *)this_task; + + // INPUT or INOUT data + void *val_in; + // Root index + int root_in; + // Task rank + int dest_rank; + + printf("[parsec_dtd_bcast_task_fn]\n"); + + return PARSEC_HOOK_RETURN_DONE; +} + +int parsec_dtd_aux_fn( + parsec_execution_stream_t *es, + parsec_task_t *this_task) { + (void)es; + // INPUT data + int *val_in; + int *val_bcast; + // Task rank + int dest_rank; + + + parsec_dtd_unpack_args(this_task, &val_bcast, &dest_rank); + fprintf(stderr, "aux_fn on rank %d value %d\n", es->virtual_process->parsec_context->my_rank, *val_bcast); + return PARSEC_HOOK_RETURN_DONE; +} + +int parsec_dtd_aux_fn2( + parsec_execution_stream_t *es, + parsec_task_t *this_task) { + (void)es; + + return PARSEC_HOOK_RETURN_DONE; +} +*/ + +/** + * Perform a broadcast for of the dtd tile `dtd_tile_root` from the + * root node associated with the rank `root` to the nodes with ranks + * set in the `dest_ranks` array. 
+ **/ +void parsec_dtd_broadcast( + parsec_taskpool_t *taskpool, int myrank, int root, + parsec_dtd_tile_t* dtd_tile_root, int arena_index, + parsec_dtd_tile_t* bcast_keys_root, int bcast_arena_index, + int* dest_ranks, int num_dest_ranks) { + + parsec_data_copy_t *parsec_data_copy; + int *data_ptr; + int key; + int bcast_id; + parsec_dtd_taskpool_t *dtd_tp = (parsec_dtd_taskpool_t *)taskpool; + + if(myrank == root) { + bcast_id = ( (1<<30) | (root << 18) | dtd_tp->bcast_id); + dtd_tp->bcast_id++; + + parsec_data_copy = bcast_keys_root->data_copy; + data_ptr = (int*)parsec_data_copy_get_ptr(parsec_data_copy); + data_ptr[0] = bcast_id; + data_ptr[100] = num_dest_ranks; + for(int i = 0; i < num_dest_ranks; i++) { + data_ptr[dest_ranks[i]+1] = dtd_tp->send_task_id[dest_ranks[i]]++; + //pack the ranks at the end of the tiles as well + data_ptr[100+i+1] = dest_ranks[i]; + } + } + // Retrieve DTD tile's data_copy + parsec_data_copy_t *data_copy = dtd_tile_root->data_copy; + parsec_data_copy_t *key_copy = bcast_keys_root->data_copy; + + // Create remote deps corresponding to the braodcast + //parsec_remote_deps_t *deps_0 = parsec_dtd_create_remote_deps( + // myrank, root, data_copy, &parsec_dtd_arenas_datatypes[arena_index], + // dest_ranks, num_dest_ranks); + parsec_remote_deps_t *deps_1 = parsec_dtd_create_remote_deps( + myrank, root, key_copy, &parsec_dtd_arenas_datatypes[bcast_arena_index], + dest_ranks, num_dest_ranks); + + //parsec_task_t *bcast_task_root = parsec_dtd_taskpool_create_task( + // taskpool, parsec_dtd_aux_fn2, 0, "bcast_task_root", + // PASSED_BY_REF, dtd_tile_root, PARSEC_INOUT | arena_index, + // sizeof(int), &root, PARSEC_VALUE | PARSEC_AFFINITY, + // PARSEC_DTD_ARG_END); + + //parsec_dtd_task_t *dtd_bcast_task_root = (parsec_dtd_task_t *)bcast_task_root; + //dtd_bcast_task_root->super.locals[2].value = -1; + //dtd_bcast_task_root->super.locals[3].value = -1; + + // Set broadcast topology info + //dtd_bcast_task_root->deps_out = NULL; + + 
//dtd_bcast_task_root->deps_out = deps_0; + + //if(myrank == root) { + // dtd_bcast_task_root->ht_item.key = bcast_id; + // dtd_bcast_task_root->super.locals[0].value = dtd_bcast_task_root->ht_item.key; + //}else{ + // bcast_id = ( (1<<28) | dtd_tp->recv_task_id[root]++); + // dtd_bcast_task_root->ht_item.key = bcast_id; + // dtd_bcast_task_root->super.locals[0].value = dtd_bcast_task_root->ht_item.key; + //} + /* Post the bcast tasks for the actual data */ + //parsec_insert_dtd_task(dtd_bcast_task_root); + + parsec_task_t *bcast_key_root = parsec_dtd_taskpool_create_task( + taskpool, parsec_dtd_bcast_task_fn, 0, "bcast_task_root", + PASSED_BY_REF, bcast_keys_root, PARSEC_INOUT | bcast_arena_index, + sizeof(int), &root, PARSEC_VALUE | PARSEC_AFFINITY, + PARSEC_DTD_ARG_END); + parsec_dtd_task_t *dtd_bcast_key_root = (parsec_dtd_task_t *)bcast_key_root; + dtd_bcast_key_root->deps_out = NULL; + dtd_bcast_key_root->deps_out = deps_1; + if(myrank == root) { + //dtd_bcast_task_root->ht_item.key = bcast_id; + //dtd_bcast_task_root->super.locals[0].value = dtd_bcast_task_root->ht_item.key; + }else{ + bcast_id = ( (1<<29) | (dtd_tp->recv_task_id[root] -1)); + dtd_bcast_key_root->ht_item.key = bcast_id; + dtd_bcast_key_root->super.locals[0].value = dtd_bcast_key_root->ht_item.key; + } + /* Post the bcast of keys and ranks array */ + parsec_insert_dtd_task(dtd_bcast_key_root); + + if(myrank == root) { + //for (int dest_rank = 0; dest_rank < num_dest_ranks; ++dest_rank) { + // parsec_task_t *retrieve_task = parsec_dtd_taskpool_create_task( + // dtd_tp, parsec_dtd_aux_fn, 0, "retrieve_task", + // PASSED_BY_REF, bcast_keys_root, PARSEC_INPUT | bcast_arena_index, + // sizeof(int), &dest_ranks[dest_rank], PARSEC_VALUE | PARSEC_AFFINITY, + // PARSEC_DTD_ARG_END); + // parsec_dtd_task_t *dtd_retrieve_task = (parsec_dtd_task_t *)retrieve_task; + // parsec_insert_dtd_task(dtd_retrieve_task); + //} + }else { + parsec_task_t *retrieve_task = parsec_dtd_taskpool_create_task( + dtd_tp, 
parsec_dtd_aux_fn, 0, "retrieve_task", + PASSED_BY_REF, bcast_keys_root, PARSEC_INPUT | bcast_arena_index, + sizeof(int), &myrank, PARSEC_VALUE | PARSEC_AFFINITY, + PARSEC_DTD_ARG_END); + parsec_dtd_task_t *dtd_retrieve_task = (parsec_dtd_task_t *)retrieve_task; + parsec_insert_dtd_task(dtd_retrieve_task); + } +} + +#endif diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index fbb59f51d..077d4ff49 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -214,6 +214,9 @@ parsec_dtd_enqueue_taskpool(parsec_taskpool_t *tp, void *data) /* The first taskclass of every taskpool is the flush taskclass */ parsec_dtd_create_task_class(dtd_tp, parsec_dtd_data_flush_sndrcv, "parsec_dtd_data_flush", 0, 0, 1); + /* The second taskclass of every taskpool is the bcast key array propagation taskclass */ + parsec_dtd_create_task_class(dtd_tp, parsec_dtd_bcast_key_fn, "parsec_dtd_bcast_key_fn", + 1, sizeof(int), 1); return 0; } @@ -1274,6 +1277,9 @@ parsec_dtd_taskpool_new(void) __tp->current_thread_id = 0; __tp->function_counter = 0; __tp->enqueue_flag = 0; + __tp->bcast_id = 0; + memset(__tp->send_task_id, 0, MAX_RANK_INFO*sizeof(int)*8*sizeof(int)); + memset(__tp->recv_task_id, 0, MAX_RANK_INFO*sizeof(int)*8*sizeof(int)); (void)parsec_taskpool_reserve_id((parsec_taskpool_t *) __tp); if( 0 < asprintf(&__tp->super.taskpool_name, "DTD Taskpool %d", @@ -1447,12 +1453,21 @@ dtd_release_dep_fct( parsec_execution_stream_t *es, struct remote_dep_output_param_s* output; int _array_pos, _array_mask; + /* On the sender side, update the key of the dep flow */ + parsec_dtd_task_t * real_parent_task = (parsec_dtd_task_t *)oldcontext; + parsec_dtd_task_t * real_child_task = (parsec_dtd_task_t *)newcontext; + #if !defined(PARSEC_DIST_COLLECTIVES) assert(src_rank == es->virtual_process->parsec_context->my_rank); #endif _array_pos = dst_rank / (8 * sizeof(uint32_t)); 
_array_mask = 1 << (dst_rank % (8 * sizeof(uint32_t))); PARSEC_ALLOCATE_REMOTE_DEPS_IF_NULL(arg->remote_deps, oldcontext, MAX_PARAM_COUNT); + if(real_parent_task->deps_out == NULL) { + arg->remote_deps->bcast_keys[dep->dep_datatype_index] = 0; + arg->remote_deps->bcast_keys[dep->dep_datatype_index] |= src_rank<<18; + arg->remote_deps->bcast_keys[dep->dep_datatype_index] |= (FLOW_OF(real_parent_task, dep->belongs_to->flow_index))->msg_keys[dst_rank]; + } output = &arg->remote_deps->output[dep->dep_datatype_index]; assert( (-1 == arg->remote_deps->root) || (arg->remote_deps->root == src_rank) ); arg->remote_deps->root = src_rank; @@ -2016,9 +2031,16 @@ parsec_dtd_create_task_class( parsec_dtd_taskpool_t *__tp, parsec_dtd_funcptr_t* tc->nb_flows = flow_count; /* set to one so that prof_grpaher prints the task id properly */ tc->nb_parameters = 1; - tc->nb_locals = 1; + tc->nb_locals = 8; params[0] = &symb_dtd_taskid; locals[0] = &symb_dtd_taskid; + locals[1] = &symb_dtd_taskid; + locals[2] = &symb_dtd_taskid; + locals[3] = &symb_dtd_taskid; + locals[4] = &symb_dtd_taskid; + locals[5] = &symb_dtd_taskid; + locals[6] = &symb_dtd_taskid; + locals[7] = &symb_dtd_taskid; tc->data_affinity = NULL; tc->initial_data = NULL; tc->final_data = (parsec_data_ref_fn_t *) NULL; @@ -2224,6 +2246,18 @@ parsec_dtd_set_descendant(parsec_dtd_task_t *parent_task, uint8_t parent_flow_in parsec_dtd_remote_task_retain( real_parent_task ); } + /* On the receiver side, based on the previous parent key, update next recv key for dep flow */ + if(real_parent_task->deps_out == NULL) { + if(real_parent_task->ht_item.key == 0xffffffff) { + real_parent_task->ht_item.key = 0; + real_parent_task->ht_item.key |= real_parent_task->rank<<18; + real_parent_task->ht_item.key |= tp->recv_task_id[real_parent_task->rank]++; + real_parent_task->super.locals[0].value = real_parent_task->ht_item.key; + } + } else { + /* parent is a collective, so ID is provided and we don't do anything here */ + } + uint64_t key 
= (((uint64_t)real_parent_task->ht_item.key)<<32) | (1U<task_hash_table, (parsec_key_t)key); parsec_remote_deps_t *dep = parsec_dtd_find_remote_dep( tp, key ); @@ -2310,11 +2344,19 @@ parsec_dtd_create_and_initialize_task( parsec_dtd_taskpool_t *dtd_tp, assert(this_task->super.super.super.obj_reference_count == 1); this_task->orig_task = NULL; + /* DTD Collective */ + this_task->deps_out = NULL; + this_task->super.taskpool = (parsec_taskpool_t*)dtd_tp; - this_task->ht_item.key = (parsec_key_t)(uintptr_t)(dtd_tp->task_id++); + /* this_task->ht_item.key = (parsec_key_t)(uintptr_t)(dtd_tp->task_id++); */ + this_task->ht_item.key = (uintptr_t)0xffffffff; + /* this is needed for grapher to work properly */ this_task->super.locals[0].value = (int)(uintptr_t)this_task->ht_item.key; - assert( (uintptr_t)this_task->super.locals[0].value == (uintptr_t)this_task->ht_item.key ); + //assert( (uintptr_t)this_task->super.locals[0].value == (uintptr_t)this_task->ht_item.key ); + for(int idx = 0; idx < 8; idx++) { + this_task->super.locals[idx].value = 0; + } this_task->super.task_class = tc; /** * +1 to make sure the task cannot be completed by the potential predecessors, @@ -2327,6 +2369,7 @@ parsec_dtd_create_and_initialize_task( parsec_dtd_taskpool_t *dtd_tp, this_task->super.priority = 0; this_task->super.chore_id = 0; this_task->super.status = PARSEC_TASK_STATUS_NONE; + memset(this_task->rank_bits, 0, MAX_RANK_INFO*sizeof(int)); int j; parsec_dtd_flow_info_t *flow; @@ -2421,6 +2464,38 @@ fake_first_out_body( parsec_execution_stream_t *es, parsec_task_t *this_task) return PARSEC_HOOK_RETURN_DONE; } +/* **************************************************************************** */ +/** + * Body of bcast key task we insert that will propagate the key array + * empty body! 
+ * + * @param context, this_task + * + * @ingroup DTD_INTERFACE_INTERNAL + */ +int +parsec_dtd_bcast_key_fn( parsec_execution_stream_t *es, parsec_task_t *this_task) +{ + (void)es; (void)this_task; + return PARSEC_HOOK_RETURN_DONE; +} + +/* **************************************************************************** */ +/** + * Body of bcast key receiver task we insert that will ensure propagation of the key array + * on the receiver side, empty body! + * + * @param context, this_task + * + * @ingroup DTD_INTERFACE_INTERNAL + */ +int +parsec_dtd_bcast_key_recv( parsec_execution_stream_t *es, parsec_task_t *this_task) +{ + (void)es; (void)this_task; + return PARSEC_HOOK_RETURN_DONE; +} + int parsec_dtd_schedule_task_if_ready(int satisfied_flow, parsec_dtd_task_t *this_task, parsec_dtd_taskpool_t *dtd_tp, int *vpid) @@ -2607,6 +2682,27 @@ parsec_insert_dtd_task(parsec_task_t *__this_task) tile_op_type, last_user.alive); } + if(last_writer.task->deps_out == NULL) { + /* local parent and we are inserting a remote task, indicates it needs to send data */ + if(parsec_dtd_task_is_local(last_writer.task) && parsec_dtd_task_is_remote(this_task)) + { + int _array_pos, _array_mask; + _array_pos = this_task->rank / (8 * sizeof(int)); + _array_mask = 1 << (this_task->rank % (8 * sizeof(int))); + if(last_writer.task->rank_bits[_array_pos] & _array_mask) + { + FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank] = last_writer.task->super.locals[5+this_task->rank%5].value; + } else + { + last_writer.task->rank_bits[_array_pos] |= _array_mask; + FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank] = dtd_tp->send_task_id[this_task->rank]++; + last_writer.task->super.locals[5+this_task->rank%5].value = FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank]; + } + } + } else { + /* do nothing */ + } + /* Are we using the same data multiple times for the same task? 
*/ if(last_user.task == this_task) { satisfied_flow += 1; @@ -2683,6 +2779,27 @@ parsec_insert_dtd_task(parsec_task_t *__this_task) } } + + if(last_writer.task->deps_out == NULL) { + /* local parent and we are inserting a remote task, indicates it needs to send data */ + if(parsec_dtd_task_is_local(last_writer.task) && parsec_dtd_task_is_remote(this_task)) + { + int _array_pos, _array_mask; + _array_pos = this_task->rank / (8 * sizeof(int)); + _array_mask = 1 << (this_task->rank % (8 * sizeof(int))); + if(last_writer.task->rank_bits[_array_pos] & _array_mask) + { + FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank] = last_writer.task->super.locals[5+this_task->rank%5].value; + } else + { + last_writer.task->rank_bits[_array_pos] |= _array_mask; + FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank] = dtd_tp->send_task_id[this_task->rank]++; + last_writer.task->super.locals[5+this_task->rank%5].value = FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank]; + } + } + } else { + /* do nothing */ + } /* we can avoid all the hash table crap if the last_writer is not alive */ if( put_in_chain ) { diff --git a/parsec/interfaces/superscalar/insert_function_internal.h b/parsec/interfaces/superscalar/insert_function_internal.h index bbf4ff1dd..823872b45 100644 --- a/parsec/interfaces/superscalar/insert_function_internal.h +++ b/parsec/interfaces/superscalar/insert_function_internal.h @@ -38,6 +38,7 @@ extern int parsec_dtd_debug_output; extern int parsec_dtd_dump_traversal_info; /**< For printing traversal info */ #define PARSEC_DTD_FLUSH_TC_ID ((uint8_t)0x00) +#define PARSEC_DTD_BCAST_KEY_TC_ID ((uint8_t)0x01) /* To flag the task we are trying to complete as a local one */ #define PARSEC_ACTION_COMPLETE_LOCAL_TASK 0x08000000 @@ -117,6 +118,7 @@ typedef struct parsec_dtd_min_flow_info_s { 3 release remote data */ parsec_dtd_tile_t *tile; + int msg_keys[MAX_RANK_INFO]; /* enable user trimming, store dest 
rank send ID for a flow */ } parsec_dtd_min_flow_info_t; typedef struct parsec_dtd_flow_info_s { @@ -131,6 +133,7 @@ typedef struct parsec_dtd_flow_info_s { 4 release ownership even when the flow is of type R */ parsec_dtd_tile_t *tile; + int msg_keys[MAX_RANK_INFO]; /* enable user trimming, store dest rank send ID for a flow */ int rank_sent_to[MAX_RANK_INFO]; /* currently support 1024 nodes */ } parsec_dtd_flow_info_t; @@ -178,8 +181,10 @@ struct parsec_dtd_task_s { parsec_thread_mempool_t *mempool_owner; int32_t rank; int32_t flow_count; + int32_t rank_bits[MAX_RANK_INFO]; /* for testing PTG inserting task in DTD */ parsec_task_t *orig_task; + parsec_remote_deps_t *deps_out; }; /* For creating objects of class parsec_dtd_task_t */ @@ -237,6 +242,9 @@ struct parsec_dtd_taskpool_s { parsec_hash_table_t *function_h_table; /* ring of initial ready tasks */ parsec_task_t **startup_list; + int bcast_id; + int send_task_id[MAX_RANK_INFO*sizeof(int)*8]; + int recv_task_id[MAX_RANK_INFO*sizeof(int)*8]; /* from here to end is for the testing interface */ struct hook_info actual_hook[PARSEC_DTD_NB_TASK_CLASSES]; }; @@ -279,6 +287,10 @@ typedef struct parsec_dtd_common_args_s { } parsec_dtd_common_args_t; /* Function prototypes */ +int parsec_dtd_bcast_key_fn( parsec_execution_stream_t *es, parsec_task_t *this_task); + +int parsec_dtd_bcast_key_recv( parsec_execution_stream_t *es, parsec_task_t *this_task); + void parsec_detach_all_dtd_taskpool_from_context( parsec_context_t *context ); @@ -420,6 +432,9 @@ parsec_dtd_tile_retain( parsec_dtd_tile_t *tile ); void parsec_dtd_tile_release( parsec_dtd_tile_t *tile ); +int +parsec_dtd_rank_of_data(parsec_dc_t *dc, int i, int j); + int parsec_dtd_data_flush_sndrcv(parsec_execution_stream_t *es, parsec_task_t *this_task); diff --git a/parsec/interfaces/superscalar/overlap_strategies.c b/parsec/interfaces/superscalar/overlap_strategies.c index d1c852797..024ca1852 100644 --- a/parsec/interfaces/superscalar/overlap_strategies.c +++ 
b/parsec/interfaces/superscalar/overlap_strategies.c @@ -153,6 +153,7 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, parsec_release_dep_fct_arg_t *arg = (parsec_release_dep_fct_arg_t *)ontask_arg; parsec_dep_data_description_t data; int rank_src = 0, rank_dst = 0, vpid_dst=0; + parsec_dtd_flow_info_t* flow; /* finding for which flow we need to iterate successors of */ int flow_mask = action_mask; @@ -310,7 +311,7 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, deps.ctl_gather_nb = NULL; deps.task_class_id = current_desc->super.task_class->task_class_id; deps.flow = current_desc->super.task_class->in[tmp_desc_flow_index]; - deps.dep_index = 0; /* it will not be used anywhere for DTD, so whatever */ + deps.dep_index = tmp_desc_flow_index; deps.belongs_to = current_task->super.task_class->out[current_dep]; deps.direct_data = NULL; deps.dep_datatype_index = current_dep; @@ -321,6 +322,14 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, &deps, &data, rank_src, rank_dst, vpid_dst, ontask_arg ); vpid_dst = (vpid_dst+1) % current_task->super.taskpool->context->nb_vp; +#if defined(DISTRIBUTED) + if( (action_mask & PARSEC_ACTION_COMPLETE_LOCAL_TASK) && (NULL != arg->remote_deps) ) { + (void)parsec_atomic_fetch_inc_int32(¤t_task->super.data[current_dep].data_out->readers); + parsec_remote_dep_activate(es, (parsec_task_t *)current_task, arg->remote_deps, arg->remote_deps->outgoing_mask); + arg->remote_deps = NULL; + } +#endif + /* releasing remote tasks that is a descendant of a local task */ if(action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { if( parsec_dtd_task_is_remote(current_desc) && parsec_dtd_task_is_local(current_task) ) { @@ -333,14 +342,6 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, } } } while (0 == get_out); - -#if defined(DISTRIBUTED) - if( (action_mask & PARSEC_ACTION_COMPLETE_LOCAL_TASK) && (NULL != arg->remote_deps) ) { - 
(void)parsec_atomic_fetch_inc_int32(¤t_task->super.data[current_dep].data_out->readers); - parsec_remote_dep_activate(es, (parsec_task_t *)current_task, arg->remote_deps, arg->remote_deps->outgoing_mask); - arg->remote_deps = NULL; - } -#endif } } } diff --git a/parsec/remote_dep.c b/parsec/remote_dep.c index ade071b03..09a3131a4 100644 --- a/parsec/remote_dep.c +++ b/parsec/remote_dep.c @@ -169,6 +169,8 @@ inline parsec_remote_deps_t* remote_deps_allocate( parsec_lifo_t* lifo ) } /* fw_mask immediatly follows outputs */ remote_deps->remote_dep_fw_mask = (uint32_t*) ptr; + remote_deps->bcast_flag = 0; /* default this dep is not for bcast */ + memset(remote_deps->bcast_keys, 0, sizeof(uint32_t)*16); assert( (int)(ptr - (char*)remote_deps) == (int)(parsec_remote_dep_context.elem_size - rank_bit_size)); } else { @@ -530,6 +532,9 @@ int parsec_remote_dep_activate(parsec_execution_stream_t* es, int remote_dep_bcast_child_permits = 0; /* Right now DTD only supports a star broadcast topology */ if( PARSEC_TASKPOOL_TYPE_DTD == task->taskpool->taskpool_type ) { + parsec_dtd_task_t *this_dtd_task = (parsec_dtd_task_t *) task; + if(this_dtd_task->deps_out == NULL) + remote_deps->msg.locals[0].value = remote_deps->bcast_keys[i]; /* p2p, update the key for this message */ remote_dep_bcast_child_permits = remote_dep_bcast_star_child(my_idx, idx); } else { remote_dep_bcast_child_permits = remote_dep_bcast_child(my_idx, idx); diff --git a/parsec/remote_dep.h b/parsec/remote_dep.h index 68398aba5..bd85e8772 100644 --- a/parsec/remote_dep.h +++ b/parsec/remote_dep.h @@ -91,6 +91,8 @@ struct parsec_remote_deps_s { int32_t priority; uint32_t *remote_dep_fw_mask; /**< list of peers already notified about * the control sequence (only used for control messages) */ + int32_t bcast_flag; + uint32_t bcast_keys[16]; struct data_repo_entry_s *repo_entry; struct remote_dep_output_param_s output[1]; }; diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index fc130e098..48af315b4 
100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -97,7 +97,7 @@ static size_t parsec_param_eager_limit = RDEP_MSG_EAGER_LIMIT; */ static size_t parsec_param_eager_limit = 0; #endif /* RDEP_MSG_EAGER_LIMIT != 0 */ -static int parsec_param_enable_aggregate = 1; +static int parsec_param_enable_aggregate = 0; #if defined(PARSEC_HAVE_MPI_OVERTAKE) static int parsec_param_enable_mpi_overtake = 1; #endif diff --git a/tests/interfaces/superscalar/CMakeLists.txt b/tests/interfaces/superscalar/CMakeLists.txt index ef91ab984..aba15aecf 100644 --- a/tests/interfaces/superscalar/CMakeLists.txt +++ b/tests/interfaces/superscalar/CMakeLists.txt @@ -20,6 +20,7 @@ parsec_addtest(C dtd_test_data_flush "dtd_test_data_flush.c;${COMMON_DATA}") parsec_addtest(C dtd_test_global_id_for_dc_assumed "dtd_test_global_id_for_dc_assumed.c;${COMMON_DATA}") parsec_addtest(C dtd_test_explicit_task_creation "dtd_test_explicit_task_creation.c;${COMMON_DATA}") parsec_addtest(C dtd_test_tp_enqueue_dequeue "dtd_test_tp_enqueue_dequeue.c") +parsec_addtest(C dtd_test_broadcast_collective "dtd_test_broadcast_collective.c") # # Shared Memory Testings From ef3117fc3be0a4910194ff63f90e388b08b4c80a Mon Sep 17 00:00:00 2001 From: yu-pei Date: Sat, 25 Sep 2021 15:04:31 -0400 Subject: [PATCH 02/41] setting release dep func for BCAST KEY class --- parsec/interfaces/superscalar/collectives.c | 8 +- .../interfaces/superscalar/insert_function.c | 133 +++++- .../interfaces/superscalar/insert_function.h | 30 ++ .../superscalar/parsec_dtd_data_flush.c | 1 + parsec/remote_dep_mpi.c | 3 +- parsec/scheduling.c | 1 + .../dtd_test_broadcast_collective.c | 419 ++++++++++++++++++ 7 files changed, 590 insertions(+), 5 deletions(-) create mode 100644 tests/interfaces/superscalar/dtd_test_broadcast_collective.c diff --git a/parsec/interfaces/superscalar/collectives.c b/parsec/interfaces/superscalar/collectives.c index f3aa9b5f8..d1d249b64 100644 --- a/parsec/interfaces/superscalar/collectives.c +++ 
b/parsec/interfaces/superscalar/collectives.c @@ -186,6 +186,7 @@ void parsec_dtd_broadcast( data_ptr[100+i+1] = dest_ranks[i]; } } + fprintf(stderr, "finished bcast key packing\n"); // Retrieve DTD tile's data_copy parsec_data_copy_t *data_copy = dtd_tile_root->data_copy; parsec_data_copy_t *key_copy = bcast_keys_root->data_copy; @@ -225,7 +226,7 @@ void parsec_dtd_broadcast( //parsec_insert_dtd_task(dtd_bcast_task_root); parsec_task_t *bcast_key_root = parsec_dtd_taskpool_create_task( - taskpool, parsec_dtd_bcast_task_fn, 0, "bcast_task_root", + taskpool, parsec_dtd_bcast_key_fn, 0, "bcast_key_fn", PASSED_BY_REF, bcast_keys_root, PARSEC_INOUT | bcast_arena_index, sizeof(int), &root, PARSEC_VALUE | PARSEC_AFFINITY, PARSEC_DTD_ARG_END); @@ -236,7 +237,8 @@ void parsec_dtd_broadcast( //dtd_bcast_task_root->ht_item.key = bcast_id; //dtd_bcast_task_root->super.locals[0].value = dtd_bcast_task_root->ht_item.key; }else{ - bcast_id = ( (1<<29) | (dtd_tp->recv_task_id[root] -1)); + //bcast_id = ( (1<<29) | (dtd_tp->recv_task_id[root] -1)); + bcast_id = ( (1<<29) | (dtd_tp->recv_task_id[root] )); dtd_bcast_key_root->ht_item.key = bcast_id; dtd_bcast_key_root->super.locals[0].value = dtd_bcast_key_root->ht_item.key; } @@ -255,7 +257,7 @@ void parsec_dtd_broadcast( //} }else { parsec_task_t *retrieve_task = parsec_dtd_taskpool_create_task( - dtd_tp, parsec_dtd_aux_fn, 0, "retrieve_task", + dtd_tp, parsec_dtd_bcast_key_recv, 0, "retrieve_task", PASSED_BY_REF, bcast_keys_root, PARSEC_INPUT | bcast_arena_index, sizeof(int), &myrank, PARSEC_VALUE | PARSEC_AFFINITY, PARSEC_DTD_ARG_END); diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index 077d4ff49..f88d0d134 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -102,6 +102,11 @@ parsec_dtd_release_deps(parsec_execution_stream_t *, parsec_task_t *, uint32_t, parsec_remote_deps_t *); +static int 
+parsec_dtd_bcast_key_release_deps(parsec_execution_stream_t *, + parsec_task_t *, + uint32_t, parsec_remote_deps_t *); + static parsec_hook_return_t complete_hook_of_dtd(parsec_execution_stream_t *, @@ -216,7 +221,7 @@ parsec_dtd_enqueue_taskpool(parsec_taskpool_t *tp, void *data) 0, 0, 1); /* The second taskclass of every taskpool is the bcast key array propagation taskclass */ parsec_dtd_create_task_class(dtd_tp, parsec_dtd_bcast_key_fn, "parsec_dtd_bcast_key_fn", - 1, sizeof(int), 1); + 2, sizeof(int), 1); return 0; } @@ -1689,6 +1694,126 @@ parsec_dtd_release_deps(parsec_execution_stream_t *es, return 0; } +static int +parsec_dtd_bcast_key_release_deps(parsec_execution_stream_t *es, + parsec_task_t *this_task, + uint32_t action_mask, + parsec_remote_deps_t *deps) +{ + (void)deps; + parsec_release_dep_fct_arg_t arg; + int __vp_id; + + assert(NULL != es); + + PARSEC_PINS(es, RELEASE_DEPS_BEGIN, this_task); +#if defined(DISTRIBUTED) + arg.remote_deps = deps; +#endif /* defined(DISTRIBUTED) */ + + arg.action_mask = action_mask; + arg.output_usage = 0; + arg.output_entry = NULL; + arg.ready_lists = alloca(sizeof(parsec_task_t *) * es->virtual_process->parsec_context->nb_vp); + + for (__vp_id = 0; __vp_id < es->virtual_process->parsec_context->nb_vp; __vp_id++) + arg.ready_lists[__vp_id] = NULL; + + parsec_dtd_task_t *this_dtd_task = NULL; + const parsec_task_class_t *tc = this_task->task_class; + parsec_dtd_taskpool_t *tp = (parsec_dtd_taskpool_t *)this_task->taskpool; + + if( (action_mask & PARSEC_ACTION_COMPLETE_LOCAL_TASK) ) { + this_dtd_task = (parsec_dtd_task_t *)this_task; + int bcast_id = ( (1<<29) | 0 ); + this_dtd_task->ht_item.key = bcast_id; + this_dtd_task->super.locals[0].value = this_dtd_task->ht_item.key; + (void)parsec_atomic_fetch_inc_int32(&this_dtd_task->super.data[0].data_out->readers); + //parsec_dtd_retain_data_copy(this_dtd_task->super.data[0].data_out); + parsec_remote_dep_activate(es, (parsec_task_t *)this_dtd_task, this_dtd_task->deps_out, 
this_dtd_task->deps_out->outgoing_mask); + parsec_dtd_remote_task_retain(this_dtd_task); + this_dtd_task->deps_out = NULL; + tc->iterate_successors(es, (parsec_task_t*)this_dtd_task, action_mask, dtd_release_dep_fct, &arg); + fprintf(stderr, "bcast key release on rank %d\n", this_dtd_task->rank); + parsec_dtd_release_local_task(this_dtd_task); + } else { + int flow_index, track_flow = 0; + for(flow_index = 0; flow_index < tc->nb_flows; flow_index++) { + if((action_mask & (1 << flow_index))) { + if(!(track_flow & (1U << flow_index))) { + uint64_t key = (((uint64_t)this_task->locals[0].value<<32) | (1U<<flow_index)); + parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key); + this_dtd_task = parsec_dtd_find_task( tp, key ); + assert(this_dtd_task != NULL); + + if( this_task->data[flow_index].data_out != NULL ) { + assert(this_task->data[flow_index].data_out != NULL); + this_dtd_task->super.data[flow_index].data_in = this_task->data[flow_index].data_in; + this_dtd_task->super.data[flow_index].data_out = this_task->data[flow_index].data_out; + parsec_dtd_retain_data_copy(this_task->data[flow_index].data_out); + + } + track_flow |= (1U<<flow_index); + parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key); + } + } + } + assert(NULL != this_dtd_task); + tc->iterate_successors(es, (parsec_task_t*)this_dtd_task, action_mask, dtd_release_dep_fct, &arg); + } + + //fprintf(stderr, "this task locals[0] = %d\n", this_task->locals[0].value); + //int *data_ptr; + //data_ptr = (int*)parsec_data_copy_get_ptr(parsec_data_copy); + //assert(NULL != this_dtd_task); + //tc->iterate_successors(es, (parsec_task_t*)this_dtd_task, action_mask, dtd_release_dep_fct, &arg); + +#if defined(DISTRIBUTED) + /* We perform this only for remote tasks that are being activated + * from the comm engine. We remove the task from the hash table + * for each flow a rank is concerned about.
+ */ + if( parsec_dtd_task_is_remote(this_dtd_task) && !(action_mask & PARSEC_ACTION_COMPLETE_LOCAL_TASK) ) { + int flow_index, track_flow = 0; + for(flow_index = 0; flow_index < tc->nb_flows; flow_index++) { + if((action_mask & (1 << flow_index))) { + if(!(track_flow & (1U << flow_index))) { + uint64_t key = (((uint64_t)this_task->locals[0].value<<32) | (1U<<flow_index)); + parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key); + if( NULL != parsec_dtd_untrack_task( tp, key) ) { + /* also releasing task */ + parsec_dtd_remote_task_release( this_dtd_task ); + } + track_flow |= (1U<<flow_index); + parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key); + } + } + } + } +#else + (void)deps; +#endif + + /* Scheduling tasks */ + if (action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { + parsec_vp_t **vps = es->virtual_process->parsec_context->virtual_processes; + for (__vp_id = 0; __vp_id < es->virtual_process->parsec_context->nb_vp; __vp_id++) { + if (NULL == arg.ready_lists[__vp_id]) { + continue; + } + if (__vp_id == es->virtual_process->vp_id) { + __parsec_schedule(es, arg.ready_lists[__vp_id], 0); + }else { + __parsec_schedule(vps[__vp_id]->execution_streams[0], arg.ready_lists[__vp_id], 0); + } + arg.ready_lists[__vp_id] = NULL; + } + } + + PARSEC_PINS(es, RELEASE_DEPS_END, this_task); + return 0; +} + /* **************************************************************************** */ /** * This function is called internally by PaRSEC once a task is done @@ -2055,6 +2180,8 @@ parsec_dtd_create_task_class( parsec_dtd_taskpool_t *__tp, parsec_dtd_funcptr_t* tc->iterate_successors = parsec_dtd_iterate_successors; tc->iterate_predecessors = NULL; tc->release_deps = parsec_dtd_release_deps; + if(tc->task_class_id == PARSEC_DTD_BCAST_KEY_TC_ID) + tc->release_deps = parsec_dtd_bcast_key_release_deps; tc->prepare_input = data_lookup_of_dtd_task; tc->prepare_output = output_data_of_dtd_task; tc->get_datatype = (parsec_datatype_lookup_t *)datatype_lookup_of_dtd_task, @@ -2477,6 +2604,8 @@ int parsec_dtd_bcast_key_fn( parsec_execution_stream_t
*es, parsec_task_t *this_task) { (void)es; (void)this_task; + + fprintf(stderr, "bcast_key_fn executed\n"); return PARSEC_HOOK_RETURN_DONE; } @@ -2493,6 +2622,8 @@ int parsec_dtd_bcast_key_recv( parsec_execution_stream_t *es, parsec_task_t *this_task) { (void)es; (void)this_task; + + fprintf(stderr, "bcast_key_recv executed\n"); return PARSEC_HOOK_RETURN_DONE; } diff --git a/parsec/interfaces/superscalar/insert_function.h b/parsec/interfaces/superscalar/insert_function.h index 215fdf87b..cfbe055ac 100644 --- a/parsec/interfaces/superscalar/insert_function.h +++ b/parsec/interfaces/superscalar/insert_function.h @@ -19,6 +19,11 @@ BEGIN_C_DECLS +#ifdef PARSEC_DIST_COLLECTIVES +// Availability of collective operations with DTD interface. +#define PARSEC_DTD_DIST_COLLECTIVES +#endif + /** * @addtogroup DTD_INTERFACE * @{ @@ -321,6 +326,31 @@ parsec_dtd_get_taskpool(parsec_task_t *this_task); int parsec_dtd_dequeue_taskpool(parsec_taskpool_t *tp); +#ifdef PARSEC_DTD_DIST_COLLECTIVES +/** + * Create and return `parsec_remote_deps_t` structure associated with + * the broadcast of the a data to all the nodes set in the + * `dest_ranks` array. + **/ + +parsec_remote_deps_t* parsec_dtd_create_remote_deps( + int myrank, int root, parsec_data_copy_t *data_copy, + parsec_arena_datatype_t *arenas_datatype, + int* dest_ranks, int num_dest_ranks); + +/** + * Perform a broadcast for of the dtd tile `dtd_tile_root` from the + * root node associated with the rank `root` to the nodes with ranks + * set in the `dest_ranks` array. 
+ **/ + +void parsec_dtd_broadcast( + parsec_taskpool_t *taskpool, int myrank, int root, + parsec_dtd_tile_t* dtd_tile_root, int arena_index, + parsec_dtd_tile_t* bcast_keys_root, int bcast_arena_index, + int* dest_ranks, int num_dest_ranks); +#endif + /** * @} */ diff --git a/parsec/interfaces/superscalar/parsec_dtd_data_flush.c b/parsec/interfaces/superscalar/parsec_dtd_data_flush.c index e93d21e76..0bc1a9890 100644 --- a/parsec/interfaces/superscalar/parsec_dtd_data_flush.c +++ b/parsec/interfaces/superscalar/parsec_dtd_data_flush.c @@ -38,6 +38,7 @@ parsec_dtd_data_flush_sndrcv(parsec_execution_stream_t *es, parsec_dtd_task_t *current_task = (parsec_dtd_task_t *)this_task; parsec_dtd_tile_t *tile = (FLOW_OF(current_task, 0))->tile; + fprintf(stderr, "Executed data flush body in rank %d\n", current_task->rank); assert(tile != NULL); #if defined(DISTRIBUTED) diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index 48af315b4..d35da7c5e 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -730,7 +730,7 @@ remote_dep_get_datatypes(parsec_execution_stream_t* es, parsec_dtd_task_t *dtd_task = NULL; dtd_tp = (parsec_dtd_taskpool_t *)origin->taskpool; - + fprintf(stderr, "working in get datatype\n"); /* if( NULL == task.task_class ), this case will be taken care of automatically */ /* We need to convert from a dep_datatype_index mask into a dep_index @@ -932,6 +932,7 @@ remote_dep_release_incoming(parsec_execution_stream_t* es, if(PARSEC_TASKPOOL_TYPE_PTG == origin->taskpool->taskpool_type) { remote_dep_complete_and_cleanup(&origin, 1); } else { + //remote_dep_complete_and_cleanup(&origin, 1); remote_deps_free(origin); } #else diff --git a/parsec/scheduling.c b/parsec/scheduling.c index a2620681d..891da44c8 100644 --- a/parsec/scheduling.c +++ b/parsec/scheduling.c @@ -703,6 +703,7 @@ int parsec_context_wait( parsec_context_t* context ) { int ret = 0; + fprintf(stderr, "in parsec_context_wait on rank %d\n", context->my_rank); if( 
!(PARSEC_CONTEXT_FLAG_CONTEXT_ACTIVE & context->flags) ) { parsec_warning("parsec_context_wait detected on a non started context\n"); return -1; diff --git a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c new file mode 100644 index 000000000..c26bbf070 --- /dev/null +++ b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c @@ -0,0 +1,419 @@ +#include "mpi.h" + +#include "parsec.h" +#include "parsec/arena.h" +#include "parsec/data_dist/matrix/matrix.h" +#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" +#include "parsec/remote_dep.h" +#include "parsec/data_internal.h" +#include "parsec/interfaces/superscalar/insert_function_internal.h" +#include "parsec/interfaces/superscalar/insert_function.h" + +#include +#include + +enum regions + { + TILE_FULL, + TILE_BCAST + }; + +parsec_tiled_matrix_dc_t *create_and_distribute_data(int rank, int world, int mb, int mt) +{ + two_dim_block_cyclic_t *m = (two_dim_block_cyclic_t*)malloc(sizeof(two_dim_block_cyclic_t)); + two_dim_block_cyclic_init(m, matrix_ComplexDouble, matrix_Tile, + rank, + mb, 1, + mt*mb, 1, + 0, 0, + mt*mb, 1, + world, 1, + 1, 1, + 0, 0); + + m->mat = parsec_data_allocate((size_t)m->super.nb_local_tiles * + (size_t)m->super.bsiz * + (size_t)parsec_datadist_getsizeoftype(m->super.mtype)); + + return (parsec_tiled_matrix_dc_t*)m; +} + +void free_data(parsec_tiled_matrix_dc_t *d) +{ + parsec_matrix_destroy_data(d); + parsec_data_collection_destroy(&d->super); + free(d); +} + +// Read data +int read_task_fn( + parsec_execution_stream_t *es, + parsec_task_t *this_task ) { + (void)es; + + // INPUT data + int *val_in; + // Task rank + int dest_rank; + + parsec_dtd_unpack_args(this_task, &val_in, &dest_rank); + + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + printf("[read_task] rank = %d, val_in = %d\n", myrank, *val_in); + + return PARSEC_HOOK_RETURN_DONE; +} + +// Write data +int write_task_fn( + 
parsec_execution_stream_t *es, + parsec_task_t *this_task) { + (void)es; + + // INOUT data + int *val_out; + // Value to set the data to + int data_value; + // Task rank + int dest_rank; + + parsec_dtd_unpack_args(this_task, &val_out, &data_value, &dest_rank); + + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + printf("[write_task] rank = %d, data_value = %d\n", myrank, data_value); + + *val_out = data_value; + + return PARSEC_HOOK_RETURN_DONE; +} + +// For debugging purpose +void busy_wait() { + // Debug + int stop = 1; + while (stop) {} +} + + +// Retrieve value associated with input data_copy for verification. +int retrieve_task_fn( + parsec_execution_stream_t *es, + parsec_task_t *this_task ) { + (void)es; + + int myrank = -1; + // INPUT data + int *val_in; + // Task rank + int dest_rank; + + int *val_out; + + parsec_dtd_unpack_args(this_task, &val_in, &dest_rank, &val_out); + + /* int myrank; */ + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + printf("[read_task] rank = %d, val_in = %d\n", myrank, *val_in); + + *val_out = *val_in; + + return PARSEC_HOOK_RETURN_DONE; +} + + +int dummy_task_fn( + parsec_execution_stream_t *es, + parsec_task_t *this_task) { + (void)es;(void)this_task; + + return PARSEC_HOOK_RETURN_DONE; +} + +int test_broadcast_mixed( + int world, int myrank, parsec_context_t* parsec_context, int root) { + + // Test return value: + // - 0: success + // - Failure otherwise + int ret = 0; + + // Error code return by parsec routines + int perr; + + // Tile size + int nb = 1; + int nb_bcast = 200; + // Total number of tiles + int nt = 1; + int data_value = 0; + + //sleep(40); + // One element per tile + nb = 1; + // few tiles per node + nt = world*5; + + parsec_taskpool_t *dtd_tp = parsec_dtd_taskpool_new(); + + parsec_matrix_add2arena_rect( + &parsec_dtd_arenas_datatypes[TILE_FULL], + parsec_datatype_int32_t, + nb, 1, nb); + + parsec_matrix_add2arena_rect( + &parsec_dtd_arenas_datatypes[TILE_BCAST], + parsec_datatype_int32_t, + nb_bcast, 1, 
nb_bcast); + // Initial value on the root node. All node should have this value + // at the end of the operation. + int data_root = 55; + + // Final value received on non-root nodes. + int *data_value_out = (int*) calloc(1, sizeof(int)); + *data_value_out = -33; + + if( root == myrank ) { + data_value = data_root; + } + else { + data_value = -10-myrank; + } + + parsec_tiled_matrix_dc_t *dcB; + dcB = create_and_distribute_data(myrank, world, nb_bcast, nt); + parsec_data_collection_set_key((parsec_data_collection_t *)dcB, "B"); + + parsec_data_collection_t *B = (parsec_data_collection_t *)dcB; + parsec_dtd_data_collection_init(B); + + parsec_tiled_matrix_dc_t *dcA; + dcA = create_and_distribute_data(myrank, world, nb, nt); + parsec_data_collection_set_key((parsec_data_collection_t *)dcA, "A"); + + parsec_data_collection_t *A = (parsec_data_collection_t *)dcA; + parsec_dtd_data_collection_init(A); + // Initialize tiles + + parsec_data_copy_t *parsec_data_copy; + parsec_data_t *parsec_data; + // Pointer to local tile data + int *data_ptr; + // Local tile key + int key; + + key = A->data_key(A, myrank, 0); + parsec_data = A->data_of_key(A, key); + parsec_data_copy = parsec_data_get_copy(parsec_data, 0); + data_ptr = (int*)parsec_data_copy_get_ptr(parsec_data_copy); + if (root == myrank) { + *data_ptr = data_value; + } + else { + // Initialise this value with rubbish. It should be equal to + // `data_value` after the execution on even-indexed processes. 
+ data_value_out = data_ptr; + } + //parsec_output(0, "Initial data, node: %d A At key[%d]: %d\n", myrank, key, *data_ptr); + + // Registering the dtd_handle with PARSEC context + perr = parsec_context_add_taskpool( parsec_context, dtd_tp ); + PARSEC_CHECK_ERROR(perr, "parsec_context_add_taskpool"); + + perr = parsec_context_start(parsec_context); + PARSEC_CHECK_ERROR(perr, "parsec_context_start"); + + fprintf(stderr, "parsec context started\n"); + // Key of tile associated with root node + int key_root = key = A->data_key(A, root, 0); + parsec_dtd_tile_t* dtd_tile_root = PARSEC_DTD_TILE_OF_KEY(A, key_root); + key_root = B->data_key(B, root, 0); + parsec_dtd_tile_t* bcast_keys_root = PARSEC_DTD_TILE_OF_KEY(B, key_root); + + // Create array of destination ranks + int num_dest_ranks = 0; + int *dest_ranks = (int*)malloc(world*sizeof(int)); + + // Destination rank index + int dest_rank_idx = 0 ; + + // Put odd rank indexes into `dest_ranks` array except for the root + // node. VALID ONLY ON THE ROOT NODE + for (int rank = 0; rank < world; ++rank) { + if (rank % 2 == 0 || rank == root) continue; + + dest_ranks[dest_rank_idx] = rank; + ++dest_rank_idx; + } + num_dest_ranks = dest_rank_idx; + + // + // Perform Broadcast + // + if(myrank % 2 == 1 || myrank == root) { + fprintf(stderr, "perform bcast from rank %d\n", myrank); + parsec_dtd_broadcast( + dtd_tp, myrank, root, + dtd_tile_root, TILE_FULL, + bcast_keys_root, TILE_BCAST, + dest_ranks, num_dest_ranks); + } + + // + // Retrieve value of broadcasted data + // + if(myrank % 2 == 1 || myrank == root) { + for (int rank = 0; rank < world; ++rank) { + + if (rank % 2 == 0 || rank == root) continue; + + parsec_task_t *retrieve_task = parsec_dtd_taskpool_create_task( + dtd_tp, retrieve_task_fn, 0, "retrieve_task", + PASSED_BY_REF, dtd_tile_root, PARSEC_INPUT | TILE_FULL, + sizeof(int), &rank, PARSEC_VALUE | PARSEC_AFFINITY, + sizeof(int*), &data_value_out, PARSEC_VALUE, + PARSEC_DTD_ARG_END); + //parsec_dtd_task_t 
*dtd_retrieve_task = (parsec_dtd_task_t *)retrieve_task; + //parsec_insert_dtd_task(retrieve_task); + + } + } +for(int iter=1; iter <= 0; iter++) { + // Second round of broadcast, create another array of keys for this bcast + key_root = B->data_key(B, root+iter*world, 0); + //key_root = B->data_key(B, root, 0); + bcast_keys_root = PARSEC_DTD_TILE_OF_KEY(B, key_root); + + //sleep(5); + int new_value = -1; + if (root == myrank) { + //*data_ptr = 1998; + new_value = 1998+iter; + } + else { + //data_value_out = data_ptr; + } + + parsec_dtd_taskpool_insert_task(dtd_tp, + write_task_fn, 0, "write_task", + PASSED_BY_REF, dtd_tile_root, PARSEC_INOUT | TILE_FULL, + sizeof(int), &new_value, PARSEC_VALUE, + sizeof(int), &root, PARSEC_VALUE | PARSEC_AFFINITY, + PARSEC_DTD_ARG_END); + + // Put all rank indexes into `dest_ranks` array except for the root + // node. + dest_rank_idx = 0; + for (int rank = 0; rank < world; ++rank) { + if (rank == root) continue; + dest_ranks[dest_rank_idx] = rank; + ++dest_rank_idx; + } + num_dest_ranks = dest_rank_idx; + + // + // Perform Broadcast AGAIN + // + parsec_dtd_broadcast( + dtd_tp, myrank, root, + dtd_tile_root, TILE_FULL, + bcast_keys_root, TILE_BCAST, + dest_ranks, num_dest_ranks); + + // + // Retrieve value of broadcasted data + // + for (int rank = 0; rank < world; ++rank) { + if ( rank == root) continue; + parsec_task_t *retrieve_task = parsec_dtd_taskpool_create_task( + dtd_tp, retrieve_task_fn, 0, "retrieve_task", + PASSED_BY_REF, dtd_tile_root, PARSEC_INPUT | TILE_FULL, + sizeof(int), &rank, PARSEC_VALUE | PARSEC_AFFINITY, + sizeof(int*), &data_value_out, PARSEC_VALUE, + PARSEC_DTD_ARG_END); + //parsec_dtd_task_t *dtd_retrieve_task = (parsec_dtd_task_t *)retrieve_task; + //parsec_insert_dtd_task(retrieve_task); + + } +} + parsec_dtd_data_flush_all( dtd_tp, A ); + parsec_dtd_data_flush_all( dtd_tp, B ); + + // Wait for task completion + perr = parsec_dtd_taskpool_wait( dtd_tp ); + PARSEC_CHECK_ERROR(perr, 
"parsec_dtd_taskpool_wait"); + + perr = parsec_context_wait(parsec_context); + PARSEC_CHECK_ERROR(perr, "parsec_context_wait"); + + // Check whether we obtained the correct value on the current node + // at the end of the test. Odd processes should have received the + // value form the root and other processes should have kept their + // original value + if ((myrank == root) || + ((myrank % 2 == 1) && (data_root == *data_value_out)) || + ((myrank % 2 == 0) && (data_value == *data_ptr))) { + // Data received as expected + ret = 0; + } + else { + // Error + ret = -1; + } + +// parsec_output( 0, "Checking result, node: %d, data_value_out: %d\n", myrank, *data_value_out ); + + // Cleanup data and parsec data structures + parsec_type_free(&parsec_dtd_arenas_datatypes[TILE_FULL].opaque_dtt); + PARSEC_OBJ_RELEASE(parsec_dtd_arenas_datatypes[TILE_FULL].arena); + parsec_type_free(&parsec_dtd_arenas_datatypes[TILE_BCAST].opaque_dtt); + PARSEC_OBJ_RELEASE(parsec_dtd_arenas_datatypes[TILE_BCAST].arena); + parsec_dtd_data_collection_fini( A ); + parsec_dtd_data_collection_fini( B ); + free_data(dcA); + free_data(dcB); + + parsec_taskpool_free( dtd_tp ); + + return ret; + +} + +int main(int argc, char **argv) { + + int ret; + parsec_context_t* parsec_context = NULL; + + int rank, world; + + { + int provided; + MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &provided); + } + MPI_Comm_size(MPI_COMM_WORLD, &world); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + /* int ncores = 1; */ + int ncores = 2; + parsec_context = parsec_init(ncores, &argc, &argv); + + // Root node for the broadcast operation + + //sleep(30); + // + // Simple broadcast + + // Testing trimming with a mixed destinations of receivers for broadcast + ret = test_broadcast_mixed(world, rank, parsec_context, 0); + + + parsec_fini(&parsec_context); + + MPI_Finalize(); + (void)ret; + return 0; +} From 3f7320f4b1109b3c83e907cebed762239d8629aa Mon Sep 17 00:00:00 2001 From: yu-pei Date: Sun, 26 Sep 2021 22:16:49 -0400 
Subject: [PATCH 03/41] added a sleep to have a temp solution for the last data flush iterate successor issue --- .../interfaces/superscalar/insert_function.c | 197 ++++++++++++++++-- 1 file changed, 182 insertions(+), 15 deletions(-) diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index f88d0d134..4fc0b5975 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -96,6 +96,12 @@ parsec_dtd_iterate_successors(parsec_execution_stream_t *es, uint32_t action_mask, parsec_ontask_function_t *ontask, void *ontask_arg); +static void +parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, + const parsec_task_t *this_task, + uint32_t action_mask, + parsec_ontask_function_t *ontask, + void *ontask_arg); static int parsec_dtd_release_deps(parsec_execution_stream_t *, @@ -1573,6 +1579,176 @@ parsec_dtd_iterate_successors(parsec_execution_stream_t *es, action_mask, ontask, ontask_arg ); } +static void +parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, + const parsec_task_t *this_task, + uint32_t action_mask, + parsec_ontask_function_t *ontask, + void *ontask_arg) +{ + parsec_dtd_task_t *current_task = (parsec_dtd_task_t *)this_task; + int current_dep; + parsec_dtd_task_t *current_desc = NULL; + int op_type_on_current_flow, desc_op_type, desc_flow_index; + parsec_dtd_tile_t *tile; + + parsec_dep_t deps; + parsec_release_dep_fct_arg_t *arg = (parsec_release_dep_fct_arg_t *)ontask_arg; + parsec_dep_data_description_t data; + int rank_src = 0, rank_dst = 0, vpid_dst=0; + parsec_dtd_flow_info_t* flow; + + /* finding for which flow we need to iterate successors of */ + int flow_mask = action_mask; + int my_rank = current_task->super.taskpool->context->my_rank; + + rank_src = current_task->rank; + for( current_dep = 0; current_dep < current_task->super.task_class->nb_flows; current_dep++ ) { + if( (flow_mask & (1<deps_out); + 
//int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); + current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) | 0); + fprintf(stderr, "bcast root dep %d with chain successor %d\n", current_dep, 1); + (void)parsec_atomic_fetch_inc_int32(&current_task->super.data[current_dep].data_out->readers); + parsec_remote_dep_activate( + es, (parsec_task_t *)current_task, + current_task->deps_out, + current_task->deps_out->outgoing_mask); + current_task->deps_out = NULL; + parsec_dtd_release_local_task( current_task ); + } else { + /* a node in the key array propagation */ + int root = current_task->deps_out->root; + int my_rank = current_task->super.taskpool->context->my_rank; + + int _array_pos, _array_mask; + struct remote_dep_output_param_s* output; + output = &current_task->deps_out->output[0]; + _array_pos = my_rank / (8 * sizeof(uint32_t)); + _array_mask = 1 << (my_rank % (8 * sizeof(uint32_t))); + + if ((output->rank_bits[_array_pos] & _array_mask)) { + // We are part of the broadcast, forward message + } + } + sleep(1); + //if(my_rank == 0) + // goto skip_iterate; + current_desc = (DESC_OF(current_task, current_dep))->task; + op_type_on_current_flow = (FLOW_OF(current_task, current_dep)->op_type & PARSEC_GET_OP_TYPE); + tile = FLOW_OF(current_task, current_dep)->tile; + + if( NULL == current_desc ) { + if( PARSEC_INOUT == op_type_on_current_flow || + PARSEC_OUTPUT == op_type_on_current_flow ) { + if(action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { + if(0) { /* trying to release ownership */ + continue; /* no descendent for this data */ + } else { + current_desc = (DESC_OF(current_task, current_dep))->task; + } /* Current task has a descendant hence we must activate her */ + } + } + } + +#if defined(PARSEC_DEBUG_ENABLE) + assert(current_desc != NULL); +#endif + + /* setting data */ + data.data = current_task->super.data[current_dep].data_out; + data.arena = parsec_dtd_arenas_datatypes[FLOW_OF(current_task,
current_dep)->arena_index].arena; + data.layout = parsec_dtd_arenas_datatypes[FLOW_OF(current_task, current_dep)->arena_index].opaque_dtt; + data.count = 1; + data.displ = 0; + + desc_op_type = ((DESC_OF(current_task, current_dep))->op_type & PARSEC_GET_OP_TYPE); + desc_flow_index = (DESC_OF(current_task, current_dep))->flow_index; + + int get_out = 0, tmp_desc_flow_index, release_parent = 0; + parsec_dtd_task_t *nextinline = current_desc; + + do { + tmp_desc_flow_index = desc_flow_index; + current_desc = nextinline; + assert(NULL != current_desc); + /* Forward the data to each successor */ + if(action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { + if(parsec_dtd_task_is_local(current_desc)) { + current_desc->super.data[desc_flow_index].data_in = current_task->super.data[current_dep].data_out; + /* We retain local, remote data for each successor */ + parsec_dtd_retain_data_copy(current_task->super.data[current_dep].data_out); + } + } + + get_out = 1; /* by default escape */ + if( !(PARSEC_OUTPUT == desc_op_type || PARSEC_INOUT == desc_op_type) ) { + + look_for_next: + nextinline = (DESC_OF(current_desc, desc_flow_index))->task; + if( NULL != nextinline ) { + desc_op_type = ((DESC_OF(current_desc, desc_flow_index))->op_type & PARSEC_GET_OP_TYPE); + desc_flow_index = (DESC_OF(current_desc, desc_flow_index))->flow_index; + get_out = 0; /* We have a successor, keep going */ + if( nextinline == current_desc ) { + /* We have same descendant using same data in multiple flows + * So we activate the successor once and skip the other times + */ + if( parsec_dtd_task_is_remote(current_desc) ) { + /* releasing remote read task that is in the chain */ + parsec_dtd_remote_task_release( current_desc ); + } + continue; + } else { + if(action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) + (DESC_OF(current_desc, tmp_desc_flow_index))->task = NULL; + } + } else { + if(action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { + /* Make sure there is no nextinline */ + if( 1 ) { + } else { + goto 
look_for_next; + } + } + } + + if(action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { + if(parsec_dtd_task_is_local(current_desc)){ + (void)parsec_atomic_fetch_inc_int32( &current_task->super.data[current_dep].data_out->readers ); + } + } + /* Each reader increments the ref count of the data_copy + * We should have a function to retain data copies like + * PARSEC_DATA_COPY_RELEASE + */ + } + + deps.cond = NULL; + deps.ctl_gather_nb = NULL; + deps.task_class_id = current_desc->super.task_class->task_class_id; + deps.flow = current_desc->super.task_class->in[tmp_desc_flow_index]; + deps.dep_index = tmp_desc_flow_index; + deps.belongs_to = current_task->super.task_class->out[current_dep]; + deps.direct_data = NULL; + deps.dep_datatype_index = current_dep; + + rank_dst = current_desc->rank; + + ontask( es, (parsec_task_t *)current_desc, (parsec_task_t *)current_task, + &deps, &data, rank_src, rank_dst, vpid_dst, ontask_arg ); + vpid_dst = (vpid_dst+1) % current_task->super.taskpool->context->nb_vp; + + } while (0 == get_out); + +//skip_iterate: + } + } +} + /* **************************************************************************** */ /** * Release dependencies after a task is done @@ -1724,18 +1900,10 @@ parsec_dtd_bcast_key_release_deps(parsec_execution_stream_t *es, parsec_dtd_taskpool_t *tp = (parsec_dtd_taskpool_t *)this_task->taskpool; if( (action_mask & PARSEC_ACTION_COMPLETE_LOCAL_TASK) ) { + /* root of the bcast key operation */ this_dtd_task = (parsec_dtd_task_t *)this_task; - int bcast_id = ( (1<<29) | 0 ); - this_dtd_task->ht_item.key = bcast_id; - this_dtd_task->super.locals[0].value = this_dtd_task->ht_item.key; - (void)parsec_atomic_fetch_inc_int32(&this_dtd_task->super.data[0].data_out->readers); - //parsec_dtd_retain_data_copy(this_dtd_task->super.data[0].data_out); - parsec_remote_dep_activate(es, (parsec_task_t *)this_dtd_task, this_dtd_task->deps_out, this_dtd_task->deps_out->outgoing_mask); - parsec_dtd_remote_task_retain(this_dtd_task); -
this_dtd_task->deps_out = NULL; - tc->iterate_successors(es, (parsec_task_t*)this_dtd_task, action_mask, dtd_release_dep_fct, &arg); fprintf(stderr, "bcast key release on rank %d\n", this_dtd_task->rank); - parsec_dtd_release_local_task(this_dtd_task); + //parsec_dtd_remote_task_retain(this_dtd_task); } else { int flow_index, track_flow = 0; for(flow_index = 0; flow_index < tc->nb_flows; flow_index++) { @@ -1758,15 +1926,12 @@ parsec_dtd_bcast_key_release_deps(parsec_execution_stream_t *es, } } } - assert(NULL != this_dtd_task); - tc->iterate_successors(es, (parsec_task_t*)this_dtd_task, action_mask, dtd_release_dep_fct, &arg); } - //fprintf(stderr, "this task locals[0] = %d\n", this_task->locals[0].value); //int *data_ptr; //data_ptr = (int*)parsec_data_copy_get_ptr(parsec_data_copy); - //assert(NULL != this_dtd_task); - //tc->iterate_successors(es, (parsec_task_t*)this_dtd_task, action_mask, dtd_release_dep_fct, &arg); + assert(NULL != this_dtd_task); + tc->iterate_successors(es, (parsec_task_t*)this_dtd_task, action_mask, dtd_release_dep_fct, &arg); #if defined(DISTRIBUTED) /* We perform this only for remote tasks that are being activated @@ -2178,6 +2343,8 @@ parsec_dtd_create_task_class( parsec_dtd_taskpool_t *__tp, parsec_dtd_funcptr_t* *incarnations = (__parsec_chore_t *)dtd_chore; tc->find_deps = NULL; tc->iterate_successors = parsec_dtd_iterate_successors; + if(tc->task_class_id == PARSEC_DTD_BCAST_KEY_TC_ID) + tc->iterate_successors = parsec_dtd_bcast_key_iterate_successors; tc->iterate_predecessors = NULL; tc->release_deps = parsec_dtd_release_deps; if(tc->task_class_id == PARSEC_DTD_BCAST_KEY_TC_ID) From db21cdadc582074c774fc9c9176e5b6b62a2b34f Mon Sep 17 00:00:00 2001 From: yu-pei Date: Mon, 27 Sep 2021 12:20:51 -0400 Subject: [PATCH 04/41] add in populate remote dep and check successors code for chain topology --- .../interfaces/superscalar/insert_function.c | 96 +++++++++++++++++-- 1 file changed, 86 insertions(+), 10 deletions(-) diff --git 
a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index 4fc0b5975..09b5f938e 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -1579,6 +1579,74 @@ parsec_dtd_iterate_successors(parsec_execution_stream_t *es, action_mask, ontask, ontask_arg ); } +/* when the comm_coll_bcast is 1 we use the chain topology, get the successor's rank */ +static int +get_chain_successor(parsec_execution_stream_t*es, parsec_task_t* task, parsec_remote_deps_t* remote_deps) +{ + int my_idx, idx, current_mask; + unsigned int array_index, count, bit_index; + uint32_t boffset; + uint32_t dep_fw_mask[es->virtual_process->parsec_context->remote_dep_fw_mask_sizeof]; + memset(dep_fw_mask, 0, es->virtual_process->parsec_context->remote_dep_fw_mask_sizeof); + memcpy(&dep_fw_mask, remote_deps->remote_dep_fw_mask, es->virtual_process->parsec_context->remote_dep_fw_mask_sizeof); + struct remote_dep_output_param_s* output = &remote_deps->output[0]; + boffset = remote_deps->root / (8 * sizeof(uint32_t)); + dep_fw_mask[boffset] |= ((uint32_t)1) << (remote_deps->root % (8 * sizeof(uint32_t))); + my_idx = (remote_deps->root == es->virtual_process->parsec_context->my_rank) ? 
0 : -1; + idx = 0; + for(array_index = count = 0; count < remote_deps->output[0].count_bits; array_index++) { + current_mask = output->rank_bits[array_index]; + if( 0 == current_mask ) continue; + for( bit_index = 0; current_mask != 0; bit_index++ ) { + if( !(current_mask & (1 << bit_index)) ) continue; + int rank = (array_index * sizeof(uint32_t) * 8) + bit_index; + current_mask ^= (1 << bit_index); + count++; + + boffset = rank / (8 * sizeof(uint32_t)); + if(dep_fw_mask[boffset] & ((uint32_t)1) << (rank % (8 * sizeof(uint32_t)))) + continue; + idx++; + if(my_idx == -1) { + if(rank == es->virtual_process->parsec_context->my_rank) { + my_idx = idx; + } + boffset = rank / (8 * sizeof(uint32_t)); + dep_fw_mask[boffset] |= ((uint32_t)1) << (rank % (8 * sizeof(uint32_t))); + continue; + } + if(my_idx != -1){ + if(idx == my_idx+1) + { + return rank; + } + } + } + } + return 0; +} + +static int +populate_remote_deps(int* data_ptr, parsec_remote_deps_t* remote_deps) +{ + struct remote_dep_output_param_s* output = &remote_deps->output[0]; + int _array_pos, _array_mask; + uint32_t dest_rank_idx; + /* TODO: don't assume the length of data_ptr */ + int num_dest_ranks = data_ptr[100]; + + for(dest_rank_idx = 0; dest_rank_idx < (uint32_t)num_dest_ranks; ++dest_rank_idx) { + uint32_t dest_rank = data_ptr[100+dest_rank_idx+1]; + _array_pos = dest_rank / (8 * sizeof(uint32_t)); + _array_mask = 1 << (dest_rank % (8 * sizeof(uint32_t))); + + if( !(output->rank_bits[_array_pos] & _array_mask) ) { + output->rank_bits[_array_pos] |= _array_mask; + output->count_bits++; + } + } +} + static void parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, const parsec_task_t *this_task, @@ -1601,17 +1669,23 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, /* finding for which flow we need to iterate successors of */ int flow_mask = action_mask; int my_rank = current_task->super.taskpool->context->my_rank; + int successor = -1; rank_src = 
current_task->rank; + + int rc; /* retrive the mca number for comm_coll_bcast */ + int comm_coll_bcast; /* retrive the value set for comm_coll_bcast */ + if (0 < (rc = parsec_mca_param_find("runtime", NULL, "comm_coll_bcast")) ) { + parsec_mca_param_lookup_int(rc, &comm_coll_bcast); + } for( current_dep = 0; current_dep < current_task->super.task_class->nb_flows; current_dep++ ) { if( (flow_mask & (1<deps_out); - //int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); + successor = get_chain_successor(es, current_task, current_task->deps_out); + int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) | 0); - fprintf(stderr, "bcast root dep %d with chain successor %d\n", current_dep, 1); + fprintf(stderr, "bcast root dep %d with chain successor %d\n", current_dep, successor); (void)parsec_atomic_fetch_inc_int32(&current_task->super.data[current_dep].data_out->readers); parsec_remote_dep_activate( es, (parsec_task_t *)current_task, @@ -1619,7 +1693,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, current_task->deps_out->outgoing_mask); current_task->deps_out = NULL; parsec_dtd_release_local_task( current_task ); - } else { + } else if (action_mask == PARSEC_ACTION_RELEASE_LOCAL_DEPS) { /* a node in the key array propagation */ int root = current_task->deps_out->root; int my_rank = current_task->super.taskpool->context->my_rank; @@ -1631,12 +1705,16 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, _array_mask = 1 << (my_rank % (8 * sizeof(uint32_t))); if ((output->rank_bits[_array_pos] & _array_mask)) { - // We are part of the broadcast, forward message + /* We are part of the broadcast, forward message */ + int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); + populate_remote_deps(data_ptr, current_task->deps_out); + successor = get_chain_successor(es,
current_task, current_task->deps_out); + fprintf(stderr, "continuation with chain successor %d\n", successor); + } } + // temp fix to ensure descendent exist sleep(1); - //if(my_rank == 0) - // goto skip_iterate; current_desc = (DESC_OF(current_task, current_dep))->task; op_type_on_current_flow = (FLOW_OF(current_task, current_dep)->op_type & PARSEC_GET_OP_TYPE); tile = FLOW_OF(current_task, current_dep)->tile; @@ -1743,8 +1821,6 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, vpid_dst = (vpid_dst+1) % current_task->super.taskpool->context->nb_vp; } while (0 == get_out); - -//skip_iterate: } } } From 5345981b21106d052647e6aa62ba3181a5f39d0b Mon Sep 17 00:00:00 2001 From: yu-pei Date: Tue, 28 Sep 2021 14:03:42 -0400 Subject: [PATCH 05/41] handle calling iterate successor from get datatype --- .../interfaces/superscalar/insert_function.c | 153 ++++++------------ .../dtd_test_broadcast_collective.c | 2 +- 2 files changed, 54 insertions(+), 101 deletions(-) diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index 09b5f938e..12f9230bd 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -1712,115 +1712,49 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, fprintf(stderr, "continuation with chain successor %d\n", successor); } + } else { + /* on the receiver side, get datatype to aquire datatype, arena etc info */ + data.data = current_task->super.data[current_dep].data_out; + data.arena = parsec_dtd_arenas_datatypes[FLOW_OF(current_task, current_dep)->arena_index].arena; + data.layout = parsec_dtd_arenas_datatypes[FLOW_OF(current_task, current_dep)->arena_index].opaque_dtt; + data.count = 1; + data.displ = 0; + deps.cond = NULL; + deps.ctl_gather_nb = NULL; + //deps.task_class_id = current_desc->super.task_class->task_class_id; + deps.flow = current_task->super.task_class->out[current_dep]; + 
deps.dep_index = desc_flow_index; + deps.belongs_to = current_task->super.task_class->out[current_dep]; + deps.direct_data = NULL; + deps.dep_datatype_index = current_dep; + ontask( es, (parsec_task_t *)current_task, (parsec_task_t *)current_task, + &deps, &data, current_task->rank, my_rank, vpid_dst, ontask_arg ); } // temp fix to ensure descendent exist - sleep(1); - current_desc = (DESC_OF(current_task, current_dep))->task; - op_type_on_current_flow = (FLOW_OF(current_task, current_dep)->op_type & PARSEC_GET_OP_TYPE); - tile = FLOW_OF(current_task, current_dep)->tile; - - if( NULL == current_desc ) { - if( PARSEC_INOUT == op_type_on_current_flow || - PARSEC_OUTPUT == op_type_on_current_flow ) { - if(action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { - if(0) { /* trying to release ownership */ - continue; /* no descendent for this data */ - } else { - current_desc = (DESC_OF(current_task, current_dep))->task; - } /* Current task has a descendant hence we must activate her */ - } - } - } + //sleep(1); + //op_type_on_current_flow = (FLOW_OF(current_task, current_dep)->op_type & PARSEC_GET_OP_TYPE); + //tile = FLOW_OF(current_task, current_dep)->tile; -#if defined(PARSEC_DEBUG_ENABLE) - assert(current_desc != NULL); -#endif - /* setting data */ - data.data = current_task->super.data[current_dep].data_out; - data.arena = parsec_dtd_arenas_datatypes[FLOW_OF(current_task, current_dep)->arena_index].arena; - data.layout = parsec_dtd_arenas_datatypes[FLOW_OF(current_task, current_dep)->arena_index].opaque_dtt; - data.count = 1; - data.displ = 0; - - desc_op_type = ((DESC_OF(current_task, current_dep))->op_type & PARSEC_GET_OP_TYPE); - desc_flow_index = (DESC_OF(current_task, current_dep))->flow_index; - - int get_out = 0, tmp_desc_flow_index, release_parent = 0; - parsec_dtd_task_t *nextinline = current_desc; - - do { - tmp_desc_flow_index = desc_flow_index; - current_desc = nextinline; - assert(NULL != current_desc); - /* Forward the data to each successor */ - 
if(action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { - if(parsec_dtd_task_is_local(current_desc)) { - current_desc->super.data[desc_flow_index].data_in = current_task->super.data[current_dep].data_out; - /* We retain local, remote data for each successor */ - parsec_dtd_retain_data_copy(current_task->super.data[current_dep].data_out); - } - } - get_out = 1; /* by default escape */ - if( !(PARSEC_OUTPUT == desc_op_type || PARSEC_INOUT == desc_op_type) ) { - - look_for_next: - nextinline = (DESC_OF(current_desc, desc_flow_index))->task; - if( NULL != nextinline ) { - desc_op_type = ((DESC_OF(current_desc, desc_flow_index))->op_type & PARSEC_GET_OP_TYPE); - desc_flow_index = (DESC_OF(current_desc, desc_flow_index))->flow_index; - get_out = 0; /* We have a successor, keep going */ - if( nextinline == current_desc ) { - /* We have same descendant using same data in multiple flows - * So we activate the successor once and skip the other times - */ - if( parsec_dtd_task_is_remote(current_desc) ) { - /* releasing remote read task that is in the chain */ - parsec_dtd_remote_task_release( current_desc ); - } - continue; - } else { - if(action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) - (DESC_OF(current_desc, tmp_desc_flow_index))->task = NULL; - } - } else { - if(action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { - /* Make sure there is no nextinline */ - if( 1 ) { - } else { - goto look_for_next; - } - } - } - if(action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { - if(parsec_dtd_task_is_local(current_desc)){ - (void)parsec_atomic_fetch_inc_int32( &current_task->super.data[current_dep].data_out->readers ); - } - } - /* Each reader increments the ref count of the data_copy - * We should have a function to retain data copies like - * PARSEC_DATA_COPY_RELEASE - */ - } + if(action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { + //if(parsec_dtd_task_is_local(current_desc)){ + (void)parsec_atomic_fetch_inc_int32( &current_task->super.data[current_dep].data_out->readers ); + //} + } + /* Each reader
increments the ref count of the data_copy + * We should have a function to retain data copies like + * PARSEC_DATA_COPY_RELEASE + */ - deps.cond = NULL; - deps.ctl_gather_nb = NULL; - deps.task_class_id = current_desc->super.task_class->task_class_id; - deps.flow = current_desc->super.task_class->in[tmp_desc_flow_index]; - deps.dep_index = tmp_desc_flow_index; - deps.belongs_to = current_task->super.task_class->out[current_dep]; - deps.direct_data = NULL; - deps.dep_datatype_index = current_dep; - rank_dst = current_desc->rank; + //rank_dst = current_desc->rank; - ontask( es, (parsec_task_t *)current_desc, (parsec_task_t *)current_task, - &deps, &data, rank_src, rank_dst, vpid_dst, ontask_arg ); - vpid_dst = (vpid_dst+1) % current_task->super.taskpool->context->nb_vp; + //ontask( es, (parsec_task_t *)current_desc, (parsec_task_t *)current_task, + // &deps, &data, rank_src, rank_dst, vpid_dst, ontask_arg ); + //vpid_dst = (vpid_dst+1) % current_task->super.taskpool->context->nb_vp; - } while (0 == get_out); } } } @@ -2273,6 +2207,23 @@ static int datatype_lookup_of_dtd_task(parsec_execution_stream_t *es, return PARSEC_HOOK_RETURN_DONE; } +static int bcast_key_datatype_lookup_of_dtd_task(parsec_execution_stream_t *es, + const parsec_task_t *this_task, + uint32_t *flow_mask, parsec_dep_data_description_t *data) +{ + (void)es; + data->count = 1; + data->displ = 0; + data->arena = NULL; + data->data = NULL; + data->layout = PARSEC_DATATYPE_NULL; + data->count = 0; + data->displ = 0; + (*flow_mask) = 0; /* nothing left */ + + return PARSEC_HOOK_RETURN_DONE; +} + /* This function creates relationship between two task function classes. 
* Arguments: - parsec taskpool (parsec_taskpool_t *) - parent master structure (parsec_task_class_t *) @@ -2427,7 +2378,9 @@ parsec_dtd_create_task_class( parsec_dtd_taskpool_t *__tp, parsec_dtd_funcptr_t* tc->release_deps = parsec_dtd_bcast_key_release_deps; tc->prepare_input = data_lookup_of_dtd_task; tc->prepare_output = output_data_of_dtd_task; - tc->get_datatype = (parsec_datatype_lookup_t *)datatype_lookup_of_dtd_task, + tc->get_datatype = (parsec_datatype_lookup_t *)datatype_lookup_of_dtd_task; + if(tc->task_class_id == PARSEC_DTD_BCAST_KEY_TC_ID) + tc->get_datatype = (parsec_datatype_lookup_t *)bcast_key_datatype_lookup_of_dtd_task; tc->complete_execution = complete_hook_of_dtd; tc->release_task = parsec_release_dtd_task_to_mempool; diff --git a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c index c26bbf070..e1397ef09 100644 --- a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c +++ b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c @@ -403,7 +403,7 @@ int main(int argc, char **argv) { // Root node for the broadcast operation - //sleep(30); + sleep(30); // // Simple broadcast From 95c9066229d878314814d673a503c67dc37890fc Mon Sep 17 00:00:00 2001 From: yu-pei Date: Tue, 28 Sep 2021 17:58:22 -0400 Subject: [PATCH 06/41] node in the chain continue propagation; TODO: data, remote dep ref count accounting --- parsec/interfaces/superscalar/insert_function.c | 16 +++++++++++++--- parsec/remote_dep.c | 7 +++++-- parsec/remote_dep_mpi.c | 4 ++-- .../superscalar/dtd_test_broadcast_collective.c | 2 +- 4 files changed, 21 insertions(+), 8 deletions(-) diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index 12f9230bd..365be4724 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -1684,7 +1684,7 @@ 
parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, /* root of the bcast key */ successor = get_chain_successor(es, current_task, current_task->deps_out); int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); - current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) | 0); + current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) | *(data_ptr+1+successor)); fprintf(stderr, "bcast root dep %d with chain successor %d\n", current_dep, successor); (void)parsec_atomic_fetch_inc_int32(&current_task->super.data[current_dep].data_out->readers); parsec_remote_dep_activate( @@ -1693,7 +1693,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, current_task->deps_out->outgoing_mask); current_task->deps_out = NULL; parsec_dtd_release_local_task( current_task ); - } else if (action_mask == PARSEC_ACTION_RELEASE_LOCAL_DEPS) { + } else if (action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { /* a node in the key array propagation */ int root = current_task->deps_out->root; int my_rank = current_task->super.taskpool->context->my_rank; @@ -1710,7 +1710,17 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, populate_remote_deps(data_ptr, current_task->deps_out); successor = get_chain_successor(es, current_task, current_task->deps_out); fprintf(stderr, "continuation with chain successor %d\n", successor); - + current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) | *(data_ptr+1+successor)); + assert(NULL != current_task->super.data[current_dep].data_out); + + current_task->deps_out->output[0].data.data = current_task->super.data[0].data_out; + //parsec_atomic_fetch_inc_int32(&current_task->deps_out->pending_ack); + parsec_remote_dep_activate( + es, (parsec_task_t *)current_task, + current_task->deps_out, + current_task->deps_out->outgoing_mask); + current_task->deps_out = NULL; + //parsec_dtd_release_local_task( current_task ); } } else { /* on the
receiver side, get datatype to aquire datatype, arena etc info */ diff --git a/parsec/remote_dep.c b/parsec/remote_dep.c index 09a3131a4..a256bd518 100644 --- a/parsec/remote_dep.c +++ b/parsec/remote_dep.c @@ -533,9 +533,12 @@ int parsec_remote_dep_activate(parsec_execution_stream_t* es, /* Right now DTD only supports a star broadcast topology */ if( PARSEC_TASKPOOL_TYPE_DTD == task->taskpool->taskpool_type ) { parsec_dtd_task_t *this_dtd_task = (parsec_dtd_task_t *) task; - if(this_dtd_task->deps_out == NULL) + if(this_dtd_task->deps_out == NULL) { remote_deps->msg.locals[0].value = remote_deps->bcast_keys[i]; /* p2p, update the key for this message */ - remote_dep_bcast_child_permits = remote_dep_bcast_star_child(my_idx, idx); + remote_dep_bcast_child_permits = remote_dep_bcast_star_child(my_idx, idx); + } else { + remote_dep_bcast_child_permits = remote_dep_bcast_child(my_idx, idx); + } } else { remote_dep_bcast_child_permits = remote_dep_bcast_child(my_idx, idx); } diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index d35da7c5e..6f89cce8f 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -932,8 +932,8 @@ remote_dep_release_incoming(parsec_execution_stream_t* es, if(PARSEC_TASKPOOL_TYPE_PTG == origin->taskpool->taskpool_type) { remote_dep_complete_and_cleanup(&origin, 1); } else { - //remote_dep_complete_and_cleanup(&origin, 1); - remote_deps_free(origin); + remote_dep_complete_and_cleanup(&origin, 1); + //remote_deps_free(origin); } #else remote_deps_free(origin); diff --git a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c index e1397ef09..88add0ea9 100644 --- a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c +++ b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c @@ -341,7 +341,7 @@ for(int iter=1; iter <= 0; iter++) { } } parsec_dtd_data_flush_all( dtd_tp, A ); - parsec_dtd_data_flush_all( dtd_tp, B ); + 
//parsec_dtd_data_flush_all( dtd_tp, B ); // Wait for task completion perr = parsec_dtd_taskpool_wait( dtd_tp ); From 0492882a899cf7f2937e2afb3530513be3b523ed Mon Sep 17 00:00:00 2001 From: yu-pei Date: Wed, 29 Sep 2021 16:09:40 -0400 Subject: [PATCH 07/41] add in some tile clean up on root node from data flush --- .../interfaces/superscalar/insert_function.c | 37 ++++++++++++++----- parsec/remote_dep_mpi.c | 4 +- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index 365be4724..0b568b256 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -1687,12 +1687,16 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) | *(data_ptr+1+successor)); fprintf(stderr, "bcast root dep %d with chain successor %d\n", current_dep, successor); (void)parsec_atomic_fetch_inc_int32(&current_task->super.data[current_dep].data_out->readers); + tile = FLOW_OF(current_task, current_dep)->tile; + parsec_dtd_tile_retain(tile); parsec_remote_dep_activate( es, (parsec_task_t *)current_task, current_task->deps_out, current_task->deps_out->outgoing_mask); current_task->deps_out = NULL; - parsec_dtd_release_local_task( current_task ); + parsec_dtd_remote_task_release(this_task); /* decrease the count as in the data flush */ + //parsec_dtd_release_local_task( current_task ); + } else if (action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { /* a node in the key array propagation */ int root = current_task->deps_out->root; @@ -1715,12 +1719,19 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, current_task->deps_out->output[0].data.data = current_task->super.data[0].data_out; //parsec_atomic_fetch_inc_int32(&current_task->deps_out->pending_ack); + (void)parsec_atomic_fetch_inc_int32(
&current_task->super.data[current_dep].data_out->readers ); + (void)parsec_atomic_fetch_inc_int32( &current_task->super.data[current_dep].data_out->readers ); + parsec_dtd_retain_data_copy(current_task->super.data[current_dep].data_out); parsec_remote_dep_activate( es, (parsec_task_t *)current_task, current_task->deps_out, current_task->deps_out->outgoing_mask); current_task->deps_out = NULL; //parsec_dtd_release_local_task( current_task ); + /* releasing the receiver task as the only desc task */ + current_desc = (DESC_OF(current_task, current_dep))->task; + ontask( es, (parsec_task_t *)current_desc, (parsec_task_t *)current_task, + &deps, &data, current_task->rank, my_rank, vpid_dst, ontask_arg ); } } else { /* on the receiver side, get datatype to aquire datatype, arena etc info */ @@ -1729,14 +1740,14 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, data.layout = parsec_dtd_arenas_datatypes[FLOW_OF(current_task, current_dep)->arena_index].opaque_dtt; data.count = 1; data.displ = 0; - deps.cond = NULL; - deps.ctl_gather_nb = NULL; - //deps.task_class_id = current_desc->super.task_class->task_class_id; - deps.flow = current_task->super.task_class->out[current_dep]; - deps.dep_index = desc_flow_index; - deps.belongs_to = current_task->super.task_class->out[current_dep]; - deps.direct_data = NULL; - deps.dep_datatype_index = current_dep; + deps.cond = NULL; + deps.ctl_gather_nb = NULL; + //deps.task_class_id = current_desc->super.task_class->task_class_id; + deps.flow = current_task->super.task_class->out[current_dep]; + deps.dep_index = desc_flow_index; + deps.belongs_to = current_task->super.task_class->out[current_dep]; + deps.direct_data = NULL; + deps.dep_datatype_index = current_dep; ontask( es, (parsec_task_t *)current_task, (parsec_task_t *)current_task, &deps, &data, current_task->rank, my_rank, vpid_dst, ontask_arg ); } @@ -1750,7 +1761,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, if(action_mask &
PARSEC_ACTION_RELEASE_LOCAL_DEPS) { //if(parsec_dtd_task_is_local(current_desc)){ - (void)parsec_atomic_fetch_inc_int32( &current_task->super.data[current_dep].data_out->readers ); + //(void)parsec_atomic_fetch_inc_int32( &current_task->super.data[current_dep].data_out->readers ); //} } /* Each reader increments the ref count of the data_copy @@ -2078,6 +2089,12 @@ parsec_dtd_release_local_task( parsec_dtd_task_t *this_task ) if(PARSEC_DTD_FLUSH_TC_ID == this_task->super.task_class->task_class_id) { assert( current_flow == 0 ); parsec_dtd_tile_release( tile ); + } + if(PARSEC_DTD_BCAST_KEY_TC_ID == this_task->super.task_class->task_class_id) { + assert( current_flow == 0 ); + tile->flushed = FLUSHED; + parsec_dtd_tile_remove( tile->dc, tile->key ); + parsec_dtd_tile_release( tile ); } } assert(this_task->super.super.super.obj_reference_count == 1); diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index 6f89cce8f..d35da7c5e 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -932,8 +932,8 @@ remote_dep_release_incoming(parsec_execution_stream_t* es, if(PARSEC_TASKPOOL_TYPE_PTG == origin->taskpool->taskpool_type) { remote_dep_complete_and_cleanup(&origin, 1); } else { - remote_dep_complete_and_cleanup(&origin, 1); - //remote_deps_free(origin); + //remote_dep_complete_and_cleanup(&origin, 1); + remote_deps_free(origin); } #else remote_deps_free(origin); From bcf2c35266690a0feb00464290f56440998bcce3 Mon Sep 17 00:00:00 2001 From: yu-pei Date: Wed, 29 Sep 2021 23:24:01 -0400 Subject: [PATCH 08/41] fixes tile accounting on nodes in the chain; TODO: clean up of non participating nodes --- parsec/interfaces/superscalar/insert_function.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index 0b568b256..6fc2f5b58 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@
-1720,16 +1720,20 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, current_task->deps_out->output[0].data.data = current_task->super.data[0].data_out; //parsec_atomic_fetch_inc_int32(&current_task->deps_out->pending_ack); (void)parsec_atomic_fetch_inc_int32( &current_task->super.data[current_dep].data_out->readers ); - (void)parsec_atomic_fetch_inc_int32( &current_task->super.data[current_dep].data_out->readers ); parsec_dtd_retain_data_copy(current_task->super.data[current_dep].data_out); parsec_remote_dep_activate( es, (parsec_task_t *)current_task, current_task->deps_out, current_task->deps_out->outgoing_mask); current_task->deps_out = NULL; + parsec_dtd_remote_task_release(this_task); /* decrease the count as in the data flush */ //parsec_dtd_release_local_task( current_task ); /* releasing the receiver task as the only desc task */ + tile = FLOW_OF(current_task, current_dep)->tile; + parsec_dtd_tile_retain(tile); current_desc = (DESC_OF(current_task, current_dep))->task; + current_desc->super.data[0].data_in = current_task->super.data[current_dep].data_out; + (void)parsec_atomic_fetch_inc_int32( &current_task->super.data[current_dep].data_out->readers ); ontask( es, (parsec_task_t *)current_desc, (parsec_task_t *)current_task, &deps, &data, current_task->rank, my_rank, vpid_dst, ontask_arg ); } @@ -2143,6 +2147,12 @@ parsec_dtd_remote_task_release( parsec_dtd_task_t *this_task ) assert( current_flow == 0 ); parsec_dtd_tile_release( tile ); } + if(PARSEC_DTD_BCAST_KEY_TC_ID == this_task->super.task_class->task_class_id) { + assert( current_flow == 0 ); + tile->flushed = FLUSHED; + parsec_dtd_tile_remove( tile->dc, tile->key ); + parsec_dtd_tile_release( tile ); + } } assert(this_task->super.super.super.obj_reference_count == 1); parsec_taskpool_t *tp = this_task->super.taskpool; From 1b7534e2e9a820a42f0ebf8e1fce4969460365aa Mon Sep 17 00:00:00 2001 From: yu-pei Date: Thu, 30 Sep 2021 13:52:26 -0400 Subject: [PATCH 09/41] not involved nodes calling TILE_OF will
insert tile into hash table --- .../superscalar/dtd_test_broadcast_collective.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c index 88add0ea9..34621220a 100644 --- a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c +++ b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c @@ -227,13 +227,17 @@ int test_broadcast_mixed( perr = parsec_context_start(parsec_context); PARSEC_CHECK_ERROR(perr, "parsec_context_start"); - fprintf(stderr, "parsec context started\n"); // Key of tile associated with root node - int key_root = key = A->data_key(A, root, 0); - parsec_dtd_tile_t* dtd_tile_root = PARSEC_DTD_TILE_OF_KEY(A, key_root); - key_root = B->data_key(B, root, 0); - parsec_dtd_tile_t* bcast_keys_root = PARSEC_DTD_TILE_OF_KEY(B, key_root); - + int key_root; + parsec_dtd_tile_t* dtd_tile_root; + parsec_dtd_tile_t* bcast_keys_root; + if(myrank % 2 == 1 || myrank == root) { + key_root = key = A->data_key(A, root, 0); + dtd_tile_root = PARSEC_DTD_TILE_OF_KEY(A, key_root); + key_root = B->data_key(B, root, 0); + bcast_keys_root = PARSEC_DTD_TILE_OF_KEY(B, key_root); + } + // Create array of destination ranks int num_dest_ranks = 0; int *dest_ranks = (int*)malloc(world*sizeof(int)); From de4f29956d4c6eedd56095d2e486e42cc1c6de51 Mon Sep 17 00:00:00 2001 From: yu-pei Date: Fri, 1 Oct 2021 13:45:39 -0400 Subject: [PATCH 10/41] some code clean up --- .../interfaces/superscalar/insert_function.c | 33 ++----------------- 1 file changed, 2 insertions(+), 31 deletions(-) diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index 6fc2f5b58..a6ea57945 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -1686,7 +1686,6 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t 
*es, int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) | *(data_ptr+1+successor)); fprintf(stderr, "bcast root dep %d with chain successor %d\n", current_dep, successor); - (void)parsec_atomic_fetch_inc_int32(&current_task->super.data[current_dep].data_out->readers); tile = FLOW_OF(current_task, current_dep)->tile; parsec_dtd_tile_retain(tile); parsec_remote_dep_activate( @@ -1694,8 +1693,8 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, current_task->deps_out, current_task->deps_out->outgoing_mask); current_task->deps_out = NULL; - parsec_dtd_remote_task_release(this_task); /* decrease the count as in the data flush */ - //parsec_dtd_release_local_task( current_task ); + /* decrease the count as in the data flush */ + parsec_dtd_release_local_task( current_task ); } else if (action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { /* a node in the key array propagation */ @@ -1718,8 +1717,6 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, assert(NULL != current_task->super.data[current_dep].data_out); current_task->deps_out->output[0].data.data = current_task->super.data[0].data_out; - //parsec_atomic_fetch_inc_int32(&current_task->deps_out->pending_ack); - (void)parsec_atomic_fetch_inc_int32( &current_task->super.data[current_dep].data_out->readers ); parsec_dtd_retain_data_copy(current_task->super.data[current_dep].data_out); parsec_remote_dep_activate( es, (parsec_task_t *)current_task, @@ -1727,7 +1724,6 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, current_task->deps_out->outgoing_mask); current_task->deps_out = NULL; parsec_dtd_remote_task_release(this_task); /* decrease the count as in the data flush */ - //parsec_dtd_release_local_task( current_task ); /* releasing the receiver task as the only desc task */ tile = FLOW_OF(current_task, current_dep)->tile; parsec_dtd_tile_retain(tile); @@
-1755,31 +1751,6 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, ontask( es, (parsec_task_t *)current_task, (parsec_task_t *)current_task, &deps, &data, current_task->rank, my_rank, vpid_dst, ontask_arg ); } - // temp fix to ensure descendent exist - //sleep(1); - //op_type_on_current_flow = (FLOW_OF(current_task, current_dep)->op_type & PARSEC_GET_OP_TYPE); - //tile = FLOW_OF(current_task, current_dep)->tile; - - - - - if(action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { - //if(parsec_dtd_task_is_local(current_desc)){ - //(void)parsec_atomic_fetch_inc_int32( &current_task->super.data[current_dep].data_out->readers ); - //} - } - /* Each reader increments the ref count of the data_copy - * We should have a function to retain data copies like - * PARSEC_DATA_COPY_RELEASE - */ - - - //rank_dst = current_desc->rank; - - //ontask( es, (parsec_task_t *)current_desc, (parsec_task_t *)current_task, - // &deps, &data, rank_src, rank_dst, vpid_dst, ontask_arg ); - //vpid_dst = (vpid_dst+1) % current_task->super.taskpool->context->nb_vp; - } } } From 3b2fd373fd8a2bcb34d9d87f62a09384781105a2 Mon Sep 17 00:00:00 2001 From: yu-pei Date: Sun, 3 Oct 2021 15:52:48 -0400 Subject: [PATCH 11/41] BCAST DATA task class piggyback to the normal workflow --- parsec/interfaces/superscalar/collectives.c | 54 +++++++++---------- .../interfaces/superscalar/insert_function.c | 47 +++++++++++++++- .../superscalar/insert_function_internal.h | 3 ++ .../superscalar/overlap_strategies.c | 41 ++++++++++++++ parsec/remote_dep_mpi.c | 1 - .../dtd_test_broadcast_collective.c | 17 +++--- 6 files changed, 124 insertions(+), 39 deletions(-) diff --git a/parsec/interfaces/superscalar/collectives.c b/parsec/interfaces/superscalar/collectives.c index d1d249b64..79ecbd3d3 100644 --- a/parsec/interfaces/superscalar/collectives.c +++ b/parsec/interfaces/superscalar/collectives.c @@ -192,38 +192,32 @@ void parsec_dtd_broadcast( parsec_data_copy_t *key_copy = bcast_keys_root->data_copy; //
Create remote deps corresponding to the braodcast - //parsec_remote_deps_t *deps_0 = parsec_dtd_create_remote_deps( - // myrank, root, data_copy, &parsec_dtd_arenas_datatypes[arena_index], - // dest_ranks, num_dest_ranks); + parsec_remote_deps_t *deps_0 = parsec_dtd_create_remote_deps( + myrank, root, data_copy, &parsec_dtd_arenas_datatypes[arena_index], + dest_ranks, num_dest_ranks); parsec_remote_deps_t *deps_1 = parsec_dtd_create_remote_deps( myrank, root, key_copy, &parsec_dtd_arenas_datatypes[bcast_arena_index], dest_ranks, num_dest_ranks); - //parsec_task_t *bcast_task_root = parsec_dtd_taskpool_create_task( - // taskpool, parsec_dtd_aux_fn2, 0, "bcast_task_root", - // PASSED_BY_REF, dtd_tile_root, PARSEC_INOUT | arena_index, - // sizeof(int), &root, PARSEC_VALUE | PARSEC_AFFINITY, - // PARSEC_DTD_ARG_END); + parsec_task_t *bcast_task_root = parsec_dtd_taskpool_create_task( + taskpool, parsec_dtd_bcast_data_fn, 0, "bcast_data_fn", + PASSED_BY_REF, dtd_tile_root, PARSEC_INOUT | arena_index, + sizeof(int), &root, PARSEC_VALUE | PARSEC_AFFINITY, + PARSEC_DTD_ARG_END); - //parsec_dtd_task_t *dtd_bcast_task_root = (parsec_dtd_task_t *)bcast_task_root; - //dtd_bcast_task_root->super.locals[2].value = -1; - //dtd_bcast_task_root->super.locals[3].value = -1; + parsec_dtd_task_t *dtd_bcast_task_root = (parsec_dtd_task_t *)bcast_task_root; // Set broadcast topology info - //dtd_bcast_task_root->deps_out = NULL; - - //dtd_bcast_task_root->deps_out = deps_0; - - //if(myrank == root) { - // dtd_bcast_task_root->ht_item.key = bcast_id; - // dtd_bcast_task_root->super.locals[0].value = dtd_bcast_task_root->ht_item.key; - //}else{ - // bcast_id = ( (1<<28) | dtd_tp->recv_task_id[root]++); - // dtd_bcast_task_root->ht_item.key = bcast_id; - // dtd_bcast_task_root->super.locals[0].value = dtd_bcast_task_root->ht_item.key; - //} - /* Post the bcast tasks for the actual data */ - //parsec_insert_dtd_task(dtd_bcast_task_root); + dtd_bcast_task_root->deps_out = deps_0; + + 
if(myrank == root) { + dtd_bcast_task_root->ht_item.key = bcast_id; + dtd_bcast_task_root->super.locals[0].value = dtd_bcast_task_root->ht_item.key; + }else{ + bcast_id = ( (1<<28) | dtd_tp->recv_task_id[root]++); + dtd_bcast_task_root->ht_item.key = bcast_id; + dtd_bcast_task_root->super.locals[0].value = dtd_bcast_task_root->ht_item.key; + } parsec_task_t *bcast_key_root = parsec_dtd_taskpool_create_task( taskpool, parsec_dtd_bcast_key_fn, 0, "bcast_key_fn", @@ -234,16 +228,18 @@ void parsec_dtd_broadcast( dtd_bcast_key_root->deps_out = NULL; dtd_bcast_key_root->deps_out = deps_1; if(myrank == root) { - //dtd_bcast_task_root->ht_item.key = bcast_id; - //dtd_bcast_task_root->super.locals[0].value = dtd_bcast_task_root->ht_item.key; + /* nothing here since the key is stored in the key array and will be updated before remote_dep_activate */ }else{ - //bcast_id = ( (1<<29) | (dtd_tp->recv_task_id[root] -1)); - bcast_id = ( (1<<29) | (dtd_tp->recv_task_id[root] )); + bcast_id = ( (1<<29) | (dtd_tp->recv_task_id[root] -1)); + //bcast_id = ( (1<<29) | (dtd_tp->recv_task_id[root] )); dtd_bcast_key_root->ht_item.key = bcast_id; dtd_bcast_key_root->super.locals[0].value = dtd_bcast_key_root->ht_item.key; } /* Post the bcast of keys and ranks array */ parsec_insert_dtd_task(dtd_bcast_key_root); + + /* Post the bcast tasks for the actual data */ + parsec_insert_dtd_task(dtd_bcast_task_root); if(myrank == root) { //for (int dest_rank = 0; dest_rank < num_dest_ranks; ++dest_rank) { diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index a6ea57945..6fe7a8ffa 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -102,7 +102,6 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, uint32_t action_mask, parsec_ontask_function_t *ontask, void *ontask_arg); - static int parsec_dtd_release_deps(parsec_execution_stream_t *, parsec_task_t *, @@ 
-228,6 +227,9 @@ parsec_dtd_enqueue_taskpool(parsec_taskpool_t *tp, void *data) /* The second taskclass of every taskpool is the bcast key array propagation taskclass */ parsec_dtd_create_task_class(dtd_tp, parsec_dtd_bcast_key_fn, "parsec_dtd_bcast_key_fn", 2, sizeof(int), 1); + /* The third taskclass of every taskpool is the bcast taskclass for tile data bcast */ + parsec_dtd_create_task_class(dtd_tp, parsec_dtd_bcast_data_fn, "parsec_dtd_bcast_data_fn", + 2, sizeof(int), 1); return 0; } @@ -1724,6 +1726,31 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, current_task->deps_out->outgoing_mask); current_task->deps_out = NULL; parsec_dtd_remote_task_release(this_task); /* decrease the count as in the data flush */ + + /* update the BCAST DATA task or dep with the global ID that we know now */ + uint64_t key = ((uint64_t)(1<<28 | data_ptr[es->virtual_process->parsec_context->my_rank+1])<<32) | (1U<<0); + uint64_t key2 = ((uint64_t)(data_ptr[0])<<32) | (1U<<0); + parsec_dtd_taskpool_t *tp = (parsec_dtd_taskpool_t *)current_task->super.taskpool; + parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key); + parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key2); + parsec_dtd_task_t * dtd_task = parsec_dtd_find_task(tp, key); + parsec_remote_deps_t *dep = parsec_dtd_find_task(tp, key2); + fprintf(stderr, "iterate successor on rank %d, key2 %d remote dep %p with task %p\n", es->virtual_process->parsec_context->my_rank, data_ptr[0], dep, dtd_task); + populate_remote_deps(data_ptr, dtd_task->deps_out); + parsec_dtd_untrack_task(tp, key); + if(dep == NULL){ + dtd_task->super.locals[0].value = data_ptr[0]; + parsec_dtd_track_task(tp, key2, dtd_task); + }else{ + + dtd_task->super.locals[0].value = data_ptr[0]; + parsec_dtd_untrack_remote_dep(tp, key2); + parsec_dtd_track_task(tp, key2, dtd_task); + remote_dep_dequeue_delayed_dep_release(dep); + } + parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key2); + 
parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key); + /* releasing the receiver task as the only desc task */ tile = FLOW_OF(current_task, current_dep)->tile; parsec_dtd_tile_retain(tile); @@ -2831,6 +2858,24 @@ parsec_dtd_bcast_key_recv( parsec_execution_stream_t *es, parsec_task_t *this_ta return PARSEC_HOOK_RETURN_DONE; } +/* **************************************************************************** */ +/** + * Body of bcast task we insert that will propagate the data tile we are broadcasting + * empty body! + * + * @param context, this_task + * + * @ingroup DTD_INTERFACE_INTERNAL + */ +int +parsec_dtd_bcast_data_fn( parsec_execution_stream_t *es, parsec_task_t *this_task) +{ + (void)es; (void)this_task; + + fprintf(stderr, "bcast_data_fn executed\n"); + return PARSEC_HOOK_RETURN_DONE; +} + int parsec_dtd_schedule_task_if_ready(int satisfied_flow, parsec_dtd_task_t *this_task, parsec_dtd_taskpool_t *dtd_tp, int *vpid) diff --git a/parsec/interfaces/superscalar/insert_function_internal.h b/parsec/interfaces/superscalar/insert_function_internal.h index 823872b45..31d97af65 100644 --- a/parsec/interfaces/superscalar/insert_function_internal.h +++ b/parsec/interfaces/superscalar/insert_function_internal.h @@ -39,6 +39,7 @@ extern int parsec_dtd_dump_traversal_info; /**< For printing traversal info */ #define PARSEC_DTD_FLUSH_TC_ID ((uint8_t)0x00) #define PARSEC_DTD_BCAST_KEY_TC_ID ((uint8_t)0x01) +#define PARSEC_DTD_BCAST_DATA_TC_ID ((uint8_t)0x02) /* To flag the task we are trying to complete as a local one */ #define PARSEC_ACTION_COMPLETE_LOCAL_TASK 0x08000000 @@ -289,6 +290,8 @@ typedef struct parsec_dtd_common_args_s { /* Function prototypes */ int parsec_dtd_bcast_key_fn( parsec_execution_stream_t *es, parsec_task_t *this_task); +int parsec_dtd_bcast_data_fn( parsec_execution_stream_t *es, parsec_task_t *this_task); + int parsec_dtd_bcast_key_recv( parsec_execution_stream_t *es, parsec_task_t *this_task); void diff --git 
a/parsec/interfaces/superscalar/overlap_strategies.c b/parsec/interfaces/superscalar/overlap_strategies.c index 024ca1852..cac4aee31 100644 --- a/parsec/interfaces/superscalar/overlap_strategies.c +++ b/parsec/interfaces/superscalar/overlap_strategies.c @@ -342,6 +342,47 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, } } } while (0 == get_out); + + if(PARSEC_DTD_BCAST_DATA_TC_ID == current_task->super.task_class->task_class_id) { + /* for the bcast data class, in addition to release the data to local deps tasks that will read the data + * propagate the data down to descendants as well */ + if(current_task->deps_out != NULL) { + /* we have not propagate the remote deps yet, otherwise will be set to NULL */ + if(action_mask & PARSEC_ACTION_COMPLETE_LOCAL_TASK) { + assert(NULL != current_task->super.data[current_dep].data_out); + fprintf(stderr, "bcast root data with global key %d\n", current_task->ht_item.key); + current_task->deps_out->output[0].data.data = + current_task->super.data[current_dep].data_out; + //(void)parsec_atomic_fetch_inc_int32(&current_task->super.data[current_dep].data_out->readers); + parsec_remote_dep_activate( + es, (parsec_task_t *)current_task, + current_task->deps_out, + current_task->deps_out->outgoing_mask); + current_task->deps_out = NULL; + } else if(action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { + /* current node is part of the broadcast operation, propagate downstream */ + int root = current_task->deps_out->root; + int my_rank = current_task->super.taskpool->context->my_rank; + int _array_pos, _array_mask; + struct remote_dep_output_param_s* output; + output = &current_task->deps_out->output[0]; + _array_pos = my_rank / (8 * sizeof(uint32_t)); + _array_mask = 1 << (my_rank % (8 * sizeof(uint32_t))); + + if ((output->rank_bits[_array_pos] & _array_mask)) { + assert(NULL != current_task->super.data[current_dep].data_out); + + current_task->deps_out->output[0].data.data = + current_task->super.data[0].data_out; +
parsec_remote_dep_activate( + es, (parsec_task_t *)current_task, + current_task->deps_out, + current_task->deps_out->outgoing_mask); + current_task->deps_out = NULL; + } + } + } + } /* BCAST DATA propagation */ } } } diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index d35da7c5e..d2783f9af 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -730,7 +730,6 @@ remote_dep_get_datatypes(parsec_execution_stream_t* es, parsec_dtd_task_t *dtd_task = NULL; dtd_tp = (parsec_dtd_taskpool_t *)origin->taskpool; - fprintf(stderr, "working in get datatype\n"); /* if( NULL == task.task_class ), this case will be taken care of automatically */ /* We need to convert from a dep_datatype_index mask into a dep_index diff --git a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c index 34621220a..35ea089ac 100644 --- a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c +++ b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c @@ -270,21 +270,22 @@ int test_broadcast_mixed( // // Retrieve value of broadcasted data // - if(myrank % 2 == 1 || myrank == root) { - for (int rank = 0; rank < world; ++rank) { + //if(myrank % 2 == 1 || myrank == root) { + if(myrank % 2 == 1) { + //for (int rank = 0; rank < world; ++rank) { - if (rank % 2 == 0 || rank == root) continue; + //if (rank % 2 == 0 || rank == root) continue; parsec_task_t *retrieve_task = parsec_dtd_taskpool_create_task( dtd_tp, retrieve_task_fn, 0, "retrieve_task", PASSED_BY_REF, dtd_tile_root, PARSEC_INPUT | TILE_FULL, - sizeof(int), &rank, PARSEC_VALUE | PARSEC_AFFINITY, + sizeof(int), &myrank, PARSEC_VALUE | PARSEC_AFFINITY, sizeof(int*), &data_value_out, PARSEC_VALUE, PARSEC_DTD_ARG_END); - //parsec_dtd_task_t *dtd_retrieve_task = (parsec_dtd_task_t *)retrieve_task; - //parsec_insert_dtd_task(retrieve_task); + parsec_dtd_task_t *dtd_retrieve_task = (parsec_dtd_task_t *)retrieve_task; + 
parsec_insert_dtd_task(retrieve_task); - } + //} } for(int iter=1; iter <= 0; iter++) { // Second round of broadcast, create another array of keys for this bcast @@ -407,7 +408,7 @@ int main(int argc, char **argv) { // Root node for the broadcast operation - sleep(30); + //sleep(30); // // Simple broadcast From 0728ed36f18080e5c43385feeee5d53d6a9de06f Mon Sep 17 00:00:00 2001 From: yu-pei Date: Tue, 5 Oct 2021 16:09:33 -0400 Subject: [PATCH 12/41] clean up and add previous potrf test case --- parsec/interfaces/superscalar/collectives.c | 46 -- .../interfaces/superscalar/insert_function.c | 7 +- .../superscalar/testing_zpotrf_dtd.c | 488 ++++++++++++++++++ 3 files changed, 493 insertions(+), 48 deletions(-) create mode 100644 tests/interfaces/superscalar/testing_zpotrf_dtd.c diff --git a/parsec/interfaces/superscalar/collectives.c b/parsec/interfaces/superscalar/collectives.c index 79ecbd3d3..e1f5e5613 100644 --- a/parsec/interfaces/superscalar/collectives.c +++ b/parsec/interfaces/superscalar/collectives.c @@ -109,52 +109,6 @@ int remote_deps_free_if_empty(parsec_remote_deps_t* deps) { return ret; } -/* -static -int parsec_dtd_bcast_task_fn( - parsec_execution_stream_t *es, - parsec_task_t *this_task) { - (void)es; - - // parsec_dtd_task_t *this_dtd_task = (parsec_dtd_task_t *)this_task; - - // INPUT or INOUT data - void *val_in; - // Root index - int root_in; - // Task rank - int dest_rank; - - printf("[parsec_dtd_bcast_task_fn]\n"); - - return PARSEC_HOOK_RETURN_DONE; -} - -int parsec_dtd_aux_fn( - parsec_execution_stream_t *es, - parsec_task_t *this_task) { - (void)es; - // INPUT data - int *val_in; - int *val_bcast; - // Task rank - int dest_rank; - - - parsec_dtd_unpack_args(this_task, &val_bcast, &dest_rank); - fprintf(stderr, "aux_fn on rank %d value %d\n", es->virtual_process->parsec_context->my_rank, *val_bcast); - return PARSEC_HOOK_RETURN_DONE; -} - -int parsec_dtd_aux_fn2( - parsec_execution_stream_t *es, - parsec_task_t *this_task) { - (void)es; - - 
return PARSEC_HOOK_RETURN_DONE; -} -*/ - /** * Perform a broadcast for of the dtd tile `dtd_tile_root` from the * root node associated with the rank `root` to the nodes with ranks diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index 6fe7a8ffa..88c6335d3 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -1730,10 +1730,13 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, /* update the BCAST DATA task or dep with the global ID that we know now */ uint64_t key = ((uint64_t)(1<<28 | data_ptr[es->virtual_process->parsec_context->my_rank+1])<<32) | (1U<<0); uint64_t key2 = ((uint64_t)(data_ptr[0])<<32) | (1U<<0); + parsec_dtd_task_t* dtd_task = NULL; parsec_dtd_taskpool_t *tp = (parsec_dtd_taskpool_t *)current_task->super.taskpool; + while(dtd_task == NULL) + dtd_task = parsec_dtd_find_task(tp, key); parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key); parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key2); - parsec_dtd_task_t * dtd_task = parsec_dtd_find_task(tp, key); + //parsec_dtd_task_t* dtd_task = parsec_dtd_find_task(tp, key); parsec_remote_deps_t *dep = parsec_dtd_find_task(tp, key2); fprintf(stderr, "iterate successor on rank %d, key2 %d remote dep %p with task %p\n", es->virtual_process->parsec_context->my_rank, data_ptr[0], dep, dtd_task); populate_remote_deps(data_ptr, dtd_task->deps_out); @@ -2626,7 +2629,7 @@ parsec_dtd_set_descendant(parsec_dtd_task_t *parent_task, uint8_t parent_flow_in } } else { if( !(flow->flags & TASK_INSERTED) ) { - assert(dep->from == real_parent_task->rank); + //assert(dep->from == real_parent_task->rank); flow->flags |= TASK_INSERTED; parsec_dtd_untrack_remote_dep( tp, key ); #if defined(PARSEC_PROF_TRACE) diff --git a/tests/interfaces/superscalar/testing_zpotrf_dtd.c b/tests/interfaces/superscalar/testing_zpotrf_dtd.c new file mode 100644 index 
000000000..afee72517 --- /dev/null +++ b/tests/interfaces/superscalar/testing_zpotrf_dtd.c @@ -0,0 +1,488 @@ +/* + * Copyright (c) 2013-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * + * @precisions normal z -> s d c + * + */ + +#include "common.h" +#include "flops.h" +#include "dplasma/types.h" +#include "parsec/data_dist/matrix/sym_two_dim_rectangle_cyclic.h" +#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" +#include "parsec/interfaces/superscalar/insert_function.h" +#include "parsec/interfaces/superscalar/insert_function_internal.h" + +enum regions { + TILE_FULL, + }; + +int +parsec_core_potrf(parsec_execution_stream_t *es, parsec_task_t *this_task) +{ + (void)es; + int uplo; + int m, lda, *info; + dplasma_complex64_t *A; + + parsec_dtd_unpack_args(this_task, &uplo, &m, &A, &lda, &info); + + CORE_zpotrf(uplo, m, A, lda, info); + + return PARSEC_HOOK_RETURN_DONE; +} + +int +parsec_core_trsm(parsec_execution_stream_t *es, parsec_task_t *this_task) +{ + (void)es; + int side, uplo, trans, diag; + int m, n, lda, ldc; + dplasma_complex64_t alpha; + dplasma_complex64_t *A, *C; + + parsec_dtd_unpack_args(this_task, &side, &uplo, &trans, &diag, &m, &n, + &alpha, &A, &lda, &C, &ldc); + //fprintf(stderr, "core_trsm executed\n"); + + CORE_ztrsm(side, uplo, trans, diag, + m, n, alpha, + A, lda, + C, ldc); + + return PARSEC_HOOK_RETURN_DONE; +} + +int +parsec_core_herk(parsec_execution_stream_t *es, parsec_task_t *this_task) +{ + (void)es; + int uplo, trans; + int m, n, lda, ldc; + dplasma_complex64_t alpha; + dplasma_complex64_t beta; + dplasma_complex64_t *A; + dplasma_complex64_t *C; + + parsec_dtd_unpack_args(this_task, &uplo, &trans, &m, &n, &alpha, &A, + &lda, &beta, &C, &ldc); + + CORE_zherk(uplo, trans, m, n, + alpha, A, lda, + beta, C, ldc); + + return PARSEC_HOOK_RETURN_DONE; +} + +int +parsec_core_gemm(parsec_execution_stream_t *es, parsec_task_t *this_task) +{ + (void)es; + int 
transA, transB; + int m, n, k, lda, ldb, ldc; + dplasma_complex64_t alpha, beta; + dplasma_complex64_t *A; + dplasma_complex64_t *B; + dplasma_complex64_t *C; + + parsec_dtd_unpack_args(this_task, &transA, &transB, &m, &n, &k, &alpha, + &A, &lda, &B, &ldb, &beta, &C, &ldc); + + CORE_zgemm(transA, transB, + m, n, k, + alpha, A, lda, + B, ldb, + beta, C, ldc); + + return PARSEC_HOOK_RETURN_DONE; +} + +int main(int argc, char **argv) +{ + parsec_context_t* parsec; + int iparam[IPARAM_SIZEOF]; + int uplo = dplasmaUpper; + int info = 0; + int ret = 0; + + int m, n, k, total; /* loop counter */ + /* Parameters passed on to Insert_task() */ + int tempkm, tempmm, ldak, ldam, side, transA_p, transA_g, diag, trans, transB, ldan; + dplasma_complex64_t alpha_trsm, alpha_herk, beta; + + /* Set defaults for non argv iparams */ + iparam_default_facto(iparam); + iparam_default_ibnbmb(iparam, 0, 180, 180); + iparam[IPARAM_NGPUS] = DPLASMA_ERR_NOT_SUPPORTED; + + /* Initialize PaRSEC */ + parsec = setup_parsec(argc, argv, iparam); + PASTE_CODE_IPARAM_LOCALS(iparam); + PASTE_CODE_FLOPS(FLOPS_ZPOTRF, ((DagDouble_t)N)); + + /* initializing matrix structure */ + LDA = dplasma_imax( LDA, N ); + LDB = dplasma_imax( LDB, N ); + KP = 1; + KQ = 1; + + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + sym_two_dim_block_cyclic, (&dcA, matrix_ComplexDouble, + rank, MB, NB, LDA, N, 0, 0, + N, N, P, nodes/P, uplo)); + + /* Initializing dc for dtd */ + sym_two_dim_block_cyclic_t *__dcA = &dcA; + parsec_dtd_data_collection_init((parsec_data_collection_t *)&dcA); + + /* matrix generation */ + if(loud > 3) printf("+++ Generate matrices ... 
"); + dplasma_zplghe( parsec, (double)(N), uplo, + (parsec_tiled_matrix_dc_t *)&dcA, random_seed); + if(loud > 3) printf("Done\n"); + + /* Getting new parsec handle of dtd type */ + parsec_taskpool_t *dtd_tp = parsec_dtd_taskpool_new(); + + /* Allocating data arrays to be used by comm engine */ + dplasma_add2arena_tile( &parsec_dtd_arenas_datatypes[TILE_FULL], + dcA.super.mb*dcA.super.nb*sizeof(dplasma_complex64_t), + PARSEC_ARENA_ALIGNMENT_SSE, + parsec_datatype_double_complex_t, dcA.super.mb ); + + /* Registering the handle with parsec context */ + parsec_context_add_taskpool( parsec, dtd_tp ); + + //sleep(40); + SYNC_TIME_START(); + + /* #### parsec context Starting #### */ + + /* start parsec context */ + parsec_context_start( parsec ); + + if( dplasmaLower == uplo ) { + + side = dplasmaRight; + transA_p = dplasmaConjTrans; + diag = dplasmaNonUnit; + alpha_trsm = 1.0; + trans = dplasmaNoTrans; + alpha_herk = -1.0; + beta = 1.0; + transB = dplasmaConjTrans; + transA_g = dplasmaNoTrans; + + total = dcA.super.mt; + /* Testing Insert Function */ + for( k = 0; k < total; k++ ) { + tempkm = (k == (dcA.super.mt - 1)) ? dcA.super.m - k * dcA.super.mb : dcA.super.mb; + ldak = BLKLDD(&dcA.super, k); + + parsec_dtd_taskpool_insert_task( dtd_tp, parsec_core_potrf, + (total - k) * (total-k) * (total - k)/*priority*/, "Potrf", + sizeof(int), &uplo, PARSEC_VALUE, + sizeof(int), &tempkm, PARSEC_VALUE, + PASSED_BY_REF, PARSEC_DTD_TILE_OF(A, k, k), PARSEC_INOUT | TILE_FULL | PARSEC_AFFINITY, + sizeof(int), &ldak, PARSEC_VALUE, + sizeof(int *), &info, PARSEC_SCRATCH, + PARSEC_DTD_ARG_END ); + + for( m = k+1; m < total; m++ ) { + tempmm = m == dcA.super.mt - 1 ? 
dcA.super.m - m * dcA.super.mb : dcA.super.mb; + ldam = BLKLDD(&dcA.super, m); + parsec_dtd_taskpool_insert_task( dtd_tp, parsec_core_trsm, + (total - m) * (total-m) * (total - m) + 3 * ((2 * total) - k - m - 1) * (m - k)/*priority*/, "Trsm", + sizeof(int), &side, PARSEC_VALUE, + sizeof(int), &uplo, PARSEC_VALUE, + sizeof(int), &transA_p, PARSEC_VALUE, + sizeof(int), &diag, PARSEC_VALUE, + sizeof(int), &tempmm, PARSEC_VALUE, + sizeof(int), &dcA.super.nb, PARSEC_VALUE, + sizeof(dplasma_complex64_t), &alpha_trsm, PARSEC_VALUE, + PASSED_BY_REF, PARSEC_DTD_TILE_OF(A, k, k), PARSEC_INPUT | TILE_FULL, + sizeof(int), &ldak, PARSEC_VALUE, + PASSED_BY_REF, PARSEC_DTD_TILE_OF(A, m, k), PARSEC_INOUT | TILE_FULL | PARSEC_AFFINITY, + sizeof(int), &ldam, PARSEC_VALUE, + PARSEC_DTD_ARG_END ); + } + parsec_dtd_data_flush( dtd_tp, PARSEC_DTD_TILE_OF(A, k, k) ); + + for( m = k+1; m < dcA.super.nt; m++ ) { + tempmm = m == dcA.super.mt - 1 ? dcA.super.m - m * dcA.super.mb : dcA.super.mb; + ldam = BLKLDD(&dcA.super, m); + parsec_dtd_taskpool_insert_task( dtd_tp, parsec_core_herk, + (total - m) * (total - m) * (total - m) + 3 * (m - k)/*priority*/, "Herk", + sizeof(int), &uplo, PARSEC_VALUE, + sizeof(int), &trans, PARSEC_VALUE, + sizeof(int), &tempmm, PARSEC_VALUE, + sizeof(int), &dcA.super.mb, PARSEC_VALUE, + sizeof(dplasma_complex64_t), &alpha_herk, PARSEC_VALUE, + PASSED_BY_REF, PARSEC_DTD_TILE_OF(A, m, k), PARSEC_INPUT | TILE_FULL, + sizeof(int), &ldam, PARSEC_VALUE, + sizeof(dplasma_complex64_t), &beta, PARSEC_VALUE, + PASSED_BY_REF, PARSEC_DTD_TILE_OF(A, m, m), PARSEC_INOUT | TILE_FULL | PARSEC_AFFINITY, + sizeof(int), &ldam, PARSEC_VALUE, + PARSEC_DTD_ARG_END ); + + for( n = m+1; n < total; n++ ) { + ldan = BLKLDD(&dcA.super, n); + parsec_dtd_taskpool_insert_task( dtd_tp, parsec_core_gemm, + (total - m) * (total - m) * (total - m) + 3 * ((2 * total) - m - n - 3) * (m - n) + 6 * (m - k) /*priority*/, "Gemm", + sizeof(int), &transA_g, PARSEC_VALUE, + sizeof(int), &transB, 
PARSEC_VALUE, + sizeof(int), &tempmm, PARSEC_VALUE, + sizeof(int), &dcA.super.mb, PARSEC_VALUE, + sizeof(int), &dcA.super.mb, PARSEC_VALUE, + sizeof(dplasma_complex64_t), &alpha_herk, PARSEC_VALUE, + PASSED_BY_REF, PARSEC_DTD_TILE_OF(A, n, k), PARSEC_INPUT | TILE_FULL, + sizeof(int), &ldan, PARSEC_VALUE, + PASSED_BY_REF, PARSEC_DTD_TILE_OF(A, m, k), PARSEC_INPUT | TILE_FULL, + sizeof(int), &ldam, PARSEC_VALUE, + sizeof(dplasma_complex64_t), &beta, PARSEC_VALUE, + PASSED_BY_REF, PARSEC_DTD_TILE_OF(A, n, m), PARSEC_INOUT | TILE_FULL | PARSEC_AFFINITY, + sizeof(int), &ldan, PARSEC_VALUE, + PARSEC_DTD_ARG_END ); + } + parsec_dtd_data_flush( dtd_tp, PARSEC_DTD_TILE_OF(A, m, k) ); + } + } + } else { + side = dplasmaLeft; + transA_p = dplasmaConjTrans; + diag = dplasmaNonUnit; + alpha_trsm = 1.0; + trans = dplasmaConjTrans; + alpha_herk = -1.0; + beta = 1.0; + transB = dplasmaNoTrans; + transA_g = dplasmaConjTrans; + + total = dcA.super.nt; + + /* Variables used for collective */ + int root, num_dest_ranks, dest_rank_idx, flag; + int *dest_ranks = (int*)malloc((P+Q)*sizeof(int));; + for( k = 0; k < total; k++ ) { + tempkm = k == dcA.super.nt-1 ? 
dcA.super.n-k*dcA.super.nb : dcA.super.nb; + ldak = BLKLDD(&dcA.super, k); + if(parsec_dtd_rank_of_data(&dcA.super.super, k, k) == rank) { + //fprintf(stderr, "Inserting and executing potrf[%d %d] in rank: %d\n", k, k, rank); + parsec_dtd_taskpool_insert_task( dtd_tp, parsec_core_potrf, + (total - k) * (total-k) * (total - k)/*priority*/, "Potrf", + sizeof(int), &uplo, PARSEC_VALUE, + sizeof(int), &tempkm, PARSEC_VALUE, + PASSED_BY_REF, PARSEC_DTD_TILE_OF(A, k, k), PARSEC_INOUT | TILE_FULL | PARSEC_AFFINITY, + sizeof(int), &ldak, PARSEC_VALUE, + sizeof(int *), &info, PARSEC_SCRATCH, + PARSEC_DTD_ARG_END ); + } + + /* + * Broadcast the diagonal tile to the current panel + */ + root = parsec_dtd_rank_of_data(&dcA.super.super, k, k); + num_dest_ranks = Q -1; + //int *dest_ranks = (int*)malloc(num_dest_ranks*sizeof(int)); + dest_rank_idx = 0; + flag = 0; + for(int m = k+1; m < total; m++) { + int tile_rank = parsec_dtd_rank_of_data(&dcA.super.super, k, m); + if(tile_rank == root) {flag = 1; continue;} + dest_ranks[dest_rank_idx] = tile_rank; + if(tile_rank == rank) flag = 1; + ++dest_rank_idx; + if(dest_rank_idx == Q-1) break; /* this is to populate the destination ranks */ + } + + parsec_dtd_tile_t* dtd_tile_root = PARSEC_DTD_TILE_OF(A, k, k); + if( ( flag || (rank == root) ) && ( dest_rank_idx >= 1) ) { + //fprintf(stderr, "Broadcasting PO tile to TRSM. k %d, rank %d, root %d\n", k, rank, root); + int bcast_id = (1<<29) | (k*total+k+k); /* m*nt+n+k */ + parsec_dtd_broadcast_id( + bcast_id, + dtd_tp, rank, root, + dtd_tile_root, TILE_FULL, + dest_ranks, dest_rank_idx); + } + + for( m = k+1; m < total; m++ ) { + tempmm = m == dcA.super.nt-1 ? 
dcA.super.n-m*dcA.super.nb : dcA.super.nb; + if( (parsec_dtd_rank_of_data(&dcA.super.super, k, m) == rank || parsec_dtd_rank_of_data(&dcA.super.super, k, k) == rank )) { + //fprintf(stderr, "Inserting trsm[%d %d][%d %d] in rank: %d owned: %d\n", k, k, k, m, rank, parsec_dtd_rank_of_data(&dcA.super.super, k, m)); + parsec_dtd_taskpool_insert_task( dtd_tp, parsec_core_trsm, + (total - m) * (total-m) * (total - m) + 3 * ((2 * total) - k - m - 1) * (m - k)/*priority*/, "Trsm", + sizeof(int), &side, PARSEC_VALUE, + sizeof(int), &uplo, PARSEC_VALUE, + sizeof(int), &transA_p, PARSEC_VALUE, + sizeof(int), &diag, PARSEC_VALUE, + sizeof(int), &dcA.super.nb, PARSEC_VALUE, + sizeof(int), &tempmm, PARSEC_VALUE, + sizeof(dplasma_complex64_t), &alpha_trsm, PARSEC_VALUE, + PASSED_BY_REF, PARSEC_DTD_TILE_OF(A, k, k), PARSEC_INPUT | TILE_FULL, + sizeof(int), &ldak, PARSEC_VALUE, + PASSED_BY_REF, PARSEC_DTD_TILE_OF(A, k, m), PARSEC_INOUT | TILE_FULL | PARSEC_AFFINITY, + sizeof(int), &ldak, PARSEC_VALUE, + PARSEC_DTD_ARG_END ); + } + + /* + * Broadcast the TRSM tile to the descendant SYRK/GEMM tasks + */ + root = parsec_dtd_rank_of_data(&dcA.super.super, k, m); + num_dest_ranks = P+Q -1; + //int *dest_ranks = (int*)malloc(num_dest_ranks*sizeof(int)); + dest_rank_idx = 0; + flag = 0; + /* Loop over P and Q processes to gather the broadcast destinations */ + for(int i = k+1; i <= m; i++) { + int tile_rank = parsec_dtd_rank_of_data(&dcA.super.super, i, m); + if(tile_rank == root) {break;} /* we have loop over all the ranks in the column */ + dest_ranks[dest_rank_idx] = tile_rank; + if(tile_rank == rank) flag = 1; /* flip the flag for the owner of the tile */ + ++dest_rank_idx; + } + int diag_rank = parsec_dtd_rank_of_data(&dcA.super.super, m, m); + for(int j = m+1; j < total; j++) { + int tile_rank = parsec_dtd_rank_of_data(&dcA.super.super, m, j); + if(tile_rank == diag_rank) {break;} /* we have loop over all the ranks in the column */ + dest_ranks[dest_rank_idx] = tile_rank; + 
if(tile_rank == rank) flag = 1; /* flip the flag for the owner of the tile */ + ++dest_rank_idx; + } + + parsec_dtd_tile_t* dtd_tile_root = PARSEC_DTD_TILE_OF(A, k, m); + if( ( flag || (rank == root) ) && ( dest_rank_idx >= 1) ) { + //fprintf(stderr, "Broadcasting TRSM tile to SYRK and GEMM. k %d, m %d, rank %d, root %d\n", k, m, rank, root); + int bcast_id = (1<<29) | (k*total+m+k); /* m*nt+n+k */ + parsec_dtd_broadcast_id( + bcast_id, + dtd_tp, rank, root, + dtd_tile_root, TILE_FULL, + dest_ranks, dest_rank_idx); + } + } + parsec_dtd_data_flush( dtd_tp, PARSEC_DTD_TILE_OF(A, k, k) ); + + for( m = k+1; m < dcA.super.mt; m++ ) { + tempmm = m == dcA.super.nt-1 ? dcA.super.n-m*dcA.super.nb : dcA.super.nb; + ldam = BLKLDD(&dcA.super, m); + if(parsec_dtd_rank_of_data(&dcA.super.super, m, m) == rank || parsec_dtd_rank_of_data(&dcA.super.super, k, m) == rank ) { + parsec_dtd_taskpool_insert_task( dtd_tp, parsec_core_herk, + (total - m) * (total - m) * (total - m) + 3 * (m - k)/*priority*/, "Herk", + sizeof(int), &uplo, PARSEC_VALUE, + sizeof(int), &trans, PARSEC_VALUE, + sizeof(int), &tempmm, PARSEC_VALUE, + sizeof(int), &dcA.super.mb, PARSEC_VALUE, + sizeof(dplasma_complex64_t), &alpha_herk, PARSEC_VALUE, + PASSED_BY_REF, PARSEC_DTD_TILE_OF(A, k, m), PARSEC_INPUT | TILE_FULL, + sizeof(int), &ldak, PARSEC_VALUE, + sizeof(dplasma_complex64_t), &beta, PARSEC_VALUE, + PASSED_BY_REF, PARSEC_DTD_TILE_OF(A, m, m), PARSEC_INOUT | TILE_FULL | PARSEC_AFFINITY, + sizeof(int), &ldam, PARSEC_VALUE, + PARSEC_DTD_ARG_END ); + } + for( n = m+1; n < total; n++ ) { + ldan = BLKLDD(&dcA.super, n); + if(parsec_dtd_rank_of_data(&dcA.super.super, m, n) == rank || parsec_dtd_rank_of_data(&dcA.super.super, k, m) == rank || parsec_dtd_rank_of_data(&dcA.super.super, k, n) == rank) { + parsec_dtd_taskpool_insert_task( dtd_tp, parsec_core_gemm, + (total - m) * (total - m) * (total - m) + 3 * ((2 * total) - m - n - 3) * (m - n) + 6 * (m - k) /*priority*/, "Gemm", + sizeof(int), &transA_g, 
PARSEC_VALUE, + sizeof(int), &transB, PARSEC_VALUE, + sizeof(int), &dcA.super.mb, PARSEC_VALUE, + sizeof(int), &tempmm, PARSEC_VALUE, + sizeof(int), &dcA.super.mb, PARSEC_VALUE, + sizeof(dplasma_complex64_t), &alpha_herk, PARSEC_VALUE, + PASSED_BY_REF, PARSEC_DTD_TILE_OF(A, k, m), PARSEC_INPUT | TILE_FULL, + sizeof(int), &ldak, PARSEC_VALUE, + PASSED_BY_REF, PARSEC_DTD_TILE_OF(A, k, n), PARSEC_INPUT | TILE_FULL, + sizeof(int), &ldak, PARSEC_VALUE, + sizeof(dplasma_complex64_t), &beta, PARSEC_VALUE, + PASSED_BY_REF, PARSEC_DTD_TILE_OF(A, m, n), PARSEC_INOUT | TILE_FULL | PARSEC_AFFINITY, + sizeof(int), &ldan, PARSEC_VALUE, + PARSEC_DTD_ARG_END ); + } + } + parsec_dtd_data_flush( dtd_tp, PARSEC_DTD_TILE_OF(A, k, m) ); + } + } + } + + parsec_dtd_data_flush_all( dtd_tp, (parsec_data_collection_t *)&dcA ); + + /* finishing all the tasks inserted, but not finishing the handle */ + parsec_dtd_taskpool_wait( dtd_tp ); + + /* Waiting on all handle and turning everything off for this context */ + parsec_context_wait( parsec ); + + /* #### PaRSEC context is done #### */ + + SYNC_TIME_PRINT(rank, ("\tPxQ= %3d %-3d NB= %4d N= %7d : %14f gflops\n", + P, Q, NB, N, + gflops=(flops/1e9)/sync_time_elapsed)); + + /* Cleaning up the parsec handle */ + parsec_taskpool_free( dtd_tp ); + + if( 0 == rank && info != 0 ) { + printf("-- Factorization is suspicious (info = %d) ! \n", info); + ret |= 1; + } + if( !info && check ) { + /* Check the factorization */ + PASTE_CODE_ALLOCATE_MATRIX(dcA0, check, + sym_two_dim_block_cyclic, (&dcA0, matrix_ComplexDouble, + rank, MB, NB, LDA, N, 0, 0, + N, N, P, nodes/P, uplo)); + dplasma_zplghe( parsec, (double)(N), uplo, + (parsec_tiled_matrix_dc_t *)&dcA0, random_seed); + + ret |= check_zpotrf( parsec, (rank == 0) ? 
loud : 0, uplo, + (parsec_tiled_matrix_dc_t *)&dcA, + (parsec_tiled_matrix_dc_t *)&dcA0); + + /* Check the solution */ + PASTE_CODE_ALLOCATE_MATRIX(dcB, check, + two_dim_block_cyclic, (&dcB, matrix_ComplexDouble, matrix_Tile, + rank, MB, NB, LDB, NRHS, 0, 0, + N, NRHS, P, nodes/P, KP, KQ, IP, JQ)); + dplasma_zplrnt( parsec, 0, (parsec_tiled_matrix_dc_t *)&dcB, random_seed+1); + + PASTE_CODE_ALLOCATE_MATRIX(dcX, check, + two_dim_block_cyclic, (&dcX, matrix_ComplexDouble, matrix_Tile, + rank, MB, NB, LDB, NRHS, 0, 0, + N, NRHS, P, nodes/P, KP, KQ, IP, JQ)); + dplasma_zlacpy( parsec, dplasmaUpperLower, + (parsec_tiled_matrix_dc_t *)&dcB, (parsec_tiled_matrix_dc_t *)&dcX ); + + dplasma_zpotrs(parsec, uplo, + (parsec_tiled_matrix_dc_t *)&dcA, + (parsec_tiled_matrix_dc_t *)&dcX ); + + ret |= check_zaxmb( parsec, (rank == 0) ? loud : 0, uplo, + (parsec_tiled_matrix_dc_t *)&dcA0, + (parsec_tiled_matrix_dc_t *)&dcB, + (parsec_tiled_matrix_dc_t *)&dcX); + + /* Cleanup */ + parsec_data_free(dcA0.mat); dcA0.mat = NULL; + parsec_tiled_matrix_dc_destroy( (parsec_tiled_matrix_dc_t*)&dcA0 ); + parsec_data_free(dcB.mat); dcB.mat = NULL; + parsec_tiled_matrix_dc_destroy( (parsec_tiled_matrix_dc_t*)&dcB ); + parsec_data_free(dcX.mat); dcX.mat = NULL; + parsec_tiled_matrix_dc_destroy( (parsec_tiled_matrix_dc_t*)&dcX ); + } + + /* Cleaning data arrays we allocated for communication */ + dplasma_matrix_del2arena( &parsec_dtd_arenas_datatypes[TILE_FULL] ); + parsec_dtd_data_collection_fini( (parsec_data_collection_t *)&dcA ); + + parsec_data_free(dcA.mat); dcA.mat = NULL; + parsec_tiled_matrix_dc_destroy( (parsec_tiled_matrix_dc_t*)&dcA); + + cleanup_parsec(parsec, iparam); + return ret; +} From ed5a53a5d07f2afe5be1c613658aafa3b51c2039 Mon Sep 17 00:00:00 2001 From: yu-pei Date: Sat, 9 Oct 2021 16:15:51 -0400 Subject: [PATCH 13/41] key array propagate will need root rank info as part of key as well --- parsec/interfaces/superscalar/collectives.c | 4 +- 
.../interfaces/superscalar/insert_function.c | 13 ++++- .../superscalar/insert_function_internal.h | 1 - .../superscalar/parsec_dtd_data_flush.c | 1 - parsec/scheduling.c | 1 - .../superscalar/testing_zpotrf_dtd.c | 55 ++++++++++++++----- 6 files changed, 53 insertions(+), 22 deletions(-) diff --git a/parsec/interfaces/superscalar/collectives.c b/parsec/interfaces/superscalar/collectives.c index e1f5e5613..0a845570d 100644 --- a/parsec/interfaces/superscalar/collectives.c +++ b/parsec/interfaces/superscalar/collectives.c @@ -168,7 +168,7 @@ void parsec_dtd_broadcast( dtd_bcast_task_root->ht_item.key = bcast_id; dtd_bcast_task_root->super.locals[0].value = dtd_bcast_task_root->ht_item.key; }else{ - bcast_id = ( (1<<28) | dtd_tp->recv_task_id[root]++); + bcast_id = ( (1<<28) | (root << 18) | dtd_tp->recv_task_id[root]++); dtd_bcast_task_root->ht_item.key = bcast_id; dtd_bcast_task_root->super.locals[0].value = dtd_bcast_task_root->ht_item.key; } @@ -184,7 +184,7 @@ void parsec_dtd_broadcast( if(myrank == root) { /* nothing here since the key is stored in the key array and will be updated before remote_dep_activate */ }else{ - bcast_id = ( (1<<29) | (dtd_tp->recv_task_id[root] -1)); + bcast_id = ( (1<<29) | (root << 18) | (dtd_tp->recv_task_id[root] -1)); //bcast_id = ( (1<<29) | (dtd_tp->recv_task_id[root] )); dtd_bcast_key_root->ht_item.key = bcast_id; dtd_bcast_key_root->super.locals[0].value = dtd_bcast_key_root->ht_item.key; diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index 88c6335d3..0270fb30a 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -1686,7 +1686,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, /* root of the bcast key */ successor = get_chain_successor(es, current_task, current_task->deps_out); int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); - 
current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) | *(data_ptr+1+successor)); + current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) |(current_task->deps_out->root << 18) | *(data_ptr+1+successor)); fprintf(stderr, "bcast root dep %d with chain successor %d\n", current_dep, successor); tile = FLOW_OF(current_task, current_dep)->tile; parsec_dtd_tile_retain(tile); @@ -1715,7 +1715,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, populate_remote_deps(data_ptr, current_task->deps_out); successor = get_chain_successor(es, current_task, current_task->deps_out); fprintf(stderr, "continuation with chain successor %d\n", successor); - current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) | *(data_ptr+1+successor)); + current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) | (root << 18) | *(data_ptr+1+successor)); assert(NULL != current_task->super.data[current_dep].data_out); current_task->deps_out->output[0].data.data = current_task->super.data[0].data_out; @@ -1728,7 +1728,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, parsec_dtd_remote_task_release(this_task); /* decrease the count as in the data flush */ /* update the BCAST DATA task or dep with the global ID that we know now */ - uint64_t key = ((uint64_t)(1<<28 | data_ptr[es->virtual_process->parsec_context->my_rank+1])<<32) | (1U<<0); + uint64_t key = ((uint64_t)(1<<28 | (root << 18 ) | data_ptr[es->virtual_process->parsec_context->my_rank+1])<<32) | (1U<<0); uint64_t key2 = ((uint64_t)(data_ptr[0])<<32) | (1U<<0); parsec_dtd_task_t* dtd_task = NULL; parsec_dtd_taskpool_t *tp = (parsec_dtd_taskpool_t *)current_task->super.taskpool; @@ -3563,3 +3563,10 @@ parsec_dtd_get_taskpool(parsec_task_t *this_task) { return this_task->taskpool; } + +int +parsec_dtd_rank_of_data(parsec_dc_t *dc, int i, int j) +{ + parsec_data_key_t key = dc->data_key(dc, i, j); + return 
dc->rank_of_key(dc, key); +} diff --git a/parsec/interfaces/superscalar/insert_function_internal.h b/parsec/interfaces/superscalar/insert_function_internal.h index 31d97af65..66647ede6 100644 --- a/parsec/interfaces/superscalar/insert_function_internal.h +++ b/parsec/interfaces/superscalar/insert_function_internal.h @@ -119,7 +119,6 @@ typedef struct parsec_dtd_min_flow_info_s { 3 release remote data */ parsec_dtd_tile_t *tile; - int msg_keys[MAX_RANK_INFO]; /* enable user trimming, store dest rank send ID for a flow */ } parsec_dtd_min_flow_info_t; typedef struct parsec_dtd_flow_info_s { diff --git a/parsec/interfaces/superscalar/parsec_dtd_data_flush.c b/parsec/interfaces/superscalar/parsec_dtd_data_flush.c index 0bc1a9890..e93d21e76 100644 --- a/parsec/interfaces/superscalar/parsec_dtd_data_flush.c +++ b/parsec/interfaces/superscalar/parsec_dtd_data_flush.c @@ -38,7 +38,6 @@ parsec_dtd_data_flush_sndrcv(parsec_execution_stream_t *es, parsec_dtd_task_t *current_task = (parsec_dtd_task_t *)this_task; parsec_dtd_tile_t *tile = (FLOW_OF(current_task, 0))->tile; - fprintf(stderr, "Executed data flush body in rank %d\n", current_task->rank); assert(tile != NULL); #if defined(DISTRIBUTED) diff --git a/parsec/scheduling.c b/parsec/scheduling.c index 891da44c8..a2620681d 100644 --- a/parsec/scheduling.c +++ b/parsec/scheduling.c @@ -703,7 +703,6 @@ int parsec_context_wait( parsec_context_t* context ) { int ret = 0; - fprintf(stderr, "in parsec_context_wait on rank %d\n", context->my_rank); if( !(PARSEC_CONTEXT_FLAG_CONTEXT_ACTIVE & context->flags) ) { parsec_warning("parsec_context_wait detected on a non started context\n"); return -1; diff --git a/tests/interfaces/superscalar/testing_zpotrf_dtd.c b/tests/interfaces/superscalar/testing_zpotrf_dtd.c index afee72517..39d26225e 100644 --- a/tests/interfaces/superscalar/testing_zpotrf_dtd.c +++ b/tests/interfaces/superscalar/testing_zpotrf_dtd.c @@ -17,6 +17,7 @@ enum regions { TILE_FULL, + TILE_BCAST }; int @@ -29,6 +30,7 
@@ parsec_core_potrf(parsec_execution_stream_t *es, parsec_task_t *this_task) parsec_dtd_unpack_args(this_task, &uplo, &m, &A, &lda, &info); + fprintf(stderr, "core_potrf executed\n"); CORE_zpotrf(uplo, m, A, lda, info); return PARSEC_HOOK_RETURN_DONE; @@ -45,7 +47,8 @@ parsec_core_trsm(parsec_execution_stream_t *es, parsec_task_t *this_task) parsec_dtd_unpack_args(this_task, &side, &uplo, &trans, &diag, &m, &n, &alpha, &A, &lda, &C, &ldc); - //fprintf(stderr, "core_trsm executed\n"); + int rank = this_task->taskpool->context->my_rank; + fprintf(stderr, "core_trsm executed on rank %d \n", rank); CORE_ztrsm(side, uplo, trans, diag, m, n, alpha, @@ -68,6 +71,7 @@ parsec_core_herk(parsec_execution_stream_t *es, parsec_task_t *this_task) parsec_dtd_unpack_args(this_task, &uplo, &trans, &m, &n, &alpha, &A, &lda, &beta, &C, &ldc); + fprintf(stderr, "core_herk executed\n"); CORE_zherk(uplo, trans, m, n, alpha, A, lda, @@ -89,6 +93,7 @@ parsec_core_gemm(parsec_execution_stream_t *es, parsec_task_t *this_task) parsec_dtd_unpack_args(this_task, &transA, &transB, &m, &n, &k, &alpha, &A, &lda, &B, &ldb, &beta, &C, &ldc); + fprintf(stderr, "core_gemm executed\n"); CORE_zgemm(transA, transB, m, n, k, @@ -107,6 +112,8 @@ int main(int argc, char **argv) int info = 0; int ret = 0; + //sleep(30); + int m, n, k, total; /* loop counter */ /* Parameters passed on to Insert_task() */ int tempkm, tempmm, ldak, ldam, side, transA_p, transA_g, diag, trans, transB, ldan; @@ -132,10 +139,18 @@ int main(int argc, char **argv) sym_two_dim_block_cyclic, (&dcA, matrix_ComplexDouble, rank, MB, NB, LDA, N, 0, 0, N, N, P, nodes/P, uplo)); + + PASTE_CODE_ALLOCATE_MATRIX(dcB, 1, + sym_two_dim_block_cyclic, (&dcB, matrix_Integer, + rank, 15, 15, 15*N/NB, 15*N/NB, 0, 0, + 15*N/NB, 15*N/NB, P, nodes/P, uplo)); /* Initializing dc for dtd */ sym_two_dim_block_cyclic_t *__dcA = &dcA; parsec_dtd_data_collection_init((parsec_data_collection_t *)&dcA); + + sym_two_dim_block_cyclic_t *__dcB = &dcB; + 
parsec_dtd_data_collection_init((parsec_data_collection_t *)&dcB); /* matrix generation */ if(loud > 3) printf("+++ Generate matrices ... "); @@ -151,6 +166,11 @@ int main(int argc, char **argv) dcA.super.mb*dcA.super.nb*sizeof(dplasma_complex64_t), PARSEC_ARENA_ALIGNMENT_SSE, parsec_datatype_double_complex_t, dcA.super.mb ); + + dplasma_add2arena_tile( &parsec_dtd_arenas_datatypes[TILE_BCAST], + dcB.super.mb*dcB.super.nb*sizeof(int), + PARSEC_ARENA_ALIGNMENT_SSE, + parsec_datatype_int32_t, dcB.super.mb ); /* Registering the handle with parsec context */ parsec_context_add_taskpool( parsec, dtd_tp ); @@ -269,7 +289,7 @@ int main(int argc, char **argv) tempkm = k == dcA.super.nt-1 ? dcA.super.n-k*dcA.super.nb : dcA.super.nb; ldak = BLKLDD(&dcA.super, k); if(parsec_dtd_rank_of_data(&dcA.super.super, k, k) == rank) { - //fprintf(stderr, "Inserting and executing potrf[%d %d] in rank: %d\n", k, k, rank); + fprintf(stderr, "Inserting and executing potrf[%d %d] in rank: %d\n", k, k, rank); parsec_dtd_taskpool_insert_task( dtd_tp, parsec_core_potrf, (total - k) * (total-k) * (total - k)/*priority*/, "Potrf", sizeof(int), &uplo, PARSEC_VALUE, @@ -297,20 +317,21 @@ int main(int argc, char **argv) if(dest_rank_idx == Q-1) break; /* this is to populate the destination ranks */ } - parsec_dtd_tile_t* dtd_tile_root = PARSEC_DTD_TILE_OF(A, k, k); if( ( flag || (rank == root) ) && ( dest_rank_idx >= 1) ) { + parsec_dtd_tile_t* dtd_tile_root = PARSEC_DTD_TILE_OF(A, k, k); + parsec_dtd_tile_t* dtd_key_root = PARSEC_DTD_TILE_OF(B, k, k); //fprintf(stderr, "Broadcasting PO tile to TRSM. k %d, rank %d, root %d\n", k, rank, root); - int bcast_id = (1<<29) | (k*total+k+k); /* m*nt+n+k */ - parsec_dtd_broadcast_id( - bcast_id, + parsec_dtd_broadcast( dtd_tp, rank, root, dtd_tile_root, TILE_FULL, + dtd_key_root, TILE_BCAST, dest_ranks, dest_rank_idx); } for( m = k+1; m < total; m++ ) { tempmm = m == dcA.super.nt-1 ? 
dcA.super.n-m*dcA.super.nb : dcA.super.nb; - if( (parsec_dtd_rank_of_data(&dcA.super.super, k, m) == rank || parsec_dtd_rank_of_data(&dcA.super.super, k, k) == rank )) { + //if( (parsec_dtd_rank_of_data(&dcA.super.super, k, m) == rank || parsec_dtd_rank_of_data(&dcA.super.super, k, k) == rank )) { + if( (parsec_dtd_rank_of_data(&dcA.super.super, k, m) == rank )) { //fprintf(stderr, "Inserting trsm[%d %d][%d %d] in rank: %d owned: %d\n", k, k, k, m, rank, parsec_dtd_rank_of_data(&dcA.super.super, k, m)); parsec_dtd_taskpool_insert_task( dtd_tp, parsec_core_trsm, (total - m) * (total-m) * (total - m) + 3 * ((2 * total) - k - m - 1) * (m - k)/*priority*/, "Trsm", @@ -353,23 +374,24 @@ int main(int argc, char **argv) ++dest_rank_idx; } - parsec_dtd_tile_t* dtd_tile_root = PARSEC_DTD_TILE_OF(A, k, m); if( ( flag || (rank == root) ) && ( dest_rank_idx >= 1) ) { + parsec_dtd_tile_t* dtd_tile_root = PARSEC_DTD_TILE_OF(A, k, m); + parsec_dtd_tile_t* dtd_key_root = PARSEC_DTD_TILE_OF(B, k, m); //fprintf(stderr, "Broadcasting TRSM tile to SYRK and GEMM. k %d, m %d, rank %d, root %d\n", k, m, rank, root); - int bcast_id = (1<<29) | (k*total+m+k); /* m*nt+n+k */ - parsec_dtd_broadcast_id( - bcast_id, + parsec_dtd_broadcast( dtd_tp, rank, root, dtd_tile_root, TILE_FULL, + dtd_key_root, TILE_BCAST, dest_ranks, dest_rank_idx); } } - parsec_dtd_data_flush( dtd_tp, PARSEC_DTD_TILE_OF(A, k, k) ); + //parsec_dtd_data_flush( dtd_tp, PARSEC_DTD_TILE_OF(A, k, k) ); for( m = k+1; m < dcA.super.mt; m++ ) { tempmm = m == dcA.super.nt-1 ? 
dcA.super.n-m*dcA.super.nb : dcA.super.nb; ldam = BLKLDD(&dcA.super, m); - if(parsec_dtd_rank_of_data(&dcA.super.super, m, m) == rank || parsec_dtd_rank_of_data(&dcA.super.super, k, m) == rank ) { + //if(parsec_dtd_rank_of_data(&dcA.super.super, m, m) == rank || parsec_dtd_rank_of_data(&dcA.super.super, k, m) == rank ) { + if(parsec_dtd_rank_of_data(&dcA.super.super, m, m) == rank ) { parsec_dtd_taskpool_insert_task( dtd_tp, parsec_core_herk, (total - m) * (total - m) * (total - m) + 3 * (m - k)/*priority*/, "Herk", sizeof(int), &uplo, PARSEC_VALUE, @@ -386,7 +408,8 @@ int main(int argc, char **argv) } for( n = m+1; n < total; n++ ) { ldan = BLKLDD(&dcA.super, n); - if(parsec_dtd_rank_of_data(&dcA.super.super, m, n) == rank || parsec_dtd_rank_of_data(&dcA.super.super, k, m) == rank || parsec_dtd_rank_of_data(&dcA.super.super, k, n) == rank) { + //if(parsec_dtd_rank_of_data(&dcA.super.super, m, n) == rank || parsec_dtd_rank_of_data(&dcA.super.super, k, m) == rank || parsec_dtd_rank_of_data(&dcA.super.super, k, n) == rank) { + if(parsec_dtd_rank_of_data(&dcA.super.super, m, n) == rank ) { parsec_dtd_taskpool_insert_task( dtd_tp, parsec_core_gemm, (total - m) * (total - m) * (total - m) + 3 * ((2 * total) - m - n - 3) * (m - n) + 6 * (m - k) /*priority*/, "Gemm", sizeof(int), &transA_g, PARSEC_VALUE, @@ -478,10 +501,14 @@ int main(int argc, char **argv) /* Cleaning data arrays we allocated for communication */ dplasma_matrix_del2arena( &parsec_dtd_arenas_datatypes[TILE_FULL] ); + dplasma_matrix_del2arena( &parsec_dtd_arenas_datatypes[TILE_BCAST] ); parsec_dtd_data_collection_fini( (parsec_data_collection_t *)&dcA ); + parsec_dtd_data_collection_fini( (parsec_data_collection_t *)&dcB ); parsec_data_free(dcA.mat); dcA.mat = NULL; + parsec_data_free(dcB.mat); dcB.mat = NULL; parsec_tiled_matrix_dc_destroy( (parsec_tiled_matrix_dc_t*)&dcA); + parsec_tiled_matrix_dc_destroy( (parsec_tiled_matrix_dc_t*)&dcB); cleanup_parsec(parsec, iparam); return ret; From 
5ad52fecf2548f74ed74fab393acb524d6632390 Mon Sep 17 00:00:00 2001 From: yu-pei Date: Wed, 13 Oct 2021 10:45:27 -0400 Subject: [PATCH 14/41] fix issue of pending_ack for created remote_deps --- parsec/interfaces/superscalar/collectives.c | 7 +- .../interfaces/superscalar/insert_function.c | 35 +++++--- .../superscalar/overlap_strategies.c | 83 ++++++++++--------- parsec/remote_dep.c | 1 + .../superscalar/testing_zpotrf_dtd.c | 10 ++- 5 files changed, 77 insertions(+), 59 deletions(-) diff --git a/parsec/interfaces/superscalar/collectives.c b/parsec/interfaces/superscalar/collectives.c index 0a845570d..8bedba49a 100644 --- a/parsec/interfaces/superscalar/collectives.c +++ b/parsec/interfaces/superscalar/collectives.c @@ -21,7 +21,7 @@ parsec_remote_deps_t* parsec_dtd_create_remote_deps( int myrank, int root, parsec_data_copy_t *data_copy, parsec_arena_datatype_t *arenas_datatype, int* dest_ranks, int num_dest_ranks) { - + parsec_remote_deps_t *deps = (parsec_remote_deps_t*)remote_deps_allocate(&parsec_remote_dep_context.freelist); assert(NULL != deps); @@ -140,7 +140,7 @@ void parsec_dtd_broadcast( data_ptr[100+i+1] = dest_ranks[i]; } } - fprintf(stderr, "finished bcast key packing\n"); + //fprintf(stderr, "finished bcast key packing\n"); // Retrieve DTD tile's data_copy parsec_data_copy_t *data_copy = dtd_tile_root->data_copy; parsec_data_copy_t *key_copy = bcast_keys_root->data_copy; @@ -162,6 +162,7 @@ void parsec_dtd_broadcast( parsec_dtd_task_t *dtd_bcast_task_root = (parsec_dtd_task_t *)bcast_task_root; // Set broadcast topology info + deps_0->pending_ack = 1; dtd_bcast_task_root->deps_out = deps_0; if(myrank == root) { @@ -179,7 +180,7 @@ void parsec_dtd_broadcast( sizeof(int), &root, PARSEC_VALUE | PARSEC_AFFINITY, PARSEC_DTD_ARG_END); parsec_dtd_task_t *dtd_bcast_key_root = (parsec_dtd_task_t *)bcast_key_root; - dtd_bcast_key_root->deps_out = NULL; + deps_1->pending_ack = 1; dtd_bcast_key_root->deps_out = deps_1; if(myrank == root) { /* nothing here since 
the key is stored in the key array and will be updated before remote_dep_activate */ diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index 0270fb30a..98baee59c 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -1687,7 +1687,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, successor = get_chain_successor(es, current_task, current_task->deps_out); int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) |(current_task->deps_out->root << 18) | *(data_ptr+1+successor)); - fprintf(stderr, "bcast root dep %d with chain successor %d\n", current_dep, successor); + //fprintf(stderr, "bcast root dep %p with chain successor %d on rank %d\n", current_task->deps_out, successor, my_rank); tile = FLOW_OF(current_task, current_dep)->tile; parsec_dtd_tile_retain(tile); parsec_remote_dep_activate( @@ -1714,7 +1714,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); populate_remote_deps(data_ptr, current_task->deps_out); successor = get_chain_successor(es, current_task, current_task->deps_out); - fprintf(stderr, "continuation with chain successor %d\n", successor); + //fprintf(stderr, "continuation with chain successor %d\n", successor); current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) | (root << 18) | *(data_ptr+1+successor)); assert(NULL != current_task->super.data[current_dep].data_out); @@ -1730,15 +1730,25 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, /* update the BCAST DATA task or dep with the global ID that we know now */ uint64_t key = ((uint64_t)(1<<28 | (root << 18 ) | data_ptr[es->virtual_process->parsec_context->my_rank+1])<<32) | (1U<<0); uint64_t 
key2 = ((uint64_t)(data_ptr[0])<<32) | (1U<<0); + struct timespec rqtp; + uint64_t misses_in_a_row; + rqtp.tv_sec = 0; + misses_in_a_row = 2; parsec_dtd_task_t* dtd_task = NULL; parsec_dtd_taskpool_t *tp = (parsec_dtd_taskpool_t *)current_task->super.taskpool; - while(dtd_task == NULL) + while(dtd_task == NULL) { + //rqtp.tv_nsec = exponential_backoff(misses_in_a_row); + //nanosleep(&rqtp, NULL); + //sleep(1); + misses_in_a_row++; dtd_task = parsec_dtd_find_task(tp, key); - parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key); + //fprintf(stderr, "finding dtd task with iteration %d\n", misses_in_a_row); + } + //parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key); parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key2); //parsec_dtd_task_t* dtd_task = parsec_dtd_find_task(tp, key); parsec_remote_deps_t *dep = parsec_dtd_find_task(tp, key2); - fprintf(stderr, "iterate successor on rank %d, key2 %d remote dep %p with task %p\n", es->virtual_process->parsec_context->my_rank, data_ptr[0], dep, dtd_task); + //fprintf(stderr, "iterate successor on rank %d, key2 %d remote dep %p with task %p\n", es->virtual_process->parsec_context->my_rank, data_ptr[0], dep, dtd_task); populate_remote_deps(data_ptr, dtd_task->deps_out); parsec_dtd_untrack_task(tp, key); if(dep == NULL){ @@ -1752,7 +1762,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, remote_dep_dequeue_delayed_dep_release(dep); } parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key2); - parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key); + //parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key); /* releasing the receiver task as the only desc task */ tile = FLOW_OF(current_task, current_dep)->tile; @@ -1938,7 +1948,7 @@ parsec_dtd_bcast_key_release_deps(parsec_execution_stream_t *es, if( (action_mask & PARSEC_ACTION_COMPLETE_LOCAL_TASK) ) { /* root of the bcast key operation */ 
this_dtd_task = (parsec_dtd_task_t *)this_task; - fprintf(stderr, "bcast key release on rank %d\n", this_dtd_task->rank); + //fprintf(stderr, "bcast key release on rank %d\n", this_dtd_task->rank); //parsec_dtd_remote_task_retain(this_dtd_task); } else { int flow_index, track_flow = 0; @@ -2386,7 +2396,7 @@ parsec_dtd_create_task_class( parsec_dtd_taskpool_t *__tp, parsec_dtd_funcptr_t* tc->nb_flows = flow_count; /* set to one so that prof_grpaher prints the task id properly */ tc->nb_parameters = 1; - tc->nb_locals = 8; + tc->nb_locals = 9; params[0] = &symb_dtd_taskid; locals[0] = &symb_dtd_taskid; locals[1] = &symb_dtd_taskid; @@ -2396,6 +2406,7 @@ parsec_dtd_create_task_class( parsec_dtd_taskpool_t *__tp, parsec_dtd_funcptr_t* locals[5] = &symb_dtd_taskid; locals[6] = &symb_dtd_taskid; locals[7] = &symb_dtd_taskid; + locals[8] = &symb_dtd_taskid; tc->data_affinity = NULL; tc->initial_data = NULL; tc->final_data = (parsec_data_ref_fn_t *) NULL; @@ -2715,7 +2726,7 @@ parsec_dtd_create_and_initialize_task( parsec_dtd_taskpool_t *dtd_tp, /* this is needed for grapher to work properly */ this_task->super.locals[0].value = (int)(uintptr_t)this_task->ht_item.key; //assert( (uintptr_t)this_task->super.locals[0].value == (uintptr_t)this_task->ht_item.key ); - for(int idx = 0; idx < 8; idx++) { + for(int idx = 0; idx < 9; idx++) { this_task->super.locals[idx].value = 0; } this_task->super.task_class = tc; @@ -2839,7 +2850,7 @@ parsec_dtd_bcast_key_fn( parsec_execution_stream_t *es, parsec_task_t *this_task { (void)es; (void)this_task; - fprintf(stderr, "bcast_key_fn executed\n"); + //fprintf(stderr, "bcast_key_fn executed\n"); return PARSEC_HOOK_RETURN_DONE; } @@ -2857,7 +2868,7 @@ parsec_dtd_bcast_key_recv( parsec_execution_stream_t *es, parsec_task_t *this_ta { (void)es; (void)this_task; - fprintf(stderr, "bcast_key_recv executed\n"); + //fprintf(stderr, "bcast_key_recv executed\n"); return PARSEC_HOOK_RETURN_DONE; } @@ -2875,7 +2886,7 @@ parsec_dtd_bcast_data_fn( 
parsec_execution_stream_t *es, parsec_task_t *this_tas { (void)es; (void)this_task; - fprintf(stderr, "bcast_data_fn executed\n"); + //fprintf(stderr, "bcast_data_fn executed\n"); return PARSEC_HOOK_RETURN_DONE; } diff --git a/parsec/interfaces/superscalar/overlap_strategies.c b/parsec/interfaces/superscalar/overlap_strategies.c index cac4aee31..8daef5094 100644 --- a/parsec/interfaces/superscalar/overlap_strategies.c +++ b/parsec/interfaces/superscalar/overlap_strategies.c @@ -219,6 +219,49 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, #if defined(PARSEC_DEBUG_ENABLE) assert(current_desc != NULL); #endif + + if(PARSEC_DTD_BCAST_DATA_TC_ID == current_task->super.task_class->task_class_id) { + /* for the bcast data class, in addition to release the data to local deps tasks that will read the data + * propagate the data down to descendants as well */ + if(current_task->deps_out != NULL) { + /* we have not propagate the remote deps yet, otherwise will be set to NULL */ + if(action_mask & PARSEC_ACTION_COMPLETE_LOCAL_TASK) { + assert(NULL != current_task->super.data[current_dep].data_out); + //fprintf(stderr, "bcast root data with global key %d\n", current_task->ht_item.key); + current_task->deps_out->output[0].data.data = + current_task->super.data[current_dep].data_out; + //(void)parsec_atomic_fetch_inc_int32(&current_task->super.data[current_dep].data_out->readers); + parsec_remote_dep_activate( + es, (parsec_task_t *)current_task, + current_task->deps_out, + current_task->deps_out->outgoing_mask); + current_task->deps_out = NULL; + } else if(action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { + /* current node is part of the broadcast operation, propagate downstream */ + int root = current_task->deps_out->root; + int my_rank = current_task->super.taskpool->context->my_rank; + int _array_pos, _array_mask; + struct remote_dep_output_param_s* output; + output = &current_task->deps_out->output[0]; + _array_pos = my_rank / (8 * sizeof(uint32_t)); + _array_mask = 1 
<< (my_rank % (8 * sizeof(uint32_t))); + //fprintf(stderr, "bcast data continue on rank %d, from root %d, for task %p\n", my_rank, root, current_task); + + if ((output->rank_bits[_array_pos] & _array_mask)) { + assert(NULL != current_task->super.data[current_dep].data_out); + + current_task->deps_out->output[0].data.data = + current_task->super.data[0].data_out; + //(void)parsec_atomic_fetch_inc_int32(&current_task->super.data[current_dep].data_out->readers); + parsec_remote_dep_activate( + es, (parsec_task_t *)current_task, + current_task->deps_out, + current_task->deps_out->outgoing_mask); + current_task->deps_out = NULL; + } + } + } + } /* BCAST DATA propagation */ /* setting data */ data.data = current_task->super.data[current_dep].data_out; @@ -343,46 +386,6 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, } } while (0 == get_out); - if(PARSEC_DTD_BCAST_DATA_TC_ID == current_task->super.task_class->task_class_id) { - /* for the bcast data class, in addition to release the data to local deps tasks that will read the data - * propagate the data down to descendants as well */ - if(current_task->deps_out != NULL) { - /* we have not propagate the remote deps yet, otherwise will be set to NULL */ - if(action_mask & PARSEC_ACTION_COMPLETE_LOCAL_TASK) { - assert(NULL != current_task->super.data[current_dep].data_out); - fprintf(stderr, "bcast root data with global key %d\n", current_task->ht_item.key); - current_task->deps_out->output[0].data.data = - current_task->super.data[current_dep].data_out; - //(void)parsec_atomic_fetch_inc_int32(&current_task->super.data[current_dep].data_out->readers); - parsec_remote_dep_activate( - es, (parsec_task_t *)current_task, - current_task->deps_out, - current_task->deps_out->outgoing_mask); - current_task->deps_out = NULL; - } else if(action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { - /* current node is part of the broadcast operation, propagate downstream */ - int root = current_task->deps_out->root; - int my_rank = 
current_task->super.taskpool->context->my_rank; - int _array_pos, _array_mask; - struct remote_dep_output_param_s* output; - output = &current_task->deps_out->output[0]; - _array_pos = my_rank / (8 * sizeof(uint32_t)); - _array_mask = 1 << (my_rank % (8 * sizeof(uint32_t))); - - if ((output->rank_bits[_array_pos] & _array_mask)) { - assert(NULL != current_task->super.data[current_dep].data_out); - - current_task->deps_out->output[0].data.data = - current_task->super.data[0].data_out; - parsec_remote_dep_activate( - es, (parsec_task_t *)current_task, - current_task->deps_out, - current_task->deps_out->outgoing_mask); - current_task->deps_out = NULL; - } - } - } - } /* BCAST DATA propagation */ } } } diff --git a/parsec/remote_dep.c b/parsec/remote_dep.c index a256bd518..62a8e332f 100644 --- a/parsec/remote_dep.c +++ b/parsec/remote_dep.c @@ -558,6 +558,7 @@ int parsec_remote_dep_activate(parsec_execution_stream_t* es, } } #endif /* PARSEC_DEBUG_NOISIER */ + //fprintf(stderr, "on node %d with locals[0].value %d my_idx %d, idx %d\n", es->virtual_process->parsec_context->my_rank, remote_deps->msg.locals[0].value, my_idx, idx); assert(output->parent->taskpool == task->taskpool); if( 0 == parsec_atomic_fetch_inc_int32(&remote_deps->pending_ack) ) { keeper = 1; diff --git a/tests/interfaces/superscalar/testing_zpotrf_dtd.c b/tests/interfaces/superscalar/testing_zpotrf_dtd.c index 39d26225e..67dfc6af1 100644 --- a/tests/interfaces/superscalar/testing_zpotrf_dtd.c +++ b/tests/interfaces/superscalar/testing_zpotrf_dtd.c @@ -30,7 +30,8 @@ parsec_core_potrf(parsec_execution_stream_t *es, parsec_task_t *this_task) parsec_dtd_unpack_args(this_task, &uplo, &m, &A, &lda, &info); - fprintf(stderr, "core_potrf executed\n"); + int rank = this_task->taskpool->context->my_rank; + fprintf(stderr, "core_potrf executed on rank %d\n", rank); CORE_zpotrf(uplo, m, A, lda, info); return PARSEC_HOOK_RETURN_DONE; @@ -71,7 +72,7 @@ parsec_core_herk(parsec_execution_stream_t *es, parsec_task_t 
*this_task) parsec_dtd_unpack_args(this_task, &uplo, &trans, &m, &n, &alpha, &A, &lda, &beta, &C, &ldc); - fprintf(stderr, "core_herk executed\n"); + //fprintf(stderr, "core_herk executed\n"); CORE_zherk(uplo, trans, m, n, alpha, A, lda, @@ -93,7 +94,8 @@ parsec_core_gemm(parsec_execution_stream_t *es, parsec_task_t *this_task) parsec_dtd_unpack_args(this_task, &transA, &transB, &m, &n, &k, &alpha, &A, &lda, &B, &ldb, &beta, &C, &ldc); - fprintf(stderr, "core_gemm executed\n"); + int rank = this_task->taskpool->context->my_rank; + fprintf(stderr, "core_gemm executed on rank %d\n", rank); CORE_zgemm(transA, transB, m, n, k, @@ -289,7 +291,7 @@ int main(int argc, char **argv) tempkm = k == dcA.super.nt-1 ? dcA.super.n-k*dcA.super.nb : dcA.super.nb; ldak = BLKLDD(&dcA.super, k); if(parsec_dtd_rank_of_data(&dcA.super.super, k, k) == rank) { - fprintf(stderr, "Inserting and executing potrf[%d %d] in rank: %d\n", k, k, rank); + //fprintf(stderr, "Inserting and executing potrf[%d %d] in rank: %d\n", k, k, rank); parsec_dtd_taskpool_insert_task( dtd_tp, parsec_core_potrf, (total - k) * (total-k) * (total - k)/*priority*/, "Potrf", sizeof(int), &uplo, PARSEC_VALUE, From 43963538dbf5e6ed7db5a98e152abd863aa1b9f6 Mon Sep 17 00:00:00 2001 From: yu-pei Date: Thu, 21 Oct 2021 11:16:00 -0400 Subject: [PATCH 15/41] fix issue in get successor rank; TODO: data copy and remote deps recycle --- parsec/interfaces/superscalar/collectives.c | 12 +++---- .../interfaces/superscalar/insert_function.c | 36 +++++++++++-------- parsec/remote_dep.c | 4 ++- parsec/remote_dep_mpi.c | 13 ++++++- .../dtd_test_broadcast_collective.c | 2 +- .../superscalar/testing_zpotrf_dtd.c | 2 +- 6 files changed, 44 insertions(+), 25 deletions(-) diff --git a/parsec/interfaces/superscalar/collectives.c b/parsec/interfaces/superscalar/collectives.c index 8bedba49a..26ac5a9e3 100644 --- a/parsec/interfaces/superscalar/collectives.c +++ b/parsec/interfaces/superscalar/collectives.c @@ -71,11 +71,11 @@ 
parsec_remote_deps_t* parsec_dtd_create_remote_deps( _array_pos = myrank / (8 * sizeof(uint32_t)); _array_mask = 1 << (myrank % (8 * sizeof(uint32_t))); - if( !(output->rank_bits[_array_pos] & _array_mask) ) { + //if( !(output->rank_bits[_array_pos] & _array_mask) ) { output->rank_bits[_array_pos] |= _array_mask; output->deps_mask |= (1 << 0); /* not used by DTD? */ output->count_bits++; - } + //} } return deps; @@ -162,14 +162,14 @@ void parsec_dtd_broadcast( parsec_dtd_task_t *dtd_bcast_task_root = (parsec_dtd_task_t *)bcast_task_root; // Set broadcast topology info - deps_0->pending_ack = 1; + deps_0->pending_ack = 0; dtd_bcast_task_root->deps_out = deps_0; if(myrank == root) { dtd_bcast_task_root->ht_item.key = bcast_id; dtd_bcast_task_root->super.locals[0].value = dtd_bcast_task_root->ht_item.key; }else{ - bcast_id = ( (1<<28) | (root << 18) | dtd_tp->recv_task_id[root]++); + bcast_id = ( (1<<28) | (root << 18) | (myrank << 13) | dtd_tp->recv_task_id[root]++); dtd_bcast_task_root->ht_item.key = bcast_id; dtd_bcast_task_root->super.locals[0].value = dtd_bcast_task_root->ht_item.key; } @@ -180,12 +180,12 @@ void parsec_dtd_broadcast( sizeof(int), &root, PARSEC_VALUE | PARSEC_AFFINITY, PARSEC_DTD_ARG_END); parsec_dtd_task_t *dtd_bcast_key_root = (parsec_dtd_task_t *)bcast_key_root; - deps_1->pending_ack = 1; + deps_1->pending_ack = 0; dtd_bcast_key_root->deps_out = deps_1; if(myrank == root) { /* nothing here since the key is stored in the key array and will be updated before remote_dep_activate */ }else{ - bcast_id = ( (1<<29) | (root << 18) | (dtd_tp->recv_task_id[root] -1)); + bcast_id = ( (1<<29) | (root << 18) | (myrank << 13) | (dtd_tp->recv_task_id[root] -1)); //bcast_id = ( (1<<29) | (dtd_tp->recv_task_id[root] )); dtd_bcast_key_root->ht_item.key = bcast_id; dtd_bcast_key_root->super.locals[0].value = dtd_bcast_key_root->ht_item.key; diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index 
98baee59c..6e6f11594 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -1606,9 +1606,11 @@ get_chain_successor(parsec_execution_stream_t*es, parsec_task_t* task, parsec_re count++; boffset = rank / (8 * sizeof(uint32_t)); - if(dep_fw_mask[boffset] & ((uint32_t)1) << (rank % (8 * sizeof(uint32_t)))) - continue; + //if(dep_fw_mask[boffset] & ((uint32_t)1) << (rank % (8 * sizeof(uint32_t)))) + // continue; idx++; + if(es->virtual_process->parsec_context->my_rank == 6) + fprintf(stderr, "idx %d, checking rank %d\n", idx, rank); if(my_idx == -1) { if(rank == es->virtual_process->parsec_context->my_rank) { my_idx = idx; @@ -1625,7 +1627,7 @@ get_chain_successor(parsec_execution_stream_t*es, parsec_task_t* task, parsec_re } } } - return 0; + return -1; } static int @@ -1636,7 +1638,6 @@ populate_remote_deps(int* data_ptr, parsec_remote_deps_t* remote_deps) uint32_t dest_rank_idx; /* TODO: don't assume the length of data_ptr */ int num_dest_ranks = data_ptr[100]; - for(dest_rank_idx = 0; dest_rank_idx < (uint32_t)num_dest_ranks; ++dest_rank_idx) { uint32_t dest_rank = data_ptr[100+dest_rank_idx+1]; _array_pos = dest_rank / (8 * sizeof(uint32_t)); @@ -1686,8 +1687,8 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, /* root of the bcast key */ successor = get_chain_successor(es, current_task, current_task->deps_out); int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); - current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) |(current_task->deps_out->root << 18) | *(data_ptr+1+successor)); - //fprintf(stderr, "bcast root dep %p with chain successor %d on rank %d\n", current_task->deps_out, successor, my_rank); + current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) |(current_task->deps_out->root << 18) | (successor << 13) | *(data_ptr+1+successor)); + fprintf(stderr, "bcast root dep %p with chain successor %d on 
rank %d value %d\n", current_task->deps_out, successor, my_rank, current_task->super.locals[0].value); tile = FLOW_OF(current_task, current_dep)->tile; parsec_dtd_tile_retain(tile); parsec_remote_dep_activate( @@ -1714,8 +1715,11 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); populate_remote_deps(data_ptr, current_task->deps_out); successor = get_chain_successor(es, current_task, current_task->deps_out); - //fprintf(stderr, "continuation with chain successor %d\n", successor); - current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) | (root << 18) | *(data_ptr+1+successor)); + fprintf(stderr, "continuation with chain successor %d on rank %d value %d, current_task %p, data_ptr %p deps_out %p rank bits %d\n", successor, my_rank, current_task->super.locals[0].value, current_task, data_ptr, current_task->deps_out, current_task->deps_out->output[0].rank_bits[0]); + if(successor == -1) { + current_task->deps_out->outgoing_mask = 0; + } + current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) | (root << 18) | (successor << 13) | *(data_ptr+1+successor)); assert(NULL != current_task->super.data[current_dep].data_out); current_task->deps_out->output[0].data.data = current_task->super.data[0].data_out; @@ -1726,9 +1730,8 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, current_task->deps_out->outgoing_mask); current_task->deps_out = NULL; parsec_dtd_remote_task_release(this_task); /* decrease the count as in the data flush */ - /* update the BCAST DATA task or dep with the global ID that we know now */ - uint64_t key = ((uint64_t)(1<<28 | (root << 18 ) | data_ptr[es->virtual_process->parsec_context->my_rank+1])<<32) | (1U<<0); + uint64_t key = ((uint64_t)(1<<28 | (root << 18 ) | (my_rank << 13) | data_ptr[es->virtual_process->parsec_context->my_rank+1])<<32) | (1U<<0); uint64_t key2 = 
((uint64_t)(data_ptr[0])<<32) | (1U<<0); struct timespec rqtp; uint64_t misses_in_a_row; @@ -1736,13 +1739,15 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, misses_in_a_row = 2; parsec_dtd_task_t* dtd_task = NULL; parsec_dtd_taskpool_t *tp = (parsec_dtd_taskpool_t *)current_task->super.taskpool; + while(dtd_task == NULL) { - //rqtp.tv_nsec = exponential_backoff(misses_in_a_row); - //nanosleep(&rqtp, NULL); + rqtp.tv_nsec = exponential_backoff(misses_in_a_row); + nanosleep(&rqtp, NULL); //sleep(1); misses_in_a_row++; dtd_task = parsec_dtd_find_task(tp, key); - //fprintf(stderr, "finding dtd task with iteration %d\n", misses_in_a_row); + if(misses_in_a_row > 3) + fprintf(stderr, "finding dtd task with iteration %d\n", misses_in_a_row); } //parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key); parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key2); @@ -1763,6 +1768,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, } parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key2); //parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key); + /* releasing the receiver task as the only desc task */ tile = FLOW_OF(current_task, current_dep)->tile; @@ -2148,7 +2154,7 @@ parsec_dtd_remote_task_release( parsec_dtd_task_t *this_task ) for( current_flow = 0; current_flow < this_task->super.task_class->nb_flows; current_flow++ ) { if( !((FLOW_OF(this_task, current_flow))->op_type & PARSEC_DONT_TRACK) ) { if( NULL != this_task->super.data[current_flow].data_out ) { - parsec_dtd_release_data_copy(this_task->super.data[current_flow].data_out); + //parsec_dtd_release_data_copy(this_task->super.data[current_flow].data_out); } } @@ -2868,7 +2874,7 @@ parsec_dtd_bcast_key_recv( parsec_execution_stream_t *es, parsec_task_t *this_ta { (void)es; (void)this_task; - //fprintf(stderr, "bcast_key_recv executed\n"); + fprintf(stderr, "bcast_key_recv executed\n"); return 
PARSEC_HOOK_RETURN_DONE; } diff --git a/parsec/remote_dep.c b/parsec/remote_dep.c index 62a8e332f..eb7c9500e 100644 --- a/parsec/remote_dep.c +++ b/parsec/remote_dep.c @@ -212,7 +212,7 @@ inline void remote_deps_free(parsec_remote_deps_t* deps) memset( &deps->msg, 0, sizeof(remote_dep_wire_activate_t) ); #endif deps->taskpool = NULL; - parsec_lifo_push(deps->origin, (parsec_list_item_t*)deps); + //parsec_lifo_push(deps->origin, (parsec_list_item_t*)deps); PARSEC_VALGRIND_MEMPOOL_FREE(deps->origin, ((unsigned char *)deps)+sizeof(parsec_list_item_t)); } @@ -569,6 +569,8 @@ int parsec_remote_dep_activate(parsec_execution_stream_t* es, */ (void)parsec_atomic_fetch_inc_int32(&remote_deps->pending_ack); } + //if(PARSEC_TASKPOOL_TYPE_DTD == task->taskpool->taskpool_type && task->task_class->task_class_id == 2) + // remote_dep_inc_flying_messages(task->taskpool); remote_dep_send(es, rank, remote_deps); } else { PARSEC_DEBUG_VERBOSE(20, parsec_comm_output_stream, "[%d:%d] task %s my_idx %d idx %d rank %d -- skip (not my direct descendant)", diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index d2783f9af..7add3b2ca 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -750,6 +750,7 @@ remote_dep_get_datatypes(parsec_execution_stream_t* es, if( NULL == dtd_task ) { return_defer = 1; + fprintf(stderr, "defer receive\n"); /* AM buffers are reused by the comm engine once the activation * has been conveyed to upper layer. 
In case of DTD we might receive msg to @@ -932,7 +933,9 @@ remote_dep_release_incoming(parsec_execution_stream_t* es, remote_dep_complete_and_cleanup(&origin, 1); } else { //remote_dep_complete_and_cleanup(&origin, 1); - remote_deps_free(origin); + //remote_deps_free(origin); + //remote_dep_dec_flying_messages(task.taskpool); + } #else remote_deps_free(origin); @@ -1859,6 +1862,10 @@ remote_dep_mpi_save_put_cb(parsec_execution_stream_t* es, assert(0 != deps->pending_ack); assert(0 != deps->outgoing_mask); item->priority = deps->max_priority; + + PARSEC_DEBUG_VERBOSE(20, parsec_comm_output_stream, "MPI: SAVE PUT CB for %s from %d tag %u which 0x%x (deps %p)", + remote_dep_cmd_to_string(&deps->msg, tmp, MAX_TASK_STRLEN), item->cmd.activate.peer, + task->tag, task->output_mask, (void*)deps); /* Get the highest priority PUT operation */ parsec_list_nolock_push_sorted(&dep_put_fifo, (parsec_list_item_t*)item, dep_cmd_prio); @@ -1955,6 +1962,8 @@ remote_dep_mpi_put_end_cb(parsec_execution_stream_t* es, DEBUG_MARK_DTA_MSG_END_SEND(status->MPI_TAG); TAKE_TIME(MPIsnd_prof, MPI_Data_plds_ek, cb->idx); remote_dep_complete_and_cleanup(&deps, 1); + //if(deps != NULL) + // remote_dep_complete_and_cleanup(&deps, 1); parsec_comm_puts--; (void)es; return 0; @@ -2296,6 +2305,8 @@ static void remote_dep_mpi_get_start(parsec_execution_stream_t* es, } #if !defined(PARSEC_PROF_DRY_DEP) if(msg.output_mask) { + PARSEC_DEBUG_VERBOSE(10, parsec_comm_output_stream, "MPI:\tTO\t%d\tMPI SEND GET\t% -8s\tk=%d\twith datakey %lx ", + from, tmp, k, task->deps); TAKE_TIME_WITH_INFO(MPIctl_prof, MPI_Data_ctl_sk, get, from, es->virtual_process->parsec_context->my_rank, (*task)); MPI_Send(&msg, datakey_count, datakey_dtt, from, diff --git a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c index 35ea089ac..8a0572098 100644 --- a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c +++ 
b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c @@ -345,7 +345,7 @@ for(int iter=1; iter <= 0; iter++) { } } - parsec_dtd_data_flush_all( dtd_tp, A ); + //parsec_dtd_data_flush_all( dtd_tp, A ); //parsec_dtd_data_flush_all( dtd_tp, B ); // Wait for task completion diff --git a/tests/interfaces/superscalar/testing_zpotrf_dtd.c b/tests/interfaces/superscalar/testing_zpotrf_dtd.c index 67dfc6af1..75578a29c 100644 --- a/tests/interfaces/superscalar/testing_zpotrf_dtd.c +++ b/tests/interfaces/superscalar/testing_zpotrf_dtd.c @@ -430,7 +430,7 @@ int main(int argc, char **argv) PARSEC_DTD_ARG_END ); } } - parsec_dtd_data_flush( dtd_tp, PARSEC_DTD_TILE_OF(A, k, m) ); + //parsec_dtd_data_flush( dtd_tp, PARSEC_DTD_TILE_OF(A, k, m) ); } } } From af22ab6b8421af47195695754774519930c6cfe9 Mon Sep 17 00:00:00 2001 From: yu-pei Date: Sat, 23 Oct 2021 18:50:42 -0400 Subject: [PATCH 16/41] window size limits the key bcast in current scheme; incresing as temp fix --- parsec/interfaces/superscalar/collectives.c | 6 ++-- .../interfaces/superscalar/insert_function.c | 31 ++++++++++--------- parsec/remote_dep.c | 2 +- parsec/remote_dep_mpi.c | 2 +- .../superscalar/testing_zpotrf_dtd.c | 8 ++--- 5 files changed, 25 insertions(+), 24 deletions(-) diff --git a/parsec/interfaces/superscalar/collectives.c b/parsec/interfaces/superscalar/collectives.c index 26ac5a9e3..bd2211b7b 100644 --- a/parsec/interfaces/superscalar/collectives.c +++ b/parsec/interfaces/superscalar/collectives.c @@ -127,7 +127,7 @@ void parsec_dtd_broadcast( parsec_dtd_taskpool_t *dtd_tp = (parsec_dtd_taskpool_t *)taskpool; if(myrank == root) { - bcast_id = ( (1<<30) | (root << 18) | dtd_tp->bcast_id); + bcast_id = ( (1<<29) | (root << 18) | dtd_tp->bcast_id); dtd_tp->bcast_id++; parsec_data_copy = bcast_keys_root->data_copy; @@ -169,7 +169,7 @@ void parsec_dtd_broadcast( dtd_bcast_task_root->ht_item.key = bcast_id; dtd_bcast_task_root->super.locals[0].value = dtd_bcast_task_root->ht_item.key; }else{ - 
bcast_id = ( (1<<28) | (root << 18) | (myrank << 13) | dtd_tp->recv_task_id[root]++); + bcast_id = ( (1<<28) | (root << 21) | (myrank << 16) | dtd_tp->recv_task_id[root]++); dtd_bcast_task_root->ht_item.key = bcast_id; dtd_bcast_task_root->super.locals[0].value = dtd_bcast_task_root->ht_item.key; } @@ -185,7 +185,7 @@ void parsec_dtd_broadcast( if(myrank == root) { /* nothing here since the key is stored in the key array and will be updated before remote_dep_activate */ }else{ - bcast_id = ( (1<<29) | (root << 18) | (myrank << 13) | (dtd_tp->recv_task_id[root] -1)); + bcast_id = ( (1<<27) | (root << 21) | (myrank << 16) | (dtd_tp->recv_task_id[root] -1)); //bcast_id = ( (1<<29) | (dtd_tp->recv_task_id[root] )); dtd_bcast_key_root->ht_item.key = bcast_id; dtd_bcast_key_root->super.locals[0].value = dtd_bcast_key_root->ht_item.key; diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index 6e6f11594..999b57a23 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -60,8 +60,8 @@ static int parsec_dtd_profile_verbose = 0; static parsec_dc_key_t parsec_dtd_dc_id = 0; int32_t __parsec_dtd_is_initialized = 0; /**< Indicates init of dtd environment is completed */ -int parsec_dtd_window_size = 8000; /**< Default window size */ -int parsec_dtd_threshold_size = 4000; /**< Default threshold size of tasks for master thread to wait on */ +int parsec_dtd_window_size = 100000; /**< Default window size */ +int parsec_dtd_threshold_size = 2000; /**< Default threshold size of tasks for master thread to wait on */ static int parsec_dtd_task_hash_table_size = 1<<16; /**< Default task hash table size */ static int parsec_dtd_tile_hash_table_size = 1<<16; /**< Default tile hash table size */ static int parsec_dtd_no_of_arenas_datatypes = 16; @@ -1609,8 +1609,8 @@ get_chain_successor(parsec_execution_stream_t*es, parsec_task_t* task, parsec_re 
//if(dep_fw_mask[boffset] & ((uint32_t)1) << (rank % (8 * sizeof(uint32_t)))) // continue; idx++; - if(es->virtual_process->parsec_context->my_rank == 6) - fprintf(stderr, "idx %d, checking rank %d\n", idx, rank); + //if(es->virtual_process->parsec_context->my_rank == 6) + // fprintf(stderr, "idx %d, checking rank %d\n", idx, rank); if(my_idx == -1) { if(rank == es->virtual_process->parsec_context->my_rank) { my_idx = idx; @@ -1687,8 +1687,8 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, /* root of the bcast key */ successor = get_chain_successor(es, current_task, current_task->deps_out); int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); - current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) |(current_task->deps_out->root << 18) | (successor << 13) | *(data_ptr+1+successor)); - fprintf(stderr, "bcast root dep %p with chain successor %d on rank %d value %d\n", current_task->deps_out, successor, my_rank, current_task->super.locals[0].value); + current_task->super.locals[0].value = current_task->ht_item.key = ((1<<27) |(current_task->deps_out->root << 21) | (successor << 16) | *(data_ptr+1+successor)); + //fprintf(stderr, "bcast root dep %p with chain successor %d on rank %d value %d\n", current_task->deps_out, successor, my_rank, current_task->super.locals[0].value); tile = FLOW_OF(current_task, current_dep)->tile; parsec_dtd_tile_retain(tile); parsec_remote_dep_activate( @@ -1715,11 +1715,11 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); populate_remote_deps(data_ptr, current_task->deps_out); successor = get_chain_successor(es, current_task, current_task->deps_out); - fprintf(stderr, "continuation with chain successor %d on rank %d value %d, current_task %p, data_ptr %p deps_out %p rank bits %d\n", successor, my_rank, current_task->super.locals[0].value, current_task, 
data_ptr, current_task->deps_out, current_task->deps_out->output[0].rank_bits[0]); + //fprintf(stderr, "continuation with chain successor %d on rank %d value %d, current_task %p, data_ptr %p deps_out %p rank bits %d\n", successor, my_rank, current_task->super.locals[0].value, current_task, data_ptr, current_task->deps_out, current_task->deps_out->output[0].rank_bits[0]); if(successor == -1) { current_task->deps_out->outgoing_mask = 0; } - current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) | (root << 18) | (successor << 13) | *(data_ptr+1+successor)); + current_task->super.locals[0].value = current_task->ht_item.key = ((1<<27) | (root << 21) | (successor << 16) | *(data_ptr+1+successor)); assert(NULL != current_task->super.data[current_dep].data_out); current_task->deps_out->output[0].data.data = current_task->super.data[0].data_out; @@ -1729,9 +1729,8 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, current_task->deps_out, current_task->deps_out->outgoing_mask); current_task->deps_out = NULL; - parsec_dtd_remote_task_release(this_task); /* decrease the count as in the data flush */ /* update the BCAST DATA task or dep with the global ID that we know now */ - uint64_t key = ((uint64_t)(1<<28 | (root << 18 ) | (my_rank << 13) | data_ptr[es->virtual_process->parsec_context->my_rank+1])<<32) | (1U<<0); + uint64_t key = ((uint64_t)(1<<28 | (root << 21 ) | (my_rank << 16) | data_ptr[es->virtual_process->parsec_context->my_rank+1])<<32) | (1U<<0); uint64_t key2 = ((uint64_t)(data_ptr[0])<<32) | (1U<<0); struct timespec rqtp; uint64_t misses_in_a_row; @@ -1743,11 +1742,12 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, while(dtd_task == NULL) { rqtp.tv_nsec = exponential_backoff(misses_in_a_row); nanosleep(&rqtp, NULL); - //sleep(1); misses_in_a_row++; dtd_task = parsec_dtd_find_task(tp, key); - if(misses_in_a_row > 3) - fprintf(stderr, "finding dtd task with iteration %d\n", misses_in_a_row); + 
if(misses_in_a_row > 10) { + //sleep(1); + fprintf(stderr, "finding dtd task with iteration %d for key %ld key2 %ld on rank %d\n", misses_in_a_row, key, key2, my_rank); + } } //parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key); parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key2); @@ -1769,6 +1769,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key2); //parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key); + parsec_dtd_remote_task_release(this_task); /* decrease the count as in the data flush */ /* releasing the receiver task as the only desc task */ tile = FLOW_OF(current_task, current_dep)->tile; @@ -2154,7 +2155,7 @@ parsec_dtd_remote_task_release( parsec_dtd_task_t *this_task ) for( current_flow = 0; current_flow < this_task->super.task_class->nb_flows; current_flow++ ) { if( !((FLOW_OF(this_task, current_flow))->op_type & PARSEC_DONT_TRACK) ) { if( NULL != this_task->super.data[current_flow].data_out ) { - //parsec_dtd_release_data_copy(this_task->super.data[current_flow].data_out); + parsec_dtd_release_data_copy(this_task->super.data[current_flow].data_out); } } @@ -2874,7 +2875,7 @@ parsec_dtd_bcast_key_recv( parsec_execution_stream_t *es, parsec_task_t *this_ta { (void)es; (void)this_task; - fprintf(stderr, "bcast_key_recv executed\n"); + //fprintf(stderr, "bcast_key_recv executed\n"); return PARSEC_HOOK_RETURN_DONE; } diff --git a/parsec/remote_dep.c b/parsec/remote_dep.c index eb7c9500e..62c9654c4 100644 --- a/parsec/remote_dep.c +++ b/parsec/remote_dep.c @@ -212,7 +212,7 @@ inline void remote_deps_free(parsec_remote_deps_t* deps) memset( &deps->msg, 0, sizeof(remote_dep_wire_activate_t) ); #endif deps->taskpool = NULL; - //parsec_lifo_push(deps->origin, (parsec_list_item_t*)deps); + parsec_lifo_push(deps->origin, (parsec_list_item_t*)deps); PARSEC_VALGRIND_MEMPOOL_FREE(deps->origin, ((unsigned char 
*)deps)+sizeof(parsec_list_item_t)); } diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index 7add3b2ca..46f3a341e 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -750,7 +750,7 @@ remote_dep_get_datatypes(parsec_execution_stream_t* es, if( NULL == dtd_task ) { return_defer = 1; - fprintf(stderr, "defer receive\n"); + //fprintf(stderr, "defer receive\n"); /* AM buffers are reused by the comm engine once the activation * has been conveyed to upper layer. In case of DTD we might receive msg to diff --git a/tests/interfaces/superscalar/testing_zpotrf_dtd.c b/tests/interfaces/superscalar/testing_zpotrf_dtd.c index 75578a29c..0c9292815 100644 --- a/tests/interfaces/superscalar/testing_zpotrf_dtd.c +++ b/tests/interfaces/superscalar/testing_zpotrf_dtd.c @@ -49,7 +49,7 @@ parsec_core_trsm(parsec_execution_stream_t *es, parsec_task_t *this_task) parsec_dtd_unpack_args(this_task, &side, &uplo, &trans, &diag, &m, &n, &alpha, &A, &lda, &C, &ldc); int rank = this_task->taskpool->context->my_rank; - fprintf(stderr, "core_trsm executed on rank %d \n", rank); + //fprintf(stderr, "core_trsm executed on rank %d \n", rank); CORE_ztrsm(side, uplo, trans, diag, m, n, alpha, @@ -95,7 +95,7 @@ parsec_core_gemm(parsec_execution_stream_t *es, parsec_task_t *this_task) parsec_dtd_unpack_args(this_task, &transA, &transB, &m, &n, &k, &alpha, &A, &lda, &B, &ldb, &beta, &C, &ldc); int rank = this_task->taskpool->context->my_rank; - fprintf(stderr, "core_gemm executed on rank %d\n", rank); + //fprintf(stderr, "core_gemm executed on rank %d\n", rank); CORE_zgemm(transA, transB, m, n, k, @@ -387,7 +387,7 @@ int main(int argc, char **argv) dest_ranks, dest_rank_idx); } } - //parsec_dtd_data_flush( dtd_tp, PARSEC_DTD_TILE_OF(A, k, k) ); + parsec_dtd_data_flush( dtd_tp, PARSEC_DTD_TILE_OF(A, k, k) ); for( m = k+1; m < dcA.super.mt; m++ ) { tempmm = m == dcA.super.nt-1 ? 
dcA.super.n-m*dcA.super.nb : dcA.super.nb; @@ -430,7 +430,7 @@ int main(int argc, char **argv) PARSEC_DTD_ARG_END ); } } - //parsec_dtd_data_flush( dtd_tp, PARSEC_DTD_TILE_OF(A, k, m) ); + parsec_dtd_data_flush( dtd_tp, PARSEC_DTD_TILE_OF(A, k, m) ); } } } From c11afe1b9523699476ff3240bba80f6a7f980f8d Mon Sep 17 00:00:00 2001 From: Yu Pei Date: Tue, 26 Oct 2021 20:39:47 +0300 Subject: [PATCH 17/41] allocating more bits for ranks --- parsec/interfaces/superscalar/collectives.c | 10 +++++----- parsec/interfaces/superscalar/insert_function.c | 10 +++++----- .../interfaces/superscalar/insert_function_internal.h | 2 +- parsec/remote_dep_mpi.c | 2 -- tests/interfaces/superscalar/testing_zpotrf_dtd.c | 6 +++--- 5 files changed, 14 insertions(+), 16 deletions(-) diff --git a/parsec/interfaces/superscalar/collectives.c b/parsec/interfaces/superscalar/collectives.c index bd2211b7b..9f1f56543 100644 --- a/parsec/interfaces/superscalar/collectives.c +++ b/parsec/interfaces/superscalar/collectives.c @@ -127,17 +127,17 @@ void parsec_dtd_broadcast( parsec_dtd_taskpool_t *dtd_tp = (parsec_dtd_taskpool_t *)taskpool; if(myrank == root) { - bcast_id = ( (1<<29) | (root << 18) | dtd_tp->bcast_id); + bcast_id = ( (1<<30) | (root << 18) | dtd_tp->bcast_id); dtd_tp->bcast_id++; parsec_data_copy = bcast_keys_root->data_copy; data_ptr = (int*)parsec_data_copy_get_ptr(parsec_data_copy); data_ptr[0] = bcast_id; - data_ptr[100] = num_dest_ranks; + data_ptr[400] = num_dest_ranks; for(int i = 0; i < num_dest_ranks; i++) { data_ptr[dest_ranks[i]+1] = dtd_tp->send_task_id[dest_ranks[i]]++; //pack the ranks at the end of the tiles as well - data_ptr[100+i+1] = dest_ranks[i]; + data_ptr[400+i+1] = dest_ranks[i]; } } //fprintf(stderr, "finished bcast key packing\n"); @@ -169,7 +169,7 @@ void parsec_dtd_broadcast( dtd_bcast_task_root->ht_item.key = bcast_id; dtd_bcast_task_root->super.locals[0].value = dtd_bcast_task_root->ht_item.key; }else{ - bcast_id = ( (1<<28) | (root << 21) | (myrank << 16) | 
dtd_tp->recv_task_id[root]++); + bcast_id = ( (1<<28) | (root << 20) | (myrank << 12) | dtd_tp->recv_task_id[root]++); dtd_bcast_task_root->ht_item.key = bcast_id; dtd_bcast_task_root->super.locals[0].value = dtd_bcast_task_root->ht_item.key; } @@ -185,7 +185,7 @@ void parsec_dtd_broadcast( if(myrank == root) { /* nothing here since the key is stored in the key array and will be updated before remote_dep_activate */ }else{ - bcast_id = ( (1<<27) | (root << 21) | (myrank << 16) | (dtd_tp->recv_task_id[root] -1)); + bcast_id = ( (1<<29) | (root << 20) | (myrank << 12) | (dtd_tp->recv_task_id[root] -1)); //bcast_id = ( (1<<29) | (dtd_tp->recv_task_id[root] )); dtd_bcast_key_root->ht_item.key = bcast_id; dtd_bcast_key_root->super.locals[0].value = dtd_bcast_key_root->ht_item.key; diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index 999b57a23..b8dbdb69c 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -1637,9 +1637,9 @@ populate_remote_deps(int* data_ptr, parsec_remote_deps_t* remote_deps) int _array_pos, _array_mask; uint32_t dest_rank_idx; /* TODO: don't assume the length of data_ptr */ - int num_dest_ranks = data_ptr[100]; + int num_dest_ranks = data_ptr[400]; for(dest_rank_idx = 0; dest_rank_idx < (uint32_t)num_dest_ranks; ++dest_rank_idx) { - uint32_t dest_rank = data_ptr[100+dest_rank_idx+1]; + uint32_t dest_rank = data_ptr[400+dest_rank_idx+1]; _array_pos = dest_rank / (8 * sizeof(uint32_t)); _array_mask = 1 << (dest_rank % (8 * sizeof(uint32_t))); @@ -1687,7 +1687,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, /* root of the bcast key */ successor = get_chain_successor(es, current_task, current_task->deps_out); int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); - current_task->super.locals[0].value = current_task->ht_item.key = ((1<<27) |(current_task->deps_out->root 
<< 21) | (successor << 16) | *(data_ptr+1+successor)); + current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) |(current_task->deps_out->root << 20) | (successor << 12) | *(data_ptr+1+successor)); //fprintf(stderr, "bcast root dep %p with chain successor %d on rank %d value %d\n", current_task->deps_out, successor, my_rank, current_task->super.locals[0].value); tile = FLOW_OF(current_task, current_dep)->tile; parsec_dtd_tile_retain(tile); @@ -1719,7 +1719,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, if(successor == -1) { current_task->deps_out->outgoing_mask = 0; } - current_task->super.locals[0].value = current_task->ht_item.key = ((1<<27) | (root << 21) | (successor << 16) | *(data_ptr+1+successor)); + current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) | (root << 20) | (successor << 12) | *(data_ptr+1+successor)); assert(NULL != current_task->super.data[current_dep].data_out); current_task->deps_out->output[0].data.data = current_task->super.data[0].data_out; @@ -1730,7 +1730,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, current_task->deps_out->outgoing_mask); current_task->deps_out = NULL; /* update the BCAST DATA task or dep with the global ID that we know now */ - uint64_t key = ((uint64_t)(1<<28 | (root << 21 ) | (my_rank << 16) | data_ptr[es->virtual_process->parsec_context->my_rank+1])<<32) | (1U<<0); + uint64_t key = ((uint64_t)(1<<28 | (root << 20 ) | (my_rank << 12) | data_ptr[es->virtual_process->parsec_context->my_rank+1])<<32) | (1U<<0); uint64_t key2 = ((uint64_t)(data_ptr[0])<<32) | (1U<<0); struct timespec rqtp; uint64_t misses_in_a_row; diff --git a/parsec/interfaces/superscalar/insert_function_internal.h b/parsec/interfaces/superscalar/insert_function_internal.h index 66647ede6..ace54f9f8 100644 --- a/parsec/interfaces/superscalar/insert_function_internal.h +++ b/parsec/interfaces/superscalar/insert_function_internal.h @@ -133,7 +133,7 @@ 
typedef struct parsec_dtd_flow_info_s { 4 release ownership even when the flow is of type R */ parsec_dtd_tile_t *tile; - int msg_keys[MAX_RANK_INFO]; /* enable user trimming, store dest rank send ID for a flow */ + int msg_keys[MAX_RANK_INFO*sizeof(int)*8]; /* enable user trimming, store dest rank send ID for a flow */ int rank_sent_to[MAX_RANK_INFO]; /* currently support 1024 nodes */ } parsec_dtd_flow_info_t; diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index 46f3a341e..de457a8d5 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -2305,8 +2305,6 @@ static void remote_dep_mpi_get_start(parsec_execution_stream_t* es, } #if !defined(PARSEC_PROF_DRY_DEP) if(msg.output_mask) { - PARSEC_DEBUG_VERBOSE(10, parsec_comm_output_stream, "MPI:\tTO\t%d\tMPI SEND GET\t% -8s\tk=%d\twith datakey %lx ", - from, tmp, k, task->deps); TAKE_TIME_WITH_INFO(MPIctl_prof, MPI_Data_ctl_sk, get, from, es->virtual_process->parsec_context->my_rank, (*task)); MPI_Send(&msg, datakey_count, datakey_dtt, from, diff --git a/tests/interfaces/superscalar/testing_zpotrf_dtd.c b/tests/interfaces/superscalar/testing_zpotrf_dtd.c index 0c9292815..6e196dc44 100644 --- a/tests/interfaces/superscalar/testing_zpotrf_dtd.c +++ b/tests/interfaces/superscalar/testing_zpotrf_dtd.c @@ -141,11 +141,11 @@ int main(int argc, char **argv) sym_two_dim_block_cyclic, (&dcA, matrix_ComplexDouble, rank, MB, NB, LDA, N, 0, 0, N, N, P, nodes/P, uplo)); - + int bsize = 30; PASTE_CODE_ALLOCATE_MATRIX(dcB, 1, sym_two_dim_block_cyclic, (&dcB, matrix_Integer, - rank, 15, 15, 15*N/NB, 15*N/NB, 0, 0, - 15*N/NB, 15*N/NB, P, nodes/P, uplo)); + rank, bsize, bsize, bsize*N/NB, bsize*N/NB, 0, 0, + bsize*N/NB, bsize*N/NB, P, nodes/P, uplo)); /* Initializing dc for dtd */ sym_two_dim_block_cyclic_t *__dcA = &dcA; From 8bec4b026f314edcc37954546e1968cc46a66e05 Mon Sep 17 00:00:00 2001 From: yu-pei Date: Wed, 3 Nov 2021 16:42:27 -0400 Subject: [PATCH 18/41] hash table to store key arrays, still 
overfill task window --- parsec/interfaces/superscalar/collectives.c | 9 +- .../interfaces/superscalar/insert_function.c | 104 +++++++++++++----- .../superscalar/insert_function_internal.h | 1 + .../dtd_test_broadcast_collective.c | 28 ++--- 4 files changed, 100 insertions(+), 42 deletions(-) diff --git a/parsec/interfaces/superscalar/collectives.c b/parsec/interfaces/superscalar/collectives.c index 9f1f56543..146918074 100644 --- a/parsec/interfaces/superscalar/collectives.c +++ b/parsec/interfaces/superscalar/collectives.c @@ -169,7 +169,7 @@ void parsec_dtd_broadcast( dtd_bcast_task_root->ht_item.key = bcast_id; dtd_bcast_task_root->super.locals[0].value = dtd_bcast_task_root->ht_item.key; }else{ - bcast_id = ( (1<<28) | (root << 20) | (myrank << 12) | dtd_tp->recv_task_id[root]++); + bcast_id = ( (1<<28) | (root << 20) | dtd_tp->recv_task_id[root]++); dtd_bcast_task_root->ht_item.key = bcast_id; dtd_bcast_task_root->super.locals[0].value = dtd_bcast_task_root->ht_item.key; } @@ -185,7 +185,7 @@ void parsec_dtd_broadcast( if(myrank == root) { /* nothing here since the key is stored in the key array and will be updated before remote_dep_activate */ }else{ - bcast_id = ( (1<<29) | (root << 20) | (myrank << 12) | (dtd_tp->recv_task_id[root] -1)); + bcast_id = ( (1<<29) | (root << 20) | (dtd_tp->recv_task_id[root] -1)); //bcast_id = ( (1<<29) | (dtd_tp->recv_task_id[root] )); dtd_bcast_key_root->ht_item.key = bcast_id; dtd_bcast_key_root->super.locals[0].value = dtd_bcast_key_root->ht_item.key; @@ -193,8 +193,6 @@ void parsec_dtd_broadcast( /* Post the bcast of keys and ranks array */ parsec_insert_dtd_task(dtd_bcast_key_root); - /* Post the bcast tasks for the actual data */ - parsec_insert_dtd_task(dtd_bcast_task_root); if(myrank == root) { //for (int dest_rank = 0; dest_rank < num_dest_ranks; ++dest_rank) { @@ -215,6 +213,9 @@ void parsec_dtd_broadcast( parsec_dtd_task_t *dtd_retrieve_task = (parsec_dtd_task_t *)retrieve_task; 
parsec_insert_dtd_task(dtd_retrieve_task); } + + /* Post the bcast tasks for the actual data */ + parsec_insert_dtd_task(dtd_bcast_task_root); } #endif diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index b8dbdb69c..8d022a560 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -60,8 +60,8 @@ static int parsec_dtd_profile_verbose = 0; static parsec_dc_key_t parsec_dtd_dc_id = 0; int32_t __parsec_dtd_is_initialized = 0; /**< Indicates init of dtd environment is completed */ -int parsec_dtd_window_size = 100000; /**< Default window size */ -int parsec_dtd_threshold_size = 2000; /**< Default threshold size of tasks for master thread to wait on */ +int parsec_dtd_window_size = 8000; /**< Default window size */ +int parsec_dtd_threshold_size = 16000; /**< Default threshold size of tasks for master thread to wait on */ static int parsec_dtd_task_hash_table_size = 1<<16; /**< Default task hash table size */ static int parsec_dtd_tile_hash_table_size = 1<<16; /**< Default tile hash table size */ static int parsec_dtd_no_of_arenas_datatypes = 16; @@ -263,6 +263,14 @@ void parsec_dtd_taskpool_constructor(parsec_dtd_taskpool_t *tp) tp->function_counter = 0; + tp->keys_hash_table = PARSEC_OBJ_NEW(parsec_hash_table_t); + for(nb = 1; nb < 16 && (1<keys_hash_table, + offsetof(dtd_hash_table_pointer_item_t, ht_item), + nb, + DTD_key_fns, + tp->keys_hash_table); + tp->task_hash_table = PARSEC_OBJ_NEW(parsec_hash_table_t); for(nb = 1; nb < 16 && (1<task_hash_table, @@ -1687,7 +1695,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, /* root of the bcast key */ successor = get_chain_successor(es, current_task, current_task->deps_out); int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); - current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) |(current_task->deps_out->root << 20) | 
(successor << 12) | *(data_ptr+1+successor)); + current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) |(current_task->deps_out->root << 20) | *(data_ptr+1+successor)); //fprintf(stderr, "bcast root dep %p with chain successor %d on rank %d value %d\n", current_task->deps_out, successor, my_rank, current_task->super.locals[0].value); tile = FLOW_OF(current_task, current_dep)->tile; parsec_dtd_tile_retain(tile); @@ -1719,7 +1727,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, if(successor == -1) { current_task->deps_out->outgoing_mask = 0; } - current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) | (root << 20) | (successor << 12) | *(data_ptr+1+successor)); + current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) | (root << 20) | *(data_ptr+1+successor)); assert(NULL != current_task->super.data[current_dep].data_out); current_task->deps_out->output[0].data.data = current_task->super.data[0].data_out; @@ -1730,7 +1738,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, current_task->deps_out->outgoing_mask); current_task->deps_out = NULL; /* update the BCAST DATA task or dep with the global ID that we know now */ - uint64_t key = ((uint64_t)(1<<28 | (root << 20 ) | (my_rank << 12) | data_ptr[es->virtual_process->parsec_context->my_rank+1])<<32) | (1U<<0); + uint64_t key = ((uint64_t)(1<<28 | (root << 20 ) | data_ptr[es->virtual_process->parsec_context->my_rank+1])<<32) | (1U<<0); uint64_t key2 = ((uint64_t)(data_ptr[0])<<32) | (1U<<0); struct timespec rqtp; uint64_t misses_in_a_row; @@ -1738,36 +1746,52 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, misses_in_a_row = 2; parsec_dtd_task_t* dtd_task = NULL; parsec_dtd_taskpool_t *tp = (parsec_dtd_taskpool_t *)current_task->super.taskpool; - + + /* while(dtd_task == NULL) { rqtp.tv_nsec = exponential_backoff(misses_in_a_row); nanosleep(&rqtp, NULL); misses_in_a_row++; dtd_task = 
parsec_dtd_find_task(tp, key); if(misses_in_a_row > 10) { - //sleep(1); + sleep(1); fprintf(stderr, "finding dtd task with iteration %d for key %ld key2 %ld on rank %d\n", misses_in_a_row, key, key2, my_rank); } } - //parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key); - parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key2); + */ + + parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key); + dtd_task = parsec_dtd_find_task(tp, key); + if(dtd_task == NULL) { + int* buffer = malloc(sizeof(int)*30*30); + memcpy(buffer, data_ptr, sizeof(int)*30*30); + dtd_hash_table_pointer_item_t *item = (dtd_hash_table_pointer_item_t *)parsec_thread_mempool_allocate(tp->hash_table_bucket_mempool->thread_mempools); + parsec_hash_table_t *hash_table = tp->keys_hash_table; + item->ht_item.key = (parsec_key_t)key; + item->mempool_owner = tp->hash_table_bucket_mempool->thread_mempools; + item->value = (void *)buffer; + parsec_hash_table_nolock_insert( hash_table, &item->ht_item ); + } else { + parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key2); + parsec_remote_deps_t *dep = parsec_dtd_find_task(tp, key2); + + populate_remote_deps(data_ptr, dtd_task->deps_out); + parsec_dtd_untrack_task(tp, key); + if(dep == NULL){ + dtd_task->super.locals[0].value = data_ptr[0]; + parsec_dtd_track_task(tp, key2, dtd_task); + }else{ + + dtd_task->super.locals[0].value = data_ptr[0]; + parsec_dtd_untrack_remote_dep(tp, key2); + parsec_dtd_track_task(tp, key2, dtd_task); + remote_dep_dequeue_delayed_dep_release(dep); + } + parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key2); + } //parsec_dtd_task_t* dtd_task = parsec_dtd_find_task(tp, key); - parsec_remote_deps_t *dep = parsec_dtd_find_task(tp, key2); //fprintf(stderr, "iterate successor on rank %d, key2 %d remote dep %p with task %p\n", es->virtual_process->parsec_context->my_rank, data_ptr[0], dep, dtd_task); - populate_remote_deps(data_ptr, dtd_task->deps_out); - 
parsec_dtd_untrack_task(tp, key); - if(dep == NULL){ - dtd_task->super.locals[0].value = data_ptr[0]; - parsec_dtd_track_task(tp, key2, dtd_task); - }else{ - - dtd_task->super.locals[0].value = data_ptr[0]; - parsec_dtd_untrack_remote_dep(tp, key2); - parsec_dtd_track_task(tp, key2, dtd_task); - remote_dep_dequeue_delayed_dep_release(dep); - } - parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key2); - //parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key); + parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key); parsec_dtd_remote_task_release(this_task); /* decrease the count as in the data flush */ @@ -2149,6 +2173,7 @@ void parsec_dtd_remote_task_release( parsec_dtd_task_t *this_task ) { parsec_object_t *object = (parsec_object_t *)this_task; + //parsec_atomic_fetch_inc_int32( &object->obj_reference_count); assert(object->obj_reference_count > 1); if( 2 == parsec_atomic_fetch_dec_int32( &object->obj_reference_count ) ){ int current_flow; @@ -2640,6 +2665,23 @@ parsec_dtd_set_descendant(parsec_dtd_task_t *parent_task, uint8_t parent_flow_in uint64_t key = (((uint64_t)real_parent_task->ht_item.key)<<32) | (1U<task_hash_table, (parsec_key_t)key); parsec_remote_deps_t *dep = parsec_dtd_find_remote_dep( tp, key ); + if(real_parent_task->super.task_class->task_class_id == PARSEC_DTD_BCAST_DATA_TC_ID) { + //sleep(1); + //uint64_t key = ((this_task->super.locals[0].value)<<32) | (1U<<0); + parsec_hash_table_t *hash_table = tp->keys_hash_table; + dtd_hash_table_pointer_item_t *item = (dtd_hash_table_pointer_item_t *)parsec_hash_table_nolock_find( hash_table, (parsec_key_t)key ); + if(item) { + int* data_ptr = (int*)item->value; + parsec_dtd_untrack_task(tp, key); + parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key); + key = ((uint64_t)(data_ptr[0])<<32) | (1U<<0); + parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key); + parsec_remote_deps_t *dep = parsec_dtd_find_task(tp, key); 
+ real_parent_task->super.locals[0].value = data_ptr[0]; + populate_remote_deps(data_ptr, real_parent_task->deps_out); + } + //fprintf(stderr, "inserting bcast data task and finding in hashtable with key %d, result %p\n", real_parent_task->super.locals[0].value, item); + } if( NULL == dep ) { if( !(flow->flags & TASK_INSERTED) ) { flow->flags |= TASK_INSERTED; @@ -2893,7 +2935,7 @@ parsec_dtd_bcast_data_fn( parsec_execution_stream_t *es, parsec_task_t *this_tas { (void)es; (void)this_task; - //fprintf(stderr, "bcast_data_fn executed\n"); + //fprintf(stderr, "bcast_data_fn executed on rank %d\n", es->virtual_process->parsec_context->my_rank); return PARSEC_HOOK_RETURN_DONE; } @@ -2924,6 +2966,7 @@ parsec_dtd_block_if_threshold_reached(parsec_dtd_taskpool_t *dtd_tp, int task_th if( dtd_tp->task_window_size < parsec_dtd_window_size ) { dtd_tp->task_window_size *= 2; } else { + fprintf(stderr, "block function in rank %d with local task inserted %d\n", dtd_tp->super.context->my_rank, dtd_tp->local_task_inserted); parsec_execute_and_come_back(&dtd_tp->super, task_threshold); return 1; /* Indicating we blocked */ @@ -2962,6 +3005,14 @@ parsec_insert_dtd_task(parsec_task_t *__this_task) parsec_dtd_remote_task_retain( this_task ); } + //if(this_task->super.task_class->task_class_id == PARSEC_DTD_BCAST_DATA_TC_ID) { + //sleep(1); + // uint64_t key = ((this_task->super.locals[0].value)<<32) | (1U<<0); + // parsec_hash_table_t *hash_table = dtd_tp->keys_hash_table; + // dtd_hash_table_pointer_item_t *item = (dtd_hash_table_pointer_item_t *)parsec_hash_table_nolock_find( hash_table, (parsec_key_t)key ); + // fprintf(stderr, "inserting bcast data task and finding in hashtable with key %d, result %p\n", this_task->super.locals[0].value, item); + //} + /* In the next segment we resolve the dependencies of each flow */ for( flow_index = 0, tile = NULL, tile_op_type = 0; flow_index < tc->nb_flows; flow_index ++ ) { parsec_dtd_tile_user_t last_user, last_writer; @@ -3270,6 +3321,9 @@ 
parsec_insert_dtd_task(parsec_task_t *__this_task) dtd_tp->local_task_inserted++; PARSEC_DEBUG_VERBOSE(parsec_dtd_dump_traversal_info, parsec_dtd_debug_output, "Task generated -> %s %d rank %d\n", this_task->super.task_class->name, this_task->ht_item.key, this_task->rank); + if(this_task->rank == 0) { + //fprintf(stderr, "Task generated -> %s %d rank %d\n", this_task->super.task_class->name, this_task->ht_item.key, this_task->rank); + } } /* Releasing every remote_task */ diff --git a/parsec/interfaces/superscalar/insert_function_internal.h b/parsec/interfaces/superscalar/insert_function_internal.h index ace54f9f8..cc4385cb0 100644 --- a/parsec/interfaces/superscalar/insert_function_internal.h +++ b/parsec/interfaces/superscalar/insert_function_internal.h @@ -240,6 +240,7 @@ struct parsec_dtd_taskpool_s { parsec_mempool_t *hash_table_bucket_mempool; parsec_hash_table_t *task_hash_table; parsec_hash_table_t *function_h_table; + parsec_hash_table_t *keys_hash_table; /* ring of initial ready tasks */ parsec_task_t **startup_list; int bcast_id; diff --git a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c index 8a0572098..082027fe5 100644 --- a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c +++ b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c @@ -147,7 +147,7 @@ int test_broadcast_mixed( // Tile size int nb = 1; - int nb_bcast = 200; + int nb_bcast = 30; // Total number of tiles int nt = 1; int data_value = 0; @@ -168,7 +168,7 @@ int test_broadcast_mixed( parsec_matrix_add2arena_rect( &parsec_dtd_arenas_datatypes[TILE_BCAST], parsec_datatype_int32_t, - nb_bcast, 1, nb_bcast); + nb_bcast, nb_bcast, nb_bcast); // Initial value on the root node. All node should have this value // at the end of the operation. 
int data_root = 55; @@ -287,14 +287,16 @@ int test_broadcast_mixed( //} } -for(int iter=1; iter <= 0; iter++) { +for(int iter=1; iter <= 1; iter++) { // Second round of broadcast, create another array of keys for this bcast - key_root = B->data_key(B, root+iter*world, 0); - //key_root = B->data_key(B, root, 0); + //key_root = B->data_key(B, root+iter*world, 0); + key_root = B->data_key(B, root, 0); bcast_keys_root = PARSEC_DTD_TILE_OF_KEY(B, key_root); - //sleep(5); + sleep(5); int new_value = -1; + key_root = key = A->data_key(A, root+iter*world, 0); + dtd_tile_root = PARSEC_DTD_TILE_OF_KEY(A, key_root); if (root == myrank) { //*data_ptr = 1998; new_value = 1998+iter; @@ -332,20 +334,20 @@ for(int iter=1; iter <= 0; iter++) { // // Retrieve value of broadcasted data // - for (int rank = 0; rank < world; ++rank) { - if ( rank == root) continue; + //for (int rank = 0; rank < world; ++rank) { + //if ( myrank != root) { parsec_task_t *retrieve_task = parsec_dtd_taskpool_create_task( dtd_tp, retrieve_task_fn, 0, "retrieve_task", PASSED_BY_REF, dtd_tile_root, PARSEC_INPUT | TILE_FULL, - sizeof(int), &rank, PARSEC_VALUE | PARSEC_AFFINITY, + sizeof(int), &myrank, PARSEC_VALUE | PARSEC_AFFINITY, sizeof(int*), &data_value_out, PARSEC_VALUE, PARSEC_DTD_ARG_END); - //parsec_dtd_task_t *dtd_retrieve_task = (parsec_dtd_task_t *)retrieve_task; - //parsec_insert_dtd_task(retrieve_task); + parsec_dtd_task_t *dtd_retrieve_task = (parsec_dtd_task_t *)retrieve_task; + parsec_insert_dtd_task(retrieve_task); - } + //} } - //parsec_dtd_data_flush_all( dtd_tp, A ); + parsec_dtd_data_flush_all( dtd_tp, A ); //parsec_dtd_data_flush_all( dtd_tp, B ); // Wait for task completion From a0d16b6bf8b285fd25a47bc00c07cf168dc3af90 Mon Sep 17 00:00:00 2001 From: yu-pei Date: Thu, 4 Nov 2021 16:42:30 -0400 Subject: [PATCH 19/41] WIP: removing receiver task for key array, reduce tasks --- parsec/interfaces/superscalar/collectives.c | 2 +- .../interfaces/superscalar/insert_function.c | 41 
+++++++++++++------ parsec/remote_dep_mpi.c | 2 +- .../dtd_test_broadcast_collective.c | 4 +- 4 files changed, 32 insertions(+), 17 deletions(-) diff --git a/parsec/interfaces/superscalar/collectives.c b/parsec/interfaces/superscalar/collectives.c index 146918074..036c61178 100644 --- a/parsec/interfaces/superscalar/collectives.c +++ b/parsec/interfaces/superscalar/collectives.c @@ -211,7 +211,7 @@ void parsec_dtd_broadcast( sizeof(int), &myrank, PARSEC_VALUE | PARSEC_AFFINITY, PARSEC_DTD_ARG_END); parsec_dtd_task_t *dtd_retrieve_task = (parsec_dtd_task_t *)retrieve_task; - parsec_insert_dtd_task(dtd_retrieve_task); + //parsec_insert_dtd_task(dtd_retrieve_task); } /* Post the bcast tasks for the actual data */ diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index 8d022a560..70da4e714 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -1793,16 +1793,16 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, //fprintf(stderr, "iterate successor on rank %d, key2 %d remote dep %p with task %p\n", es->virtual_process->parsec_context->my_rank, data_ptr[0], dep, dtd_task); parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key); - parsec_dtd_remote_task_release(this_task); /* decrease the count as in the data flush */ + //parsec_dtd_remote_task_release(this_task); /* decrease the count as in the data flush */ /* releasing the receiver task as the only desc task */ tile = FLOW_OF(current_task, current_dep)->tile; parsec_dtd_tile_retain(tile); - current_desc = (DESC_OF(current_task, current_dep))->task; - current_desc->super.data[0].data_in = current_task->super.data[current_dep].data_out; - (void)parsec_atomic_fetch_inc_int32( &current_task->super.data[current_dep].data_out->readers ); - ontask( es, (parsec_task_t *)current_desc, (parsec_task_t *)current_task, - &deps, &data, current_task->rank, my_rank, vpid_dst,
ontask_arg ); + //current_desc = (DESC_OF(current_task, current_dep))->task; + //current_desc->super.data[0].data_in = current_task->super.data[current_dep].data_out; + //(void)parsec_atomic_fetch_inc_int32( &current_task->super.data[current_dep].data_out->readers ); + //ontask( es, (parsec_task_t *)current_desc, (parsec_task_t *)current_task, + // &deps, &data, current_task->rank, my_rank, vpid_dst, ontask_arg ); } } else { /* on the receiver side, get datatype to aquire datatype, arena etc info */ @@ -3005,13 +3005,6 @@ parsec_insert_dtd_task(parsec_task_t *__this_task) parsec_dtd_remote_task_retain( this_task ); } - //if(this_task->super.task_class->task_class_id == PARSEC_DTD_BCAST_DATA_TC_ID) { - //sleep(1); - // uint64_t key = ((this_task->super.locals[0].value)<<32) | (1U<<0); - // parsec_hash_table_t *hash_table = dtd_tp->keys_hash_table; - // dtd_hash_table_pointer_item_t *item = (dtd_hash_table_pointer_item_t *)parsec_hash_table_nolock_find( hash_table, (parsec_key_t)key ); - // fprintf(stderr, "inserting bcast data task and finding in hashtable with key %d, result %p\n", this_task->super.locals[0].value, item); - //} /* In the next segment we resolve the dependencies of each flow */ for( flow_index = 0, tile = NULL, tile_op_type = 0; flow_index < tc->nb_flows; flow_index ++ ) { @@ -3334,6 +3327,28 @@ parsec_insert_dtd_task(parsec_task_t *__this_task) /* Increase the count of satisfied flows to counter-balance the increase in the * number of expected flows done during the task creation.
*/ satisfied_flow++; + + if(parsec_dtd_task_is_remote(this_task) && this_task->super.task_class->task_class_id == PARSEC_DTD_BCAST_KEY_TC_ID) { + parsec_dtd_flow_info_t *flow = FLOW_OF(this_task, 0); + uint64_t key = ((uint64_t)(this_task->super.locals[0].value)<<32) | (1U<<0); + parsec_hash_table_lock_bucket(dtd_tp->task_hash_table, (parsec_key_t)key); + parsec_remote_deps_t *dep = parsec_dtd_find_remote_dep( dtd_tp, key ); + if( NULL == dep ) { + if( !(flow->flags & TASK_INSERTED) ) { + flow->flags |= TASK_INSERTED; + parsec_dtd_track_task( dtd_tp, key, this_task ); + fprintf(stderr, "tracking remote task of key %d on rank %d\n", this_task->super.locals[0].value, dtd_tp->super.context->my_rank); + } + } else { + if( !(flow->flags & TASK_INSERTED) ) { + flow->flags |= TASK_INSERTED; + parsec_dtd_untrack_remote_dep( dtd_tp, key ); + parsec_dtd_track_task( dtd_tp, key, this_task ); + remote_dep_dequeue_delayed_dep_release(dep); + } + } + parsec_hash_table_unlock_bucket(dtd_tp->task_hash_table, (parsec_key_t)key); + } #if defined(PARSEC_PROF_TRACE) if(parsec_dtd_profile_verbose) diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index de457a8d5..436bc1b55 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -750,7 +750,7 @@ remote_dep_get_datatypes(parsec_execution_stream_t* es, if( NULL == dtd_task ) { return_defer = 1; - //fprintf(stderr, "defer receive\n"); + fprintf(stderr, "defer receive for key %d k %d\n", origin->msg.locals[0].value, k); /* AM buffers are reused by the comm engine once the activation * has been conveyed to upper layer. 
In case of DTD we might receive msg to diff --git a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c index 082027fe5..67547e7d6 100644 --- a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c +++ b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c @@ -287,7 +287,7 @@ int test_broadcast_mixed( //} } -for(int iter=1; iter <= 1; iter++) { +for(int iter=1; iter <= 0; iter++) { // Second round of broadcast, create another array of keys for this bcast //key_root = B->data_key(B, root+iter*world, 0); key_root = B->data_key(B, root, 0); @@ -410,7 +410,7 @@ int main(int argc, char **argv) { // Root node for the broadcast operation - //sleep(30); + sleep(30); // // Simple broadcast From f873ec7625fc9ba11bdf726b7b825f5c89b71fdb Mon Sep 17 00:00:00 2001 From: yu-pei Date: Thu, 11 Nov 2021 17:45:28 -0500 Subject: [PATCH 20/41] use correct key to retrieve saved deps --- .../interfaces/superscalar/insert_function.c | 23 ++--- .../superscalar/overlap_strategies.c | 85 ++++++++++--------- parsec/remote_dep_mpi.c | 5 +- 3 files changed, 59 insertions(+), 54 deletions(-) diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index 70da4e714..7e0039b69 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -60,8 +60,8 @@ static int parsec_dtd_profile_verbose = 0; static parsec_dc_key_t parsec_dtd_dc_id = 0; int32_t __parsec_dtd_is_initialized = 0; /**< Indicates init of dtd environment is completed */ -int parsec_dtd_window_size = 8000; /**< Default window size */ -int parsec_dtd_threshold_size = 16000; /**< Default threshold size of tasks for master thread to wait on */ +int parsec_dtd_window_size = 2; /**< Default window size */ +int parsec_dtd_threshold_size = 2; /**< Default threshold size of tasks for master thread to wait on */ static int 
parsec_dtd_task_hash_table_size = 1<<16; /**< Default task hash table size */ static int parsec_dtd_tile_hash_table_size = 1<<16; /**< Default tile hash table size */ static int parsec_dtd_no_of_arenas_datatypes = 16; @@ -1790,7 +1790,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key2); } //parsec_dtd_task_t* dtd_task = parsec_dtd_find_task(tp, key); - //fprintf(stderr, "iterate successor on rank %d, key2 %d remote dep %p with task %p\n", es->virtual_process->parsec_context->my_rank, data_ptr[0], dep, dtd_task); + fprintf(stderr, "iterate successor on rank %d, key2 %d with task %p\n", es->virtual_process->parsec_context->my_rank, data_ptr[0], dtd_task); parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key); //parsec_dtd_remote_task_release(this_task); /* decrease the count as in the data flush */ @@ -2676,11 +2676,11 @@ parsec_dtd_set_descendant(parsec_dtd_task_t *parent_task, uint8_t parent_flow_in parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key); key = ((uint64_t)(data_ptr[0])<<32) | (1U<<0); parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key); - parsec_remote_deps_t *dep = parsec_dtd_find_task(tp, key); - real_parent_task->super.locals[0].value = data_ptr[0]; + dep = parsec_dtd_find_task(tp, key); + real_parent_task->super.locals[0].value = real_parent_task->ht_item.key = data_ptr[0]; populate_remote_deps(data_ptr, real_parent_task->deps_out); } - //fprintf(stderr, "inserting bcast data task and finding in hashtable with key %d, result %p\n", real_parent_task->super.locals[0].value, item); + fprintf(stderr, "inserting bcast data task and finding in hashtable with key %llu %d, result %p dep %p\n", key, real_parent_task->super.locals[0].value, item, dep); } if( NULL == dep ) { if( !(flow->flags & TASK_INSERTED) ) { @@ -2935,7 +2935,7 @@ parsec_dtd_bcast_data_fn( parsec_execution_stream_t *es, parsec_task_t 
*this_tas { (void)es; (void)this_task; - //fprintf(stderr, "bcast_data_fn executed on rank %d\n", es->virtual_process->parsec_context->my_rank); + fprintf(stderr, "bcast_data_fn %p executed on rank %d\n", this_task, es->virtual_process->parsec_context->my_rank); return PARSEC_HOOK_RETURN_DONE; } @@ -2966,7 +2966,8 @@ parsec_dtd_block_if_threshold_reached(parsec_dtd_taskpool_t *dtd_tp, int task_th if( dtd_tp->task_window_size < parsec_dtd_window_size ) { dtd_tp->task_window_size *= 2; } else { - fprintf(stderr, "block function in rank %d with local task inserted %d\n", dtd_tp->super.context->my_rank, dtd_tp->local_task_inserted); + if(dtd_tp->local_task_inserted>0) + fprintf(stderr, "block function in rank %d with local task inserted %d\n", dtd_tp->super.context->my_rank, dtd_tp->local_task_inserted); parsec_execute_and_come_back(&dtd_tp->super, task_threshold); return 1; /* Indicating we blocked */ @@ -3314,8 +3315,8 @@ parsec_insert_dtd_task(parsec_task_t *__this_task) dtd_tp->local_task_inserted++; PARSEC_DEBUG_VERBOSE(parsec_dtd_dump_traversal_info, parsec_dtd_debug_output, "Task generated -> %s %d rank %d\n", this_task->super.task_class->name, this_task->ht_item.key, this_task->rank); - if(this_task->rank == 0) { - //fprintf(stderr, "Task generated -> %s %d rank %d\n", this_task->super.task_class->name, this_task->ht_item.key, this_task->rank); + if(this_task->rank == 1) { + fprintf(stderr, "Task generated -> %s %d rank %d\n", this_task->super.task_class->name, this_task->ht_item.key, this_task->rank); } } @@ -3337,7 +3338,7 @@ parsec_insert_dtd_task(parsec_task_t *__this_task) if( !(flow->flags & TASK_INSERTED) ) { flow->flags |= TASK_INSERTED; parsec_dtd_track_task( dtd_tp, key, this_task ); - fprintf(stderr, "tracking remote task of key %d on rank %d\n", this_task->super.locals[0].value, dtd_tp->super.context->my_rank); + //fprintf(stderr, "tracking remote task of key %d on rank %d\n", this_task->super.locals[0].value, dtd_tp->super.context->my_rank); } } 
else { if( !(flow->flags & TASK_INSERTED) ) { diff --git a/parsec/interfaces/superscalar/overlap_strategies.c b/parsec/interfaces/superscalar/overlap_strategies.c index 8daef5094..5115f276f 100644 --- a/parsec/interfaces/superscalar/overlap_strategies.c +++ b/parsec/interfaces/superscalar/overlap_strategies.c @@ -168,6 +168,49 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, if( NULL == tile ) { continue; } + + if(PARSEC_DTD_BCAST_DATA_TC_ID == current_task->super.task_class->task_class_id) { + /* for the bcast data class, in addition to release the data to local deps tasks that will read the data + * propagate the data down to descendants as well */ + if(current_task->deps_out != NULL) { + /* we have not propagate the remote deps yet, otherwise will be set to NULL */ + if(action_mask & PARSEC_ACTION_COMPLETE_LOCAL_TASK) { + assert(NULL != current_task->super.data[current_dep].data_out); + fprintf(stderr, "bcast root task %p data with global key %d\n", current_task, current_task->ht_item.key); + current_task->deps_out->output[0].data.data = + current_task->super.data[current_dep].data_out; + //(void)parsec_atomic_fetch_inc_int32(&current_task->super.data[current_dep].data_out->readers); + parsec_remote_dep_activate( + es, (parsec_task_t *)current_task, + current_task->deps_out, + current_task->deps_out->outgoing_mask); + current_task->deps_out = NULL; + } else if(action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { + /* current node is part of the broadcast operation, propagate downstream */ + int root = current_task->deps_out->root; + int my_rank = current_task->super.taskpool->context->my_rank; + int _array_pos, _array_mask; + struct remote_dep_output_param_s* output; + output = &current_task->deps_out->output[0]; + _array_pos = my_rank / (8 * sizeof(uint32_t)); + _array_mask = 1 << (my_rank % (8 * sizeof(uint32_t))); + //fprintf(stderr, "bcast data continue on rank %d, from root %d, for task %p\n", my_rank, root, current_task); + + if
((output->rank_bits[_array_pos] & _array_mask)) { + assert(NULL != current_task->super.data[current_dep].data_out); + + current_task->deps_out->output[0].data.data = + current_task->super.data[0].data_out; + //(void)parsec_atomic_fetch_inc_int32(&current_task->super.data[current_dep].data_out->readers); + parsec_remote_dep_activate( + es, (parsec_task_t *)current_task, + current_task->deps_out, + current_task->deps_out->outgoing_mask); + current_task->deps_out = NULL; + } + } + } + } /* BCAST DATA propagation */ if( FLOW_OF(current_task, current_dep)->op_type & PARSEC_DONT_TRACK ) { /* User has instructed us not to track this data */ @@ -220,48 +263,6 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, assert(current_desc != NULL); #endif - if(PARSEC_DTD_BCAST_DATA_TC_ID == current_task->super.task_class->task_class_id) { - /* for the bcast data class, in addition to release the data to local deps tasks that will read the data - * propagate the data down to descendants as well */ - if(current_task->deps_out != NULL) { - /* we have not propagate the remote deps yet, otherwise will be set to NULL */ - if(action_mask & PARSEC_ACTION_COMPLETE_LOCAL_TASK) { - assert(NULL != current_task->super.data[current_dep].data_out); - //fprintf(stderr, "bcast root data with global key %d\n", current_task->ht_item.key); - current_task->deps_out->output[0].data.data = - current_task->super.data[current_dep].data_out; - //(void)parsec_atomic_fetch_inc_int32(&current_task->super.data[current_dep].data_out->readers); - parsec_remote_dep_activate( - es, (parsec_task_t *)current_task, - current_task->deps_out, - current_task->deps_out->outgoing_mask); - current_task->deps_out = NULL; - } else if(action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { - /* current node is part of the broadcast operation, propagate downstream */ - int root = current_task->deps_out->root; - int my_rank = current_task->super.taskpool->context->my_rank; - int _array_pos, _array_mask; - struct
remote_dep_output_param_s* output; - output = &current_task->deps_out->output[0]; - _array_pos = my_rank / (8 * sizeof(uint32_t)); - _array_mask = 1 << (my_rank % (8 * sizeof(uint32_t))); - //fprintf(stderr, "bcast data continue on rank %d, from root %d, for task %p\n", my_rank, root, current_task); - - if ((output->rank_bits[_array_pos] & _array_mask)) { - assert(NULL != current_task->super.data[current_dep].data_out); - - current_task->deps_out->output[0].data.data = - current_task->super.data[0].data_out; - //(void)parsec_atomic_fetch_inc_int32(&current_task->super.data[current_dep].data_out->readers); - parsec_remote_dep_activate( - es, (parsec_task_t *)current_task, - current_task->deps_out, - current_task->deps_out->outgoing_mask); - current_task->deps_out = NULL; - } - } - } - } /* BCAST DATA propagation */ /* setting data */ data.data = current_task->super.data[current_dep].data_out; diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index 436bc1b55..467bdb11e 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -750,7 +750,7 @@ remote_dep_get_datatypes(parsec_execution_stream_t* es, if( NULL == dtd_task ) { return_defer = 1; - fprintf(stderr, "defer receive for key %d k %d\n", origin->msg.locals[0].value, k); + fprintf(stderr, "defer receive for key %d k %d %llu\n", origin->msg.locals[0].value, k, key); /* AM buffers are reused by the comm engine once the activation * has been conveyed to upper layer. In case of DTD we might receive msg to @@ -2127,6 +2127,9 @@ remote_dep_mpi_save_activate_cb(parsec_execution_stream_t* es, &deps->msg, dep_count, dep_dtt, dep_comm); deps->from = status->MPI_SOURCE; + //if(es->virtual_process->parsec_context->my_rank == 1){ + // fprintf(stderr, "save activate cb with value %d\n", deps->msg.locals[0].value); + //} /* Retrieve the data arenas and update the msg.incoming_mask to reflect * the data we should be receiving from the predecessor.
*/ From 31445b0210b98910a50795a8d75a6e9513370230 Mon Sep 17 00:00:00 2001 From: yu-pei Date: Fri, 12 Nov 2021 11:52:44 -0500 Subject: [PATCH 21/41] remove prints --- .../interfaces/superscalar/insert_function.c | 18 +++++++++--------- .../superscalar/overlap_strategies.c | 2 +- parsec/remote_dep_mpi.c | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index 7e0039b69..c7f4d0756 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -60,8 +60,8 @@ static int parsec_dtd_profile_verbose = 0; static parsec_dc_key_t parsec_dtd_dc_id = 0; int32_t __parsec_dtd_is_initialized = 0; /**< Indicates init of dtd environment is completed */ -int parsec_dtd_window_size = 2; /**< Default window size */ -int parsec_dtd_threshold_size = 2; /**< Default threshold size of tasks for master thread to wait on */ +int parsec_dtd_window_size = 8000; /**< Default window size */ +int parsec_dtd_threshold_size = 4000; /**< Default threshold size of tasks for master thread to wait on */ static int parsec_dtd_task_hash_table_size = 1<<16; /**< Default task hash table size */ static int parsec_dtd_tile_hash_table_size = 1<<16; /**< Default tile hash table size */ static int parsec_dtd_no_of_arenas_datatypes = 16; @@ -1790,7 +1790,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key2); } //parsec_dtd_task_t* dtd_task = parsec_dtd_find_task(tp, key); - fprintf(stderr, "iterate successor on rank %d, key2 %d with task %p\n", es->virtual_process->parsec_context->my_rank, data_ptr[0], dtd_task); + //fprintf(stderr, "iterate successor on rank %d, key2 %d with task %p\n", es->virtual_process->parsec_context->my_rank, data_ptr[0], dtd_task); parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key); 
//parsec_dtd_remote_task_release(this_task); /* decrease the count as in the data flush */ @@ -2680,7 +2680,7 @@ parsec_dtd_set_descendant(parsec_dtd_task_t *parent_task, uint8_t parent_flow_in real_parent_task->super.locals[0].value = real_parent_task->ht_item.key = data_ptr[0]; populate_remote_deps(data_ptr, real_parent_task->deps_out); } - fprintf(stderr, "inserting bcast data task and finding in hashtable with key %llu %d, result %p dep %p\n", key, real_parent_task->super.locals[0].value, item, dep); + //fprintf(stderr, "inserting bcast data task and finding in hashtable with key %llu %d, result %p dep %p\n", key, real_parent_task->super.locals[0].value, item, dep); } if( NULL == dep ) { if( !(flow->flags & TASK_INSERTED) ) { @@ -2935,7 +2935,7 @@ parsec_dtd_bcast_data_fn( parsec_execution_stream_t *es, parsec_task_t *this_tas { (void)es; (void)this_task; - fprintf(stderr, "bcast_data_fn %p executed on rank %d\n", this_task, es->virtual_process->parsec_context->my_rank); + //fprintf(stderr, "bcast_data_fn %p executed on rank %d\n", this_task, es->virtual_process->parsec_context->my_rank); return PARSEC_HOOK_RETURN_DONE; } @@ -2967,7 +2967,7 @@ parsec_dtd_block_if_threshold_reached(parsec_dtd_taskpool_t *dtd_tp, int task_th dtd_tp->task_window_size *= 2; } else { if(dtd_tp->local_task_inserted>0) - fprintf(stderr, "block function in rank %d with local task inserted %d\n", dtd_tp->super.context->my_rank, dtd_tp->local_task_inserted); + //fprintf(stderr, "block function in rank %d with local task inserted %d\n", dtd_tp->super.context->my_rank, dtd_tp->local_task_inserted); parsec_execute_and_come_back(&dtd_tp->super, task_threshold); return 1; /* Indicating we blocked */ @@ -3315,9 +3315,9 @@ parsec_insert_dtd_task(parsec_task_t *__this_task) dtd_tp->local_task_inserted++; PARSEC_DEBUG_VERBOSE(parsec_dtd_dump_traversal_info, parsec_dtd_debug_output, "Task generated -> %s %d rank %d\n", this_task->super.task_class->name, this_task->ht_item.key, this_task->rank); - 
if(this_task->rank == 1) { - fprintf(stderr, "Task generated -> %s %d rank %d\n", this_task->super.task_class->name, this_task->ht_item.key, this_task->rank); - } + //if(this_task->rank == 1) { + //fprintf(stderr, "Task generated -> %s %d rank %d\n", this_task->super.task_class->name, this_task->ht_item.key, this_task->rank); + //} } /* Releasing every remote_task */ diff --git a/parsec/interfaces/superscalar/overlap_strategies.c b/parsec/interfaces/superscalar/overlap_strategies.c index 5115f276f..993e2b730 100644 --- a/parsec/interfaces/superscalar/overlap_strategies.c +++ b/parsec/interfaces/superscalar/overlap_strategies.c @@ -176,7 +176,7 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, /* we have not propagate the remote deps yet, otherwise will be set to NULL */ if(action_mask & PARSEC_ACTION_COMPLETE_LOCAL_TASK) { assert(NULL != current_task->super.data[current_dep].data_out); - fprintf(stderr, "bcast root task %p data with global key %d\n", current_task, current_task->ht_item.key); + //fprintf(stderr, "bcast root task %p data with global key %d\n", current_task, current_task->ht_item.key); current_task->deps_out->output[0].data.data = current_task->super.data[current_dep].data_out; //(void)parsec_atomic_fetch_inc_int32(&current_task->super.data[current_dep].data_out->readers); diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index 467bdb11e..4422bf708 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -750,7 +750,7 @@ remote_dep_get_datatypes(parsec_execution_stream_t* es, if( NULL == dtd_task ) { return_defer = 1; - fprintf(stderr, "defer receive for key %d k %d %llu\n", origin->msg.locals[0].value, k, key); + //fprintf(stderr, "defer receive for key %d k %d %llu\n", origin->msg.locals[0].value, k, key); /* AM buffers are reused by the comm engine once the activation * has been conveyed to upper layer.
In case of DTD we might receive msg to From c46a57dc65efeedb043a71a6ff9872666b8205c9 Mon Sep 17 00:00:00 2001 From: yu-pei Date: Tue, 23 Nov 2021 14:23:36 -0500 Subject: [PATCH 22/41] clean up the not needed bcast release deps func --- .../interfaces/superscalar/insert_function.c | 117 ------------------ parsec/remote_dep_mpi.c | 2 +- 2 files changed, 1 insertion(+), 118 deletions(-) diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index c7f4d0756..3b5075c76 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -107,12 +107,6 @@ parsec_dtd_release_deps(parsec_execution_stream_t *, parsec_task_t *, uint32_t, parsec_remote_deps_t *); -static int -parsec_dtd_bcast_key_release_deps(parsec_execution_stream_t *, - parsec_task_t *, - uint32_t, parsec_remote_deps_t *); - - static parsec_hook_return_t complete_hook_of_dtd(parsec_execution_stream_t *, parsec_task_t *); @@ -1947,115 +1941,6 @@ parsec_dtd_release_deps(parsec_execution_stream_t *es, return 0; } -static int -parsec_dtd_bcast_key_release_deps(parsec_execution_stream_t *es, - parsec_task_t *this_task, - uint32_t action_mask, - parsec_remote_deps_t *deps) -{ - (void)deps; - parsec_release_dep_fct_arg_t arg; - int __vp_id; - - assert(NULL != es); - - PARSEC_PINS(es, RELEASE_DEPS_BEGIN, this_task); -#if defined(DISTRIBUTED) - arg.remote_deps = deps; -#endif /* defined(DISTRIBUTED) */ - - arg.action_mask = action_mask; - arg.output_usage = 0; - arg.output_entry = NULL; - arg.ready_lists = alloca(sizeof(parsec_task_t *) * es->virtual_process->parsec_context->nb_vp); - - for (__vp_id = 0; __vp_id < es->virtual_process->parsec_context->nb_vp; __vp_id++) - arg.ready_lists[__vp_id] = NULL; - - parsec_dtd_task_t *this_dtd_task = NULL; - const parsec_task_class_t *tc = this_task->task_class; - parsec_dtd_taskpool_t *tp = (parsec_dtd_taskpool_t *)this_task->taskpool; - - if( (action_mask & 
PARSEC_ACTION_COMPLETE_LOCAL_TASK) ) { - /* root of the bcast key operation */ - this_dtd_task = (parsec_dtd_task_t *)this_task; - //fprintf(stderr, "bcast key release on rank %d\n", this_dtd_task->rank); - //parsec_dtd_remote_task_retain(this_dtd_task); - } else { - int flow_index, track_flow = 0; - for(flow_index = 0; flow_index < tc->nb_flows; flow_index++) { - if((action_mask & (1 << flow_index))) { - if(!(track_flow & (1U << flow_index))) { - uint64_t key = (((uint64_t)this_task->locals[0].value<<32) | (1U<task_hash_table, (parsec_key_t)key); - this_dtd_task = parsec_dtd_find_task( tp, key ); - assert(this_dtd_task != NULL); - - if( this_task->data[flow_index].data_out != NULL ) { - assert(this_task->data[flow_index].data_out != NULL); - this_dtd_task->super.data[flow_index].data_in = this_task->data[flow_index].data_in; - this_dtd_task->super.data[flow_index].data_out = this_task->data[flow_index].data_out; - parsec_dtd_retain_data_copy(this_task->data[flow_index].data_out); - - } - track_flow |= (1U<task_hash_table, (parsec_key_t)key); - } - } - } - } - - //int *data_ptr; - //data_ptr = (int*)parsec_data_copy_get_ptr(parsec_data_copy); - assert(NULL != this_dtd_task); - tc->iterate_successors(es, (parsec_task_t*)this_dtd_task, action_mask, dtd_release_dep_fct, &arg); - -#if defined(DISTRIBUTED) - /* We perform this only for remote tasks that are being activated - * from the comm engine. We remove the task from the hash table - * for each flow a rank is concerned about. 
- */ - if( parsec_dtd_task_is_remote(this_dtd_task) && !(action_mask & PARSEC_ACTION_COMPLETE_LOCAL_TASK) ) { - int flow_index, track_flow = 0; - for(flow_index = 0; flow_index < tc->nb_flows; flow_index++) { - if((action_mask & (1 << flow_index))) { - if(!(track_flow & (1U << flow_index))) { - uint64_t key = (((uint64_t)this_task->locals[0].value<<32) | (1U<task_hash_table, (parsec_key_t)key); - if( NULL != parsec_dtd_untrack_task( tp, key) ) { - /* also releasing task */ - parsec_dtd_remote_task_release( this_dtd_task ); - } - track_flow |= (1U<task_hash_table, (parsec_key_t)key); - } - } - } - } -#else - (void)deps; -#endif - - /* Scheduling tasks */ - if (action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { - parsec_vp_t **vps = es->virtual_process->parsec_context->virtual_processes; - for (__vp_id = 0; __vp_id < es->virtual_process->parsec_context->nb_vp; __vp_id++) { - if (NULL == arg.ready_lists[__vp_id]) { - continue; - } - if (__vp_id == es->virtual_process->vp_id) { - __parsec_schedule(es, arg.ready_lists[__vp_id], 0); - }else { - __parsec_schedule(vps[__vp_id]->execution_streams[0], arg.ready_lists[__vp_id], 0); - } - arg.ready_lists[__vp_id] = NULL; - } - } - - PARSEC_PINS(es, RELEASE_DEPS_END, this_task); - return 0; -} - /* **************************************************************************** */ /** * This function is called internally by PaRSEC once a task is done @@ -2455,8 +2340,6 @@ parsec_dtd_create_task_class( parsec_dtd_taskpool_t *__tp, parsec_dtd_funcptr_t* tc->iterate_successors = parsec_dtd_bcast_key_iterate_successors; tc->iterate_predecessors = NULL; tc->release_deps = parsec_dtd_release_deps; - if(tc->task_class_id == PARSEC_DTD_BCAST_KEY_TC_ID) - tc->release_deps = parsec_dtd_bcast_key_release_deps; tc->prepare_input = data_lookup_of_dtd_task; tc->prepare_output = output_data_of_dtd_task; tc->get_datatype = (parsec_datatype_lookup_t *)datatype_lookup_of_dtd_task; diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index 
4422bf708..8b0d1ad16 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -933,7 +933,7 @@ remote_dep_release_incoming(parsec_execution_stream_t* es, remote_dep_complete_and_cleanup(&origin, 1); } else { //remote_dep_complete_and_cleanup(&origin, 1); - //remote_deps_free(origin); + remote_deps_free(origin); //remote_dep_dec_flying_messages(task.taskpool); } From c736feae2487bb15d6c7da238152aef722cc6037 Mon Sep 17 00:00:00 2001 From: yu-pei Date: Tue, 23 Nov 2021 15:24:59 -0500 Subject: [PATCH 23/41] move broadcast code into own file --- parsec/interfaces/superscalar/CMakeLists.txt | 1 + parsec/interfaces/superscalar/collectives.c | 61 +--- .../interfaces/superscalar/insert_function.c | 260 ------------------ .../superscalar/insert_function_internal.h | 11 +- .../superscalar/parsec_dtd_broadcast.c | 218 +++++++++++++++ 5 files changed, 231 insertions(+), 320 deletions(-) create mode 100644 parsec/interfaces/superscalar/parsec_dtd_broadcast.c diff --git a/parsec/interfaces/superscalar/CMakeLists.txt b/parsec/interfaces/superscalar/CMakeLists.txt index 3186c48e2..4ff096b0d 100644 --- a/parsec/interfaces/superscalar/CMakeLists.txt +++ b/parsec/interfaces/superscalar/CMakeLists.txt @@ -3,6 +3,7 @@ if( BUILD_PARSEC ) ${CMAKE_CURRENT_SOURCE_DIR}/interfaces/superscalar/parsec_dtd_data_flush.c ${CMAKE_CURRENT_SOURCE_DIR}/interfaces/superscalar/overlap_strategies.c ${CMAKE_CURRENT_SOURCE_DIR}/interfaces/superscalar/insert_function.c + ${CMAKE_CURRENT_SOURCE_DIR}/interfaces/superscalar/parsec_dtd_broadcast.c ${CMAKE_CURRENT_SOURCE_DIR}/interfaces/superscalar/collectives.c) INSTALL(FILES diff --git a/parsec/interfaces/superscalar/collectives.c b/parsec/interfaces/superscalar/collectives.c index 036c61178..829f3fce8 100644 --- a/parsec/interfaces/superscalar/collectives.c +++ b/parsec/interfaces/superscalar/collectives.c @@ -10,8 +10,6 @@ #ifdef PARSEC_DTD_DIST_COLLECTIVES -/* static parsec_lifo_t parsec_dep_lifo; */ - /** * Create and return 
`parsec_remote_deps_t` structure associated with * the broadcast of the a data to all the nodes set in the @@ -71,44 +69,14 @@ parsec_remote_deps_t* parsec_dtd_create_remote_deps( _array_pos = myrank / (8 * sizeof(uint32_t)); _array_mask = 1 << (myrank % (8 * sizeof(uint32_t))); - //if( !(output->rank_bits[_array_pos] & _array_mask) ) { - output->rank_bits[_array_pos] |= _array_mask; - output->deps_mask |= (1 << 0); /* not used by DTD? */ - output->count_bits++; - //} + output->rank_bits[_array_pos] |= _array_mask; + output->deps_mask |= (1 << 0); /* not used by DTD? */ + output->count_bits++; } return deps; } -/** - * Free remote deps if it does not involve any participants. - **/ -static -int remote_deps_free_if_empty(parsec_remote_deps_t* deps) { - - // Return 1 if the remote_deps has no participants, 0 otherwise. - int ret = 0; - - struct remote_dep_output_param_s* output = &deps->output[0]; - - // TODO: loop through the whole output array are use max_priority - // instead - if (output->count_bits <= 0) { - // No participants - - deps->pending_ack = 0; - deps->incoming_mask = 0; - deps->outgoing_mask = 0; - remote_deps_free(deps); - - // Indicate that remote deps is empty - ret = 1; - } - - return ret; -} - /** * Perform a broadcast for of the dtd tile `dtd_tile_root` from the * root node associated with the rank `root` to the nodes with ranks @@ -140,7 +108,6 @@ void parsec_dtd_broadcast( data_ptr[400+i+1] = dest_ranks[i]; } } - //fprintf(stderr, "finished bcast key packing\n"); // Retrieve DTD tile's data_copy parsec_data_copy_t *data_copy = dtd_tile_root->data_copy; parsec_data_copy_t *key_copy = bcast_keys_root->data_copy; @@ -186,34 +153,12 @@ void parsec_dtd_broadcast( /* nothing here since the key is stored in the key array and will be updated before remote_dep_activate */ }else{ bcast_id = ( (1<<29) | (root << 20) | (dtd_tp->recv_task_id[root] -1)); - //bcast_id = ( (1<<29) | (dtd_tp->recv_task_id[root] )); dtd_bcast_key_root->ht_item.key = bcast_id; 
dtd_bcast_key_root->super.locals[0].value = dtd_bcast_key_root->ht_item.key; } /* Post the bcast of keys and ranks array */ parsec_insert_dtd_task(dtd_bcast_key_root); - - if(myrank == root) { - //for (int dest_rank = 0; dest_rank < num_dest_ranks; ++dest_rank) { - // parsec_task_t *retrieve_task = parsec_dtd_taskpool_create_task( - // dtd_tp, parsec_dtd_aux_fn, 0, "retrieve_task", - // PASSED_BY_REF, bcast_keys_root, PARSEC_INPUT | bcast_arena_index, - // sizeof(int), &dest_ranks[dest_rank], PARSEC_VALUE | PARSEC_AFFINITY, - // PARSEC_DTD_ARG_END); - // parsec_dtd_task_t *dtd_retrieve_task = (parsec_dtd_task_t *)retrieve_task; - // parsec_insert_dtd_task(dtd_retrieve_task); - //} - }else { - parsec_task_t *retrieve_task = parsec_dtd_taskpool_create_task( - dtd_tp, parsec_dtd_bcast_key_recv, 0, "retrieve_task", - PASSED_BY_REF, bcast_keys_root, PARSEC_INPUT | bcast_arena_index, - sizeof(int), &myrank, PARSEC_VALUE | PARSEC_AFFINITY, - PARSEC_DTD_ARG_END); - parsec_dtd_task_t *dtd_retrieve_task = (parsec_dtd_task_t *)retrieve_task; - //parsec_insert_dtd_task(dtd_retrieve_task); - } - /* Post the bcast tasks for the actual data */ parsec_insert_dtd_task(dtd_bcast_task_root); } diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index 3b5075c76..b1bb0c565 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -96,12 +96,6 @@ parsec_dtd_iterate_successors(parsec_execution_stream_t *es, uint32_t action_mask, parsec_ontask_function_t *ontask, void *ontask_arg); -static void -parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, - const parsec_task_t *this_task, - uint32_t action_mask, - parsec_ontask_function_t *ontask, - void *ontask_arg); static int parsec_dtd_release_deps(parsec_execution_stream_t *, parsec_task_t *, @@ -1583,242 +1577,6 @@ parsec_dtd_iterate_successors(parsec_execution_stream_t *es, action_mask, ontask, ontask_arg 
); } -/* when the comm_coll_bcast is 1 we use the chain topology, get the successor's rank */ -static int -get_chain_successor(parsec_execution_stream_t*es, parsec_task_t* task, parsec_remote_deps_t* remote_deps) -{ - int my_idx, idx, current_mask; - unsigned int array_index, count, bit_index; - uint32_t boffset; - uint32_t dep_fw_mask[es->virtual_process->parsec_context->remote_dep_fw_mask_sizeof]; - memset(dep_fw_mask, 0, es->virtual_process->parsec_context->remote_dep_fw_mask_sizeof); - memcpy(&dep_fw_mask, remote_deps->remote_dep_fw_mask, es->virtual_process->parsec_context->remote_dep_fw_mask_sizeof); - struct remote_dep_output_param_s* output = &remote_deps->output[0]; - boffset = remote_deps->root / (8 * sizeof(uint32_t)); - dep_fw_mask[boffset] |= ((uint32_t)1) << (remote_deps->root % (8 * sizeof(uint32_t))); - my_idx = (remote_deps->root == es->virtual_process->parsec_context->my_rank) ? 0 : -1; - idx = 0; - for(array_index = count = 0; count < remote_deps->output[0].count_bits; array_index++) { - current_mask = output->rank_bits[array_index]; - if( 0 == current_mask ) continue; - for( bit_index = 0; current_mask != 0; bit_index++ ) { - if( !(current_mask & (1 << bit_index)) ) continue; - int rank = (array_index * sizeof(uint32_t) * 8) + bit_index; - current_mask ^= (1 << bit_index); - count++; - - boffset = rank / (8 * sizeof(uint32_t)); - //if(dep_fw_mask[boffset] & ((uint32_t)1) << (rank % (8 * sizeof(uint32_t)))) - // continue; - idx++; - //if(es->virtual_process->parsec_context->my_rank == 6) - // fprintf(stderr, "idx %d, checking rank %d\n", idx, rank); - if(my_idx == -1) { - if(rank == es->virtual_process->parsec_context->my_rank) { - my_idx = idx; - } - boffset = rank / (8 * sizeof(uint32_t)); - dep_fw_mask[boffset] |= ((uint32_t)1) << (rank % (8 * sizeof(uint32_t))); - continue; - } - if(my_idx != -1){ - if(idx == my_idx+1) - { - return rank; - } - } - } - } - return -1; -} - -static int -populate_remote_deps(int* data_ptr, parsec_remote_deps_t* 
remote_deps) -{ - struct remote_dep_output_param_s* output = &remote_deps->output[0]; - int _array_pos, _array_mask; - uint32_t dest_rank_idx; - /* TODO: don't assume the length of data_ptr */ - int num_dest_ranks = data_ptr[400]; - for(dest_rank_idx = 0; dest_rank_idx < (uint32_t)num_dest_ranks; ++dest_rank_idx) { - uint32_t dest_rank = data_ptr[400+dest_rank_idx+1]; - _array_pos = dest_rank / (8 * sizeof(uint32_t)); - _array_mask = 1 << (dest_rank % (8 * sizeof(uint32_t))); - - if( !(output->rank_bits[_array_pos] & _array_mask) ) { - output->rank_bits[_array_pos] |= _array_mask; - output->count_bits++; - } - } -} - -static void -parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, - const parsec_task_t *this_task, - uint32_t action_mask, - parsec_ontask_function_t *ontask, - void *ontask_arg) -{ - parsec_dtd_task_t *current_task = (parsec_dtd_task_t *)this_task; - int current_dep; - parsec_dtd_task_t *current_desc = NULL; - int op_type_on_current_flow, desc_op_type, desc_flow_index; - parsec_dtd_tile_t *tile; - - parsec_dep_t deps; - parsec_release_dep_fct_arg_t *arg = (parsec_release_dep_fct_arg_t *)ontask_arg; - parsec_dep_data_description_t data; - int rank_src = 0, rank_dst = 0, vpid_dst=0; - parsec_dtd_flow_info_t* flow; - - /* finding for which flow we need to iterate successors of */ - int flow_mask = action_mask; - int my_rank = current_task->super.taskpool->context->my_rank; - int successor = -1; - - rank_src = current_task->rank; - - int rc; /* retrive the mca number for comm_coll_bcast */ - int comm_coll_bcast; /* retrive the value set for comm_coll_bcast */ - if (0 < (rc = parsec_mca_param_find("runtime", NULL, "comm_coll_bcast")) ) { - parsec_mca_param_lookup_int(rc, &comm_coll_bcast); - } - for( current_dep = 0; current_dep < current_task->super.task_class->nb_flows; current_dep++ ) { - if( (flow_mask & (1<deps_out); - int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); - 
current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) |(current_task->deps_out->root << 20) | *(data_ptr+1+successor)); - //fprintf(stderr, "bcast root dep %p with chain successor %d on rank %d value %d\n", current_task->deps_out, successor, my_rank, current_task->super.locals[0].value); - tile = FLOW_OF(current_task, current_dep)->tile; - parsec_dtd_tile_retain(tile); - parsec_remote_dep_activate( - es, (parsec_task_t *)current_task, - current_task->deps_out, - current_task->deps_out->outgoing_mask); - current_task->deps_out = NULL; - /* decrease the count as in the data flush */ - parsec_dtd_release_local_task( current_task ); - - } else if (action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { - /* a node in the key array propagation */ - int root = current_task->deps_out->root; - int my_rank = current_task->super.taskpool->context->my_rank; - - int _array_pos, _array_mask; - struct remote_dep_output_param_s* output; - output = ¤t_task->deps_out->output[0]; - _array_pos = my_rank / (8 * sizeof(uint32_t)); - _array_mask = 1 << (my_rank % (8 * sizeof(uint32_t))); - - if ((output->rank_bits[_array_pos] & _array_mask)) { - /* We are part of the broadcast, forward message */ - int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); - populate_remote_deps(data_ptr, current_task->deps_out); - successor = get_chain_successor(es, current_task, current_task->deps_out); - //fprintf(stderr, "continuation with chain successor %d on rank %d value %d, current_task %p, data_ptr %p deps_out %p rank bits %d\n", successor, my_rank, current_task->super.locals[0].value, current_task, data_ptr, current_task->deps_out, current_task->deps_out->output[0].rank_bits[0]); - if(successor == -1) { - current_task->deps_out->outgoing_mask = 0; - } - current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) | (root << 20) | *(data_ptr+1+successor)); - assert(NULL != current_task->super.data[current_dep].data_out); - - 
current_task->deps_out->output[0].data.data = current_task->super.data[0].data_out; - parsec_dtd_retain_data_copy(current_task->super.data[current_dep].data_out); - parsec_remote_dep_activate( - es, (parsec_task_t *)current_task, - current_task->deps_out, - current_task->deps_out->outgoing_mask); - current_task->deps_out = NULL; - /* update the BCAST DATA task or dep with the global ID that we know now */ - uint64_t key = ((uint64_t)(1<<28 | (root << 20 ) | data_ptr[es->virtual_process->parsec_context->my_rank+1])<<32) | (1U<<0); - uint64_t key2 = ((uint64_t)(data_ptr[0])<<32) | (1U<<0); - struct timespec rqtp; - uint64_t misses_in_a_row; - rqtp.tv_sec = 0; - misses_in_a_row = 2; - parsec_dtd_task_t* dtd_task = NULL; - parsec_dtd_taskpool_t *tp = (parsec_dtd_taskpool_t *)current_task->super.taskpool; - - /* - while(dtd_task == NULL) { - rqtp.tv_nsec = exponential_backoff(misses_in_a_row); - nanosleep(&rqtp, NULL); - misses_in_a_row++; - dtd_task = parsec_dtd_find_task(tp, key); - if(misses_in_a_row > 10) { - sleep(1); - fprintf(stderr, "finding dtd task with iteration %d for key %ld key2 %ld on rank %d\n", misses_in_a_row, key, key2, my_rank); - } - } - */ - - parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key); - dtd_task = parsec_dtd_find_task(tp, key); - if(dtd_task == NULL) { - int* buffer = malloc(sizeof(int)*30*30); - memcpy(buffer, data_ptr, sizeof(int)*30*30); - dtd_hash_table_pointer_item_t *item = (dtd_hash_table_pointer_item_t *)parsec_thread_mempool_allocate(tp->hash_table_bucket_mempool->thread_mempools); - parsec_hash_table_t *hash_table = tp->keys_hash_table; - item->ht_item.key = (parsec_key_t)key; - item->mempool_owner = tp->hash_table_bucket_mempool->thread_mempools; - item->value = (void *)buffer; - parsec_hash_table_nolock_insert( hash_table, &item->ht_item ); - } else { - parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key2); - parsec_remote_deps_t *dep = parsec_dtd_find_task(tp, key2); - - 
populate_remote_deps(data_ptr, dtd_task->deps_out); - parsec_dtd_untrack_task(tp, key); - if(dep == NULL){ - dtd_task->super.locals[0].value = data_ptr[0]; - parsec_dtd_track_task(tp, key2, dtd_task); - }else{ - - dtd_task->super.locals[0].value = data_ptr[0]; - parsec_dtd_untrack_remote_dep(tp, key2); - parsec_dtd_track_task(tp, key2, dtd_task); - remote_dep_dequeue_delayed_dep_release(dep); - } - parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key2); - } - //parsec_dtd_task_t* dtd_task = parsec_dtd_find_task(tp, key); - //fprintf(stderr, "iterate successor on rank %d, key2 %d with task %p\n", es->virtual_process->parsec_context->my_rank, data_ptr[0], dtd_task); - parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key); - - //parsec_dtd_remote_task_release(this_task); /* decrease the count as in the data flush */ - - /* releasing the receiver task as the only desc task */ - tile = FLOW_OF(current_task, current_dep)->tile; - parsec_dtd_tile_retain(tile); - //current_desc = (DESC_OF(current_task, current_dep))->task; - //current_desc->super.data[0].data_in = current_task->super.data[current_dep].data_out; - //(void)parsec_atomic_fetch_inc_int32( ¤t_task->super.data[current_dep].data_out->readers ); - //ontask( es, (parsec_task_t *)current_desc, (parsec_task_t *)current_task, - // &deps, &data, current_task->rank, my_rank, vpid_dst, ontask_arg ); - } - } else { - /* on the receiver side, get datatype to aquire datatype, arena etc info */ - data.data = current_task->super.data[current_dep].data_out; - data.arena = parsec_dtd_arenas_datatypes[FLOW_OF(current_task, current_dep)->arena_index].arena; - data.layout = parsec_dtd_arenas_datatypes[FLOW_OF(current_task, current_dep)->arena_index].opaque_dtt; - data.count = 1; - data.displ = 0; - deps.cond = NULL; - deps.ctl_gather_nb = NULL; - //deps.task_class_id = current_desc->super.task_class->task_class_id; - deps.flow = current_task->super.task_class->out[current_dep]; - deps.dep_index 
= desc_flow_index; - deps.belongs_to = current_task->super.task_class->out[current_dep]; - deps.direct_data = NULL; - deps.dep_datatype_index = current_dep; - ontask( es, (parsec_task_t *)current_task, (parsec_task_t *)current_task, - &deps, &data, current_task->rank, my_rank, vpid_dst, ontask_arg ); - } - } - } -} /* **************************************************************************** */ /** @@ -2786,24 +2544,6 @@ parsec_dtd_bcast_key_fn( parsec_execution_stream_t *es, parsec_task_t *this_task return PARSEC_HOOK_RETURN_DONE; } -/* **************************************************************************** */ -/** - * Body of bcast key receiver task we insert that will ensure propagation of the key array - * on the receiver side, empty body! - * - * @param context, this_task - * - * @ingroup DTD_INTERFACE_INTERNAL - */ -int -parsec_dtd_bcast_key_recv( parsec_execution_stream_t *es, parsec_task_t *this_task) -{ - (void)es; (void)this_task; - - //fprintf(stderr, "bcast_key_recv executed\n"); - return PARSEC_HOOK_RETURN_DONE; -} - /* **************************************************************************** */ /** * Body of bcast task we insert that will propagate the data tile we are broadcasting diff --git a/parsec/interfaces/superscalar/insert_function_internal.h b/parsec/interfaces/superscalar/insert_function_internal.h index cc4385cb0..b7a38029e 100644 --- a/parsec/interfaces/superscalar/insert_function_internal.h +++ b/parsec/interfaces/superscalar/insert_function_internal.h @@ -292,8 +292,6 @@ int parsec_dtd_bcast_key_fn( parsec_execution_stream_t *es, parsec_task_t *this_ int parsec_dtd_bcast_data_fn( parsec_execution_stream_t *es, parsec_task_t *this_task); -int parsec_dtd_bcast_key_recv( parsec_execution_stream_t *es, parsec_task_t *this_task); - void parsec_detach_all_dtd_taskpool_from_context( parsec_context_t *context ); @@ -533,6 +531,15 @@ int parsec_dtd_iterator_arg_get_size(int first_arg, void *tile, int tile_op_type, void *cb_data); +void 
+parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, + const parsec_task_t *this_task, + uint32_t action_mask, + parsec_ontask_function_t *ontask, + void *ontask_arg); + + + END_C_DECLS #endif /* INSERT_FUNCTION_INTERNAL_H_HAS_BEEN_INCLUDED */ diff --git a/parsec/interfaces/superscalar/parsec_dtd_broadcast.c b/parsec/interfaces/superscalar/parsec_dtd_broadcast.c new file mode 100644 index 000000000..c7f41a8ce --- /dev/null +++ b/parsec/interfaces/superscalar/parsec_dtd_broadcast.c @@ -0,0 +1,218 @@ +/** + * Copyright (c) 2013-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + */ + +#include "parsec/class/lifo.h" +#include "parsec/parsec_config.h" +#include "parsec/interfaces/superscalar/insert_function_internal.h" + +#ifdef PARSEC_DTD_DIST_COLLECTIVES +int +populate_remote_deps(int* data_ptr, parsec_remote_deps_t* remote_deps) +{ + struct remote_dep_output_param_s* output = &remote_deps->output[0]; + int _array_pos, _array_mask; + uint32_t dest_rank_idx; + /* TODO: don't assume the length of data_ptr */ + int num_dest_ranks = data_ptr[400]; + for(dest_rank_idx = 0; dest_rank_idx < (uint32_t)num_dest_ranks; ++dest_rank_idx) { + uint32_t dest_rank = data_ptr[400+dest_rank_idx+1]; + _array_pos = dest_rank / (8 * sizeof(uint32_t)); + _array_mask = 1 << (dest_rank % (8 * sizeof(uint32_t))); + + if( !(output->rank_bits[_array_pos] & _array_mask) ) { + output->rank_bits[_array_pos] |= _array_mask; + output->count_bits++; + } + } +} + +/* when the comm_coll_bcast is 1 we use the chain topology, get the successor's rank */ +int +get_chain_successor(parsec_execution_stream_t*es, parsec_task_t* task, parsec_remote_deps_t* remote_deps) +{ + int my_idx, idx, current_mask; + unsigned int array_index, count, bit_index; + uint32_t boffset; + uint32_t dep_fw_mask[es->virtual_process->parsec_context->remote_dep_fw_mask_sizeof]; + memset(dep_fw_mask, 0, 
es->virtual_process->parsec_context->remote_dep_fw_mask_sizeof); + memcpy(&dep_fw_mask, remote_deps->remote_dep_fw_mask, es->virtual_process->parsec_context->remote_dep_fw_mask_sizeof); + struct remote_dep_output_param_s* output = &remote_deps->output[0]; + boffset = remote_deps->root / (8 * sizeof(uint32_t)); + dep_fw_mask[boffset] |= ((uint32_t)1) << (remote_deps->root % (8 * sizeof(uint32_t))); + my_idx = (remote_deps->root == es->virtual_process->parsec_context->my_rank) ? 0 : -1; + idx = 0; + for(array_index = count = 0; count < remote_deps->output[0].count_bits; array_index++) { + current_mask = output->rank_bits[array_index]; + if( 0 == current_mask ) continue; + for( bit_index = 0; current_mask != 0; bit_index++ ) { + if( !(current_mask & (1 << bit_index)) ) continue; + int rank = (array_index * sizeof(uint32_t) * 8) + bit_index; + current_mask ^= (1 << bit_index); + count++; + + boffset = rank / (8 * sizeof(uint32_t)); + idx++; + if(my_idx == -1) { + if(rank == es->virtual_process->parsec_context->my_rank) { + my_idx = idx; + } + boffset = rank / (8 * sizeof(uint32_t)); + dep_fw_mask[boffset] |= ((uint32_t)1) << (rank % (8 * sizeof(uint32_t))); + continue; + } + if(my_idx != -1){ + if(idx == my_idx+1) + { + return rank; + } + } + } + } + return -1; +} + +void +parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, + const parsec_task_t *this_task, + uint32_t action_mask, + parsec_ontask_function_t *ontask, + void *ontask_arg) +{ + parsec_dtd_task_t *current_task = (parsec_dtd_task_t *)this_task; + int current_dep; + parsec_dtd_task_t *current_desc = NULL; + int op_type_on_current_flow, desc_op_type, desc_flow_index; + parsec_dtd_tile_t *tile; + + parsec_dep_t deps; + parsec_release_dep_fct_arg_t *arg = (parsec_release_dep_fct_arg_t *)ontask_arg; + parsec_dep_data_description_t data; + int rank_src = 0, rank_dst = 0, vpid_dst=0; + parsec_dtd_flow_info_t* flow; + + /* finding for which flow we need to iterate successors of */ + int flow_mask 
= action_mask; + int my_rank = current_task->super.taskpool->context->my_rank; + int successor = -1; + + rank_src = current_task->rank; + + int rc; /* retrive the mca number for comm_coll_bcast */ + int comm_coll_bcast; /* retrive the value set for comm_coll_bcast */ + if (0 < (rc = parsec_mca_param_find("runtime", NULL, "comm_coll_bcast")) ) { + parsec_mca_param_lookup_int(rc, &comm_coll_bcast); + } + for( current_dep = 0; current_dep < current_task->super.task_class->nb_flows; current_dep++ ) { + if( (flow_mask & (1<deps_out); + int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); + current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) |(current_task->deps_out->root << 20) | *(data_ptr+1+successor)); + tile = FLOW_OF(current_task, current_dep)->tile; + parsec_dtd_tile_retain(tile); + parsec_remote_dep_activate( + es, (parsec_task_t *)current_task, + current_task->deps_out, + current_task->deps_out->outgoing_mask); + current_task->deps_out = NULL; + /* decrease the count as in the data flush */ + parsec_dtd_release_local_task( current_task ); + + } else if (action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { + /* a node in the key array propagation */ + int root = current_task->deps_out->root; + int my_rank = current_task->super.taskpool->context->my_rank; + + int _array_pos, _array_mask; + struct remote_dep_output_param_s* output; + output = ¤t_task->deps_out->output[0]; + _array_pos = my_rank / (8 * sizeof(uint32_t)); + _array_mask = 1 << (my_rank % (8 * sizeof(uint32_t))); + + if ((output->rank_bits[_array_pos] & _array_mask)) { + /* We are part of the broadcast, forward message */ + int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); + populate_remote_deps(data_ptr, current_task->deps_out); + successor = get_chain_successor(es, current_task, current_task->deps_out); + if(successor == -1) { + current_task->deps_out->outgoing_mask = 0; + } + current_task->super.locals[0].value 
= current_task->ht_item.key = ((1<<29) | (root << 20) | *(data_ptr+1+successor)); + assert(NULL != current_task->super.data[current_dep].data_out); + + current_task->deps_out->output[0].data.data = current_task->super.data[0].data_out; + parsec_dtd_retain_data_copy(current_task->super.data[current_dep].data_out); + parsec_remote_dep_activate( + es, (parsec_task_t *)current_task, + current_task->deps_out, + current_task->deps_out->outgoing_mask); + current_task->deps_out = NULL; + /* update the BCAST DATA task or dep with the global ID that we know now */ + uint64_t key = ((uint64_t)(1<<28 | (root << 20 ) | data_ptr[es->virtual_process->parsec_context->my_rank+1])<<32) | (1U<<0); + uint64_t key2 = ((uint64_t)(data_ptr[0])<<32) | (1U<<0); + struct timespec rqtp; + uint64_t misses_in_a_row; + rqtp.tv_sec = 0; + misses_in_a_row = 2; + parsec_dtd_task_t* dtd_task = NULL; + parsec_dtd_taskpool_t *tp = (parsec_dtd_taskpool_t *)current_task->super.taskpool; + parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key); + dtd_task = parsec_dtd_find_task(tp, key); + if(dtd_task == NULL) { + int* buffer = malloc(sizeof(int)*30*30); + memcpy(buffer, data_ptr, sizeof(int)*30*30); + dtd_hash_table_pointer_item_t *item = (dtd_hash_table_pointer_item_t *)parsec_thread_mempool_allocate(tp->hash_table_bucket_mempool->thread_mempools); + parsec_hash_table_t *hash_table = tp->keys_hash_table; + item->ht_item.key = (parsec_key_t)key; + item->mempool_owner = tp->hash_table_bucket_mempool->thread_mempools; + item->value = (void *)buffer; + parsec_hash_table_nolock_insert( hash_table, &item->ht_item ); + } else { + parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key2); + parsec_remote_deps_t *dep = parsec_dtd_find_task(tp, key2); + + populate_remote_deps(data_ptr, dtd_task->deps_out); + parsec_dtd_untrack_task(tp, key); + if(dep == NULL){ + dtd_task->super.locals[0].value = data_ptr[0]; + parsec_dtd_track_task(tp, key2, dtd_task); + }else{ + + 
dtd_task->super.locals[0].value = data_ptr[0]; + parsec_dtd_untrack_remote_dep(tp, key2); + parsec_dtd_track_task(tp, key2, dtd_task); + remote_dep_dequeue_delayed_dep_release(dep); + } + parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key2); + } + parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key); + tile = FLOW_OF(current_task, current_dep)->tile; + parsec_dtd_tile_retain(tile); + } + } else { + /* on the receiver side, get datatype to aquire datatype, arena etc info */ + data.data = current_task->super.data[current_dep].data_out; + data.arena = parsec_dtd_arenas_datatypes[FLOW_OF(current_task, current_dep)->arena_index].arena; + data.layout = parsec_dtd_arenas_datatypes[FLOW_OF(current_task, current_dep)->arena_index].opaque_dtt; + data.count = 1; + data.displ = 0; + deps.cond = NULL; + deps.ctl_gather_nb = NULL; + deps.flow = current_task->super.task_class->out[current_dep]; + deps.dep_index = desc_flow_index; + deps.belongs_to = current_task->super.task_class->out[current_dep]; + deps.direct_data = NULL; + deps.dep_datatype_index = current_dep; + ontask( es, (parsec_task_t *)current_task, (parsec_task_t *)current_task, + &deps, &data, current_task->rank, my_rank, vpid_dst, ontask_arg ); + } + } + } +} + + +#endif From 3d8e0301016818cdc73e06244a8c51fd30980970 Mon Sep 17 00:00:00 2001 From: yu-pei Date: Tue, 23 Nov 2021 16:05:37 -0500 Subject: [PATCH 24/41] move the bcast key/data task body to dtd_broadcast file --- .../interfaces/superscalar/insert_function.c | 41 +------------- .../superscalar/insert_function_internal.h | 3 +- .../superscalar/parsec_dtd_broadcast.c | 55 +++++++++++++------ 3 files changed, 42 insertions(+), 57 deletions(-) diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index b1bb0c565..eb3800a9d 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -1464,7 +1464,6 @@ 
dtd_release_dep_fct( parsec_execution_stream_t *es, /* On the sender side, update the key of the dep flow */ parsec_dtd_task_t * real_parent_task = (parsec_dtd_task_t *)oldcontext; - parsec_dtd_task_t * real_child_task = (parsec_dtd_task_t *)newcontext; #if !defined(PARSEC_DIST_COLLECTIVES) assert(src_rank == es->virtual_process->parsec_context->my_rank); @@ -1934,7 +1933,7 @@ static int bcast_key_datatype_lookup_of_dtd_task(parsec_execution_stream_t *es, const parsec_task_t *this_task, uint32_t *flow_mask, parsec_dep_data_description_t *data) { - (void)es; + (void)es;(void)this_task; data->count = 1; data->displ = 0; data->arena = NULL; @@ -2307,8 +2306,6 @@ parsec_dtd_set_descendant(parsec_dtd_task_t *parent_task, uint8_t parent_flow_in parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key); parsec_remote_deps_t *dep = parsec_dtd_find_remote_dep( tp, key ); if(real_parent_task->super.task_class->task_class_id == PARSEC_DTD_BCAST_DATA_TC_ID) { - //sleep(1); - //uint64_t key = ((this_task->super.locals[0].value)<<32) | (1U<<0); parsec_hash_table_t *hash_table = tp->keys_hash_table; dtd_hash_table_pointer_item_t *item = (dtd_hash_table_pointer_item_t *)parsec_hash_table_nolock_find( hash_table, (parsec_key_t)key ); if(item) { @@ -2526,42 +2523,6 @@ fake_first_out_body( parsec_execution_stream_t *es, parsec_task_t *this_task) return PARSEC_HOOK_RETURN_DONE; } -/* **************************************************************************** */ -/** - * Body of bcast key task we insert that will propagate the key array - * empty body! 
- * - * @param context, this_task - * - * @ingroup DTD_INTERFACE_INTERNAL - */ -int -parsec_dtd_bcast_key_fn( parsec_execution_stream_t *es, parsec_task_t *this_task) -{ - (void)es; (void)this_task; - - //fprintf(stderr, "bcast_key_fn executed\n"); - return PARSEC_HOOK_RETURN_DONE; -} - -/* **************************************************************************** */ -/** - * Body of bcast task we insert that will propagate the data tile we are broadcasting - * empty body! - * - * @param context, this_task - * - * @ingroup DTD_INTERFACE_INTERNAL - */ -int -parsec_dtd_bcast_data_fn( parsec_execution_stream_t *es, parsec_task_t *this_task) -{ - (void)es; (void)this_task; - - //fprintf(stderr, "bcast_data_fn %p executed on rank %d\n", this_task, es->virtual_process->parsec_context->my_rank); - return PARSEC_HOOK_RETURN_DONE; -} - int parsec_dtd_schedule_task_if_ready(int satisfied_flow, parsec_dtd_task_t *this_task, parsec_dtd_taskpool_t *dtd_tp, int *vpid) diff --git a/parsec/interfaces/superscalar/insert_function_internal.h b/parsec/interfaces/superscalar/insert_function_internal.h index b7a38029e..c2fe2be09 100644 --- a/parsec/interfaces/superscalar/insert_function_internal.h +++ b/parsec/interfaces/superscalar/insert_function_internal.h @@ -538,7 +538,8 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, parsec_ontask_function_t *ontask, void *ontask_arg); - +void +populate_remote_deps(int* data_ptr, parsec_remote_deps_t* remote_deps); END_C_DECLS diff --git a/parsec/interfaces/superscalar/parsec_dtd_broadcast.c b/parsec/interfaces/superscalar/parsec_dtd_broadcast.c index c7f41a8ce..8d1ba7bea 100644 --- a/parsec/interfaces/superscalar/parsec_dtd_broadcast.c +++ b/parsec/interfaces/superscalar/parsec_dtd_broadcast.c @@ -4,12 +4,12 @@ * reserved. 
*/ -#include "parsec/class/lifo.h" #include "parsec/parsec_config.h" +#include "parsec/utils/mca_param.h" #include "parsec/interfaces/superscalar/insert_function_internal.h" #ifdef PARSEC_DTD_DIST_COLLECTIVES -int +void populate_remote_deps(int* data_ptr, parsec_remote_deps_t* remote_deps) { struct remote_dep_output_param_s* output = &remote_deps->output[0]; @@ -33,6 +33,7 @@ populate_remote_deps(int* data_ptr, parsec_remote_deps_t* remote_deps) int get_chain_successor(parsec_execution_stream_t*es, parsec_task_t* task, parsec_remote_deps_t* remote_deps) { + (void)task; int my_idx, idx, current_mask; unsigned int array_index, count, bit_index; uint32_t boffset; @@ -83,23 +84,17 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, { parsec_dtd_task_t *current_task = (parsec_dtd_task_t *)this_task; int current_dep; - parsec_dtd_task_t *current_desc = NULL; - int op_type_on_current_flow, desc_op_type, desc_flow_index; parsec_dtd_tile_t *tile; parsec_dep_t deps; - parsec_release_dep_fct_arg_t *arg = (parsec_release_dep_fct_arg_t *)ontask_arg; parsec_dep_data_description_t data; - int rank_src = 0, rank_dst = 0, vpid_dst=0; - parsec_dtd_flow_info_t* flow; + int vpid_dst=0; /* finding for which flow we need to iterate successors of */ int flow_mask = action_mask; int my_rank = current_task->super.taskpool->context->my_rank; int successor = -1; - rank_src = current_task->rank; - int rc; /* retrive the mca number for comm_coll_bcast */ int comm_coll_bcast; /* retrive the value set for comm_coll_bcast */ if (0 < (rc = parsec_mca_param_find("runtime", NULL, "comm_coll_bcast")) ) { @@ -109,7 +104,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, if( (flow_mask & (1<deps_out); + successor = get_chain_successor(es, (parsec_task_t*)current_task, current_task->deps_out); int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) 
|(current_task->deps_out->root << 20) | *(data_ptr+1+successor)); tile = FLOW_OF(current_task, current_dep)->tile; @@ -137,7 +132,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, /* We are part of the broadcast, forward message */ int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); populate_remote_deps(data_ptr, current_task->deps_out); - successor = get_chain_successor(es, current_task, current_task->deps_out); + successor = get_chain_successor(es, (parsec_task_t*)current_task, current_task->deps_out); if(successor == -1) { current_task->deps_out->outgoing_mask = 0; } @@ -154,10 +149,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, /* update the BCAST DATA task or dep with the global ID that we know now */ uint64_t key = ((uint64_t)(1<<28 | (root << 20 ) | data_ptr[es->virtual_process->parsec_context->my_rank+1])<<32) | (1U<<0); uint64_t key2 = ((uint64_t)(data_ptr[0])<<32) | (1U<<0); - struct timespec rqtp; - uint64_t misses_in_a_row; - rqtp.tv_sec = 0; - misses_in_a_row = 2; + parsec_dtd_task_t* dtd_task = NULL; parsec_dtd_taskpool_t *tp = (parsec_dtd_taskpool_t *)current_task->super.taskpool; parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key); @@ -203,7 +195,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, deps.cond = NULL; deps.ctl_gather_nb = NULL; deps.flow = current_task->super.task_class->out[current_dep]; - deps.dep_index = desc_flow_index; + deps.dep_index = 0; deps.belongs_to = current_task->super.task_class->out[current_dep]; deps.direct_data = NULL; deps.dep_datatype_index = current_dep; @@ -214,5 +206,36 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, } } +/* **************************************************************************** */ +/** + * Body of bcast key task we insert that will propagate the key array + * empty body! 
+ * + * @param context, this_task + * + * @ingroup DTD_INTERFACE_INTERNAL + */ +int +parsec_dtd_bcast_key_fn( parsec_execution_stream_t *es, parsec_task_t *this_task) +{ + (void)es; (void)this_task; + return PARSEC_HOOK_RETURN_DONE; +} + +/* **************************************************************************** */ +/** + * Body of bcast task we insert that will propagate the data tile we are broadcasting + * empty body! + * + * @param context, this_task + * + * @ingroup DTD_INTERFACE_INTERNAL + */ +int +parsec_dtd_bcast_data_fn( parsec_execution_stream_t *es, parsec_task_t *this_task) +{ + (void)es; (void)this_task; + return PARSEC_HOOK_RETURN_DONE; +} #endif From d3625ddb66129b73aae11715f8a96f9722484a61 Mon Sep 17 00:00:00 2001 From: yu-pei Date: Mon, 6 Dec 2021 12:51:49 -0500 Subject: [PATCH 25/41] move the bcast keys tile to be DTD internal --- parsec/interfaces/superscalar/collectives.c | 29 +++++++++-- .../interfaces/superscalar/insert_function.c | 24 ++++++++- .../interfaces/superscalar/insert_function.h | 6 ++- .../superscalar/parsec_dtd_broadcast.c | 1 + .../dtd_test_broadcast_collective.c | 8 +-- .../superscalar/testing_zpotrf_dtd.c | 52 ++++++++++--------- 6 files changed, 86 insertions(+), 34 deletions(-) diff --git a/parsec/interfaces/superscalar/collectives.c b/parsec/interfaces/superscalar/collectives.c index 829f3fce8..5002aa71e 100644 --- a/parsec/interfaces/superscalar/collectives.c +++ b/parsec/interfaces/superscalar/collectives.c @@ -5,8 +5,10 @@ */ #include "parsec/class/lifo.h" +#include "parsec/parsec_internal.h" #include "parsec/parsec_config.h" #include "parsec/interfaces/superscalar/insert_function_internal.h" +#include "parsec/interfaces/superscalar/insert_function.h" #ifdef PARSEC_DTD_DIST_COLLECTIVES @@ -83,21 +85,41 @@ parsec_remote_deps_t* parsec_dtd_create_remote_deps( * set in the `dest_ranks` array. 
**/ void parsec_dtd_broadcast( - parsec_taskpool_t *taskpool, int myrank, int root, + parsec_taskpool_t *taskpool, int root, parsec_dtd_tile_t* dtd_tile_root, int arena_index, - parsec_dtd_tile_t* bcast_keys_root, int bcast_arena_index, + //parsec_dtd_tile_t* bcast_keys_root, int bcast_arena_index, int* dest_ranks, int num_dest_ranks) { - + + parsec_dtd_tile_t* bcast_keys_root = NULL; + int bcast_arena_index = 15; + + parsec_data_copy_t *parsec_data_copy; int *data_ptr; int key; int bcast_id; + int myrank = taskpool->context->my_rank; parsec_dtd_taskpool_t *dtd_tp = (parsec_dtd_taskpool_t *)taskpool; + bcast_keys_root = (parsec_dtd_tile_t *) parsec_thread_mempool_allocate( parsec_bcast_keys_tile_mempool->thread_mempools ); + bcast_keys_root->dc = NULL; + bcast_keys_root->arena_index = -1; + bcast_keys_root->key = (uint64_t) bcast_id; + bcast_keys_root->rank = root; + bcast_keys_root->flushed = NOT_FLUSHED; + parsec_data_copy_t* new_data_copy = PARSEC_OBJ_NEW(parsec_data_copy_t); + + new_data_copy->coherency_state = PARSEC_DATA_COHERENCY_OWNED; + new_data_copy->device_private = malloc(sizeof(int)*2500); + bcast_keys_root->data_copy = new_data_copy; + bcast_keys_root->ht_item.key = (parsec_key_t)key; + parsec_hash_table_insert(parsec_bcast_keys_hash, &bcast_keys_root->ht_item); + if(myrank == root) { bcast_id = ( (1<<30) | (root << 18) | dtd_tp->bcast_id); dtd_tp->bcast_id++; + parsec_data_copy = bcast_keys_root->data_copy; data_ptr = (int*)parsec_data_copy_get_ptr(parsec_data_copy); data_ptr[0] = bcast_id; @@ -108,6 +130,7 @@ void parsec_dtd_broadcast( data_ptr[400+i+1] = dest_ranks[i]; } } + // Retrieve DTD tile's data_copy parsec_data_copy_t *data_copy = dtd_tile_root->data_copy; parsec_data_copy_t *key_copy = bcast_keys_root->data_copy; diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index eb3800a9d..277454290 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ 
b/parsec/interfaces/superscalar/insert_function.c @@ -65,6 +65,7 @@ int parsec_dtd_threshold_size = 4000; /**< Default threshold size of static int parsec_dtd_task_hash_table_size = 1<<16; /**< Default task hash table size */ static int parsec_dtd_tile_hash_table_size = 1<<16; /**< Default tile hash table size */ static int parsec_dtd_no_of_arenas_datatypes = 16; +static int parsec_dtd_bcast_tile_size = 50; int parsec_dtd_dump_traversal_info = 60; /**< Level for printing traversal info */ int parsec_dtd_dump_function_info = 50; /**< Level for printing function_structure info */ @@ -83,6 +84,8 @@ parsec_mempool_t *parsec_dtd_taskpool_mempool = NULL; /* Global mempool for all tiles */ parsec_mempool_t *parsec_dtd_tile_mempool = NULL; +parsec_hash_table_t* parsec_bcast_keys_hash; +parsec_mempool_t* parsec_bcast_keys_tile_mempool; /** * All the static functions should be declared before being defined. */ @@ -460,6 +463,23 @@ parsec_dtd_lazy_init(void) 1/* no. of threads*/ ); parsec_dtd_arenas_datatypes = (parsec_arena_datatype_t *) calloc(parsec_dtd_no_of_arenas_datatypes, sizeof(parsec_arena_datatype_t)); + + parsec_bcast_keys_hash = PARSEC_OBJ_NEW(parsec_hash_table_t); + int nb; + for(nb = 1; nb < 16 && (1 << nb) < parsec_dtd_tile_hash_table_size; nb++) /* nothing */; + parsec_hash_table_init( parsec_bcast_keys_hash, + offsetof(parsec_dtd_tile_t, ht_item), + nb, + DTD_key_fns, + parsec_bcast_keys_hash); + parsec_bcast_keys_tile_mempool = (parsec_mempool_t*) malloc (sizeof(parsec_mempool_t)); + parsec_mempool_construct( parsec_bcast_keys_tile_mempool, + PARSEC_OBJ_CLASS(parsec_dtd_tile_t), sizeof(parsec_dtd_tile_t), + offsetof(parsec_dtd_tile_t, mempool_owner), + 1/* no. 
of threads*/ ); + parsec_matrix_add2arena_rect(&parsec_dtd_arenas_datatypes[15], + parsec_datatype_int32_t, + parsec_dtd_bcast_tile_size, parsec_dtd_bcast_tile_size, parsec_dtd_bcast_tile_size); } /* **************************************************************************** */ @@ -961,6 +981,8 @@ parsec_dtd_tile_insert( uint64_t key, void parsec_dtd_tile_remove( parsec_data_collection_t *dc, uint64_t key ) { + if(dc == NULL) + return; parsec_hash_table_t *hash_table = (parsec_hash_table_t *)dc->tile_h_table; parsec_hash_table_remove( hash_table, (parsec_key_t)key ); @@ -1065,7 +1087,7 @@ parsec_dtd_data_collection_init( parsec_data_collection_t *dc ) void parsec_dtd_data_collection_fini( parsec_data_collection_t *dc ) { - parsec_hash_table_fini(dc->tile_h_table); + //parsec_hash_table_fini(dc->tile_h_table); PARSEC_OBJ_RELEASE(dc->tile_h_table); parsec_dc_unregister_id(dc->dc_id); } diff --git a/parsec/interfaces/superscalar/insert_function.h b/parsec/interfaces/superscalar/insert_function.h index cfbe055ac..9cc0798f3 100644 --- a/parsec/interfaces/superscalar/insert_function.h +++ b/parsec/interfaces/superscalar/insert_function.h @@ -15,6 +15,7 @@ #define PARSEC_INSERT_FUNCTION_H_HAS_BEEN_INCLUDED #include "parsec/runtime.h" +#include "parsec/parsec_internal.h" #include "parsec/data_distribution.h" BEGIN_C_DECLS @@ -101,6 +102,8 @@ extern parsec_arena_datatype_t *parsec_dtd_arenas_datatypes; extern int parsec_dtd_window_size; extern int parsec_dtd_threshold_size; +extern parsec_hash_table_t* parsec_bcast_keys_hash; +extern parsec_mempool_t* parsec_bcast_keys_tile_mempool; typedef struct parsec_dtd_tile_s parsec_dtd_tile_t; typedef struct parsec_dtd_task_s parsec_dtd_task_t; @@ -345,9 +348,8 @@ parsec_remote_deps_t* parsec_dtd_create_remote_deps( **/ void parsec_dtd_broadcast( - parsec_taskpool_t *taskpool, int myrank, int root, + parsec_taskpool_t *taskpool, int root, parsec_dtd_tile_t* dtd_tile_root, int arena_index, - parsec_dtd_tile_t* bcast_keys_root, int 
bcast_arena_index, int* dest_ranks, int num_dest_ranks); #endif diff --git a/parsec/interfaces/superscalar/parsec_dtd_broadcast.c b/parsec/interfaces/superscalar/parsec_dtd_broadcast.c index 8d1ba7bea..23fc16b34 100644 --- a/parsec/interfaces/superscalar/parsec_dtd_broadcast.c +++ b/parsec/interfaces/superscalar/parsec_dtd_broadcast.c @@ -9,6 +9,7 @@ #include "parsec/interfaces/superscalar/insert_function_internal.h" #ifdef PARSEC_DTD_DIST_COLLECTIVES + void populate_remote_deps(int* data_ptr, parsec_remote_deps_t* remote_deps) { diff --git a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c index 67547e7d6..250f0190d 100644 --- a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c +++ b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c @@ -261,9 +261,9 @@ int test_broadcast_mixed( if(myrank % 2 == 1 || myrank == root) { fprintf(stderr, "perform bcast from rank %d\n", myrank); parsec_dtd_broadcast( - dtd_tp, myrank, root, + dtd_tp, root, dtd_tile_root, TILE_FULL, - bcast_keys_root, TILE_BCAST, + //bcast_keys_root, TILE_BCAST, dest_ranks, num_dest_ranks); } @@ -326,9 +326,9 @@ for(int iter=1; iter <= 0; iter++) { // Perform Broadcast AGAIN // parsec_dtd_broadcast( - dtd_tp, myrank, root, + dtd_tp, root, dtd_tile_root, TILE_FULL, - bcast_keys_root, TILE_BCAST, + //bcast_keys_root, TILE_BCAST, dest_ranks, num_dest_ranks); // diff --git a/tests/interfaces/superscalar/testing_zpotrf_dtd.c b/tests/interfaces/superscalar/testing_zpotrf_dtd.c index 6e196dc44..62430752f 100644 --- a/tests/interfaces/superscalar/testing_zpotrf_dtd.c +++ b/tests/interfaces/superscalar/testing_zpotrf_dtd.c @@ -72,7 +72,8 @@ parsec_core_herk(parsec_execution_stream_t *es, parsec_task_t *this_task) parsec_dtd_unpack_args(this_task, &uplo, &trans, &m, &n, &alpha, &A, &lda, &beta, &C, &ldc); - //fprintf(stderr, "core_herk executed\n"); + int rank = this_task->taskpool->context->my_rank; + 
//fprintf(stderr, "core_herk executed on rank %d\n", rank); CORE_zherk(uplo, trans, m, n, alpha, A, lda, @@ -114,7 +115,6 @@ int main(int argc, char **argv) int info = 0; int ret = 0; - //sleep(30); int m, n, k, total; /* loop counter */ /* Parameters passed on to Insert_task() */ @@ -136,23 +136,24 @@ int main(int argc, char **argv) LDB = dplasma_imax( LDB, N ); KP = 1; KQ = 1; + //sleep(30); PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, sym_two_dim_block_cyclic, (&dcA, matrix_ComplexDouble, rank, MB, NB, LDA, N, 0, 0, N, N, P, nodes/P, uplo)); - int bsize = 30; - PASTE_CODE_ALLOCATE_MATRIX(dcB, 1, - sym_two_dim_block_cyclic, (&dcB, matrix_Integer, - rank, bsize, bsize, bsize*N/NB, bsize*N/NB, 0, 0, - bsize*N/NB, bsize*N/NB, P, nodes/P, uplo)); + //int bsize = 30; + //PASTE_CODE_ALLOCATE_MATRIX(dcB, 1, + // sym_two_dim_block_cyclic, (&dcB, matrix_Integer, + // rank, bsize, bsize, bsize*N/NB, bsize*N/NB, 0, 0, + // bsize*N/NB, bsize*N/NB, P, nodes/P, uplo)); /* Initializing dc for dtd */ sym_two_dim_block_cyclic_t *__dcA = &dcA; parsec_dtd_data_collection_init((parsec_data_collection_t *)&dcA); - sym_two_dim_block_cyclic_t *__dcB = &dcB; - parsec_dtd_data_collection_init((parsec_data_collection_t *)&dcB); + //sym_two_dim_block_cyclic_t *__dcB = &dcB; + //parsec_dtd_data_collection_init((parsec_data_collection_t *)&dcB); /* matrix generation */ if(loud > 3) printf("+++ Generate matrices ... 
"); @@ -169,10 +170,10 @@ int main(int argc, char **argv) PARSEC_ARENA_ALIGNMENT_SSE, parsec_datatype_double_complex_t, dcA.super.mb ); - dplasma_add2arena_tile( &parsec_dtd_arenas_datatypes[TILE_BCAST], - dcB.super.mb*dcB.super.nb*sizeof(int), - PARSEC_ARENA_ALIGNMENT_SSE, - parsec_datatype_int32_t, dcB.super.mb ); + //dplasma_add2arena_tile( &parsec_dtd_arenas_datatypes[TILE_BCAST], + // dcB.super.mb*dcB.super.nb*sizeof(int), + // PARSEC_ARENA_ALIGNMENT_SSE, + // parsec_datatype_int32_t, dcB.super.mb ); /* Registering the handle with parsec context */ parsec_context_add_taskpool( parsec, dtd_tp ); @@ -310,7 +311,8 @@ int main(int argc, char **argv) //int *dest_ranks = (int*)malloc(num_dest_ranks*sizeof(int)); dest_rank_idx = 0; flag = 0; - for(int m = k+1; m < total; m++) { + /* Should only be done in root, others will pass NULL since they know nothing */ + for(int m = k+1; m < total; m++) { int tile_rank = parsec_dtd_rank_of_data(&dcA.super.super, k, m); if(tile_rank == root) {flag = 1; continue;} dest_ranks[dest_rank_idx] = tile_rank; @@ -321,12 +323,12 @@ int main(int argc, char **argv) if( ( flag || (rank == root) ) && ( dest_rank_idx >= 1) ) { parsec_dtd_tile_t* dtd_tile_root = PARSEC_DTD_TILE_OF(A, k, k); - parsec_dtd_tile_t* dtd_key_root = PARSEC_DTD_TILE_OF(B, k, k); + //parsec_dtd_tile_t* dtd_key_root = PARSEC_DTD_TILE_OF(B, k, k); //fprintf(stderr, "Broadcasting PO tile to TRSM. 
k %d, rank %d, root %d\n", k, rank, root); parsec_dtd_broadcast( - dtd_tp, rank, root, + dtd_tp, root, dtd_tile_root, TILE_FULL, - dtd_key_root, TILE_BCAST, + // dtd_key_root, TILE_BCAST, dest_ranks, dest_rank_idx); } @@ -378,12 +380,12 @@ int main(int argc, char **argv) if( ( flag || (rank == root) ) && ( dest_rank_idx >= 1) ) { parsec_dtd_tile_t* dtd_tile_root = PARSEC_DTD_TILE_OF(A, k, m); - parsec_dtd_tile_t* dtd_key_root = PARSEC_DTD_TILE_OF(B, k, m); + //parsec_dtd_tile_t* dtd_key_root = PARSEC_DTD_TILE_OF(B, k, m); //fprintf(stderr, "Broadcasting TRSM tile to SYRK and GEMM. k %d, m %d, rank %d, root %d\n", k, m, rank, root); parsec_dtd_broadcast( - dtd_tp, rank, root, + dtd_tp, root, dtd_tile_root, TILE_FULL, - dtd_key_root, TILE_BCAST, + //dtd_key_root, TILE_BCAST, dest_ranks, dest_rank_idx); } } @@ -394,6 +396,7 @@ int main(int argc, char **argv) ldam = BLKLDD(&dcA.super, m); //if(parsec_dtd_rank_of_data(&dcA.super.super, m, m) == rank || parsec_dtd_rank_of_data(&dcA.super.super, k, m) == rank ) { if(parsec_dtd_rank_of_data(&dcA.super.super, m, m) == rank ) { + //fprintf(stderr, "Inserting syrk[%d %d][%d %d] in rank: %d owned: %d\n", m, m, k, m, rank, parsec_dtd_rank_of_data(&dcA.super.super, m, m)); parsec_dtd_taskpool_insert_task( dtd_tp, parsec_core_herk, (total - m) * (total - m) * (total - m) + 3 * (m - k)/*priority*/, "Herk", sizeof(int), &uplo, PARSEC_VALUE, @@ -412,6 +415,7 @@ int main(int argc, char **argv) ldan = BLKLDD(&dcA.super, n); //if(parsec_dtd_rank_of_data(&dcA.super.super, m, n) == rank || parsec_dtd_rank_of_data(&dcA.super.super, k, m) == rank || parsec_dtd_rank_of_data(&dcA.super.super, k, n) == rank) { if(parsec_dtd_rank_of_data(&dcA.super.super, m, n) == rank ) { + //fprintf(stderr, "Inserting GEMM[%d %d][%d %d] in rank: %d owned: %d\n", k, n, k, m, rank, parsec_dtd_rank_of_data(&dcA.super.super, m, n)); parsec_dtd_taskpool_insert_task( dtd_tp, parsec_core_gemm, (total - m) * (total - m) * (total - m) + 3 * ((2 * total) - m - n - 3) 
* (m - n) + 6 * (m - k) /*priority*/, "Gemm", sizeof(int), &transA_g, PARSEC_VALUE, @@ -503,14 +507,14 @@ int main(int argc, char **argv) /* Cleaning data arrays we allocated for communication */ dplasma_matrix_del2arena( &parsec_dtd_arenas_datatypes[TILE_FULL] ); - dplasma_matrix_del2arena( &parsec_dtd_arenas_datatypes[TILE_BCAST] ); + //dplasma_matrix_del2arena( &parsec_dtd_arenas_datatypes[TILE_BCAST] ); parsec_dtd_data_collection_fini( (parsec_data_collection_t *)&dcA ); - parsec_dtd_data_collection_fini( (parsec_data_collection_t *)&dcB ); + //parsec_dtd_data_collection_fini( (parsec_data_collection_t *)&dcB ); parsec_data_free(dcA.mat); dcA.mat = NULL; - parsec_data_free(dcB.mat); dcB.mat = NULL; + //parsec_data_free(dcB.mat); dcB.mat = NULL; parsec_tiled_matrix_dc_destroy( (parsec_tiled_matrix_dc_t*)&dcA); - parsec_tiled_matrix_dc_destroy( (parsec_tiled_matrix_dc_t*)&dcB); + //parsec_tiled_matrix_dc_destroy( (parsec_tiled_matrix_dc_t*)&dcB); cleanup_parsec(parsec, iparam); return ret; From 464c721651f97ac0250062169f32ced65b399734 Mon Sep 17 00:00:00 2001 From: yu-pei Date: Mon, 24 Jan 2022 14:51:45 -0500 Subject: [PATCH 26/41] switch to reuse PTG implementation --- parsec/interfaces/superscalar/collectives.c | 18 ++- .../interfaces/superscalar/insert_function.c | 20 +-- .../superscalar/insert_function_internal.h | 2 +- .../superscalar/overlap_strategies.c | 36 ++--- .../superscalar/parsec_dtd_broadcast.c | 140 ++++++++++-------- parsec/remote_dep.c | 19 ++- parsec/remote_dep_mpi.c | 24 +-- .../dtd_test_broadcast_collective.c | 14 +- 8 files changed, 156 insertions(+), 117 deletions(-) diff --git a/parsec/interfaces/superscalar/collectives.c b/parsec/interfaces/superscalar/collectives.c index 5002aa71e..231f36869 100644 --- a/parsec/interfaces/superscalar/collectives.c +++ b/parsec/interfaces/superscalar/collectives.c @@ -102,6 +102,7 @@ void parsec_dtd_broadcast( parsec_dtd_taskpool_t *dtd_tp = (parsec_dtd_taskpool_t *)taskpool; bcast_keys_root = 
(parsec_dtd_tile_t *) parsec_thread_mempool_allocate( parsec_bcast_keys_tile_mempool->thread_mempools ); + SET_LAST_ACCESSOR(bcast_keys_root); bcast_keys_root->dc = NULL; bcast_keys_root->arena_index = -1; bcast_keys_root->key = (uint64_t) bcast_id; @@ -132,17 +133,18 @@ void parsec_dtd_broadcast( } // Retrieve DTD tile's data_copy - parsec_data_copy_t *data_copy = dtd_tile_root->data_copy; - parsec_data_copy_t *key_copy = bcast_keys_root->data_copy; + //parsec_data_copy_t *data_copy = dtd_tile_root->data_copy; + //parsec_data_copy_t *key_copy = bcast_keys_root->data_copy; // Create remote deps corresponding to the braodcast + /* parsec_remote_deps_t *deps_0 = parsec_dtd_create_remote_deps( myrank, root, data_copy, &parsec_dtd_arenas_datatypes[arena_index], dest_ranks, num_dest_ranks); parsec_remote_deps_t *deps_1 = parsec_dtd_create_remote_deps( myrank, root, key_copy, &parsec_dtd_arenas_datatypes[bcast_arena_index], dest_ranks, num_dest_ranks); - + */ parsec_task_t *bcast_task_root = parsec_dtd_taskpool_create_task( taskpool, parsec_dtd_bcast_data_fn, 0, "bcast_data_fn", PASSED_BY_REF, dtd_tile_root, PARSEC_INOUT | arena_index, @@ -152,8 +154,8 @@ void parsec_dtd_broadcast( parsec_dtd_task_t *dtd_bcast_task_root = (parsec_dtd_task_t *)bcast_task_root; // Set broadcast topology info - deps_0->pending_ack = 0; - dtd_bcast_task_root->deps_out = deps_0; + //deps_0->pending_ack = 0; + //dtd_bcast_task_root->deps_out = deps_0; if(myrank == root) { dtd_bcast_task_root->ht_item.key = bcast_id; @@ -170,8 +172,8 @@ void parsec_dtd_broadcast( sizeof(int), &root, PARSEC_VALUE | PARSEC_AFFINITY, PARSEC_DTD_ARG_END); parsec_dtd_task_t *dtd_bcast_key_root = (parsec_dtd_task_t *)bcast_key_root; - deps_1->pending_ack = 0; - dtd_bcast_key_root->deps_out = deps_1; + //deps_1->pending_ack = 0; + //dtd_bcast_key_root->deps_out = deps_1; if(myrank == root) { /* nothing here since the key is stored in the key array and will be updated before remote_dep_activate */ }else{ @@ -183,7 
+185,7 @@ void parsec_dtd_broadcast( parsec_insert_dtd_task(dtd_bcast_key_root); /* Post the bcast tasks for the actual data */ - parsec_insert_dtd_task(dtd_bcast_task_root); + //parsec_insert_dtd_task(dtd_bcast_task_root); } #endif diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index 277454290..6df9f7e84 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -1493,7 +1493,8 @@ dtd_release_dep_fct( parsec_execution_stream_t *es, _array_pos = dst_rank / (8 * sizeof(uint32_t)); _array_mask = 1 << (dst_rank % (8 * sizeof(uint32_t))); PARSEC_ALLOCATE_REMOTE_DEPS_IF_NULL(arg->remote_deps, oldcontext, MAX_PARAM_COUNT); - if(real_parent_task->deps_out == NULL) { + if(real_parent_task->super.task_class->task_class_id != PARSEC_DTD_BCAST_KEY_TC_ID && + real_parent_task->super.task_class->task_class_id != PARSEC_DTD_BCAST_DATA_TC_ID) { arg->remote_deps->bcast_keys[dep->dep_datatype_index] = 0; arg->remote_deps->bcast_keys[dep->dep_datatype_index] |= src_rank<<18; arg->remote_deps->bcast_keys[dep->dep_datatype_index] |= (FLOW_OF(real_parent_task, dep->belongs_to->flow_index))->msg_keys[dst_rank]; @@ -2313,7 +2314,8 @@ parsec_dtd_set_descendant(parsec_dtd_task_t *parent_task, uint8_t parent_flow_in } /* On the receiver side, based on the previous parent key, update next recv key for dep flow */ - if(real_parent_task->deps_out == NULL) { + if(real_parent_task->super.task_class->task_class_id != PARSEC_DTD_BCAST_KEY_TC_ID && + real_parent_task->super.task_class->task_class_id != PARSEC_DTD_BCAST_DATA_TC_ID) { if(real_parent_task->ht_item.key == 0xffffffff) { real_parent_task->ht_item.key = 0; real_parent_task->ht_item.key |= real_parent_task->rank<<18; @@ -2338,7 +2340,7 @@ parsec_dtd_set_descendant(parsec_dtd_task_t *parent_task, uint8_t parent_flow_in parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key); dep = parsec_dtd_find_task(tp, 
key); real_parent_task->super.locals[0].value = real_parent_task->ht_item.key = data_ptr[0]; - populate_remote_deps(data_ptr, real_parent_task->deps_out); + //populate_remote_deps(data_ptr, real_parent_task->deps_out); } //fprintf(stderr, "inserting bcast data task and finding in hashtable with key %llu %d, result %p dep %p\n", key, real_parent_task->super.locals[0].value, item, dep); } @@ -2425,8 +2427,6 @@ parsec_dtd_create_and_initialize_task( parsec_dtd_taskpool_t *dtd_tp, assert(this_task->super.super.super.obj_reference_count == 1); this_task->orig_task = NULL; - /* DTD Collective */ - this_task->deps_out = NULL; this_task->super.taskpool = (parsec_taskpool_t*)dtd_tp; /* this_task->ht_item.key = (parsec_key_t)(uintptr_t)(dtd_tp->task_id++); */ @@ -2734,7 +2734,8 @@ parsec_insert_dtd_task(parsec_task_t *__this_task) tile_op_type, last_user.alive); } - if(last_writer.task->deps_out == NULL) { + if(last_writer.task->super.task_class->task_class_id != PARSEC_DTD_BCAST_KEY_TC_ID && + last_writer.task->super.task_class->task_class_id != PARSEC_DTD_BCAST_DATA_TC_ID) { /* local parent and we are inserting a remote task, indicates it needs to send data */ if(parsec_dtd_task_is_local(last_writer.task) && parsec_dtd_task_is_remote(this_task)) { @@ -2832,7 +2833,8 @@ parsec_insert_dtd_task(parsec_task_t *__this_task) } - if(last_writer.task->deps_out == NULL) { + if(last_writer.task->super.task_class->task_class_id != PARSEC_DTD_BCAST_KEY_TC_ID && + last_writer.task->super.task_class->task_class_id != PARSEC_DTD_BCAST_DATA_TC_ID) { /* local parent and we are inserting a remote task, indicates it needs to send data */ if(parsec_dtd_task_is_local(last_writer.task) && parsec_dtd_task_is_remote(this_task)) { @@ -2928,7 +2930,7 @@ parsec_insert_dtd_task(parsec_task_t *__this_task) /* Releasing every remote_task */ if( parsec_dtd_task_is_remote( this_task ) ) { - parsec_dtd_remote_task_release( this_task ); + // parsec_dtd_remote_task_release( this_task ); } /* Increase the 
count of satisfied flows to counter-balance the increase in the @@ -2940,11 +2942,11 @@ parsec_insert_dtd_task(parsec_task_t *__this_task) uint64_t key = ((uint64_t)(this_task->super.locals[0].value)<<32) | (1U<<0); parsec_hash_table_lock_bucket(dtd_tp->task_hash_table, (parsec_key_t)key); parsec_remote_deps_t *dep = parsec_dtd_find_remote_dep( dtd_tp, key ); + fprintf(stderr, "tracking remote task of key %d on rank %d\n", this_task->super.locals[0].value, dtd_tp->super.context->my_rank); if( NULL == dep ) { if( !(flow->flags & TASK_INSERTED) ) { flow->flags |= TASK_INSERTED; parsec_dtd_track_task( dtd_tp, key, this_task ); - //fprintf(stderr, "tracking remote task of key %d on rank %d\n", this_task->super.locals[0].value, dtd_tp->super.context->my_rank); } } else { if( !(flow->flags & TASK_INSERTED) ) { diff --git a/parsec/interfaces/superscalar/insert_function_internal.h b/parsec/interfaces/superscalar/insert_function_internal.h index c2fe2be09..fb0d3fd3e 100644 --- a/parsec/interfaces/superscalar/insert_function_internal.h +++ b/parsec/interfaces/superscalar/insert_function_internal.h @@ -40,6 +40,7 @@ extern int parsec_dtd_dump_traversal_info; /**< For printing traversal info */ #define PARSEC_DTD_FLUSH_TC_ID ((uint8_t)0x00) #define PARSEC_DTD_BCAST_KEY_TC_ID ((uint8_t)0x01) #define PARSEC_DTD_BCAST_DATA_TC_ID ((uint8_t)0x02) +#define PARSEC_DTD_BCAST_TC_ID = ((PARSEC_DTD_BCAST_KEY_TC_ID) | (PARSEC_DTD_BCAST_DATA_TC_ID)) /* To flag the task we are trying to complete as a local one */ #define PARSEC_ACTION_COMPLETE_LOCAL_TASK 0x08000000 @@ -184,7 +185,6 @@ struct parsec_dtd_task_s { int32_t rank_bits[MAX_RANK_INFO]; /* for testing PTG inserting task in DTD */ parsec_task_t *orig_task; - parsec_remote_deps_t *deps_out; }; /* For creating objects of class parsec_dtd_task_t */ diff --git a/parsec/interfaces/superscalar/overlap_strategies.c b/parsec/interfaces/superscalar/overlap_strategies.c index 993e2b730..69cb75713 100644 --- 
a/parsec/interfaces/superscalar/overlap_strategies.c +++ b/parsec/interfaces/superscalar/overlap_strategies.c @@ -172,26 +172,26 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, if(PARSEC_DTD_BCAST_DATA_TC_ID == current_task->super.task_class->task_class_id) { /* for the bcast data class, in addition to release the data to local deps tasks that will read the data * propagate the data down to descendants as well */ - if(current_task->deps_out != NULL) { + //if(current_task->deps_out != NULL) { /* we have not propagate the remote deps yet, otherwise will be set to NULL */ if(action_mask & PARSEC_ACTION_COMPLETE_LOCAL_TASK) { assert(NULL != current_task->super.data[current_dep].data_out); //fprintf(stderr, "bcast root task %p data with global key %d\n", current_task, current_task->ht_item.key); - current_task->deps_out->output[0].data.data = - current_task->super.data[current_dep].data_out; + //current_task->deps_out->output[0].data.data = + // current_task->super.data[current_dep].data_out; //(void)parsec_atomic_fetch_inc_int32(&current_task->super.data[current_dep].data_out->readers); - parsec_remote_dep_activate( - es, (parsec_task_t *)current_task, - current_task->deps_out, - current_task->deps_out->outgoing_mask); - current_task->deps_out = NULL; + //parsec_remote_dep_activate( + // es, (parsec_task_t *)current_task, + // current_task->deps_out, + // current_task->deps_out->outgoing_mask); + //current_task->deps_out = NULL; } else if(action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { /* current node is part of the broadcast operation, propagate downstream */ - int root = current_task->deps_out->root; + //int root = current_task->deps_out->root; int my_rank = current_task->super.taskpool->context->my_rank; int _array_pos, _array_mask; struct remote_dep_output_param_s* output; - output = &current_task->deps_out->output[0]; + //output = &current_task->deps_out->output[0]; _array_pos = my_rank / (8 * sizeof(uint32_t)); _array_mask = 1 << (my_rank % (8 * 
sizeof(uint32_t))); //fprintf(stderr, "bcast data continue on rank %d, from root %d, for task %p\n", my_rank, root, current_task); @@ -199,17 +199,17 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, if ((output->rank_bits[_array_pos] & _array_mask)) { assert(NULL != current_task->super.data[current_dep].data_out); - current_task->deps_out->output[0].data.data = - current_task->super.data[0].data_out; + //current_task->deps_out->output[0].data.data = + // current_task->super.data[0].data_out; //(void)parsec_atomic_fetch_inc_int32(&current_task->super.data[current_dep].data_out->readers); - parsec_remote_dep_activate( - es, (parsec_task_t *)current_task, - current_task->deps_out, - current_task->deps_out->outgoing_mask); - current_task->deps_out = NULL; + //parsec_remote_dep_activate( + // es, (parsec_task_t *)current_task, + // current_task->deps_out, + // current_task->deps_out->outgoing_mask); + //current_task->deps_out = NULL; } } - } + //} } /* BCAST DATA propagation */ if( FLOW_OF(current_task, current_dep)->op_type & PARSEC_DONT_TRACK ) { diff --git a/parsec/interfaces/superscalar/parsec_dtd_broadcast.c b/parsec/interfaces/superscalar/parsec_dtd_broadcast.c index 23fc16b34..1380b6848 100644 --- a/parsec/interfaces/superscalar/parsec_dtd_broadcast.c +++ b/parsec/interfaces/superscalar/parsec_dtd_broadcast.c @@ -105,87 +105,100 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, if( (flow_mask & (1<deps_out); + parsec_remote_deps_t *deps; + PARSEC_ALLOCATE_REMOTE_DEPS_IF_NULL(deps, this_task, MAX_PARAM_COUNT); + deps->root = my_rank; + deps->outgoing_mask |= (1 << 0); /* only 1 flow */ + deps->max_priority = 0; + + struct remote_dep_output_param_s* output = &deps->output[0]; + output->data.data = current_task->super.data[0].data_out;//NULL; + output->data.arena = parsec_dtd_arenas_datatypes[15].arena; + output->data.layout = parsec_dtd_arenas_datatypes[15].opaque_dtt; + output->data.count = 1; + output->data.displ = 0; + 
output->priority = 0; int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); - current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) |(current_task->deps_out->root << 20) | *(data_ptr+1+successor)); + //successor = get_chain_successor(es, (parsec_task_t*)current_task, deps); + //current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) |(my_rank << 20) | *(data_ptr+1+successor)); tile = FLOW_OF(current_task, current_dep)->tile; parsec_dtd_tile_retain(tile); + populate_remote_deps(data_ptr, deps); parsec_remote_dep_activate( es, (parsec_task_t *)current_task, - current_task->deps_out, - current_task->deps_out->outgoing_mask); - current_task->deps_out = NULL; + deps, + deps->outgoing_mask); + //current_task->deps_out = NULL; /* decrease the count as in the data flush */ parsec_dtd_release_local_task( current_task ); - } else if (action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { /* a node in the key array propagation */ - int root = current_task->deps_out->root; + parsec_release_dep_fct_arg_t* arg = (parsec_release_dep_fct_arg_t*)ontask_arg; + parsec_remote_deps_t* deps = arg->remote_deps; + int root = deps->root; int my_rank = current_task->super.taskpool->context->my_rank; int _array_pos, _array_mask; struct remote_dep_output_param_s* output; - output = &current_task->deps_out->output[0]; + output = &deps->output[0]; _array_pos = my_rank / (8 * sizeof(uint32_t)); _array_mask = 1 << (my_rank % (8 * sizeof(uint32_t))); - if ((output->rank_bits[_array_pos] & _array_mask)) { - /* We are part of the broadcast, forward message */ - int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); - populate_remote_deps(data_ptr, current_task->deps_out); - successor = get_chain_successor(es, (parsec_task_t*)current_task, current_task->deps_out); - if(successor == -1) { - current_task->deps_out->outgoing_mask = 0; - } - current_task->super.locals[0].value = current_task->ht_item.key = 
((1<<29) | (root << 20) | *(data_ptr+1+successor)); - assert(NULL != current_task->super.data[current_dep].data_out); - - current_task->deps_out->output[0].data.data = current_task->super.data[0].data_out; - parsec_dtd_retain_data_copy(current_task->super.data[current_dep].data_out); - parsec_remote_dep_activate( - es, (parsec_task_t *)current_task, - current_task->deps_out, - current_task->deps_out->outgoing_mask); - current_task->deps_out = NULL; - /* update the BCAST DATA task or dep with the global ID that we know now */ - uint64_t key = ((uint64_t)(1<<28 | (root << 20 ) | data_ptr[es->virtual_process->parsec_context->my_rank+1])<<32) | (1U<<0); - uint64_t key2 = ((uint64_t)(data_ptr[0])<<32) | (1U<<0); - - parsec_dtd_task_t* dtd_task = NULL; - parsec_dtd_taskpool_t *tp = (parsec_dtd_taskpool_t *)current_task->super.taskpool; - parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key); - dtd_task = parsec_dtd_find_task(tp, key); - if(dtd_task == NULL) { - int* buffer = malloc(sizeof(int)*30*30); - memcpy(buffer, data_ptr, sizeof(int)*30*30); - dtd_hash_table_pointer_item_t *item = (dtd_hash_table_pointer_item_t *)parsec_thread_mempool_allocate(tp->hash_table_bucket_mempool->thread_mempools); - parsec_hash_table_t *hash_table = tp->keys_hash_table; - item->ht_item.key = (parsec_key_t)key; - item->mempool_owner = tp->hash_table_bucket_mempool->thread_mempools; - item->value = (void *)buffer; - parsec_hash_table_nolock_insert( hash_table, &item->ht_item ); - } else { - parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key2); - parsec_remote_deps_t *dep = parsec_dtd_find_task(tp, key2); - - populate_remote_deps(data_ptr, dtd_task->deps_out); - parsec_dtd_untrack_task(tp, key); - if(dep == NULL){ - dtd_task->super.locals[0].value = data_ptr[0]; - parsec_dtd_track_task(tp, key2, dtd_task); - }else{ - - dtd_task->super.locals[0].value = data_ptr[0]; - parsec_dtd_untrack_remote_dep(tp, key2); - parsec_dtd_track_task(tp, key2, dtd_task); - 
remote_dep_dequeue_delayed_dep_release(dep); - } - parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key2); + /* We are part of the broadcast, forward message */ + int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); + populate_remote_deps(data_ptr, deps); + //successor = get_chain_successor(es, (parsec_task_t*)current_task, current_task->deps_out); + if(successor == -1) { + //current_task->deps_out->outgoing_mask = 0; + } + //current_task->super.locals[0].value = current_task->ht_item.key = ((1<<29) | (root << 20) | *(data_ptr+1+successor)); + assert(NULL != current_task->super.data[current_dep].data_out); + + //current_task->deps_out->output[0].data.data = current_task->super.data[0].data_out; + //parsec_dtd_retain_data_copy(current_task->super.data[current_dep].data_out); + parsec_remote_dep_activate( + es, (parsec_task_t *)current_task, + deps, + deps->outgoing_mask); + //current_task->deps_out = NULL; + /* update the BCAST DATA task or dep with the global ID that we know now */ + uint64_t key = ((uint64_t)(1<<28 | (root << 20 ) | data_ptr[es->virtual_process->parsec_context->my_rank+1])<<32) | (1U<<0); + uint64_t key2 = ((uint64_t)(data_ptr[0])<<32) | (1U<<0); + + parsec_dtd_task_t* dtd_task = NULL; + parsec_dtd_taskpool_t *tp = (parsec_dtd_taskpool_t *)current_task->super.taskpool; + parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key); + dtd_task = parsec_dtd_find_task(tp, key); + if(dtd_task == NULL) { + int* buffer = malloc(sizeof(int)*30*30); + memcpy(buffer, data_ptr, sizeof(int)*30*30); + dtd_hash_table_pointer_item_t *item = (dtd_hash_table_pointer_item_t *)parsec_thread_mempool_allocate(tp->hash_table_bucket_mempool->thread_mempools); + parsec_hash_table_t *hash_table = tp->keys_hash_table; + item->ht_item.key = (parsec_key_t)key; + item->mempool_owner = tp->hash_table_bucket_mempool->thread_mempools; + item->value = (void *)buffer; + parsec_hash_table_nolock_insert( hash_table, 
&item->ht_item ); + } else { + parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key2); + parsec_remote_deps_t *dep = parsec_dtd_find_task(tp, key2); + + //populate_remote_deps(data_ptr, dtd_task->deps_out); + parsec_dtd_untrack_task(tp, key); + if(dep == NULL){ + dtd_task->super.locals[0].value = data_ptr[0]; + parsec_dtd_track_task(tp, key2, dtd_task); + }else{ + + dtd_task->super.locals[0].value = data_ptr[0]; + parsec_dtd_untrack_remote_dep(tp, key2); + parsec_dtd_track_task(tp, key2, dtd_task); + remote_dep_dequeue_delayed_dep_release(dep); } - parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key); - tile = FLOW_OF(current_task, current_dep)->tile; - parsec_dtd_tile_retain(tile); + parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key2); } + parsec_hash_table_unlock_bucket(tp->task_hash_table, (parsec_key_t)key); + tile = FLOW_OF(current_task, current_dep)->tile; + parsec_dtd_tile_retain(tile); } else { /* on the receiver side, get datatype to aquire datatype, arena etc info */ data.data = current_task->super.data[current_dep].data_out; @@ -220,6 +233,7 @@ int parsec_dtd_bcast_key_fn( parsec_execution_stream_t *es, parsec_task_t *this_task) { (void)es; (void)this_task; + fprintf(stderr, "executed the body of bcast key fn\n"); return PARSEC_HOOK_RETURN_DONE; } diff --git a/parsec/remote_dep.c b/parsec/remote_dep.c index 62c9654c4..ae9d18cd0 100644 --- a/parsec/remote_dep.c +++ b/parsec/remote_dep.c @@ -135,7 +135,7 @@ remote_dep_complete_and_cleanup(parsec_remote_deps_t** deps, (*deps)->outgoing_mask = 0; if(ncompleted) remote_dep_dec_flying_messages((*deps)->taskpool); - remote_deps_free(*deps); + //remote_deps_free(*deps); *deps = NULL; return 1; } @@ -533,7 +533,8 @@ int parsec_remote_dep_activate(parsec_execution_stream_t* es, /* Right now DTD only supports a star broadcast topology */ if( PARSEC_TASKPOOL_TYPE_DTD == task->taskpool->taskpool_type ) { parsec_dtd_task_t *this_dtd_task = (parsec_dtd_task_t *) 
task; - if(this_dtd_task->deps_out == NULL) { + if(this_dtd_task->super.task_class->task_class_id != PARSEC_DTD_BCAST_KEY_TC_ID && + this_dtd_task->super.task_class->task_class_id != PARSEC_DTD_BCAST_DATA_TC_ID) { remote_deps->msg.locals[0].value = remote_deps->bcast_keys[i]; /* p2p, update the key for this message */ remote_dep_bcast_child_permits = remote_dep_bcast_star_child(my_idx, idx); } else { @@ -544,6 +545,15 @@ int parsec_remote_dep_activate(parsec_execution_stream_t* es, } if(remote_dep_bcast_child_permits) { + if( PARSEC_TASKPOOL_TYPE_DTD == task->taskpool->taskpool_type ) { + parsec_dtd_task_t *this_dtd_task = (parsec_dtd_task_t *) task; + if(this_dtd_task->super.task_class->task_class_id == PARSEC_DTD_BCAST_KEY_TC_ID || + this_dtd_task->super.task_class->task_class_id == PARSEC_DTD_BCAST_DATA_TC_ID) { + int* data_ptr = (int*)parsec_data_copy_get_ptr(this_dtd_task->super.data[0].data_out); + this_dtd_task->super.locals[0].value = this_dtd_task->ht_item.key = ((1<<29) |(remote_deps->root << 20) | *(data_ptr+1+rank)); + remote_deps->msg.locals[0].value = this_dtd_task->super.locals[0].value; + } + } PARSEC_DEBUG_VERBOSE(20, parsec_comm_output_stream, "[%d:%d] task %s my_idx %d idx %d rank %d -- send (%x)", remote_deps->root, i, tmp, my_idx, idx, rank, remote_deps->outgoing_mask); assert(remote_deps->outgoing_mask & (1U<pending_ack); } //if(PARSEC_TASKPOOL_TYPE_DTD == task->taskpool->taskpool_type && task->task_class->task_class_id == 2) - // remote_dep_inc_flying_messages(task->taskpool); + remote_dep_inc_flying_messages(task->taskpool); + (void)parsec_atomic_fetch_inc_int32(&remote_deps->pending_ack); remote_dep_send(es, rank, remote_deps); } else { PARSEC_DEBUG_VERBOSE(20, parsec_comm_output_stream, "[%d:%d] task %s my_idx %d idx %d rank %d -- skip (not my direct descendant)", @@ -581,7 +592,7 @@ int parsec_remote_dep_activate(parsec_execution_stream_t* es, } } } - remote_dep_complete_and_cleanup(&remote_deps, (keeper ? 
1 : 0)); + //remote_dep_complete_and_cleanup(&remote_deps, (keeper ? 1 : 0)); return 0; } diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index 8b0d1ad16..1df413ac9 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -750,7 +750,7 @@ remote_dep_get_datatypes(parsec_execution_stream_t* es, if( NULL == dtd_task ) { return_defer = 1; - //fprintf(stderr, "defer receive for key %d k %d %llu\n", origin->msg.locals[0].value, k, key); + fprintf(stderr, "defer receive for key %d k %d %llu\n", origin->msg.locals[0].value, k, key); /* AM buffers are reused by the comm engine once the activation * has been conveyed to upper layer. In case of DTD we might receive msg to @@ -896,9 +896,15 @@ remote_dep_release_incoming(parsec_execution_stream_t* es, } PARSEC_DEBUG_VERBOSE(20, parsec_comm_output_stream, "MPI:\tTranslate mask from 0x%lx to 0x%x (remote_dep_release_incoming)", complete_mask, action_mask); - (void)task.task_class->release_deps(es, &task, - action_mask | PARSEC_ACTION_RELEASE_LOCAL_DEPS, - NULL); + if(task.task_class->task_class_id == PARSEC_DTD_BCAST_KEY_TC_ID) { + (void)task.task_class->release_deps(es, &task, + action_mask | PARSEC_ACTION_RELEASE_LOCAL_DEPS, + origin); + } else { + (void)task.task_class->release_deps(es, &task, + action_mask | PARSEC_ACTION_RELEASE_LOCAL_DEPS, + NULL); + } assert(0 == (origin->incoming_mask & complete_mask)); if(0 != origin->incoming_mask) /* not done receiving */ @@ -912,7 +918,7 @@ remote_dep_release_incoming(parsec_execution_stream_t* es, * references on the allocated data and on the dependency. 
*/ uint32_t mask = origin->outgoing_mask; - origin->outgoing_mask = 0; + //origin->outgoing_mask = 0; #if defined(PARSEC_DIST_COLLECTIVES) if( PARSEC_TASKPOOL_TYPE_PTG == origin->taskpool->taskpool_type ) /* indicates it is a PTG taskpool */ @@ -933,7 +939,7 @@ remote_dep_release_incoming(parsec_execution_stream_t* es, remote_dep_complete_and_cleanup(&origin, 1); } else { //remote_dep_complete_and_cleanup(&origin, 1); - remote_deps_free(origin); + //remote_deps_free(origin); //remote_dep_dec_flying_messages(task.taskpool); } @@ -2127,9 +2133,9 @@ remote_dep_mpi_save_activate_cb(parsec_execution_stream_t* es, &deps->msg, dep_count, dep_dtt, dep_comm); deps->from = status->MPI_SOURCE; - //if(es->virtual_process->parsec_context->my_rank == 1){ - // fprintf(stderr, "save activate cb with value %d\n", deps->msg.locals[0].value); - //} + if(es->virtual_process->parsec_context->my_rank == 1){ + fprintf(stderr, "save activate cb with value %d\n", deps->msg.locals[0].value); + } /* Retrieve the data arenas and update the msg.incoming_mask to reflect * the data we should be receiving from the predecessor. */ diff --git a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c index 250f0190d..ff34491ae 100644 --- a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c +++ b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c @@ -231,7 +231,8 @@ int test_broadcast_mixed( int key_root; parsec_dtd_tile_t* dtd_tile_root; parsec_dtd_tile_t* bcast_keys_root; - if(myrank % 2 == 1 || myrank == root) { + //if(myrank % 2 == 1 || myrank == root) { + if(1) { key_root = key = A->data_key(A, root, 0); dtd_tile_root = PARSEC_DTD_TILE_OF_KEY(A, key_root); key_root = B->data_key(B, root, 0); @@ -248,7 +249,8 @@ int test_broadcast_mixed( // Put odd rank indexes into `dest_ranks` array except for the root // node. 
VALID ONLY ON THE ROOT NODE for (int rank = 0; rank < world; ++rank) { - if (rank % 2 == 0 || rank == root) continue; + //if (rank % 2 == 0 || rank == root) continue; + if (rank == root) continue; dest_ranks[dest_rank_idx] = rank; ++dest_rank_idx; @@ -258,7 +260,8 @@ int test_broadcast_mixed( // // Perform Broadcast // - if(myrank % 2 == 1 || myrank == root) { + //if(myrank % 2 == 1 || myrank == root) { + if(1) { fprintf(stderr, "perform bcast from rank %d\n", myrank); parsec_dtd_broadcast( dtd_tp, root, @@ -271,7 +274,8 @@ int test_broadcast_mixed( // Retrieve value of broadcasted data // //if(myrank % 2 == 1 || myrank == root) { - if(myrank % 2 == 1) { + //if(myrank % 2 == 1) { + if(1) { //for (int rank = 0; rank < world; ++rank) { //if (rank % 2 == 0 || rank == root) continue; @@ -283,7 +287,7 @@ int test_broadcast_mixed( sizeof(int*), &data_value_out, PARSEC_VALUE, PARSEC_DTD_ARG_END); parsec_dtd_task_t *dtd_retrieve_task = (parsec_dtd_task_t *)retrieve_task; - parsec_insert_dtd_task(retrieve_task); + //parsec_insert_dtd_task(retrieve_task); //} } From d56c3b9e21a377dd848fbe89f1e7c61c82100a3f Mon Sep 17 00:00:00 2001 From: yu-pei Date: Mon, 31 Jan 2022 13:40:18 -0500 Subject: [PATCH 27/41] linking bcast meta data to propagate bcast data with global ID --- parsec/interfaces/superscalar/collectives.c | 6 +- .../superscalar/overlap_strategies.c | 66 ++++++++++++------- .../superscalar/parsec_dtd_broadcast.c | 30 ++++++--- parsec/remote_dep.c | 3 +- parsec/remote_dep_mpi.c | 3 +- .../dtd_test_broadcast_collective.c | 2 +- 6 files changed, 69 insertions(+), 41 deletions(-) diff --git a/parsec/interfaces/superscalar/collectives.c b/parsec/interfaces/superscalar/collectives.c index 231f36869..a28cd2baa 100644 --- a/parsec/interfaces/superscalar/collectives.c +++ b/parsec/interfaces/superscalar/collectives.c @@ -113,13 +113,13 @@ void parsec_dtd_broadcast( new_data_copy->coherency_state = PARSEC_DATA_COHERENCY_OWNED; new_data_copy->device_private = 
malloc(sizeof(int)*2500); bcast_keys_root->data_copy = new_data_copy; - bcast_keys_root->ht_item.key = (parsec_key_t)key; - parsec_hash_table_insert(parsec_bcast_keys_hash, &bcast_keys_root->ht_item); if(myrank == root) { bcast_id = ( (1<<30) | (root << 18) | dtd_tp->bcast_id); dtd_tp->bcast_id++; + bcast_keys_root->ht_item.key = (parsec_key_t)bcast_id; + parsec_hash_table_insert(parsec_bcast_keys_hash, &bcast_keys_root->ht_item); parsec_data_copy = bcast_keys_root->data_copy; data_ptr = (int*)parsec_data_copy_get_ptr(parsec_data_copy); @@ -185,7 +185,7 @@ void parsec_dtd_broadcast( parsec_insert_dtd_task(dtd_bcast_key_root); /* Post the bcast tasks for the actual data */ - //parsec_insert_dtd_task(dtd_bcast_task_root); + parsec_insert_dtd_task(dtd_bcast_task_root); } #endif diff --git a/parsec/interfaces/superscalar/overlap_strategies.c b/parsec/interfaces/superscalar/overlap_strategies.c index 69cb75713..6448f55ef 100644 --- a/parsec/interfaces/superscalar/overlap_strategies.c +++ b/parsec/interfaces/superscalar/overlap_strategies.c @@ -175,39 +175,57 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, //if(current_task->deps_out != NULL) { /* we have not propagate the remote deps yet, otherwise will be set to NULL */ if(action_mask & PARSEC_ACTION_COMPLETE_LOCAL_TASK) { + parsec_remote_deps_t *deps = NULL; + PARSEC_ALLOCATE_REMOTE_DEPS_IF_NULL(deps, this_task, MAX_PARAM_COUNT); + deps->root = rank_src; + deps->outgoing_mask |= (1 << 0); /* only 1 flow */ + deps->max_priority = 0; + + struct remote_dep_output_param_s* output = &deps->output[0]; + output->data.data = current_task->super.data[0].data_out; + output->data.arena = parsec_dtd_arenas_datatypes[FLOW_OF(current_task, current_dep)->arena_index].arena; + output->data.layout = parsec_dtd_arenas_datatypes[FLOW_OF(current_task, current_dep)->arena_index].opaque_dtt; + output->data.count = 1; + output->data.displ = 0; + output->priority = 0; + assert(NULL != 
current_task->super.data[current_dep].data_out); - //fprintf(stderr, "bcast root task %p data with global key %d\n", current_task, current_task->ht_item.key); + parsec_dtd_tile_t *tile = NULL; + tile = (parsec_dtd_tile_t *)parsec_hash_table_nolock_find(parsec_bcast_keys_hash, (parsec_key_t)current_task->super.locals[0].value); + fprintf(stderr, "bcast root task %p data with global key %d tile %p\n", current_task, current_task->ht_item.key, tile); + int* data_ptr = (int*)parsec_data_copy_get_ptr(tile->data_copy); + populate_remote_deps(data_ptr, deps); //current_task->deps_out->output[0].data.data = // current_task->super.data[current_dep].data_out; //(void)parsec_atomic_fetch_inc_int32(&current_task->super.data[current_dep].data_out->readers); - //parsec_remote_dep_activate( - // es, (parsec_task_t *)current_task, - // current_task->deps_out, - // current_task->deps_out->outgoing_mask); + parsec_remote_dep_activate( + es, (parsec_task_t *)current_task, + deps, + deps->outgoing_mask); //current_task->deps_out = NULL; } else if(action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { /* current node is part of the broadcast operation, propagate downstream */ //int root = current_task->deps_out->root; + parsec_release_dep_fct_arg_t* arg = (parsec_release_dep_fct_arg_t*)ontask_arg; + parsec_remote_deps_t* deps = arg->remote_deps; + int root = deps->root; int my_rank = current_task->super.taskpool->context->my_rank; - int _array_pos, _array_mask; - struct remote_dep_output_param_s* output; - //output = &current_task->deps_out->output[0]; - _array_pos = my_rank / (8 * sizeof(uint32_t)); - _array_mask = 1 << (my_rank % (8 * sizeof(uint32_t))); - //fprintf(stderr, "bcast data continue on rank %d, from root %d, for task %p\n", my_rank, root, current_task); - - if ((output->rank_bits[_array_pos] & _array_mask)) { - assert(NULL != current_task->super.data[current_dep].data_out); - - //current_task->deps_out->output[0].data.data = - // current_task->super.data[0].data_out; -
//(void)parsec_atomic_fetch_inc_int32(&current_task->super.data[current_dep].data_out->readers); - //parsec_remote_dep_activate( - // es, (parsec_task_t *)current_task, - // current_task->deps_out, - // current_task->deps_out->outgoing_mask); - //current_task->deps_out = NULL; - } + + parsec_dtd_tile_t* item = (parsec_dtd_tile_t *)parsec_hash_table_nolock_find( parsec_bcast_keys_hash, (parsec_key_t)current_task->super.locals[0].value ); + int* data_ptr = (int*)item->data_copy->device_private; + populate_remote_deps(data_ptr, deps); + fprintf(stderr, "bcast data continue on rank %d, from root %d, for task %p with item %p value0 %d\n", my_rank, root, current_task, item, data_ptr[0]); + + assert(NULL != current_task->super.data[current_dep].data_out); + + //current_task->deps_out->output[0].data.data = + // current_task->super.data[0].data_out; + //(void)parsec_atomic_fetch_inc_int32(&current_task->super.data[current_dep].data_out->readers); + parsec_remote_dep_activate( + es, (parsec_task_t *)current_task, + deps, + deps->outgoing_mask); + //current_task->deps_out = NULL; } //} } /* BCAST DATA propagation */ diff --git a/parsec/interfaces/superscalar/parsec_dtd_broadcast.c b/parsec/interfaces/superscalar/parsec_dtd_broadcast.c index 1380b6848..67f01b387 100644 --- a/parsec/interfaces/superscalar/parsec_dtd_broadcast.c +++ b/parsec/interfaces/superscalar/parsec_dtd_broadcast.c @@ -169,16 +169,25 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, parsec_dtd_taskpool_t *tp = (parsec_dtd_taskpool_t *)current_task->super.taskpool; parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key); dtd_task = parsec_dtd_find_task(tp, key); - if(dtd_task == NULL) { - int* buffer = malloc(sizeof(int)*30*30); - memcpy(buffer, data_ptr, sizeof(int)*30*30); - dtd_hash_table_pointer_item_t *item = (dtd_hash_table_pointer_item_t *)parsec_thread_mempool_allocate(tp->hash_table_bucket_mempool->thread_mempools); - parsec_hash_table_t *hash_table = tp->keys_hash_table;
- item->ht_item.key = (parsec_key_t)key; - item->mempool_owner = tp->hash_table_bucket_mempool->thread_mempools; - item->value = (void *)buffer; - parsec_hash_table_nolock_insert( hash_table, &item->ht_item ); - } else { + + // store the meta data info into the key hash table + int* buffer = malloc(sizeof(int)*30*30); + memcpy(buffer, data_ptr, sizeof(int)*30*30); + dtd_hash_table_pointer_item_t *item = (dtd_hash_table_pointer_item_t *)parsec_thread_mempool_allocate(tp->hash_table_bucket_mempool->thread_mempools); + parsec_hash_table_t *hash_table = tp->keys_hash_table; + item->ht_item.key = (parsec_key_t)key; + item->mempool_owner = tp->hash_table_bucket_mempool->thread_mempools; + item->value = (void *)buffer; + parsec_hash_table_nolock_insert( hash_table, &item->ht_item ); + + parsec_dtd_tile_t* bcast_keys = (parsec_dtd_tile_t *) parsec_thread_mempool_allocate( parsec_bcast_keys_tile_mempool->thread_mempools ); + bcast_keys->ht_item.key = (parsec_key_t)(data_ptr[0]); + bcast_keys->data_copy = PARSEC_OBJ_NEW(parsec_data_copy_t); + bcast_keys->data_copy->device_private = (void *)buffer; + parsec_hash_table_nolock_insert( parsec_bcast_keys_hash, &bcast_keys->ht_item ); + fprintf(stderr, "insert into parsec_bcast_keys_hash the item %p with value pointer %p on rank %d\n", bcast_keys, buffer, es->virtual_process->parsec_context->my_rank); + + if(dtd_task != NULL) { parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key2); parsec_remote_deps_t *dep = parsec_dtd_find_task(tp, key2); @@ -250,6 +259,7 @@ int parsec_dtd_bcast_data_fn( parsec_execution_stream_t *es, parsec_task_t *this_task) { (void)es; (void)this_task; + fprintf(stderr, "executed the body of bcast data fn\n"); return PARSEC_HOOK_RETURN_DONE; } diff --git a/parsec/remote_dep.c b/parsec/remote_dep.c index ae9d18cd0..45459ee94 100644 --- a/parsec/remote_dep.c +++ b/parsec/remote_dep.c @@ -547,8 +547,7 @@ int parsec_remote_dep_activate(parsec_execution_stream_t* es, 
if(remote_dep_bcast_child_permits) { if( PARSEC_TASKPOOL_TYPE_DTD == task->taskpool->taskpool_type ) { parsec_dtd_task_t *this_dtd_task = (parsec_dtd_task_t *) task; - if(this_dtd_task->super.task_class->task_class_id == PARSEC_DTD_BCAST_KEY_TC_ID || - this_dtd_task->super.task_class->task_class_id == PARSEC_DTD_BCAST_DATA_TC_ID) { + if(this_dtd_task->super.task_class->task_class_id == PARSEC_DTD_BCAST_KEY_TC_ID) { int* data_ptr = (int*)parsec_data_copy_get_ptr(this_dtd_task->super.data[0].data_out); this_dtd_task->super.locals[0].value = this_dtd_task->ht_item.key = ((1<<29) |(remote_deps->root << 20) | *(data_ptr+1+rank)); remote_deps->msg.locals[0].value = this_dtd_task->super.locals[0].value; diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index 1df413ac9..2e480c6c8 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -896,7 +896,8 @@ remote_dep_release_incoming(parsec_execution_stream_t* es, } PARSEC_DEBUG_VERBOSE(20, parsec_comm_output_stream, "MPI:\tTranslate mask from 0x%lx to 0x%x (remote_dep_release_incoming)", complete_mask, action_mask); - if(task.task_class->task_class_id == PARSEC_DTD_BCAST_KEY_TC_ID) { + if(task.task_class->task_class_id == PARSEC_DTD_BCAST_KEY_TC_ID || + task.task_class->task_class_id == PARSEC_DTD_BCAST_DATA_TC_ID) { (void)task.task_class->release_deps(es, &task, action_mask | PARSEC_ACTION_RELEASE_LOCAL_DEPS, origin); diff --git a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c index ff34491ae..a6d449a18 100644 --- a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c +++ b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c @@ -287,7 +287,7 @@ int test_broadcast_mixed( sizeof(int*), &data_value_out, PARSEC_VALUE, PARSEC_DTD_ARG_END); parsec_dtd_task_t *dtd_retrieve_task = (parsec_dtd_task_t *)retrieve_task; - //parsec_insert_dtd_task(retrieve_task); + parsec_insert_dtd_task(retrieve_task); //} } From 
b1b6ce21a5ddaaf134414547ee8acdbbe1bde194 Mon Sep 17 00:00:00 2001 From: yu-pei Date: Sun, 6 Feb 2022 21:47:31 -0500 Subject: [PATCH 28/41] fix remote dep refcount issue --- parsec/interfaces/superscalar/parsec_dtd_data_flush.c | 1 + parsec/remote_dep.c | 8 ++++---- parsec/remote_dep_mpi.c | 4 +++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/parsec/interfaces/superscalar/parsec_dtd_data_flush.c b/parsec/interfaces/superscalar/parsec_dtd_data_flush.c index e93d21e76..1d54cbd4c 100644 --- a/parsec/interfaces/superscalar/parsec_dtd_data_flush.c +++ b/parsec/interfaces/superscalar/parsec_dtd_data_flush.c @@ -40,6 +40,7 @@ parsec_dtd_data_flush_sndrcv(parsec_execution_stream_t *es, assert(tile != NULL); + fprintf(stderr, "completed data flush task on rank %d\n", current_task->rank); #if defined(DISTRIBUTED) if(tile->rank == current_task->rank) { /* this is a receive task*/ if( current_task->super.data[0].data_in != tile->data_copy ) { diff --git a/parsec/remote_dep.c b/parsec/remote_dep.c index 45459ee94..cb39b471d 100644 --- a/parsec/remote_dep.c +++ b/parsec/remote_dep.c @@ -135,7 +135,7 @@ remote_dep_complete_and_cleanup(parsec_remote_deps_t** deps, (*deps)->outgoing_mask = 0; if(ncompleted) remote_dep_dec_flying_messages((*deps)->taskpool); - //remote_deps_free(*deps); + remote_deps_free(*deps); *deps = NULL; return 1; } @@ -579,8 +579,8 @@ int parsec_remote_dep_activate(parsec_execution_stream_t* es, (void)parsec_atomic_fetch_inc_int32(&remote_deps->pending_ack); } //if(PARSEC_TASKPOOL_TYPE_DTD == task->taskpool->taskpool_type && task->task_class->task_class_id == 2) - remote_dep_inc_flying_messages(task->taskpool); - (void)parsec_atomic_fetch_inc_int32(&remote_deps->pending_ack); + //remote_dep_inc_flying_messages(task->taskpool); + //(void)parsec_atomic_fetch_inc_int32(&remote_deps->pending_ack); remote_dep_send(es, rank, remote_deps); } else { PARSEC_DEBUG_VERBOSE(20, parsec_comm_output_stream, "[%d:%d] task %s my_idx %d idx %d rank %d -- skip 
(not my direct descendant)", @@ -591,7 +591,7 @@ int parsec_remote_dep_activate(parsec_execution_stream_t* es, } } } - //remote_dep_complete_and_cleanup(&remote_deps, (keeper ? 1 : 0)); + remote_dep_complete_and_cleanup(&remote_deps, (keeper ? 1 : 0)); return 0; } diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index 2e480c6c8..c8831d9fe 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -898,6 +898,8 @@ remote_dep_release_incoming(parsec_execution_stream_t* es, complete_mask, action_mask); if(task.task_class->task_class_id == PARSEC_DTD_BCAST_KEY_TC_ID || task.task_class->task_class_id == PARSEC_DTD_BCAST_DATA_TC_ID) { + //remote_dep_inc_flying_messages(origin->taskpool); + (void)parsec_atomic_fetch_inc_int32(&origin->pending_ack); (void)task.task_class->release_deps(es, &task, action_mask | PARSEC_ACTION_RELEASE_LOCAL_DEPS, origin); @@ -939,7 +941,7 @@ remote_dep_release_incoming(parsec_execution_stream_t* es, if(PARSEC_TASKPOOL_TYPE_PTG == origin->taskpool->taskpool_type) { remote_dep_complete_and_cleanup(&origin, 1); } else { - //remote_dep_complete_and_cleanup(&origin, 1); + remote_dep_complete_and_cleanup(&origin, 1); //remote_deps_free(origin); //remote_dep_dec_flying_messages(task.taskpool); From 1b222f045597f6367bbad907dba90c57acd8374f Mon Sep 17 00:00:00 2001 From: yu-pei Date: Sun, 6 Feb 2022 23:47:41 -0500 Subject: [PATCH 29/41] minor update to test case --- .../superscalar/parsec_dtd_broadcast.c | 2 +- .../dtd_test_broadcast_collective.c | 29 +++++++++---------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/parsec/interfaces/superscalar/parsec_dtd_broadcast.c b/parsec/interfaces/superscalar/parsec_dtd_broadcast.c index 67f01b387..f83ffcd94 100644 --- a/parsec/interfaces/superscalar/parsec_dtd_broadcast.c +++ b/parsec/interfaces/superscalar/parsec_dtd_broadcast.c @@ -105,7 +105,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, if( (flow_mask & (1<root = my_rank; 
deps->outgoing_mask |= (1 << 0); /* only 1 flow */ diff --git a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c index a6d449a18..c07c3fa3c 100644 --- a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c +++ b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c @@ -291,40 +291,38 @@ int test_broadcast_mixed( //} } -for(int iter=1; iter <= 0; iter++) { - // Second round of broadcast, create another array of keys for this bcast - //key_root = B->data_key(B, root+iter*world, 0); - key_root = B->data_key(B, root, 0); - bcast_keys_root = PARSEC_DTD_TILE_OF_KEY(B, key_root); +for(int iter=1; iter <= 1; iter++) { - sleep(5); int new_value = -1; key_root = key = A->data_key(A, root+iter*world, 0); dtd_tile_root = PARSEC_DTD_TILE_OF_KEY(A, key_root); if (root == myrank) { - //*data_ptr = 1998; + //*data_ptr = 1998; new_value = 1998+iter; + parsec_dtd_taskpool_insert_task(dtd_tp, + write_task_fn, 0, "write_task", + PASSED_BY_REF, dtd_tile_root, PARSEC_INOUT | TILE_FULL, + sizeof(int), &new_value, PARSEC_VALUE, + sizeof(int), &root, PARSEC_VALUE | PARSEC_AFFINITY, + PARSEC_DTD_ARG_END); } else { //data_value_out = data_ptr; } - parsec_dtd_taskpool_insert_task(dtd_tp, - write_task_fn, 0, "write_task", - PASSED_BY_REF, dtd_tile_root, PARSEC_INOUT | TILE_FULL, - sizeof(int), &new_value, PARSEC_VALUE, - sizeof(int), &root, PARSEC_VALUE | PARSEC_AFFINITY, - PARSEC_DTD_ARG_END); // Put all rank indexes into `dest_ranks` array except for the root // node. 
- dest_rank_idx = 0; + + int dest_rank_idx = 0; + int *dest_ranks = (int*)malloc(world*sizeof(int)); + for (int rank = 0; rank < world; ++rank) { if (rank == root) continue; dest_ranks[dest_rank_idx] = rank; ++dest_rank_idx; } - num_dest_ranks = dest_rank_idx; + int num_dest_ranks = dest_rank_idx; // // Perform Broadcast AGAIN @@ -338,7 +336,6 @@ for(int iter=1; iter <= 0; iter++) { // // Retrieve value of broadcasted data // - //for (int rank = 0; rank < world; ++rank) { //if ( myrank != root) { parsec_task_t *retrieve_task = parsec_dtd_taskpool_create_task( dtd_tp, retrieve_task_fn, 0, "retrieve_task", From 92c2d35aa220cc650efaf9025ef7fa61e1563834 Mon Sep 17 00:00:00 2001 From: yu-pei Date: Sun, 27 Feb 2022 20:25:25 -0500 Subject: [PATCH 30/41] changes to allow QR to work --- parsec/interfaces/superscalar/collectives.c | 13 +++--- .../interfaces/superscalar/insert_function.c | 6 ++- .../superscalar/overlap_strategies.c | 10 +++-- .../superscalar/parsec_dtd_broadcast.c | 7 ++-- .../superscalar/parsec_dtd_data_flush.c | 42 +++++++++++++++++++ parsec/remote_dep.c | 1 + parsec/remote_dep_mpi.c | 31 +++++++++----- 7 files changed, 87 insertions(+), 23 deletions(-) diff --git a/parsec/interfaces/superscalar/collectives.c b/parsec/interfaces/superscalar/collectives.c index a28cd2baa..9ad9bf00d 100644 --- a/parsec/interfaces/superscalar/collectives.c +++ b/parsec/interfaces/superscalar/collectives.c @@ -101,7 +101,8 @@ void parsec_dtd_broadcast( int myrank = taskpool->context->my_rank; parsec_dtd_taskpool_t *dtd_tp = (parsec_dtd_taskpool_t *)taskpool; - bcast_keys_root = (parsec_dtd_tile_t *) parsec_thread_mempool_allocate( parsec_bcast_keys_tile_mempool->thread_mempools ); + //bcast_keys_root = (parsec_dtd_tile_t *) parsec_thread_mempool_allocate( parsec_bcast_keys_tile_mempool->thread_mempools ); + bcast_keys_root = (parsec_dtd_tile_t *) malloc(sizeof(parsec_dtd_tile_t)); SET_LAST_ACCESSOR(bcast_keys_root); bcast_keys_root->dc = NULL; bcast_keys_root->arena_index = -1; 
@@ -115,11 +116,10 @@ void parsec_dtd_broadcast( bcast_keys_root->data_copy = new_data_copy; if(myrank == root) { - bcast_id = ( (1<<30) | (root << 18) | dtd_tp->bcast_id); + bcast_id = ( (1<<27) | (root << 18) | dtd_tp->bcast_id); dtd_tp->bcast_id++; - - bcast_keys_root->ht_item.key = (parsec_key_t)bcast_id; - parsec_hash_table_insert(parsec_bcast_keys_hash, &bcast_keys_root->ht_item); + + bcast_keys_root->ht_item.key = ((uintptr_t)bcast_id)<<32; parsec_data_copy = bcast_keys_root->data_copy; data_ptr = (int*)parsec_data_copy_get_ptr(parsec_data_copy); @@ -130,6 +130,9 @@ void parsec_dtd_broadcast( //pack the ranks at the end of the tiles as well data_ptr[400+i+1] = dest_ranks[i]; } + + fprintf(stderr, "on rank %d inserting key tile into bcast_keys_hash with key %ld num dest ranks %d\n", myrank, bcast_keys_root->ht_item.key, data_ptr[400]); + parsec_hash_table_insert(parsec_bcast_keys_hash, &bcast_keys_root->ht_item); } // Retrieve DTD tile's data_copy diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index 6df9f7e84..f71fbe21f 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -792,6 +792,7 @@ parsec_dtd_untrack_task( parsec_dtd_taskpool_t *tp, parsec_hash_table_t *hash_table = tp->task_hash_table; void *value; + fprintf(stderr, "untracking task with key value %ld on rank %d\n", key, tp->super.context->my_rank); dtd_hash_table_pointer_item_t *item = (dtd_hash_table_pointer_item_t *)parsec_hash_table_nolock_find( hash_table, (parsec_key_t)key ); if( NULL == item ) return NULL; @@ -1087,7 +1088,7 @@ parsec_dtd_data_collection_init( parsec_data_collection_t *dc ) void parsec_dtd_data_collection_fini( parsec_data_collection_t *dc ) { - //parsec_hash_table_fini(dc->tile_h_table); + parsec_hash_table_fini(dc->tile_h_table); PARSEC_OBJ_RELEASE(dc->tile_h_table); parsec_dc_unregister_id(dc->dc_id); } @@ -1654,6 +1655,7 @@ 
parsec_dtd_release_deps(parsec_execution_stream_t *es, if((action_mask & (1 << flow_index))) { if(!(track_flow & (1U << flow_index))) { uint64_t key = (((uint64_t)this_task->locals[0].value<<32) | (1U<locals[0].value, tp->super.context->my_rank); parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key); this_dtd_task = parsec_dtd_find_task( tp, key ); assert(this_dtd_task != NULL); @@ -2930,7 +2932,7 @@ parsec_insert_dtd_task(parsec_task_t *__this_task) /* Releasing every remote_task */ if( parsec_dtd_task_is_remote( this_task ) ) { - // parsec_dtd_remote_task_release( this_task ); + parsec_dtd_remote_task_release( this_task ); } /* Increase the count of satisfied flows to counter-balance the increase in the diff --git a/parsec/interfaces/superscalar/overlap_strategies.c b/parsec/interfaces/superscalar/overlap_strategies.c index 6448f55ef..a083a9c09 100644 --- a/parsec/interfaces/superscalar/overlap_strategies.c +++ b/parsec/interfaces/superscalar/overlap_strategies.c @@ -191,8 +191,12 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, assert(NULL != current_task->super.data[current_dep].data_out); parsec_dtd_tile_t *tile = NULL; - tile = (parsec_dtd_tile_t *)parsec_hash_table_nolock_find(parsec_bcast_keys_hash, (parsec_key_t)current_task->super.locals[0].value); - fprintf(stderr, "bcast root task %p data with global key %d tile %p\n", current_task, current_task->ht_item.key, tile); + parsec_key_t key = ((uintptr_t)current_task->super.locals[0].value)<<32; + while(tile == NULL){ + tile = (parsec_dtd_tile_t *)parsec_hash_table_nolock_find(parsec_bcast_keys_hash, key); + //fprintf(stderr, "bcast root task %p data with global key %d tile %p on rank %d\n", current_task, current_task->ht_item.key, tile, current_task->super.taskpool->context->my_rank); + fprintf(stderr, "bcast root task %p data with global key %ld tile %p on rank %d\n", current_task, key, tile, current_task->super.taskpool->context->my_rank); + } int* data_ptr = 
(int*)parsec_data_copy_get_ptr(tile->data_copy); populate_remote_deps(data_ptr, deps); //current_task->deps_out->output[0].data.data = @@ -211,7 +215,7 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, int root = deps->root; int my_rank = current_task->super.taskpool->context->my_rank; - parsec_dtd_tile_t* item = (parsec_dtd_tile_t *)parsec_hash_table_nolock_find( parsec_bcast_keys_hash, (parsec_key_t)current_task->super.locals[0].value ); + parsec_dtd_tile_t* item = (parsec_dtd_tile_t *)parsec_hash_table_nolock_find( parsec_bcast_keys_hash, (parsec_key_t)((uintptr_t)current_task->super.locals[0].value)); int* data_ptr = (int*)item->data_copy->device_private; populate_remote_deps(data_ptr, deps); fprintf(stderr, "bcast data continue on rank %d, from root %d, for task %p with item %p value0 %d\n", my_rank, root, current_task, item, data_ptr[0]); diff --git a/parsec/interfaces/superscalar/parsec_dtd_broadcast.c b/parsec/interfaces/superscalar/parsec_dtd_broadcast.c index f83ffcd94..a64e71c5c 100644 --- a/parsec/interfaces/superscalar/parsec_dtd_broadcast.c +++ b/parsec/interfaces/superscalar/parsec_dtd_broadcast.c @@ -180,12 +180,13 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, item->value = (void *)buffer; parsec_hash_table_nolock_insert( hash_table, &item->ht_item ); - parsec_dtd_tile_t* bcast_keys = (parsec_dtd_tile_t *) parsec_thread_mempool_allocate( parsec_bcast_keys_tile_mempool->thread_mempools ); - bcast_keys->ht_item.key = (parsec_key_t)(data_ptr[0]); + //parsec_dtd_tile_t* bcast_keys = (parsec_dtd_tile_t *) parsec_thread_mempool_allocate( parsec_bcast_keys_tile_mempool->thread_mempools ); + parsec_dtd_tile_t* bcast_keys = (parsec_dtd_tile_t *)malloc(sizeof(parsec_dtd_tile_t)); + bcast_keys->ht_item.key = (parsec_key_t)((uintptr_t)data_ptr[0]); bcast_keys->data_copy = PARSEC_OBJ_NEW(parsec_data_copy_t); bcast_keys->data_copy->device_private = (void *)buffer; parsec_hash_table_nolock_insert( parsec_bcast_keys_hash, 
&bcast_keys->ht_item ); - fprintf(stderr, "insert into parsec_bcast_keys_hash the item %p with value pointer %p on rank %d\n", bcast_keys, buffer, es->virtual_process->parsec_context->my_rank); + fprintf(stderr, "insert into parsec_bcast_keys_hash the item %p key %d with value pointer %p on rank %d\n", bcast_keys, data_ptr[0], buffer, es->virtual_process->parsec_context->my_rank); if(dtd_task != NULL) { parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key2); diff --git a/parsec/interfaces/superscalar/parsec_dtd_data_flush.c b/parsec/interfaces/superscalar/parsec_dtd_data_flush.c index 1d54cbd4c..a6a53201b 100644 --- a/parsec/interfaces/superscalar/parsec_dtd_data_flush.c +++ b/parsec/interfaces/superscalar/parsec_dtd_data_flush.c @@ -186,6 +186,27 @@ parsec_insert_dtd_flush_task(parsec_dtd_task_t *this_task, parsec_dtd_tile_t *ti this_task, flow_index, last_user.op_type, tile_op_type, last_user.alive); + if(last_writer.task->super.task_class->task_class_id != PARSEC_DTD_BCAST_KEY_TC_ID && + last_writer.task->super.task_class->task_class_id != PARSEC_DTD_BCAST_DATA_TC_ID) { + /* local parent and we are inserting a remote task, indicates it needs to send data */ + if(parsec_dtd_task_is_local(last_writer.task) && parsec_dtd_task_is_remote(this_task)) + { + int _array_pos, _array_mask; + _array_pos = this_task->rank / (8 * sizeof(int)); + _array_mask = 1 << (this_task->rank % (8 * sizeof(int))); + if(last_writer.task->rank_bits[_array_pos] & _array_mask) + { + FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank] = last_writer.task->super.locals[5+this_task->rank%5].value; + } else + { + last_writer.task->rank_bits[_array_pos] |= _array_mask; + FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank] = dtd_tp->send_task_id[this_task->rank]++; + last_writer.task->super.locals[5+this_task->rank%5].value = FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank]; + } + } + } else { + /* do 
nothing */ + } } else { parsec_dtd_set_parent(last_writer.task, last_writer.flow_index, this_task, flow_index, last_writer.op_type, @@ -194,6 +215,27 @@ parsec_insert_dtd_flush_task(parsec_dtd_task_t *this_task, parsec_dtd_tile_t *ti this_task, flow_index, (PARENT_OF(this_task, flow_index))->op_type, tile_op_type, last_user.alive); + if(last_writer.task->super.task_class->task_class_id != PARSEC_DTD_BCAST_KEY_TC_ID && + last_writer.task->super.task_class->task_class_id != PARSEC_DTD_BCAST_DATA_TC_ID) { + /* local parent and we are inserting a remote task, indicates it needs to send data */ + if(parsec_dtd_task_is_local(last_writer.task) && parsec_dtd_task_is_remote(this_task)) + { + int _array_pos, _array_mask; + _array_pos = this_task->rank / (8 * sizeof(int)); + _array_mask = 1 << (this_task->rank % (8 * sizeof(int))); + if(last_writer.task->rank_bits[_array_pos] & _array_mask) + { + FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank] = last_writer.task->super.locals[5+this_task->rank%5].value; + } else + { + last_writer.task->rank_bits[_array_pos] |= _array_mask; + FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank] = dtd_tp->send_task_id[this_task->rank]++; + last_writer.task->super.locals[5+this_task->rank%5].value = FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank]; + } + } + } else { + /* do nothing */ + } parsec_dtd_task_t *parent_task = (PARENT_OF(this_task, flow_index))->task; if( parsec_dtd_task_is_local(parent_task) || parsec_dtd_task_is_local(this_task) ) { diff --git a/parsec/remote_dep.c b/parsec/remote_dep.c index cb39b471d..bcaaa9146 100644 --- a/parsec/remote_dep.c +++ b/parsec/remote_dep.c @@ -551,6 +551,7 @@ int parsec_remote_dep_activate(parsec_execution_stream_t* es, int* data_ptr = (int*)parsec_data_copy_get_ptr(this_dtd_task->super.data[0].data_out); this_dtd_task->super.locals[0].value = this_dtd_task->ht_item.key = ((1<<29) |(remote_deps->root << 20) | 
*(data_ptr+1+rank)); remote_deps->msg.locals[0].value = this_dtd_task->super.locals[0].value; + fprintf(stderr, "for remote_dep %p update key in activate to %d\n", remote_deps, this_dtd_task->super.locals[0].value); } } PARSEC_DEBUG_VERBOSE(20, parsec_comm_output_stream, "[%d:%d] task %s my_idx %d idx %d rank %d -- send (%x)", diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index c8831d9fe..cca8cd809 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -744,6 +744,7 @@ remote_dep_get_datatypes(parsec_execution_stream_t* es, uint64_t key = (uint64_t)origin->msg.locals[0].value<<32 | (1U<msg.locals[0].value, k, key); parsec_hash_table_lock_bucket(dtd_tp->task_hash_table, (parsec_key_t)key); dtd_task = parsec_dtd_find_task( dtd_tp, key ); @@ -896,13 +897,19 @@ remote_dep_release_incoming(parsec_execution_stream_t* es, } PARSEC_DEBUG_VERBOSE(20, parsec_comm_output_stream, "MPI:\tTranslate mask from 0x%lx to 0x%x (remote_dep_release_incoming)", complete_mask, action_mask); - if(task.task_class->task_class_id == PARSEC_DTD_BCAST_KEY_TC_ID || - task.task_class->task_class_id == PARSEC_DTD_BCAST_DATA_TC_ID) { - //remote_dep_inc_flying_messages(origin->taskpool); - (void)parsec_atomic_fetch_inc_int32(&origin->pending_ack); - (void)task.task_class->release_deps(es, &task, - action_mask | PARSEC_ACTION_RELEASE_LOCAL_DEPS, - origin); + if( PARSEC_TASKPOOL_TYPE_DTD == origin->taskpool->taskpool_type ) { + if(task.task_class->task_class_id == PARSEC_DTD_BCAST_KEY_TC_ID || + task.task_class->task_class_id == PARSEC_DTD_BCAST_DATA_TC_ID) { + //remote_dep_inc_flying_messages(origin->taskpool); + (void)parsec_atomic_fetch_inc_int32(&origin->pending_ack); + (void)task.task_class->release_deps(es, &task, + action_mask | PARSEC_ACTION_RELEASE_LOCAL_DEPS, + origin); + } else { + (void)task.task_class->release_deps(es, &task, + action_mask | PARSEC_ACTION_RELEASE_LOCAL_DEPS, + NULL); + } } else { (void)task.task_class->release_deps(es, &task, action_mask | 
PARSEC_ACTION_RELEASE_LOCAL_DEPS, @@ -925,7 +932,10 @@ remote_dep_release_incoming(parsec_execution_stream_t* es, #if defined(PARSEC_DIST_COLLECTIVES) if( PARSEC_TASKPOOL_TYPE_PTG == origin->taskpool->taskpool_type ) /* indicates it is a PTG taskpool */ + { + origin->outgoing_mask = 0; parsec_remote_dep_propagate(es, &task, origin); + } #endif /* PARSEC_DIST_COLLECTIVES */ /** * Release the dependency owned by the communication engine for all data @@ -2136,9 +2146,10 @@ remote_dep_mpi_save_activate_cb(parsec_execution_stream_t* es, &deps->msg, dep_count, dep_dtt, dep_comm); deps->from = status->MPI_SOURCE; - if(es->virtual_process->parsec_context->my_rank == 1){ - fprintf(stderr, "save activate cb with value %d\n", deps->msg.locals[0].value); - } + //if(es->virtual_process->parsec_context->my_rank == 1){ + // fprintf(stderr, "save activate cb with value %d\n", deps->msg.locals[0].value); + //} + /* Retrieve the data arenas and update the msg.incoming_mask to reflect * the data we should be receiving from the predecessor. 
*/ From 1ce2dbe6c49243021d1fbce14a9cdf8b9edc24b9 Mon Sep 17 00:00:00 2001 From: yu-pei Date: Thu, 3 Mar 2022 13:44:18 -0500 Subject: [PATCH 31/41] increase nb pending task of task pool for a remote dep, similar to PTG coll --- .../interfaces/superscalar/insert_function.c | 4 ++-- .../superscalar/overlap_strategies.c | 2 +- parsec/remote_dep.c | 2 +- parsec/remote_dep_mpi.c | 6 +++--- .../dtd_test_broadcast_collective.c | 20 +++++++++---------- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index f71fbe21f..83e40ce57 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -1807,7 +1807,7 @@ parsec_dtd_release_local_task( parsec_dtd_task_t *this_task ) assert( current_flow == 0 ); tile->flushed = FLUSHED; parsec_dtd_tile_remove( tile->dc, tile->key ); - parsec_dtd_tile_release( tile ); + //parsec_dtd_tile_release( tile ); } } assert(this_task->super.super.super.obj_reference_count == 1); @@ -1861,7 +1861,7 @@ parsec_dtd_remote_task_release( parsec_dtd_task_t *this_task ) assert( current_flow == 0 ); tile->flushed = FLUSHED; parsec_dtd_tile_remove( tile->dc, tile->key ); - parsec_dtd_tile_release( tile ); + //parsec_dtd_tile_release( tile ); } } assert(this_task->super.super.super.obj_reference_count == 1); diff --git a/parsec/interfaces/superscalar/overlap_strategies.c b/parsec/interfaces/superscalar/overlap_strategies.c index a083a9c09..0e3162ca5 100644 --- a/parsec/interfaces/superscalar/overlap_strategies.c +++ b/parsec/interfaces/superscalar/overlap_strategies.c @@ -201,7 +201,7 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, populate_remote_deps(data_ptr, deps); //current_task->deps_out->output[0].data.data = // current_task->super.data[current_dep].data_out; - //(void)parsec_atomic_fetch_inc_int32(¤t_task->super.data[current_dep].data_out->readers); + 
(void)parsec_atomic_fetch_inc_int32(¤t_task->super.data[current_dep].data_out->readers); parsec_remote_dep_activate( es, (parsec_task_t *)current_task, deps, diff --git a/parsec/remote_dep.c b/parsec/remote_dep.c index bcaaa9146..b1df4e319 100644 --- a/parsec/remote_dep.c +++ b/parsec/remote_dep.c @@ -551,7 +551,7 @@ int parsec_remote_dep_activate(parsec_execution_stream_t* es, int* data_ptr = (int*)parsec_data_copy_get_ptr(this_dtd_task->super.data[0].data_out); this_dtd_task->super.locals[0].value = this_dtd_task->ht_item.key = ((1<<29) |(remote_deps->root << 20) | *(data_ptr+1+rank)); remote_deps->msg.locals[0].value = this_dtd_task->super.locals[0].value; - fprintf(stderr, "for remote_dep %p update key in activate to %d\n", remote_deps, this_dtd_task->super.locals[0].value); + //fprintf(stderr, "for remote_dep %p update key in activate to %d\n", remote_deps, this_dtd_task->super.locals[0].value); } } PARSEC_DEBUG_VERBOSE(20, parsec_comm_output_stream, "[%d:%d] task %s my_idx %d idx %d rank %d -- send (%x)", diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index cca8cd809..0101f7537 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -744,14 +744,14 @@ remote_dep_get_datatypes(parsec_execution_stream_t* es, uint64_t key = (uint64_t)origin->msg.locals[0].value<<32 | (1U<msg.locals[0].value, k, key); + //fprintf(stderr, "get datatype for key %d k %d %llu\n", origin->msg.locals[0].value, k, key); parsec_hash_table_lock_bucket(dtd_tp->task_hash_table, (parsec_key_t)key); dtd_task = parsec_dtd_find_task( dtd_tp, key ); if( NULL == dtd_task ) { return_defer = 1; - fprintf(stderr, "defer receive for key %d k %d %llu\n", origin->msg.locals[0].value, k, key); + //fprintf(stderr, "defer receive for key %d k %d %llu\n", origin->msg.locals[0].value, k, key); /* AM buffers are reused by the comm engine once the activation * has been conveyed to upper layer. 
In case of DTD we might receive msg to @@ -900,7 +900,7 @@ remote_dep_release_incoming(parsec_execution_stream_t* es, if( PARSEC_TASKPOOL_TYPE_DTD == origin->taskpool->taskpool_type ) { if(task.task_class->task_class_id == PARSEC_DTD_BCAST_KEY_TC_ID || task.task_class->task_class_id == PARSEC_DTD_BCAST_DATA_TC_ID) { - //remote_dep_inc_flying_messages(origin->taskpool); + remote_dep_inc_flying_messages(origin->taskpool); (void)parsec_atomic_fetch_inc_int32(&origin->pending_ack); (void)task.task_class->release_deps(es, &task, action_mask | PARSEC_ACTION_RELEASE_LOCAL_DEPS, diff --git a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c index c07c3fa3c..3a2566342 100644 --- a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c +++ b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c @@ -184,12 +184,12 @@ int test_broadcast_mixed( data_value = -10-myrank; } - parsec_tiled_matrix_dc_t *dcB; - dcB = create_and_distribute_data(myrank, world, nb_bcast, nt); - parsec_data_collection_set_key((parsec_data_collection_t *)dcB, "B"); + //parsec_tiled_matrix_dc_t *dcB; + //dcB = create_and_distribute_data(myrank, world, nb_bcast, nt); + //parsec_data_collection_set_key((parsec_data_collection_t *)dcB, "B"); - parsec_data_collection_t *B = (parsec_data_collection_t *)dcB; - parsec_dtd_data_collection_init(B); + //parsec_data_collection_t *B = (parsec_data_collection_t *)dcB; + //parsec_dtd_data_collection_init(B); parsec_tiled_matrix_dc_t *dcA; dcA = create_and_distribute_data(myrank, world, nb, nt); @@ -235,8 +235,8 @@ int test_broadcast_mixed( if(1) { key_root = key = A->data_key(A, root, 0); dtd_tile_root = PARSEC_DTD_TILE_OF_KEY(A, key_root); - key_root = B->data_key(B, root, 0); - bcast_keys_root = PARSEC_DTD_TILE_OF_KEY(B, key_root); + //key_root = B->data_key(B, root, 0); + //bcast_keys_root = PARSEC_DTD_TILE_OF_KEY(B, key_root); } // Create array of destination ranks @@ -291,7 
+291,7 @@ int test_broadcast_mixed( //} } -for(int iter=1; iter <= 1; iter++) { +for(int iter=1; iter <= 0; iter++) { int new_value = -1; key_root = key = A->data_key(A, root+iter*world, 0); @@ -381,9 +381,9 @@ for(int iter=1; iter <= 1; iter++) { parsec_type_free(&parsec_dtd_arenas_datatypes[TILE_BCAST].opaque_dtt); PARSEC_OBJ_RELEASE(parsec_dtd_arenas_datatypes[TILE_BCAST].arena); parsec_dtd_data_collection_fini( A ); - parsec_dtd_data_collection_fini( B ); + //parsec_dtd_data_collection_fini( B ); free_data(dcA); - free_data(dcB); + //free_data(dcB); parsec_taskpool_free( dtd_tp ); From 6dc8829773203fd6f9fc8221dadfebe97790daa4 Mon Sep 17 00:00:00 2001 From: yu-pei Date: Mon, 7 Mar 2022 00:37:26 -0500 Subject: [PATCH 32/41] cache multiple bcast keys for binomial in the locals --- parsec/remote_dep.c | 6 +++--- parsec/remote_dep_mpi.c | 10 ++++++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/parsec/remote_dep.c b/parsec/remote_dep.c index b1df4e319..5d0336877 100644 --- a/parsec/remote_dep.c +++ b/parsec/remote_dep.c @@ -549,9 +549,9 @@ int parsec_remote_dep_activate(parsec_execution_stream_t* es, parsec_dtd_task_t *this_dtd_task = (parsec_dtd_task_t *) task; if(this_dtd_task->super.task_class->task_class_id == PARSEC_DTD_BCAST_KEY_TC_ID) { int* data_ptr = (int*)parsec_data_copy_get_ptr(this_dtd_task->super.data[0].data_out); - this_dtd_task->super.locals[0].value = this_dtd_task->ht_item.key = ((1<<29) |(remote_deps->root << 20) | *(data_ptr+1+rank)); - remote_deps->msg.locals[0].value = this_dtd_task->super.locals[0].value; - //fprintf(stderr, "for remote_dep %p update key in activate to %d\n", remote_deps, this_dtd_task->super.locals[0].value); + this_dtd_task->super.locals[rank].value = this_dtd_task->ht_item.key = ((1<<29) |(remote_deps->root << 20) | *(data_ptr+1+rank)); + remote_deps->msg.locals[rank].value = this_dtd_task->super.locals[rank].value; + fprintf(stderr, "for remote_dep %p update key in activate to %d\n", remote_deps, 
this_dtd_task->super.locals[rank].value); } } PARSEC_DEBUG_VERBOSE(20, parsec_comm_output_stream, "[%d:%d] task %s my_idx %d idx %d rank %d -- send (%x)", diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index 0101f7537..9b53829e7 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -741,10 +741,16 @@ remote_dep_get_datatypes(parsec_execution_stream_t* es, for(k = 0; origin->msg.output_mask>>k; k++) { if(!(origin->msg.output_mask & (1U<msg.locals[0].value<<32 | (1U<msg.task_class_id == PARSEC_DTD_BCAST_KEY_TC_ID) { + key = (uint64_t)origin->msg.locals[es->virtual_process->parsec_context->my_rank].value<<32 | (1U<msg.locals[es->virtual_process->parsec_context->my_rank].value, k, key); + origin->msg.locals[0].value = origin->msg.locals[es->virtual_process->parsec_context->my_rank].value; + } else { + key = (uint64_t)origin->msg.locals[0].value<<32 | (1U<msg.locals[0].value, k, key); parsec_hash_table_lock_bucket(dtd_tp->task_hash_table, (parsec_key_t)key); dtd_task = parsec_dtd_find_task( dtd_tp, key ); From 13f4f5890d7261a6d07945d17b6df8f3b4faa86e Mon Sep 17 00:00:00 2001 From: yu-pei Date: Mon, 21 Mar 2022 13:10:16 -0400 Subject: [PATCH 33/41] fix p2p after bcast data issue --- .../interfaces/superscalar/insert_function.c | 23 +++++++++++++++++-- .../superscalar/overlap_strategies.c | 19 ++++++++++----- parsec/remote_dep.c | 8 ++++++- 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index 83e40ce57..bcba99f19 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -1474,11 +1474,17 @@ dtd_release_dep_fct( parsec_execution_stream_t *es, parsec_release_dep_fct_arg_t *arg = (parsec_release_dep_fct_arg_t *)param; parsec_dtd_task_t *current_task = (parsec_dtd_task_t *)newcontext; int32_t not_ready = 1; - + #if defined(DISTRIBUTED) if( dst_rank != src_rank && src_rank 
== oldcontext->taskpool->context->my_rank) { assert( 0 == (arg->action_mask & PARSEC_ACTION_RECV_INIT_REMOTE_DEPS) ); + /* TODO: check that the desc task is a read task, then it should skip here, now 4 is unmqr */ + if(PARSEC_DTD_BCAST_DATA_TC_ID == oldcontext->task_class->task_class_id) { + if(4 == current_task->super.task_class->task_class_id) { + return PARSEC_ITERATE_CONTINUE; + } + } if( arg->action_mask & PARSEC_ACTION_SEND_INIT_REMOTE_DEPS ) { if( parsec_dtd_not_sent_to_rank((parsec_dtd_task_t *)oldcontext, dep->belongs_to->flow_index, dst_rank) ) { @@ -1499,6 +1505,11 @@ dtd_release_dep_fct( parsec_execution_stream_t *es, arg->remote_deps->bcast_keys[dep->dep_datatype_index] = 0; arg->remote_deps->bcast_keys[dep->dep_datatype_index] |= src_rank<<18; arg->remote_deps->bcast_keys[dep->dep_datatype_index] |= (FLOW_OF(real_parent_task, dep->belongs_to->flow_index))->msg_keys[dst_rank]; + } else { + arg->remote_deps->bcast_keys[dep->dep_datatype_index] = 0; + arg->remote_deps->bcast_keys[dep->dep_datatype_index] |= src_rank<<18; + arg->remote_deps->bcast_keys[dep->dep_datatype_index] |= (FLOW_OF(real_parent_task, dep->belongs_to->flow_index))->msg_keys[dst_rank]; + arg->remote_deps->bcast_keys[15] = 1; /* a flag to indicate bcast_keys has been set */ } output = &arg->remote_deps->output[dep->dep_datatype_index]; assert( (-1 == arg->remote_deps->root) || (arg->remote_deps->root == src_rank) ); @@ -2755,7 +2766,15 @@ parsec_insert_dtd_task(parsec_task_t *__this_task) } } } else { - /* do nothing */ + /* For bcast data task, if we have a remote write descendant, generate the p2p key for send */ + if(last_writer.task->super.task_class->task_class_id == PARSEC_DTD_BCAST_DATA_TC_ID && parsec_dtd_task_is_remote(this_task) + && parsec_dtd_task_is_local(last_writer.task)) { + if(PARSEC_INOUT == (tile_op_type & PARSEC_GET_OP_TYPE)) { + FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank] = dtd_tp->send_task_id[this_task->rank]++; + 
last_writer.task->super.locals[5].value = FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank]; + fprintf(stderr, "BCAST_DATA %p last_writer.task on root %d with send ID to rank %d task %p as %d\n", last_writer.task, last_writer.task->rank, this_task->rank, this_task, last_writer.task->super.locals[5].value); + } + } } /* Are we using the same data multiple times for the same task? */ diff --git a/parsec/interfaces/superscalar/overlap_strategies.c b/parsec/interfaces/superscalar/overlap_strategies.c index 0e3162ca5..eb336d17f 100644 --- a/parsec/interfaces/superscalar/overlap_strategies.c +++ b/parsec/interfaces/superscalar/overlap_strategies.c @@ -146,7 +146,7 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, parsec_dtd_task_t *current_task = (parsec_dtd_task_t *)this_task; int current_dep; parsec_dtd_task_t *current_desc = NULL; - int op_type_on_current_flow, desc_op_type, desc_flow_index; + int op_type_on_current_flow, desc_op_type, desc_flow_index, cur_desc_op_type; parsec_dtd_tile_t *tile; parsec_dep_t deps; @@ -294,6 +294,7 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, data.displ = 0; desc_op_type = ((DESC_OF(current_task, current_dep))->op_type & PARSEC_GET_OP_TYPE); + cur_desc_op_type = ((DESC_OF(current_task, current_dep))->op_type & PARSEC_GET_OP_TYPE); desc_flow_index = (DESC_OF(current_task, current_dep))->flow_index; int get_out = 0, tmp_desc_flow_index, release_parent = 0; @@ -384,15 +385,19 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, rank_dst = current_desc->rank; - ontask( es, (parsec_task_t *)current_desc, (parsec_task_t *)current_task, - &deps, &data, rank_src, rank_dst, vpid_dst, ontask_arg ); + //if((PARSEC_DTD_BCAST_DATA_TC_ID != current_task->super.task_class->task_class_id) || (PARSEC_OUTPUT == ((DESC_OF(current_task, current_dep))->op_type & PARSEC_GET_OP_TYPE) || PARSEC_INOUT == ((DESC_OF(current_task, current_dep))->op_type & PARSEC_GET_OP_TYPE))) { + ontask( 
es, (parsec_task_t *)current_desc, (parsec_task_t *)current_task, + &deps, &data, rank_src, rank_dst, vpid_dst, ontask_arg ); + //} vpid_dst = (vpid_dst+1) % current_task->super.taskpool->context->nb_vp; #if defined(DISTRIBUTED) if( (action_mask & PARSEC_ACTION_COMPLETE_LOCAL_TASK) && (NULL != arg->remote_deps) ) { - (void)parsec_atomic_fetch_inc_int32(¤t_task->super.data[current_dep].data_out->readers); - parsec_remote_dep_activate(es, (parsec_task_t *)current_task, arg->remote_deps, arg->remote_deps->outgoing_mask); - arg->remote_deps = NULL; + //if((PARSEC_DTD_BCAST_DATA_TC_ID != current_task->super.task_class->task_class_id) || (PARSEC_OUTPUT == (cur_desc_op_type & PARSEC_GET_OP_TYPE) || PARSEC_INOUT == (cur_desc_op_type & PARSEC_GET_OP_TYPE))) { + (void)parsec_atomic_fetch_inc_int32(¤t_task->super.data[current_dep].data_out->readers); + parsec_remote_dep_activate(es, (parsec_task_t *)current_task, arg->remote_deps, arg->remote_deps->outgoing_mask); + arg->remote_deps = NULL; + //} } #endif @@ -407,6 +412,8 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, } } } + + cur_desc_op_type = desc_op_type; } while (0 == get_out); } diff --git a/parsec/remote_dep.c b/parsec/remote_dep.c index 5d0336877..31c40ab9a 100644 --- a/parsec/remote_dep.c +++ b/parsec/remote_dep.c @@ -538,7 +538,13 @@ int parsec_remote_dep_activate(parsec_execution_stream_t* es, remote_deps->msg.locals[0].value = remote_deps->bcast_keys[i]; /* p2p, update the key for this message */ remote_dep_bcast_child_permits = remote_dep_bcast_star_child(my_idx, idx); } else { - remote_dep_bcast_child_permits = remote_dep_bcast_child(my_idx, idx); + /* TODO: when the parent is BCAST_DATA tc, but we need to do a p2p to remote write */ + if(this_dtd_task->super.task_class->task_class_id == PARSEC_DTD_BCAST_DATA_TC_ID && remote_deps->bcast_keys[15]) { + remote_deps->msg.locals[0].value = remote_deps->bcast_keys[i]; /* p2p, update the key for this message */ + remote_dep_bcast_child_permits = 
remote_dep_bcast_star_child(my_idx, idx); + } else { + remote_dep_bcast_child_permits = remote_dep_bcast_child(my_idx, idx); + } } } else { remote_dep_bcast_child_permits = remote_dep_bcast_child(my_idx, idx); From 788b1bc0b0c363d9a7a95ab3f92b9d638a35235e Mon Sep 17 00:00:00 2001 From: yu-pei Date: Fri, 1 Apr 2022 10:07:39 -0400 Subject: [PATCH 34/41] reset bcast keys in remote_deps when freed --- parsec/interfaces/superscalar/collectives.c | 2 +- .../interfaces/superscalar/insert_function.c | 20 +++++++++++++------ .../superscalar/overlap_strategies.c | 2 +- .../superscalar/parsec_dtd_broadcast.c | 6 +++--- parsec/remote_dep.c | 3 +++ parsec/remote_dep_mpi.c | 11 ++++++---- 6 files changed, 29 insertions(+), 15 deletions(-) diff --git a/parsec/interfaces/superscalar/collectives.c b/parsec/interfaces/superscalar/collectives.c index 9ad9bf00d..76a3d6d7d 100644 --- a/parsec/interfaces/superscalar/collectives.c +++ b/parsec/interfaces/superscalar/collectives.c @@ -131,7 +131,7 @@ void parsec_dtd_broadcast( data_ptr[400+i+1] = dest_ranks[i]; } - fprintf(stderr, "on rank %d inserting key tile into bcast_keys_hash with key %ld num dest ranks %d\n", myrank, bcast_keys_root->ht_item.key, data_ptr[400]); + //fprintf(stderr, "on rank %d inserting key tile into bcast_keys_hash with key %ld num dest ranks %d\n", myrank, bcast_keys_root->ht_item.key, data_ptr[400]); parsec_hash_table_insert(parsec_bcast_keys_hash, &bcast_keys_root->ht_item); } diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index bcba99f19..7464e6038 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -792,7 +792,7 @@ parsec_dtd_untrack_task( parsec_dtd_taskpool_t *tp, parsec_hash_table_t *hash_table = tp->task_hash_table; void *value; - fprintf(stderr, "untracking task with key value %ld on rank %d\n", key, tp->super.context->my_rank); + //fprintf(stderr, "untracking task with key 
value %ld on rank %d\n", key, tp->super.context->my_rank); dtd_hash_table_pointer_item_t *item = (dtd_hash_table_pointer_item_t *)parsec_hash_table_nolock_find( hash_table, (parsec_key_t)key ); if( NULL == item ) return NULL; @@ -1479,9 +1479,9 @@ dtd_release_dep_fct( parsec_execution_stream_t *es, if( dst_rank != src_rank && src_rank == oldcontext->taskpool->context->my_rank) { assert( 0 == (arg->action_mask & PARSEC_ACTION_RECV_INIT_REMOTE_DEPS) ); - /* TODO: check that the desc task is a read task, then it should skip here, now 4 is unmqr */ + /* check that the desc task is a read task, then it should skip here */ if(PARSEC_DTD_BCAST_DATA_TC_ID == oldcontext->task_class->task_class_id) { - if(4 == current_task->super.task_class->task_class_id) { + if(PARSEC_INPUT == ((FLOW_OF(current_task, dep->dep_index))->op_type & PARSEC_GET_OP_TYPE)) { return PARSEC_ITERATE_CONTINUE; } } @@ -1666,7 +1666,7 @@ parsec_dtd_release_deps(parsec_execution_stream_t *es, if((action_mask & (1 << flow_index))) { if(!(track_flow & (1U << flow_index))) { uint64_t key = (((uint64_t)this_task->locals[0].value<<32) | (1U<locals[0].value, tp->super.context->my_rank); + //fprintf(stderr, "release_deps with key value %ld local 0 value %d on rank %d\n", key, this_task->locals[0].value, tp->super.context->my_rank); parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key); this_dtd_task = parsec_dtd_find_task( tp, key ); assert(this_dtd_task != NULL); @@ -2873,7 +2873,15 @@ parsec_insert_dtd_task(parsec_task_t *__this_task) } } } else { - /* do nothing */ + /* For bcast data task, if we have a remote write descendant, generate the p2p key for send */ + if(last_writer.task->super.task_class->task_class_id == PARSEC_DTD_BCAST_DATA_TC_ID && parsec_dtd_task_is_remote(this_task) + && parsec_dtd_task_is_local(last_writer.task)) { + if(PARSEC_INOUT == (tile_op_type & PARSEC_GET_OP_TYPE)) { + FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank] = 
dtd_tp->send_task_id[this_task->rank]++; + last_writer.task->super.locals[5].value = FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank]; + fprintf(stderr, "BCAST_DATA2222 %p last_writer.task on root %d with send ID to rank %d task %p as %d\n", last_writer.task, last_writer.task->rank, this_task->rank, this_task, last_writer.task->super.locals[5].value); + } + } } /* we can avoid all the hash table crap if the last_writer is not alive */ @@ -2963,7 +2971,7 @@ parsec_insert_dtd_task(parsec_task_t *__this_task) uint64_t key = ((uint64_t)(this_task->super.locals[0].value)<<32) | (1U<<0); parsec_hash_table_lock_bucket(dtd_tp->task_hash_table, (parsec_key_t)key); parsec_remote_deps_t *dep = parsec_dtd_find_remote_dep( dtd_tp, key ); - fprintf(stderr, "tracking remote task of key %d on rank %d\n", this_task->super.locals[0].value, dtd_tp->super.context->my_rank); + //fprintf(stderr, "tracking remote task of key %d on rank %d\n", this_task->super.locals[0].value, dtd_tp->super.context->my_rank); if( NULL == dep ) { if( !(flow->flags & TASK_INSERTED) ) { flow->flags |= TASK_INSERTED; diff --git a/parsec/interfaces/superscalar/overlap_strategies.c b/parsec/interfaces/superscalar/overlap_strategies.c index eb336d17f..df9388c9e 100644 --- a/parsec/interfaces/superscalar/overlap_strategies.c +++ b/parsec/interfaces/superscalar/overlap_strategies.c @@ -218,7 +218,7 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, parsec_dtd_tile_t* item = (parsec_dtd_tile_t *)parsec_hash_table_nolock_find( parsec_bcast_keys_hash, (parsec_key_t)((uintptr_t)current_task->super.locals[0].value)); int* data_ptr = (int*)item->data_copy->device_private; populate_remote_deps(data_ptr, deps); - fprintf(stderr, "bcast data continue on rank %d, from root %d, for task %p with item %p value0 %d\n", my_rank, root, current_task, item, data_ptr[0]); + //fprintf(stderr, "bcast data continue on rank %d, from root %d, for task %p with item %p value0 %d\n", my_rank, root, 
current_task, item, data_ptr[0]); assert(NULL != current_task->super.data[current_dep].data_out); diff --git a/parsec/interfaces/superscalar/parsec_dtd_broadcast.c b/parsec/interfaces/superscalar/parsec_dtd_broadcast.c index a64e71c5c..c9f356c7a 100644 --- a/parsec/interfaces/superscalar/parsec_dtd_broadcast.c +++ b/parsec/interfaces/superscalar/parsec_dtd_broadcast.c @@ -186,7 +186,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, bcast_keys->data_copy = PARSEC_OBJ_NEW(parsec_data_copy_t); bcast_keys->data_copy->device_private = (void *)buffer; parsec_hash_table_nolock_insert( parsec_bcast_keys_hash, &bcast_keys->ht_item ); - fprintf(stderr, "insert into parsec_bcast_keys_hash the item %p key %d with value pointer %p on rank %d\n", bcast_keys, data_ptr[0], buffer, es->virtual_process->parsec_context->my_rank); + //fprintf(stderr, "insert into parsec_bcast_keys_hash the item %p key %d with value pointer %p on rank %d\n", bcast_keys, data_ptr[0], buffer, es->virtual_process->parsec_context->my_rank); if(dtd_task != NULL) { parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key2); @@ -243,7 +243,7 @@ int parsec_dtd_bcast_key_fn( parsec_execution_stream_t *es, parsec_task_t *this_task) { (void)es; (void)this_task; - fprintf(stderr, "executed the body of bcast key fn\n"); + //fprintf(stderr, "executed the body of bcast key fn\n"); return PARSEC_HOOK_RETURN_DONE; } @@ -260,7 +260,7 @@ int parsec_dtd_bcast_data_fn( parsec_execution_stream_t *es, parsec_task_t *this_task) { (void)es; (void)this_task; - fprintf(stderr, "executed the body of bcast data fn\n"); + //fprintf(stderr, "executed the body of bcast data fn\n"); return PARSEC_HOOK_RETURN_DONE; } diff --git a/parsec/remote_dep.c b/parsec/remote_dep.c index 31c40ab9a..61a9aa4c5 100644 --- a/parsec/remote_dep.c +++ b/parsec/remote_dep.c @@ -212,6 +212,9 @@ inline void remote_deps_free(parsec_remote_deps_t* deps) memset( &deps->msg, 0, sizeof(remote_dep_wire_activate_t) ); #endif 
deps->taskpool = NULL; + //memset( &deps->msg, 0, sizeof(remote_dep_wire_activate_t) ); + deps->bcast_flag = 0; /* default this dep is not for bcast */ + memset(deps->bcast_keys, 0, sizeof(uint32_t)*16); parsec_lifo_push(deps->origin, (parsec_list_item_t*)deps); PARSEC_VALGRIND_MEMPOOL_FREE(deps->origin, ((unsigned char *)deps)+sizeof(parsec_list_item_t)); } diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index 9b53829e7..5c386e92c 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -957,10 +957,13 @@ remote_dep_release_incoming(parsec_execution_stream_t* es, if(PARSEC_TASKPOOL_TYPE_PTG == origin->taskpool->taskpool_type) { remote_dep_complete_and_cleanup(&origin, 1); } else { - remote_dep_complete_and_cleanup(&origin, 1); - //remote_deps_free(origin); - //remote_dep_dec_flying_messages(task.taskpool); - + /* if it is bcast keys or bcast data */ + if(origin->msg.task_class_id == 1 || origin->msg.task_class_id == 2) { + remote_dep_complete_and_cleanup(&origin, 1); + } else { + origin->outgoing_mask = 0; + remote_deps_free(origin); + } } #else remote_deps_free(origin); From 0e4fc3d11c4c6464003b65fdc94983c00e98faf5 Mon Sep 17 00:00:00 2001 From: yu-pei Date: Fri, 1 Apr 2022 13:04:39 -0400 Subject: [PATCH 35/41] remove printf for perf testing --- parsec/interfaces/superscalar/insert_function.c | 4 ++-- parsec/interfaces/superscalar/overlap_strategies.c | 2 +- parsec/interfaces/superscalar/parsec_dtd_data_flush.c | 2 +- parsec/remote_dep.c | 2 +- parsec/remote_dep_mpi.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index 7464e6038..e94a320e0 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -2772,7 +2772,7 @@ parsec_insert_dtd_task(parsec_task_t *__this_task) if(PARSEC_INOUT == (tile_op_type & PARSEC_GET_OP_TYPE)) { FLOW_OF(last_writer.task, 
last_writer.flow_index)->msg_keys[this_task->rank] = dtd_tp->send_task_id[this_task->rank]++; last_writer.task->super.locals[5].value = FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank]; - fprintf(stderr, "BCAST_DATA %p last_writer.task on root %d with send ID to rank %d task %p as %d\n", last_writer.task, last_writer.task->rank, this_task->rank, this_task, last_writer.task->super.locals[5].value); + //fprintf(stderr, "BCAST_DATA %p last_writer.task on root %d with send ID to rank %d task %p as %d\n", last_writer.task, last_writer.task->rank, this_task->rank, this_task, last_writer.task->super.locals[5].value); } } } @@ -2879,7 +2879,7 @@ parsec_insert_dtd_task(parsec_task_t *__this_task) if(PARSEC_INOUT == (tile_op_type & PARSEC_GET_OP_TYPE)) { FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank] = dtd_tp->send_task_id[this_task->rank]++; last_writer.task->super.locals[5].value = FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank]; - fprintf(stderr, "BCAST_DATA2222 %p last_writer.task on root %d with send ID to rank %d task %p as %d\n", last_writer.task, last_writer.task->rank, this_task->rank, this_task, last_writer.task->super.locals[5].value); + //fprintf(stderr, "BCAST_DATA2222 %p last_writer.task on root %d with send ID to rank %d task %p as %d\n", last_writer.task, last_writer.task->rank, this_task->rank, this_task, last_writer.task->super.locals[5].value); } } } diff --git a/parsec/interfaces/superscalar/overlap_strategies.c b/parsec/interfaces/superscalar/overlap_strategies.c index df9388c9e..0dbfea2ed 100644 --- a/parsec/interfaces/superscalar/overlap_strategies.c +++ b/parsec/interfaces/superscalar/overlap_strategies.c @@ -195,7 +195,7 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, while(tile == NULL){ tile = (parsec_dtd_tile_t *)parsec_hash_table_nolock_find(parsec_bcast_keys_hash, key); //fprintf(stderr, "bcast root task %p data with global key %d tile %p on 
rank %d\n", current_task, current_task->ht_item.key, tile, current_task->super.taskpool->context->my_rank); - fprintf(stderr, "bcast root task %p data with global key %ld tile %p on rank %d\n", current_task, key, tile, current_task->super.taskpool->context->my_rank); + //fprintf(stderr, "bcast root task %p data with global key %ld tile %p on rank %d\n", current_task, key, tile, current_task->super.taskpool->context->my_rank); } int* data_ptr = (int*)parsec_data_copy_get_ptr(tile->data_copy); populate_remote_deps(data_ptr, deps); diff --git a/parsec/interfaces/superscalar/parsec_dtd_data_flush.c b/parsec/interfaces/superscalar/parsec_dtd_data_flush.c index a6a53201b..b613ae062 100644 --- a/parsec/interfaces/superscalar/parsec_dtd_data_flush.c +++ b/parsec/interfaces/superscalar/parsec_dtd_data_flush.c @@ -40,7 +40,7 @@ parsec_dtd_data_flush_sndrcv(parsec_execution_stream_t *es, assert(tile != NULL); - fprintf(stderr, "completed data flush task on rank %d\n", current_task->rank); + //fprintf(stderr, "completed data flush task on rank %d\n", current_task->rank); #if defined(DISTRIBUTED) if(tile->rank == current_task->rank) { /* this is a receive task*/ if( current_task->super.data[0].data_in != tile->data_copy ) { diff --git a/parsec/remote_dep.c b/parsec/remote_dep.c index 61a9aa4c5..5f7f3952f 100644 --- a/parsec/remote_dep.c +++ b/parsec/remote_dep.c @@ -560,7 +560,7 @@ int parsec_remote_dep_activate(parsec_execution_stream_t* es, int* data_ptr = (int*)parsec_data_copy_get_ptr(this_dtd_task->super.data[0].data_out); this_dtd_task->super.locals[rank].value = this_dtd_task->ht_item.key = ((1<<29) |(remote_deps->root << 20) | *(data_ptr+1+rank)); remote_deps->msg.locals[rank].value = this_dtd_task->super.locals[rank].value; - fprintf(stderr, "for remote_dep %p update key in activate to %d\n", remote_deps, this_dtd_task->super.locals[rank].value); + //fprintf(stderr, "for remote_dep %p update key in activate to %d\n", remote_deps, 
this_dtd_task->super.locals[rank].value); } } PARSEC_DEBUG_VERBOSE(20, parsec_comm_output_stream, "[%d:%d] task %s my_idx %d idx %d rank %d -- send (%x)", diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index 5c386e92c..bf72f0305 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -744,7 +744,7 @@ remote_dep_get_datatypes(parsec_execution_stream_t* es, uint64_t key = 0; if(origin->msg.task_class_id == PARSEC_DTD_BCAST_KEY_TC_ID) { key = (uint64_t)origin->msg.locals[es->virtual_process->parsec_context->my_rank].value<<32 | (1U<msg.locals[es->virtual_process->parsec_context->my_rank].value, k, key); + //fprintf(stderr, "get datatype for key %d k %d %llu\n", origin->msg.locals[es->virtual_process->parsec_context->my_rank].value, k, key); origin->msg.locals[0].value = origin->msg.locals[es->virtual_process->parsec_context->my_rank].value; } else { key = (uint64_t)origin->msg.locals[0].value<<32 | (1U< Date: Fri, 1 Apr 2022 16:16:46 -0400 Subject: [PATCH 36/41] pack the local keys into locals --- parsec/remote_dep.c | 9 ++++++--- parsec/remote_dep_mpi.c | 11 ++++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/parsec/remote_dep.c b/parsec/remote_dep.c index 5f7f3952f..b476b6c9e 100644 --- a/parsec/remote_dep.c +++ b/parsec/remote_dep.c @@ -452,7 +452,7 @@ int parsec_remote_dep_activate(parsec_execution_stream_t* es, uint32_t propagation_mask) { const parsec_task_class_t* tc = task->task_class; - int i, my_idx, idx, current_mask, keeper = 0; + int i, my_idx, idx, current_mask, keeper = 0, child_count = 0; unsigned int array_index, count, bit_index; struct remote_dep_output_param_s* output; @@ -558,8 +558,11 @@ int parsec_remote_dep_activate(parsec_execution_stream_t* es, parsec_dtd_task_t *this_dtd_task = (parsec_dtd_task_t *) task; if(this_dtd_task->super.task_class->task_class_id == PARSEC_DTD_BCAST_KEY_TC_ID) { int* data_ptr = (int*)parsec_data_copy_get_ptr(this_dtd_task->super.data[0].data_out); - 
this_dtd_task->super.locals[rank].value = this_dtd_task->ht_item.key = ((1<<29) |(remote_deps->root << 20) | *(data_ptr+1+rank)); - remote_deps->msg.locals[rank].value = this_dtd_task->super.locals[rank].value; + this_dtd_task->super.locals[4+child_count*2].value = rank; + this_dtd_task->super.locals[4+child_count*2+1].value = this_dtd_task->ht_item.key = ((1<<29) |(remote_deps->root << 20) | *(data_ptr+1+rank)); + remote_deps->msg.locals[4+child_count*2].value = this_dtd_task->super.locals[4+child_count*2].value; + remote_deps->msg.locals[4+child_count*2+1].value = this_dtd_task->super.locals[4+child_count*2+1].value; + child_count += 1; //fprintf(stderr, "for remote_dep %p update key in activate to %d\n", remote_deps, this_dtd_task->super.locals[rank].value); } } diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index bf72f0305..23a62d5b5 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -743,9 +743,14 @@ remote_dep_get_datatypes(parsec_execution_stream_t* es, uint64_t key = 0; if(origin->msg.task_class_id == PARSEC_DTD_BCAST_KEY_TC_ID) { - key = (uint64_t)origin->msg.locals[es->virtual_process->parsec_context->my_rank].value<<32 | (1U<msg.locals[es->virtual_process->parsec_context->my_rank].value, k, key); - origin->msg.locals[0].value = origin->msg.locals[es->virtual_process->parsec_context->my_rank].value; + for(int idx = 4; idx < 15; idx += 2) { + if(origin->msg.locals[idx].value == es->virtual_process->parsec_context->my_rank) { + key = (uint64_t)origin->msg.locals[idx+1].value<<32 | (1U<msg.locals[es->virtual_process->parsec_context->my_rank].value, k, key); + origin->msg.locals[0].value = origin->msg.locals[idx+1].value; + break; + } + } } else { key = (uint64_t)origin->msg.locals[0].value<<32 | (1U< Date: Mon, 11 Apr 2022 10:45:16 -0400 Subject: [PATCH 37/41] updates to prevent multiple bcast forwarding, and seems like a misuse of hashtable insert? 
--- parsec/interfaces/superscalar/collectives.c | 7 ++- .../interfaces/superscalar/insert_function.c | 2 +- .../superscalar/overlap_strategies.c | 57 ++++++++++++++++--- .../superscalar/parsec_dtd_broadcast.c | 13 +++-- 4 files changed, 61 insertions(+), 18 deletions(-) diff --git a/parsec/interfaces/superscalar/collectives.c b/parsec/interfaces/superscalar/collectives.c index 76a3d6d7d..ae0597c1c 100644 --- a/parsec/interfaces/superscalar/collectives.c +++ b/parsec/interfaces/superscalar/collectives.c @@ -119,7 +119,6 @@ void parsec_dtd_broadcast( bcast_id = ( (1<<27) | (root << 18) | dtd_tp->bcast_id); dtd_tp->bcast_id++; - bcast_keys_root->ht_item.key = ((uintptr_t)bcast_id)<<32; parsec_data_copy = bcast_keys_root->data_copy; data_ptr = (int*)parsec_data_copy_get_ptr(parsec_data_copy); @@ -130,9 +129,11 @@ void parsec_dtd_broadcast( //pack the ranks at the end of the tiles as well data_ptr[400+i+1] = dest_ranks[i]; } + bcast_keys_root->ht_item.key = (parsec_key_t)((uintptr_t)data_ptr[0]); //fprintf(stderr, "on rank %d inserting key tile into bcast_keys_hash with key %ld num dest ranks %d\n", myrank, bcast_keys_root->ht_item.key, data_ptr[400]); parsec_hash_table_insert(parsec_bcast_keys_hash, &bcast_keys_root->ht_item); + parsec_mfence(); /* Write */ } // Retrieve DTD tile's data_copy @@ -185,10 +186,10 @@ void parsec_dtd_broadcast( dtd_bcast_key_root->super.locals[0].value = dtd_bcast_key_root->ht_item.key; } /* Post the bcast of keys and ranks array */ - parsec_insert_dtd_task(dtd_bcast_key_root); - /* Post the bcast tasks for the actual data */ parsec_insert_dtd_task(dtd_bcast_task_root); + //sleep(1); + parsec_insert_dtd_task(dtd_bcast_key_root); } #endif diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index e94a320e0..28a109eec 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -766,6 +766,7 @@ parsec_dtd_track_task( 
parsec_dtd_taskpool_t *tp, { dtd_hash_table_pointer_item_t *item = (dtd_hash_table_pointer_item_t *)parsec_thread_mempool_allocate(tp->hash_table_bucket_mempool->thread_mempools); + //fprintf(stderr, "tracking task with key value %ld on rank %d\n", key, tp->super.context->my_rank); parsec_hash_table_t *hash_table = tp->task_hash_table; item->ht_item.key = (parsec_key_t)key; @@ -1851,7 +1852,6 @@ void parsec_dtd_remote_task_release( parsec_dtd_task_t *this_task ) { parsec_object_t *object = (parsec_object_t *)this_task; - //parsec_atomic_fetch_inc_int32( &object->obj_reference_count); assert(object->obj_reference_count > 1); if( 2 == parsec_atomic_fetch_dec_int32( &object->obj_reference_count ) ){ int current_flow; diff --git a/parsec/interfaces/superscalar/overlap_strategies.c b/parsec/interfaces/superscalar/overlap_strategies.c index 0dbfea2ed..c4dd1d05e 100644 --- a/parsec/interfaces/superscalar/overlap_strategies.c +++ b/parsec/interfaces/superscalar/overlap_strategies.c @@ -20,6 +20,13 @@ #include "parsec/interfaces/superscalar/insert_function_internal.h" #include "parsec/utils/debug.h" +#define MIN(x, y) ( (x)<(y)?(x):(y) ) +static inline unsigned long exponential_backoff(uint64_t k) +{ + unsigned int n = MIN( 64, k ); + unsigned int r = (unsigned int) ((double)n * ((double)rand()/(double)RAND_MAX)); + return r * 5410; +} /***************************************************************************//** * * This function makes sure that nextinline descendant is really NULL @@ -174,7 +181,9 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, * propagate the data down to descendants as well */ //if(current_task->deps_out != NULL) { /* we have not propagate the remote deps yet, otherwise will be set to NULL */ - if(action_mask & PARSEC_ACTION_COMPLETE_LOCAL_TASK) { + if(action_mask & PARSEC_ACTION_COMPLETE_LOCAL_TASK) { + if (parsec_dtd_task_is_local(current_task)) { + if(current_task->super.locals[3].value != 10086) { parsec_remote_deps_t *deps = 
NULL; PARSEC_ALLOCATE_REMOTE_DEPS_IF_NULL(deps, this_task, MAX_PARAM_COUNT); deps->root = rank_src; @@ -191,11 +200,23 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, assert(NULL != current_task->super.data[current_dep].data_out); parsec_dtd_tile_t *tile = NULL; - parsec_key_t key = ((uintptr_t)current_task->super.locals[0].value)<<32; + parsec_key_t key = (parsec_key_t)((uintptr_t)current_task->super.locals[0].value); + int count = 1; + struct timespec rqtp; + rqtp.tv_sec = 0; + while(tile == NULL){ - tile = (parsec_dtd_tile_t *)parsec_hash_table_nolock_find(parsec_bcast_keys_hash, key); - //fprintf(stderr, "bcast root task %p data with global key %d tile %p on rank %d\n", current_task, current_task->ht_item.key, tile, current_task->super.taskpool->context->my_rank); - //fprintf(stderr, "bcast root task %p data with global key %ld tile %p on rank %d\n", current_task, key, tile, current_task->super.taskpool->context->my_rank); + count += 1; + tile = (parsec_dtd_tile_t *)parsec_hash_table_nolock_find(parsec_bcast_keys_hash, key); + //if(count %1000 == 0)fprintf(stderr, "bcast root task %p data with global key %d tile %p on rank %d\n", current_task, current_task->ht_item.key, tile, current_task->super.taskpool->context->my_rank); + //sleep(1); + if(count == 100) { + rqtp.tv_nsec = exponential_backoff(count); + nanosleep(&rqtp, NULL); + count = 0; + fprintf(stderr, "bcast root task %p data with global key %ld tile %p on rank %d\n", current_task, key, tile, current_task->super.taskpool->context->my_rank); + sleep(1); + } } int* data_ptr = (int*)parsec_data_copy_get_ptr(tile->data_copy); populate_remote_deps(data_ptr, deps); @@ -207,18 +228,36 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, deps, deps->outgoing_mask); //current_task->deps_out = NULL; + current_task->super.locals[3].value = 10086; + } + } } else if(action_mask & PARSEC_ACTION_RELEASE_LOCAL_DEPS) { /* current node is part of the broadcast operation, propagate downstream 
*/ //int root = current_task->deps_out->root; + if(current_task->super.locals[3].value != 10086) { parsec_release_dep_fct_arg_t* arg = (parsec_release_dep_fct_arg_t*)ontask_arg; parsec_remote_deps_t* deps = arg->remote_deps; int root = deps->root; int my_rank = current_task->super.taskpool->context->my_rank; - - parsec_dtd_tile_t* item = (parsec_dtd_tile_t *)parsec_hash_table_nolock_find( parsec_bcast_keys_hash, (parsec_key_t)((uintptr_t)current_task->super.locals[0].value)); + parsec_dtd_tile_t* item = NULL; + int count = 1; + struct timespec rqtp; + rqtp.tv_sec = 0; + + while(item == NULL) { + count += 1; + item = (parsec_dtd_tile_t *)parsec_hash_table_nolock_find( parsec_bcast_keys_hash, (parsec_key_t)((uintptr_t)current_task->super.locals[0].value)); + if(count == 100){ + fprintf(stderr, "bcast data continue on rank %d, from root %d, for task %p with key %d \n", my_rank, root, current_task, current_task->super.locals[0].value); + sleep(1); + rqtp.tv_nsec = exponential_backoff(count); + nanosleep(&rqtp, NULL); + count = 0; + } + } int* data_ptr = (int*)item->data_copy->device_private; populate_remote_deps(data_ptr, deps); - //fprintf(stderr, "bcast data continue on rank %d, from root %d, for task %p with item %p value0 %d\n", my_rank, root, current_task, item, data_ptr[0]); + parsec_hash_table_nolock_remove( parsec_bcast_keys_hash, (parsec_key_t)((uintptr_t)current_task->super.locals[0].value)); assert(NULL != current_task->super.data[current_dep].data_out); @@ -230,6 +269,8 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, deps, deps->outgoing_mask); //current_task->deps_out = NULL; + current_task->super.locals[3].value = 10086; + } } //} } /* BCAST DATA propagation */ diff --git a/parsec/interfaces/superscalar/parsec_dtd_broadcast.c b/parsec/interfaces/superscalar/parsec_dtd_broadcast.c index c9f356c7a..ff67e646b 100644 --- a/parsec/interfaces/superscalar/parsec_dtd_broadcast.c +++ b/parsec/interfaces/superscalar/parsec_dtd_broadcast.c @@ -146,6 
+146,8 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, /* We are part of the broadcast, forward message */ int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); + int* buffer = malloc(sizeof(int)*30*30); + memcpy(buffer, data_ptr, sizeof(int)*30*30); populate_remote_deps(data_ptr, deps); //successor = get_chain_successor(es, (parsec_task_t*)current_task, current_task->deps_out); if(successor == -1) { @@ -171,22 +173,21 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, dtd_task = parsec_dtd_find_task(tp, key); // store the meta data info into the key hash table - int* buffer = malloc(sizeof(int)*30*30); - memcpy(buffer, data_ptr, sizeof(int)*30*30); dtd_hash_table_pointer_item_t *item = (dtd_hash_table_pointer_item_t *)parsec_thread_mempool_allocate(tp->hash_table_bucket_mempool->thread_mempools); parsec_hash_table_t *hash_table = tp->keys_hash_table; item->ht_item.key = (parsec_key_t)key; item->mempool_owner = tp->hash_table_bucket_mempool->thread_mempools; item->value = (void *)buffer; - parsec_hash_table_nolock_insert( hash_table, &item->ht_item ); + parsec_hash_table_insert( hash_table, &item->ht_item ); //parsec_dtd_tile_t* bcast_keys = (parsec_dtd_tile_t *) parsec_thread_mempool_allocate( parsec_bcast_keys_tile_mempool->thread_mempools ); parsec_dtd_tile_t* bcast_keys = (parsec_dtd_tile_t *)malloc(sizeof(parsec_dtd_tile_t)); - bcast_keys->ht_item.key = (parsec_key_t)((uintptr_t)data_ptr[0]); + bcast_keys->ht_item.key = (parsec_key_t)((uintptr_t)buffer[0]); bcast_keys->data_copy = PARSEC_OBJ_NEW(parsec_data_copy_t); bcast_keys->data_copy->device_private = (void *)buffer; - parsec_hash_table_nolock_insert( parsec_bcast_keys_hash, &bcast_keys->ht_item ); - //fprintf(stderr, "insert into parsec_bcast_keys_hash the item %p key %d with value pointer %p on rank %d\n", bcast_keys, data_ptr[0], buffer, es->virtual_process->parsec_context->my_rank); + parsec_hash_table_insert( 
parsec_bcast_keys_hash, &bcast_keys->ht_item ); + parsec_mfence(); /* Write */ + //fprintf(stderr, "insert into parsec_bcast_keys_hash the item %p key %d with value pointer %p on rank %d\n", bcast_keys, buffer[0], buffer, es->virtual_process->parsec_context->my_rank); if(dtd_task != NULL) { parsec_hash_table_lock_bucket(tp->task_hash_table, (parsec_key_t)key2); From f0bd339e985ab079e741d6f1b8390e5209efdde4 Mon Sep 17 00:00:00 2001 From: Yu Pei Date: Fri, 22 Apr 2022 20:52:48 +0300 Subject: [PATCH 38/41] some internal param changes to run on 512 nodes of Shaheen --- parsec/interfaces/superscalar/collectives.c | 10 +++++----- parsec/interfaces/superscalar/overlap_strategies.c | 2 +- parsec/interfaces/superscalar/parsec_dtd_broadcast.c | 10 +++++----- parsec/remote_dep.c | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/parsec/interfaces/superscalar/collectives.c b/parsec/interfaces/superscalar/collectives.c index ae0597c1c..1274695a6 100644 --- a/parsec/interfaces/superscalar/collectives.c +++ b/parsec/interfaces/superscalar/collectives.c @@ -116,18 +116,18 @@ void parsec_dtd_broadcast( bcast_keys_root->data_copy = new_data_copy; if(myrank == root) { - bcast_id = ( (1<<27) | (root << 18) | dtd_tp->bcast_id); + bcast_id = ( (1<<30) | (root << 18) | dtd_tp->bcast_id); dtd_tp->bcast_id++; parsec_data_copy = bcast_keys_root->data_copy; data_ptr = (int*)parsec_data_copy_get_ptr(parsec_data_copy); data_ptr[0] = bcast_id; - data_ptr[400] = num_dest_ranks; + data_ptr[600] = num_dest_ranks; for(int i = 0; i < num_dest_ranks; i++) { data_ptr[dest_ranks[i]+1] = dtd_tp->send_task_id[dest_ranks[i]]++; //pack the ranks at the end of the tiles as well - data_ptr[400+i+1] = dest_ranks[i]; + data_ptr[600+i+1] = dest_ranks[i]; } bcast_keys_root->ht_item.key = (parsec_key_t)((uintptr_t)data_ptr[0]); @@ -165,7 +165,7 @@ void parsec_dtd_broadcast( dtd_bcast_task_root->ht_item.key = bcast_id; dtd_bcast_task_root->super.locals[0].value = 
dtd_bcast_task_root->ht_item.key; }else{ - bcast_id = ( (1<<28) | (root << 20) | dtd_tp->recv_task_id[root]++); + bcast_id = ( (1<<28) | (root << 18) | dtd_tp->recv_task_id[root]++); dtd_bcast_task_root->ht_item.key = bcast_id; dtd_bcast_task_root->super.locals[0].value = dtd_bcast_task_root->ht_item.key; } @@ -181,7 +181,7 @@ void parsec_dtd_broadcast( if(myrank == root) { /* nothing here since the key is stored in the key array and will be updated before remote_dep_activate */ }else{ - bcast_id = ( (1<<29) | (root << 20) | (dtd_tp->recv_task_id[root] -1)); + bcast_id = ( (1<<29) | (root << 18) | (dtd_tp->recv_task_id[root] -1)); dtd_bcast_key_root->ht_item.key = bcast_id; dtd_bcast_key_root->super.locals[0].value = dtd_bcast_key_root->ht_item.key; } diff --git a/parsec/interfaces/superscalar/overlap_strategies.c b/parsec/interfaces/superscalar/overlap_strategies.c index c4dd1d05e..650ddeb3c 100644 --- a/parsec/interfaces/superscalar/overlap_strategies.c +++ b/parsec/interfaces/superscalar/overlap_strategies.c @@ -257,7 +257,7 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, } int* data_ptr = (int*)item->data_copy->device_private; populate_remote_deps(data_ptr, deps); - parsec_hash_table_nolock_remove( parsec_bcast_keys_hash, (parsec_key_t)((uintptr_t)current_task->super.locals[0].value)); + //parsec_hash_table_nolock_remove( parsec_bcast_keys_hash, (parsec_key_t)((uintptr_t)current_task->super.locals[0].value)); assert(NULL != current_task->super.data[current_dep].data_out); diff --git a/parsec/interfaces/superscalar/parsec_dtd_broadcast.c b/parsec/interfaces/superscalar/parsec_dtd_broadcast.c index ff67e646b..2b6f15bd5 100644 --- a/parsec/interfaces/superscalar/parsec_dtd_broadcast.c +++ b/parsec/interfaces/superscalar/parsec_dtd_broadcast.c @@ -17,9 +17,9 @@ populate_remote_deps(int* data_ptr, parsec_remote_deps_t* remote_deps) int _array_pos, _array_mask; uint32_t dest_rank_idx; /* TODO: don't assume the length of data_ptr */ - int 
num_dest_ranks = data_ptr[400]; + int num_dest_ranks = data_ptr[600]; for(dest_rank_idx = 0; dest_rank_idx < (uint32_t)num_dest_ranks; ++dest_rank_idx) { - uint32_t dest_rank = data_ptr[400+dest_rank_idx+1]; + uint32_t dest_rank = data_ptr[600+dest_rank_idx+1]; _array_pos = dest_rank / (8 * sizeof(uint32_t)); _array_mask = 1 << (dest_rank % (8 * sizeof(uint32_t))); @@ -146,8 +146,8 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, /* We are part of the broadcast, forward message */ int* data_ptr = (int*)parsec_data_copy_get_ptr(current_task->super.data[0].data_out); - int* buffer = malloc(sizeof(int)*30*30); - memcpy(buffer, data_ptr, sizeof(int)*30*30); + int* buffer = malloc(sizeof(int)*50*50); + memcpy(buffer, data_ptr, sizeof(int)*50*50); populate_remote_deps(data_ptr, deps); //successor = get_chain_successor(es, (parsec_task_t*)current_task, current_task->deps_out); if(successor == -1) { @@ -164,7 +164,7 @@ parsec_dtd_bcast_key_iterate_successors(parsec_execution_stream_t *es, deps->outgoing_mask); //current_task->deps_out = NULL; /* update the BCAST DATA task or dep with the global ID that we know now */ - uint64_t key = ((uint64_t)(1<<28 | (root << 20 ) | data_ptr[es->virtual_process->parsec_context->my_rank+1])<<32) | (1U<<0); + uint64_t key = ((uint64_t)(1<<28 | (root << 18 ) | data_ptr[es->virtual_process->parsec_context->my_rank+1])<<32) | (1U<<0); uint64_t key2 = ((uint64_t)(data_ptr[0])<<32) | (1U<<0); parsec_dtd_task_t* dtd_task = NULL; diff --git a/parsec/remote_dep.c b/parsec/remote_dep.c index b476b6c9e..a404e4a10 100644 --- a/parsec/remote_dep.c +++ b/parsec/remote_dep.c @@ -559,7 +559,7 @@ int parsec_remote_dep_activate(parsec_execution_stream_t* es, if(this_dtd_task->super.task_class->task_class_id == PARSEC_DTD_BCAST_KEY_TC_ID) { int* data_ptr = (int*)parsec_data_copy_get_ptr(this_dtd_task->super.data[0].data_out); this_dtd_task->super.locals[4+child_count*2].value = rank; - 
this_dtd_task->super.locals[4+child_count*2+1].value = this_dtd_task->ht_item.key = ((1<<29) |(remote_deps->root << 20) | *(data_ptr+1+rank)); + this_dtd_task->super.locals[4+child_count*2+1].value = this_dtd_task->ht_item.key = ((1<<29) |(remote_deps->root << 18) | *(data_ptr+1+rank)); remote_deps->msg.locals[4+child_count*2].value = this_dtd_task->super.locals[4+child_count*2].value; remote_deps->msg.locals[4+child_count*2+1].value = this_dtd_task->super.locals[4+child_count*2+1].value; child_count += 1; From 0615a414b44eab9f58c0e80805e8a4f9f919dcc7 Mon Sep 17 00:00:00 2001 From: yu-pei Date: Mon, 25 Apr 2022 13:27:06 -0400 Subject: [PATCH 39/41] fix other cases where nolock find was used --- parsec/interfaces/superscalar/insert_function.c | 2 +- parsec/interfaces/superscalar/overlap_strategies.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/parsec/interfaces/superscalar/insert_function.c b/parsec/interfaces/superscalar/insert_function.c index 28a109eec..4f1abec46 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -2344,7 +2344,7 @@ parsec_dtd_set_descendant(parsec_dtd_task_t *parent_task, uint8_t parent_flow_in parsec_remote_deps_t *dep = parsec_dtd_find_remote_dep( tp, key ); if(real_parent_task->super.task_class->task_class_id == PARSEC_DTD_BCAST_DATA_TC_ID) { parsec_hash_table_t *hash_table = tp->keys_hash_table; - dtd_hash_table_pointer_item_t *item = (dtd_hash_table_pointer_item_t *)parsec_hash_table_nolock_find( hash_table, (parsec_key_t)key ); + dtd_hash_table_pointer_item_t *item = (dtd_hash_table_pointer_item_t *)parsec_hash_table_find( hash_table, (parsec_key_t)key ); if(item) { int* data_ptr = (int*)item->value; parsec_dtd_untrack_task(tp, key); diff --git a/parsec/interfaces/superscalar/overlap_strategies.c b/parsec/interfaces/superscalar/overlap_strategies.c index 650ddeb3c..968f9da4b 100644 --- a/parsec/interfaces/superscalar/overlap_strategies.c +++ 
b/parsec/interfaces/superscalar/overlap_strategies.c @@ -207,7 +207,7 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, while(tile == NULL){ count += 1; - tile = (parsec_dtd_tile_t *)parsec_hash_table_nolock_find(parsec_bcast_keys_hash, key); + tile = (parsec_dtd_tile_t *)parsec_hash_table_find(parsec_bcast_keys_hash, key); //if(count %1000 == 0)fprintf(stderr, "bcast root task %p data with global key %d tile %p on rank %d\n", current_task, current_task->ht_item.key, tile, current_task->super.taskpool->context->my_rank); //sleep(1); if(count == 100) { @@ -246,7 +246,7 @@ parsec_dtd_ordering_correctly( parsec_execution_stream_t *es, while(item == NULL) { count += 1; - item = (parsec_dtd_tile_t *)parsec_hash_table_nolock_find( parsec_bcast_keys_hash, (parsec_key_t)((uintptr_t)current_task->super.locals[0].value)); + item = (parsec_dtd_tile_t *)parsec_hash_table_find( parsec_bcast_keys_hash, (parsec_key_t)((uintptr_t)current_task->super.locals[0].value)); if(count == 100){ fprintf(stderr, "bcast data continue on rank %d, from root %d, for task %p with key %d \n", my_rank, root, current_task, current_task->super.locals[0].value); sleep(1); From 27cfc44462d7b7f604cd2bfb84561d05cd9b29e8 Mon Sep 17 00:00:00 2001 From: yu-pei Date: Wed, 18 May 2022 13:28:32 -0400 Subject: [PATCH 40/41] include broadcast benchmark codes for PTG and DTD --- .../interfaces/superscalar/insert_function.c | 10 +- .../superscalar/insert_function_internal.h | 1 + .../superscalar/parsec_dtd_data_flush.c | 8 +- tests/CMakeLists.txt | 2 + tests/interfaces/superscalar/CMakeLists.txt | 1 + .../dtd_test_broadcast_collective.c | 224 ++++---------- .../superscalar/dtd_test_broadcast_p2p.c | 281 ++++++++++++++++++ tests/ptg_bcast.jdf | 160 ++++++++++ 8 files changed, 505 insertions(+), 182 deletions(-) create mode 100644 tests/interfaces/superscalar/dtd_test_broadcast_p2p.c create mode 100644 tests/ptg_bcast.jdf diff --git a/parsec/interfaces/superscalar/insert_function.c 
b/parsec/interfaces/superscalar/insert_function.c index 4f1abec46..d1aeb9aff 100644 --- a/parsec/interfaces/superscalar/insert_function.c +++ b/parsec/interfaces/superscalar/insert_function.c @@ -2757,12 +2757,14 @@ parsec_insert_dtd_task(parsec_task_t *__this_task) _array_mask = 1 << (this_task->rank % (8 * sizeof(int))); if(last_writer.task->rank_bits[_array_pos] & _array_mask) { - FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank] = last_writer.task->super.locals[5+this_task->rank%5].value; + //FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank] = last_writer.task->super.locals[5+this_task->rank%5].value; + FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank] = last_writer.task->send_id_storage[this_task->rank]; } else { last_writer.task->rank_bits[_array_pos] |= _array_mask; FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank] = dtd_tp->send_task_id[this_task->rank]++; - last_writer.task->super.locals[5+this_task->rank%5].value = FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank]; + //last_writer.task->super.locals[5+this_task->rank%5].value = FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank]; + last_writer.task->send_id_storage[this_task->rank] = FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank]; } } } else { @@ -2864,12 +2866,12 @@ parsec_insert_dtd_task(parsec_task_t *__this_task) _array_mask = 1 << (this_task->rank % (8 * sizeof(int))); if(last_writer.task->rank_bits[_array_pos] & _array_mask) { - FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank] = last_writer.task->super.locals[5+this_task->rank%5].value; + FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank] = last_writer.task->send_id_storage[this_task->rank]; } else { last_writer.task->rank_bits[_array_pos] |= _array_mask; FLOW_OF(last_writer.task, 
last_writer.flow_index)->msg_keys[this_task->rank] = dtd_tp->send_task_id[this_task->rank]++; - last_writer.task->super.locals[5+this_task->rank%5].value = FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank]; + last_writer.task->send_id_storage[this_task->rank] = FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank]; } } } else { diff --git a/parsec/interfaces/superscalar/insert_function_internal.h b/parsec/interfaces/superscalar/insert_function_internal.h index fb0d3fd3e..3d34d5669 100644 --- a/parsec/interfaces/superscalar/insert_function_internal.h +++ b/parsec/interfaces/superscalar/insert_function_internal.h @@ -183,6 +183,7 @@ struct parsec_dtd_task_s { int32_t rank; int32_t flow_count; int32_t rank_bits[MAX_RANK_INFO]; + int send_id_storage[MAX_RANK_INFO*sizeof(int)*8]; /* enable user trimming, store dest rank send ID for a task, same for all the flows in that task */ /* for testing PTG inserting task in DTD */ parsec_task_t *orig_task; }; diff --git a/parsec/interfaces/superscalar/parsec_dtd_data_flush.c b/parsec/interfaces/superscalar/parsec_dtd_data_flush.c index b613ae062..5c6685f49 100644 --- a/parsec/interfaces/superscalar/parsec_dtd_data_flush.c +++ b/parsec/interfaces/superscalar/parsec_dtd_data_flush.c @@ -196,12 +196,12 @@ parsec_insert_dtd_flush_task(parsec_dtd_task_t *this_task, parsec_dtd_tile_t *ti _array_mask = 1 << (this_task->rank % (8 * sizeof(int))); if(last_writer.task->rank_bits[_array_pos] & _array_mask) { - FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank] = last_writer.task->super.locals[5+this_task->rank%5].value; + FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank] = last_writer.task->send_id_storage[this_task->rank]; } else { last_writer.task->rank_bits[_array_pos] |= _array_mask; FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank] = dtd_tp->send_task_id[this_task->rank]++; - 
last_writer.task->super.locals[5+this_task->rank%5].value = FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank]; + last_writer.task->send_id_storage[this_task->rank] = FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank]; } } } else { @@ -225,12 +225,12 @@ parsec_insert_dtd_flush_task(parsec_dtd_task_t *this_task, parsec_dtd_tile_t *ti _array_mask = 1 << (this_task->rank % (8 * sizeof(int))); if(last_writer.task->rank_bits[_array_pos] & _array_mask) { - FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank] = last_writer.task->super.locals[5+this_task->rank%5].value; + FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank] = last_writer.task->send_id_storage[this_task->rank]; } else { last_writer.task->rank_bits[_array_pos] |= _array_mask; FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank] = dtd_tp->send_task_id[this_task->rank]++; - last_writer.task->super.locals[5+this_task->rank%5].value = FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank]; + last_writer.task->send_id_storage[this_task->rank] = FLOW_OF(last_writer.task, last_writer.flow_index)->msg_keys[this_task->rank]; } } } else { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 9254ea8d3..3447c4236 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -71,6 +71,8 @@ target_ptg_sources(complex_deps PRIVATE "complex_deps.jdf") if( MPI_C_FOUND ) parsec_addtest(C multichain) target_ptg_sources(multichain PRIVATE "multichain.jdf") + parsec_addtest(C ptg_bcast) + target_ptg_sources(ptg_bcast PRIVATE "ptg_bcast.jdf") endif( MPI_C_FOUND ) parsec_addtest(C compose "compose.c") diff --git a/tests/interfaces/superscalar/CMakeLists.txt b/tests/interfaces/superscalar/CMakeLists.txt index aba15aecf..78d117659 100644 --- a/tests/interfaces/superscalar/CMakeLists.txt +++ b/tests/interfaces/superscalar/CMakeLists.txt @@ -21,6 +21,7 @@ parsec_addtest(C 
dtd_test_global_id_for_dc_assumed "dtd_test_global_id_for_dc_as parsec_addtest(C dtd_test_explicit_task_creation "dtd_test_explicit_task_creation.c;${COMMON_DATA}") parsec_addtest(C dtd_test_tp_enqueue_dequeue "dtd_test_tp_enqueue_dequeue.c") parsec_addtest(C dtd_test_broadcast_collective "dtd_test_broadcast_collective.c") +parsec_addtest(C dtd_test_broadcast_p2p "dtd_test_broadcast_p2p.c") # # Shared Memory Testings diff --git a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c index 3a2566342..ba1cb19bb 100644 --- a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c +++ b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c @@ -73,32 +73,24 @@ int write_task_fn( (void)es; // INOUT data - int *val_out; + double *val_in; // Value to set the data to - int data_value; + double data_value; // Task rank int dest_rank; - parsec_dtd_unpack_args(this_task, &val_out, &data_value, &dest_rank); + parsec_dtd_unpack_args(this_task, &val_in, &dest_rank, &data_value); int myrank; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - printf("[write_task] rank = %d, data_value = %d\n", myrank, data_value); + //printf("[write_task] rank = %d, data_value = %f\n", myrank, data_value); - *val_out = data_value; + *val_in = data_value; return PARSEC_HOOK_RETURN_DONE; } -// For debugging purpose -void busy_wait() { - // Debug - int stop = 1; - while (stop) {} -} - - // Retrieve value associated with input data_copy for verification. 
int retrieve_task_fn( parsec_execution_stream_t *es, @@ -107,20 +99,20 @@ int retrieve_task_fn( int myrank = -1; // INPUT data - int *val_in; + double *val_in; // Task rank int dest_rank; - int *val_out; + double *val_out; parsec_dtd_unpack_args(this_task, &val_in, &dest_rank, &val_out); /* int myrank; */ MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - printf("[read_task] rank = %d, val_in = %d\n", myrank, *val_in); + //printf("[read_task] rank = %d, val_in = %f\n", myrank, *val_in); - *val_out = *val_in; + //*val_out = *val_in; return PARSEC_HOOK_RETURN_DONE; } @@ -135,7 +127,7 @@ int dummy_task_fn( } int test_broadcast_mixed( - int world, int myrank, parsec_context_t* parsec_context, int root) { + int world, int myrank, parsec_context_t* parsec_context, int root, int num_elem) { // Test return value: // - 0: success @@ -153,72 +145,43 @@ int test_broadcast_mixed( int data_value = 0; //sleep(40); - // One element per tile - nb = 1; + //number of elements per tile + nb = num_elem; // few tiles per node - nt = world*5; + nt = world; parsec_taskpool_t *dtd_tp = parsec_dtd_taskpool_new(); parsec_matrix_add2arena_rect( &parsec_dtd_arenas_datatypes[TILE_FULL], - parsec_datatype_int32_t, + parsec_datatype_double_t, nb, 1, nb); - - parsec_matrix_add2arena_rect( - &parsec_dtd_arenas_datatypes[TILE_BCAST], - parsec_datatype_int32_t, - nb_bcast, nb_bcast, nb_bcast); + // Initial value on the root node. All node should have this value // at the end of the operation. - int data_root = 55; + double data_root = 55; - // Final value received on non-root nodes. 
- int *data_value_out = (int*) calloc(1, sizeof(int)); - *data_value_out = -33; - if( root == myrank ) { - data_value = data_root; - } - else { - data_value = -10-myrank; - } - - //parsec_tiled_matrix_dc_t *dcB; - //dcB = create_and_distribute_data(myrank, world, nb_bcast, nt); - //parsec_data_collection_set_key((parsec_data_collection_t *)dcB, "B"); - - //parsec_data_collection_t *B = (parsec_data_collection_t *)dcB; - //parsec_dtd_data_collection_init(B); parsec_tiled_matrix_dc_t *dcA; dcA = create_and_distribute_data(myrank, world, nb, nt); parsec_data_collection_set_key((parsec_data_collection_t *)dcA, "A"); parsec_data_collection_t *A = (parsec_data_collection_t *)dcA; + two_dim_block_cyclic_t *__dcA = dcA; parsec_dtd_data_collection_init(A); - // Initialize tiles parsec_data_copy_t *parsec_data_copy; parsec_data_t *parsec_data; // Pointer to local tile data - int *data_ptr; + double *data_ptr; // Local tile key int key; key = A->data_key(A, myrank, 0); parsec_data = A->data_of_key(A, key); parsec_data_copy = parsec_data_get_copy(parsec_data, 0); - data_ptr = (int*)parsec_data_copy_get_ptr(parsec_data_copy); - if (root == myrank) { - *data_ptr = data_value; - } - else { - // Initialise this value with rubbish. It should be equal to - // `data_value` after the execution on even-indexed processes. 
- data_value_out = data_ptr; - } - //parsec_output(0, "Initial data, node: %d A At key[%d]: %d\n", myrank, key, *data_ptr); + data_ptr = (double*)parsec_data_copy_get_ptr(parsec_data_copy); // Registering the dtd_handle with PARSEC context perr = parsec_context_add_taskpool( parsec_context, dtd_tp ); @@ -227,17 +190,24 @@ int test_broadcast_mixed( perr = parsec_context_start(parsec_context); PARSEC_CHECK_ERROR(perr, "parsec_context_start"); + // Initialize tiles + if( root == myrank ) { + parsec_task_t *root_task = parsec_dtd_taskpool_create_task( + dtd_tp, write_task_fn, 0, "root_task", + PASSED_BY_REF, PARSEC_DTD_TILE_OF(A, myrank, 0), PARSEC_INOUT | TILE_FULL, + sizeof(int), &myrank, PARSEC_VALUE | PARSEC_AFFINITY, + sizeof(double*), &data_root, PARSEC_VALUE, + PARSEC_DTD_ARG_END); + parsec_dtd_task_t *dtd_root_task = (parsec_dtd_task_t *)root_task; + parsec_insert_dtd_task(dtd_root_task); + } + // Key of tile associated with root node int key_root; parsec_dtd_tile_t* dtd_tile_root; - parsec_dtd_tile_t* bcast_keys_root; - //if(myrank % 2 == 1 || myrank == root) { - if(1) { - key_root = key = A->data_key(A, root, 0); - dtd_tile_root = PARSEC_DTD_TILE_OF_KEY(A, key_root); - //key_root = B->data_key(B, root, 0); - //bcast_keys_root = PARSEC_DTD_TILE_OF_KEY(B, key_root); - } + + key_root = key = A->data_key(A, root, 0); + dtd_tile_root = PARSEC_DTD_TILE_OF_KEY(A, key_root); // Create array of destination ranks int num_dest_ranks = 0; @@ -246,12 +216,9 @@ int test_broadcast_mixed( // Destination rank index int dest_rank_idx = 0 ; - // Put odd rank indexes into `dest_ranks` array except for the root - // node. 
VALID ONLY ON THE ROOT NODE + // VALID ONLY ON THE ROOT NODE for (int rank = 0; rank < world; ++rank) { - //if (rank % 2 == 0 || rank == root) continue; if (rank == root) continue; - dest_ranks[dest_rank_idx] = rank; ++dest_rank_idx; } @@ -260,96 +227,27 @@ int test_broadcast_mixed( // // Perform Broadcast // - //if(myrank % 2 == 1 || myrank == root) { - if(1) { - fprintf(stderr, "perform bcast from rank %d\n", myrank); - parsec_dtd_broadcast( - dtd_tp, root, - dtd_tile_root, TILE_FULL, - //bcast_keys_root, TILE_BCAST, - dest_ranks, num_dest_ranks); - } - - // - // Retrieve value of broadcasted data - // - //if(myrank % 2 == 1 || myrank == root) { - //if(myrank % 2 == 1) { - if(1) { - //for (int rank = 0; rank < world; ++rank) { - - //if (rank % 2 == 0 || rank == root) continue; - - parsec_task_t *retrieve_task = parsec_dtd_taskpool_create_task( - dtd_tp, retrieve_task_fn, 0, "retrieve_task", - PASSED_BY_REF, dtd_tile_root, PARSEC_INPUT | TILE_FULL, - sizeof(int), &myrank, PARSEC_VALUE | PARSEC_AFFINITY, - sizeof(int*), &data_value_out, PARSEC_VALUE, - PARSEC_DTD_ARG_END); - parsec_dtd_task_t *dtd_retrieve_task = (parsec_dtd_task_t *)retrieve_task; - parsec_insert_dtd_task(retrieve_task); - - //} - } -for(int iter=1; iter <= 0; iter++) { - - int new_value = -1; - key_root = key = A->data_key(A, root+iter*world, 0); - dtd_tile_root = PARSEC_DTD_TILE_OF_KEY(A, key_root); - if (root == myrank) { - //*data_ptr = 1998; - new_value = 1998+iter; - parsec_dtd_taskpool_insert_task(dtd_tp, - write_task_fn, 0, "write_task", - PASSED_BY_REF, dtd_tile_root, PARSEC_INOUT | TILE_FULL, - sizeof(int), &new_value, PARSEC_VALUE, - sizeof(int), &root, PARSEC_VALUE | PARSEC_AFFINITY, - PARSEC_DTD_ARG_END); - } - else { - //data_value_out = data_ptr; - } - - - // Put all rank indexes into `dest_ranks` array except for the root - // node. 
- - int dest_rank_idx = 0; - int *dest_ranks = (int*)malloc(world*sizeof(int)); - - for (int rank = 0; rank < world; ++rank) { - if (rank == root) continue; - dest_ranks[dest_rank_idx] = rank; - ++dest_rank_idx; - } - int num_dest_ranks = dest_rank_idx; - - // - // Perform Broadcast AGAIN - // + //fprintf(stderr, "perform bcast from rank %d\n", myrank); parsec_dtd_broadcast( dtd_tp, root, dtd_tile_root, TILE_FULL, - //bcast_keys_root, TILE_BCAST, dest_ranks, num_dest_ranks); // // Retrieve value of broadcasted data // - //if ( myrank != root) { + double* data_value_out = -1; + if(1) { parsec_task_t *retrieve_task = parsec_dtd_taskpool_create_task( dtd_tp, retrieve_task_fn, 0, "retrieve_task", PASSED_BY_REF, dtd_tile_root, PARSEC_INPUT | TILE_FULL, sizeof(int), &myrank, PARSEC_VALUE | PARSEC_AFFINITY, - sizeof(int*), &data_value_out, PARSEC_VALUE, + sizeof(double*), &data_value_out, PARSEC_VALUE, PARSEC_DTD_ARG_END); parsec_dtd_task_t *dtd_retrieve_task = (parsec_dtd_task_t *)retrieve_task; parsec_insert_dtd_task(retrieve_task); - - //} -} + } parsec_dtd_data_flush_all( dtd_tp, A ); - //parsec_dtd_data_flush_all( dtd_tp, B ); // Wait for task completion perr = parsec_dtd_taskpool_wait( dtd_tp ); @@ -357,47 +255,27 @@ for(int iter=1; iter <= 0; iter++) { perr = parsec_context_wait(parsec_context); PARSEC_CHECK_ERROR(perr, "parsec_context_wait"); - - // Check whether we obtained the correct value on the current node - // at the end of the test. 
Odd processes should have received the - // value form the root and other processes should have kept their - // original value - if ((myrank == root) || - ((myrank % 2 == 1) && (data_root == *data_value_out)) || - ((myrank % 2 == 0) && (data_value == *data_ptr))) { - // Data received as expected - ret = 0; - } - else { - // Error - ret = -1; - } - -// parsec_output( 0, "Checking result, node: %d, data_value_out: %d\n", myrank, *data_value_out ); // Cleanup data and parsec data structures parsec_type_free(&parsec_dtd_arenas_datatypes[TILE_FULL].opaque_dtt); PARSEC_OBJ_RELEASE(parsec_dtd_arenas_datatypes[TILE_FULL].arena); - parsec_type_free(&parsec_dtd_arenas_datatypes[TILE_BCAST].opaque_dtt); - PARSEC_OBJ_RELEASE(parsec_dtd_arenas_datatypes[TILE_BCAST].arena); parsec_dtd_data_collection_fini( A ); - //parsec_dtd_data_collection_fini( B ); free_data(dcA); - //free_data(dcB); - parsec_taskpool_free( dtd_tp ); return ret; - } int main(int argc, char **argv) { int ret; parsec_context_t* parsec_context = NULL; - + double starttime, endtime; int rank, world; + char *p; + int nt = strtol(argv[1], &p, 10); + nt = nt*nt; { int provided; MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &provided); @@ -408,16 +286,14 @@ int main(int argc, char **argv) { /* int ncores = 1; */ int ncores = 2; parsec_context = parsec_init(ncores, &argc, &argv); - - // Root node for the broadcast operation - - sleep(30); - // - // Simple broadcast // Testing trimming with a mixed destinations of receivers for broadcast - ret = test_broadcast_mixed(world, rank, parsec_context, 0); - + //MPI_Barrier(MPI_COMM_WORLD); + starttime = MPI_Wtime(); + ret = test_broadcast_mixed(world, rank, parsec_context, 0, nt); + MPI_Barrier(MPI_COMM_WORLD); + endtime = MPI_Wtime(); + if(rank==0)printf("That took %f seconds\n",endtime-starttime); parsec_fini(&parsec_context); diff --git a/tests/interfaces/superscalar/dtd_test_broadcast_p2p.c b/tests/interfaces/superscalar/dtd_test_broadcast_p2p.c new file mode 
100644 index 000000000..6bc771932 --- /dev/null +++ b/tests/interfaces/superscalar/dtd_test_broadcast_p2p.c @@ -0,0 +1,281 @@ +#include "mpi.h" + +#include "parsec.h" +#include "parsec/arena.h" +#include "parsec/data_dist/matrix/matrix.h" +#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" +#include "parsec/remote_dep.h" +#include "parsec/data_internal.h" +#include "parsec/interfaces/superscalar/insert_function_internal.h" +#include "parsec/interfaces/superscalar/insert_function.h" + +#include +#include + +enum regions + { + TILE_FULL, + TILE_BCAST + }; + +parsec_tiled_matrix_dc_t *create_and_distribute_data(int rank, int world, int mb, int mt) +{ + two_dim_block_cyclic_t *m = (two_dim_block_cyclic_t*)malloc(sizeof(two_dim_block_cyclic_t)); + two_dim_block_cyclic_init(m, matrix_ComplexDouble, matrix_Tile, + rank, + mb, 1, + mt*mb, 1, + 0, 0, + mt*mb, 1, + world, 1, + 1, 1, + 0, 0); + + m->mat = parsec_data_allocate((size_t)m->super.nb_local_tiles * + (size_t)m->super.bsiz * + (size_t)parsec_datadist_getsizeoftype(m->super.mtype)); + + return (parsec_tiled_matrix_dc_t*)m; +} + +void free_data(parsec_tiled_matrix_dc_t *d) +{ + parsec_matrix_destroy_data(d); + parsec_data_collection_destroy(&d->super); + free(d); +} + +// Read data +int read_task_fn( + parsec_execution_stream_t *es, + parsec_task_t *this_task ) { + (void)es; + + // INPUT data + int *val_in; + // Task rank + int dest_rank; + + parsec_dtd_unpack_args(this_task, &val_in, &dest_rank); + + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + //printf("[read_task] rank = %d, val_in = %d\n", myrank, *val_in); + + return PARSEC_HOOK_RETURN_DONE; +} + +// Write data +int write_task_fn( + parsec_execution_stream_t *es, + parsec_task_t *this_task) { + (void)es; + + // INOUT data + double *val_in; + // Value to set the data to + double data_value; + // Task rank + int dest_rank; + + parsec_dtd_unpack_args(this_task, &val_in, &dest_rank, &data_value); + + //sleep(1); + int myrank; + 
MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + //printf("[write_task] rank = %d, data_value = %f\n", myrank, data_value); + + *val_in = data_value; + + return PARSEC_HOOK_RETURN_DONE; +} + +// Retrieve value associated with input data_copy for verification. +int retrieve_task_fn( + parsec_execution_stream_t *es, + parsec_task_t *this_task ) { + (void)es; + + int myrank = -1; + // INPUT data + double *val_in; + // Task rank + int dest_rank; + + double *val_out; + + parsec_dtd_unpack_args(this_task, &val_in, &dest_rank, &val_out); + + /* int myrank; */ + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + //printf("[read_task] rank = %d, val_in = %f\n", myrank, *val_in); + + //*val_out = *val_in; + + return PARSEC_HOOK_RETURN_DONE; +} + + +int dummy_task_fn( + parsec_execution_stream_t *es, + parsec_task_t *this_task) { + (void)es;(void)this_task; + + return PARSEC_HOOK_RETURN_DONE; +} + +int test_broadcast_mixed( + int world, int myrank, parsec_context_t* parsec_context, int root, int num_elem) { + + // Test return value: + // - 0: success + // - Failure otherwise + int ret = 0; + + // Error code return by parsec routines + int perr; + + // Tile size + int nb = 1; + int nb_bcast = 30; + // Total number of tiles + int nt = 1; + int data_value = 0; + + //sleep(40); + //number of elements per tile + nb = num_elem; + // few tiles per node + nt = world; + + parsec_taskpool_t *dtd_tp = parsec_dtd_taskpool_new(); + + parsec_matrix_add2arena_rect( + &parsec_dtd_arenas_datatypes[TILE_FULL], + parsec_datatype_double_t, + nb, 1, nb); + + // Initial value on the root node. All node should have this value + // at the end of the operation. 
+ double data_root = 55; + + + + parsec_tiled_matrix_dc_t *dcA; + dcA = create_and_distribute_data(myrank, world, nb, nt); + parsec_data_collection_set_key((parsec_data_collection_t *)dcA, "A"); + + parsec_data_collection_t *A = (parsec_data_collection_t *)dcA; + two_dim_block_cyclic_t *__dcA = dcA; + parsec_dtd_data_collection_init(A); + + parsec_data_copy_t *parsec_data_copy; + parsec_data_t *parsec_data; + // Pointer to local tile data + double *data_ptr; + // Local tile key + int key; + + key = A->data_key(A, myrank, 0); + parsec_data = A->data_of_key(A, key); + parsec_data_copy = parsec_data_get_copy(parsec_data, 0); + data_ptr = (double*)parsec_data_copy_get_ptr(parsec_data_copy); + + // Registering the dtd_handle with PARSEC context + perr = parsec_context_add_taskpool( parsec_context, dtd_tp ); + PARSEC_CHECK_ERROR(perr, "parsec_context_add_taskpool"); + + perr = parsec_context_start(parsec_context); + PARSEC_CHECK_ERROR(perr, "parsec_context_start"); + MPI_Barrier(MPI_COMM_WORLD); + // Initialize tiles + if( 1 ) { + parsec_task_t *root_task = parsec_dtd_taskpool_create_task( + dtd_tp, write_task_fn, 0, "root_task", + PASSED_BY_REF, PARSEC_DTD_TILE_OF(A, root, 0), PARSEC_INOUT | TILE_FULL, + sizeof(int), &root, PARSEC_VALUE | PARSEC_AFFINITY, + sizeof(double*), &data_root, PARSEC_VALUE, + PARSEC_DTD_ARG_END); + parsec_dtd_task_t *dtd_root_task = (parsec_dtd_task_t *)root_task; + parsec_insert_dtd_task(dtd_root_task); + } + + // Key of tile associated with root node + int key_root; + parsec_dtd_tile_t* dtd_tile_root; + + key_root = key = A->data_key(A, root, 0); + dtd_tile_root = PARSEC_DTD_TILE_OF_KEY(A, key_root); + + // + // Retrieve value of broadcasted data + // + double* data_value_out = -1; + for(int irank = 0; irank < world; irank++) { + //parsec_task_t *retrieve_task = parsec_dtd_taskpool_create_task( + parsec_dtd_taskpool_insert_task( + dtd_tp, retrieve_task_fn, 0, "retrieve_task", + PASSED_BY_REF, PARSEC_DTD_TILE_OF(A, root, 0), PARSEC_INPUT | 
TILE_FULL, + sizeof(int), &irank, PARSEC_VALUE | PARSEC_AFFINITY, + sizeof(double*), &data_value_out, PARSEC_VALUE, + PARSEC_DTD_ARG_END); + //parsec_dtd_task_t *dtd_retrieve_task = (parsec_dtd_task_t *)retrieve_task; + //parsec_insert_dtd_task(retrieve_task); + } + parsec_dtd_data_flush_all( dtd_tp, A ); + + // Wait for task completion + perr = parsec_dtd_taskpool_wait( dtd_tp ); + PARSEC_CHECK_ERROR(perr, "parsec_dtd_taskpool_wait"); + + perr = parsec_context_wait(parsec_context); + PARSEC_CHECK_ERROR(perr, "parsec_context_wait"); + + // Cleanup data and parsec data structures + parsec_type_free(&parsec_dtd_arenas_datatypes[TILE_FULL].opaque_dtt); + PARSEC_OBJ_RELEASE(parsec_dtd_arenas_datatypes[TILE_FULL].arena); + parsec_dtd_data_collection_fini( A ); + free_data(dcA); + parsec_taskpool_free( dtd_tp ); + + return ret; +} + +int main(int argc, char **argv) { + + int ret; + parsec_context_t* parsec_context = NULL; + double starttime, endtime; + int rank, world; + + char *p; + int nt = strtol(argv[1], &p, 10); + nt = nt*nt; + { + int provided; + MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &provided); + } + MPI_Comm_size(MPI_COMM_WORLD, &world); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + /* int ncores = 1; */ + int ncores = 2; + parsec_context = parsec_init(ncores, &argc, &argv); + + // Testing trimming with a mixed destinations of receivers for broadcast + //MPI_Barrier(MPI_COMM_WORLD); + starttime = MPI_Wtime(); + ret = test_broadcast_mixed(world, rank, parsec_context, 0, nt); + MPI_Barrier(MPI_COMM_WORLD); + endtime = MPI_Wtime(); + if(rank==0)printf("That took %f seconds\n",endtime-starttime); + + parsec_fini(&parsec_context); + + MPI_Finalize(); + (void)ret; + return 0; +} diff --git a/tests/ptg_bcast.jdf b/tests/ptg_bcast.jdf new file mode 100644 index 000000000..809052cc4 --- /dev/null +++ b/tests/ptg_bcast.jdf @@ -0,0 +1,160 @@ +extern "C" %{ +/** + * PTG_BCAST BENCHMARK + **/ + +#include +#include +#include +#include +#include 
"parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" + +#include "ptg_bcast.h" +static parsec_ptg_bcast_taskpool_t* tp; +static int verbose = 0; + +%} + +descA [type = "two_dim_block_cyclic_t*"] +descB [type = "two_dim_block_cyclic_t*" aligned=descA] + + + + +WRITER(i) + + i = 0..0 + +: descA(0, 0) + +RW A <- descA(i, 0) + -> A READERS(0..descA->super.mt-1) +BODY + if(verbose) + printf("Executed WRITE TASK ON ROOT\n"); +END + + +READERS(i) +i = 0 .. descA->super.mt-1 + +: descA(i, 0) + +READ A <- A WRITER(0) +RW B <- descB(i, 0) + -> descB(i, 0) +BODY + if(verbose) + printf("Executed READ OF A and UPDATE of B\n"); +END + +extern "C" %{ +parsec_taskpool_t* +parsec_ptg_bcast_New(parsec_tiled_matrix_dc_t *dcA, parsec_tiled_matrix_dc_t *dcB) +{ + parsec_taskpool_t* ptg_bcast_taskpool; + parsec_ptg_bcast_taskpool_t* taskpool = NULL; + + taskpool = parsec_ptg_bcast_new(dcA, dcB); + ptg_bcast_taskpool = (parsec_taskpool_t*)taskpool; + + parsec_matrix_add2arena( &taskpool->arenas_datatypes[PARSEC_ptg_bcast_DEFAULT_ARENA], + parsec_datatype_double_t, matrix_UpperLower, + 1, dcA->mb, dcA->nb, dcA->mb, + PARSEC_ARENA_ALIGNMENT_SSE, -1 ); + + return ptg_bcast_taskpool; +} + +void parsec_ptg_bcast_Destruct(parsec_taskpool_t *taskpool) +{ + parsec_ptg_bcast_taskpool_t *ptg_bcast_taskpool = (parsec_ptg_bcast_taskpool_t *)taskpool; + parsec_matrix_del2arena(&ptg_bcast_taskpool->arenas_datatypes[PARSEC_ptg_bcast_DEFAULT_ARENA]); + parsec_taskpool_free(taskpool); +} + +int parsec_ptg_bcast(parsec_context_t *parsec, + parsec_tiled_matrix_dc_t *A, + parsec_tiled_matrix_dc_t *B) +{ + parsec_taskpool_t *parsec_ptg_bcast = NULL; + + parsec_ptg_bcast = parsec_ptg_bcast_New(A, B); + + if( parsec_ptg_bcast != NULL ){ + parsec_enqueue(parsec, parsec_ptg_bcast); + parsec_context_start(parsec); + parsec_context_wait(parsec); + parsec_ptg_bcast_Destruct(parsec_ptg_bcast); + } + + return 0; +} + +int main(int argc, char* argv[]) +{ + two_dim_block_cyclic_t descA, descB; + parsec_arena_datatype_t 
adt; + parsec_context_t *parsec; + int rank = 0, size = 1, mat_size; + long time_elapsed; + int nt; + nt = strtol(argv[1], NULL, 10); +#ifdef DISTRIBUTED + { + int provided; + MPI_Init_thread(NULL, NULL, MPI_THREAD_SERIALIZED, &provided); + } + int world; + MPI_Comm_size(MPI_COMM_WORLD, &world); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); +#endif /* DISTRIBUTED */ + parsec = parsec_init(2, &argc, &argv); + assert( NULL != parsec ); + + //MPI_Barrier(MPI_COMM_WORLD); + two_dim_block_cyclic_init( &descA, matrix_RealDouble, matrix_Tile, + rank /*rank*/, + nt*nt, 1, + world*nt*nt, 1, + 0, 0, + world*nt*nt, 1, + world, 1, + 1, 1, + 0, 0); + descA.mat = parsec_data_allocate( descA.super.nb_local_tiles * + descA.super.bsiz * + parsec_datadist_getsizeoftype(descA.super.mtype) ); + two_dim_block_cyclic_init( &descB, matrix_RealDouble, matrix_Tile, + rank /*rank*/, + nt*nt, 1, + world*nt*nt, 1, + 0, 0, + world*nt*nt, 1, + world, 1, + 1, 1, + 0, 0); + descB.mat = parsec_data_allocate( descB.super.nb_local_tiles * + descB.super.bsiz * + parsec_datadist_getsizeoftype(descA.super.mtype) ); + + //SYNC_TIME_START(); + double starttime, endtime; + starttime = MPI_Wtime(); + parsec_ptg_bcast(parsec, (parsec_tiled_matrix_dc_t *)&descA, (parsec_tiled_matrix_dc_t *)&descB); + MPI_Barrier(MPI_COMM_WORLD); + endtime = MPI_Wtime(); + if(rank==0)printf("That took %f seconds\n",endtime-starttime); + parsec_data_free(descA.mat); + parsec_data_free(descB.mat); + parsec_tiled_matrix_dc_destroy((parsec_tiled_matrix_dc_t*)&descA); + parsec_tiled_matrix_dc_destroy((parsec_tiled_matrix_dc_t*)&descB); + + parsec_fini(&parsec); +#ifdef DISTRIBUTED + MPI_Finalize(); +#endif /* DISTRIBUTED */ + return 0; +} + +%} From b9c717a7fe4679d0d99f8abd8531f9737b0d5cf4 Mon Sep 17 00:00:00 2001 From: Yu Pei Date: Thu, 19 May 2022 10:29:37 -0400 Subject: [PATCH 41/41] move the timing to just around the bcast --- .../superscalar/dtd_test_broadcast_collective.c | 16 +++++++++++----- 
.../superscalar/dtd_test_broadcast_p2p.c | 13 +++++++++---- tests/ptg_bcast.jdf | 16 ++++++++++++---- 3 files changed, 32 insertions(+), 13 deletions(-) diff --git a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c index ba1cb19bb..e8f585194 100644 --- a/tests/interfaces/superscalar/dtd_test_broadcast_collective.c +++ b/tests/interfaces/superscalar/dtd_test_broadcast_collective.c @@ -84,6 +84,7 @@ int write_task_fn( int myrank; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + //sleep(1); //printf("[write_task] rank = %d, data_value = %f\n", myrank, data_value); *val_in = data_value; @@ -149,6 +150,7 @@ int test_broadcast_mixed( nb = num_elem; // few tiles per node nt = world; + double starttime, endtime; parsec_taskpool_t *dtd_tp = parsec_dtd_taskpool_new(); @@ -190,6 +192,8 @@ int test_broadcast_mixed( perr = parsec_context_start(parsec_context); PARSEC_CHECK_ERROR(perr, "parsec_context_start"); + MPI_Barrier(MPI_COMM_WORLD); + starttime = MPI_Wtime(); // Initialize tiles if( root == myrank ) { parsec_task_t *root_task = parsec_dtd_taskpool_create_task( @@ -255,6 +259,9 @@ int test_broadcast_mixed( perr = parsec_context_wait(parsec_context); PARSEC_CHECK_ERROR(perr, "parsec_context_wait"); + MPI_Barrier(MPI_COMM_WORLD); + endtime = MPI_Wtime(); + if(myrank==0)printf("That took %f seconds\n",endtime-starttime); // Cleanup data and parsec data structures parsec_type_free(&parsec_dtd_arenas_datatypes[TILE_FULL].opaque_dtt); @@ -270,7 +277,6 @@ int main(int argc, char **argv) { int ret; parsec_context_t* parsec_context = NULL; - double starttime, endtime; int rank, world; char *p; @@ -289,11 +295,11 @@ int main(int argc, char **argv) { // Testing trimming with a mixed destinations of receivers for broadcast //MPI_Barrier(MPI_COMM_WORLD); - starttime = MPI_Wtime(); + //starttime = MPI_Wtime(); ret = test_broadcast_mixed(world, rank, parsec_context, 0, nt); - MPI_Barrier(MPI_COMM_WORLD); - endtime = 
MPI_Wtime(); - if(rank==0)printf("That took %f seconds\n",endtime-starttime); + //MPI_Barrier(MPI_COMM_WORLD); + //endtime = MPI_Wtime(); + //if(rank==0)printf("That took %f seconds\n",endtime-starttime); parsec_fini(&parsec_context); diff --git a/tests/interfaces/superscalar/dtd_test_broadcast_p2p.c b/tests/interfaces/superscalar/dtd_test_broadcast_p2p.c index 6bc771932..8010d3632 100644 --- a/tests/interfaces/superscalar/dtd_test_broadcast_p2p.c +++ b/tests/interfaces/superscalar/dtd_test_broadcast_p2p.c @@ -135,6 +135,7 @@ int test_broadcast_mixed( // - Failure otherwise int ret = 0; + double starttime, endtime; // Error code return by parsec routines int perr; @@ -191,6 +192,7 @@ int test_broadcast_mixed( perr = parsec_context_start(parsec_context); PARSEC_CHECK_ERROR(perr, "parsec_context_start"); MPI_Barrier(MPI_COMM_WORLD); + starttime = MPI_Wtime(); // Initialize tiles if( 1 ) { parsec_task_t *root_task = parsec_dtd_taskpool_create_task( @@ -230,6 +232,9 @@ int test_broadcast_mixed( // Wait for task completion perr = parsec_dtd_taskpool_wait( dtd_tp ); PARSEC_CHECK_ERROR(perr, "parsec_dtd_taskpool_wait"); + MPI_Barrier(MPI_COMM_WORLD); + endtime = MPI_Wtime(); + if(myrank==0)printf("That took %f seconds\n",endtime-starttime); perr = parsec_context_wait(parsec_context); PARSEC_CHECK_ERROR(perr, "parsec_context_wait"); @@ -267,11 +272,11 @@ int main(int argc, char **argv) { // Testing trimming with a mixed destinations of receivers for broadcast //MPI_Barrier(MPI_COMM_WORLD); - starttime = MPI_Wtime(); + //starttime = MPI_Wtime(); ret = test_broadcast_mixed(world, rank, parsec_context, 0, nt); - MPI_Barrier(MPI_COMM_WORLD); - endtime = MPI_Wtime(); - if(rank==0)printf("That took %f seconds\n",endtime-starttime); + //MPI_Barrier(MPI_COMM_WORLD); + //endtime = MPI_Wtime(); + //if(rank==0)printf("That took %f seconds\n",endtime-starttime); parsec_fini(&parsec_context); diff --git a/tests/ptg_bcast.jdf b/tests/ptg_bcast.jdf index 809052cc4..c6003ff43 100644 --- 
a/tests/ptg_bcast.jdf +++ b/tests/ptg_bcast.jdf @@ -81,10 +81,17 @@ int parsec_ptg_bcast(parsec_context_t *parsec, parsec_ptg_bcast = parsec_ptg_bcast_New(A, B); + double starttime, endtime; + +MPI_Barrier (MPI_COMM_WORLD); + starttime = MPI_Wtime(); if( parsec_ptg_bcast != NULL ){ parsec_enqueue(parsec, parsec_ptg_bcast); parsec_context_start(parsec); parsec_context_wait(parsec); +MPI_Barrier(MPI_COMM_WORLD); +endtime = MPI_Wtime (); + if(parsec->my_rank==0)printf("That took %f seconds\n",endtime-starttime); parsec_ptg_bcast_Destruct(parsec_ptg_bcast); } @@ -140,11 +147,12 @@ int main(int argc, char* argv[]) //SYNC_TIME_START(); double starttime, endtime; - starttime = MPI_Wtime(); - parsec_ptg_bcast(parsec, (parsec_tiled_matrix_dc_t *)&descA, (parsec_tiled_matrix_dc_t *)&descB); MPI_Barrier(MPI_COMM_WORLD); - endtime = MPI_Wtime(); - if(rank==0)printf("That took %f seconds\n",endtime-starttime); + //starttime = MPI_Wtime(); + parsec_ptg_bcast(parsec, (parsec_tiled_matrix_dc_t *)&descA, (parsec_tiled_matrix_dc_t *)&descB); + //MPI_Barrier(MPI_COMM_WORLD); + //endtime = MPI_Wtime(); + //if(rank==0)printf("That took %f seconds\n",endtime-starttime); parsec_data_free(descA.mat); parsec_data_free(descB.mat); parsec_tiled_matrix_dc_destroy((parsec_tiled_matrix_dc_t*)&descA);