Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -110,12 +110,9 @@ option(PARSEC_GPU_CUDA_ALLOC_PER_TILE
"Tile based allocation engine for GPU memory (instead of internal management
of a complete allocation)" OFF)
mark_as_advanced(PARSEC_GPU_CUDA_ALLOC_PER_TILE)
option(PARSEC_GPU_WITH_OPENCL
"Enable GPU support using OpenCL kernels" OFF)
mark_as_advanced(PARSEC_GPU_WITH_OPENCL) # Hide this as it is not supported yet
if(PARSEC_GPU_WITH_OPENCL)
message(WARNING "Open CL is not supported yet, ignored.")
endif()
option(PARSEC_GPU_WITH_OPENMP
"Enable GPU support using OpenMP target" OFF)
mark_as_advanced(PARSEC_GPU_WITH_OPENMP) # Hide this as it is experimental now

### Debug options
if( "Debug" STREQUAL CMAKE_BUILD_TYPE )
Expand Down Expand Up @@ -625,6 +622,11 @@ IF( BUILD_PARSEC )
endif (CUDA_FOUND)
endif( PARSEC_GPU_WITH_CUDA )

if( PARSEC_GPU_WITH_OPENMP )
find_package(OpenMP 4.5)
set(PARSEC_HAVE_OPENMP ${OPENMP_FOUND})
endif( PARSEC_GPU_WITH_OPENMP )

find_package(AYUDAME QUIET)
set(PARSEC_HAVE_AYUDAME ${AYUDAME_FOUND})
#
Expand Down
34 changes: 34 additions & 0 deletions dplasma/cores/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@ set(generated_headers "")

include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR})

if( OPENMP_FOUND )
set(CORE_OMP_HEADERS
"")
set(CORE_OMP_SOURCES
omp_zgemm.c)
else()
set(CORE_OMP_HEADERS "")
set(CORE_OMP_SOURCES "")
endif()

if( CUDA_FOUND )
set(CORE_CUDA_HEADERS
)
Expand Down Expand Up @@ -64,6 +74,30 @@ install(TARGETS dplasma_cores
ARCHIVE DESTINATION ${DPLASMA_INSTALL_PREFIX}/lib
LIBRARY DESTINATION ${DPLASMA_INSTALL_PREFIX}/lib)

if( OPENMP_FOUND )
# generate the cores_omp library (hooks to mkl in omp target regions)
precisions_rules_py(generated_omp_files
${CORE_OMP_SOURCES}
PRECISIONS "${DPLASMA_PRECISIONS}")

if( NOT "${generated_omp_files}" STREQUAL "" )
add_library(dplasma_cores_omp
${generated_omp_files})
set_target_properties(dplasma_cores_omp PROPERTIES COMPILE_FLAGS "${OpenMP_C_FLAGS}")
set_target_properties(dplasma_cores_omp PROPERTIES LINK_FLAGS "-qoffload-option,mic,ld,\"-L$ENV{MKLROOT}/lib/mic -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core\"")
add_dependencies(dplasma_cores_omp
dplasma_includes
dplasma_cores_includes)
target_link_libraries(dplasma_cores_omp
${COREBLAS_LIBRARIES}
${EXTRA_LIBS})
target_link_libraries(dplasma_cores dplasma_cores_omp)
install(TARGETS dplasma_cores_omp
ARCHIVE DESTINATION ${DPLASMA_INSTALL_PREFIX}/lib
LIBRARY DESTINATION ${DPLASMA_INSTALL_PREFIX}/lib)
endif()
endif( OPENMP_FOUND)

if( CUDA_FOUND )
# generate the cores_cuda library (hooks to cublas)
precisions_rules_py(generated_cuda_files
Expand Down
37 changes: 37 additions & 0 deletions dplasma/cores/omp_zgemm.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* @precisions normal z -> c d s
*/
#include "dplasma.h"
#include "dplasma_cores.h"
#include "dplasma_zcores.h"
#include <cblas.h>
#include <omp.h>

void CORE_omp_zgemm(PLASMA_enum transA, int transB,
int M, int N, int K,
PLASMA_Complex64_t alpha, const PLASMA_Complex64_t *A, int LDA,
const PLASMA_Complex64_t *B, int LDB,
PLASMA_Complex64_t beta, PLASMA_Complex64_t *C, int LDC, int device_index, int *device_stream) {
//map(to: transA, transB, M, N, K, alpha, beta, LDA, LDB, LDC)
#pragma omp target nowait device(device_index) depend(out: device_stream[0]) is_device_ptr(A, B, C)
{
#pragma omp declare target
void cblas_zgemm(const enum CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA,
const CBLAS_TRANSPOSE TransB, const int M, const int N,
const int K, const void *alpha, const void *A,
const int lda, const void *B, const int ldb,
const void *beta, void *C, const int ldc);
#pragma omp end declare target

printf("Gemm %d %d %d A=%p, B=%p, C=%p device=%s event=%p (%d)\n", M, N, K, A, B, C, omp_is_initial_device()? "host": "offloaded", device_stream, device_stream[0]);
cblas_zgemm(
CblasColMajor,
(CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB,
M, N, K,
&alpha, A, LDA,
B, LDB,
&beta, C, LDC);
}
//#pragma omp taskwait
}

1 change: 1 addition & 0 deletions dplasma/lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ list(APPEND generated_jdf
)

jdf_rules(generated_files "${generated_jdf}")
#set_source_files_properties(spotrf_U.c PROPERTIES COMPILE_FLAGS ${OpenMP_C_FLAGS})

### Generate the dplasma wrappers for all required precisions
set(SOURCES
Expand Down
4 changes: 2 additions & 2 deletions dplasma/lib/zgeqrf.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -402,8 +402,8 @@ BODY [type=CUDA device=%{ return n; %}]
int ldak = BLKLDD( descA, k );
int ldam = BLKLDD( descA, m );

WORK = parsec_gpu_pop_workspace(gpu_device, gpu_stream, descA->nb * ib * sizeof(parsec_complex64_t));
WORKC = parsec_gpu_pop_workspace(gpu_device, gpu_stream, descA->mb * ib * sizeof(parsec_complex64_t));
WORK = parsec_cuda_pop_workspace(gpu_device, gpu_stream, descA->nb * ib * sizeof(parsec_complex64_t));
WORKC = parsec_cuda_pop_workspace(gpu_device, gpu_stream, descA->mb * ib * sizeof(parsec_complex64_t));

dplasma_cuda_ztsmqr( PlasmaLeft, PlasmaConjTrans,
descA->mb, tempnn, tempmm, tempnn, descA->nb, ib,
Expand Down
48 changes: 48 additions & 0 deletions dplasma/lib/zpotrf_U.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,54 @@ RW C <- (k == 0) ? descA(m, n) : C potrf_zgemm(m, n, k-1)

; (n >= (descA->nt - PRI_CHANGE)) ? (descA->nt - n) * (descA->nt - n) * (descA->nt - n) + 3 * ((2 * descA->nt) - m - n - 3) * (n - m) + 6 * (n - k) : PRI_MAX


BODY [type=OPENMP
weight=(m+1-k)]
{
int tempnn = n == descA->nt-1 ? descA->n - n * descA->nb : descA->nb;
int ldak = BLKLDD( descA, k );
int ldam = BLKLDD( descA, m );

#if !defined(PARSEC_DRY_RUN)
#if 0
#pragma omp target nowait device(parsec_body.index) depend(out: parsec_openmp_stream[0]) is_device_ptr(A, B, C)
//#pragma omp task depend(inout: parsec_openmp_stream[0])
{
printf("Gemm %d %d %d A=%p, B=%p, C=%p device=%s event=%p (%d)\n", m, n, k, A, B, C, omp_is_initial_device()? "host": "offloaded", parsec_openmp_stream, parsec_openmp_stream[0]);
//for(int i = 0; i < m; i++) printf("C[0][%d]=%g\n", i, ((float*)C)[i]);
#pragma omp declare target
void CORE_zgemm(PLASMA_enum transA, int transB,
int M, int N, int K,
PLASMA_Complex64_t alpha, const PLASMA_Complex64_t *A, int LDA,
const PLASMA_Complex64_t *B, int LDB,
PLASMA_Complex64_t beta, PLASMA_Complex64_t *C, int LDC);
#pragma omp end declare target
CORE_zgemm(PlasmaConjTrans, PlasmaNoTrans,
descA->mb, tempnn, descA->nb,
(parsec_complex64_t)-1.0, A /*A(k, m)*/, ldak,
B /*A(k, n)*/, ldak,
(parsec_complex64_t) 1.0, C /*A(m, n)*/, ldam);
}
#else
CORE_omp_zgemm(PlasmaConjTrans, PlasmaNoTrans,
descA->mb, tempnn, descA->nb,
(parsec_complex64_t)-1.0, A, ldak,
B, ldak,
(parsec_complex64_t) 1.0, C, ldam, parsec_body.index, parsec_body.stream);
#endif
#endif /* !defined(PARSEC_DRY_RUN) */

printf("GEMM %d %d %d is on device %d", m, n, k, parsec_body.index);
printlog("CORE_zgemm( %d, %d, %d )\n\t( %s, %s, %d, %d, %d, %f, A(%d,%d)[%p], %d, A(%d,%d)[%p], %d, %f, A(%d,%d)[%p], %d)\n",
m, n, k,
plasma_const( PlasmaConjTrans ), plasma_const( PlasmaNoTrans ),
descA->mb, tempnn, descA->nb,
-1.0, k, m, A, ldak,
k, n, B, ldak,
1.0, m, n, C, ldam);
}
END

BODY [type=CUDA
dyld=cublasZgemm dyldtype=cublas_zgemm_t
weight=(m+1-k)]
Expand Down
1 change: 1 addition & 0 deletions dplasma/tools/PrecisionGenerator/subs.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,7 @@
('CORE_s', 'CORE_d', 'CORE_s', 'CORE_d' ),
('core_ssy', 'core_dsy', 'core_che', 'core_zhe' ),
('core_s', 'core_d', 'core_c', 'core_z' ),
('omp_s', 'omp_d', 'omp_c', 'omp_z' ),
('coreblas_s', 'coreblas_d', 'coreblas_c', 'coreblas_z' ),
('cuda_s', 'cuda_d', 'cuda_s', 'cuda_d' ),
('cuda_s', 'cuda_d', 'cuda_c', 'cuda_z' ),
Expand Down
15 changes: 15 additions & 0 deletions parsec/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,20 @@ if (CUDA_FOUND)
endif( PARSEC_WITH_DEVEL_HEADERS )
endif (CUDA_FOUND)

if (PARSEC_GPU_WITH_OPENMP)
find_package(OpenMP 4.5)
set(PARSEC_HAVE_OPENMP ${OPENMP_FOUND})
if (OPENMP_FOUND)
set_source_files_properties(devices/openmp/dev_omp.c PROPERTIES COMPILE_FLAGS ${OpenMP_C_FLAGS})
list(APPEND SOURCES devices/openmp/dev_omp.c)
if( PARSEC_WITH_DEVEL_HEADERS )
install(FILES
${CMAKE_CURRENT_SOURCE_DIR}/devices/openmp/dev_omp.h
DESTINATION include/parsec/devices/openmp )
endif( PARSEC_WITH_DEVEL_HEADERS )
endif( OPENMP_FOUND )
endif( PARSEC_GPU_WITH_OPENMP )

if( PARSEC_PROF_GRAPHER )
list(APPEND SOURCES parsec_prof_grapher.c)
endif( PARSEC_PROF_GRAPHER )
Expand All @@ -112,6 +126,7 @@ if( BUILD_PARSEC )
if (MPI_C_FOUND)
set_target_properties(parsec PROPERTIES COMPILE_FLAGS "${MPI_C_COMPILE_FLAGS}")
endif (MPI_C_FOUND)
set_target_properties(parsec PROPERTIES LINK_FLAGS "${OpenMP_C_FLAGS}")
target_link_libraries(parsec parsec-base ${EXTRA_LIBS})

install(TARGETS parsec
Expand Down
8 changes: 8 additions & 0 deletions parsec/class/parsec_list.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,11 @@ OBJ_CLASS_INSTANCE(parsec_list_t, parsec_object_t,
parsec_list_construct, parsec_list_destruct);


/* To be called from GDB, not from actual code */
void parsec_list_debug_walker(parsec_list_t *list)
{
parsec_list_item_t *p = (parsec_list_item_t *)list->ghost_element.list_next;
while (p != &(list->ghost_element)) {
p = (parsec_list_item_t *)p->list_next;
}
}
Loading