diff --git a/parsec b/parsec index b3e7e24c..14b823a7 160000 --- a/parsec +++ b/parsec @@ -1 +1 @@ -Subproject commit b3e7e24c4ab42076ee39a520f1540a9fe6b553db +Subproject commit 14b823a7ff9b443868b7cd8d62373e3998f8f558 diff --git a/src/cores/CMakeLists.txt b/src/cores/CMakeLists.txt index 33cb1347..f6986f5c 100644 --- a/src/cores/CMakeLists.txt +++ b/src/cores/CMakeLists.txt @@ -2,6 +2,7 @@ # Copyright (c) 2011-2022 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. +# Copyright (c) 2026 NVIDIA Corporation. All rights reserved. # set(ZHEADERS dplasma_zcores.h core_zblas.h @@ -94,6 +95,9 @@ if(DPLASMA_HAVE_CUDA) set(ZHEADERS_CUDA "") set(ZSOURCES_CUDA dplasma_cuda_ztsmqr.c) endif() +if(DPLASMA_HAVE_HIP) +set(ZSOURCES_HIP dplasma_hip_ztsmqr.c) +endif() include(PrecisionGenerator) # reset variables @@ -102,6 +106,7 @@ set(generated_cores_cz "") set(generated_cores_sdcz "") set(generated_cores_headers "") set(generated_cores_cuda "") +set(generated_cores_hip "") ### Generate the dplasma_cores for all required precisions precisions_rules_py(generated_cores @@ -185,3 +190,31 @@ if( DPLASMA_HAVE_CUDA ) endif( NOT "${generated_cores_cuda}" STREQUAL "") endif( DPLASMA_HAVE_CUDA ) +if( DPLASMA_HAVE_HIP ) + # generate the cores_hip library (hooks to hipBLAS) + precisions_rules_py(generated_cores_hip + ${ZSOURCES_HIP} + PRECISIONS "${DPLASMA_PRECISIONS}") + + if( NOT "${generated_cores_hip}" STREQUAL "") + add_documented_files(PROJECT DPLASMA DIR ${CMAKE_CURRENT_BINARY_DIR} FILES ${generated_cores_hip}) + + add_library(dplasma_cores_hip OBJECT ${generated_cores_hip}) + + add_dependencies(dplasma_cores_hip dplasma_includes + dplasma_cores_includes) + set_target_properties(dplasma_cores_hip PROPERTIES POSITION_INDEPENDENT_CODE ${BUILD_SHARED_LIBS}) + target_include_directories(dplasma_cores_hip + PRIVATE + ${CMAKE_CURRENT_BINARY_DIR} + $<$:${CMAKE_CURRENT_SOURCE_DIR}>) + target_link_libraries(dplasma_cores_hip + PRIVATE + PaRSEC::parsec + LAPACKE::LAPACKE + m + roc::hipblas) + + target_sources(dplasma PRIVATE $) + endif( NOT "${generated_cores_hip}" STREQUAL "") +endif( DPLASMA_HAVE_HIP ) diff --git a/src/cores/dplasma_cuda_ztsmqr.c b/src/cores/dplasma_cuda_ztsmqr.c index 21e8f190..c3e68182 100644 --- a/src/cores/dplasma_cuda_ztsmqr.c +++ b/src/cores/dplasma_cuda_ztsmqr.c @@ -9,15 +9,19 @@ * @version 2.7.1 * @author Mathieu Faverge * @date 2010-11-15 + * + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. **/ /* * @precisions normal z -> c d s */ -#include -#include "common.h" - +#include #include +#include + +#include "common.h" +#include "dplasmaaux_cuda.h" int dplasma_cuda_zparfb(PLASMA_enum side, PLASMA_enum trans, @@ -31,6 +35,7 @@ dplasma_cuda_zparfb(PLASMA_enum side, PLASMA_enum trans, const PLASMA_Complex64_t *T, int LDT, PLASMA_Complex64_t *WORK, int LDWORK, PLASMA_Complex64_t *WORKC, int LDWORKC, + cublasHandle_t handle, cudaStream_t stream) { #if defined(PRECISION_z) || defined(PRECISION_c) @@ -43,9 +48,15 @@ dplasma_cuda_zparfb(PLASMA_enum side, PLASMA_enum trans, double mzone = -1.0; #endif /* defined(PRECISION_z) || defined(PRECISION_c) */ + cublasStatus_t cublas_status; + cudaError_t cuda_status; int j; (void)L; + cublas_status = cublasSetStream(handle, stream); + DPLASMA_CUBLAS_CHECK_STATUS("cublasSetStream ", cublas_status, + { return PLASMA_ERR_UNEXPECTED; }); + /* Check input arguments */ if ((side != PlasmaLeft) && (side != PlasmaRight)) { return -1; @@ -99,66 +110,88 @@ dplasma_cuda_zparfb(PLASMA_enum side, PLASMA_enum trans, * W = W + V' * A2 * */ - cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex), - A1, LDA1 * sizeof(cuDoubleComplex), - K * sizeof(cuDoubleComplex), N1, - cudaMemcpyDeviceToDevice, stream ); + cuda_status = cudaMemcpy2DAsync(WORK, LDWORK * sizeof(PLASMA_Complex64_t), + A1, LDA1 * sizeof(PLASMA_Complex64_t), + K * sizeof(PLASMA_Complex64_t), N1, + cudaMemcpyDeviceToDevice, stream); + PARSEC_CUDA_CHECK_ERROR("cudaMemcpy2DAsync ", cuda_status, + { return PLASMA_ERR_UNEXPECTED; }); - cublasZgemm(lapack_const(PlasmaConjTrans), 'N', - K, N1, M2, - zone, - (cuDoubleComplex*)V /* K*M2 */, LDV, - (cuDoubleComplex*)A2 /* M2*N1 */, LDA2, - zone, - (cuDoubleComplex*)WORK /* K*N1 */, LDWORK); + cublas_status = cublasZgemm_v2(handle, + dplasma_cublas_op(PlasmaConjTrans), CUBLAS_OP_N, + K, N1, M2, + &zone, + (const cuDoubleComplex*)V /* K*M2 */, LDV, + (const cuDoubleComplex*)A2 /* M2*N1 */, LDA2, + &zone, + (cuDoubleComplex*)WORK /* K*N1 */, LDWORK); + DPLASMA_CUBLAS_CHECK_STATUS("cublasZgemm_v2 ", cublas_status, + { return PLASMA_ERR_UNEXPECTED; }); if (WORKC == NULL) { /* W = op(T) * W */ - cublasZtrmm( 'L', 'U', - lapack_const(trans), 'N', - K, N2, - zone, - (cuDoubleComplex*)T, LDT, - (cuDoubleComplex*)WORK, LDWORK); - + cublas_status = cublasZtrmm_v2(handle, + CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, + dplasma_cublas_op(trans), CUBLAS_DIAG_NON_UNIT, + K, N2, + &zone, + (const cuDoubleComplex*)T, LDT, + (const cuDoubleComplex*)WORK, LDWORK, + (cuDoubleComplex*)WORK, LDWORK); + DPLASMA_CUBLAS_CHECK_STATUS("cublasZtrmm_v2 ", cublas_status, + { return PLASMA_ERR_UNEXPECTED; }); /* A1 = A1 - W = A1 - op(T) * W */ for(j = 0; j < N1; j++) { - cublasZaxpy(K, mzone, - (cuDoubleComplex*)(WORK + LDWORK*j), 1, - (cuDoubleComplex*)(A1 + LDA1*j), 1); + cublas_status = cublasZaxpy_v2(handle, K, &mzone, + (const cuDoubleComplex*)(WORK + LDWORK*j), 1, + (cuDoubleComplex*)(A1 + LDA1*j), 1); + DPLASMA_CUBLAS_CHECK_STATUS("cublasZaxpy_v2 ", cublas_status, + { return PLASMA_ERR_UNEXPECTED; }); } /* A2 = A2 - op(V) * W */ - cublasZgemm('N', 'N', - M2, N2, K, - mzone, - (cuDoubleComplex*)V /* M2*K */, LDV, - (cuDoubleComplex*)WORK /* K*N2 */, LDWORK, - zone, - (cuDoubleComplex*)A2 /* m2*N2 */, LDA2); + cublas_status = cublasZgemm_v2(handle, + CUBLAS_OP_N, CUBLAS_OP_N, + M2, N2, K, + &mzone, + (const cuDoubleComplex*)V /* M2*K */, LDV, + (const cuDoubleComplex*)WORK /* K*N2 */, LDWORK, + &zone, + (cuDoubleComplex*)A2 /* m2*N2 */, LDA2); + DPLASMA_CUBLAS_CHECK_STATUS("cublasZgemm_v2 ", cublas_status, + { return PLASMA_ERR_UNEXPECTED; }); } else { /* Wc = V * op(T) */ - cublasZgemm( 'N', lapack_const(trans), - M2, K, K, - zone, (cuDoubleComplex*)V, LDV, - (cuDoubleComplex*)T, LDT, - zzero, (cuDoubleComplex*)WORKC, LDWORKC ); + cublas_status = cublasZgemm_v2(handle, + CUBLAS_OP_N, dplasma_cublas_op(trans), + M2, K, K, + &zone, (const cuDoubleComplex*)V, LDV, + (const cuDoubleComplex*)T, LDT, + &zzero, (cuDoubleComplex*)WORKC, LDWORKC); + DPLASMA_CUBLAS_CHECK_STATUS("cublasZgemm_v2 ", cublas_status, + { return PLASMA_ERR_UNEXPECTED; }); /* A1 = A1 - opt(T) * W */ - cublasZgemm( lapack_const(trans), 'N', - K, N1, K, - mzone, (cuDoubleComplex*)T, LDT, - (cuDoubleComplex*)WORK, LDWORK, - zone, (cuDoubleComplex*)A1, LDA1 ); + cublas_status = cublasZgemm_v2(handle, + dplasma_cublas_op(trans), CUBLAS_OP_N, + K, N1, K, + &mzone, (const cuDoubleComplex*)T, LDT, + (const cuDoubleComplex*)WORK, LDWORK, + &zone, (cuDoubleComplex*)A1, LDA1); + DPLASMA_CUBLAS_CHECK_STATUS("cublasZgemm_v2 ", cublas_status, + { return PLASMA_ERR_UNEXPECTED; }); /* A2 = A2 - Wc * W */ - cublasZgemm( 'N', 'N', - M2, N2, K, - mzone, (cuDoubleComplex*)WORKC, LDWORKC, - (cuDoubleComplex*)WORK, LDWORK, - zone, (cuDoubleComplex*)A2, LDA2 ); + cublas_status = cublasZgemm_v2(handle, + CUBLAS_OP_N, CUBLAS_OP_N, + M2, N2, K, + &mzone, (const cuDoubleComplex*)WORKC, LDWORKC, + (const cuDoubleComplex*)WORK, LDWORK, + &zone, (cuDoubleComplex*)A2, LDA2); + DPLASMA_CUBLAS_CHECK_STATUS("cublasZgemm_v2 ", cublas_status, + { return PLASMA_ERR_UNEXPECTED; }); } } else { @@ -192,6 +225,7 @@ dplasma_cuda_ztsmqr( PLASMA_enum side, PLASMA_enum trans, const PLASMA_Complex64_t *T, int LDT, PLASMA_Complex64_t *WORK, int LDWORK, PLASMA_Complex64_t *WORKC, int LDWORKC, + cublasHandle_t handle, cudaStream_t stream) { int i, i1, i3; @@ -201,6 +235,7 @@ dplasma_cuda_ztsmqr( PLASMA_enum side, PLASMA_enum trans, int jc = 0; int mi = M1; int ni = N1; + int rc; /* Check input arguments */ if ((side != PlasmaLeft) && (side != PlasmaRight)) { @@ -292,13 +327,17 @@ dplasma_cuda_ztsmqr( PLASMA_enum side, PLASMA_enum trans, /* * Apply H or H' (NOTE: CORE_zparfb used to be CORE_ztsrfb) */ - dplasma_cuda_zparfb( side, trans, PlasmaForward, PlasmaColumnwise, - mi, ni, M2, N2, kb, 0, - A1 + LDA1*jc+ic, LDA1, - A2, LDA2, - V + LDV*i, LDV, - T + LDT*i, LDT, - WORK, LDWORK, WORKC, LDWORKC, stream ); + rc = dplasma_cuda_zparfb( side, trans, PlasmaForward, PlasmaColumnwise, + mi, ni, M2, N2, kb, 0, + A1 + LDA1*jc+ic, LDA1, + A2, LDA2, + V + LDV*i, LDV, + T + LDT*i, LDT, + WORK, LDWORK, WORKC, LDWORKC, + handle, stream ); + if (PLASMA_SUCCESS != rc) { + return rc; + } } return PLASMA_SUCCESS; } diff --git a/src/cores/dplasma_hip_ztsmqr.c b/src/cores/dplasma_hip_ztsmqr.c new file mode 100644 index 00000000..bc66b6c7 --- /dev/null +++ b/src/cores/dplasma_hip_ztsmqr.c @@ -0,0 +1,346 @@ +/** + * + * @file dplasma_hip_ztsmqr.c + * + * PLASMA core_blas kernel + * PLASMA is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.7.1 + * @author Mathieu Faverge + * @date 2010-11-15 + * + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. + **/ +/* + * @precisions normal z -> c d s + */ +#include +#include + +#include + +#include "common.h" +#include "dplasmaaux_hip.h" + +int +dplasma_hip_zparfb(PLASMA_enum side, PLASMA_enum trans, + PLASMA_enum direct, PLASMA_enum storev, + int M1, int N1, + int M2, int N2, + int K, int L, + PLASMA_Complex64_t *A1, int LDA1, + PLASMA_Complex64_t *A2, int LDA2, + const PLASMA_Complex64_t *V, int LDV, + const PLASMA_Complex64_t *T, int LDT, + PLASMA_Complex64_t *WORK, int LDWORK, + PLASMA_Complex64_t *WORKC, int LDWORKC, + hipblasHandle_t handle, + hipStream_t stream) +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex zzero = { 0.0, 0.0 }; + hipblasDoubleComplex zone = { 1.0, 0.0 }; + hipblasDoubleComplex mzone = { -1.0, 0.0 }; +#else + double zzero = 0.0; + double zone = 1.0; + double mzone = -1.0; +#endif /* defined(PRECISION_z) || defined(PRECISION_c) */ + + hipblasStatus_t hipblas_status; + hipError_t hip_status; + int j; + (void)L; + + hipblas_status = hipblasSetStream(handle, stream); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasSetStream ", hipblas_status, + { return PLASMA_ERR_UNEXPECTED; }); + + /* Check input arguments */ + if ((side != PlasmaLeft) && (side != PlasmaRight)) { + return -1; + } + if ((trans != PlasmaNoTrans) && (trans != PlasmaConjTrans)) { + return -2; + } + if ((direct != PlasmaForward) && (direct != PlasmaBackward)) { + return -3; + } + if ((storev != PlasmaColumnwise) && (storev != PlasmaRowwise)) { + return -4; + } + if (M1 < 0) { + return -5; + } + if (N1 < 0) { + return -6; + } + if ((M2 < 0) || + ( (side == PlasmaRight) && (M1 != M2) ) ) { + return -7; + } + if ((N2 < 0) || + ( (side == PlasmaLeft) && (N1 != N2) ) ) { + return -8; + } + if (K < 0) { + return -9; + } + + /* Quick return */ + if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0)) + return PLASMA_SUCCESS; + + if (direct == PlasmaForward) { + + if (side == PlasmaLeft) { + + /* + * Column or Rowwise / Forward / Left + * ---------------------------------- + * + * Form H * A or H' * A where A = ( A1 ) + * ( A2 ) + */ + + /* + * W = A1 + V' * A2: + * W = A1 + * W = W + V' * A2 + * + */ + hip_status = hipMemcpy2DAsync(WORK, LDWORK * sizeof(PLASMA_Complex64_t), + A1, LDA1 * sizeof(PLASMA_Complex64_t), + K * sizeof(PLASMA_Complex64_t), N1, + hipMemcpyDeviceToDevice, stream); + if (hipSuccess != hip_status) { + parsec_warning("%s:%d hipMemcpy2DAsync %s", + __FILE__, __LINE__, hipGetErrorString(hip_status)); + return PLASMA_ERR_UNEXPECTED; + } + + hipblas_status = hipblasZgemm(handle, + dplasma_hipblas_op(PlasmaConjTrans), HIPBLAS_OP_N, + K, N1, M2, + &zone, + (const hipblasDoubleComplex*)V /* K*M2 */, LDV, + (const hipblasDoubleComplex*)A2 /* M2*N1 */, LDA2, + &zone, + (hipblasDoubleComplex*)WORK /* K*N1 */, LDWORK); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZgemm ", hipblas_status, + { return PLASMA_ERR_UNEXPECTED; }); + + if (WORKC == NULL) { + /* W = op(T) * W */ + hipblas_status = hipblasZtrmm(handle, + HIPBLAS_SIDE_LEFT, HIPBLAS_FILL_MODE_UPPER, + dplasma_hipblas_op(trans), HIPBLAS_DIAG_NON_UNIT, + K, N2, + &zone, + (const hipblasDoubleComplex*)T, LDT, + (const hipblasDoubleComplex*)WORK, LDWORK, + (hipblasDoubleComplex*)WORK, LDWORK); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZtrmm ", hipblas_status, + { return PLASMA_ERR_UNEXPECTED; }); + + /* A1 = A1 - W = A1 - op(T) * W */ + for(j = 0; j < N1; j++) { + hipblas_status = hipblasZaxpy(handle, K, &mzone, + (const hipblasDoubleComplex*)(WORK + LDWORK*j), 1, + (hipblasDoubleComplex*)(A1 + LDA1*j), 1); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZaxpy ", hipblas_status, + { return PLASMA_ERR_UNEXPECTED; }); + } + + /* A2 = A2 - op(V) * W */ + hipblas_status = hipblasZgemm(handle, + HIPBLAS_OP_N, HIPBLAS_OP_N, + M2, N2, K, + &mzone, + (const hipblasDoubleComplex*)V /* M2*K */, LDV, + (const hipblasDoubleComplex*)WORK /* K*N2 */, LDWORK, + &zone, + (hipblasDoubleComplex*)A2 /* m2*N2 */, LDA2); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZgemm ", hipblas_status, + { return PLASMA_ERR_UNEXPECTED; }); + + } else { + /* Wc = V * op(T) */ + hipblas_status = hipblasZgemm(handle, + HIPBLAS_OP_N, dplasma_hipblas_op(trans), + M2, K, K, + &zone, (const hipblasDoubleComplex*)V, LDV, + (const hipblasDoubleComplex*)T, LDT, + &zzero, (hipblasDoubleComplex*)WORKC, LDWORKC); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZgemm ", hipblas_status, + { return PLASMA_ERR_UNEXPECTED; }); + + /* A1 = A1 - opt(T) * W */ + hipblas_status = hipblasZgemm(handle, + dplasma_hipblas_op(trans), HIPBLAS_OP_N, + K, N1, K, + &mzone, (const hipblasDoubleComplex*)T, LDT, + (const hipblasDoubleComplex*)WORK, LDWORK, + &zone, (hipblasDoubleComplex*)A1, LDA1); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZgemm ", hipblas_status, + { return PLASMA_ERR_UNEXPECTED; }); + + /* A2 = A2 - Wc * W */ + hipblas_status = hipblasZgemm(handle, + HIPBLAS_OP_N, HIPBLAS_OP_N, + M2, N2, K, + &mzone, (const hipblasDoubleComplex*)WORKC, LDWORKC, + (const hipblasDoubleComplex*)WORK, LDWORK, + &zone, (hipblasDoubleComplex*)A2, LDA2); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZgemm ", hipblas_status, + { return PLASMA_ERR_UNEXPECTED; }); + } + } + else { + /* + * Column or Rowwise / Forward / Right + * ----------------------------------- + * + * Form H * A or H' * A where A = ( A1 A2 ) + * + */ + fprintf(stderr, "Not implemented (Column or Rowwise / Forward / Right)"); + return PLASMA_ERR_NOT_SUPPORTED; + } + } + else { + fprintf(stderr, "Not implemented (Backward / Left or Right)"); + return PLASMA_ERR_NOT_SUPPORTED; + } + + return PLASMA_SUCCESS; +} + +int +dplasma_hip_ztsmqr( PLASMA_enum side, PLASMA_enum trans, + int M1, int N1, + int M2, int N2, + int K, int IB, + PLASMA_Complex64_t *A1, int LDA1, + PLASMA_Complex64_t *A2, int LDA2, + const PLASMA_Complex64_t *V, int LDV, + const PLASMA_Complex64_t *T, int LDT, + PLASMA_Complex64_t *WORK, int LDWORK, + PLASMA_Complex64_t *WORKC, int LDWORKC, + hipblasHandle_t handle, + hipStream_t stream) +{ + int i, i1, i3; + int NQ, NW; + int kb; + int ic = 0; + int jc = 0; + int mi = M1; + int ni = N1; + int rc; + + /* Check input arguments */ + if ((side != PlasmaLeft) && (side != PlasmaRight)) { + return -1; + } + + /* NQ is the order of Q */ + if (side == PlasmaLeft) { + NQ = M2; + NW = IB; + } + else { + NQ = N2; + NW = M1; + } + + if ((trans != PlasmaNoTrans) && (trans != PlasmaConjTrans)) { + return -2; + } + if (M1 < 0) { + return -3; + } + if (N1 < 0) { + return -4; + } + if ( (M2 < 0) || + ( (M2 != M1) && (side == PlasmaRight) ) ){ + return -5; + } + if ( (N2 < 0) || + ( (N2 != N1) && (side == PlasmaLeft) ) ){ + return -6; + } + if ((K < 0) || + ( (side == PlasmaLeft) && (K > M1) ) || + ( (side == PlasmaRight) && (K > N1) ) ) { + return -7; + } + if (IB < 0) { + return -8; + } + if (LDA1 < max(1,M1)){ + return -10; + } + if (LDA2 < max(1,M2)){ + return -12; + } + if (LDV < max(1,NQ)){ + return -14; + } + if (LDT < max(1,IB)){ + return -16; + } + if (LDWORK < max(1,NW)){ + return -18; + } + + /* Quick return */ + if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) + return PLASMA_SUCCESS; + + if (((side == PlasmaLeft) && (trans != PlasmaNoTrans)) + || ((side == PlasmaRight) && (trans == PlasmaNoTrans))) { + i1 = 0; + i3 = IB; + } + else { + i1 = ((K-1) / IB)*IB; + i3 = -IB; + } + + for(i = i1; (i > -1) && (i < K); i += i3) { + kb = min(IB, K-i); + + if (side == PlasmaLeft) { + /* + * H or H' is applied to C(i:m,1:n) + */ + mi = M1 - i; + ic = i; + } + else { + /* + * H or H' is applied to C(1:m,i:n) + */ + ni = N1 - i; + jc = i; + } + /* + * Apply H or H' (NOTE: CORE_zparfb used to be CORE_ztsrfb) + */ + rc = dplasma_hip_zparfb( side, trans, PlasmaForward, PlasmaColumnwise, + mi, ni, M2, N2, kb, 0, + A1 + LDA1*jc+ic, LDA1, + A2, LDA2, + V + LDV*i, LDV, + T + LDT*i, LDT, + WORK, LDWORK, WORKC, LDWORKC, + handle, stream ); + if (PLASMA_SUCCESS != rc) { + return rc; + } + } + return PLASMA_SUCCESS; +} diff --git a/src/cores/dplasma_zcores.h b/src/cores/dplasma_zcores.h index 1665d39b..49f71896 100644 --- a/src/cores/dplasma_zcores.h +++ b/src/cores/dplasma_zcores.h @@ -2,6 +2,7 @@ * Copyright (c) 2011-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * * @precisions normal z -> s d c * @@ -22,6 +23,7 @@ int blgchase_ztrdv2(int NT, int N, int NB, #if defined(DPLASMA_HAVE_CUDA) #include #include +#include int dplasma_cuda_zparfb(PLASMA_enum side, PLASMA_enum trans, PLASMA_enum direct, PLASMA_enum storev, @@ -34,6 +36,7 @@ int dplasma_cuda_zparfb(PLASMA_enum side, PLASMA_enum trans, const PLASMA_Complex64_t *T, int LDT, PLASMA_Complex64_t *WORK, int LDWORK, PLASMA_Complex64_t *WORKC, int LDWORKC, + cublasHandle_t handle, cudaStream_t stream); int dplasma_cuda_ztsmqr( PLASMA_enum side, PLASMA_enum trans, @@ -46,7 +49,40 @@ int dplasma_cuda_ztsmqr( PLASMA_enum side, PLASMA_enum trans, const PLASMA_Complex64_t *T, int LDT, PLASMA_Complex64_t *WORK, int LDWORK, PLASMA_Complex64_t *WORKC, int LDWORKC, + cublasHandle_t handle, cudaStream_t stream); #endif /* defined(DPLASMA_HAVE_CUDA) */ +#if defined(DPLASMA_HAVE_HIP) +#include +#include + +int dplasma_hip_zparfb(PLASMA_enum side, PLASMA_enum trans, + PLASMA_enum direct, PLASMA_enum storev, + int M1, int N1, + int M2, int N2, + int K, int L, + PLASMA_Complex64_t *A1, int LDA1, + PLASMA_Complex64_t *A2, int LDA2, + const PLASMA_Complex64_t *V, int LDV, + const PLASMA_Complex64_t *T, int LDT, + PLASMA_Complex64_t *WORK, int LDWORK, + PLASMA_Complex64_t *WORKC, int LDWORKC, + hipblasHandle_t handle, + hipStream_t stream); + +int dplasma_hip_ztsmqr( PLASMA_enum side, PLASMA_enum trans, + int M1, int N1, + int M2, int N2, + int K, int IB, + PLASMA_Complex64_t *A1, int LDA1, + PLASMA_Complex64_t *A2, int LDA2, + const PLASMA_Complex64_t *V, int LDV, + const PLASMA_Complex64_t *T, int LDT, + PLASMA_Complex64_t *WORK, int LDWORK, + PLASMA_Complex64_t *WORKC, int LDWORKC, + hipblasHandle_t handle, + hipStream_t stream); +#endif /* defined(DPLASMA_HAVE_HIP) */ + #endif /* _DPLASMA_Z_CORES_ */ diff --git a/src/zgemm_NN.jdf b/src/zgemm_NN.jdf index c0debf8e..a45dfffc 100644 --- a/src/zgemm_NN.jdf +++ b/src/zgemm_NN.jdf @@ -3,14 +3,12 @@ extern "C" %{ * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" @@ -84,6 +82,9 @@ Q [type = "int" hidden=on default="-1"] lookP [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] lookQ [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] +hip_handles_infokey [type = "int" hidden = on default = -1 ] + /************************************************** * READ_A * **************************************************/ @@ -181,14 +182,49 @@ BODY [type=CUDA] int ldcm = LDA(ddescC, C); cublasStatus_t status; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( dplasma_lapack_const(transA), dplasma_lapack_const(transB), + dplasma_cuda_handles_t *handles; + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, dplasma_cublas_op(transA), dplasma_cublas_op(transB), tempmm, tempnn, tempkk, - lalpha, (cuDoubleComplex*)A, ldam, - (cuDoubleComplex*)B, ldbk, - lbeta, (cuDoubleComplex*)C, ldcm ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + &lalpha, A, ldam, + B, ldbk, + &lbeta, C, ldcm ); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex lalpha; + lalpha.x = creal(alpha); lalpha.y = cimag(alpha); + hipblasDoubleComplex lbeta = { 1., 0. }; + if(k == 0) { + lbeta.x = creal(beta); lbeta.y = cimag(beta); + } +#else + double lalpha = alpha; + double lbeta = (k == 0) ? beta : 1.0; +#endif + int tempmm = m == descC->mt-1 ? descC->m - m * descC->mb : descC->mb; + int tempnn = n == descC->nt-1 ? descC->n - n * descC->nb : descC->nb; + int tempkk = k == descA->nt-1 ? descA->n - k * descA->nb : descA->nb; + int ldam = LDA(ddescA, A); + int ldbk = LDA(ddescB, B); + int ldcm = LDA(ddescC, C); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, + dplasma_hipblas_op(transA), dplasma_hipblas_op(transB), + tempmm, tempnn, tempkk, + &lalpha, (hipblasDoubleComplex*)A, ldam, + (hipblasDoubleComplex*)B, ldbk, + &lbeta, (hipblasDoubleComplex*)C, ldcm ); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/zgemm_NN_gpu.jdf b/src/zgemm_NN_gpu.jdf index a0713a25..02f7a9ac 100644 --- a/src/zgemm_NN_gpu.jdf +++ b/src/zgemm_NN_gpu.jdf @@ -3,15 +3,13 @@ extern "C" %{ * Copyright (c) 2017-2025 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" @@ -169,6 +167,7 @@ xMax [ type = int default = "-1" hidden=on ] yMax [ type = int default = "-1" hidden=on ] zMax [ type = int default = "-1" hidden=on ] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] hip_handles_infokey [type = "int" hidden = on default = -1 ] /********************************************************* @@ -425,14 +424,16 @@ BODY [type=CUDA] m, n, k, cAmb, cAnb, cBmb, cBnb, cCmb, cCnb); cublasStatus_t status; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( dplasma_lapack_const(transA), dplasma_lapack_const(transB), + dplasma_cuda_handles_t *handles; + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, dplasma_cublas_op(transA), dplasma_cublas_op(transB), tempmm, tempnn, tempkk, - lalpha, (cuDoubleComplex*)A, ldam, - (cuDoubleComplex*)B, ldbk, - lbeta, (cuDoubleComplex*)C, ldcm ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + &lalpha, A, ldam, + B, ldbk, + &lbeta, C, ldcm ); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); /* Quick and dirty emulation of the next GEMM */ if( k == descC->mt -1 ) { diff --git a/src/zgemm_NN_summa.jdf b/src/zgemm_NN_summa.jdf index 02c6cc9c..f52a1e50 100644 --- a/src/zgemm_NN_summa.jdf +++ b/src/zgemm_NN_summa.jdf @@ -3,14 +3,12 @@ extern "C" %{ * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" @@ -83,6 +81,7 @@ Q [type = "int" hidden=on default="((parsec_matrix_block_cyclic_t*)descC)-> lookP [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] lookQ [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] hip_handles_infokey [type = "int" hidden = on default = -1 ] /************************************************** @@ -239,14 +238,16 @@ BODY [type=CUDA creal(lbeta), m, n, ldcm ); cublasStatus_t status; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( dplasma_lapack_const(transA), dplasma_lapack_const(transB), + dplasma_cuda_handles_t *handles; + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, dplasma_cublas_op(transA), dplasma_cublas_op(transB), tempmm, tempnn, tempkk, - lalpha, (cuDoubleComplex*)A, ldam, - (cuDoubleComplex*)B, ldbk, - lbeta, (cuDoubleComplex*)C, ldcm ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + &lalpha, A, ldam, + B, ldbk, + &lbeta, C, ldcm ); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/zgemm_NT.jdf b/src/zgemm_NT.jdf index bdf40a3b..fc0b8cbf 100644 --- a/src/zgemm_NT.jdf +++ b/src/zgemm_NT.jdf @@ -3,14 +3,12 @@ extern "C" %{ * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" @@ -84,6 +82,9 @@ Q [type = "int" hidden=on default="-1"] lookP [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] lookQ [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] +hip_handles_infokey [type = "int" hidden = on default = -1 ] + /************************************************** * READ_A * **************************************************/ @@ -186,14 +187,55 @@ BODY [type=CUDA int ldcm = descC->mb; cublasStatus_t status; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( dplasma_lapack_const(transA), dplasma_lapack_const(transB), + dplasma_cuda_handles_t *handles; + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, dplasma_cublas_op(transA), dplasma_cublas_op(transB), tempmm, tempnn, tempkk, - lalpha, (cuDoubleComplex*)A, ldam, - (cuDoubleComplex*)B, ldbn, - lbeta, (cuDoubleComplex*)C, ldcm ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + &lalpha, A, ldam, + B, ldbn, + &lbeta, C, ldcm ); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP + A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} + C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} + A.dc=ddescA B.dc=ddescB C.dc=ddescC + stage_in=dplasma_hip_lapack_stage_in + stage_out=dplasma_hip_lapack_stage_out] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex lalpha; + lalpha.x = creal(alpha); lalpha.y = cimag(alpha); + hipblasDoubleComplex lbeta = { 1., 0. }; + if(k == 0) { + lbeta.x = creal(beta); lbeta.y = cimag(beta); + } +#else + double lalpha = alpha; + double lbeta = (k == 0) ? beta : 1.0; +#endif + int tempmm = m == descC->mt-1 ? descC->m - m * descC->mb : descC->mb; + int tempnn = n == descC->nt-1 ? descC->n - n * descC->nb : descC->nb; + int tempkk = k == descA->nt-1 ? descA->n - k * descA->nb : descA->nb; + int ldam = descA->mb; + int ldbn = descB->mb; + int ldcm = descC->mb; + + hipblasStatus_t status; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, + dplasma_hipblas_op(transA), dplasma_hipblas_op(transB), + tempmm, tempnn, tempkk, + &lalpha, (hipblasDoubleComplex*)A, ldam, + (hipblasDoubleComplex*)B, ldbn, + &lbeta, (hipblasDoubleComplex*)C, ldcm ); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/zgemm_NT_summa.jdf b/src/zgemm_NT_summa.jdf index bf9924bf..de088571 100644 --- a/src/zgemm_NT_summa.jdf +++ b/src/zgemm_NT_summa.jdf @@ -3,14 +3,12 @@ extern "C" %{ * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" @@ -83,6 +81,7 @@ Q [type = "int" hidden=on default="((parsec_matrix_block_cyclic_t*)descC)-> lookP [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] lookQ [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] hip_handles_infokey [type = "int" hidden = on default = -1 ] /************************************************** @@ -239,14 +238,16 @@ BODY [type=CUDA creal(lbeta), m, n, ldcm ); cublasStatus_t status; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( dplasma_lapack_const(transA), dplasma_lapack_const(transB), + dplasma_cuda_handles_t *handles; + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, dplasma_cublas_op(transA), dplasma_cublas_op(transB), tempmm, tempnn, tempkk, - lalpha, (cuDoubleComplex*)A, ldam, - (cuDoubleComplex*)B, ldbn, - lbeta, (cuDoubleComplex*)C, ldcm ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + &lalpha, A, ldam, + B, ldbn, + &lbeta, C, ldcm ); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/zgemm_TN.jdf b/src/zgemm_TN.jdf index c46b8d43..d132b25e 100644 --- a/src/zgemm_TN.jdf +++ b/src/zgemm_TN.jdf @@ -3,14 +3,12 @@ extern "C" %{ * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" @@ -84,6 +82,9 @@ Q [type = "int" hidden=on default="-1"] lookP [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] lookQ [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] +hip_handles_infokey [type = "int" hidden = on default = -1 ] + /************************************************** * READ_A * **************************************************/ @@ -186,14 +187,55 @@ BODY [type=CUDA int ldcm = descC->mb; cublasStatus_t status; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( dplasma_lapack_const(transA), dplasma_lapack_const(transB), + dplasma_cuda_handles_t *handles; + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, dplasma_cublas_op(transA), dplasma_cublas_op(transB), tempmm, tempnn, tempkk, - lalpha, (cuDoubleComplex*)A, ldak, - (cuDoubleComplex*)B, ldbk, - lbeta, (cuDoubleComplex*)C, ldcm ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + &lalpha, A, ldak, + B, ldbk, + &lbeta, C, ldcm ); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP + A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} + C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} + A.dc=ddescA B.dc=ddescB C.dc=ddescC + stage_in=dplasma_hip_lapack_stage_in + stage_out=dplasma_hip_lapack_stage_out] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex lalpha; + lalpha.x = creal(alpha); lalpha.y = cimag(alpha); + hipblasDoubleComplex lbeta = { 1., 0. }; + if(k == 0) { + lbeta.x = creal(beta); lbeta.y = cimag(beta); + } +#else + double lalpha = alpha; + double lbeta = (k == 0) ? beta : 1.0; +#endif + int tempmm = m == descC->mt-1 ? descC->m - m * descC->mb : descC->mb; + int tempnn = n == descC->nt-1 ? descC->n - n * descC->nb : descC->nb; + int tempkk = k == descA->mt-1 ? descA->m - k * descA->mb : descA->mb; + int ldak = descA->mb; + int ldbk = descB->mb; + int ldcm = descC->mb; + + hipblasStatus_t status; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, + dplasma_hipblas_op(transA), dplasma_hipblas_op(transB), + tempmm, tempnn, tempkk, + &lalpha, (hipblasDoubleComplex*)A, ldak, + (hipblasDoubleComplex*)B, ldbk, + &lbeta, (hipblasDoubleComplex*)C, ldcm ); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/zgemm_TN_summa.jdf b/src/zgemm_TN_summa.jdf index 2d962d6d..13b8cb24 100644 --- a/src/zgemm_TN_summa.jdf +++ b/src/zgemm_TN_summa.jdf @@ -3,14 +3,12 @@ extern "C" %{ * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" @@ -83,6 +81,7 @@ Q [type = "int" hidden=on default="((parsec_matrix_block_cyclic_t*)descC)-> lookP [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] lookQ [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] hip_handles_infokey [type = "int" hidden = on default = -1 ] /************************************************** @@ -238,14 +237,16 @@ BODY [type=CUDA creal(lbeta), m, n, ldcm ); cublasStatus_t status; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( dplasma_lapack_const(transA), dplasma_lapack_const(transB), + dplasma_cuda_handles_t *handles; + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, dplasma_cublas_op(transA), dplasma_cublas_op(transB), tempmm, tempnn, tempkk, - lalpha, (cuDoubleComplex*)A, ldak, - (cuDoubleComplex*)B, ldbk, - lbeta, (cuDoubleComplex*)C, ldcm ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + &lalpha, A, ldak, + B, ldbk, + &lbeta, C, ldcm ); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/zgemm_TT.jdf b/src/zgemm_TT.jdf index e5035afa..94109bed 100644 --- a/src/zgemm_TT.jdf +++ b/src/zgemm_TT.jdf @@ -3,14 +3,12 @@ extern "C" %{ * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" @@ -84,6 +82,9 @@ Q [type = "int" hidden=on default="-1"] lookP [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] lookQ [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] +hip_handles_infokey [type = "int" hidden = on default = -1 ] + /************************************************** * READ_A * **************************************************/ @@ -186,14 +187,55 @@ BODY [type=CUDA int ldcm = descC->mb; cublasStatus_t status; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( dplasma_lapack_const(transA), dplasma_lapack_const(transB), + dplasma_cuda_handles_t *handles; + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, dplasma_cublas_op(transA), dplasma_cublas_op(transB), tempmm, tempnn, tempkk, - lalpha, (cuDoubleComplex*)A, ldam, - (cuDoubleComplex*)B, ldbn, - lbeta, (cuDoubleComplex*)C, ldcm ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + &lalpha, A, ldam, + B, ldbn, + &lbeta, C, ldcm ); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP + A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} + C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} + A.dc=ddescA B.dc=ddescB C.dc=ddescC + stage_in=dplasma_hip_lapack_stage_in + stage_out=dplasma_hip_lapack_stage_out] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex lalpha; + lalpha.x = creal(alpha); lalpha.y = cimag(alpha); + hipblasDoubleComplex lbeta = { 1., 0. }; + if(k == 0) { + lbeta.x = creal(beta); lbeta.y = cimag(beta); + } +#else + double lalpha = alpha; + double lbeta = (k == 0) ? beta : 1.0; +#endif + int tempmm = m == descC->mt-1 ? descC->m - m * descC->mb : descC->mb; + int tempnn = n == descC->nt-1 ? descC->n - n * descC->nb : descC->nb; + int tempkk = k == descA->nt-1 ? descA->m - k * descA->mb : descA->mb; + int ldam = descA->mb; + int ldbn = descB->mb; + int ldcm = descC->mb; + + hipblasStatus_t status; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, + dplasma_hipblas_op(transA), dplasma_hipblas_op(transB), + tempmm, tempnn, tempkk, + &lalpha, (hipblasDoubleComplex*)A, ldam, + (hipblasDoubleComplex*)B, ldbn, + &lbeta, (hipblasDoubleComplex*)C, ldcm ); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/zgemm_TT_summa.jdf b/src/zgemm_TT_summa.jdf index 1543f137..f0365ba8 100644 --- a/src/zgemm_TT_summa.jdf +++ b/src/zgemm_TT_summa.jdf @@ -3,14 +3,12 @@ extern "C" %{ * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" @@ -83,6 +81,7 @@ Q [type = "int" hidden=on default="((parsec_matrix_block_cyclic_t*)descC)-> lookP [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] lookQ [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] hip_handles_infokey [type = "int" hidden = on default = -1 ] /************************************************** @@ -238,17 +237,19 @@ BODY [type=CUDA creal(lbeta), m, n, ldcm ); cublasStatus_t status; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( dplasma_lapack_const(transA), dplasma_lapack_const(transB), + dplasma_cuda_handles_t *handles; + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, dplasma_cublas_op(transA), dplasma_cublas_op(transB), tempmm, tempnn, tempkk, - lalpha, (cuDoubleComplex*)A, ldam, - (cuDoubleComplex*)B, ldbn, - lbeta, (cuDoubleComplex*)C, ldcm ); - status = cublasGetError(); + &lalpha, A, ldam, + B, ldbn, + &lbeta, C, ldcm ); if(status != CUBLAS_STATUS_SUCCESS){ printf("ISSUE\n"); } - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/zgemm_wrapper.c b/src/zgemm_wrapper.c index 9534375c..88b80710 100644 --- a/src/zgemm_wrapper.c +++ b/src/zgemm_wrapper.c @@ -2,6 +2,7 @@ * Copyright (c) 2010-2025 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * Copyright (c) 2013 Inria. All rights reserved. * * @precisions normal z -> s d c @@ -130,6 +131,15 @@ dplasma_zgemm_summa_new(dplasma_enum_t transA, dplasma_enum_t transB, } } + +#if defined(DPLASMA_HAVE_CUDA) + ((parsec_zgemm_NN_summa_taskpool_t*)zgemm_tp)->_g_cuda_handles_infokey = + parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES", NULL); +#else + ((parsec_zgemm_NN_summa_taskpool_t*)zgemm_tp)->_g_cuda_handles_infokey = + PARSEC_INFO_ID_UNDEFINED; +#endif + int shape = 0; dplasma_setup_adtt_all_loc( ddc_A, parsec_datatype_double_complex_t, @@ -192,6 +202,22 @@ dplasma_zgemm_default_new(dplasma_enum_t transA, dplasma_enum_t transB, } } +#if defined(DPLASMA_HAVE_CUDA) + ((parsec_zgemm_NN_taskpool_t*)zgemm_tp)->_g_cuda_handles_infokey = + parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES", NULL); +#else + ((parsec_zgemm_NN_taskpool_t*)zgemm_tp)->_g_cuda_handles_infokey = + PARSEC_INFO_ID_UNDEFINED; +#endif + +#if defined(DPLASMA_HAVE_HIP) + ((parsec_zgemm_NN_taskpool_t*)zgemm_tp)->_g_hip_handles_infokey = + parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); +#else + ((parsec_zgemm_NN_taskpool_t*)zgemm_tp)->_g_hip_handles_infokey = + PARSEC_INFO_ID_UNDEFINED; +#endif + int shape = 0; dplasma_setup_adtt_all_loc( ddc_A, parsec_datatype_double_complex_t, @@ -391,6 +417,12 @@ dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB, K = B->mt; tp->_g_zMax = (K + d - 1) / d - 1; +#if defined(DPLASMA_HAVE_CUDA) + tp->_g_cuda_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES", NULL); +#else + tp->_g_cuda_handles_infokey = PARSEC_INFO_ID_UNDEFINED; +#endif + #if defined(DPLASMA_HAVE_HIP) /* It doesn't cost anything to define these infos if we have HIP but * don't have GPUs on the current machine, so we do it non-conditionally */ diff --git a/src/zgeqrf.jdf b/src/zgeqrf.jdf index 400dd61d..9267a21b 100644 --- a/src/zgeqrf.jdf +++ b/src/zgeqrf.jdf @@ -3,12 +3,15 @@ extern "C" %{ * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * Copyright (c) 2013 Inria. All rights reserved. * * @precisions normal z -> s d c * */ +#include "dplasma/config.h" #include "dplasmajdf.h" +#include "dplasmaaux.h" #include "parsec/data_dist/matrix/matrix.h" #if defined(PARSEC_HAVE_DEV_RECURSIVE_SUPPORT) @@ -20,9 +23,9 @@ static void ztsqrt_recursive_cb(parsec_taskpool_t* tp, const parsec_recursive_ca static void ztsmqr_recursive_cb(parsec_taskpool_t* tp, const parsec_recursive_callback_t* data); #endif /* PARSEC_HAVE_DEV_RECURSIVE_SUPPORT */ -#if defined(DPLASMA_HAVE_CUDA) +#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP) #include "cores/dplasma_zcores.h" -#endif /* defined(DPLASMA_HAVE_CUDA) */ +#endif /* defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP) */ /* Define the different shapes this JDF is using */ #define DEFAULT 0 @@ -71,6 +74,9 @@ p_tau [type = "parsec_memory_pool_t *" size = "(sizeof(dplasma_complex64_t) * smallnb [type = "int" hidden = on default = "descA->nb" ] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] +hip_handles_infokey [type = "int" hidden = on default = -1 ] + zgeqrt_typechange(k) [profile = off] /* Execution space */ k = 0 .. %{ return dplasma_imin((descA->nt-1),(descA->mt-1)); %} @@ -486,21 +492,71 @@ BODY [type=CUDA device=%{ return n; %} int ldam_A2 = descA->mb; int ldam_V = descA->mb; int ldam_T = descT->mb; + int rc; + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); WORK = parsec_device_pop_workspace(gpu_device, gpu_stream, descA->nb * ib * sizeof(dplasma_complex64_t)); WORKC = parsec_device_pop_workspace(gpu_device, gpu_stream, descA->mb * ib * sizeof(dplasma_complex64_t)); - dplasma_cuda_ztsmqr( dplasmaLeft, dplasmaConjTrans, - descA->mb, tempnn, tempmm, tempnn, descA->nb, ib, - A1 /* descA(k,n) */, ldak_A1, - A2 /* descA(m,n) */, ldam_A2, - V /* descA(m,k) */, ldam_V, - T /* descT(m,k) */, ldam_T, - WORK, ib, - WORKC, descA->mb, - parsec_body.stream ); - - parsec_device_push_workspace(gpu_device, gpu_stream); + rc = dplasma_cuda_ztsmqr( dplasmaLeft, dplasmaConjTrans, + descA->mb, tempnn, tempmm, tempnn, descA->nb, ib, + A1 /* descA(k,n) */, ldak_A1, + A2 /* descA(m,n) */, ldam_A2, + V /* descA(m,k) */, ldam_V, + T /* descT(m,k) */, ldam_T, + WORK, ib, + WORKC, descA->mb, + handles->cublas_handle, parsec_body.stream ); + + parsec_device_push_workspace(gpu_device, gpu_stream); + if( DPLASMA_SUCCESS != rc ) { + return PARSEC_HOOK_RETURN_ERROR; + } +} +END + +BODY [type=HIP device=%{ return n; %} + A1.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + A2.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + V.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + T.size=%{ return descT->mb*descT->nb*parsec_datadist_getsizeoftype(descT->mtype);%} + A1.dc=ddescA A2.dc=ddescA V.dc=ddescA T.dc=ddescT + stage_in=dplasma_hip_lapack_stage_in + stage_out=dplasma_hip_lapack_stage_out] +{ + dplasma_complex64_t *WORK, *WORKC; + int tempmm = ((m)==(descA->mt-1)) ? (descA->m-(m*descA->mb)) : descA->mb; + int tempnn = ((n)==(descA->nt-1)) ? (descA->n-(n*descA->nb)) : descA->nb; + int ldak_A1 = descA->mb; + int ldam_A2 = descA->mb; + int ldam_V = descA->mb; + int ldam_T = descT->mb; + int rc; + dplasma_hip_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + + WORK = parsec_device_pop_workspace(gpu_device, gpu_stream, descA->nb * ib * sizeof(dplasma_complex64_t)); + WORKC = parsec_device_pop_workspace(gpu_device, gpu_stream, descA->mb * ib * sizeof(dplasma_complex64_t)); + + rc = dplasma_hip_ztsmqr( dplasmaLeft, dplasmaConjTrans, + descA->mb, tempnn, tempmm, tempnn, descA->nb, ib, + A1 /* descA(k,n) */, ldak_A1, + A2 /* descA(m,n) */, ldam_A2, + V /* descA(m,k) */, ldam_V, + T /* descT(m,k) */, ldam_T, + WORK, ib, + WORKC, descA->mb, + handles->hipblas_handle, parsec_body.stream ); + + parsec_device_push_workspace(gpu_device, gpu_stream); + if( DPLASMA_SUCCESS != rc ) { + return PARSEC_HOOK_RETURN_ERROR; + } } END diff --git a/src/zgeqrf_wrapper.c b/src/zgeqrf_wrapper.c index 70f8bd0b..c4bd6f0b 100644 --- a/src/zgeqrf_wrapper.c +++ b/src/zgeqrf_wrapper.c @@ -2,6 +2,7 @@ * Copyright (c) 2011-2022 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * Copyright (c) 2013 Inria. All rights reserved. * * @precisions normal z -> s d c @@ -139,6 +140,19 @@ dplasma_zgeqrf_New( parsec_tiled_matrix_t *A, ddc_T, ib, NULL, NULL ); +#if defined(DPLASMA_HAVE_CUDA) + tp->_g_cuda_handles_infokey = + parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES", NULL); +#else + tp->_g_cuda_handles_infokey = PARSEC_INFO_ID_UNDEFINED; +#endif +#if defined(DPLASMA_HAVE_HIP) + tp->_g_hip_handles_infokey = + parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); +#else + tp->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED; +#endif + tp->_g_p_tau = (parsec_memory_pool_t*)malloc(sizeof(parsec_memory_pool_t)); parsec_private_memory_init( tp->_g_p_tau, T->nb * sizeof(dplasma_complex64_t) ); diff --git a/src/zgetrf_nopiv.jdf b/src/zgetrf_nopiv.jdf index da30423a..0eab0294 100644 --- a/src/zgetrf_nopiv.jdf +++ b/src/zgetrf_nopiv.jdf @@ -3,16 +3,15 @@ extern "C" %{ * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * Copyright (c) 2013 Inria. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" +#include "dplasmaaux.h" #include "parsec/data_dist/matrix/matrix.h" /* Define the different shapes this JDF is using */ @@ -41,6 +40,9 @@ ib [type = "int" hidden=on default="(32)" ] KT [type = "int" hidden=on default="(dplasma_imin( descA->mt, descA->nt )-1)" ] INFO [type = "int*" ] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] +hip_handles_infokey [type = "int" hidden = on default = -1 ] + zgetrf_nopiv(k) [flops = inline_c%{ return FLOPS_ZGETRF(CLEAN_MB(descA, k), CLEAN_NB(descA, k)); %}] /* Execution Space */ k = 0 .. KT @@ -216,19 +218,61 @@ BODY [type=CUDA int ldam_C = descA->mb; cublasStatus_t status; + dplasma_cuda_handles_t *handles; + + printloggpu("zgemm(%d, %d, %d)\n" + "\t(dplasmaNoTrans, dplasmaNoTrans, tempmm, tempnn, descA->mb, -1, A(%d,%d)[%p], ldam %d, A(%d,%d)[%p], ldak %d, 1.000000, A(%d,%d)[%p], ldam %d)\n", + k, n, m, m, k, A, ldam_A, k, n, B, ldak_B, m, n, C, ldam_C); + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, + tempmm, tempnn, descA->mb, + &mzone, A, ldam_A, + B, ldak_B, + &zone, C, ldam_C); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP + A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + B.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + C.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + A.dc=ddescA B.dc=ddescA C.dc=ddescA + stage_in=dplasma_hip_lapack_stage_in + stage_out=dplasma_hip_lapack_stage_out] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex zone = { 1., 0. }; + hipblasDoubleComplex mzone = { -1., 0. }; +#else + double zone = 1.; + double mzone = -1.; +#endif + + int tempmm = ((m)==(descA->mt-1)) ? (descA->m-(m*descA->mb)) : descA->mb; + int tempnn = ((n)==(descA->nt-1)) ? (descA->n-(n*descA->nb)) : descA->nb; + int ldam_A = descA->mb; + int ldak_B = descA->mb; + int ldam_C = descA->mb; + + hipblasStatus_t status; + dplasma_hip_handles_t *handles; printloggpu("zgemm(%d, %d, %d)\n" "\t(dplasmaNoTrans, dplasmaNoTrans, tempmm, tempnn, descA->mb, -1, A(%d,%d)[%p], ldam %d, A(%d,%d)[%p], ldak %d, 1.000000, A(%d,%d)[%p], ldam %d)\n", k, n, m, m, k, A, ldam_A, k, n, B, ldak_B, m, n, C, ldam_C); - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( 'N', 'N', + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, HIPBLAS_OP_N, HIPBLAS_OP_N, tempmm, tempnn, descA->mb, - mzone, (cuDoubleComplex*)A, ldam_A, - (cuDoubleComplex*)B, ldak_B, - zone, (cuDoubleComplex*)C, ldam_C); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + &mzone, (hipblasDoubleComplex*)A, ldam_A, + (hipblasDoubleComplex*)B, ldak_B, + &zone, (hipblasDoubleComplex*)C, ldam_C); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -251,4 +295,3 @@ BODY 1., C /* descA(m,n) */, ldam_C); } END - diff --git a/src/zgetrf_nopiv_wrapper.c b/src/zgetrf_nopiv_wrapper.c index cd5bc4bc..a9263b55 100644 --- a/src/zgetrf_nopiv_wrapper.c +++ b/src/zgetrf_nopiv_wrapper.c @@ -2,6 +2,7 @@ * Copyright (c) 2010-2022 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * Copyright (c) 2013 Inria. All rights reserved. * * @precisions normal z -> s d c @@ -76,6 +77,19 @@ dplasma_zgetrf_nopiv_New( parsec_tiled_matrix_t *A, parsec_zgetrf_nopiv_taskpool_t *parsec_getrf_nopiv; parsec_getrf_nopiv = parsec_zgetrf_nopiv_new( ddc_A, INFO ); +#if defined(DPLASMA_HAVE_CUDA) + parsec_getrf_nopiv->_g_cuda_handles_infokey = + parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES", NULL); +#else + parsec_getrf_nopiv->_g_cuda_handles_infokey = PARSEC_INFO_ID_UNDEFINED; +#endif +#if defined(DPLASMA_HAVE_HIP) + parsec_getrf_nopiv->_g_hip_handles_infokey = + parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); +#else + parsec_getrf_nopiv->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED; +#endif + int shape = 0; dplasma_setup_adtt_all_loc( ddc_A, parsec_datatype_double_complex_t, diff --git a/src/zpoinv_L.jdf b/src/zpoinv_L.jdf index 4a68565a..dfe76db9 100644 --- a/src/zpoinv_L.jdf +++ b/src/zpoinv_L.jdf @@ -3,20 +3,25 @@ extern "C" %{ * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * Copyright (c) 2013 Inria. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" +#include "potrf_gpu_workspaces.h" #include "parsec/data_dist/matrix/matrix.h" %} +uplo [type = "dplasma_enum_t"] descA [type = "parsec_tiled_matrix_t*"] +INFO [type = "int*"] +cuda_handles_infokey [type = "int" hidden = on default = -1] +cuda_workspaces_infokey [type = "int" hidden = on default = -1] +hip_handles_infokey [type = "int" hidden = on default = -1] +hip_workspaces_infokey [type = "int" hidden = on default = -1] potrf_zpotrf(k) /* Execution Space */ @@ -33,11 +38,63 @@ potrf_zpotrf(k) ; 44 +BODY [type=CUDA] +{ + int tempk = (k == (descA->mt-1)) ? (descA->m - k * descA->mb) : descA->mb; + int ldak = BLKLDD( descA, k ); + + cusolverStatus_t status; + dplasma_potrf_gpu_workspaces_t *wp; + cuDoubleComplex *workspace; + int *d_iinfo; + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + wp = parsec_info_get(&gpu_device->super.infos, cuda_workspaces_infokey); + assert(NULL != wp); + + workspace = (cuDoubleComplex*)wp->tmpmem; + d_iinfo = (int*)(wp->tmpmem + wp->lwork * sizeof(cuDoubleComplex)); + + status = cusolverDnZpotrf( handles->cusolverDn_handle, dplasma_cublas_fill(uplo), + tempk, A, ldak, workspace, wp->lwork, d_iinfo); + PARSEC_CUDA_CHECK_ERROR( "cusolverDnZpotrf ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ + int tempk = (k == (descA->mt-1)) ? (descA->m - k * descA->mb) : descA->mb; + int ldak = BLKLDD( descA, k ); + + rocblas_status status; + rocblas_fill rocblas_uplo; + dplasma_potrf_gpu_workspaces_t *wp; + int *d_iinfo; + dplasma_hip_handles_t *handles; + + if( PlasmaLower == uplo ) + rocblas_uplo = rocblas_fill_lower; + if( PlasmaUpper == uplo ) + rocblas_uplo = rocblas_fill_upper; + + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + wp = parsec_info_get(&gpu_device->super.infos, hip_workspaces_infokey); + assert(NULL != wp); + d_iinfo = (int*)wp->tmpmem; + + status = rocsolver_zpotrf( handles->hipblas_handle, rocblas_uplo, tempk, A, ldak, d_iinfo); + DPLASMA_ROCBLAS_CHECK_ERROR("rocsolver_zpotrf", status, {return PARSEC_HOOK_RETURN_ERROR;}); +} +END + BODY { int tempkm = (k == (descA->mt-1)) ? (descA->m - k * descA->mb) : descA->mb; int ldak = BLKLDD( descA, k ); - int info = 0; + int iinfo = 0; printlog("CORE_potrf_zpotrf(%d)\n" @@ -45,7 +102,9 @@ BODY k, k, k, A); CORE_zpotrf(dplasmaLower, tempkm, - A /* descA(k,k) */, ldak, &info ); + A /* descA(k,k) */, ldak, &iinfo ); + if ( iinfo != 0 && *INFO == 0 ) + *INFO = k * descA->mb + iinfo; } END @@ -66,6 +125,51 @@ potrf_ztrsm(k, m) ; 40 +BODY [type=CUDA] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex zone = make_cuDoubleComplex( 1., 0. ); +#else + double zone = 1.; +#endif + int tempmm = (m == (descA->mt-1)) ? (descA->m - m * descA->mb) : descA->mb; + int ldak = BLKLDD( descA, k ); + int ldam = BLKLDD( descA, m ); + + cublasStatus_t status; + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZtrsm_v2( handles->cublas_handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_C, CUBLAS_DIAG_NON_UNIT, tempmm, descA->mb, &zone, A, ldak, B, ldam ); + + PARSEC_CUDA_CHECK_ERROR( "cublasZtrsm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex zone = { 1., 0. }; +#else + double zone = 1.; +#endif + int tempmm = (m == (descA->mt-1)) ? (descA->m - m * descA->mb) : descA->mb; + int ldak = BLKLDD( descA, k ); + int ldam = BLKLDD( descA, m ); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZtrsm( handles->hipblas_handle, HIPBLAS_SIDE_RIGHT, HIPBLAS_FILL_MODE_LOWER, HIPBLAS_OP_C, HIPBLAS_DIAG_NON_UNIT, tempmm, descA->mb, &zone, A, ldak, B, ldam ); + + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZtrsm", status, {return PARSEC_HOOK_RETURN_ERROR;}); +} +END + BODY { int tempmm = (m == (descA->mt-1)) ? (descA->m - m * descA->mb) : descA->mb; @@ -100,6 +204,53 @@ potrf_zherk(k, m) ; 40 +BODY [type=CUDA] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex zone = make_cuDoubleComplex( 1., 0. ); + cuDoubleComplex mzone = make_cuDoubleComplex( -1., 0. ); +#else + double zone = 1.; + double mzone = -1.; +#endif + int tempmm = (m == (descA->mt-1)) ? (descA->m - m * descA->mb) : descA->mb; + int ldam = BLKLDD( descA, m ); + + cublasStatus_t status; + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZherk_v2( handles->cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, tempmm, descA->mb, &mzone, A, ldam, &zone, C, ldam ); + + PARSEC_CUDA_CHECK_ERROR( "cublasZherk_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex zone = { 1., 0. }; + hipblasDoubleComplex mzone = { -1., 0. }; +#else + double zone = 1.; + double mzone = -1.; +#endif + int tempmm = (m == (descA->mt-1)) ? (descA->m - m * descA->mb) : descA->mb; + int ldam = BLKLDD( descA, m ); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZherk( handles->hipblas_handle, HIPBLAS_FILL_MODE_LOWER, HIPBLAS_OP_N, tempmm, descA->mb, &mzone, A, ldam, &zone, C, ldam ); + + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZherk", status, {return PARSEC_HOOK_RETURN_ERROR;}); +} +END + BODY { int tempmm = (m == (descA->mt-1)) ? (descA->m - m * descA->mb) : descA->mb; @@ -139,26 +290,49 @@ potrf_zgemm(k, m, n) BODY [type=CUDA] { #if defined(PRECISION_z) || defined(PRECISION_c) - cuDoubleComplex zone = make_cuDoubleComplex( 1., 0.); - cuDoubleComplex mzone = make_cuDoubleComplex(-1., 0.); + cuDoubleComplex zone = make_cuDoubleComplex( 1., 0. ); + cuDoubleComplex mzone = make_cuDoubleComplex( -1., 0. ); #else - double zone = 1.; + double zone = 1.; double mzone = -1.; #endif - int tempmm = (m == (descA->mt-1)) ? (descA->m - m * descA->mb) : descA->mb; int ldam = BLKLDD( descA, m ); int ldan = BLKLDD( descA, n ); cublasStatus_t status; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( 'N', dplasma_lapack_const(dplasmaConjTrans), - tempmm, descA->mb, descA->mb, - mzone, (cuDoubleComplex*)A, ldam, - (cuDoubleComplex*)B, ldan, - zone, (cuDoubleComplex*)C, ldam ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, CUBLAS_OP_N, CUBLAS_OP_C, tempmm, descA->mb, descA->mb, &mzone, A, ldam, B, ldan, &zone, C, ldam ); + + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex zone = { 1., 0. }; + hipblasDoubleComplex mzone = { -1., 0. }; +#else + double zone = 1.; + double mzone = -1.; +#endif + int tempmm = (m == (descA->mt-1)) ? (descA->m - m * descA->mb) : descA->mb; + int ldam = BLKLDD( descA, m ); + int ldan = BLKLDD( descA, n ); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, HIPBLAS_OP_N, HIPBLAS_OP_C, tempmm, descA->mb, descA->mb, &mzone, A, ldam, B, ldan, &zone, C, ldam ); + + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZgemm", status, {return PARSEC_HOOK_RETURN_ERROR;}); } END @@ -202,6 +376,53 @@ trtri_ztrsmR(k, m) ; 30 +BODY [type=CUDA] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex zone = make_cuDoubleComplex( -1., 0. ); +#else + double zone = -1.; +#endif + int tempmm = (m == (descA->mt-1)) ? (descA->m - m * descA->mb) : descA->mb; + int tempkn = (k == (descA->nt-1)) ? (descA->n - k * descA->nb) : descA->nb; + int ldak = BLKLDD( descA, k ); + int ldam = BLKLDD( descA, m ); + + cublasStatus_t status; + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZtrsm_v2( handles->cublas_handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, tempmm, tempkn, &zone, A, ldak, B, ldam ); + + PARSEC_CUDA_CHECK_ERROR( "cublasZtrsm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex zone = { -1., 0. }; +#else + double zone = -1.; +#endif + int tempmm = (m == (descA->mt-1)) ? (descA->m - m * descA->mb) : descA->mb; + int tempkn = (k == (descA->nt-1)) ? (descA->n - k * descA->nb) : descA->nb; + int ldak = BLKLDD( descA, k ); + int ldam = BLKLDD( descA, m ); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZtrsm( handles->hipblas_handle, HIPBLAS_SIDE_RIGHT, HIPBLAS_FILL_MODE_LOWER, HIPBLAS_OP_N, HIPBLAS_DIAG_NON_UNIT, tempmm, tempkn, &zone, A, ldak, B, ldam ); + + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZtrsm", status, {return PARSEC_HOOK_RETURN_ERROR;}); +} +END + BODY { int tempmm = (m == (descA->mt-1)) ? (descA->m - m * descA->mb) : descA->mb; @@ -246,25 +467,47 @@ trtri_zgemm(k, m, n) BODY [type=CUDA] { #if defined(PRECISION_z) || defined(PRECISION_c) - cuDoubleComplex zone = make_cuDoubleComplex( 1., 0.); + cuDoubleComplex zone = make_cuDoubleComplex( 1., 0. ); #else - double zone = 1.; + double zone = 1.; #endif - int tempmm = (m == (descA->mt-1)) ? (descA->m - m * descA->mb) : descA->mb; int tempkn = (k == (descA->nt-1)) ? (descA->n - k * descA->nb) : descA->nb; int ldam = BLKLDD( descA, m ); int ldak = BLKLDD( descA, k ); cublasStatus_t status; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( 'N', 'N', - tempmm, descA->nb, tempkn, - zone, (cuDoubleComplex*)A, ldam, - (cuDoubleComplex*)B, ldak, - zone, (cuDoubleComplex*)C, ldam ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, tempmm, descA->nb, tempkn, &zone, A, ldam, B, ldak, &zone, C, ldam ); + + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex zone = { 1., 0. }; +#else + double zone = 1.; +#endif + int tempmm = (m == (descA->mt-1)) ? (descA->m - m * descA->mb) : descA->mb; + int tempkn = (k == (descA->nt-1)) ? (descA->n - k * descA->nb) : descA->nb; + int ldam = BLKLDD( descA, m ); + int ldak = BLKLDD( descA, k ); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, HIPBLAS_OP_N, HIPBLAS_OP_N, tempmm, descA->nb, tempkn, &zone, A, ldam, B, ldak, &zone, C, ldam ); + + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZgemm", status, {return PARSEC_HOOK_RETURN_ERROR;}); } END @@ -308,6 +551,49 @@ trtri_ztrsmL(k, n) ; 30 +BODY [type=CUDA] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex zone = make_cuDoubleComplex( 1., 0. ); +#else + double zone = 1.; +#endif + int tempkn = (k == (descA->nt-1)) ? (descA->n - k * descA->nb) : descA->nb; + int ldak = BLKLDD( descA, k ); + + cublasStatus_t status; + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZtrsm_v2( handles->cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, tempkn, descA->nb, &zone, A, ldak, B, ldak ); + + PARSEC_CUDA_CHECK_ERROR( "cublasZtrsm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex zone = { 1., 0. }; +#else + double zone = 1.; +#endif + int tempkn = (k == (descA->nt-1)) ? (descA->n - k * descA->nb) : descA->nb; + int ldak = BLKLDD( descA, k ); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZtrsm( handles->hipblas_handle, HIPBLAS_SIDE_LEFT, HIPBLAS_FILL_MODE_LOWER, HIPBLAS_OP_N, HIPBLAS_DIAG_NON_UNIT, tempkn, descA->nb, &zone, A, ldak, B, ldak ); + + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZtrsm", status, {return PARSEC_HOOK_RETURN_ERROR;}); +} +END + BODY { int tempkn = (k == (descA->nt-1)) ? (descA->n - k * descA->nb) : descA->nb; @@ -374,6 +660,51 @@ lauum_zherk(k, n) ; 20 +BODY [type=CUDA] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex zone = make_cuDoubleComplex( 1., 0. ); +#else + double zone = 1.; +#endif + int tempkm = (k == (descA->mt-1)) ? (descA->m - k * descA->mb) : descA->mb; + int ldak = BLKLDD( descA, k ); + int ldan = BLKLDD( descA, n ); + + cublasStatus_t status; + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZherk_v2( handles->cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_C, descA->mb, tempkm, &zone, A, ldak, &zone, C, ldan ); + + PARSEC_CUDA_CHECK_ERROR( "cublasZherk_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex zone = { 1., 0. }; +#else + double zone = 1.; +#endif + int tempkm = (k == (descA->mt-1)) ? (descA->m - k * descA->mb) : descA->mb; + int ldak = BLKLDD( descA, k ); + int ldan = BLKLDD( descA, n ); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZherk( handles->hipblas_handle, HIPBLAS_FILL_MODE_LOWER, HIPBLAS_OP_C, descA->mb, tempkm, &zone, A, ldak, &zone, C, ldan ); + + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZherk", status, {return PARSEC_HOOK_RETURN_ERROR;}); +} +END + BODY { int tempkm = (k == (descA->mt-1)) ? (descA->m - k * descA->mb) : descA->mb; @@ -415,24 +746,45 @@ lauum_zgemm(k, n, m) BODY [type=CUDA] { #if defined(PRECISION_z) || defined(PRECISION_c) - cuDoubleComplex zone = make_cuDoubleComplex( 1., 0.); + cuDoubleComplex zone = make_cuDoubleComplex( 1., 0. ); #else - double zone = 1.; + double zone = 1.; #endif - int tempkm = (k == (descA->mt-1)) ? (descA->m - k * descA->mb) : descA->mb; int ldak = BLKLDD( descA, k ); int ldam = BLKLDD( descA, m ); cublasStatus_t status; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( dplasma_lapack_const(dplasmaConjTrans), 'N', - descA->mb, descA->nb, tempkm, - zone, (cuDoubleComplex*)A, ldak, - (cuDoubleComplex*)B, ldak, - zone, (cuDoubleComplex*)C, ldam ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, CUBLAS_OP_C, CUBLAS_OP_N, descA->mb, descA->nb, tempkm, &zone, A, ldak, B, ldak, &zone, C, ldam ); + + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex zone = { 1., 0. }; +#else + double zone = 1.; +#endif + int tempkm = (k == (descA->mt-1)) ? (descA->m - k * descA->mb) : descA->mb; + int ldak = BLKLDD( descA, k ); + int ldam = BLKLDD( descA, m ); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, HIPBLAS_OP_C, HIPBLAS_OP_N, descA->mb, descA->nb, tempkm, &zone, A, ldak, B, ldak, &zone, C, ldam ); + + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZgemm", status, {return PARSEC_HOOK_RETURN_ERROR;}); } END @@ -474,6 +826,49 @@ lauum_ztrmm(k, n) ; 20 +BODY [type=CUDA] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex zone = make_cuDoubleComplex( 1., 0. ); +#else + double zone = 1.; +#endif + int tempkm = (k == (descA->mt-1)) ? (descA->m - k * descA->mb) : descA->mb; + int ldak = BLKLDD( descA, k ); + + cublasStatus_t status; + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZtrmm_v2( handles->cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_C, CUBLAS_DIAG_NON_UNIT, tempkm, descA->nb, &zone, A, ldak, B, ldak, B, ldak ); + + PARSEC_CUDA_CHECK_ERROR( "cublasZtrmm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex zone = { 1., 0. }; +#else + double zone = 1.; +#endif + int tempkm = (k == (descA->mt-1)) ? (descA->m - k * descA->mb) : descA->mb; + int ldak = BLKLDD( descA, k ); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZtrmm( handles->hipblas_handle, HIPBLAS_SIDE_LEFT, HIPBLAS_FILL_MODE_LOWER, HIPBLAS_OP_C, HIPBLAS_DIAG_NON_UNIT, tempkm, descA->nb, &zone, A, ldak, B, ldak, B, ldak ); + + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZtrmm", status, {return PARSEC_HOOK_RETURN_ERROR;}); +} +END + BODY { int tempkm = (k == (descA->mt-1)) ? (descA->m - k * descA->mb) : descA->mb; @@ -519,4 +914,3 @@ BODY A /* descA(k,k) */, ldak ); } END - diff --git a/src/zpoinv_U.jdf b/src/zpoinv_U.jdf index 048dc65e..f0cfa9ed 100644 --- a/src/zpoinv_U.jdf +++ b/src/zpoinv_U.jdf @@ -3,20 +3,25 @@ extern "C" %{ * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * Copyright (c) 2013 Inria. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" +#include "potrf_gpu_workspaces.h" #include "parsec/data_dist/matrix/matrix.h" %} +uplo [type = "dplasma_enum_t"] descA [type = "parsec_tiled_matrix_t*"] +INFO [type = "int*"] +cuda_handles_infokey [type = "int" hidden = on default = -1] +cuda_workspaces_infokey [type = "int" hidden = on default = -1] +hip_handles_infokey [type = "int" hidden = on default = -1] +hip_workspaces_infokey [type = "int" hidden = on default = -1] potrf_zpotrf(k) /* Execution Space */ @@ -33,11 +38,63 @@ potrf_zpotrf(k) ; 44 +BODY [type=CUDA] +{ + int tempk = (k == (descA->nt-1)) ? (descA->n - k * descA->nb) : descA->nb; + int ldak = BLKLDD( descA, k ); + + cusolverStatus_t status; + dplasma_potrf_gpu_workspaces_t *wp; + cuDoubleComplex *workspace; + int *d_iinfo; + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + wp = parsec_info_get(&gpu_device->super.infos, cuda_workspaces_infokey); + assert(NULL != wp); + + workspace = (cuDoubleComplex*)wp->tmpmem; + d_iinfo = (int*)(wp->tmpmem + wp->lwork * sizeof(cuDoubleComplex)); + + status = cusolverDnZpotrf( handles->cusolverDn_handle, dplasma_cublas_fill(uplo), + tempk, A, ldak, workspace, wp->lwork, d_iinfo); + PARSEC_CUDA_CHECK_ERROR( "cusolverDnZpotrf ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ + int tempk = (k == (descA->nt-1)) ? (descA->n - k * descA->nb) : descA->nb; + int ldak = BLKLDD( descA, k ); + + rocblas_status status; + rocblas_fill rocblas_uplo; + dplasma_potrf_gpu_workspaces_t *wp; + int *d_iinfo; + dplasma_hip_handles_t *handles; + + if( PlasmaLower == uplo ) + rocblas_uplo = rocblas_fill_lower; + if( PlasmaUpper == uplo ) + rocblas_uplo = rocblas_fill_upper; + + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + wp = parsec_info_get(&gpu_device->super.infos, hip_workspaces_infokey); + assert(NULL != wp); + d_iinfo = (int*)wp->tmpmem; + + status = rocsolver_zpotrf( handles->hipblas_handle, rocblas_uplo, tempk, A, ldak, d_iinfo); + DPLASMA_ROCBLAS_CHECK_ERROR("rocsolver_zpotrf", status, {return PARSEC_HOOK_RETURN_ERROR;}); +} +END + BODY { int tempkm = (k == (descA->nt-1)) ? (descA->n - k * descA->nb) : descA->nb; int ldak = BLKLDD( descA, k ); - int info = 0; + int iinfo = 0; printlog("CORE_potrf_zpotrf(%d)\n" @@ -45,7 +102,9 @@ BODY k, k, k, A); CORE_zpotrf(dplasmaUpper, tempkm, - A /* descA(k,k) */, ldak, &info ); + A /* descA(k,k) */, ldak, &iinfo ); + if ( iinfo != 0 && *INFO == 0 ) + *INFO = k * descA->nb + iinfo; } END @@ -66,6 +125,49 @@ potrf_ztrsm(k, m) ; 40 +BODY [type=CUDA] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex zone = make_cuDoubleComplex( 1., 0. ); +#else + double zone = 1.; +#endif + int tempmm = (m == (descA->nt-1)) ? (descA->n - m * descA->nb) : descA->nb; + int ldak = BLKLDD( descA, k ); + + cublasStatus_t status; + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZtrsm_v2( handles->cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_C, CUBLAS_DIAG_NON_UNIT, descA->nb, tempmm, &zone, A, ldak, B, ldak ); + + PARSEC_CUDA_CHECK_ERROR( "cublasZtrsm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex zone = { 1., 0. }; +#else + double zone = 1.; +#endif + int tempmm = (m == (descA->nt-1)) ? (descA->n - m * descA->nb) : descA->nb; + int ldak = BLKLDD( descA, k ); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZtrsm( handles->hipblas_handle, HIPBLAS_SIDE_LEFT, HIPBLAS_FILL_MODE_UPPER, HIPBLAS_OP_C, HIPBLAS_DIAG_NON_UNIT, descA->nb, tempmm, &zone, A, ldak, B, ldak ); + + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZtrsm", status, {return PARSEC_HOOK_RETURN_ERROR;}); +} +END + BODY { int tempmm = (m == (descA->nt-1)) ? (descA->n - m * descA->nb) : descA->nb; @@ -99,6 +201,55 @@ potrf_zherk(k, m) ; 40 +BODY [type=CUDA] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex zone = make_cuDoubleComplex( 1., 0. ); + cuDoubleComplex mzone = make_cuDoubleComplex( -1., 0. ); +#else + double zone = 1.; + double mzone = -1.; +#endif + int tempmm = (m == (descA->nt-1)) ? (descA->n - m * descA->nb) : descA->nb; + int ldak = BLKLDD( descA, k ); + int ldam = BLKLDD( descA, m ); + + cublasStatus_t status; + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZherk_v2( handles->cublas_handle, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_C, tempmm, descA->mb, &mzone, A, ldak, &zone, B, ldam ); + + PARSEC_CUDA_CHECK_ERROR( "cublasZherk_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex zone = { 1., 0. }; + hipblasDoubleComplex mzone = { -1., 0. }; +#else + double zone = 1.; + double mzone = -1.; +#endif + int tempmm = (m == (descA->nt-1)) ? (descA->n - m * descA->nb) : descA->nb; + int ldak = BLKLDD( descA, k ); + int ldam = BLKLDD( descA, m ); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZherk( handles->hipblas_handle, HIPBLAS_FILL_MODE_UPPER, HIPBLAS_OP_C, tempmm, descA->mb, &mzone, A, ldak, &zone, B, ldam ); + + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZherk", status, {return PARSEC_HOOK_RETURN_ERROR;}); +} +END + BODY { int tempmm = (m == (descA->nt-1)) ? (descA->n - m * descA->nb) : descA->nb; @@ -138,26 +289,49 @@ potrf_zgemm(k, m, n) BODY [type=CUDA] { #if defined(PRECISION_z) || defined(PRECISION_c) - cuDoubleComplex zone = make_cuDoubleComplex( 1., 0.); - cuDoubleComplex mzone = make_cuDoubleComplex(-1., 0.); + cuDoubleComplex zone = make_cuDoubleComplex( 1., 0. ); + cuDoubleComplex mzone = make_cuDoubleComplex( -1., 0. ); #else - double zone = 1.; + double zone = 1.; double mzone = -1.; #endif - int tempmm = (m == (descA->nt-1)) ? (descA->n - m * descA->nb) : descA->nb; int ldak = BLKLDD( descA, k ); int ldan = BLKLDD( descA, n ); cublasStatus_t status; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( dplasma_lapack_const(dplasmaConjTrans), 'N', - descA->mb, tempmm, descA->mb, - mzone, (cuDoubleComplex*)A, ldak, - (cuDoubleComplex*)B, ldak, - zone, (cuDoubleComplex*)C, ldan ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, CUBLAS_OP_C, CUBLAS_OP_N, descA->mb, tempmm, descA->mb, &mzone, A, ldak, B, ldak, &zone, C, ldan ); + + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex zone = { 1., 0. }; + hipblasDoubleComplex mzone = { -1., 0. }; +#else + double zone = 1.; + double mzone = -1.; +#endif + int tempmm = (m == (descA->nt-1)) ? (descA->n - m * descA->nb) : descA->nb; + int ldak = BLKLDD( descA, k ); + int ldan = BLKLDD( descA, n ); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, HIPBLAS_OP_C, HIPBLAS_OP_N, descA->mb, tempmm, descA->mb, &mzone, A, ldak, B, ldak, &zone, C, ldan ); + + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZgemm", status, {return PARSEC_HOOK_RETURN_ERROR;}); } END @@ -201,6 +375,51 @@ trtri_ztrsmL(k, n) ; 30 +BODY [type=CUDA] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex zone = make_cuDoubleComplex( -1., 0. ); +#else + double zone = -1.; +#endif + int tempkm = (k == (descA->mt-1)) ? (descA->m - k * descA->mb) : descA->mb; + int tempnn = (n == (descA->nt-1)) ? (descA->n - n * descA->nb) : descA->nb; + int ldak = BLKLDD( descA, k ); + + cublasStatus_t status; + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZtrsm_v2( handles->cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, tempkm, tempnn, &zone, A, ldak, B, ldak ); + + PARSEC_CUDA_CHECK_ERROR( "cublasZtrsm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex zone = { -1., 0. }; +#else + double zone = -1.; +#endif + int tempkm = (k == (descA->mt-1)) ? (descA->m - k * descA->mb) : descA->mb; + int tempnn = (n == (descA->nt-1)) ? (descA->n - n * descA->nb) : descA->nb; + int ldak = BLKLDD( descA, k ); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZtrsm( handles->hipblas_handle, HIPBLAS_SIDE_LEFT, HIPBLAS_FILL_MODE_UPPER, HIPBLAS_OP_N, HIPBLAS_DIAG_NON_UNIT, tempkm, tempnn, &zone, A, ldak, B, ldak ); + + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZtrsm", status, {return PARSEC_HOOK_RETURN_ERROR;}); +} +END + BODY { int tempkm = (k == (descA->mt-1)) ? (descA->m - k * descA->mb) : descA->mb; @@ -244,25 +463,47 @@ trtri_zgemm(k, m, n) BODY [type=CUDA] { #if defined(PRECISION_z) || defined(PRECISION_c) - cuDoubleComplex zone = make_cuDoubleComplex( 1., 0.); + cuDoubleComplex zone = make_cuDoubleComplex( 1., 0. ); #else - double zone = 1.; + double zone = 1.; #endif - int tempnn = (n == (descA->nt-1)) ? (descA->n - n * descA->nb) : descA->nb; int tempkm = (k == (descA->mt-1)) ? (descA->m - k * descA->mb) : descA->mb; int ldam = BLKLDD( descA, m ); int ldak = BLKLDD( descA, k ); cublasStatus_t status; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( 'N', 'N', - descA->mb, tempnn, tempkm, - zone, (cuDoubleComplex*)A, ldam, - (cuDoubleComplex*)B, ldak, - zone, (cuDoubleComplex*)C, ldam ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, descA->mb, tempnn, tempkm, &zone, A, ldam, B, ldak, &zone, C, ldam ); + + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex zone = { 1., 0. }; +#else + double zone = 1.; +#endif + int tempnn = (n == (descA->nt-1)) ? (descA->n - n * descA->nb) : descA->nb; + int tempkm = (k == (descA->mt-1)) ? (descA->m - k * descA->mb) : descA->mb; + int ldam = BLKLDD( descA, m ); + int ldak = BLKLDD( descA, k ); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, HIPBLAS_OP_N, HIPBLAS_OP_N, descA->mb, tempnn, tempkm, &zone, A, ldam, B, ldak, &zone, C, ldam ); + + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZgemm", status, {return PARSEC_HOOK_RETURN_ERROR;}); } END @@ -306,6 +547,51 @@ trtri_ztrsmR(k, m) ; 30 +BODY [type=CUDA] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex zone = make_cuDoubleComplex( 1., 0. ); +#else + double zone = 1.; +#endif + int tempkm = (k == (descA->mt-1)) ? (descA->m - k * descA->mb) : descA->mb; + int ldak = BLKLDD( descA, k ); + int ldam = BLKLDD( descA, m ); + + cublasStatus_t status; + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZtrsm_v2( handles->cublas_handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, descA->mb, tempkm, &zone, A, ldak, B, ldam ); + + PARSEC_CUDA_CHECK_ERROR( "cublasZtrsm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex zone = { 1., 0. }; +#else + double zone = 1.; +#endif + int tempkm = (k == (descA->mt-1)) ? (descA->m - k * descA->mb) : descA->mb; + int ldak = BLKLDD( descA, k ); + int ldam = BLKLDD( descA, m ); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZtrsm( handles->hipblas_handle, HIPBLAS_SIDE_RIGHT, HIPBLAS_FILL_MODE_UPPER, HIPBLAS_OP_N, HIPBLAS_DIAG_NON_UNIT, descA->mb, tempkm, &zone, A, ldak, B, ldam ); + + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZtrsm", status, {return PARSEC_HOOK_RETURN_ERROR;}); +} +END + BODY { int tempkm = (k == (descA->mt-1)) ? (descA->m - k * descA->mb) : descA->mb; @@ -372,6 +658,49 @@ lauum_zherk(k, m) ; 20 +BODY [type=CUDA] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex zone = make_cuDoubleComplex( 1., 0. ); +#else + double zone = 1.; +#endif + int tempkn = (k == (descA->nt-1)) ? (descA->n - k * descA->nb) : descA->nb; + int ldam = BLKLDD( descA, m ); + + cublasStatus_t status; + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZherk_v2( handles->cublas_handle, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, descA->mb, tempkn, &zone, A, ldam, &zone, B, ldam ); + + PARSEC_CUDA_CHECK_ERROR( "cublasZherk_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex zone = { 1., 0. }; +#else + double zone = 1.; +#endif + int tempkn = (k == (descA->nt-1)) ? (descA->n - k * descA->nb) : descA->nb; + int ldam = BLKLDD( descA, m ); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZherk( handles->hipblas_handle, HIPBLAS_FILL_MODE_UPPER, HIPBLAS_OP_N, descA->mb, tempkn, &zone, A, ldam, &zone, B, ldam ); + + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZherk", status, {return PARSEC_HOOK_RETURN_ERROR;}); +} +END + BODY { int tempkn = (k == (descA->nt-1)) ? (descA->n - k * descA->nb) : descA->nb; @@ -412,24 +741,45 @@ lauum_zgemm(k, m, n) BODY [type=CUDA] { #if defined(PRECISION_z) || defined(PRECISION_c) - cuDoubleComplex zone = make_cuDoubleComplex( 1., 0.); + cuDoubleComplex zone = make_cuDoubleComplex( 1., 0. ); #else - double zone = 1.; + double zone = 1.; #endif - int tempkn = (k == (descA->nt-1)) ? (descA->n - k * descA->nb) : descA->nb; int ldam = BLKLDD( descA, m ); int ldan = BLKLDD( descA, n ); cublasStatus_t status; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( 'N', dplasma_lapack_const(dplasmaConjTrans), - descA->mb, descA->nb, tempkn, - zone, (cuDoubleComplex*)A, ldam, - (cuDoubleComplex*)B, ldan, - zone, (cuDoubleComplex*)C, ldam ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, CUBLAS_OP_N, CUBLAS_OP_C, descA->mb, descA->nb, tempkn, &zone, A, ldam, B, ldan, &zone, C, ldam ); + + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex zone = { 1., 0. }; +#else + double zone = 1.; +#endif + int tempkn = (k == (descA->nt-1)) ? (descA->n - k * descA->nb) : descA->nb; + int ldam = BLKLDD( descA, m ); + int ldan = BLKLDD( descA, n ); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, HIPBLAS_OP_N, HIPBLAS_OP_C, descA->mb, descA->nb, tempkn, &zone, A, ldam, B, ldan, &zone, C, ldam ); + + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZgemm", status, {return PARSEC_HOOK_RETURN_ERROR;}); } END @@ -471,6 +821,51 @@ lauum_ztrmm(k, m) ; 20 +BODY [type=CUDA] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex zone = make_cuDoubleComplex( 1., 0. ); +#else + double zone = 1.; +#endif + int tempkn = (k == (descA->nt-1)) ? (descA->n - k * descA->nb) : descA->nb; + int ldak = BLKLDD( descA, k ); + int ldam = BLKLDD( descA, m ); + + cublasStatus_t status; + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZtrmm_v2( handles->cublas_handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_C, CUBLAS_DIAG_NON_UNIT, descA->mb, tempkn, &zone, A, ldak, B, ldam, B, ldam ); + + PARSEC_CUDA_CHECK_ERROR( "cublasZtrmm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex zone = { 1., 0. }; +#else + double zone = 1.; +#endif + int tempkn = (k == (descA->nt-1)) ? (descA->n - k * descA->nb) : descA->nb; + int ldak = BLKLDD( descA, k ); + int ldam = BLKLDD( descA, m ); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZtrmm( handles->hipblas_handle, HIPBLAS_SIDE_RIGHT, HIPBLAS_FILL_MODE_UPPER, HIPBLAS_OP_C, HIPBLAS_DIAG_NON_UNIT, descA->mb, tempkn, &zone, A, ldak, B, ldam, B, ldam ); + + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZtrmm", status, {return PARSEC_HOOK_RETURN_ERROR;}); +} +END + BODY { int tempkn = (k == (descA->nt-1)) ? (descA->n - k * descA->nb) : descA->nb; @@ -518,4 +913,3 @@ BODY A /* descA(k,k) */, ldak ); } END - diff --git a/src/zpoinv_wrapper.c b/src/zpoinv_wrapper.c index 52cb2979..feb6e776 100644 --- a/src/zpoinv_wrapper.c +++ b/src/zpoinv_wrapper.c @@ -2,6 +2,7 @@ * Copyright (c) 2010-2022 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * Copyright (c) 2013 Inria. All rights reserved. * * @precisions normal z -> s d c @@ -9,11 +10,84 @@ */ #include "dplasma.h" #include "dplasma/types.h" +#include "dplasma/types_lapack.h" #include "dplasmaaux.h" +#include "potrf_gpu_workspaces.h" +#include "parsec/utils/zone_malloc.h" #include "zpoinv_U.h" #include "zpoinv_L.h" +#if defined(DPLASMA_HAVE_CUDA) +#include + +static void *zpoinv_create_cuda_workspace(void *obj, void *user) +{ + parsec_device_module_t *mod = (parsec_device_module_t *)obj; + zone_malloc_t *memory = ((parsec_device_gpu_module_t*)mod)->memory; + cusolverDnHandle_t cusolverDnHandle; + cusolverStatus_t status; + parsec_zpoinv_U_taskpool_t *tp = (parsec_zpoinv_U_taskpool_t*)user; + dplasma_potrf_gpu_workspaces_t *wp = NULL; + int workspace_size; + int mb = tp->_g_descA->mb; + int nb = tp->_g_descA->nb; + size_t elt_size = sizeof(cuDoubleComplex); + dplasma_enum_t uplo = tp->_g_uplo; + + status = cusolverDnCreate(&cusolverDnHandle); + assert(CUSOLVER_STATUS_SUCCESS == status); + (void)status; + + status = cusolverDnDpotrf_bufferSize(cusolverDnHandle, dplasma_cublas_fill(uplo), nb, NULL, mb, &workspace_size); + assert(CUSOLVER_STATUS_SUCCESS == status); + + cusolverDnDestroy(cusolverDnHandle); + + wp = (dplasma_potrf_gpu_workspaces_t*)malloc(sizeof(dplasma_potrf_gpu_workspaces_t)); + wp->tmpmem = zone_malloc(memory, workspace_size * elt_size + sizeof(int)); + assert(NULL != wp->tmpmem); + wp->lwork = workspace_size; + wp->memory = memory; + + return wp; +} + +static void zpoinv_destroy_cuda_workspace(void *_ws, void *_n) +{ + dplasma_potrf_gpu_workspaces_t *ws = (dplasma_potrf_gpu_workspaces_t*)_ws; + zone_free((zone_malloc_t*)ws->memory, ws->tmpmem); + free(ws); + (void)_n; +} +#endif + +#if defined(DPLASMA_HAVE_HIP) +static void *zpoinv_create_hip_workspace(void *obj, void *user) +{ + parsec_device_module_t *mod = (parsec_device_module_t *)obj; + zone_malloc_t *memory = ((parsec_device_gpu_module_t*)mod)->memory; + dplasma_potrf_gpu_workspaces_t *wp = NULL; + (void)user; + + wp = (dplasma_potrf_gpu_workspaces_t*)malloc(sizeof(dplasma_potrf_gpu_workspaces_t)); + wp->tmpmem = zone_malloc(memory, sizeof(int)); + assert(NULL != wp->tmpmem); + wp->lwork = 0; + wp->memory = memory; + + return wp; +} + +static void zpoinv_destroy_hip_workspace(void *_ws, void *_n) +{ + dplasma_potrf_gpu_workspaces_t *ws = (dplasma_potrf_gpu_workspaces_t*)_ws; + zone_free((zone_malloc_t*)ws->memory, ws->tmpmem); + free(ws); + (void)_n; +} +#endif + /** ******************************************************************************* * @@ -71,6 +145,10 @@ dplasma_zpoinv_New( dplasma_enum_t uplo, int *info ) { parsec_zpoinv_L_taskpool_t *parsec_zpoinv = NULL; +#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP) + char workspace_info_name[64]; + static int uid = 0; +#endif parsec_taskpool_t *tp = NULL; /* Check input arguments */ @@ -81,7 +159,7 @@ dplasma_zpoinv_New( dplasma_enum_t uplo, *info = 0; if ( uplo == dplasmaUpper ) { - tp = (parsec_taskpool_t*)parsec_zpoinv_U_new( A /*, info */); + tp = (parsec_taskpool_t*)parsec_zpoinv_U_new( uplo, A, info ); /* Upper part of A with diagonal part */ /* dplasma_add2arena_upper( &((parsec_zpoinv_U_taskpool_t*)parsec_poinv)->arenas_datatypes[PARSEC_zpoinv_U_UPPER_TILE_ADT_IDX], */ @@ -89,7 +167,7 @@ dplasma_zpoinv_New( dplasma_enum_t uplo, /* PARSEC_ARENA_ALIGNMENT_SSE, */ /* parsec_datatype_double_complex_t, A->mb, 1 ); */ } else { - tp = (parsec_taskpool_t*)parsec_zpoinv_L_new( A /*, info */); + tp = (parsec_taskpool_t*)parsec_zpoinv_L_new( uplo, A, info ); /* Lower part of A with diagonal part */ /* dplasma_add2arena_lower( &((parsec_zpoinv_L_taskpool_t*)parsec_poinv)->arenas_datatypes[PARSEC_zpoinv_L_LOWER_TILE_ADT_IDX], */ @@ -100,6 +178,33 @@ dplasma_zpoinv_New( dplasma_enum_t uplo, parsec_zpoinv = (parsec_zpoinv_L_taskpool_t*)tp; +#if defined(DPLASMA_HAVE_CUDA) + /* It doesn't cost anything to define these infos if we have CUDA but + * don't have GPUs on the current machine, so we do it non-conditionally. */ + parsec_zpoinv->_g_cuda_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES", NULL); + snprintf(workspace_info_name, 64, "DPLASMA::ZPOINV(%d)::CUDA::WS", uid++); + parsec_zpoinv->_g_cuda_workspaces_infokey = parsec_info_register(&parsec_per_device_infos, workspace_info_name, + zpoinv_destroy_cuda_workspace, NULL, + zpoinv_create_cuda_workspace, parsec_zpoinv, + NULL); +#else + parsec_zpoinv->_g_cuda_handles_infokey = PARSEC_INFO_ID_UNDEFINED; + parsec_zpoinv->_g_cuda_workspaces_infokey = PARSEC_INFO_ID_UNDEFINED; +#endif +#if defined(DPLASMA_HAVE_HIP) + /* It doesn't cost anything to define these infos if we have HIP but + * don't have GPUs on the current machine, so we do it non-conditionally. */ + parsec_zpoinv->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); + snprintf(workspace_info_name, 64, "DPLASMA::ZPOINV(%d)::HIP::WS", uid++); + parsec_zpoinv->_g_hip_workspaces_infokey = parsec_info_register(&parsec_per_device_infos, workspace_info_name, + zpoinv_destroy_hip_workspace, NULL, + zpoinv_create_hip_workspace, parsec_zpoinv, + NULL); +#else + parsec_zpoinv->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED; + parsec_zpoinv->_g_hip_workspaces_infokey = PARSEC_INFO_ID_UNDEFINED; +#endif + dplasma_add2arena_tile( &parsec_zpoinv->arenas_datatypes[PARSEC_zpoinv_L_DEFAULT_ADT_IDX], A->mb*A->nb*sizeof(dplasma_complex64_t), PARSEC_ARENA_ALIGNMENT_SSE, @@ -136,6 +241,12 @@ dplasma_zpoinv_Destruct( parsec_taskpool_t *tp ) dplasma_matrix_del2arena( &parsec_zpoinv->arenas_datatypes[PARSEC_zpoinv_L_DEFAULT_ADT_IDX ] ); /* dplasma_matrix_del2arena( parsec_zpoinv->arenas_datatypes[PARSEC_zpoinv_L_LOWER_TILE_ADT_IDX] ); */ +#if defined(DPLASMA_HAVE_CUDA) + parsec_info_unregister(&parsec_per_device_infos, parsec_zpoinv->_g_cuda_workspaces_infokey, NULL); +#endif +#if defined(DPLASMA_HAVE_HIP) + parsec_info_unregister(&parsec_per_device_infos, parsec_zpoinv->_g_hip_workspaces_infokey, NULL); +#endif parsec_taskpool_free(tp); } @@ -269,4 +380,3 @@ dplasma_zpoinv_sync( parsec_context_t *parsec, return info; } - diff --git a/src/ztrmm_LLN.jdf b/src/ztrmm_LLN.jdf index b36a9e46..da527c20 100644 --- a/src/ztrmm_LLN.jdf +++ b/src/ztrmm_LLN.jdf @@ -3,15 +3,13 @@ extern "C" %{ * Copyright (c) 2010-2022 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * Copyright (c) 2013 Inria. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" @@ -58,6 +56,7 @@ descA [type = "const parsec_tiled_matrix_t*" hidden = on default = "((dplas ddescB [type = "dplasma_data_collection_t*"] descB [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescB)->dc_original" aligned=ddescB] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] hip_handles_infokey [type = "int" hidden = on default = -1 ] @@ -177,14 +176,17 @@ BODY [type=CUDA] int ldc = LDA(ddescB, C); cublasStatus_t status; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( dplasma_lapack_const(trans), 'N', + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, dplasma_cublas_op(trans), CUBLAS_OP_N, tempmm, tempnn, descB->mb, - lalpha, (cuDoubleComplex*)A, lda, - (cuDoubleComplex*)B, ldb, - lbeta, (cuDoubleComplex*)C, ldc ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + &lalpha, A, lda, + B, ldb, + &lbeta, C, ldc ); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/ztrmm_LLT.jdf b/src/ztrmm_LLT.jdf index fd25b586..66847fb7 100644 --- a/src/ztrmm_LLT.jdf +++ b/src/ztrmm_LLT.jdf @@ -3,15 +3,13 @@ extern "C" %{ * Copyright (c) 2010-2022 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * Copyright (c) 2013 Inria. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" @@ -58,6 +56,7 @@ descA [type = "const parsec_tiled_matrix_t*" hidden = on default = "((dplas ddescB [type = "dplasma_data_collection_t*"] descB [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescB)->dc_original" aligned=ddescB] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] hip_handles_infokey [type = "int" hidden = on default = -1 ] read_A(m, k) [profile = off] @@ -177,14 +176,17 @@ BODY [type=CUDA] int ldc = LDA(ddescB, C); cublasStatus_t status; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( dplasma_lapack_const(trans), 'N', + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, dplasma_cublas_op(trans), CUBLAS_OP_N, tempmm, tempnn, tempkm, - lalpha, (cuDoubleComplex*)A, lda, - (cuDoubleComplex*)B, ldb, - lbeta, (cuDoubleComplex*)C, ldc ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + &lalpha, A, lda, + B, ldb, + &lbeta, C, ldc ); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/ztrmm_LUN.jdf b/src/ztrmm_LUN.jdf index 129f98d6..4f751d14 100644 --- a/src/ztrmm_LUN.jdf +++ b/src/ztrmm_LUN.jdf @@ -3,15 +3,13 @@ extern "C" %{ * Copyright (c) 2010-2022 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * Copyright (c) 2013 Inria. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" @@ -58,6 +56,7 @@ descA [type = "const parsec_tiled_matrix_t*" hidden = on default = "((dplas ddescB [type = "dplasma_data_collection_t*"] descB [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescB)->dc_original" aligned=ddescB] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] hip_handles_infokey [type = "int" hidden = on default = -1 ] read_A(m, k) [profile = off] @@ -177,14 +176,17 @@ BODY [type=CUDA] int ldc = LDA(ddescB, C); cublasStatus_t status; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( dplasma_lapack_const(trans), 'N', + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, dplasma_cublas_op(trans), CUBLAS_OP_N, tempmm, tempnn, tempkn, - lalpha, (cuDoubleComplex*)A, lda, - (cuDoubleComplex*)B, ldb, - lbeta, (cuDoubleComplex*)C, ldc ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + &lalpha, A, lda, + B, ldb, + &lbeta, C, ldc ); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/ztrmm_LUT.jdf b/src/ztrmm_LUT.jdf index 50d123f0..e7a5e9fb 100644 --- a/src/ztrmm_LUT.jdf +++ b/src/ztrmm_LUT.jdf @@ -3,15 +3,13 @@ extern "C" %{ * Copyright (c) 2010-2022 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * Copyright (c) 2013 Inria. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" @@ -58,6 +56,7 @@ descA [type = "const parsec_tiled_matrix_t*" hidden = on default = "((dplas ddescB [type = "dplasma_data_collection_t*"] descB [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescB)->dc_original" aligned=ddescB] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] hip_handles_infokey [type = "int" hidden = on default = -1 ] read_A(m, k) [profile = off] @@ -175,14 +174,17 @@ BODY [type=CUDA] int ldc = LDA(ddescB, C); cublasStatus_t status; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( dplasma_lapack_const(trans), 'N', + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, dplasma_cublas_op(trans), CUBLAS_OP_N, tempmm, tempnn, descB->mb, - lalpha, (cuDoubleComplex*)A, lda, - (cuDoubleComplex*)B, ldb, - lbeta, (cuDoubleComplex*)C, ldc ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + &lalpha, A, lda, + B, ldb, + &lbeta, C, ldc ); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/ztrmm_RLN.jdf b/src/ztrmm_RLN.jdf index b0d6e973..e70d3a3c 100644 --- a/src/ztrmm_RLN.jdf +++ b/src/ztrmm_RLN.jdf @@ -3,15 +3,13 @@ extern "C" %{ * Copyright (c) 2010-2022 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * Copyright (c) 2013 Inria. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" @@ -58,6 +56,7 @@ descA [type = "const parsec_tiled_matrix_t*" hidden = on default = "((dplas ddescB [type = "dplasma_data_collection_t*"] descB [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescB)->dc_original" aligned=ddescB] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] hip_handles_infokey [type = "int" hidden = on default = -1 ] read_A(n, k) [profile = off] @@ -175,14 +174,17 @@ BODY [type=CUDA] int ldc = LDA(ddescB, C); cublasStatus_t status; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( 'N', dplasma_lapack_const(trans), + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, CUBLAS_OP_N, dplasma_cublas_op(trans), tempmm, tempnn, tempkn, - lalpha, (cuDoubleComplex*)A, lda, - (cuDoubleComplex*)B, ldb, - lbeta, (cuDoubleComplex*)C, ldc ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + &lalpha, A, lda, + B, ldb, + &lbeta, C, ldc ); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/ztrmm_RLT.jdf b/src/ztrmm_RLT.jdf index 9cdd6dde..64526b70 100644 --- a/src/ztrmm_RLT.jdf +++ b/src/ztrmm_RLT.jdf @@ -3,15 +3,13 @@ extern "C" %{ * Copyright (c) 2010-2022 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * Copyright (c) 2013 Inria. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" @@ -58,6 +56,7 @@ descA [type = "const parsec_tiled_matrix_t*" hidden = on default = "((dplas ddescB [type = "dplasma_data_collection_t*"] descB [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescB)->dc_original" aligned=ddescB] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] hip_handles_infokey [type = "int" hidden = on default = -1 ] read_A(n, k) [profile = off] @@ -175,14 +174,17 @@ BODY [type=CUDA] int ldc = LDA(ddescB, C); cublasStatus_t status; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( 'N', dplasma_lapack_const(trans), + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, CUBLAS_OP_N, dplasma_cublas_op(trans), tempmm, tempnn, descB->mb, - lalpha, (cuDoubleComplex*)A, lda, - (cuDoubleComplex*)B, ldb, - lbeta, (cuDoubleComplex*)C, ldc ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + &lalpha, A, lda, + B, ldb, + &lbeta, C, ldc ); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/ztrmm_RUN.jdf b/src/ztrmm_RUN.jdf index 4c4def99..36eb75cf 100644 --- a/src/ztrmm_RUN.jdf +++ b/src/ztrmm_RUN.jdf @@ -3,15 +3,13 @@ extern "C" %{ * Copyright (c) 2010-2022 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * Copyright (c) 2013 Inria. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" @@ -58,6 +56,7 @@ descA [type = "const parsec_tiled_matrix_t*" hidden = on default = "((dplas ddescB [type = "dplasma_data_collection_t*"] descB [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescB)->dc_original" aligned=ddescB] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] hip_handles_infokey [type = "int" hidden = on default = -1 ] read_A(n, k) [profile = off] @@ -175,14 +174,17 @@ BODY [type=CUDA] int ldc = LDA(ddescB, C); cublasStatus_t status; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( 'N', dplasma_lapack_const(trans), + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, CUBLAS_OP_N, dplasma_cublas_op(trans), tempmm, tempnn, descB->mb, - lalpha, (cuDoubleComplex*)A, lda, - (cuDoubleComplex*)B, ldb, - lbeta, (cuDoubleComplex*)C, ldc ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + &lalpha, A, lda, + B, ldb, + &lbeta, C, ldc ); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/ztrmm_RUT.jdf b/src/ztrmm_RUT.jdf index aab8b5dd..cf99eaec 100644 --- a/src/ztrmm_RUT.jdf +++ b/src/ztrmm_RUT.jdf @@ -3,15 +3,13 @@ extern "C" %{ * Copyright (c) 2010-2022 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * Copyright (c) 2013 Inria. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" @@ -58,6 +56,7 @@ descA [type = "const parsec_tiled_matrix_t*" hidden = on default = "((dplas ddescB [type = "dplasma_data_collection_t*"] descB [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescB)->dc_original" aligned=ddescB] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] hip_handles_infokey [type = "int" hidden = on default = -1 ] read_A(n, k) [profile = off] @@ -177,14 +176,17 @@ BODY [type=CUDA] int ldc = LDA(ddescB, C); cublasStatus_t status; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( 'N', dplasma_lapack_const(trans), + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, CUBLAS_OP_N, dplasma_cublas_op(trans), tempmm, tempnn, tempkn, - lalpha, (cuDoubleComplex*)A, lda, - (cuDoubleComplex*)B, ldb, - lbeta, (cuDoubleComplex*)C, ldc ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + &lalpha, A, lda, + B, ldb, + &lbeta, C, ldc ); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/ztrmm_wrapper.c b/src/ztrmm_wrapper.c index 06afd925..9a3e5de0 100644 --- a/src/ztrmm_wrapper.c +++ b/src/ztrmm_wrapper.c @@ -2,6 +2,7 @@ * Copyright (c) 2010-2022 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * * @precisions normal z -> s d c * @@ -128,13 +129,6 @@ dplasma_ztrmm_New( dplasma_enum_t side, dplasma_enum_t uplo, parsec_trmm = parsec_ztrmm_LLN_new( side, uplo, trans, diag, alpha, ddc_A, ddc_B); -#if defined(DPLASMA_HAVE_HIP) - /* It doesn't cost anything to define these infos if we have HIP but - * don't have GPUs on the current machine, so we do it non-conditionally */ - parsec_trmm->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); -#else - parsec_trmm->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED; -#endif parsec_tp = (parsec_taskpool_t*)parsec_trmm; } else { /* trans =! dplasmaNoTrans */ parsec_ztrmm_LLT_taskpool_t* parsec_trmm; @@ -190,6 +184,27 @@ dplasma_ztrmm_New( dplasma_enum_t side, dplasma_enum_t uplo, } } + /* + * All ztrmm variants declare the same hidden HIP handle key. The taskpool + * structs keep the generated globals in the same prefix, so this mirrors + * the existing common ddescA/ddescB handling below instead of duplicating + * the assignment in each of the eight constructor branches. + */ +#if defined(DPLASMA_HAVE_HIP) + ((parsec_ztrmm_LLN_taskpool_t*)parsec_tp)->_g_hip_handles_infokey = + parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); +#else + ((parsec_ztrmm_LLN_taskpool_t*)parsec_tp)->_g_hip_handles_infokey = + PARSEC_INFO_ID_UNDEFINED; +#endif +#if defined(DPLASMA_HAVE_CUDA) + ((parsec_ztrmm_LLN_taskpool_t*)parsec_tp)->_g_cuda_handles_infokey = + parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES", NULL); +#else + ((parsec_ztrmm_LLN_taskpool_t*)parsec_tp)->_g_cuda_handles_infokey = + PARSEC_INFO_ID_UNDEFINED; +#endif + /* When supporting LAPACK we can't assume both matrixes have the same layout, e.g. LDA. * Therefore, generate types for both. */ diff --git a/src/ztrsm_LLN.jdf b/src/ztrsm_LLN.jdf index 2b1986c4..78f8d73e 100644 --- a/src/ztrsm_LLN.jdf +++ b/src/ztrsm_LLN.jdf @@ -3,17 +3,31 @@ extern "C" %{ * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" +/* Define the different shapes this JDF is using */ +#define A_SHAPE 0 +#define B_SHAPE 1 + +/* Assume the functions on type & type_remote will return parsec_arena_datatype_t */ +#define JDF2C_TYPE_ADT_NOT_INDEX + +/* Include the functions to obtain the parsec_arena_datatype_t */ +#include "dplasmajdf_lapack_dtt.h" +//#define FULL_CONVERSION +#ifdef FULL_CONVERSION +#define ADTT_READ(dM, loc, shape, layout) ADTT_DC(dM, loc, shape, layout) +#else +#define ADTT_READ(dM, loc, shape, layout) ADTT_DC(dM, loc, shape, LAPACK) +#endif + %} side [type = "dplasma_enum_t"] @@ -21,32 +35,95 @@ uplo [type = "dplasma_enum_t"] trans [type = "dplasma_enum_t"] diag [type = "dplasma_enum_t"] alpha [type = "dplasma_complex64_t"] -descA [type = "const parsec_tiled_matrix_t*"] -descB [type = "parsec_tiled_matrix_t*"] +ddescA [type = "dplasma_data_collection_t*"] +descA [type = "const parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescA)->dc_original" aligned=ddescA] + +ddescB [type = "dplasma_data_collection_t*"] +descB [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescB)->dc_original" aligned=ddescB] -hip_handles_infokey [type = "int" hidden = on default = "parsec_info_lookup(&parsec_per_stream_infos, \"DPLASMA::HIP::HANDLES\", NULL)" ] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] +hip_handles_infokey [type = "int" hidden = on default = -1 ] ztrsm(k,n) [ flops = inline_c %{ return FLOPS_ZTRSM(side, CLEAN_MB(descB, k), CLEAN_NB(descB, n)); %}] /* Execution space */ k = 0 .. (descB->mt-1) n = 0 .. (descB->nt-1) +loc_A = %{ return LOC(descA, k,k); %} +loc_B = %{ return LOC(descB, k,n); %} + : descB(k,n) - READ A <- A ztrsm_in_A0(k) + READ A <- A ztrsm_in_A0(k) [ type_remote = %{ return ADTT_DC(ddescA, loc_A, A_SHAPE, TILED); %} ] - RW B <- (k>=1) ? E zgemm(k-1, k, n) - <- (0==k) ? descB(k,n) - -> descB(k,n) + RW B <- (k>=1) ? E zgemm(k-1, k, n) [ type_remote = %{ return ADTT_DC(ddescB, loc_B, B_SHAPE, TILED); %} ] + <- (0==k) ? ddescB(k,n) [ type = %{ return ADTT_READ(ddescB, loc_B, B_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescB, loc_B, B_SHAPE, LAPACK); %} ] + -> ddescB(k,n) [ type = %{ return ADTT_CP(_f_B, ddescB, loc_B, B_SHAPE); %} + type_data = %{ return ADTT_DC(ddescB, loc_B, B_SHAPE, LAPACK); %} ] -> (descB->mt>=(k+2)) ? D zgemm(k, (k+1)..(descB->mt-1), n) +BODY [type=CUDA] +{ + int tempkm = ((k)==(descB->mt-1)) ? (descB->m-(k*descB->mb)) : descB->mb; + int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex lalpha = (k == 0) ? make_cuDoubleComplex(creal(alpha), cimag(alpha)) + : make_cuDoubleComplex(1.0, 0.0); +#else + dplasma_complex64_t lalpha = ((k)==(0)) ? (alpha) : (dplasma_complex64_t)(1.0); +#endif + + cublasStatus_t status; + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZtrsm_v2( handles->cublas_handle, dplasma_cublas_side(side), dplasma_cublas_fill(uplo), + dplasma_cublas_op(trans), dplasma_cublas_diag(diag), + tempkm, tempnn, &lalpha, + A, lda, + B, ldb ); + PARSEC_CUDA_CHECK_ERROR( "cublasZtrsm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ + int tempkm = ((k)==(descB->mt-1)) ? (descB->m-(k*descB->mb)) : descB->mb; + int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex lalpha = {1., 0.}; + if(k == 0) { + lalpha.x = creal(alpha); lalpha.y = cimag(alpha); + } +#else + dplasma_complex64_t lalpha = ((k)==(0)) ? (alpha) : (dplasma_complex64_t)(1.0); +#endif + + hipblasStatus_t status; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZtrsm( handles->hipblas_handle, dplasma_hipblas_side(side), dplasma_hipblas_fill(uplo), dplasma_hipblas_op(trans), dplasma_hipblas_diag(diag), + tempkm, tempnn, &lalpha, + (hipblasDoubleComplex*)A, lda, + (hipblasDoubleComplex*)B, ldb); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZtrsm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + BODY { int tempkm = ((k)==(descB->mt-1)) ? (descB->m-(k*descB->mb)) : descB->mb; int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; dplasma_complex64_t lalpha = ((k)==(0)) ? (alpha) : (dplasma_complex64_t)(1.0); - int lda = BLKLDD( descA, k ); - int ldb = BLKLDD( descB, k ); + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); CORE_ztrsm(side, uplo, trans, diag, tempkm, tempnn, lalpha, @@ -65,9 +142,13 @@ END ztrsm_in_A0(k) [profile = off] k = 0 .. (descB->mt-1) +loc_A = %{ return LOC(descA, k,k); %} + + : descA(k,k) - RW A <- descA(k,k) + READ A <- ddescA(k,k) [ type = %{ return ADTT_READ(ddescA, loc_A, A_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescA, loc_A, A_SHAPE, LAPACK); %} ] -> A ztrsm(k,0..(descB->nt-1)) BODY { @@ -82,13 +163,19 @@ zgemm(k,m,n) [ flops = inline_c%{ return FLOPS_ZGEMM(CLEAN_MB(descB, m), CLEAN_N m = (k+1) .. (descB->mt-1) n = 0 .. (descB->nt-1) +loc_C = %{ return LOC(descA, (m),(k)); %} +loc_D = %{ return LOC(descB, (k),(n)); %} +loc_E = %{ return LOC(descB, m,n); %} + + : descB(m,n) - READ C <- C zgemm_in_A0(k,m) + READ C <- C zgemm_in_A0(k,m) [ type_remote = %{ return ADTT_DC(ddescA, loc_C, A_SHAPE, TILED); %} ] - READ D <- B ztrsm(k, n) + READ D <- B ztrsm(k, n) [ type_remote = %{ return ADTT_DC(ddescB, loc_D, B_SHAPE, TILED); %} ] RW E <- (k>=1) ? E zgemm(k-1, m, n) - <- (0==k) ? descB(m,n) + <- (0==k) ? ddescB(m,n) [ type = %{ return ADTT_READ(ddescB, loc_E, B_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescB, loc_E, B_SHAPE, LAPACK); %} ] -> (m>=(k+2)) ? E zgemm(k+1, m, n) -> ((k+1)==m) ? B ztrsm(m, n) @@ -105,20 +192,22 @@ BODY [type=CUDA] int tempmm = ((m) == (descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; int tempnn = ((n) == (descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; - int lda = BLKLDD( descA, m ); - int ldbk = BLKLDD( descB, k ); - int ldb = BLKLDD( descB, m ); + int lda = LDA(ddescA, C); + int ldbk = LDA(ddescB, D); + int ldb = LDA(ddescB, E); cublasStatus_t status; + dplasma_cuda_handles_t *handles; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( 'N', 'N', + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, tempmm, tempnn, descB->mb, - mzone, (cuDoubleComplex*)C, lda, - (cuDoubleComplex*)D, ldbk, - lalpha, (cuDoubleComplex*)E, ldb ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + &mzone, C, lda, + D, ldbk, + &lalpha, E, ldb ); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -137,9 +226,9 @@ BODY [type=HIP] int tempmm = ((m) == (descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; int tempnn = ((n) == (descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; - int lda = BLKLDD( descA, m ); - int ldbk = BLKLDD( descB, k ); - int ldb = BLKLDD( descB, m ); + int lda = LDA(ddescA, C); + int ldbk = LDA(ddescB, D); + int ldb = LDA(ddescB, E); hipblasStatus_t status; dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); @@ -158,9 +247,9 @@ BODY dplasma_complex64_t lalpha = ((k)==(0)) ? (alpha) : (dplasma_complex64_t)(1.0); int tempmm = ((m) == (descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; int tempnn = ((n) == (descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; - int lda = BLKLDD( descA, m ); - int ldbk = BLKLDD( descB, k ); - int ldb = BLKLDD( descB, m ); + int lda = LDA(ddescA, C); + int ldbk = LDA(ddescB, D); + int ldb = LDA(ddescB, E); CORE_zgemm(dplasmaNoTrans, dplasmaNoTrans, tempmm, tempnn, descB->mb, @@ -180,9 +269,13 @@ zgemm_in_A0(k,m) [profile = off] k = 0 .. (descB->mt-2) m = (k+1) .. (descB->mt-1) +loc_C = %{ return LOC(descA, m,k); %} + + : descA(m,k) - RW C <- descA(m,k) + READ C <- ddescA(m,k) [ type = %{ return ADTT_READ(ddescA, loc_C, A_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescA, loc_C, A_SHAPE, LAPACK); %} ] -> C zgemm(k,m,0..(descB->nt-1)) BODY { diff --git a/src/ztrsm_LLT.jdf b/src/ztrsm_LLT.jdf index 5b851f01..1da3ccb7 100644 --- a/src/ztrsm_LLT.jdf +++ b/src/ztrsm_LLT.jdf @@ -3,17 +3,31 @@ extern "C" %{ * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" +/* Define the different shapes this JDF is using */ +#define A_SHAPE 0 +#define B_SHAPE 1 + +/* Assume the functions on type & type_remote will return parsec_arena_datatype_t */ +#define JDF2C_TYPE_ADT_NOT_INDEX + +/* Include the functions to obtain the parsec_arena_datatype_t */ +#include "dplasmajdf_lapack_dtt.h" +//#define FULL_CONVERSION +#ifdef FULL_CONVERSION +#define ADTT_READ(dM, loc, shape, layout) ADTT_DC(dM, loc, shape, layout) +#else +#define ADTT_READ(dM, loc, shape, layout) ADTT_DC(dM, loc, shape, LAPACK) +#endif + %} side [type = "dplasma_enum_t"] @@ -21,32 +35,95 @@ uplo [type = "dplasma_enum_t"] trans [type = "dplasma_enum_t"] diag [type = "dplasma_enum_t"] alpha [type = "dplasma_complex64_t"] -descA [type = "const parsec_tiled_matrix_t*"] -descB [type = "parsec_tiled_matrix_t*"] +ddescA [type = "dplasma_data_collection_t*"] +descA [type = "const parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescA)->dc_original" aligned=ddescA] + +ddescB [type = "dplasma_data_collection_t*"] +descB [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescB)->dc_original" aligned=ddescB] -hip_handles_infokey [type = "int" hidden = on default = "parsec_info_lookup(&parsec_per_stream_infos, \"DPLASMA::HIP::HANDLES\", NULL)" ] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] +hip_handles_infokey [type = "int" hidden = on default = -1 ] ztrsm(k,n) [ flops = inline_c %{ return FLOPS_ZTRSM(side, CLEAN_MB(descB, k), CLEAN_NB(descB, n)); %}] /* Execution space */ k = 0 .. (descB->mt-1) n = 0 .. (descB->nt-1) +loc_A = %{ return LOC(descA, (descB->mt-1)-k,(descB->mt-1)-k); %} +loc_B = %{ return LOC(descB, (descB->mt-1)-k,n); %} + : descB((descB->mt-1)-k,n) - READ A <- A ztrsm_in_A0(k) + READ A <- A ztrsm_in_A0(k) [ type_remote = %{ return ADTT_DC(ddescA, loc_A, A_SHAPE, TILED); %} ] - RW B <- (0==k) ? descB((descB->mt-1)-k,n) - <- (k>=1) ? E zgemm(k-1, k, n) - -> descB((descB->mt-1)-k,n) + RW B <- (0==k) ? ddescB((descB->mt-1)-k,n) [ type = %{ return ADTT_READ(ddescB, loc_B, B_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescB, loc_B, B_SHAPE, LAPACK); %} ] + <- (k>=1) ? E zgemm(k-1, k, n) [ type_remote = %{ return ADTT_DC(ddescB, loc_B, B_SHAPE, TILED); %} ] + -> ddescB((descB->mt-1)-k,n) [ type = %{ return ADTT_CP(_f_B, ddescB, loc_B, B_SHAPE); %} + type_data = %{ return ADTT_DC(ddescB, loc_B, B_SHAPE, LAPACK); %} ] -> (descB->mt>=(2+k)) ? D zgemm(k, (k+1)..(descB->mt-1), n) +BODY [type=CUDA] +{ + int tempkm = ((k)==(0)) ? (descB->m-((descB->mt-1)*descB->mb)) : descB->mb; + int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex lalpha = (k == 0) ? make_cuDoubleComplex(creal(alpha), cimag(alpha)) + : make_cuDoubleComplex(1.0, 0.0); +#else + dplasma_complex64_t lalpha = ((k)==(0)) ? (alpha) : (dplasma_complex64_t)(1.0); +#endif + + cublasStatus_t status; + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZtrsm_v2( handles->cublas_handle, dplasma_cublas_side(side), dplasma_cublas_fill(uplo), + dplasma_cublas_op(trans), dplasma_cublas_diag(diag), + tempkm, tempnn, &lalpha, + A, lda, + B, ldb ); + PARSEC_CUDA_CHECK_ERROR( "cublasZtrsm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ + int tempkm = ((k)==(0)) ? (descB->m-((descB->mt-1)*descB->mb)) : descB->mb; + int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex lalpha = {1., 0.}; + if(k == 0) { + lalpha.x = creal(alpha); lalpha.y = cimag(alpha); + } +#else + dplasma_complex64_t lalpha = ((k)==(0)) ? (alpha) : (dplasma_complex64_t)(1.0); +#endif + + hipblasStatus_t status; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZtrsm( handles->hipblas_handle, dplasma_hipblas_side(side), dplasma_hipblas_fill(uplo), dplasma_hipblas_op(trans), dplasma_hipblas_diag(diag), + tempkm, tempnn, &lalpha, + (hipblasDoubleComplex*)A, lda, + (hipblasDoubleComplex*)B, ldb); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZtrsm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + BODY { int tempkm = ((k)==(0)) ? (descB->m-((descB->mt-1)*descB->mb)) : descB->mb; int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; dplasma_complex64_t lalpha = ((k)==(0)) ? (alpha) : (dplasma_complex64_t)(1.0); - int lda = BLKLDD( descA, (descB->mt-1)-k ); - int ldb = BLKLDD( descB, (descB->mt-1)-k ); + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); CORE_ztrsm(side, uplo, trans, diag, tempkm, tempnn, lalpha, @@ -65,9 +142,13 @@ END ztrsm_in_A0(k) [profile = off] k = 0 .. (descB->mt-1) +loc_A = %{ return LOC(descA, (descB->mt-1)-k,(descB->mt-1)-k); %} + + : descA((descB->mt-1)-k,(descB->mt-1)-k) - RW A <- descA((descB->mt-1)-k,(descB->mt-1)-k) + READ A <- ddescA((descB->mt-1)-k,(descB->mt-1)-k) [ type = %{ return ADTT_READ(ddescA, loc_A, A_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescA, loc_A, A_SHAPE, LAPACK); %} ] -> A ztrsm(k,0..(descB->nt-1)) BODY { @@ -83,13 +164,19 @@ zgemm(k,m,n) [ flops = inline_c%{ return FLOPS_ZGEMM(descB->mb, CLEAN_NB(descB, m = (k+1) .. (descB->mt-1) n = 0 .. (descB->nt-1) +loc_C = %{ return LOC(descA, (descB->mt-1)-(k),(descB->mt-1)-(m)); %} +loc_D = %{ return LOC(descB, (descB->mt-1)-(k),(n)); %} +loc_E = %{ return LOC(descB, (descB->mt-1)-m,n); %} + + : descB((descB->mt-1)-m,n) - READ C <- C zgemm_in_A0(k,m) + READ C <- C zgemm_in_A0(k,m) [ type_remote = %{ return ADTT_DC(ddescA, loc_C, A_SHAPE, TILED); %} ] - READ D <- B ztrsm(k, n) + READ D <- B ztrsm(k, n) [ type_remote = %{ return ADTT_DC(ddescB, loc_D, B_SHAPE, TILED); %} ] RW E <- (k>=1) ? E zgemm(k-1, m, n) - <- (0==k) ? descB((descB->mt-1)-m,n) + <- (0==k) ? ddescB((descB->mt-1)-m,n) [ type = %{ return ADTT_READ(ddescB, loc_E, B_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescB, loc_E, B_SHAPE, LAPACK); %} ] -> ((1+k)==m) ? B ztrsm(m, n) -> (m>=(k+2)) ? E zgemm(k+1, m, n) @@ -106,20 +193,22 @@ BODY [type=CUDA] int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; int tempkm = ((k)==(0)) ? (descB->m-((descB->mt-1)*descB->mb)) : descB->mb; - int lda = BLKLDD( descA, (descB->mt-1)-k ); - int ldb = BLKLDD( descB, (descB->mt-1)-k ); - int ldbm = BLKLDD( descB, (descB->mt-1)-m ); + int lda = LDA(ddescA, C); + int ldb = LDA(ddescB, D); + int ldbm = LDA(ddescB, E); cublasStatus_t status; + dplasma_cuda_handles_t *handles; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( dplasma_lapack_const(trans), 'N', + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, dplasma_cublas_op(trans), CUBLAS_OP_N, descB->mb, tempnn, tempkm, - mzone, (cuDoubleComplex*)C, lda, - (cuDoubleComplex*)D, ldb, - lalpha, (cuDoubleComplex*)E, ldbm ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + &mzone, C, lda, + D, ldb, + &lalpha, E, ldbm ); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -138,9 +227,9 @@ BODY [type=HIP] int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; int tempkm = ((k)==(0)) ? (descB->m-((descB->mt-1)*descB->mb)) : descB->mb; - int lda = BLKLDD( descA, (descB->mt-1)-k ); - int ldb = BLKLDD( descB, (descB->mt-1)-k ); - int ldbm = BLKLDD( descB, (descB->mt-1)-m ); + int lda = LDA(ddescA, C); + int ldb = LDA(ddescB, D); + int ldbm = LDA(ddescB, E); hipblasStatus_t status; dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); @@ -158,9 +247,9 @@ BODY { int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; int tempkm = ((k)==(0)) ? (descB->m-((descB->mt-1)*descB->mb)) : descB->mb; - int lda = BLKLDD( descA, (descB->mt-1)-k ); - int ldb = BLKLDD( descB, (descB->mt-1)-k ); - int ldbm = BLKLDD( descB, (descB->mt-1)-m ); + int lda = LDA(ddescA, C); + int ldb = LDA(ddescB, D); + int ldbm = LDA(ddescB, E); dplasma_complex64_t lalpha = ((k)==(0)) ? (alpha) : (dplasma_complex64_t)(1.0); CORE_zgemm(trans, dplasmaNoTrans, @@ -182,9 +271,13 @@ zgemm_in_A0(k,m) [profile = off] k = 0 .. (descB->mt-2) m = (k+1) .. (descB->mt-1) +loc_C = %{ return LOC(descA, (descB->mt-1)-k,(descB->mt-1)-m); %} + + : descA((descB->mt-1)-k,(descB->mt-1)-m) - RW C <- descA((descB->mt-1)-k,(descB->mt-1)-m) + READ C <- ddescA((descB->mt-1)-k,(descB->mt-1)-m) [ type = %{ return ADTT_READ(ddescA, loc_C, A_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescA, loc_C, A_SHAPE, LAPACK); %} ] -> C zgemm(k,m,0..(descB->nt-1)) BODY { diff --git a/src/ztrsm_LUN.jdf b/src/ztrsm_LUN.jdf index 1a88fc87..df88a5eb 100644 --- a/src/ztrsm_LUN.jdf +++ b/src/ztrsm_LUN.jdf @@ -3,17 +3,31 @@ extern "C" %{ * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" +/* Define the different shapes this JDF is using */ +#define A_SHAPE 0 +#define B_SHAPE 1 + +/* Assume the functions on type & type_remote will return parsec_arena_datatype_t */ +#define JDF2C_TYPE_ADT_NOT_INDEX + +/* Include the functions to obtain the parsec_arena_datatype_t */ +#include "dplasmajdf_lapack_dtt.h" +//#define FULL_CONVERSION +#ifdef FULL_CONVERSION +#define ADTT_READ(dM, loc, shape, layout) ADTT_DC(dM, loc, shape, layout) +#else +#define ADTT_READ(dM, loc, shape, layout) ADTT_DC(dM, loc, shape, LAPACK) +#endif + %} side [type = "dplasma_enum_t"] @@ -21,32 +35,95 @@ uplo [type = "dplasma_enum_t"] trans [type = "dplasma_enum_t"] diag [type = "dplasma_enum_t"] alpha [type = "dplasma_complex64_t"] -descA [type = "const parsec_tiled_matrix_t*"] -descB [type = "parsec_tiled_matrix_t*"] +ddescA [type = "dplasma_data_collection_t*"] +descA [type = "const parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescA)->dc_original" aligned=ddescA] + +ddescB [type = "dplasma_data_collection_t*"] +descB [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescB)->dc_original" aligned=ddescB] -hip_handles_infokey [type = "int" hidden = on default = "parsec_info_lookup(&parsec_per_stream_infos, \"DPLASMA::HIP::HANDLES\", NULL)" ] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] +hip_handles_infokey [type = "int" hidden = on default = -1 ] ztrsm(k,n) [ flops = inline_c %{ return FLOPS_ZTRSM(side, CLEAN_MB(descB, k), CLEAN_NB(descB, n)); %}] /* Execution Space */ k = 0 .. (descB->mt-1) n = 0 .. (descB->nt-1) +loc_A = %{ return LOC(descA, (descB->mt-1)-k,(descB->mt-1)-k); %} +loc_B = %{ return LOC(descB, (descB->mt-1)-k,n); %} + : descB((descB->mt-1)-k,n) - READ A <- A ztrsm_in_A0(k) + READ A <- A ztrsm_in_A0(k) [ type_remote = %{ return ADTT_DC(ddescA, loc_A, A_SHAPE, TILED); %} ] - RW B <- (0==k) ? descB((descB->mt-1)-k,n) - <- (k>=1) ? E zgemm(k-1, k, n) - -> descB((descB->mt-1)-k,n) + RW B <- (0==k) ? ddescB((descB->mt-1)-k,n) [ type = %{ return ADTT_READ(ddescB, loc_B, B_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescB, loc_B, B_SHAPE, LAPACK); %} ] + <- (k>=1) ? E zgemm(k-1, k, n) [ type_remote = %{ return ADTT_DC(ddescB, loc_B, B_SHAPE, TILED); %} ] + -> ddescB((descB->mt-1)-k,n) [ type = %{ return ADTT_CP(_f_B, ddescB, loc_B, B_SHAPE); %} + type_data = %{ return ADTT_DC(ddescB, loc_B, B_SHAPE, LAPACK); %} ] -> (descB->mt>=(2+k)) ? D zgemm(k, (k+1)..(descB->mt-1), n) +BODY [type=CUDA] +{ + int tempkm = ((k)==(0)) ? (descB->m-((descB->mt-1)*descB->mb)) : descB->mb; + int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex lalpha = (k == 0) ? make_cuDoubleComplex(creal(alpha), cimag(alpha)) + : make_cuDoubleComplex(1.0, 0.0); +#else + dplasma_complex64_t lalpha = ((k)==(0)) ? (alpha) : (dplasma_complex64_t)(1.0); +#endif + + cublasStatus_t status; + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZtrsm_v2( handles->cublas_handle, dplasma_cublas_side(side), dplasma_cublas_fill(uplo), + dplasma_cublas_op(trans), dplasma_cublas_diag(diag), + tempkm, tempnn, &lalpha, + A, lda, + B, ldb ); + PARSEC_CUDA_CHECK_ERROR( "cublasZtrsm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ + int tempkm = ((k)==(0)) ? (descB->m-((descB->mt-1)*descB->mb)) : descB->mb; + int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex lalpha = {1., 0.}; + if(k == 0) { + lalpha.x = creal(alpha); lalpha.y = cimag(alpha); + } +#else + dplasma_complex64_t lalpha = ((k)==(0)) ? (alpha) : (dplasma_complex64_t)(1.0); +#endif + + hipblasStatus_t status; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZtrsm( handles->hipblas_handle, dplasma_hipblas_side(side), dplasma_hipblas_fill(uplo), dplasma_hipblas_op(trans), dplasma_hipblas_diag(diag), + tempkm, tempnn, &lalpha, + (hipblasDoubleComplex*)A, lda, + (hipblasDoubleComplex*)B, ldb); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZtrsm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + BODY { int tempkm = ((k)==(0)) ? (descB->m-((descB->mt-1)*descB->mb)) : descB->mb; int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; dplasma_complex64_t lalpha = ((k)==(0)) ? (alpha) : (dplasma_complex64_t)(1.0); - int lda = BLKLDD( descA, (descB->mt-1)-k ); - int ldb = BLKLDD( descB, (descB->mt-1)-k ); + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); CORE_ztrsm(side, uplo, trans, diag, tempkm, tempnn, lalpha, @@ -65,9 +142,13 @@ END ztrsm_in_A0(k) [profile = off] k = 0 .. (descB->mt-1) +loc_A = %{ return LOC(descA, (descB->mt-1)-k,(descB->mt-1)-k); %} + + : descA((descB->mt-1)-k,(descB->mt-1)-k) - RW A <- descA((descB->mt-1)-k,(descB->mt-1)-k) + READ A <- ddescA((descB->mt-1)-k,(descB->mt-1)-k) [ type = %{ return ADTT_READ(ddescA, loc_A, A_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescA, loc_A, A_SHAPE, LAPACK); %} ] -> A ztrsm(k,0..(descB->nt-1)) BODY { @@ -82,13 +163,19 @@ zgemm(k,m,n) [ flops = inline_c%{ return FLOPS_ZGEMM(descB->mb, CLEAN_NB(descB, m = (k+1) .. (descB->mt-1) n = 0 .. (descB->nt-1) +loc_C = %{ return LOC(descA, (descB->mt-1)-(m),(descB->mt-1)-(k)); %} +loc_D = %{ return LOC(descB, (descB->mt-1)-(k),(n)); %} +loc_E = %{ return LOC(descB, (descB->mt-1)-m,n); %} + + : descB((descB->mt-1)-m,n) - READ C <- C zgemm_in_A0(k,m) + READ C <- C zgemm_in_A0(k,m) [ type_remote = %{ return ADTT_DC(ddescA, loc_C, A_SHAPE, TILED); %} ] - READ D <- B ztrsm(k, n) + READ D <- B ztrsm(k, n) [ type_remote = %{ return ADTT_DC(ddescB, loc_D, B_SHAPE, TILED); %} ] RW E <- (k>=1) ? E zgemm(k-1, m, n) - <- (0==k) ? descB((descB->mt-1)-m,n) + <- (0==k) ? ddescB((descB->mt-1)-m,n) [ type = %{ return ADTT_READ(ddescB, loc_E, B_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescB, loc_E, B_SHAPE, LAPACK); %} ] -> ((1+k)==m) ? B ztrsm(m, n) -> (m>=(k+2)) ? E zgemm(k+1, m, n) @@ -105,20 +192,22 @@ BODY [type=CUDA] int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; int tempkm = ((k)==(0)) ? (descB->m-((descB->mt-1)*descB->mb)) : descB->mb; - int ldam = BLKLDD( descB, (descA.mt-1)-m ); - int ldbm = BLKLDD( descB, (descB->mt-1)-m ); - int ldb = BLKLDD( descB, (descB->mt-1)-k ); + int ldam = LDA(ddescA, C); + int ldbm = LDA(ddescB, E); + int ldb = LDA(ddescB, D); cublasStatus_t status; + dplasma_cuda_handles_t *handles; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( 'N', 'N', + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, descB->mb, tempnn, tempkm, - mzone, (cuDoubleComplex*)C, ldam, - (cuDoubleComplex*)D, ldb, - lalpha, (cuDoubleComplex*)E, ldbm ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + &mzone, C, ldam, + D, ldb, + &lalpha, E, ldbm ); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -137,9 +226,9 @@ BODY [type=HIP] int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; int tempkm = ((k)==(0)) ? (descB->m-((descB->mt-1)*descB->mb)) : descB->mb; - int ldam = BLKLDD( descB, (descA.mt-1)-m ); - int ldbm = BLKLDD( descB, (descB->mt-1)-m ); - int ldb = BLKLDD( descB, (descB->mt-1)-k ); + int ldam = LDA(ddescA, C); + int ldbm = LDA(ddescB, E); + int ldb = LDA(ddescB, D); hipblasStatus_t status; dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); @@ -158,9 +247,9 @@ BODY int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; int tempkm = ((k)==(0)) ? (descB->m-((descB->mt-1)*descB->mb)) : descB->mb; dplasma_complex64_t lalpha = ((k)==(0)) ? (alpha) : (dplasma_complex64_t)(1.0); - int ldam = BLKLDD( descB, (descA->mt-1)-m ); - int ldbm = BLKLDD( descB, (descB->mt-1)-m ); - int ldb = BLKLDD( descB, (descB->mt-1)-k ); + int ldam = LDA(ddescA, C); + int ldbm = LDA(ddescB, E); + int ldb = LDA(ddescB, D); CORE_zgemm(dplasmaNoTrans, dplasmaNoTrans, descB->mb, tempnn, tempkm, @@ -180,13 +269,16 @@ zgemm_in_A0(k,m) [profile = off] k = 0 .. (descB->mt-2) m = (k+1) .. (descB->mt-1) +loc_C = %{ return LOC(descA, (descB->mt-1)-m,(descB->mt-1)-k); %} + + : descA((descB->mt-1)-m,(descB->mt-1)-k) - RW C <- descA((descB->mt-1)-m,(descB->mt-1)-k) + READ C <- ddescA((descB->mt-1)-m,(descB->mt-1)-k) [ type = %{ return ADTT_READ(ddescA, loc_C, A_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescA, loc_C, A_SHAPE, LAPACK); %} ] -> C zgemm(k,m,0..(descB->nt-1)) BODY { /* nothing */ } END - diff --git a/src/ztrsm_LUT.jdf b/src/ztrsm_LUT.jdf index 2cf7da1b..c9891f86 100644 --- a/src/ztrsm_LUT.jdf +++ b/src/ztrsm_LUT.jdf @@ -3,17 +3,31 @@ extern "C" %{ * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" +/* Define the different shapes this JDF is using */ +#define A_SHAPE 0 +#define B_SHAPE 1 + +/* Assume the functions on type & type_remote will return parsec_arena_datatype_t */ +#define JDF2C_TYPE_ADT_NOT_INDEX + +/* Include the functions to obtain the parsec_arena_datatype_t */ +#include "dplasmajdf_lapack_dtt.h" +//#define FULL_CONVERSION +#ifdef FULL_CONVERSION +#define ADTT_READ(dM, loc, shape, layout) ADTT_DC(dM, loc, shape, layout) +#else +#define ADTT_READ(dM, loc, shape, layout) ADTT_DC(dM, loc, shape, LAPACK) +#endif + %} side [type = "dplasma_enum_t"] @@ -21,32 +35,95 @@ uplo [type = "dplasma_enum_t"] trans [type = "dplasma_enum_t"] diag [type = "dplasma_enum_t"] alpha [type = "dplasma_complex64_t"] -descA [type = "const parsec_tiled_matrix_t*"] -descB [type = "parsec_tiled_matrix_t*"] +ddescA [type = "dplasma_data_collection_t*"] +descA [type = "const parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescA)->dc_original" aligned=ddescA] + +ddescB [type = "dplasma_data_collection_t*"] +descB [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescB)->dc_original" aligned=ddescB] -hip_handles_infokey [type = "int" hidden = on default = "parsec_info_lookup(&parsec_per_stream_infos, \"DPLASMA::HIP::HANDLES\", NULL)" ] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] +hip_handles_infokey [type = "int" hidden = on default = -1 ] ztrsm(k,n) [ flops = inline_c %{ return FLOPS_ZTRSM(side, CLEAN_MB(descB, k), CLEAN_NB(descB, n)); %}] /* Execution space */ k = 0 .. (descB->mt-1) n = 0 .. (descB->nt-1) +loc_A = %{ return LOC(descA, k,k); %} +loc_B = %{ return LOC(descB, k,n); %} + : descB(k,n) - READ A <- A ztrsm_in_A0(k) + READ A <- A ztrsm_in_A0(k) [ type_remote = %{ return ADTT_DC(ddescA, loc_A, A_SHAPE, TILED); %} ] - RW B <- (k>=1) ? E zgemm(k-1, k, n) - <- (0==k) ? descB(k,n) - -> descB(k,n) + RW B <- (k>=1) ? E zgemm(k-1, k, n) [ type_remote = %{ return ADTT_DC(ddescB, loc_B, B_SHAPE, TILED); %} ] + <- (0==k) ? ddescB(k,n) [ type = %{ return ADTT_READ(ddescB, loc_B, B_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescB, loc_B, B_SHAPE, LAPACK); %} ] + -> ddescB(k,n) [ type = %{ return ADTT_CP(_f_B, ddescB, loc_B, B_SHAPE); %} + type_data = %{ return ADTT_DC(ddescB, loc_B, B_SHAPE, LAPACK); %} ] -> (descB->mt>=(k+2)) ? D zgemm(k, (k+1)..(descB->mt-1), n) +BODY [type=CUDA] +{ + int tempkm = ((k)==(descB->mt-1)) ? (descB->m-(k*descB->mb)) : descB->mb; + int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex lalpha = (k == 0) ? make_cuDoubleComplex(creal(alpha), cimag(alpha)) + : make_cuDoubleComplex(1.0, 0.0); +#else + dplasma_complex64_t lalpha = ((k)==(0)) ? (alpha) : (dplasma_complex64_t)(1.0); +#endif + + cublasStatus_t status; + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZtrsm_v2( handles->cublas_handle, dplasma_cublas_side(side), dplasma_cublas_fill(uplo), + dplasma_cublas_op(trans), dplasma_cublas_diag(diag), + tempkm, tempnn, &lalpha, + A, lda, + B, ldb ); + PARSEC_CUDA_CHECK_ERROR( "cublasZtrsm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ + int tempkm = ((k)==(descB->mt-1)) ? (descB->m-(k*descB->mb)) : descB->mb; + int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex lalpha = {1., 0.}; + if(k == 0) { + lalpha.x = creal(alpha); lalpha.y = cimag(alpha); + } +#else + dplasma_complex64_t lalpha = ((k)==(0)) ? (alpha) : (dplasma_complex64_t)(1.0); +#endif + + hipblasStatus_t status; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZtrsm( handles->hipblas_handle, dplasma_hipblas_side(side), dplasma_hipblas_fill(uplo), dplasma_hipblas_op(trans), dplasma_hipblas_diag(diag), + tempkm, tempnn, &lalpha, + (hipblasDoubleComplex*)A, lda, + (hipblasDoubleComplex*)B, ldb); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZtrsm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + BODY { int tempkm = ((k)==(descB->mt-1)) ? (descB->m-(k*descB->mb)) : descB->mb; int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; dplasma_complex64_t lalpha = ((k)==(0)) ? (alpha) : (dplasma_complex64_t)(1.0); - int lda = BLKLDD( descA, k ); - int ldb = BLKLDD( descB, k ); + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); CORE_ztrsm(side, uplo, trans, diag, tempkm, tempnn, lalpha, @@ -65,9 +142,13 @@ END ztrsm_in_A0(k) [profile = off] k = 0 .. (descB->mt-1) +loc_A = %{ return LOC(descA, k,k); %} + + : descA(k,k) - RW A <- descA(k,k) + READ A <- ddescA(k,k) [ type = %{ return ADTT_READ(ddescA, loc_A, A_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescA, loc_A, A_SHAPE, LAPACK); %} ] -> A ztrsm(k,0..(descB->nt-1)) BODY { @@ -82,13 +163,19 @@ zgemm(k,m,n) [ flops = inline_c%{ return FLOPS_ZGEMM(CLEAN_MB(descB, m), CLEAN_N m = (k+1) .. (descB->mt-1) n = 0 .. (descB->nt-1) +loc_C = %{ return LOC(descA, (k), (m)); %} +loc_D = %{ return LOC(descB, (k),(n)); %} +loc_E = %{ return LOC(descB, m,n); %} + + : descB(m,n) - READ C <- C zgemm_in_A0(k,m) + READ C <- C zgemm_in_A0(k,m) [ type_remote = %{ return ADTT_DC(ddescA, loc_C, A_SHAPE, TILED); %} ] - READ D <- B ztrsm(k, n) + READ D <- B ztrsm(k, n) [ type_remote = %{ return ADTT_DC(ddescB, loc_D, B_SHAPE, TILED); %} ] RW E <- (k>=1) ? E zgemm(k-1, m, n) - <- (0==k) ? descB(m,n) + <- (0==k) ? ddescB(m,n) [ type = %{ return ADTT_READ(ddescB, loc_E, B_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescB, loc_E, B_SHAPE, LAPACK); %} ] -> (m>=(k+2)) ? E zgemm(k+1, m, n) -> ((k+1)==m) ? B ztrsm(m, n) @@ -105,20 +192,22 @@ BODY [type=CUDA] int tempmm = ((m) == (descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; int tempnn = ((n) == (descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; - int ldak = BLKLDD( descA, k ); - int ldbk = BLKLDD( descB, k ); - int ldb = BLKLDD( descB, m ); + int ldak = LDA(ddescA, C); + int ldbk = LDA(ddescB, D); + int ldb = LDA(ddescB, E); cublasStatus_t status; + dplasma_cuda_handles_t *handles; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( dplasma_lapack_const(trans), 'N', + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, dplasma_cublas_op(trans), CUBLAS_OP_N, tempmm, tempnn, descB->mb, - mzone, (cuDoubleComplex*)C, ldak, - (cuDoubleComplex*)D, ldbk, - lalpha, (cuDoubleComplex*)E, ldb ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + &mzone, C, ldak, + D, ldbk, + &lalpha, E, ldb ); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -137,9 +226,9 @@ BODY [type=HIP] int tempmm = ((m) == (descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; int tempnn = ((n) == (descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; - int ldak = BLKLDD( descA, k ); - int ldbk = BLKLDD( descB, k ); - int ldb = BLKLDD( descB, m ); + int ldak = LDA(ddescA, C); + int ldbk = LDA(ddescB, D); + int ldb = LDA(ddescB, E); hipblasStatus_t status; dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); @@ -158,9 +247,9 @@ BODY dplasma_complex64_t lalpha = ((k)==(0)) ? (alpha) : (dplasma_complex64_t)(1.0); int tempmm = ((m) == (descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; int tempnn = ((n) == (descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; - int ldak = BLKLDD( descA, k ); - int ldbk = BLKLDD( descB, k ); - int ldb = BLKLDD( descB, m ); + int ldak = LDA(ddescA, C); + int ldbk = LDA(ddescB, D); + int ldb = LDA(ddescB, E); CORE_zgemm(trans, dplasmaNoTrans, tempmm, tempnn, descB->mb, @@ -180,9 +269,13 @@ zgemm_in_A0(k,m) [profile = off] k = 0 .. (descB->mt-2) m = (k+1) .. (descB->mt-1) +loc_C = %{ return LOC(descA, k, m); %} + + : descA(k, m) - RW C <- descA(k, m) + READ C <- ddescA(k, m) [ type = %{ return ADTT_READ(ddescA, loc_C, A_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescA, loc_C, A_SHAPE, LAPACK); %} ] -> C zgemm(k,m,0..(descB->nt-1)) BODY { diff --git a/src/ztrsm_RLN.jdf b/src/ztrsm_RLN.jdf index 91674898..cbaf9960 100644 --- a/src/ztrsm_RLN.jdf +++ b/src/ztrsm_RLN.jdf @@ -3,17 +3,31 @@ extern "C" %{ * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" +/* Define the different shapes this JDF is using */ +#define A_SHAPE 0 +#define B_SHAPE 1 + +/* Assume the functions on type & type_remote will return parsec_arena_datatype_t */ +#define JDF2C_TYPE_ADT_NOT_INDEX + +/* Include the functions to obtain the parsec_arena_datatype_t */ +#include "dplasmajdf_lapack_dtt.h" +//#define FULL_CONVERSION +#ifdef FULL_CONVERSION +#define ADTT_READ(dM, loc, shape, layout) ADTT_DC(dM, loc, shape, layout) +#else +#define ADTT_READ(dM, loc, shape, layout) ADTT_DC(dM, loc, shape, LAPACK) +#endif + %} side [type = "dplasma_enum_t"] @@ -21,32 +35,95 @@ uplo [type = "dplasma_enum_t"] trans [type = "dplasma_enum_t"] diag [type = "dplasma_enum_t"] alpha [type = "dplasma_complex64_t"] -descA [type = "const parsec_tiled_matrix_t*"] -descB [type = "parsec_tiled_matrix_t*"] +ddescA [type = "dplasma_data_collection_t*"] +descA [type = "const parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescA)->dc_original" aligned=ddescA] + +ddescB [type = "dplasma_data_collection_t*"] +descB [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescB)->dc_original" aligned=ddescB] -hip_handles_infokey [type = "int" hidden = on default = "parsec_info_lookup(&parsec_per_stream_infos, \"DPLASMA::HIP::HANDLES\", NULL)" ] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] +hip_handles_infokey [type = "int" hidden = on default = -1 ] ztrsm(k,m) [ flops = inline_c %{ return FLOPS_ZTRSM(side, CLEAN_MB(descB, m), CLEAN_NB(descB, k)); %}] /* Execution space */ k = 0 .. (descB->nt-1) m = 0 .. (descB->mt-1) +loc_A = %{ return LOC(descA, (descB->nt-1)-k,(descB->nt-1)-k); %} +loc_B = %{ return LOC(descB, m,(descB->nt-1)-k); %} + : descB(m,(descB->nt-1)-k) - READ A <- A ztrsm_in_A0(k) + READ A <- A ztrsm_in_A0(k) [ type_remote = %{ return ADTT_DC(ddescA, loc_A, A_SHAPE, TILED); %} ] - RW B <- (0==k) ? descB(m,(descB->nt-1)-k) - <- (k>=1) ? E zgemm(k-1, m, k) + RW B <- (0==k) ? ddescB(m,(descB->nt-1)-k) [ type = %{ return ADTT_READ(ddescB, loc_B, B_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescB, loc_B, B_SHAPE, LAPACK); %} ] + <- (k>=1) ? E zgemm(k-1, m, k) [ type_remote = %{ return ADTT_DC(ddescB, loc_B, B_SHAPE, TILED); %} ] -> (descB->nt>=(2+k)) ? C zgemm(k, m, (k+1)..(descB->nt-1)) - -> descB(m,(descB->nt-1)-k) + -> ddescB(m,(descB->nt-1)-k) [ type = %{ return ADTT_CP(_f_B, ddescB, loc_B, B_SHAPE); %} + type_data = %{ return ADTT_DC(ddescB, loc_B, B_SHAPE, LAPACK); %} ] + +BODY [type=CUDA] +{ + int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; + int tempkn = ((k)==(0)) ? (descB->n-((descB->nt-1)*descB->nb)) : descB->nb; + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex lalpha = (k == 0) ? make_cuDoubleComplex(creal(alpha), cimag(alpha)) + : make_cuDoubleComplex(1.0, 0.0); +#else + dplasma_complex64_t lalpha = (k==0) ? alpha : (dplasma_complex64_t)1.0; +#endif + + cublasStatus_t status; + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZtrsm_v2( handles->cublas_handle, dplasma_cublas_side(side), dplasma_cublas_fill(uplo), + dplasma_cublas_op(trans), dplasma_cublas_diag(diag), + tempmm, tempkn, &lalpha, + A, lda, + B, ldb ); + PARSEC_CUDA_CHECK_ERROR( "cublasZtrsm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ + int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; + int tempkn = ((k)==(0)) ? (descB->n-((descB->nt-1)*descB->nb)) : descB->nb; + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex lalpha = {1., 0.}; + if(k == 0) { + lalpha.x = creal(alpha); lalpha.y = cimag(alpha); + } +#else + dplasma_complex64_t lalpha = (k==0) ? alpha : (dplasma_complex64_t)1.0; +#endif + + hipblasStatus_t status; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZtrsm( handles->hipblas_handle, dplasma_hipblas_side(side), dplasma_hipblas_fill(uplo), dplasma_hipblas_op(trans), dplasma_hipblas_diag(diag), + tempmm, tempkn, &lalpha, + (hipblasDoubleComplex*)A, lda, + (hipblasDoubleComplex*)B, ldb); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZtrsm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END BODY { int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; int tempkn = ((k)==(0)) ? (descB->n-((descB->nt-1)*descB->nb)) : descB->nb; dplasma_complex64_t lalpha = (k==0) ? alpha : (dplasma_complex64_t)1.0; - int lda = BLKLDD( descA, (descB->nt-1)-k ); - int ldb = BLKLDD( descB, m ); + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); CORE_ztrsm(side, uplo, trans, diag, tempmm, tempkn, lalpha, @@ -65,9 +142,13 @@ END ztrsm_in_A0(k) [profile = off] k = 0 .. (descB->nt-1) +loc_A = %{ return LOC(descA, (descB->nt-1)-k,(descB->nt-1)-k); %} + + : descA((descB->nt-1)-k,(descB->nt-1)-k) - RW A <- descA((descB->nt-1)-k,(descB->nt-1)-k) + READ A <- ddescA((descB->nt-1)-k,(descB->nt-1)-k) [ type = %{ return ADTT_READ(ddescA, loc_A, A_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescA, loc_A, A_SHAPE, LAPACK); %} ] -> A ztrsm(k,0..(descB->mt-1)) BODY { @@ -82,13 +163,19 @@ zgemm(k,m,n) [ flops = inline_c%{ return FLOPS_ZGEMM(CLEAN_MB(descB, m), descB-> m = 0 .. (descB->mt-1) n = (k+1) .. (descB->nt-1) +loc_D = %{ return LOC(descA, (descB->nt-1)-(k),(descB->nt-1)-(n)); %} +loc_C = %{ return LOC(descB, (m),(descB->nt-1)-(k)); %} +loc_E = %{ return LOC(descB, m,(descB->nt-1)-n); %} + + : descB(m,(descB->nt-1)-n) - READ C <- B ztrsm(k, m) - READ D <- D zgemm_in_A0(k,n) + READ C <- B ztrsm(k, m) [ type_remote = %{ return ADTT_DC(ddescB, loc_C, B_SHAPE, TILED); %} ] + READ D <- D zgemm_in_A0(k,n) [ type_remote = %{ return ADTT_DC(ddescA, loc_D, A_SHAPE, TILED); %} ] RW E <- (k>=1) ? E zgemm(k-1, m, n) - <- (0==k) ? descB(m,(descB->nt-1)-n) + <- (0==k) ? ddescB(m,(descB->nt-1)-n) [ type = %{ return ADTT_READ(ddescB, loc_E, B_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescB, loc_E, B_SHAPE, LAPACK); %} ] -> (n>=(k+2)) ? E zgemm(k+1, m, n) -> ((k+1)==n) ? B ztrsm(n, m) @@ -105,19 +192,22 @@ BODY [type=CUDA] int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; int tempkn = ((k)==(0)) ? (descB->n-((descB->nt-1)*descB->nb)) : descB->nb; - int ldb = BLKLDD( descB, m ); - int lda = BLKLDD( descA, (descB->nt-1)-k ); + int ldc = LDA(ddescB, C); + int lda = LDA(ddescA, D); + int lde = LDA(ddescB, E); cublasStatus_t status; + dplasma_cuda_handles_t *handles; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( 'N', 'N', + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, tempmm, descB->nb, tempkn, - mzone, (cuDoubleComplex*)C, ldb, - (cuDoubleComplex*)D, lda, - lalpha, (cuDoubleComplex*)E, ldb ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + &mzone, C, ldc, + D, lda, + &lalpha, E, lde ); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -136,17 +226,18 @@ BODY [type=HIP] int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; int tempkn = ((k)==(0)) ? (descB->n-((descB->nt-1)*descB->nb)) : descB->nb; - int ldb = BLKLDD( descB, m ); - int lda = BLKLDD( descA, (descB->nt-1)-k ); + int ldc = LDA(ddescB, C); + int lda = LDA(ddescA, D); + int lde = LDA(ddescB, E); hipblasStatus_t status; dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); assert(NULL != handles); status = hipblasZgemm( handles->hipblas_handle, HIPBLAS_OP_N, HIPBLAS_OP_N, tempmm, descB->nb, tempkn, - &mzone, (hipblasDoubleComplex*)C, ldb, + &mzone, (hipblasDoubleComplex*)C, ldc, (hipblasDoubleComplex*)D, lda, - &lalpha, (hipblasDoubleComplex*)E, ldb ); + &lalpha, (hipblasDoubleComplex*)E, lde ); DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -155,15 +246,16 @@ BODY { int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; int tempkn = ((k)==(0)) ? (descB->n-((descB->nt-1)*descB->nb)) : descB->nb; - int ldb = BLKLDD( descB, m ); - int lda = BLKLDD( descA, (descB->nt-1)-k ); + int ldc = LDA(ddescB, C); + int lda = LDA(ddescA, D); + int lde = LDA(ddescB, E); dplasma_complex64_t lalpha = (k==0) ? alpha : (dplasma_complex64_t)1.0; CORE_zgemm(dplasmaNoTrans, dplasmaNoTrans, tempmm, descB->nb, tempkn, - -1.0, C /* descB(m, (descB->nt-1)-k) */, ldb, + -1.0, C /* descB(m, (descB->nt-1)-k) */, ldc, D /* descA((descB->nt-1)-k,(descB->nt-1)-n) */, lda, - lalpha, E /* descB(m, (descB->nt-1)-n) */, ldb ); + lalpha, E /* descB(m, (descB->nt-1)-n) */, lde ); printlog("CORE_zgemm(%d, %d, %d)\n" "\t(dplasmaNoTrans, dplasmaNoTrans, tempmm, descB->nb, tempkn, mzone, descB(%d,%d)[%p], ldb, descA(%d,%d)[%p], lda, lalpha, descB(%d,%d)[%p], ldb)\n", @@ -178,9 +270,13 @@ zgemm_in_A0(k,n) [profile = off] k = 0 .. (descB->nt-2) n = (k+1) .. (descB->nt-1) +loc_D = %{ return LOC(descA, (descB->nt-1)-k,(descB->nt-1)-n); %} + + : descA((descB->nt-1)-k,(descB->nt-1)-n) - RW D <- descA((descB->nt-1)-k,(descB->nt-1)-n) + READ D <- ddescA((descB->nt-1)-k,(descB->nt-1)-n) [ type = %{ return ADTT_READ(ddescA, loc_D, A_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescA, loc_D, A_SHAPE, LAPACK); %} ] -> D zgemm(k,0..(descB->mt-1),n) BODY { diff --git a/src/ztrsm_RLT.jdf b/src/ztrsm_RLT.jdf index 50d1c73f..892c2480 100644 --- a/src/ztrsm_RLT.jdf +++ b/src/ztrsm_RLT.jdf @@ -3,17 +3,31 @@ extern "C" %{ * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" +/* Define the different shapes this JDF is using */ +#define A_SHAPE 0 +#define B_SHAPE 1 + +/* Assume the functions on type & type_remote will return parsec_arena_datatype_t */ +#define JDF2C_TYPE_ADT_NOT_INDEX + +/* Include the functions to obtain the parsec_arena_datatype_t */ +#include "dplasmajdf_lapack_dtt.h" +//#define FULL_CONVERSION +#ifdef FULL_CONVERSION +#define ADTT_READ(dM, loc, shape, layout) ADTT_DC(dM, loc, shape, layout) +#else +#define ADTT_READ(dM, loc, shape, layout) ADTT_DC(dM, loc, shape, LAPACK) +#endif + %} side [type = "dplasma_enum_t"] @@ -21,31 +35,90 @@ uplo [type = "dplasma_enum_t"] trans [type = "dplasma_enum_t"] diag [type = "dplasma_enum_t"] alpha [type = "dplasma_complex64_t"] -descA [type = "const parsec_tiled_matrix_t*"] -descB [type = "parsec_tiled_matrix_t*"] +ddescA [type = "dplasma_data_collection_t*"] +descA [type = "const parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescA)->dc_original" aligned=ddescA] + +ddescB [type = "dplasma_data_collection_t*"] +descB [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescB)->dc_original" aligned=ddescB] -hip_handles_infokey [type = "int" hidden = on default = "parsec_info_lookup(&parsec_per_stream_infos, \"DPLASMA::HIP::HANDLES\", NULL)" ] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] +hip_handles_infokey [type = "int" hidden = on default = -1 ] ztrsm(k,m) [ flops = inline_c %{ return FLOPS_ZTRSM(side, CLEAN_MB(descB, m), CLEAN_NB(descB, k)); %}] /* Execution space */ k = 0 .. (descB->nt-1) m = 0 .. (descB->mt-1) +loc_A = %{ return LOC(descA, k,k); %} +loc_B = %{ return LOC(descB, m,k); %} + : descB(m,k) - READ A <- A ztrsm_in_A0(k) + READ A <- A ztrsm_in_A0(k) [ type_remote = %{ return ADTT_DC(ddescA, loc_A, A_SHAPE, TILED); %} ] - RW B <- (0==k) ? descB(m,k) - <- (k>=1) ? E zgemm(k-1, m, k) + RW B <- (0==k) ? ddescB(m,k) [ type = %{ return ADTT_READ(ddescB, loc_B, B_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescB, loc_B, B_SHAPE, LAPACK); %} ] + <- (k>=1) ? E zgemm(k-1, m, k) [ type_remote = %{ return ADTT_DC(ddescB, loc_B, B_SHAPE, TILED); %} ] -> (descB->nt>=(k+2)) ? C zgemm(k, m, (k+1) .. (descB->nt-1)) - -> descB(m,k) + -> ddescB(m,k) [ type = %{ return ADTT_CP(_f_B, ddescB, loc_B, B_SHAPE); %} + type_data = %{ return ADTT_DC(ddescB, loc_B, B_SHAPE, LAPACK); %} ] + +BODY [type=CUDA] +{ + int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; + int tempkn = ((k)==(descB->nt-1)) ? (descB->n-(k*descB->nb)) : descB->nb; + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha)); +#else + dplasma_complex64_t lalpha = alpha; +#endif + + cublasStatus_t status; + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZtrsm_v2( handles->cublas_handle, dplasma_cublas_side(side), dplasma_cublas_fill(uplo), + dplasma_cublas_op(trans), dplasma_cublas_diag(diag), + tempmm, tempkn, &lalpha, + A, lda, + B, ldb ); + PARSEC_CUDA_CHECK_ERROR( "cublasZtrsm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ + int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; + int tempkn = ((k)==(descB->nt-1)) ? (descB->n-(k*descB->nb)) : descB->nb; + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex lalpha = {creal(alpha), cimag(alpha)}; +#else + dplasma_complex64_t lalpha = alpha; +#endif + + hipblasStatus_t status; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZtrsm( handles->hipblas_handle, dplasma_hipblas_side(side), dplasma_hipblas_fill(uplo), dplasma_hipblas_op(trans), dplasma_hipblas_diag(diag), + tempmm, tempkn, &lalpha, + (hipblasDoubleComplex*)A, lda, + (hipblasDoubleComplex*)B, ldb); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZtrsm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END BODY { int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; int tempkn = ((k)==(descB->nt-1)) ? (descB->n-(k*descB->nb)) : descB->nb; - int lda = BLKLDD( descA, k ); - int ldb = BLKLDD( descB, m ); + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); CORE_ztrsm(side, uplo, trans, diag, tempmm, tempkn, alpha, @@ -64,9 +137,13 @@ END ztrsm_in_A0(k) [profile = off] k = 0 .. (descB->nt-1) +loc_A = %{ return LOC(descA, k,k); %} + + : descA(k,k) - RW A <- descA(k,k) + READ A <- ddescA(k,k) [ type = %{ return ADTT_READ(ddescA, loc_A, A_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescA, loc_A, A_SHAPE, LAPACK); %} ] -> A ztrsm(k,0 .. (descB->mt-1)) BODY { @@ -81,12 +158,18 @@ zgemm(k,m,n) [ flops = inline_c%{ return FLOPS_ZGEMM(CLEAN_MB(descB, m), CLEAN_N m = 0 .. (descB->mt-1) n = (k+1) .. (descB->nt-1) +loc_D = %{ return LOC(descA, (n),(k)); %} +loc_C = %{ return LOC(descB, (m),(k)); %} +loc_E = %{ return LOC(descB, m,n); %} + + : descB(m,n) - READ C <- B ztrsm(k, m) - READ D <- D zgemm_in_A0(k,n) + READ C <- B ztrsm(k, m) [ type_remote = %{ return ADTT_DC(ddescB, loc_C, B_SHAPE, TILED); %} ] + READ D <- D zgemm_in_A0(k,n) [ type_remote = %{ return ADTT_DC(ddescA, loc_D, A_SHAPE, TILED); %} ] - RW E <- (0==k) ? descB(m,n) + RW E <- (0==k) ? ddescB(m,n) [ type = %{ return ADTT_READ(ddescB, loc_E, B_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescB, loc_E, B_SHAPE, LAPACK); %} ] <- (k>=1) ? E zgemm(k-1, m, n) -> ((1+k)==n) ? B ztrsm(n, m) -> (n>=(2+k)) ? E zgemm(k+1, m, n) @@ -104,19 +187,22 @@ BODY [type=CUDA] int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; - int ldb = BLKLDD( descB, m ); - int ldan = BLKLDD( descA, n ); + int ldc = LDA(ddescB, C); + int ldan = LDA(ddescA, D); + int lde = LDA(ddescB, E); cublasStatus_t status; + dplasma_cuda_handles_t *handles; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( 'N', dplasma_lapack_const(trans), + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, CUBLAS_OP_N, dplasma_cublas_op(trans), tempmm, tempnn, descB->mb, - minvalpha, (cuDoubleComplex*)C, ldb, - (cuDoubleComplex*)D, ldan, - zone, (cuDoubleComplex*)E, ldb ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + &minvalpha, C, ldc, + D, ldan, + &zone, E, lde ); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -128,22 +214,23 @@ BODY [type=HIP] hipblasDoubleComplex minvalpha = { hipCreal(cdiv), hipCimag(cdiv) }; #else dplasma_complex64_t zone = 1.; - dplasma_complex64_t minvalpha = ((dplasma_complex64_t)1.0)/alpha; + dplasma_complex64_t minvalpha = ((dplasma_complex64_t)-1.0)/alpha; #endif int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; - int ldb = BLKLDD( descB, m ); - int ldan = BLKLDD( descA, n ); + int ldc = LDA(ddescB, C); + int ldan = LDA(ddescA, D); + int lde = LDA(ddescB, E); hipblasStatus_t status; dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); assert(NULL != handles); status = hipblasZgemm( handles->hipblas_handle, HIPBLAS_OP_N, dplasma_hipblas_op(trans), tempmm, tempnn, descB->mb, - &minvalpha, (hipblasDoubleComplex*)C, ldb, + &minvalpha, (hipblasDoubleComplex*)C, ldc, (hipblasDoubleComplex*)D, ldan, - &zone, (hipblasDoubleComplex*)E, ldb ); + &zone, (hipblasDoubleComplex*)E, lde ); DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -153,14 +240,15 @@ BODY int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; dplasma_complex64_t minvalpha = ((dplasma_complex64_t)-1.000000)/alpha; - int ldb = BLKLDD( descB, m ); - int ldan = BLKLDD( descA, n ); + int ldc = LDA(ddescB, C); + int ldan = LDA(ddescA, D); + int lde = LDA(ddescB, E); CORE_zgemm(dplasmaNoTrans, trans, tempmm, tempnn, descB->mb, - minvalpha, C /* descB(m,k) */, ldb, + minvalpha, C /* descB(m,k) */, ldc, D /* descA(n,k) */, ldan, - 1.0, E /* descB(m,n) */, ldb ); + 1.0, E /* descB(m,n) */, lde ); printlog("CORE_zgemm(%d, %d, %d)\n" "\t(dplasmaNoTrans, trans, tempmm, tempnn, descB->mb, minvalpha, descB(%d,%d)[%p], ldb, descA(%d,%d)[%p], ldan, zone, descB(%d,%d)[%p], ldb)\n", @@ -175,9 +263,13 @@ zgemm_in_A0(k,n) [profile = off] k = 0 .. (descB->nt-2) n = (k+1) .. (descB->nt-1) +loc_D = %{ return LOC(descA, n,k); %} + + : descA(n,k) - RW D <- descA(n,k) + READ D <- ddescA(n,k) [ type = %{ return ADTT_READ(ddescA, loc_D, A_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescA, loc_D, A_SHAPE, LAPACK); %} ] -> D zgemm(k,0 .. (descB->mt-1),n) BODY { diff --git a/src/ztrsm_RUN.jdf b/src/ztrsm_RUN.jdf index 1a4e9211..875b3e23 100644 --- a/src/ztrsm_RUN.jdf +++ b/src/ztrsm_RUN.jdf @@ -3,17 +3,31 @@ extern "C" %{ * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" +/* Define the different shapes this JDF is using */ +#define A_SHAPE 0 +#define B_SHAPE 1 + +/* Assume the functions on type & type_remote will return parsec_arena_datatype_t */ +#define JDF2C_TYPE_ADT_NOT_INDEX + +/* Include the functions to obtain the parsec_arena_datatype_t */ +#include "dplasmajdf_lapack_dtt.h" +//#define FULL_CONVERSION +#ifdef FULL_CONVERSION +#define ADTT_READ(dM, loc, shape, layout) ADTT_DC(dM, loc, shape, layout) +#else +#define ADTT_READ(dM, loc, shape, layout) ADTT_DC(dM, loc, shape, LAPACK) +#endif + %} side [type = "dplasma_enum_t"] @@ -21,32 +35,95 @@ uplo [type = "dplasma_enum_t"] trans [type = "dplasma_enum_t"] diag [type = "dplasma_enum_t"] alpha [type = "dplasma_complex64_t"] -descA [type = "const parsec_tiled_matrix_t*"] -descB [type = "parsec_tiled_matrix_t*"] +ddescA [type = "dplasma_data_collection_t*"] +descA [type = "const parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescA)->dc_original" aligned=ddescA] + +ddescB [type = "dplasma_data_collection_t*"] +descB [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescB)->dc_original" aligned=ddescB] -hip_handles_infokey [type = "int" hidden = on default = "parsec_info_lookup(&parsec_per_stream_infos, \"DPLASMA::HIP::HANDLES\", NULL)" ] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] +hip_handles_infokey [type = "int" hidden = on default = -1 ] ztrsm(k,m) [ flops = inline_c %{ return FLOPS_ZTRSM(side, CLEAN_MB(descB, m), CLEAN_NB(descB, k)); %}] /* Execution space */ k = 0..(descB->nt-1) m = 0..(descB->mt-1) +loc_A = %{ return LOC(descA, k,k); %} +loc_B = %{ return LOC(descB, m,k); %} + : descB(m,k) - READ A <- A ztrsm_in_A0(k) + READ A <- A ztrsm_in_A0(k) [ type_remote = %{ return ADTT_DC(ddescA, loc_A, A_SHAPE, TILED); %} ] - RW B <- (0==k) ? descB(m,k) - <- (k>=1) ? E zgemm(k-1, m, k) + RW B <- (0==k) ? ddescB(m,k) [ type = %{ return ADTT_READ(ddescB, loc_B, B_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescB, loc_B, B_SHAPE, LAPACK); %} ] + <- (k>=1) ? E zgemm(k-1, m, k) [ type_remote = %{ return ADTT_DC(ddescB, loc_B, B_SHAPE, TILED); %} ] -> (descB->nt>=(k+2)) ? C zgemm(k, m, (k+1) .. (descB->nt-1)) - -> descB(m,k) + -> ddescB(m,k) [ type = %{ return ADTT_CP(_f_B, ddescB, loc_B, B_SHAPE); %} + type_data = %{ return ADTT_DC(ddescB, loc_B, B_SHAPE, LAPACK); %} ] + +BODY [type=CUDA] +{ + int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; + int tempkn = ((k)==(descB->nt-1)) ? (descB->n-(k*descB->nb)) : descB->nb; + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex lalpha = (k == 0) ? make_cuDoubleComplex(creal(alpha), cimag(alpha)) + : make_cuDoubleComplex(1.0, 0.0); +#else + dplasma_complex64_t lalpha = ((k)==(0)) ? (alpha) : (dplasma_complex64_t)1.0; +#endif + + cublasStatus_t status; + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZtrsm_v2( handles->cublas_handle, dplasma_cublas_side(side), dplasma_cublas_fill(uplo), + dplasma_cublas_op(trans), dplasma_cublas_diag(diag), + tempmm, tempkn, &lalpha, + A, lda, + B, ldb ); + PARSEC_CUDA_CHECK_ERROR( "cublasZtrsm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ + int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; + int tempkn = ((k)==(descB->nt-1)) ? (descB->n-(k*descB->nb)) : descB->nb; + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex lalpha = {1., 0.}; + if(k == 0) { + lalpha.x = creal(alpha); lalpha.y = cimag(alpha); + } +#else + dplasma_complex64_t lalpha = ((k)==(0)) ? (alpha) : (dplasma_complex64_t)1.0; +#endif + + hipblasStatus_t status; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZtrsm( handles->hipblas_handle, dplasma_hipblas_side(side), dplasma_hipblas_fill(uplo), dplasma_hipblas_op(trans), dplasma_hipblas_diag(diag), + tempmm, tempkn, &lalpha, + (hipblasDoubleComplex*)A, lda, + (hipblasDoubleComplex*)B, ldb); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZtrsm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END BODY { int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; int tempkn = ((k)==(descB->nt-1)) ? (descB->n-(k*descB->nb)) : descB->nb; dplasma_complex64_t lalpha = ((k)==(0)) ? (alpha) : (dplasma_complex64_t)1.0; - int lda = BLKLDD( descA, k ); - int ldb = BLKLDD( descB, m ); + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); CORE_ztrsm(side, uplo, trans, diag, tempmm, tempkn, lalpha, @@ -65,9 +142,13 @@ END ztrsm_in_A0(k) [profile = off] k = 0 .. (descB->nt-1) +loc_A = %{ return LOC(descA, k,k); %} + + : descA(k,k) - RW A <- descA(k,k) + READ A <- ddescA(k,k) [ type = %{ return ADTT_READ(ddescA, loc_A, A_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescA, loc_A, A_SHAPE, LAPACK); %} ] -> A ztrsm(k,0 .. (descB->mt-1)) BODY { @@ -82,12 +163,18 @@ zgemm(k,m,n) [ flops = inline_c%{ return FLOPS_ZGEMM(CLEAN_MB(descB, m), CLEAN_N m = 0 .. (descB->mt-1) n = (k+1) .. (descB->nt-1) +loc_D = %{ return LOC(descA, (k), (n)); %} +loc_C = %{ return LOC(descB, (m),(k)); %} +loc_E = %{ return LOC(descB, m,n); %} + + : descB(m,n) - READ C <- B ztrsm(k, m) - READ D <- D zgemm_in_A0(k,n) + READ C <- B ztrsm(k, m) [ type_remote = %{ return ADTT_DC(ddescB, loc_C, B_SHAPE, TILED); %} ] + READ D <- D zgemm_in_A0(k,n) [ type_remote = %{ return ADTT_DC(ddescA, loc_D, A_SHAPE, TILED); %} ] - RW E <- (0==k) ? descB(m,n) + RW E <- (0==k) ? ddescB(m,n) [ type = %{ return ADTT_READ(ddescB, loc_E, B_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescB, loc_E, B_SHAPE, LAPACK); %} ] <- (k>=1) ? E zgemm(k-1, m, n) -> ((1+k)==n) ? B ztrsm(n, m) -> (n>=(2+k)) ? E zgemm(k+1, m, n) @@ -105,19 +192,22 @@ BODY [type=CUDA] int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; - int ldb = BLKLDD( descB, m ); - int lda = BLKLDD( descA, k ); + int ldc = LDA(ddescB, C); + int lda = LDA(ddescA, D); + int lde = LDA(ddescB, E); cublasStatus_t status; + dplasma_cuda_handles_t *handles; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( 'N', 'N', + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, tempmm, tempnn, descB->mb, - mzone, (cuDoubleComplex*)C, ldb, - (cuDoubleComplex*)D, lda, - lalpha, (cuDoubleComplex*)E, ldb ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + &mzone, C, ldc, + D, lda, + &lalpha, E, lde ); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -136,17 +226,18 @@ BODY [type=HIP] int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; - int ldb = BLKLDD( descB, m ); - int lda = BLKLDD( descA, k ); + int ldc = LDA(ddescB, C); + int lda = LDA(ddescA, D); + int lde = LDA(ddescB, E); hipblasStatus_t status; dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); assert(NULL != handles); status = hipblasZgemm( handles->hipblas_handle, HIPBLAS_OP_N, HIPBLAS_OP_N, tempmm, tempnn, descB->mb, - &mzone, (hipblasDoubleComplex*)C, ldb, + &mzone, (hipblasDoubleComplex*)C, ldc, (hipblasDoubleComplex*)D, lda, - &lalpha, (hipblasDoubleComplex*)E, ldb ); + &lalpha, (hipblasDoubleComplex*)E, lde ); DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -155,16 +246,17 @@ BODY { int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; - int ldb = BLKLDD( descB, m ); - int lda = BLKLDD( descA, k ); + int ldc = LDA(ddescB, C); + int lda = LDA(ddescA, D); + int lde = LDA(ddescB, E); dplasma_complex64_t lalpha = ((k)==(0)) ? (alpha) : (dplasma_complex64_t)1.0; CORE_zgemm(dplasmaNoTrans, dplasmaNoTrans, tempmm, tempnn, descB->mb, - -1.0, C /* descB(m,k) */, ldb, + -1.0, C /* descB(m,k) */, ldc, D /* descA(k,n) */, lda, - lalpha, E /* descB(m,n) */, ldb ); + lalpha, E /* descB(m,n) */, lde ); printlog("CORE_zgemm(%d, %d, %d)\n" "\t(dplasmaNoTrans, dplasmaNoTrans, tempmm, tempnn, descB->mb, mzone, descB(%d,%d)[%p], ldb, descA(%d,%d)[%p], lda, lalpha, descB(%d,%d)[%p], ldb)\n", @@ -179,9 +271,13 @@ zgemm_in_A0(k,n) [profile = off] k = 0 .. (descB->nt-2) n = (k+1) .. (descB->nt-1) +loc_D = %{ return LOC(descA, k, n); %} + + : descA(k, n) - RW D <- descA(k, n) + READ D <- ddescA(k, n) [ type = %{ return ADTT_READ(ddescA, loc_D, A_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescA, loc_D, A_SHAPE, LAPACK); %} ] -> D zgemm(k,0 .. (descB->mt-1),n) BODY { diff --git a/src/ztrsm_RUT.jdf b/src/ztrsm_RUT.jdf index 613791e2..94c28f2f 100644 --- a/src/ztrsm_RUT.jdf +++ b/src/ztrsm_RUT.jdf @@ -3,17 +3,31 @@ extern "C" %{ * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" +/* Define the different shapes this JDF is using */ +#define A_SHAPE 0 +#define B_SHAPE 1 + +/* Assume the functions on type & type_remote will return parsec_arena_datatype_t */ +#define JDF2C_TYPE_ADT_NOT_INDEX + +/* Include the functions to obtain the parsec_arena_datatype_t */ +#include "dplasmajdf_lapack_dtt.h" +//#define FULL_CONVERSION +#ifdef FULL_CONVERSION +#define ADTT_READ(dM, loc, shape, layout) ADTT_DC(dM, loc, shape, layout) +#else +#define ADTT_READ(dM, loc, shape, layout) ADTT_DC(dM, loc, shape, LAPACK) +#endif + %} side [type = "dplasma_enum_t"] @@ -21,31 +35,90 @@ uplo [type = "dplasma_enum_t"] trans [type = "dplasma_enum_t"] diag [type = "dplasma_enum_t"] alpha [type = "dplasma_complex64_t"] -descA [type = "const parsec_tiled_matrix_t*"] -descB [type = "parsec_tiled_matrix_t*"] +ddescA [type = "dplasma_data_collection_t*"] +descA [type = "const parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescA)->dc_original" aligned=ddescA] + +ddescB [type = "dplasma_data_collection_t*"] +descB [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescB)->dc_original" aligned=ddescB] -hip_handles_infokey [type = "int" hidden = on default = "parsec_info_lookup(&parsec_per_stream_infos, \"DPLASMA::HIP::HANDLES\", NULL)" ] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] +hip_handles_infokey [type = "int" hidden = on default = -1 ] ztrsm(k,m) [ flops = inline_c %{ return FLOPS_ZTRSM(side, CLEAN_MB(descB, m), CLEAN_NB(descB, k)); %}] /* Execution space */ k = 0 .. (descB->nt-1) m = 0 .. (descB->mt-1) +loc_A = %{ return LOC(descA, (descB->nt-1)-k,(descB->nt-1)-k); %} +loc_B = %{ return LOC(descB, m,(descB->nt-1)-k); %} + : descB(m,(descB->nt-1)-k) - READ A <- A ztrsm_in_A0(k) + READ A <- A ztrsm_in_A0(k) [ type_remote = %{ return ADTT_DC(ddescA, loc_A, A_SHAPE, TILED); %} ] - RW B <- (0==k) ? descB(m,(descB->nt-1)-k) - <- (k>=1) ? E zgemm(k-1, m, k) + RW B <- (0==k) ? ddescB(m,(descB->nt-1)-k) [ type = %{ return ADTT_READ(ddescB, loc_B, B_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescB, loc_B, B_SHAPE, LAPACK); %} ] + <- (k>=1) ? E zgemm(k-1, m, k) [ type_remote = %{ return ADTT_DC(ddescB, loc_B, B_SHAPE, TILED); %} ] -> (descB->nt>=(2+k)) ? C zgemm(k, m, (k+1)..(descB->nt-1)) - -> descB(m,(descB->nt-1)-k) + -> ddescB(m,(descB->nt-1)-k) [ type = %{ return ADTT_CP(_f_B, ddescB, loc_B, B_SHAPE); %} + type_data = %{ return ADTT_DC(ddescB, loc_B, B_SHAPE, LAPACK); %} ] + +BODY [type=CUDA] +{ + int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; + int tempkn = ((k)==(0)) ? (descB->n-((descB->nt-1)*descB->nb)) : descB->nb; + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha)); +#else + dplasma_complex64_t lalpha = alpha; +#endif + + cublasStatus_t status; + dplasma_cuda_handles_t *handles; + + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZtrsm_v2( handles->cublas_handle, dplasma_cublas_side(side), dplasma_cublas_fill(uplo), + dplasma_cublas_op(trans), dplasma_cublas_diag(diag), + tempmm, tempkn, &lalpha, + A, lda, + B, ldb ); + PARSEC_CUDA_CHECK_ERROR( "cublasZtrsm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ + int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; + int tempkn = ((k)==(0)) ? (descB->n-((descB->nt-1)*descB->nb)) : descB->nb; + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex lalpha = {creal(alpha), cimag(alpha)}; +#else + dplasma_complex64_t lalpha = alpha; +#endif + + hipblasStatus_t status; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZtrsm( handles->hipblas_handle, dplasma_hipblas_side(side), dplasma_hipblas_fill(uplo), dplasma_hipblas_op(trans), dplasma_hipblas_diag(diag), + tempmm, tempkn, &lalpha, + (hipblasDoubleComplex*)A, lda, + (hipblasDoubleComplex*)B, ldb); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZtrsm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END BODY { int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; int tempkn = ((k)==(0)) ? (descB->n-((descB->nt-1)*descB->nb)) : descB->nb; - int lda = BLKLDD( descA, (descB->nt-1)-k ); - int ldb = BLKLDD( descB, m ); + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); CORE_ztrsm(side, uplo, trans, diag, tempmm, tempkn, alpha, @@ -64,9 +137,13 @@ END ztrsm_in_A0(k) [profile = off] k = 0 .. (descB->nt-1) +loc_A = %{ return LOC(descA, (descB->nt-1)-k,(descB->nt-1)-k); %} + + : descA((descB->nt-1)-k,(descB->nt-1)-k) - RW A <- descA((descB->nt-1)-k,(descB->nt-1)-k) + READ A <- ddescA((descB->nt-1)-k,(descB->nt-1)-k) [ type = %{ return ADTT_READ(ddescA, loc_A, A_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescA, loc_A, A_SHAPE, LAPACK); %} ] -> A ztrsm(k,0..(descB->mt-1)) BODY { @@ -81,13 +158,19 @@ zgemm(k,m,n) [ flops = inline_c%{ return FLOPS_ZGEMM(CLEAN_MB(descB, m), descB-> m = 0 .. (descB->mt-1) n = (k+1) .. (descB->nt-1) +loc_D = %{ return LOC(descA, (descB->nt-1)-(n),(descB->nt-1)-(k)); %} +loc_C = %{ return LOC(descB, (m),(descB->nt-1)-(k)); %} +loc_E = %{ return LOC(descB, m,(descB->nt-1)-n); %} + + : descB(m,(descB->nt-1)-n) - READ C <- B ztrsm(k, m) - READ D <- D zgemm_in_A0(k,n) + READ C <- B ztrsm(k, m) [ type_remote = %{ return ADTT_DC(ddescB, loc_C, B_SHAPE, TILED); %} ] + READ D <- D zgemm_in_A0(k,n) [ type_remote = %{ return ADTT_DC(ddescA, loc_D, A_SHAPE, TILED); %} ] RW E <- (k>=1) ? E zgemm(k-1, m, n) - <- (0==k) ? descB(m,(descB->nt-1)-n) + <- (0==k) ? ddescB(m,(descB->nt-1)-n) [ type = %{ return ADTT_READ(ddescB, loc_E, B_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescB, loc_E, B_SHAPE, LAPACK); %} ] -> (n>=(k+2)) ? E zgemm(k+1, m, n) -> ((k+1)==n) ? B ztrsm(n, m) @@ -104,19 +187,22 @@ BODY [type=CUDA] int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; int tempkn = ((k)==(0)) ? (descB->n-((descB->nt-1)*descB->nb)) : descB->nb; - int ldan = BLKLDD( descA, (descB->nt-1)-n ); - int ldb = BLKLDD( descB, m ); + int ldc = LDA(ddescB, C); + int ldan = LDA(ddescA, D); + int lde = LDA(ddescB, E); cublasStatus_t status; + dplasma_cuda_handles_t *handles; - cublasSetKernelStream( parsec_body.stream ); - cublasZgemm( 'N', dplasma_lapack_const(trans), + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); + assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); + status = cublasZgemm_v2( handles->cublas_handle, CUBLAS_OP_N, dplasma_cublas_op(trans), tempmm, descB->nb, tempkn, - minvalpha, (cuDoubleComplex*)C, ldb, - (cuDoubleComplex*)D, ldan, - zone, (cuDoubleComplex*)E, ldb ); - status = cublasGetError(); - PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + &minvalpha, C, ldc, + D, ldan, + &zone, E, lde ); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -128,22 +214,23 @@ BODY [type=HIP] hipblasDoubleComplex minvalpha = { hipCreal(cdiv), hipCimag(cdiv) }; #else dplasma_complex64_t zone = 1.; - dplasma_complex64_t minvalpha = ((dplasma_complex64_t)1.0)/alpha; + dplasma_complex64_t minvalpha = ((dplasma_complex64_t)-1.0)/alpha; #endif int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; int tempkn = ((k)==(0)) ? (descB->n-((descB->nt-1)*descB->nb)) : descB->nb; - int ldan = BLKLDD( descA, (descB->nt-1)-n ); - int ldb = BLKLDD( descB, m ); + int ldc = LDA(ddescB, C); + int ldan = LDA(ddescA, D); + int lde = LDA(ddescB, E); hipblasStatus_t status; dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); assert(NULL != handles); status = hipblasZgemm( handles->hipblas_handle, HIPBLAS_OP_N, dplasma_hipblas_op(trans), tempmm, descB->nb, tempkn, - &minvalpha, (hipblasDoubleComplex*)C, ldb, + &minvalpha, (hipblasDoubleComplex*)C, ldc, (hipblasDoubleComplex*)D, ldan, - &zone, (hipblasDoubleComplex*)E, ldb ); + &zone, (hipblasDoubleComplex*)E, lde ); DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -153,14 +240,15 @@ BODY int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; int tempkn = ((k)==(0)) ? (descB->n-((descB->nt-1)*descB->nb)) : descB->nb; dplasma_complex64_t minvalpha = ((dplasma_complex64_t)-1.000000)/alpha; - int ldan = BLKLDD( descA, (descB->nt-1)-n ); - int ldb = BLKLDD( descB, m ); + int ldc = LDA(ddescB, C); + int ldan = LDA(ddescA, D); + int lde = LDA(ddescB, E); CORE_zgemm(dplasmaNoTrans, trans, tempmm, descB->nb, tempkn, - minvalpha, C /* descB(m, (descB->nt-1)-k) */, ldb, + minvalpha, C /* descB(m, (descB->nt-1)-k) */, ldc, D /* descA((descB->nt-1)-n,(descB->nt-1)-k) */, ldan, - 1.0, E /* descB(m, (descB->nt-1)-n) */, ldb ); + 1.0, E /* descB(m, (descB->nt-1)-n) */, lde ); printlog("CORE_zgemm(%d, %d, %d)\n" "\t(dplasmaNoTrans, trans, tempmm, descB->nb, tempkn, minvalpha, B(%d,%d)[%p], ldb, A(%d,%d)[%p], descA->mb, zone, B(%d,%d)[%p], ldb)\n", @@ -175,9 +263,13 @@ zgemm_in_A0(k,n) [profile = off] k = 0 .. (descB->nt-2) n = (k+1) .. (descB->nt-1) +loc_D = %{ return LOC(descA, (descB->nt-1)-n,(descB->nt-1)-k); %} + + : descA((descB->nt-1)-n,(descB->nt-1)-k) - RW D <- descA((descB->nt-1)-n,(descB->nt-1)-k) + READ D <- ddescA((descB->nt-1)-n,(descB->nt-1)-k) [ type = %{ return ADTT_READ(ddescA, loc_D, A_SHAPE, TILED); %} + type_data = %{ return ADTT_READ(ddescA, loc_D, A_SHAPE, LAPACK); %} ] -> D zgemm(k,0..(descB->mt-1),n) BODY { diff --git a/src/ztrsm_wrapper.c b/src/ztrsm_wrapper.c index 9606aab9..9934d065 100644 --- a/src/ztrsm_wrapper.c +++ b/src/ztrsm_wrapper.c @@ -2,6 +2,7 @@ * Copyright (c) 2010-2022 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. * Copyright (c) 2013 Inria. All rights reserved. * * @precisions normal z -> s d c @@ -10,6 +11,7 @@ #include "dplasma.h" #include "dplasma/types.h" +#include "dplasma/types_lapack.h" #include "dplasmaaux.h" #include "ztrsm_LLN.h" @@ -21,6 +23,8 @@ #include "ztrsm_RUN.h" #include "ztrsm_RUT.h" +#define MAX_SHAPES 2 + /** ******************************************************************************* * @@ -98,31 +102,33 @@ dplasma_ztrsm_New( dplasma_enum_t side, dplasma_enum_t uplo, parsec_tiled_matrix_t *B ) { parsec_taskpool_t *parsec_trsm = NULL; + dplasma_data_collection_t *ddc_A = dplasma_wrap_data_collection((parsec_tiled_matrix_t*)A); + dplasma_data_collection_t *ddc_B = dplasma_wrap_data_collection((parsec_tiled_matrix_t*)B); if ( side == dplasmaLeft ) { if ( uplo == dplasmaLower ) { if ( trans == dplasmaNoTrans ) { parsec_trsm = (parsec_taskpool_t*)parsec_ztrsm_LLN_new( side, uplo, trans, diag, alpha, - A, - B); + ddc_A, + ddc_B); } else { /* trans =! dplasmaNoTrans */ parsec_trsm = (parsec_taskpool_t*)parsec_ztrsm_LLT_new( side, uplo, trans, diag, alpha, - A, - B); + ddc_A, + ddc_B); } } else { /* uplo = dplasmaUpper */ if ( trans == dplasmaNoTrans ) { parsec_trsm = (parsec_taskpool_t*)parsec_ztrsm_LUN_new( side, uplo, trans, diag, alpha, - A, - B); + ddc_A, + ddc_B); } else { /* trans =! dplasmaNoTrans */ parsec_trsm = (parsec_taskpool_t*)parsec_ztrsm_LUT_new( side, uplo, trans, diag, alpha, - A, - B); + ddc_A, + ddc_B); } } } else { /* side == dplasmaRight */ @@ -130,33 +136,56 @@ dplasma_ztrsm_New( dplasma_enum_t side, dplasma_enum_t uplo, if ( trans == dplasmaNoTrans ) { parsec_trsm = (parsec_taskpool_t*)parsec_ztrsm_RLN_new( side, uplo, trans, diag, alpha, - A, - B); + ddc_A, + ddc_B); } else { /* trans =! dplasmaNoTrans */ parsec_trsm = (parsec_taskpool_t*)parsec_ztrsm_RLT_new( side, uplo, trans, diag, alpha, - A, - B); + ddc_A, + ddc_B); } } else { /* uplo = dplasmaUpper */ if ( trans == dplasmaNoTrans ) { parsec_trsm = (parsec_taskpool_t*)parsec_ztrsm_RUN_new( side, uplo, trans, diag, alpha, - A, - B); + ddc_A, + ddc_B); } else { /* trans =! dplasmaNoTrans */ parsec_trsm = (parsec_taskpool_t*)parsec_ztrsm_RUT_new( side, uplo, trans, diag, alpha, - A, - B); + ddc_A, + ddc_B); } } } - dplasma_add2arena_tile( &((parsec_ztrsm_LLN_taskpool_t*)parsec_trsm)->arenas_datatypes[PARSEC_ztrsm_LLN_DEFAULT_ADT_IDX], - A->mb*A->nb*sizeof(dplasma_complex64_t), - PARSEC_ARENA_ALIGNMENT_SSE, - parsec_datatype_double_complex_t, A->mb ); +#if defined(DPLASMA_HAVE_CUDA) + ((parsec_ztrsm_LLN_taskpool_t*)parsec_trsm)->_g_cuda_handles_infokey = + parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES", NULL); +#else + ((parsec_ztrsm_LLN_taskpool_t*)parsec_trsm)->_g_cuda_handles_infokey = + PARSEC_INFO_ID_UNDEFINED; +#endif +#if defined(DPLASMA_HAVE_HIP) + ((parsec_ztrsm_LLN_taskpool_t*)parsec_trsm)->_g_hip_handles_infokey = + parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); +#else + ((parsec_ztrsm_LLN_taskpool_t*)parsec_trsm)->_g_hip_handles_infokey = + PARSEC_INFO_ID_UNDEFINED; +#endif + + int shape = 0; + dplasma_setup_adtt_all_loc(ddc_A, + parsec_datatype_double_complex_t, + PARSEC_MATRIX_FULL, + 1, + &shape); + dplasma_setup_adtt_all_loc(ddc_B, + parsec_datatype_double_complex_t, + PARSEC_MATRIX_FULL, + 1, + &shape); + assert(shape == MAX_SHAPES); return parsec_trsm; } @@ -185,9 +214,16 @@ void dplasma_ztrsm_Destruct( parsec_taskpool_t *tp ) { parsec_ztrsm_LLN_taskpool_t *otrsm = (parsec_ztrsm_LLN_taskpool_t *)tp; + dplasma_data_collection_t *ddc_A = otrsm->_g_ddescA; + dplasma_data_collection_t *ddc_B = otrsm->_g_ddescB; + + dplasma_clean_adtt_all_loc(ddc_A, MAX_SHAPES); + dplasma_clean_adtt_all_loc(ddc_B, MAX_SHAPES); - dplasma_matrix_del2arena( &otrsm->arenas_datatypes[PARSEC_ztrsm_LLN_DEFAULT_ADT_IDX] ); parsec_taskpool_free(tp); + + dplasma_unwrap_data_collection(ddc_A); + dplasma_unwrap_data_collection(ddc_B); } /**