From 27390ec80f06cb12b4cc529cbeac1e5a94f87ebc Mon Sep 17 00:00:00 2001 From: Jeongnim Kim Date: Tue, 10 May 2022 14:23:13 -0700 Subject: [PATCH 01/36] Add syclSolverInverter routines. --- src/Platforms/SYCL/syclBLAS.cpp | 79 +++++++++++ src/Platforms/SYCL/syclBLAS.hpp | 17 +++ src/Platforms/SYCL/syclSolver.hpp | 25 ++++ .../Fermion/syclSolverInverter.hpp | 130 ++++++++++++++++++ .../detail/SYCL/sycl_determinant_helper.cpp | 43 ++++++ .../detail/SYCL/sycl_determinant_helper.hpp | 9 ++ .../tests/test_syclSolverInverter.cpp | 66 +++++++++ 7 files changed, 369 insertions(+) create mode 100644 src/Platforms/SYCL/syclSolver.hpp create mode 100644 src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp create mode 100644 src/QMCWaveFunctions/tests/test_syclSolverInverter.cpp diff --git a/src/Platforms/SYCL/syclBLAS.cpp b/src/Platforms/SYCL/syclBLAS.cpp index 31cc35e89b..b8a9f87028 100644 --- a/src/Platforms/SYCL/syclBLAS.cpp +++ b/src/Platforms/SYCL/syclBLAS.cpp @@ -182,6 +182,85 @@ template sycl::event gemm(sycl::queue& handle, const int ldc, const std::vector& events); +//transpose +template +sycl::event transpose(sycl::queue& q, + const T1* restrict in, + int m, + int lda, + T2* restrict out, + int n, + int ldb, + const std::vector& events) +{ + constexpr size_t tile_size=16; + const size_t m_max=((m+tile_size-1)/tile_size)*tile_size; + const size_t n_max=((n+tile_size-1)/tile_size)*tile_size; + + return q.submit([&](sycl::handler& cgh) { + + cgh.depends_on(events); + sycl::accessor + tile(sycl::range<2>(tile_size,tile_size+1), cgh); + + cgh.parallel_for(sycl::nd_range<2>{{m_max,n_max},{tile_size,tile_size}}, + [=](sycl::nd_item<2> item) { + unsigned x = item.get_global_id(1); + unsigned y = item.get_global_id(0); + unsigned xth=item.get_local_id(1); + unsigned yth=item.get_local_id(0); + + if(x& events); + +template sycl::event transpose(sycl::queue& q, + const double* restrict in, + int m, + int lda, + double* restrict out, + int n, + int ldb, + const std::vector& events); + +//copy_n for mixed precision +template +sycl::event copy_n(sycl::queue &aq, + const T1* restrict VA, + size_t array_size, + T2* restrict VC, + const std::vector& events) +{ + constexpr size_t tile_size=64; + const size_t a_max=((array_size+tile_size-1)/tile_size)*tile_size; + return aq.parallel_for(sycl::range<1>{a_max}, events, [=](sycl::id<1> id) { + if(id(VA[id]); + }); +} + +template sycl::event copy_n(sycl::queue &aq, + const double* restrict VA, + size_t array_size, + float* restrict VC, + const std::vector& events); + } // namespace syclBLAS } // namespace qmcplusplus diff --git a/src/Platforms/SYCL/syclBLAS.hpp b/src/Platforms/SYCL/syclBLAS.hpp index 2952dfa3b3..36ac05e3a3 100644 --- a/src/Platforms/SYCL/syclBLAS.hpp +++ b/src/Platforms/SYCL/syclBLAS.hpp @@ -56,6 +56,23 @@ sycl::event gemm(sycl::queue& handle, const int ldc, const std::vector& events = {}); +template +sycl::event transpose(sycl::queue& q, + const T1* restrict in, + int m, + int lda, + T2* restrict out, + int n, + int ldb, + const std::vector& events = {}); + +template +sycl::event copy_n(sycl::queue &aq, + const T1* restrict VA, + size_t array_size, + T2* restrict VC, + const std::vector& events = {}); + } // namespace syclBLAS } // namespace qmcplusplus diff --git a/src/Platforms/SYCL/syclSolver.hpp b/src/Platforms/SYCL/syclSolver.hpp new file mode 100644 index 0000000000..66bdc781d9 --- /dev/null +++ b/src/Platforms/SYCL/syclSolver.hpp @@ -0,0 +1,25 @@ +////////////////////////////////////////////////////////////////////////////////////// +// This file is distributed under the University of Illinois/NCSA Open Source License. +// See LICENSE file in top directory for details. +// +// Copyright (c) 2022 QMCPACK developers. +// +// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory +// +// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory +////////////////////////////////////////////////////////////////////////////////////// + +#ifndef QMCPLUSPLUS_SYCL_MKL_SOLVER_H +#define QMCPLUSPLUS_SYCL_MKL_SOLVER_H + +#include "oneapi/mkl/lapack.hpp" + +namespace qmcplusplus +{ + namespace syclSolver + { + using namespace oneapi::mkl::lapack; + } +} + +#endif diff --git a/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp b/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp new file mode 100644 index 0000000000..7300c0d6f2 --- /dev/null +++ b/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp @@ -0,0 +1,130 @@ +////////////////////////////////////////////////////////////////////////////////////// +// This file is distributed under the University of Illinois/NCSA Open Source License. +// See LICENSE file in top directory for details. +// +// Copyright (c) 2019 QMCPACK developers. +// +// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory +// +// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory +////////////////////////////////////////////////////////////////////////////////////// + +#ifndef QMCPLUSPLUS_SYCL_MKL_SOLVERINVERTOR_H +#define QMCPLUSPLUS_SYCL_MKL_SOLVERINVERTOR_H + +#include "OhmmsPETE/OhmmsVector.h" +#include "OhmmsPETE/OhmmsMatrix.h" +#include "SYCL/SYCLallocator.hpp" +#include "SYCL/syclBLAS.hpp" +#include "SYCL/syclSolver.hpp" +#include "QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp" + +namespace qmcplusplus +{ +/** implements matrix inversion via cuSolverDN + * @tparam T_FP high precision for matrix inversion, T_FP >= T + */ +template +class syclSolverInverter +{ + /// scratch memory for cusolverDN + Matrix> Mat1_gpu; + /// pivot array + info + Vector> ipiv; + /// workspace + Vector> workspace; + std::int64_t getrf_ws=0; + std::int64_t getri_ws=0; + + /** resize the internal storage + * @param norb number of electrons/orbitals + * @param delay, maximum delay 0(m_queue,norb,norb,norb); + getri_ws=syclSolver::getri_scratchpad_size(m_queue,norb,norb); + workspace.resize(std::max(getrf_ws,getri_ws)); + } + } + +public: + + /** compute the inverse of the transpose of matrix A and its determinant value in log + * when T_FP and TMAT are the same + * @tparam TREAL real type + */ + template::value>> + std::enable_if_t::value> invert_transpose(const Matrix& logdetT, + Matrix& Ainv, + Matrix>& Ainv_gpu, + std::complex& log_value, + sycl::queue& m_queue) + { + const int norb = logdetT.rows(); + resize(norb, m_queue); + + auto c_event = m_queue.memcpy(Mat1_gpu.data(),logdetT.data(),logdetT.size()*sizeof(TMAT)); + auto t_event = syclBLAS::transpose(m_queue,Mat1_gpu.data(),norb,Mat1_gpu.cols(),Ainv_gpu.data(),norb,Ainv_gpu.cols(), {c_event}); + try + { + c_event = syclSolver::getrf(m_queue,norb,norb,Ainv_gpu.data(),norb, ipiv.data(), workspace.data(), getrf_ws, {t_event}); + } + catch(cl::sycl::exception const& ex) { + std::cout << "\t\tCaught synchronous SYCL exception during getrf:\n" + << ex.what() << " status: " << ex.code() << std::endl; + abort(); + } + + log_value = computeLogDet_sycl(m_queue, norb, Ainv_gpu.cols(), Ainv_gpu.data(), ipiv.data(), {c_event}); + + c_event = syclSolver::getri(m_queue,norb,Ainv_gpu.data(),norb,ipiv.data(), workspace.data(), getri_ws); + + m_queue.memcpy(Ainv.data(),Ainv_gpu.data(),Ainv.size()*sizeof(TMAT), {c_event}).wait(); + } + + /** compute the inverse of the transpose of matrix A and its determinant value in log + * when T_FP and TMAT are not the same + * @tparam TREAL real type + */ + template::value>> + std::enable_if_t::value> invert_transpose(const Matrix& logdetT, + Matrix& Ainv, + Matrix>& Ainv_gpu, + std::complex& log_value, + sycl::queue& m_queue) + { + const int norb = logdetT.rows(); + resize(norb, m_queue); + //use Ainv_gpu for transpose + auto c_event = m_queue.memcpy(Ainv_gpu.data(),logdetT.data(),logdetT.size()*sizeof(TMAT)); + //transpose + auto t_event = syclBLAS::transpose(m_queue,Ainv_gpu.data(),norb,Ainv_gpu.cols(),Mat1_gpu.data(),norb,Mat1_gpu.cols(), {c_event}); + + //getrf (LU) -> getri (inverse) + try + { + c_event = syclSolver::getrf(m_queue,norb,norb,Mat1_gpu.data(),norb, ipiv.data(), workspace.data(), getrf_ws, {t_event}); + } + catch(cl::sycl::exception const& ex) { + std::cout << "\t\tCaught synchronous SYCL exception during getrf:\n" + << ex.what() << " status: " << ex.code() << std::endl; + abort(); + } + + log_value = computeLogDet_sycl(m_queue, norb, Mat1_gpu.cols(), Mat1_gpu.data(), ipiv.data(), {c_event}); + + c_event = syclSolver::getri(m_queue,norb,Mat1_gpu.data(),norb,ipiv.data(), workspace.data(), getri_ws); + + t_event = syclBLAS::copy_n(m_queue,Mat1_gpu.data(),Mat1_gpu.size(),Ainv_gpu.data(), {c_event} ); + + m_queue.memcpy(Ainv.data(),Ainv_gpu.data(),Ainv.size()*sizeof(TMAT), {t_event}).wait(); + } +}; +} // namespace qmcplusplus + +#endif // QMCPLUSPLUS_CUSOLVERINVERTOR_H diff --git a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp index 7ce7a9d3dc..84fb6cada0 100644 --- a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp +++ b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp @@ -86,4 +86,47 @@ template sycl::event applyW_stageV_sycl(sycl::queue& aq, const int ndelay, std::complex* restrict V_gpu, const std::complex* restrict Ainv); + + +template +std::complex computeLogDet_sycl(sycl::queue& aq, + int n, + int lda, + const TMAT* restrict a, + const Index_t* restrict pivot, + const std::vector& dependencies) + { + constexpr size_t COLBS=32; + + std::complex result{}; + { + sycl::buffer,1> abuff(&result,{1}); + aq.submit([&](sycl::handler& cgh) { + cgh.depends_on(dependencies); + + size_t n_max=((n+COLBS-1)/COLBS)*COLBS; + sycl::global_ptr A{a}; + sycl::global_ptr Pivot{pivot}; + cgh.parallel_for(sycl::range<1>{n_max}, + sycl::reduction(abuff,cgh,{T{},T{}},std::plus>()), + [=](sycl::id<1> i, auto& sum) + { + std::complex val{}; + //if(i((Pivot[i] == i + 1) ? A[i*lda+i] : -A[i*lda+i])); + if(i(A[i*lda+i])) : std::log(std::complex(-A[i*lda+i])); + sum.combine(val); + }); + }); + } //synchronous + return result; + } + +template std::complex computeLogDet_sycl(sycl::queue& aq, + int n, + int lda, + const double* restrict a, + const std::int64_t* restrict pivot, + const std::vector& dependencies); } // namespace qmcplusplus diff --git a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp index 7baec66af7..4cca93bc83 100644 --- a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp +++ b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp @@ -31,5 +31,14 @@ sycl::event applyW_stageV_sycl(sycl::queue& aq, const int ndelay, T* V_gpu, const T* Ainv); + +template +std::complex computeLogDet_sycl(sycl::queue& aq, + int n, + int lda, + const TMAT* restrict a, + const Index_t* restrict pivot, + const std::vector& dependencies={}); + } #endif diff --git a/src/QMCWaveFunctions/tests/test_syclSolverInverter.cpp b/src/QMCWaveFunctions/tests/test_syclSolverInverter.cpp new file mode 100644 index 0000000000..8ee4b65612 --- /dev/null +++ b/src/QMCWaveFunctions/tests/test_syclSolverInverter.cpp @@ -0,0 +1,66 @@ +////////////////////////////////////////////////////////////////////////////////////// +// This file is distributed under the University of Illinois/NCSA Open Source License. +// See LICENSE file in top directory for details. +// +// Copyright (c) 2021 QMCPACK developers. +// +// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory +// +// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory +////////////////////////////////////////////////////////////////////////////////////// + +#include "catch.hpp" + +#include "Configuration.h" +#include "CPU/SIMD/aligned_allocator.hpp" +#include "QMCWaveFunctions/Fermion/syclSolverInverter.hpp" +#include "QMCWaveFunctions/Fermion/DiracMatrix.h" +#include "Utilities/for_testing/checkMatrix.hpp" +#include "Containers/tests/makeRngSpdMatrix.hpp" + +namespace qmcplusplus +{ + +template +void test_inverse(const std::int64_t N) +{ + sycl::queue m_queue = getSYCLDefaultDeviceDefaultQueue(); + + syclSolverInverter solver; + + Matrix m(N, N); + Matrix m_invT(N, N); + Matrix m_invT_CPU(N, N); + Matrix> m_invGPU; + m_invGPU.resize(N,N); + + std::complex log_value, log_value_cpu; + testing::MakeRngSpdMatrix makeRngSpdMatrix{}; + makeRngSpdMatrix(m); + + solver.invert_transpose(m, m_invT, m_invGPU, log_value, m_queue); + m_queue.wait(); + + DiracMatrix dmat; + dmat.invert_transpose(m, m_invT_CPU, log_value_cpu); + + REQUIRE(log_value == ComplexApprox(log_value_cpu)); + + auto check_matrix_result = checkMatrix(m_invT, m_invT_CPU); + CHECKED_ELSE(check_matrix_result.result) { FAIL(check_matrix_result.result_message); } +} + +TEST_CASE("OmpSYCL syclSolverInverter", "[SYCL]") +{ + const int N = 911; + +#ifdef MIXED_PRECISION + std::cout << "Testing Inverse for mixed precision " << std::endl; + test_inverse(N); +#else + std::cout << "Testing Inverse for double double " << std::endl; + test_inverse(N); +#endif +} + +} // namespace qmcplusplus From ead985717cbead25681c211cab82d3c571f9cf53 Mon Sep 17 00:00:00 2001 From: Jeongnim Kim Date: Tue, 10 May 2022 16:09:14 -0700 Subject: [PATCH 02/36] Merge DelayedUpdateSYCL.h --- .../Fermion/DelayedUpdateSYCL.h | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h b/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h index ee3582e3b5..e453294f98 100644 --- a/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h +++ b/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h @@ -20,6 +20,8 @@ #include "QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp" #include "DiracMatrix.h" #include "PrefetchedRange.h" +#include "syclSolverInverter.hpp" + //#define SYCL_BLOCKING namespace qmcplusplus @@ -52,6 +54,8 @@ class DelayedUpdateSYCL DiracMatrix host_inverter_; + syclSolverInverter sycl_inverter_; + // the range of prefetched_Ainv_rows PrefetchedRange prefetched_range; // Ainv prefetch buffer @@ -102,8 +106,17 @@ class DelayedUpdateSYCL template void invert_transpose(const Matrix& logdetT, Matrix& Ainv, std::complex& log_value) { - host_inverter_.invert_transpose(logdetT, Ainv, log_value); - initializeInv(Ainv); + clearDelayCount(); + + if(true) + { + host_inverter_.invert_transpose(logdetT, Ainv, log_value); + m_queue_.memcpy(Ainv_gpu.data(), Ainv.data(), Ainv.size() * sizeof(T)).wait(); + } + else + { + sycl_inverter_.invert_transpose(logdetT, Ainv, Ainv_gpu, log_value, m_queue_); + } } /** initialize internal objects when Ainv is refreshed @@ -215,11 +228,11 @@ class DelayedUpdateSYCL #ifdef SYCL_BLOCKING syclBLAS::gemm(m_queue_, 'N', 'N', norb, norb, delay_count, -cone, U_gpu.data(), norb, temp_gpu.data(), lda_Binv, - cone, Ainv_gpu.data(), norb, {u_event, temp_v_event}) + cone, Ainv_gpu.data(), norb, {u_event}) .wait(); #else ainv_event_ = syclBLAS::gemm(m_queue_, 'N', 'N', norb, norb, delay_count, -cone, U_gpu.data(), norb, - temp_gpu.data(), lda_Binv, cone, Ainv_gpu.data(), norb, {u_event, temp_v_event}); + temp_gpu.data(), lda_Binv, cone, Ainv_gpu.data(), norb, {u_event}); #endif clearDelayCount(); From 2e3012db8eda7a753501268783ba2ca7e74e8a38 Mon Sep 17 00:00:00 2001 From: Jeongnim Kim Date: Tue, 10 May 2022 17:11:01 -0700 Subject: [PATCH 03/36] Use syclSolverInverter in DelayedUpdateSYCL.h --- CMakeLists.txt | 2 +- src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h | 12 +----------- .../detail/SYCL/sycl_determinant_helper.cpp | 2 +- 3 files changed, 3 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 259f2f5aa1..73253a0315 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -851,7 +851,7 @@ if(ENABLE_SYCL) endif() add_library(SYCL::host INTERFACE IMPORTED) add_library(SYCL::device INTERFACE IMPORTED) - find_package(IntelDPCPP REQUIRED CONFIGS IntelDPCPPConfig-modified.cmake PATHS ${PROJECT_CMAKE}) + find_package(IntelDPCPP REQUIRED CONFIGS IntelDPCPPConfig-modified.cmake PATHS ${PROJECT_CMAKE} NO_DEFAULT_PATH) target_link_libraries(SYCL::host INTERFACE OneAPI::DPCPP-host) target_link_libraries(SYCL::device INTERFACE OneAPI::DPCPP-device) if(TARGET MKL::sycl) diff --git a/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h b/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h index e453294f98..8fdb6b362c 100644 --- a/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h +++ b/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h @@ -52,8 +52,6 @@ class DelayedUpdateSYCL /// current number of delays, increase one for each acceptance, reset to 0 after updating Ainv int delay_count; - DiracMatrix host_inverter_; - syclSolverInverter sycl_inverter_; // the range of prefetched_Ainv_rows @@ -108,15 +106,7 @@ class DelayedUpdateSYCL { clearDelayCount(); - if(true) - { - host_inverter_.invert_transpose(logdetT, Ainv, log_value); - m_queue_.memcpy(Ainv_gpu.data(), Ainv.data(), Ainv.size() * sizeof(T)).wait(); - } - else - { - sycl_inverter_.invert_transpose(logdetT, Ainv, Ainv_gpu, log_value, m_queue_); - } + sycl_inverter_.invert_transpose(logdetT, Ainv, Ainv_gpu, log_value, m_queue_); } /** initialize internal objects when Ainv is refreshed diff --git a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp index 84fb6cada0..2593d34f19 100644 --- a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp +++ b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp @@ -96,7 +96,7 @@ std::complex computeLogDet_sycl(sycl::queue& aq, const Index_t* restrict pivot, const std::vector& dependencies) { - constexpr size_t COLBS=32; + constexpr size_t COLBS=128; std::complex result{}; { From b5fb457d8a3fcbebe41a2624effcb123ff861839 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Tue, 17 May 2022 15:52:05 -0500 Subject: [PATCH 04/36] Fix complex compilation. --- src/Platforms/SYCL/syclBLAS.cpp | 24 +++++++++++++++++++ src/Platforms/SYCL/syclBLAS.hpp | 8 +++---- .../detail/SYCL/sycl_determinant_helper.cpp | 13 +++++++--- .../detail/SYCL/sycl_determinant_helper.hpp | 6 ++--- 4 files changed, 41 insertions(+), 10 deletions(-) diff --git a/src/Platforms/SYCL/syclBLAS.cpp b/src/Platforms/SYCL/syclBLAS.cpp index b8a9f87028..3e0e3396f8 100644 --- a/src/Platforms/SYCL/syclBLAS.cpp +++ b/src/Platforms/SYCL/syclBLAS.cpp @@ -239,6 +239,24 @@ template sycl::event transpose(sycl::queue& q, int ldb, const std::vector& events); +template sycl::event transpose(sycl::queue& q, + const std::complex* restrict in, + int m, + int lda, + std::complex* restrict out, + int n, + int ldb, + const std::vector& events); + +template sycl::event transpose(sycl::queue& q, + const std::complex* restrict in, + int m, + int lda, + std::complex* restrict out, + int n, + int ldb, + const std::vector& events); + //copy_n for mixed precision template sycl::event copy_n(sycl::queue &aq, @@ -261,6 +279,12 @@ template sycl::event copy_n(sycl::queue &aq, float* restrict VC, const std::vector& events); +template sycl::event copy_n(sycl::queue &aq, + const std::complex* restrict VA, + size_t array_size, + std::complex* restrict VC, + const std::vector& events); + } // namespace syclBLAS } // namespace qmcplusplus diff --git a/src/Platforms/SYCL/syclBLAS.hpp b/src/Platforms/SYCL/syclBLAS.hpp index 36ac05e3a3..370c035417 100644 --- a/src/Platforms/SYCL/syclBLAS.hpp +++ b/src/Platforms/SYCL/syclBLAS.hpp @@ -58,19 +58,19 @@ sycl::event gemm(sycl::queue& handle, template sycl::event transpose(sycl::queue& q, - const T1* restrict in, + const T1* in, int m, int lda, - T2* restrict out, + T2* out, int n, int ldb, const std::vector& events = {}); template sycl::event copy_n(sycl::queue &aq, - const T1* restrict VA, + const T1* VA, size_t array_size, - T2* restrict VC, + T2* VC, const std::vector& events = {}); } // namespace syclBLAS diff --git a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp index 2593d34f19..46b534c0ac 100644 --- a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp +++ b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp @@ -88,12 +88,12 @@ template sycl::event applyW_stageV_sycl(sycl::queue& aq, const std::complex* restrict Ainv); -template +template std::complex computeLogDet_sycl(sycl::queue& aq, int n, int lda, const TMAT* restrict a, - const Index_t* restrict pivot, + const INDEX* restrict pivot, const std::vector& dependencies) { constexpr size_t COLBS=128; @@ -106,7 +106,7 @@ std::complex computeLogDet_sycl(sycl::queue& aq, size_t n_max=((n+COLBS-1)/COLBS)*COLBS; sycl::global_ptr A{a}; - sycl::global_ptr Pivot{pivot}; + sycl::global_ptr Pivot{pivot}; cgh.parallel_for(sycl::range<1>{n_max}, sycl::reduction(abuff,cgh,{T{},T{}},std::plus>()), [=](sycl::id<1> i, auto& sum) @@ -129,4 +129,11 @@ template std::complex computeLogDet_sycl(sycl::queue& aq, const double* restrict a, const std::int64_t* restrict pivot, const std::vector& dependencies); + +template std::complex computeLogDet_sycl(sycl::queue& aq, + int n, + int lda, + const std::complex* restrict a, + const std::int64_t* restrict pivot, + const std::vector& dependencies); } // namespace qmcplusplus diff --git a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp index 4cca93bc83..6a7fb06933 100644 --- a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp +++ b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp @@ -32,12 +32,12 @@ sycl::event applyW_stageV_sycl(sycl::queue& aq, T* V_gpu, const T* Ainv); -template +template std::complex computeLogDet_sycl(sycl::queue& aq, int n, int lda, - const TMAT* restrict a, - const Index_t* restrict pivot, + const TMAT* a, + const INDEX* pivot, const std::vector& dependencies={}); } From 64fcaff9e6addb07958d3b4aa2cd16dc34f04439 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Tue, 17 May 2022 16:04:57 -0500 Subject: [PATCH 05/36] Formatting --- src/Platforms/SYCL/syclBLAS.cpp | 126 +++++++++--------- src/Platforms/SYCL/syclBLAS.hpp | 4 +- src/Platforms/SYCL/syclSolver.hpp | 8 +- .../Fermion/DelayedUpdateSYCL.h | 1 - .../Fermion/syclSolverInverter.hpp | 59 ++++---- .../detail/SYCL/sycl_determinant_helper.cpp | 50 ++++--- .../detail/SYCL/sycl_determinant_helper.hpp | 5 +- .../tests/test_syclSolverInverter.cpp | 9 +- 8 files changed, 131 insertions(+), 131 deletions(-) diff --git a/src/Platforms/SYCL/syclBLAS.cpp b/src/Platforms/SYCL/syclBLAS.cpp index 3e0e3396f8..83ee3c6b4d 100644 --- a/src/Platforms/SYCL/syclBLAS.cpp +++ b/src/Platforms/SYCL/syclBLAS.cpp @@ -113,11 +113,11 @@ sycl::event gemm(sycl::queue& handle, const int ldc, const std::vector& events) { - return oneapi::mkl::blas::gemm(handle, convertTransEnum(tA), convertTransEnum(tB), m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, events); + return oneapi::mkl::blas::gemm(handle, convertTransEnum(tA), convertTransEnum(tB), m, n, k, alpha, A, lda, B, ldb, + beta, C, ldc, events); } - template sycl::event gemm(sycl::queue& handle, const char tA, const char tB, @@ -182,7 +182,7 @@ template sycl::event gemm(sycl::queue& handle, const int ldc, const std::vector& events); -//transpose +//transpose template sycl::event transpose(sycl::queue& q, const T1* restrict in, @@ -193,93 +193,93 @@ sycl::event transpose(sycl::queue& q, int ldb, const std::vector& events) { - constexpr size_t tile_size=16; - const size_t m_max=((m+tile_size-1)/tile_size)*tile_size; - const size_t n_max=((n+tile_size-1)/tile_size)*tile_size; - - return q.submit([&](sycl::handler& cgh) { - - cgh.depends_on(events); - sycl::accessor - tile(sycl::range<2>(tile_size,tile_size+1), cgh); + constexpr size_t tile_size = 16; + const size_t m_max = ((m + tile_size - 1) / tile_size) * tile_size; + const size_t n_max = ((n + tile_size - 1) / tile_size) * tile_size; - cgh.parallel_for(sycl::nd_range<2>{{m_max,n_max},{tile_size,tile_size}}, - [=](sycl::nd_item<2> item) { - unsigned x = item.get_global_id(1); - unsigned y = item.get_global_id(0); - unsigned xth=item.get_local_id(1); - unsigned yth=item.get_local_id(0); + return q.submit([&](sycl::handler& cgh) { + cgh.depends_on(events); + sycl::accessor tile(sycl::range<2>(tile_size, + tile_size + 1), + cgh); - if(x{{m_max, n_max}, {tile_size, tile_size}}, [=](sycl::nd_item<2> item) { + unsigned x = item.get_global_id(1); + unsigned y = item.get_global_id(0); + unsigned xth = item.get_local_id(1); + unsigned yth = item.get_local_id(0); - x = item.get_group(0)*tile_size + xth; - y = item.get_group(1)*tile_size + yth; - if(x& events); + const float* restrict in, + int m, + int lda, + double* restrict out, + int n, + int ldb, + const std::vector& events); template sycl::event transpose(sycl::queue& q, - const double* restrict in, - int m, - int lda, - double* restrict out, - int n, - int ldb, - const std::vector& events); + const double* restrict in, + int m, + int lda, + double* restrict out, + int n, + int ldb, + const std::vector& events); template sycl::event transpose(sycl::queue& q, - const std::complex* restrict in, - int m, - int lda, - std::complex* restrict out, - int n, - int ldb, - const std::vector& events); + const std::complex* restrict in, + int m, + int lda, + std::complex* restrict out, + int n, + int ldb, + const std::vector& events); template sycl::event transpose(sycl::queue& q, - const std::complex* restrict in, - int m, - int lda, - std::complex* restrict out, - int n, - int ldb, - const std::vector& events); + const std::complex* restrict in, + int m, + int lda, + std::complex* restrict out, + int n, + int ldb, + const std::vector& events); //copy_n for mixed precision -template -sycl::event copy_n(sycl::queue &aq, +template +sycl::event copy_n(sycl::queue& aq, const T1* restrict VA, size_t array_size, T2* restrict VC, const std::vector& events) { - constexpr size_t tile_size=64; - const size_t a_max=((array_size+tile_size-1)/tile_size)*tile_size; - return aq.parallel_for(sycl::range<1>{a_max}, events, [=](sycl::id<1> id) { - if(id(VA[id]); - }); + constexpr size_t tile_size = 64; + const size_t a_max = ((array_size + tile_size - 1) / tile_size) * tile_size; + return aq.parallel_for(sycl::range<1>{a_max}, events, [=](sycl::id<1> id) { + if (id < array_size) + VC[id] = static_cast(VA[id]); + }); } -template sycl::event copy_n(sycl::queue &aq, +template sycl::event copy_n(sycl::queue& aq, const double* restrict VA, size_t array_size, float* restrict VC, const std::vector& events); -template sycl::event copy_n(sycl::queue &aq, +template sycl::event copy_n(sycl::queue& aq, const std::complex* restrict VA, size_t array_size, std::complex* restrict VC, diff --git a/src/Platforms/SYCL/syclBLAS.hpp b/src/Platforms/SYCL/syclBLAS.hpp index 370c035417..3ecf85dcb1 100644 --- a/src/Platforms/SYCL/syclBLAS.hpp +++ b/src/Platforms/SYCL/syclBLAS.hpp @@ -66,8 +66,8 @@ sycl::event transpose(sycl::queue& q, int ldb, const std::vector& events = {}); -template -sycl::event copy_n(sycl::queue &aq, +template +sycl::event copy_n(sycl::queue& aq, const T1* VA, size_t array_size, T2* VC, diff --git a/src/Platforms/SYCL/syclSolver.hpp b/src/Platforms/SYCL/syclSolver.hpp index 66bdc781d9..fe18f8a7bd 100644 --- a/src/Platforms/SYCL/syclSolver.hpp +++ b/src/Platforms/SYCL/syclSolver.hpp @@ -16,10 +16,10 @@ namespace qmcplusplus { - namespace syclSolver - { - using namespace oneapi::mkl::lapack; - } +namespace syclSolver +{ +using namespace oneapi::mkl::lapack; } +} // namespace qmcplusplus #endif diff --git a/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h b/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h index 8fdb6b362c..aa0cc1eb0f 100644 --- a/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h +++ b/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h @@ -26,7 +26,6 @@ namespace qmcplusplus { - /** implements delayed update on Intel GPU using SYCL * @tparam T base precision for most computation * @tparam T_FP high precision for matrix inversion, T_FP >= T diff --git a/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp b/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp index 7300c0d6f2..16731cf431 100644 --- a/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp +++ b/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp @@ -33,8 +33,8 @@ class syclSolverInverter Vector> ipiv; /// workspace Vector> workspace; - std::int64_t getrf_ws=0; - std::int64_t getri_ws=0; + std::int64_t getrf_ws = 0; + std::int64_t getri_ws = 0; /** resize the internal storage * @param norb number of electrons/orbitals @@ -46,14 +46,13 @@ class syclSolverInverter { Mat1_gpu.resize(norb, norb); ipiv.resize(norb); - getrf_ws=syclSolver::getrf_scratchpad_size(m_queue,norb,norb,norb); - getri_ws=syclSolver::getri_scratchpad_size(m_queue,norb,norb); - workspace.resize(std::max(getrf_ws,getri_ws)); + getrf_ws = syclSolver::getrf_scratchpad_size(m_queue, norb, norb, norb); + getri_ws = syclSolver::getri_scratchpad_size(m_queue, norb, norb); + workspace.resize(std::max(getrf_ws, getri_ws)); } } public: - /** compute the inverse of the transpose of matrix A and its determinant value in log * when T_FP and TMAT are the same * @tparam TREAL real type @@ -68,23 +67,26 @@ class syclSolverInverter const int norb = logdetT.rows(); resize(norb, m_queue); - auto c_event = m_queue.memcpy(Mat1_gpu.data(),logdetT.data(),logdetT.size()*sizeof(TMAT)); - auto t_event = syclBLAS::transpose(m_queue,Mat1_gpu.data(),norb,Mat1_gpu.cols(),Ainv_gpu.data(),norb,Ainv_gpu.cols(), {c_event}); + auto c_event = m_queue.memcpy(Mat1_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT)); + auto t_event = syclBLAS::transpose(m_queue, Mat1_gpu.data(), norb, Mat1_gpu.cols(), Ainv_gpu.data(), norb, + Ainv_gpu.cols(), {c_event}); try { - c_event = syclSolver::getrf(m_queue,norb,norb,Ainv_gpu.data(),norb, ipiv.data(), workspace.data(), getrf_ws, {t_event}); + c_event = syclSolver::getrf(m_queue, norb, norb, Ainv_gpu.data(), norb, ipiv.data(), workspace.data(), getrf_ws, + {t_event}); + } + catch (cl::sycl::exception const& ex) + { + std::cout << "\t\tCaught synchronous SYCL exception during getrf:\n" + << ex.what() << " status: " << ex.code() << std::endl; + abort(); } - catch(cl::sycl::exception const& ex) { - std::cout << "\t\tCaught synchronous SYCL exception during getrf:\n" - << ex.what() << " status: " << ex.code() << std::endl; - abort(); - } log_value = computeLogDet_sycl(m_queue, norb, Ainv_gpu.cols(), Ainv_gpu.data(), ipiv.data(), {c_event}); - c_event = syclSolver::getri(m_queue,norb,Ainv_gpu.data(),norb,ipiv.data(), workspace.data(), getri_ws); + c_event = syclSolver::getri(m_queue, norb, Ainv_gpu.data(), norb, ipiv.data(), workspace.data(), getri_ws); - m_queue.memcpy(Ainv.data(),Ainv_gpu.data(),Ainv.size()*sizeof(TMAT), {c_event}).wait(); + m_queue.memcpy(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), {c_event}).wait(); } /** compute the inverse of the transpose of matrix A and its determinant value in log @@ -101,28 +103,31 @@ class syclSolverInverter const int norb = logdetT.rows(); resize(norb, m_queue); //use Ainv_gpu for transpose - auto c_event = m_queue.memcpy(Ainv_gpu.data(),logdetT.data(),logdetT.size()*sizeof(TMAT)); + auto c_event = m_queue.memcpy(Ainv_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT)); //transpose - auto t_event = syclBLAS::transpose(m_queue,Ainv_gpu.data(),norb,Ainv_gpu.cols(),Mat1_gpu.data(),norb,Mat1_gpu.cols(), {c_event}); + auto t_event = syclBLAS::transpose(m_queue, Ainv_gpu.data(), norb, Ainv_gpu.cols(), Mat1_gpu.data(), norb, + Mat1_gpu.cols(), {c_event}); //getrf (LU) -> getri (inverse) try { - c_event = syclSolver::getrf(m_queue,norb,norb,Mat1_gpu.data(),norb, ipiv.data(), workspace.data(), getrf_ws, {t_event}); + c_event = syclSolver::getrf(m_queue, norb, norb, Mat1_gpu.data(), norb, ipiv.data(), workspace.data(), getrf_ws, + {t_event}); + } + catch (cl::sycl::exception const& ex) + { + std::cout << "\t\tCaught synchronous SYCL exception during getrf:\n" + << ex.what() << " status: " << ex.code() << std::endl; + abort(); } - catch(cl::sycl::exception const& ex) { - std::cout << "\t\tCaught synchronous SYCL exception during getrf:\n" - << ex.what() << " status: " << ex.code() << std::endl; - abort(); - } log_value = computeLogDet_sycl(m_queue, norb, Mat1_gpu.cols(), Mat1_gpu.data(), ipiv.data(), {c_event}); - c_event = syclSolver::getri(m_queue,norb,Mat1_gpu.data(),norb,ipiv.data(), workspace.data(), getri_ws); + c_event = syclSolver::getri(m_queue, norb, Mat1_gpu.data(), norb, ipiv.data(), workspace.data(), getri_ws); - t_event = syclBLAS::copy_n(m_queue,Mat1_gpu.data(),Mat1_gpu.size(),Ainv_gpu.data(), {c_event} ); + t_event = syclBLAS::copy_n(m_queue, Mat1_gpu.data(), Mat1_gpu.size(), Ainv_gpu.data(), {c_event}); - m_queue.memcpy(Ainv.data(),Ainv_gpu.data(),Ainv.size()*sizeof(TMAT), {t_event}).wait(); + m_queue.memcpy(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), {t_event}).wait(); } }; } // namespace qmcplusplus diff --git a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp index 46b534c0ac..20711c1ff4 100644 --- a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp +++ b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp @@ -95,33 +95,31 @@ std::complex computeLogDet_sycl(sycl::queue& aq, const TMAT* restrict a, const INDEX* restrict pivot, const std::vector& dependencies) - { - constexpr size_t COLBS=128; +{ + constexpr size_t COLBS = 128; - std::complex result{}; - { - sycl::buffer,1> abuff(&result,{1}); - aq.submit([&](sycl::handler& cgh) { - cgh.depends_on(dependencies); - - size_t n_max=((n+COLBS-1)/COLBS)*COLBS; - sycl::global_ptr A{a}; - sycl::global_ptr Pivot{pivot}; - cgh.parallel_for(sycl::range<1>{n_max}, - sycl::reduction(abuff,cgh,{T{},T{}},std::plus>()), - [=](sycl::id<1> i, auto& sum) - { - std::complex val{}; - //if(i((Pivot[i] == i + 1) ? A[i*lda+i] : -A[i*lda+i])); - if(i(A[i*lda+i])) : std::log(std::complex(-A[i*lda+i])); - sum.combine(val); - }); - }); - } //synchronous - return result; - } + std::complex result{}; + { + sycl::buffer, 1> abuff(&result, {1}); + aq.submit([&](sycl::handler& cgh) { + cgh.depends_on(dependencies); + + size_t n_max = ((n + COLBS - 1) / COLBS) * COLBS; + sycl::global_ptr A{a}; + sycl::global_ptr Pivot{pivot}; + cgh.parallel_for(sycl::range<1>{n_max}, sycl::reduction(abuff, cgh, {T{}, T{}}, std::plus>()), + [=](sycl::id<1> i, auto& sum) { + std::complex val{}; + //if(i((Pivot[i] == i + 1) ? A[i*lda+i] : -A[i*lda+i])); + if (i < n) + val = (Pivot[i] == i + 1) ? std::log(std::complex(A[i * lda + i])) + : std::log(std::complex(-A[i * lda + i])); + sum.combine(val); + }); + }); + } //synchronous + return result; +} template std::complex computeLogDet_sycl(sycl::queue& aq, int n, diff --git a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp index 6a7fb06933..23bcaa774a 100644 --- a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp +++ b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp @@ -20,7 +20,6 @@ namespace qmcplusplus { - template sycl::event applyW_stageV_sycl(sycl::queue& aq, const std::vector& dependencies, @@ -38,7 +37,7 @@ std::complex computeLogDet_sycl(sycl::queue& aq, int lda, const TMAT* a, const INDEX* pivot, - const std::vector& dependencies={}); + const std::vector& dependencies = {}); -} +} // namespace qmcplusplus #endif diff --git a/src/QMCWaveFunctions/tests/test_syclSolverInverter.cpp b/src/QMCWaveFunctions/tests/test_syclSolverInverter.cpp index 8ee4b65612..0c6ecfcc20 100644 --- a/src/QMCWaveFunctions/tests/test_syclSolverInverter.cpp +++ b/src/QMCWaveFunctions/tests/test_syclSolverInverter.cpp @@ -20,7 +20,6 @@ namespace qmcplusplus { - template void test_inverse(const std::int64_t N) { @@ -32,7 +31,7 @@ void test_inverse(const std::int64_t N) Matrix m_invT(N, N); Matrix m_invT_CPU(N, N); Matrix> m_invGPU; - m_invGPU.resize(N,N); + m_invGPU.resize(N, N); std::complex log_value, log_value_cpu; testing::MakeRngSpdMatrix makeRngSpdMatrix{}; @@ -52,14 +51,14 @@ void test_inverse(const std::int64_t N) TEST_CASE("OmpSYCL syclSolverInverter", "[SYCL]") { - const int N = 911; + const int N = 911; #ifdef MIXED_PRECISION std::cout << "Testing Inverse for mixed precision " << std::endl; - test_inverse(N); + test_inverse(N); #else std::cout << "Testing Inverse for double double " << std::endl; - test_inverse(N); + test_inverse(N); #endif } From c6ee3641fc9ad3454b7b50b7c72c8241da18064a Mon Sep 17 00:00:00 2001 From: Jeongnim Kim Date: Thu, 9 Jun 2022 16:02:25 -0700 Subject: [PATCH 06/36] Update with interop and add waits. --- src/Platforms/SYCL/SYCLDeviceManager.cpp | 134 ++++-------------- src/Platforms/SYCL/SYCLDeviceManager.h | 2 + .../Fermion/syclSolverInverter.hpp | 14 +- 3 files changed, 37 insertions(+), 113 deletions(-) diff --git a/src/Platforms/SYCL/SYCLDeviceManager.cpp b/src/Platforms/SYCL/SYCLDeviceManager.cpp index e9478451b8..6372660deb 100644 --- a/src/Platforms/SYCL/SYCLDeviceManager.cpp +++ b/src/Platforms/SYCL/SYCLDeviceManager.cpp @@ -17,36 +17,44 @@ #include #include #include "config.h" -#include "OutputManager.h" -#include "determineDefaultDeviceNum.h" -#if defined(_OPENMP) -#include -#include -#include -#include -#endif - +#include "Platforms/Host/OutputManager.h" +#include "Platforms/Host/determineDefaultDeviceNum.h" namespace qmcplusplus { -#if defined(_OPENMP) -/** create SYCL device/contexts from OpenMP owned ones to ensure interoperability. - * CUDA has the notion of primary context while SYCL requires explicitly sharing context. - */ -static std::vector xomp_get_sycl_devices(); -#endif + +std::unique_ptr SYCLDeviceManager::default_device_queue; SYCLDeviceManager::SYCLDeviceManager(int& default_device_num, int& num_devices, int local_rank, int local_size) : sycl_default_device_num(-1) { +#if defined(ENABLE_OFFLOAD) + const int sycl_device_count=omp_get_num_devices(); + sycl_default_device_num = determineDefaultDeviceNum(sycl_device_count, local_rank, local_size); + if (default_device_num < 0) + default_device_num = sycl_default_device_num; + + visible_devices.resize(sycl_device_count); + for (int id = 0; id < sycl_device_count; id++) + { + omp_interop_t interop; +#pragma omp interop device(id) init(prefer_type("sycl"),targetsync: interop) + if(id == sycl_default_device_num) + { + int result; + default_device_queue.reset(static_cast (omp_get_interop_ptr(interop, omp_ipr_targetsync, &result))); + if(result != omp_irc_success) + throw std::runtime_error("SYCLDeviceManager::SYCLDeviceManager fail to obtain sycl::queue by interop"); + } + visible_devices[id].interop=interop; + } + +#else // Potentially multiple GPU platform. std::vector platforms = sycl::platform::get_platforms(); if (platforms.empty()) throw std::runtime_error("Cannot find SYCL platforms!"); - // find out devices from the first platform with GPUs. -#if defined(ENABLE_OFFLOAD) - visible_devices = xomp_get_sycl_devices(); -#else + // find out devices from the first platform with GPUs. std::vector devices; app_log() << "Visible SYCL platforms are :" << std::endl; for (auto& platform : platforms) @@ -67,7 +75,6 @@ SYCLDeviceManager::SYCLDeviceManager(int& default_device_num, int& num_devices, visible_devices.reserve(devices.size()); for (int id = 0; id < devices.size(); id++) visible_devices.push_back({sycl::context(devices[id]), devices[id]}); -#endif const size_t sycl_device_count = visible_devices.size(); if (num_devices == 0) @@ -85,14 +92,12 @@ SYCLDeviceManager::SYCLDeviceManager(int& default_device_num, int& num_devices, default_device_num = sycl_default_device_num; else if (default_device_num != sycl_default_device_num) throw std::runtime_error("Inconsistent assigned SYCL devices with the previous record!"); - default_device_queue = std::make_unique(visible_devices[sycl_default_device_num].context, visible_devices[sycl_default_device_num].device); } +#endif } -std::unique_ptr SYCLDeviceManager::default_device_queue; - sycl::queue& SYCLDeviceManager::getDefaultDeviceQueue() { if (!default_device_queue) @@ -100,87 +105,4 @@ sycl::queue& SYCLDeviceManager::getDefaultDeviceQueue() return *default_device_queue; } -#if defined(_OPENMP) -static std::vector xomp_get_sycl_devices() -{ - enum class Backend - { - UNKNOWN, - LEVEL_ZERO, - OPENCL - }; - - const auto num_omp_devices = omp_get_num_devices(); - Backend selected_backend = Backend::UNKNOWN; - std::vector devices(num_omp_devices); - for (int id = 0; id < num_omp_devices; id++) - { - omp_interop_t o = 0; - Backend my_backend = Backend::UNKNOWN; -#pragma omp interop init(prefer_type("sycl"), targetsync : o) device(id) - int err = -1; - - const std::string omp_backend(omp_get_interop_str(o, omp_ipr_fr_name, &err)); - assert(err >= 0 && "omp_get_interop_str(omp_ipr_fr_name)"); - - if (omp_backend.find("level_zero") == 0) - { - my_backend = Backend::LEVEL_ZERO; - - auto hPlatform = omp_get_interop_ptr(o, omp_ipr_platform, &err); - assert(err >= 0 && "omp_get_interop_ptr(omp_ipr_platform)"); - auto hContext = omp_get_interop_ptr(o, omp_ipr_device_context, &err); - assert(err >= 0 && "omp_get_interop_ptr(omp_ipr_device_context)"); - auto hDevice = omp_get_interop_ptr(o, omp_ipr_device, &err); - assert(err >= 0 && "omp_get_interop_ptr(omp_ipr_device)"); - - const sycl::platform sycl_platform = - sycl::ext::oneapi::level_zero::make_platform(reinterpret_cast(hPlatform)); - devices[id].device = - sycl::ext::oneapi::level_zero::make_device(sycl_platform, reinterpret_cast(hDevice)); - - devices[id].context = sycl::ext::oneapi::level_zero::make_context({devices[id].device}, - reinterpret_cast(hContext), - true /* keep the ownership, no transfer */); - } - else if (omp_backend.find("opencl") == 0) - { - my_backend = Backend::OPENCL; - /* - auto hContext = omp_get_interop_ptr(o, omp_ipr_device_context, &err); - assert (err >= 0 && "omp_get_interop_ptr(omp_ipr_device_context)"); - auto hDevice = omp_get_interop_ptr(o, omp_ipr_device, &err); - assert (err >= 0 && "omp_get_interop_ptr(omp_ipr_device)"); - devices[id].device = sycl::make_device(static_cast(hDevice)); - devices[id].context = sycl::make_context(static_cast(hContext)); -*/ - } - -#pragma omp interop destroy(o) - - if (selected_backend == Backend::UNKNOWN) - selected_backend = my_backend; - else if (selected_backend != my_backend) - throw std::runtime_error("Inconsistent backends detected among OpenMP devices."); - } - - if (devices.size() > 0) - switch (selected_backend) - { - case Backend::LEVEL_ZERO: - app_log() << "SYCL adopts the Level Zero backend chosen by OpenMP." << std::endl; - break; - case Backend::OPENCL: - throw std::runtime_error("OpenMP has chosen the OpenCL backend. " - "We have not yet worked out its interoperability with SYCL. " - "Please set the Level Zero backend in OpenMP!"); - break; - default: - throw std::runtime_error("Failed in extracting OpenMP backend supported by SYCL."); - } - - return devices; -} -#endif - } // namespace qmcplusplus diff --git a/src/Platforms/SYCL/SYCLDeviceManager.h b/src/Platforms/SYCL/SYCLDeviceManager.h index 3b5dff0c33..7055c13c3a 100644 --- a/src/Platforms/SYCL/SYCLDeviceManager.h +++ b/src/Platforms/SYCL/SYCLDeviceManager.h @@ -18,6 +18,7 @@ #include #include #include +#include namespace qmcplusplus { @@ -25,6 +26,7 @@ struct syclDeviceInfo { sycl::context context; sycl::device device; + omp_interop_t interop; }; /** SYCL device manager diff --git a/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp b/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp index 16731cf431..694ed9a2e3 100644 --- a/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp +++ b/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp @@ -30,7 +30,7 @@ class syclSolverInverter /// scratch memory for cusolverDN Matrix> Mat1_gpu; /// pivot array + info - Vector> ipiv; + Vector> ipiv; /// workspace Vector> workspace; std::int64_t getrf_ws = 0; @@ -72,8 +72,8 @@ class syclSolverInverter Ainv_gpu.cols(), {c_event}); try { - c_event = syclSolver::getrf(m_queue, norb, norb, Ainv_gpu.data(), norb, ipiv.data(), workspace.data(), getrf_ws, - {t_event}); + syclSolver::getrf(m_queue, norb, norb, Ainv_gpu.data(), norb, ipiv.data(), workspace.data(), getrf_ws, + {t_event}).wait(); } catch (cl::sycl::exception const& ex) { @@ -82,7 +82,7 @@ class syclSolverInverter abort(); } - log_value = computeLogDet_sycl(m_queue, norb, Ainv_gpu.cols(), Ainv_gpu.data(), ipiv.data(), {c_event}); + log_value = computeLogDet_sycl(m_queue, norb, Ainv_gpu.cols(), Ainv_gpu.data(), ipiv.data()); c_event = syclSolver::getri(m_queue, norb, Ainv_gpu.data(), norb, ipiv.data(), workspace.data(), getri_ws); @@ -111,8 +111,8 @@ class syclSolverInverter //getrf (LU) -> getri (inverse) try { - c_event = syclSolver::getrf(m_queue, norb, norb, Mat1_gpu.data(), norb, ipiv.data(), workspace.data(), getrf_ws, - {t_event}); + syclSolver::getrf(m_queue, norb, norb, Mat1_gpu.data(), norb, ipiv.data(), workspace.data(), getrf_ws, + {t_event}).wait(); } catch (cl::sycl::exception const& ex) { @@ -121,7 +121,7 @@ class syclSolverInverter abort(); } - log_value = computeLogDet_sycl(m_queue, norb, Mat1_gpu.cols(), Mat1_gpu.data(), ipiv.data(), {c_event}); + log_value = computeLogDet_sycl(m_queue, norb, Mat1_gpu.cols(), Mat1_gpu.data(), ipiv.data()); c_event = syclSolver::getri(m_queue, norb, Mat1_gpu.data(), norb, ipiv.data(), workspace.data(), getri_ws); From 10c90b270ff34ca8867e3ffea1de9cc662fbc0cf Mon Sep 17 00:00:00 2001 From: Mark Dewing Date: Mon, 13 Jun 2022 14:56:42 -0500 Subject: [PATCH 07/36] Add tests for Rotated SPOs using LCAO A python script (using autograd) is used to generate reference values for the values of the wavefunction and derivatives at one point in space. The test system is a helium atom with two orbitals and optionally a Jastrow factor. --- src/QMCWaveFunctions/tests/CMakeLists.txt | 2 +- .../tests/gen_rotated_lcao_wf.py | 191 +++++++++ .../tests/test_RotatedSPOs_LCAO.cpp | 364 ++++++++++++++++++ 3 files changed, 556 insertions(+), 1 deletion(-) create mode 100644 src/QMCWaveFunctions/tests/gen_rotated_lcao_wf.py create mode 100644 src/QMCWaveFunctions/tests/test_RotatedSPOs_LCAO.cpp diff --git a/src/QMCWaveFunctions/tests/CMakeLists.txt b/src/QMCWaveFunctions/tests/CMakeLists.txt index 97a6d20717..c0d82bce66 100644 --- a/src/QMCWaveFunctions/tests/CMakeLists.txt +++ b/src/QMCWaveFunctions/tests/CMakeLists.txt @@ -59,7 +59,7 @@ endforeach() if(NOT QMC_CUDA) if(NOT QMC_COMPLEX) - set(MO_SRCS test_MO.cpp test_multiquintic_spline.cpp test_cartesian_ao.cpp) + set(MO_SRCS test_MO.cpp test_multiquintic_spline.cpp test_cartesian_ao.cpp test_RotatedSPOs_LCAO.cpp) if(NOT QMC_MIXED_PRECISION) set(MO_SRCS ${MO_SRCS} test_soa_cusp_corr.cpp) endif() diff --git a/src/QMCWaveFunctions/tests/gen_rotated_lcao_wf.py b/src/QMCWaveFunctions/tests/gen_rotated_lcao_wf.py new file mode 100644 index 0000000000..1563ad83e7 --- /dev/null +++ b/src/QMCWaveFunctions/tests/gen_rotated_lcao_wf.py @@ -0,0 +1,191 @@ + +# Helium atom with a combination of two orbitals and simple jastrow factor + +# Uses automatic differentiation via the autograd package to +# compute spatial and parameter derivatives +import autograd.numpy as np +from autograd import hessian,grad + +# Values used in test_RotatedSPOs_LCAO.cpp + +def mag(r): + return np.sqrt(r[0]*r[0] + r[1]*r[1] + r[2]*r[2]) + +# normalized STO's correspond to the 'normalized="no"' part of the input +# + + +def sto_norm1(zeta): + return 2*np.sqrt(zeta**3) + +def sto_norm2(zeta): + return 2*np.sqrt(3)*np.sqrt(zeta**5)/3 + + +def orb1(R): + r = mag(R) + Z = 2.0 + y00 = 1/np.sqrt(4 * np.pi) + snorm1 = sto_norm1(Z) + return y00 * snorm1 * np.exp(-Z*r) + +def orb2(R): + r = mag(R) + zeta = 1.0 + y00 = 1/np.sqrt(4*np.pi) + snorm2 = sto_norm2(zeta) + return snorm2* y00 * r* np.exp(-zeta*r) + +def rot_orb(R, theta): + return orb1(R) * np.cos(theta) + orb2(R) * np.sin(theta) + + +def jastrow(r12, B): + A = 0.5 + return np.exp(A*r12/(1.0 + B*r12) - A/B) + +def psi_no_jastrow(r1, r2, VP): + theta1 = VP[0] + theta2 = VP[1] + o1 = rot_orb(r1,theta1) + o2 = rot_orb(r2,theta2) + return o1*o2 + +def psi_with_jastrow(r1, r2, VP): + theta1 = VP[0] + theta2 = VP[1] + B = VP[2] + o1 = rot_orb(r1,theta1) + o2 = rot_orb(r2,theta2) + r12 = r2 - r1 + j = jastrow(r12, B) + return o1*o2*j + +def psi(r1, r2, VP): + global use_jastrow + theta1 = VP[0] + theta2 = VP[1] + j = 1.0 + if use_jastrow: + B = VP[2] + r12 = mag(r2 - r1) + j = jastrow(r12, B) + + o1 = rot_orb(r1,theta1) + o2 = rot_orb(r2,theta2) + return o1*o2*j + +def log_psi(r1, r2, B): + return np.log(psi(r1, r2, B)) + + +# Spatial derivatives +hess0 = hessian(psi, 0) +hess1 = hessian(psi, 1) + + +hess_log_0 = hessian(log_psi, 0) +hess_log_1 = hessian(log_psi, 1) + +grad0 = grad(psi, 0) +grad1 = grad(psi, 1) + +# Derivative wrt parameters +dpsi = grad(psi, 2) + +def lap0(r1, r2, VP): + h0 = np.sum(np.diag(hess_log_0(r1, r2, VP))) + return h0 + +def lap1(r1, r2, VP): + h1 = np.sum(np.diag(hess_log_1(r1, r2, VP))) + return h1 + +def lap(r1, r2, VP): + h0 = np.sum(np.diag(hess0(r1, r2, VP))) + h1 = np.sum(np.diag(hess1(r1, r2, VP))) + return h0 + h1 + +def en_pot(r1, r2): + r1_mag = mag(r1) + r2_mag = mag(r2) + Z = 2.0 + return -Z/r1_mag - Z/r2_mag + +def ee_pot(r1, r2): + r12 = r2 - r1 + r12_mag = mag(r12) + return 1.0/r12_mag + + +def local_energy(r1, r2, VP): + pot = en_pot(r1, r2) + ee_pot(r1, r2) + psi_val = psi(r1, r2, VP) + lapl = lap(r1, r2, VP) + + h = -0.5*lapl/psi_val + pot + return h + +dlocal_energy = grad(local_energy, 2) + + + +def print_wf_values(theta1=0.0, theta2=0.0, use_j=False, B=0.0): + global use_jastrow + + # Adjust numpy output so arrays are printed with higher precision + float_formatter = "{:.15g}".format + np.set_printoptions(formatter={'float_kind':float_formatter}) + + if use_j: + use_jastrow = True + VP = np.array([theta1, theta2, B]) + print("Values for theta = ",theta1,theta2," and jastrow B = ",B) + else: + use_jastrow = False + VP = np.array([theta1, theta2]) + print("Values for theta = ",theta1,theta2," and no jastrow") + + + + r1 = np.array([1.0, 2.0, 3.0]) + r2 = np.array([0.0, 1.1, 2.2]) + + psi_val = psi(r1, r2, VP) + print(" wf = ",psi_val," log wf = ",np.log(np.abs(psi_val))) + + g0 = grad0(r1, r2, VP)/psi_val + print(" grad/psi for particle 0 = ",g0[0],g0[1],g0[2]) + + # Using the laplacian of log psi to match internal QMCPACK values + lap_0 = lap0(r1, r2, VP) + print(" laplacian of log psi for particle 0 = ",lap_0) + + lap_1 = lap1(r1, r2, VP) + print(" laplacian for log psi particle 1 = ",lap_1) + + eloc = local_energy(r1, r2, VP) + print(" local energy = ",eloc) + + dp = dpsi(r1, r2, VP) + print(" parameter derivative of log psi = ",dp / psi_val) + + deloc = dlocal_energy(r1, r2, VP) + print(" parameter derivative of local energy = ",deloc) + + print("") + +def print_point_values(): + r1 = np.array([1.0, 2.0, 3.0]) + r2 = np.array([0.0, 1.1, 2.2]) + + print_wf_values(theta1=0.1, theta2=0.2) + + print_wf_values(theta1=0.0, theta2=0.0) + + print_wf_values(theta1=0.0, theta2=0.0, use_j=True, B=0.1) + + +if __name__=='__main__': + print_point_values() + diff --git a/src/QMCWaveFunctions/tests/test_RotatedSPOs_LCAO.cpp b/src/QMCWaveFunctions/tests/test_RotatedSPOs_LCAO.cpp new file mode 100644 index 0000000000..d68ad67bac --- /dev/null +++ b/src/QMCWaveFunctions/tests/test_RotatedSPOs_LCAO.cpp @@ -0,0 +1,364 @@ +////////////////////////////////////////////////////////////////////////////////////// +// This file is distributed under the University of Illinois/NCSA Open Source License. +// See LICENSE file in top directory for details. +// +// Copyright (c) 2022 QMCPACK developers. +// +// File developed by: Mark Dewing, mdewing@anl.gov Argonne National Laboratory +// +// File created by: Mark Dewing, mdewing@anl.gov Argonne National Laboratory +////////////////////////////////////////////////////////////////////////////////////// + + +#include "catch.hpp" + + +#include "Message/Communicate.h" +#include "OhmmsData/Libxml2Doc.h" +#include "Particle/ParticleSetPool.h" +#include "QMCWaveFunctions/WaveFunctionPool.h" + + +#include +#include +#include + + +// Reference values from gen_rotated_lcao_wf.py + + +namespace qmcplusplus +{ +void setupParticleSetPool(ParticleSetPool& pp) +{ + // See ParticleIO/tests/test_xml_io.cpp for particle parsing + const char* particles = R"( + + + 2 + + + 0.0 0.0 0.0 + + + + + + -1 + + 1.0 2.0 3.0 + + + + -1 + + 0.0 1.1 2.2 + + + +)"; + Libxml2Document doc; + + bool okay = doc.parseFromString(particles); + REQUIRE(okay); + + xmlNodePtr root = doc.getRoot(); + + xmlNodePtr part_ion = xmlFirstElementChild(root); + pp.put(part_ion); + xmlNodePtr part_elec = xmlNextElementSibling(part_ion); + pp.put(part_elec); +} + + +// No Jastrow, rotation angle theta1=0.1 and theta2=0.2 +TEST_CASE("Rotated LCAO WF1", "[qmcapp]") +{ + Communicate* c; + c = OHMMS::Controller; + + ParticleSetPool pp(c); + setupParticleSetPool(pp); + + WaveFunctionPool wp(pp, c); + + REQUIRE(wp.empty() == true); + + + const char* wf_input = R"( + + + + + + + + + + + + + + 0.1 + + 1.0 0.0 + 0.0 1.0 + + + + 0.2 + + 1.0 0.0 + 0.0 1.0 + + + + + + + + + + )"; + + Libxml2Document doc; + bool okay = doc.parseFromString(wf_input); + REQUIRE(okay); + + xmlNodePtr root = doc.getRoot(); + + wp.put(root); + + TrialWaveFunction* psi = wp.getWaveFunction("psi0"); + REQUIRE(psi != nullptr); + REQUIRE(psi->getOrbitals().size() == 1); + + opt_variables_type opt_vars; + psi->checkInVariables(opt_vars); + psi->checkOutVariables(opt_vars); + psi->resetParameters(opt_vars); + + ParticleSet* elec = pp.getParticleSet("e"); + elec->update(); + + + double logval = psi->evaluateLog(*elec); + CHECK(logval == Approx(-9.26625670653773)); + + CHECK(elec->G[0][0] == ValueApprox(-0.2758747113720909)); + CHECK(elec->L[0] == ValueApprox(-0.316459652026054)); + CHECK(elec->L[1] == ValueApprox(-0.6035591598540904)); + + using ValueType = QMCTraits::ValueType; + std::vector dlogpsi(2); + std::vector dhpsioverpsi(2); + psi->evaluateDerivatives(*elec, opt_vars, dlogpsi, dhpsioverpsi); + + + CHECK(dlogpsi[0] == ValueApprox(7.58753078998516)); + CHECK(dlogpsi[1] == ValueApprox(2.58896036829191)); + CHECK(dhpsioverpsi[0] == ValueApprox(2.59551625714144)); + CHECK(dhpsioverpsi[1] == ValueApprox(1.70071425070404)); +} + +// No Jastrow, rotation angle of 0 +TEST_CASE("Rotated LCAO WF zero angle", "[qmcapp]") +{ + Communicate* c; + c = OHMMS::Controller; + + ParticleSetPool pp(c); + setupParticleSetPool(pp); + + WaveFunctionPool wp(pp, c); + + REQUIRE(wp.empty() == true); + + + const char* wf_input = R"( + + + + + + + + + + + + + + + 1.0 0.0 + 0.0 1.0 + + + + + 1.0 0.0 + 0.0 1.0 + + + + + + + + + + )"; + + Libxml2Document doc; + bool okay = doc.parseFromString(wf_input); + REQUIRE(okay); + + xmlNodePtr root = doc.getRoot(); + + wp.put(root); + + TrialWaveFunction* psi = wp.getWaveFunction("psi0"); + REQUIRE(psi != nullptr); + REQUIRE(psi->getOrbitals().size() == 1); + + opt_variables_type opt_vars; + psi->checkInVariables(opt_vars); + opt_vars.resetIndex(); + psi->checkOutVariables(opt_vars); + psi->resetParameters(opt_vars); + + ParticleSet* elec = pp.getParticleSet("e"); + elec->update(); + + + double logval = psi->evaluateLog(*elec); + CHECK(logval == Approx(-11.467952668216984)); + + CHECK(elec->G[0][0] == ValueApprox(-0.5345224838248487)); + CHECK(elec->L[0] == ValueApprox(-1.0690449676496974)); + CHECK(elec->L[1] == ValueApprox(-1.626231256363484)); + + using ValueType = QMCTraits::ValueType; + std::vector dlogpsi(2); + std::vector dhpsioverpsi(2); + psi->evaluateDerivatives(*elec, opt_vars, dlogpsi, dhpsioverpsi); + + CHECK(dlogpsi[0] == ValueApprox(32.2062050179872)); + CHECK(dlogpsi[1] == ValueApprox(5.87482925187464)); + CHECK(dhpsioverpsi[0] == ValueApprox(46.0088643114103)); + CHECK(dhpsioverpsi[1] == ValueApprox(7.84119772047731)); + + RefVectorWithLeader wf_list(*psi, {*psi}); + RefVectorWithLeader p_list(*elec, {*elec}); + + // Test list with one wavefunction + + int nparam = 2; + int nentry = 1; + RecordArray dlogpsi_list(nparam, nentry); + RecordArray dhpsi_over_psi_list(nparam, nentry); + + TrialWaveFunction::mw_evaluateParameterDerivatives(wf_list, p_list, opt_vars, dlogpsi_list, dhpsi_over_psi_list); + + CHECK(dlogpsi_list.getValue(0, 0) == ValueApprox(32.2062050179872)); + CHECK(dlogpsi_list.getValue(1, 0) == ValueApprox(5.87482925187464)); + CHECK(dhpsi_over_psi_list.getValue(0, 0) == ValueApprox(46.0088643114103)); + CHECK(dhpsi_over_psi_list.getValue(1, 0) == ValueApprox(7.84119772047731)); +} + +// Rotation angle of 0 and add Jastrow factory +TEST_CASE("Rotated LCAO WF with jastrow", "[qmcapp]") +{ + Communicate* c; + c = OHMMS::Controller; + + ParticleSetPool pp(c); + setupParticleSetPool(pp); + + WaveFunctionPool wp(pp, c); + + REQUIRE(wp.empty() == true); + + + const char* wf_input = R"( + + + + + + + + + + + + + + + + 1.0 0.0 + 0.0 1.0 + + + + + 1.0 0.0 + 0.0 1.0 + + + + + + + + + + + + 0.1 + + + )"; + + Libxml2Document doc; + bool okay = doc.parseFromString(wf_input); + REQUIRE(okay); + + xmlNodePtr root = doc.getRoot(); + + wp.put(root); + + TrialWaveFunction* psi = wp.getWaveFunction("psi0"); + REQUIRE(psi != nullptr); + REQUIRE(psi->getOrbitals().size() == 2); + + opt_variables_type opt_vars; + psi->checkInVariables(opt_vars); + opt_vars.resetIndex(); + psi->checkOutVariables(opt_vars); + psi->resetParameters(opt_vars); + + ParticleSet* elec = pp.getParticleSet("e"); + elec->update(); + + + double logval = psi->evaluateLog(*elec); + CHECK(logval == Approx(-15.791249652199634)); + + CHECK(elec->G[0][0] == ValueApprox(-0.2956989647881321)); + CHECK(elec->L[0] == ValueApprox(-0.6560429678274734)); + CHECK(elec->L[1] == ValueApprox(-1.2132292565412577)); + + using ValueType = QMCTraits::ValueType; + std::vector dlogpsi(3); + std::vector dhpsioverpsi(3); + psi->evaluateDerivatives(*elec, opt_vars, dlogpsi, dhpsioverpsi); + + CHECK(dlogpsi[0] == ValueApprox(32.206205017987166)); + CHECK(dlogpsi[1] == ValueApprox(5.874829251874641)); + CHECK(dlogpsi[2] == ValueApprox(49.08414605622605)); + CHECK(dhpsioverpsi[0] == ValueApprox(32.462519534916666)); + CHECK(dhpsioverpsi[1] == ValueApprox(10.047601212881027)); + CHECK(dhpsioverpsi[2] == ValueApprox(2.0820644399551895)); +} +} // namespace qmcplusplus From 58f01adf1c6b05eb65e626e305175175ea2543d6 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Fri, 10 Jun 2022 22:01:10 -0500 Subject: [PATCH 08/36] Add data access APIs in OhmmsArray --- src/Containers/OhmmsPETE/OhmmsArray.h | 82 +++++++++++++++++++ src/Containers/OhmmsPETE/tests/test_Array.cpp | 6 ++ 2 files changed, 88 insertions(+) diff --git a/src/Containers/OhmmsPETE/OhmmsArray.h b/src/Containers/OhmmsPETE/OhmmsArray.h index 0fca688529..3054120d20 100644 --- a/src/Containers/OhmmsPETE/OhmmsArray.h +++ b/src/Containers/OhmmsPETE/OhmmsArray.h @@ -82,7 +82,9 @@ class Array inline typename Container_t::const_iterator begin() const { return X.begin(); } inline typename Container_t::const_iterator end() const { return X.end(); } + /// access data pointer inline Type_t* data() { return X.data(); } + /// const access data pointer inline const Type_t* data() const { return X.data(); } template> inline Type_t* device_data() @@ -95,6 +97,74 @@ class Array return X.device_data(); } + template> + Type_t* data(const std::array& dims) + { + size_t offset = dims[0]; + for (int i = 1; i < dims.size(); i++) + offset = offset * Length[i] + dims[i]; + return X.data() + offset; + } + template> + const Type_t* data(const std::array& dims) const + { + size_t offset = dims[0]; + for (int i = 1; i < dims.size(); i++) + offset = offset * Length[i] + dims[i]; + return X.data() + offset; + } + template, + typename Allocator = ALLOC, + typename = qmcplusplus::IsDualSpace> + Type_t* device_data(const std::array& dims) + { + size_t offset = dims[0]; + for (int i = 1; i < dims.size(); i++) + offset = offset * Length[i] + dims[i]; + return X.device_data() + offset; + } + template, + typename Allocator = ALLOC, + typename = qmcplusplus::IsDualSpace> + const Type_t* device_data(const std::array& dims) const + { + size_t offset = dims[0]; + for (int i = 1; i < dims.size(); i++) + offset = offset * Length[i] + dims[i]; + return X.device_data() + offset; + } + + /// access data pointer at (offset_1, ..., offset_D}) + template + Type_t* data(Args... sizes) + { + static_assert(sizeof...(Args) == D, "resize arguments must match dimensionality of Array"); + return data({static_cast(std::forward(sizes))...}); + } + /// const access data pointer at (offset_1, ..., offset_D}) + template + const Type_t* data(Args... sizes) const + { + static_assert(sizeof...(Args) == D, "resize arguments must match dimensionality of Array"); + return data({static_cast(std::forward(sizes))...}); + } + template> + Type_t* device_data(Args... sizes) + { + static_assert(sizeof...(Args) == D, "resize arguments must match dimensionality of Array"); + return device_data({static_cast(std::forward(sizes))...}); + } + /// const access data pointer at (offset_1, ..., offset_D}) + template> + const Type_t* device_data(Args... sizes) const + { + static_assert(sizeof...(Args) == D, "resize arguments must match dimensionality of Array"); + return device_data({static_cast(std::forward(sizes))...}); + } + + inline const Type_t* first_address() const { return &(X[0]); } inline const Type_t* last_address() const { return &(X[0]) + X.size(); } @@ -152,6 +222,18 @@ class Array return s; } + // Abstract Dual Space Transfers + template> + void updateTo() + { + X.updateTo(); + } + template> + void updateFrom() + { + X.updateFrom(); + } + private: std::array Length; Container_t X; diff --git a/src/Containers/OhmmsPETE/tests/test_Array.cpp b/src/Containers/OhmmsPETE/tests/test_Array.cpp index e4bf9cb3b1..36b1c0d2d5 100644 --- a/src/Containers/OhmmsPETE/tests/test_Array.cpp +++ b/src/Containers/OhmmsPETE/tests/test_Array.cpp @@ -73,6 +73,12 @@ TEST_CASE("array NestedContainers", "[OhmmsPETE]") CHECK(vec_copy(0).back() == 123); } +TEST_CASE("Array::data", "[OhmmsPETE]") +{ + Array tensor(2, 4, 5); + CHECK(tensor.data() + 1 * 4 * 5 + 2 * 5 + 3 == tensor.data(1, 2, 3)); +} + TEST_CASE("Array::dimension sizes constructor", "[OhmmsPETE]") { const int dim = 2; From 6685f2f07535159b42cfead01799a2e06c5b0d1b Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Fri, 10 Jun 2022 22:07:55 -0500 Subject: [PATCH 09/36] Renaming variables. --- src/Containers/OhmmsPETE/OhmmsArray.h | 48 +++++++++++++-------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/src/Containers/OhmmsPETE/OhmmsArray.h b/src/Containers/OhmmsPETE/OhmmsArray.h index 3054120d20..f27398d10c 100644 --- a/src/Containers/OhmmsPETE/OhmmsArray.h +++ b/src/Containers/OhmmsPETE/OhmmsArray.h @@ -98,70 +98,70 @@ class Array } template> - Type_t* data(const std::array& dims) + Type_t* data(const std::array& offsets) { - size_t offset = dims[0]; - for (int i = 1; i < dims.size(); i++) - offset = offset * Length[i] + dims[i]; + size_t offset = offsets[0]; + for (int i = 1; i < offsets.size(); i++) + offset = offset * Length[i] + offsets[i]; return X.data() + offset; } template> - const Type_t* data(const std::array& dims) const + const Type_t* data(const std::array& offsets) const { - size_t offset = dims[0]; - for (int i = 1; i < dims.size(); i++) - offset = offset * Length[i] + dims[i]; + size_t offset = offsets[0]; + for (int i = 1; i < offsets.size(); i++) + offset = offset * Length[i] + offsets[i]; return X.data() + offset; } template, typename Allocator = ALLOC, typename = qmcplusplus::IsDualSpace> - Type_t* device_data(const std::array& dims) + Type_t* device_data(const std::array& offsets) { - size_t offset = dims[0]; - for (int i = 1; i < dims.size(); i++) - offset = offset * Length[i] + dims[i]; + size_t offset = offsets[0]; + for (int i = 1; i < offsets.size(); i++) + offset = offset * Length[i] + offsets[i]; return X.device_data() + offset; } template, typename Allocator = ALLOC, typename = qmcplusplus::IsDualSpace> - const Type_t* device_data(const std::array& dims) const + const Type_t* device_data(const std::array& offsets) const { - size_t offset = dims[0]; - for (int i = 1; i < dims.size(); i++) - offset = offset * Length[i] + dims[i]; + size_t offset = offsets[0]; + for (int i = 1; i < offsets.size(); i++) + offset = offset * Length[i] + offsets[i]; return X.device_data() + offset; } /// access data pointer at (offset_1, ..., offset_D}) template - Type_t* data(Args... sizes) + Type_t* data(Args... offsets) { static_assert(sizeof...(Args) == D, "resize arguments must match dimensionality of Array"); - return data({static_cast(std::forward(sizes))...}); + return data({static_cast(std::forward(offsets))...}); } /// const access data pointer at (offset_1, ..., offset_D}) template - const Type_t* data(Args... sizes) const + const Type_t* data(Args... offsets) const { static_assert(sizeof...(Args) == D, "resize arguments must match dimensionality of Array"); - return data({static_cast(std::forward(sizes))...}); + return data({static_cast(std::forward(offsets))...}); } template> - Type_t* device_data(Args... sizes) + Type_t* device_data(Args... offsets) { static_assert(sizeof...(Args) == D, "resize arguments must match dimensionality of Array"); - return device_data({static_cast(std::forward(sizes))...}); + return device_data({static_cast(std::forward(offsets))...}); } /// const access data pointer at (offset_1, ..., offset_D}) template> - const Type_t* device_data(Args... sizes) const + const Type_t* device_data(Args... offsets) const { static_assert(sizeof...(Args) == D, "resize arguments must match dimensionality of Array"); - return device_data({static_cast(std::forward(sizes))...}); + return device_data({static_cast(std::forward(offsets))...}); } From 7f77018c8f1d5b936a4c3d3703aeacb9edfc0ca7 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Mon, 13 Jun 2022 19:43:49 -0500 Subject: [PATCH 10/36] Replace operator() with parameter packing. --- src/Containers/OhmmsPETE/OhmmsArray.h | 43 +++++++++----- src/Containers/OhmmsPETE/tests/test_Array.cpp | 4 +- src/Estimators/TraceManager.h | 13 +---- .../Fermion/BackflowTransformation.cpp | 7 +-- .../HarmonicOscillator/SHOSet.cpp | 56 ++++++++----------- 5 files changed, 58 insertions(+), 65 deletions(-) diff --git a/src/Containers/OhmmsPETE/OhmmsArray.h b/src/Containers/OhmmsPETE/OhmmsArray.h index f27398d10c..8744c1b789 100644 --- a/src/Containers/OhmmsPETE/OhmmsArray.h +++ b/src/Containers/OhmmsPETE/OhmmsArray.h @@ -140,27 +140,27 @@ class Array template Type_t* data(Args... offsets) { - static_assert(sizeof...(Args) == D, "resize arguments must match dimensionality of Array"); + static_assert(sizeof...(Args) == D, "data arguments must match dimensionality of Array"); return data({static_cast(std::forward(offsets))...}); } /// const access data pointer at (offset_1, ..., offset_D}) template const Type_t* data(Args... offsets) const { - static_assert(sizeof...(Args) == D, "resize arguments must match dimensionality of Array"); + static_assert(sizeof...(Args) == D, "data arguments must match dimensionality of Array"); return data({static_cast(std::forward(offsets))...}); } template> Type_t* device_data(Args... offsets) { - static_assert(sizeof...(Args) == D, "resize arguments must match dimensionality of Array"); + static_assert(sizeof...(Args) == D, "device_data arguments must match dimensionality of Array"); return device_data({static_cast(std::forward(offsets))...}); } /// const access data pointer at (offset_1, ..., offset_D}) template> const Type_t* device_data(Args... offsets) const { - static_assert(sizeof...(Args) == D, "resize arguments must match dimensionality of Array"); + static_assert(sizeof...(Args) == D, "device_data arguments must match dimensionality of Array"); return device_data({static_cast(std::forward(offsets))...}); } @@ -198,20 +198,33 @@ class Array } // Get and Set Operations - inline Type_t& operator()(size_t i) { return X[i]; } - - inline Type_t operator()(size_t i) const { return X[i]; } - inline Type_t& operator()(size_t i, size_t j) { return X[j + Length[1] * i]; } - inline Type_t operator()(size_t i, size_t j) const { return X[j + Length[1] * i]; } - inline Type_t& operator()(size_t i, size_t j, size_t k) { return X[k + Length[2] * (j + Length[1] * i)]; } - inline Type_t operator()(size_t i, size_t j, size_t k) const { return X[k + Length[2] * (j + Length[1] * i)]; } - inline Type_t& operator()(size_t i, size_t j, size_t k, size_t l) + template> + Type_t& operator()(const std::array& indices) { - return X[l + Length[3] * (k + Length[2] * (j + Length[1] * i))]; + size_t offset = indices[0]; + for (int i = 1; i < indices.size(); i++) + offset = offset * Length[i] + indices[i]; + return X[offset]; } - inline Type_t operator()(size_t i, size_t j, size_t k, size_t l) const + template> + const Type_t& operator()(const std::array& indices) const + { + size_t offset = indices[0]; + for (int i = 1; i < indices.size(); i++) + offset = offset * Length[i] + indices[i]; + return X[offset]; + } + template + Type_t& operator()(Args... offsets) + { + static_assert(sizeof...(Args) == D, "operator() arguments must match dimensionality of Array"); + return operator()({static_cast(std::forward(offsets))...}); + } + template + const Type_t& operator()(Args... offsets) const { - return X[l + Length[3] * (k + Length[2] * (j + Length[1] * i))]; + static_assert(sizeof...(Args) == D, "operator() arguments must match dimensionality of Array"); + return operator()({static_cast(std::forward(offsets))...}); } inline Type_t sum() const diff --git a/src/Containers/OhmmsPETE/tests/test_Array.cpp b/src/Containers/OhmmsPETE/tests/test_Array.cpp index 36b1c0d2d5..87bd969eb2 100644 --- a/src/Containers/OhmmsPETE/tests/test_Array.cpp +++ b/src/Containers/OhmmsPETE/tests/test_Array.cpp @@ -24,8 +24,8 @@ namespace qmcplusplus TEST_CASE("array", "[OhmmsPETE]") { using Array1D = Array; - Array1D A({3}); - Array1D B({3}); + Array1D A(3); + Array1D B(3); // iterator auto ia = A.begin(); diff --git a/src/Estimators/TraceManager.h b/src/Estimators/TraceManager.h index cfb081a7a8..b676759a3e 100644 --- a/src/Estimators/TraceManager.h +++ b/src/Estimators/TraceManager.h @@ -1193,9 +1193,7 @@ struct TraceBuffer if (has_complex) complex_samples->combine_samples(); //collect data from all samples into the buffer row - int offset = current_row * row_size; { - int boffset; std::vector*>& ordered_samples = samples->ordered_samples; for (int s = 0; s < ordered_samples.size(); s++) { @@ -1203,17 +1201,13 @@ struct TraceBuffer if (tsample.write) { auto& sample = tsample.sample; - boffset = offset + tsample.buffer_start; for (int i = 0; i < sample.size(); ++i) - { - buffer(boffset + i) = sample[i]; - } + buffer(current_row, tsample.buffer_start + i) = sample[i]; } } } if (has_complex) { - int boffset; std::vector>*>& ordered_samples = complex_samples->ordered_samples; for (int s = 0; s < ordered_samples.size(); s++) { @@ -1221,11 +1215,10 @@ struct TraceBuffer if (tsample.write) { auto& sample = tsample.sample; - boffset = offset + tsample.buffer_start; for (int i = 0, ib = 0; i < sample.size(); ++i, ib += 2) { - buffer(boffset + ib) = sample[i].real(); - buffer(boffset + ib + 1) = sample[i].imag(); + buffer(current_row, tsample.buffer_start + ib) = sample[i].real(); + buffer(current_row, tsample.buffer_start + ib + 1) = sample[i].imag(); } } } diff --git a/src/QMCWaveFunctions/Fermion/BackflowTransformation.cpp b/src/QMCWaveFunctions/Fermion/BackflowTransformation.cpp index 62f8e259cf..7417bd09f5 100644 --- a/src/QMCWaveFunctions/Fermion/BackflowTransformation.cpp +++ b/src/QMCWaveFunctions/Fermion/BackflowTransformation.cpp @@ -388,8 +388,7 @@ void BackflowTransformation::evaluateDerivatives(const ParticleSet& P) Bmat_full = 0.0; Cmat = 0.0; Ymat = 0.0; - for (int i = 0; i < Xmat.size(); i++) - Xmat(i) = 0; + std::fill_n(Xmat.data(), Xmat.size(), 0); for (int i = 0; i < NumTargets; i++) { QP.R[i] = P.R[i]; @@ -414,9 +413,7 @@ void BackflowTransformation::testDeriv(const ParticleSet& P) Bmat_full = 0.0; Cmat = 0.0; Ymat = 0.0; - // Xmat=DummyHess; - for (int i = 0; i < Xmat.size(); i++) - Xmat(i) = 0; + std::fill_n(Xmat.data(), Xmat.size(), 0); for (int i = 0; i < NumTargets; i++) { QP.R[i] = P.R[i]; diff --git a/src/QMCWaveFunctions/HarmonicOscillator/SHOSet.cpp b/src/QMCWaveFunctions/HarmonicOscillator/SHOSet.cpp index dbb7f941a1..28b87f6fad 100644 --- a/src/QMCWaveFunctions/HarmonicOscillator/SHOSet.cpp +++ b/src/QMCWaveFunctions/HarmonicOscillator/SHOSet.cpp @@ -137,22 +137,21 @@ void SHOSet::evaluate_vgl(PosType r, ValueVector& psi, GradVector& dpsi, ValueVe void SHOSet::evaluate_hermite(const PosType& xpos) { - int shift = 0; - for (int d = 0; d < DIM; ++d, shift += nmax) + for (int d = 0; d < DIM; ++d) { int nh = qn_max[d]; if (nh > 0) { - RealType x = xpos[d]; - hermite(0 + shift) = 1.0; - RealType Hnm2 = 0.0; - RealType Hnm1 = 1.0; + RealType x = xpos[d]; + hermite(d, 0) = 1.0; + RealType Hnm2 = 0.0; + RealType Hnm1 = 1.0; for (int n = 1; n < nh; ++n) { - RealType Hn = 2 * (x * Hnm1 - (n - 1) * Hnm2); - hermite(n + shift) = Hn; - Hnm2 = Hnm1; - Hnm1 = Hn; + RealType Hn = 2 * (x * Hnm1 - (n - 1) * Hnm2); + hermite(d, n) = Hn; + Hnm2 = Hnm1; + Hnm1 = Hn; } } } @@ -162,24 +161,21 @@ void SHOSet::evaluate_hermite(const PosType& xpos) void SHOSet::evaluate_d0(const PosType& xpos, ValueVector& psi) { using std::exp; - int shift = 0; - for (int d = 0; d < DIM; ++d, shift += nmax) + for (int d = 0; d < DIM; ++d) { RealType x = xpos[d]; RealType g = exp(-.5 * x * x); for (int n = 0; n < qn_max[d]; ++n) { - int ns = n + shift; - bvalues(ns) = prefactors[n] * g * hermite(ns); + bvalues(d, n) = prefactors[n] * g * hermite(d, n); } } for (int s = 0; s < state_info.size(); ++s) { const SHOState& state = state_info[s]; RealType phi = 1.0; - int shift = 0; - for (int d = 0; d < DIM; ++d, shift += nmax) - phi *= bvalues(shift + state.quantum_number[d]); + for (int d = 0; d < DIM; ++d) + phi *= bvalues(d, state.quantum_number[d]); psi[s] = phi; } } @@ -188,26 +184,23 @@ void SHOSet::evaluate_d0(const PosType& xpos, ValueVector& psi) void SHOSet::evaluate_d1(const PosType& xpos, ValueVector& psi, GradVector& dpsi) { RealType ol = 1.0 / length; - int shift = 0; - for (int d = 0; d < DIM; ++d, shift += nmax) + for (int d = 0; d < DIM; ++d) { RealType x = xpos[d]; RealType Hnm1 = 0.0; for (int n = 0; n < qn_max[d]; ++n) { - int ns = n + shift; - RealType Hn = hermite(ns); - bvalues(ns) = (-x + 2 * n * Hnm1 / Hn) * ol; - Hnm1 = Hn; + RealType Hn = hermite(d, n); + bvalues(d, n) = (-x + 2 * n * Hnm1 / Hn) * ol; + Hnm1 = Hn; } } for (int s = 0; s < state_info.size(); ++s) { const SHOState& state = state_info[s]; TinyVector dphi; - int shift = 0; - for (int d = 0; d < DIM; ++d, shift += nmax) - dphi[d] = bvalues(shift + state.quantum_number[d]); + for (int d = 0; d < DIM; ++d) + dphi[d] = bvalues(d, state.quantum_number[d]); dphi *= psi[s]; dpsi[s] = dphi; } @@ -217,24 +210,21 @@ void SHOSet::evaluate_d1(const PosType& xpos, ValueVector& psi, GradVector& dpsi void SHOSet::evaluate_d2(const PosType& xpos, ValueVector& psi, ValueVector& d2psi) { RealType ol2 = 1.0 / (length * length); - int shift = 0; - for (int d = 0; d < DIM; ++d, shift += nmax) + for (int d = 0; d < DIM; ++d) { RealType x = xpos[d]; RealType x2 = x * x; for (int n = 0; n < qn_max[d]; ++n) { - int ns = n + shift; - bvalues(ns) = (-1.0 + x2 - 2 * n) * ol2; + bvalues(d, n) = (-1.0 + x2 - 2 * n) * ol2; } } for (int s = 0; s < state_info.size(); ++s) { const SHOState& state = state_info[s]; ValueType d2phi = 0.0; - int shift = 0; - for (int d = 0; d < DIM; ++d, shift += nmax) - d2phi += bvalues(shift + state.quantum_number[d]); + for (int d = 0; d < DIM; ++d) + d2phi += bvalues(d, state.quantum_number[d]); d2phi *= psi[s]; d2psi[s] = d2phi; } From c2105272523d444c7c698a0e544d3a29303c7a41 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Mon, 13 Jun 2022 19:53:22 -0500 Subject: [PATCH 11/36] Rename data(offset) to data_at(indices) --- src/Containers/OhmmsPETE/OhmmsArray.h | 64 +++++++++---------- src/Containers/OhmmsPETE/tests/test_Array.cpp | 2 +- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/src/Containers/OhmmsPETE/OhmmsArray.h b/src/Containers/OhmmsPETE/OhmmsArray.h index 8744c1b789..7f222af7e8 100644 --- a/src/Containers/OhmmsPETE/OhmmsArray.h +++ b/src/Containers/OhmmsPETE/OhmmsArray.h @@ -98,73 +98,73 @@ class Array } template> - Type_t* data(const std::array& offsets) + Type_t* data_at(const std::array& indices) { - size_t offset = offsets[0]; - for (int i = 1; i < offsets.size(); i++) - offset = offset * Length[i] + offsets[i]; + size_t offset = indices[0]; + for (int i = 1; i < indices.size(); i++) + offset = offset * Length[i] + indices[i]; return X.data() + offset; } template> - const Type_t* data(const std::array& offsets) const + const Type_t* data_at(const std::array& indices) const { - size_t offset = offsets[0]; - for (int i = 1; i < offsets.size(); i++) - offset = offset * Length[i] + offsets[i]; + size_t offset = indices[0]; + for (int i = 1; i < indices.size(); i++) + offset = offset * Length[i] + indices[i]; return X.data() + offset; } template, typename Allocator = ALLOC, typename = qmcplusplus::IsDualSpace> - Type_t* device_data(const std::array& offsets) + Type_t* device_data_at(const std::array& indices) { - size_t offset = offsets[0]; - for (int i = 1; i < offsets.size(); i++) - offset = offset * Length[i] + offsets[i]; + size_t offset = indices[0]; + for (int i = 1; i < indices.size(); i++) + offset = offset * Length[i] + indices[i]; return X.device_data() + offset; } template, typename Allocator = ALLOC, typename = qmcplusplus::IsDualSpace> - const Type_t* device_data(const std::array& offsets) const + const Type_t* device_data_at(const std::array& indices) const { - size_t offset = offsets[0]; - for (int i = 1; i < offsets.size(); i++) - offset = offset * Length[i] + offsets[i]; + size_t offset = indices[0]; + for (int i = 1; i < indices.size(); i++) + offset = offset * Length[i] + indices[i]; return X.device_data() + offset; } - /// access data pointer at (offset_1, ..., offset_D}) + /// access data pointer at {index_1, ..., index_D} template - Type_t* data(Args... offsets) + Type_t* data_at(Args... indices) { static_assert(sizeof...(Args) == D, "data arguments must match dimensionality of Array"); - return data({static_cast(std::forward(offsets))...}); + return data_at({static_cast(std::forward(indices))...}); } - /// const access data pointer at (offset_1, ..., offset_D}) + /// const access data pointer at {index_1, ..., index_D} template - const Type_t* data(Args... offsets) const + const Type_t* data_at(Args... indices) const { static_assert(sizeof...(Args) == D, "data arguments must match dimensionality of Array"); - return data({static_cast(std::forward(offsets))...}); + return data_at({static_cast(std::forward(indices))...}); } + /// access data pointer at {index_1, ..., index_D} template> - Type_t* device_data(Args... offsets) + Type_t* device_data_at(Args... indices) { static_assert(sizeof...(Args) == D, "device_data arguments must match dimensionality of Array"); - return device_data({static_cast(std::forward(offsets))...}); + return device_data_at({static_cast(std::forward(indices))...}); } - /// const access data pointer at (offset_1, ..., offset_D}) + /// const access data pointer at {index_1, ..., index_D} template> - const Type_t* device_data(Args... offsets) const + const Type_t* device_data_at(Args... indices) const { static_assert(sizeof...(Args) == D, "device_data arguments must match dimensionality of Array"); - return device_data({static_cast(std::forward(offsets))...}); + return device_data_at({static_cast(std::forward(indices))...}); } - inline const Type_t* first_address() const { return &(X[0]); } inline const Type_t* last_address() const { return &(X[0]) + X.size(); } @@ -215,16 +215,16 @@ class Array return X[offset]; } template - Type_t& operator()(Args... offsets) + Type_t& operator()(Args... indices) { static_assert(sizeof...(Args) == D, "operator() arguments must match dimensionality of Array"); - return operator()({static_cast(std::forward(offsets))...}); + return operator()({static_cast(std::forward(indices))...}); } template - const Type_t& operator()(Args... offsets) const + const Type_t& operator()(Args... indices) const { static_assert(sizeof...(Args) == D, "operator() arguments must match dimensionality of Array"); - return operator()({static_cast(std::forward(offsets))...}); + return operator()({static_cast(std::forward(indices))...}); } inline Type_t sum() const diff --git a/src/Containers/OhmmsPETE/tests/test_Array.cpp b/src/Containers/OhmmsPETE/tests/test_Array.cpp index 87bd969eb2..ca41aa5fcc 100644 --- a/src/Containers/OhmmsPETE/tests/test_Array.cpp +++ b/src/Containers/OhmmsPETE/tests/test_Array.cpp @@ -76,7 +76,7 @@ TEST_CASE("array NestedContainers", "[OhmmsPETE]") TEST_CASE("Array::data", "[OhmmsPETE]") { Array tensor(2, 4, 5); - CHECK(tensor.data() + 1 * 4 * 5 + 2 * 5 + 3 == tensor.data(1, 2, 3)); + CHECK(tensor.data() + 1 * 4 * 5 + 2 * 5 + 3 == tensor.data_at(1, 2, 3)); } TEST_CASE("Array::dimension sizes constructor", "[OhmmsPETE]") From ba4363bdb63a36ef699480c6ea39588f87ed5b04 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Mon, 13 Jun 2022 20:06:53 -0500 Subject: [PATCH 12/36] Replace redundant code with a function. --- src/Containers/OhmmsPETE/OhmmsArray.h | 47 +++++++++++---------------- 1 file changed, 19 insertions(+), 28 deletions(-) diff --git a/src/Containers/OhmmsPETE/OhmmsArray.h b/src/Containers/OhmmsPETE/OhmmsArray.h index 7f222af7e8..e730e2449a 100644 --- a/src/Containers/OhmmsPETE/OhmmsArray.h +++ b/src/Containers/OhmmsPETE/OhmmsArray.h @@ -100,18 +100,12 @@ class Array template> Type_t* data_at(const std::array& indices) { - size_t offset = indices[0]; - for (int i = 1; i < indices.size(); i++) - offset = offset * Length[i] + indices[i]; - return X.data() + offset; + return X.data() + compute_offset(indices); } template> const Type_t* data_at(const std::array& indices) const { - size_t offset = indices[0]; - for (int i = 1; i < indices.size(); i++) - offset = offset * Length[i] + indices[i]; - return X.data() + offset; + return X.data() + compute_offset(indices); } template, @@ -119,10 +113,7 @@ class Array typename = qmcplusplus::IsDualSpace> Type_t* device_data_at(const std::array& indices) { - size_t offset = indices[0]; - for (int i = 1; i < indices.size(); i++) - offset = offset * Length[i] + indices[i]; - return X.device_data() + offset; + return X.device_data() + compute_offset(indices); } template, @@ -130,10 +121,7 @@ class Array typename = qmcplusplus::IsDualSpace> const Type_t* device_data_at(const std::array& indices) const { - size_t offset = indices[0]; - for (int i = 1; i < indices.size(); i++) - offset = offset * Length[i] + indices[i]; - return X.device_data() + offset; + return X.device_data() + compute_offset(indices); } /// access data pointer at {index_1, ..., index_D} @@ -165,13 +153,13 @@ class Array return device_data_at({static_cast(std::forward(indices))...}); } - inline const Type_t* first_address() const { return &(X[0]); } + inline const Type_t* first_address() const { return X.data(); } - inline const Type_t* last_address() const { return &(X[0]) + X.size(); } + inline const Type_t* last_address() const { return X.data() + X.size(); } - inline Type_t* first_address() { return &(X[0]); } + inline Type_t* first_address() { return X.data(); } - inline Type_t* last_address() { return &(X[0]) + X.size(); } + inline Type_t* last_address() { return X.data() + X.size(); } This_t& operator=(const T& rhs) { @@ -201,18 +189,12 @@ class Array template> Type_t& operator()(const std::array& indices) { - size_t offset = indices[0]; - for (int i = 1; i < indices.size(); i++) - offset = offset * Length[i] + indices[i]; - return X[offset]; + return X[compute_offset(indices)]; } template> const Type_t& operator()(const std::array& indices) const { - size_t offset = indices[0]; - for (int i = 1; i < indices.size(); i++) - offset = offset * Length[i] + indices[i]; - return X[offset]; + return X[compute_offset(indices)]; } template Type_t& operator()(Args... indices) @@ -258,6 +240,15 @@ class Array total *= dims[i]; return total; } + + template> + SIZET compute_offset(const std::array& indices) const + { + SIZET offset = indices[0]; + for (int i = 1; i < indices.size(); i++) + offset = offset * Length[i] + indices[i]; + return offset; + } }; template From 036d6196c8d998466c4cf7ff5515f64f3b0debd2 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Mon, 13 Jun 2022 20:22:45 -0500 Subject: [PATCH 13/36] Grouping doxygen comments. --- src/Containers/OhmmsPETE/OhmmsArray.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/Containers/OhmmsPETE/OhmmsArray.h b/src/Containers/OhmmsPETE/OhmmsArray.h index e730e2449a..02a36f0cff 100644 --- a/src/Containers/OhmmsPETE/OhmmsArray.h +++ b/src/Containers/OhmmsPETE/OhmmsArray.h @@ -82,9 +82,9 @@ class Array inline typename Container_t::const_iterator begin() const { return X.begin(); } inline typename Container_t::const_iterator end() const { return X.end(); } - /// access data pointer + ///@{ + /// access the container data pointer inline Type_t* data() { return X.data(); } - /// const access data pointer inline const Type_t* data() const { return X.data(); } template> inline Type_t* device_data() @@ -96,7 +96,10 @@ class Array { return X.device_data(); } + ///@} + ///@{ + /// access the data pointer at {index_1, ..., index_D} template> Type_t* data_at(const std::array& indices) { @@ -124,34 +127,31 @@ class Array return X.device_data() + compute_offset(indices); } - /// access data pointer at {index_1, ..., index_D} template Type_t* data_at(Args... indices) { static_assert(sizeof...(Args) == D, "data arguments must match dimensionality of Array"); return data_at({static_cast(std::forward(indices))...}); } - /// const access data pointer at {index_1, ..., index_D} template const Type_t* data_at(Args... indices) const { static_assert(sizeof...(Args) == D, "data arguments must match dimensionality of Array"); return data_at({static_cast(std::forward(indices))...}); } - /// access data pointer at {index_1, ..., index_D} template> Type_t* device_data_at(Args... indices) { static_assert(sizeof...(Args) == D, "device_data arguments must match dimensionality of Array"); return device_data_at({static_cast(std::forward(indices))...}); } - /// const access data pointer at {index_1, ..., index_D} template> const Type_t* device_data_at(Args... indices) const { static_assert(sizeof...(Args) == D, "device_data arguments must match dimensionality of Array"); return device_data_at({static_cast(std::forward(indices))...}); } + ///@} inline const Type_t* first_address() const { return X.data(); } @@ -185,7 +185,8 @@ class Array return *this; } - // Get and Set Operations + ///@{ + /// access the element at {index_1, ..., index_D} template> Type_t& operator()(const std::array& indices) { @@ -208,6 +209,7 @@ class Array static_assert(sizeof...(Args) == D, "operator() arguments must match dimensionality of Array"); return operator()({static_cast(std::forward(indices))...}); } + ///@} inline Type_t sum() const { From 908f40ed7ea893a4d431a42efcc3be58f9c01b6e Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Mon, 13 Jun 2022 20:31:42 -0500 Subject: [PATCH 14/36] Expand a bit unit test. --- src/Containers/OhmmsPETE/tests/test_Array.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/Containers/OhmmsPETE/tests/test_Array.cpp b/src/Containers/OhmmsPETE/tests/test_Array.cpp index ca41aa5fcc..5903feeb53 100644 --- a/src/Containers/OhmmsPETE/tests/test_Array.cpp +++ b/src/Containers/OhmmsPETE/tests/test_Array.cpp @@ -75,8 +75,13 @@ TEST_CASE("array NestedContainers", "[OhmmsPETE]") TEST_CASE("Array::data", "[OhmmsPETE]") { - Array tensor(2, 4, 5); + Array tensor(2, 4, 5); + REQUIRE(tensor.size() == 40); + CHECK(tensor.data() + 1 * 4 * 5 + 2 * 5 + 3 == tensor.data_at(1, 2, 3)); + + tensor(1, 2, 3) = 0.5f; + CHECK(*tensor.data_at(1, 2, 3) == 0.5f); } TEST_CASE("Array::dimension sizes constructor", "[OhmmsPETE]") From d362348457f2d56dac7e61d4faced8b7d3a65e8c Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Sat, 19 Mar 2022 14:50:58 -0500 Subject: [PATCH 15/36] Adjust phi_vgl layout. --- .../BsplineFactory/SplineC2ROMPTarget.cpp | 124 ++++++++------- .../Fermion/MatrixDelayedUpdateCUDA.h | 54 +++---- .../Fermion/MatrixUpdateOMPTarget.h | 28 ++-- src/QMCWaveFunctions/SPOSet.cpp | 11 +- .../detail/CUDA/matrix_update_helper.cu | 149 +++++++++--------- .../detail/CUDA/matrix_update_helper.hpp | 36 ++--- 6 files changed, 200 insertions(+), 202 deletions(-) diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.cpp index 60c0c7d5fe..683703bc08 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.cpp +++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.cpp @@ -25,7 +25,6 @@ inline void assign_v(ST x, ST y, ST z, TT* restrict results_scratch_ptr, - size_t orb_size, const ST* restrict offload_scratch_ptr, const ST* restrict myKcart_ptr, size_t myKcart_padded_size, @@ -61,8 +60,8 @@ inline void assign_vgl(ST x, ST y, ST z, TT* restrict results_scratch_ptr, + size_t orb_padded_size, const ST* mKK_ptr, - size_t orb_size, const ST* restrict offload_scratch_ptr, size_t spline_padded_size, const ST symGGt[6], @@ -92,10 +91,6 @@ inline void assign_vgl(ST x, const ST* restrict h12 = offload_scratch_ptr + spline_padded_size * 8; const ST* restrict h22 = offload_scratch_ptr + spline_padded_size * 9; - TT* restrict psi = results_scratch_ptr; - TT* restrict dpsi = results_scratch_ptr + orb_size; - TT* restrict d2psi = results_scratch_ptr + orb_size * 4; - const size_t jr = index << 1; const size_t ji = jr + 1; @@ -131,20 +126,27 @@ inline void assign_vgl(ST x, const ST lap_r = lcart_r + mKK_ptr[index] * val_r + two * (kX * dX_i + kY * dY_i + kZ * dZ_i); const ST lap_i = lcart_i + mKK_ptr[index] * val_i - two * (kX * dX_r + kY * dY_r + kZ * dZ_r); - const size_t psiIndex = first_spo + index + omptarget::min(index, nComplexBands); - psi[psiIndex] = c * val_r - s * val_i; - d2psi[psiIndex] = c * lap_r - s * lap_i; - dpsi[psiIndex * 3] = c * gX_r - s * gX_i; - dpsi[psiIndex * 3 + 1] = c * gY_r - s * gY_i; - dpsi[psiIndex * 3 + 2] = c * gZ_r - s * gZ_i; + TT* restrict psi = results_scratch_ptr; + TT* restrict dpsi_x = results_scratch_ptr + orb_padded_size; + TT* restrict dpsi_y = results_scratch_ptr + orb_padded_size * 2; + TT* restrict dpsi_z = results_scratch_ptr + orb_padded_size * 3; + TT* restrict d2psi = results_scratch_ptr + orb_padded_size * 4; + + const size_t psiIndex = first_spo + index + omptarget::min(index, nComplexBands); + + psi[psiIndex] = c * val_r - s * val_i; + d2psi[psiIndex] = c * lap_r - s * lap_i; + dpsi_x[psiIndex] = c * gX_r - s * gX_i; + dpsi_y[psiIndex] = c * gY_r - s * gY_i; + dpsi_z[psiIndex] = c * gZ_r - s * gZ_i; if (index < nComplexBands) { - psi[psiIndex + 1] = c * val_i + s * val_r; - d2psi[psiIndex + 1] = c * lap_i + s * lap_r; - dpsi[psiIndex * 3 + 3] = c * gX_i + s * gX_r; - dpsi[psiIndex * 3 + 4] = c * gY_i + s * gY_r; - dpsi[psiIndex * 3 + 5] = c * gZ_i + s * gZ_r; + psi[psiIndex + 1] = c * val_i + s * val_r; + d2psi[psiIndex + 1] = c * lap_i + s * lap_r; + dpsi_x[psiIndex + 1] = c * gX_i + s * gX_r; + dpsi_y[psiIndex + 1] = c * gY_i + s * gY_r; + dpsi_z[psiIndex + 1] = c * gZ_i + s * gZ_r; } } } // namespace C2R @@ -277,7 +279,7 @@ void SplineC2ROMPTarget::evaluateValue(const ParticleSet& P, const int iat, const size_t last_cplx = omptarget::min(last / 2, orb_size); PRAGMA_OFFLOAD("omp parallel for") for (int index = first_cplx; index < last_cplx; index++) - C2R::assign_v(x, y, z, psi_ptr, orb_size, offload_scratch_ptr, myKcart_ptr, myKcart_padded_size, + C2R::assign_v(x, y, z, psi_ptr, offload_scratch_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local, nComplexBands_local, index); } } @@ -316,7 +318,7 @@ void SplineC2ROMPTarget::evaluateDetRatios(const VirtualParticleSet& VP, const auto padded_size = myV.size(); offload_scratch.resize(padded_size * nVP); const auto orb_size = psiinv.size(); - results_scratch.resize(orb_size * nVP); + results_scratch.resize(padded_size * nVP); // Ye: need to extract sizes and pointers before entering target region const auto* spline_ptr = SplineInst->getSplinePtr(); @@ -341,7 +343,7 @@ void SplineC2ROMPTarget::evaluateDetRatios(const VirtualParticleSet& VP, const size_t last = omptarget::min(first + ChunkSizePerTeam, padded_size); auto* restrict offload_scratch_iat_ptr = offload_scratch_ptr + padded_size * iat; - auto* restrict psi_iat_ptr = results_scratch_ptr + orb_size * iat; + auto* restrict psi_iat_ptr = results_scratch_ptr + padded_size * iat; auto* restrict pos_scratch = psiinv_ptr + orb_size; int ix, iy, iz; @@ -358,7 +360,7 @@ void SplineC2ROMPTarget::evaluateDetRatios(const VirtualParticleSet& VP, PRAGMA_OFFLOAD("omp parallel for") for (int index = first_cplx; index < last_cplx; index++) C2R::assign_v(ST(pos_scratch[iat * 6]), ST(pos_scratch[iat * 6 + 1]), ST(pos_scratch[iat * 6 + 2]), - psi_iat_ptr, orb_size, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, + psi_iat_ptr, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local, nComplexBands_local, index); const size_t first_real = first_cplx + omptarget::min(nComplexBands_local, first_cplx); @@ -438,7 +440,7 @@ void SplineC2ROMPTarget::mw_evaluateDetRatios(const RefVectorWithLeadergetSplinePtr(); @@ -463,7 +465,7 @@ void SplineC2ROMPTarget::mw_evaluateDetRatios(const RefVectorWithLeader(buffer_H2D_ptr + nw * sizeof(ValueType*) + mw_nVP * 6 * sizeof(TT)); auto* restrict psiinv_ptr = reinterpret_cast(buffer_H2D_ptr)[ref_id_ptr[iat]]; auto* restrict pos_scratch = reinterpret_cast(buffer_H2D_ptr + nw * sizeof(ValueType*)); @@ -482,7 +484,7 @@ void SplineC2ROMPTarget::mw_evaluateDetRatios(const RefVectorWithLeader::evaluateVGL(const ParticleSet& P, offload_scratch.resize(padded_size * 10); const auto orb_size = psi.size(); // for V(1)G(3)L(1) final result - results_scratch.resize(orb_size * 5); + results_scratch.resize(padded_size * 5); // Ye: need to extract sizes and pointers before entering target region const auto* spline_ptr = SplineInst->getSplinePtr(); @@ -666,7 +668,7 @@ void SplineC2ROMPTarget::evaluateVGL(const ParticleSet& P, { ScopedTimer offload(offload_timer_); PRAGMA_OFFLOAD("omp target teams distribute num_teams(NumTeams) \ - map(always, from: results_scratch_ptr[0:orb_size*5])") + map(always, from: results_scratch_ptr[0:padded_size*5])") for (int team_id = 0; team_id < NumTeams; team_id++) { const size_t first = ChunkSizePerTeam * team_id; @@ -691,7 +693,7 @@ void SplineC2ROMPTarget::evaluateVGL(const ParticleSet& P, const size_t last_cplx = omptarget::min(last / 2, orb_size); PRAGMA_OFFLOAD("omp parallel for") for (int index = first_cplx; index < last_cplx; index++) - C2R::assign_vgl(x, y, z, results_scratch_ptr, mKK_ptr, orb_size, offload_scratch_ptr, padded_size, symGGt, G, + C2R::assign_vgl(x, y, z, results_scratch_ptr, padded_size, mKK_ptr, offload_scratch_ptr, padded_size, symGGt, G, myKcart_ptr, myKcart_padded_size, first_spo_local, nComplexBands_local, index); } } @@ -699,10 +701,10 @@ void SplineC2ROMPTarget::evaluateVGL(const ParticleSet& P, for (size_t i = 0; i < orb_size; i++) { psi[i] = results_scratch[i]; - dpsi[i][0] = results_scratch[orb_size + i * 3]; - dpsi[i][1] = results_scratch[orb_size + i * 3 + 1]; - dpsi[i][2] = results_scratch[orb_size + i * 3 + 2]; - d2psi[i] = results_scratch[orb_size * 4 + i]; + dpsi[i][0] = results_scratch[i + padded_size * 1]; + dpsi[i][1] = results_scratch[i + padded_size * 2]; + dpsi[i][2] = results_scratch[i + padded_size * 3]; + d2psi[i] = results_scratch[i + padded_size * 4]; } } @@ -722,7 +724,7 @@ void SplineC2ROMPTarget::evaluateVGLMultiPos(const VectorgetSplinePtr(); @@ -741,7 +743,7 @@ void SplineC2ROMPTarget::evaluateVGLMultiPos(const Vector::evaluateVGLMultiPos(const Vector::evaluateVGLMultiPos(const Vector::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL mw_offload_scratch.resize(padded_size * num_pos * 10); const auto orb_size = phi_vgl_v.size() / num_pos; // for V(1)G(3)L(1) final result - mw_results_scratch.resize(orb_size * num_pos * 5); + mw_results_scratch.resize(padded_size * num_pos * 5); // per team ratio and grads rg_private.resize(num_pos, NumTeams * 4); @@ -916,7 +918,7 @@ void SplineC2ROMPTarget::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL const size_t last = omptarget::min(first + ChunkSizePerTeam, padded_size); auto* restrict offload_scratch_iw_ptr = offload_scratch_ptr + padded_size * iw * 10; - auto* restrict psi_iw_ptr = results_scratch_ptr + orb_size * iw * 5; + auto* restrict psi_iw_ptr = results_scratch_ptr + padded_size * iw * 5; const auto* restrict pos_iw_ptr = reinterpret_cast(buffer_H2D_ptr + buffer_H2D_stride * iw); const auto* restrict invRow_iw_ptr = *reinterpret_cast(buffer_H2D_ptr + buffer_H2D_stride * iw + sizeof(ST) * 6); @@ -943,17 +945,21 @@ void SplineC2ROMPTarget::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL const size_t last_cplx = omptarget::min(last / 2, orb_size); PRAGMA_OFFLOAD("omp parallel for") for (int index = first_cplx; index < last_cplx; index++) - C2R::assign_vgl(pos_iw_ptr[0], pos_iw_ptr[1], pos_iw_ptr[2], psi_iw_ptr, mKK_ptr, orb_size, + C2R::assign_vgl(pos_iw_ptr[0], pos_iw_ptr[1], pos_iw_ptr[2], psi_iw_ptr, padded_size, mKK_ptr, offload_scratch_iw_ptr, padded_size, symGGt, G, myKcart_ptr, myKcart_padded_size, first_spo_local, nComplexBands_local, index); - ValueType* restrict psi = psi_iw_ptr; - ValueType* restrict dpsi = psi_iw_ptr + orb_size; - ValueType* restrict d2psi = psi_iw_ptr + orb_size * 4; + ValueType* restrict psi = psi_iw_ptr; + ValueType* restrict dpsi_x = psi_iw_ptr + padded_size; + ValueType* restrict dpsi_y = psi_iw_ptr + padded_size * 2; + ValueType* restrict dpsi_z = psi_iw_ptr + padded_size * 3; + ValueType* restrict d2psi = psi_iw_ptr + padded_size * 4; - ValueType* restrict out_phi_v = phi_vgl_ptr + iw * orb_size; - ValueType* restrict out_phi_g = phi_vgl_ptr + phi_vgl_stride + iw * orb_size * 3; - ValueType* restrict out_phi_l = phi_vgl_ptr + phi_vgl_stride * 4 + iw * orb_size; + ValueType* restrict out_phi = phi_vgl_ptr + iw * orb_size; + ValueType* restrict out_dphi_x = out_phi + phi_vgl_stride; + ValueType* restrict out_dphi_y = out_dphi_x + phi_vgl_stride; + ValueType* restrict out_dphi_z = out_dphi_y + phi_vgl_stride; + ValueType* restrict out_d2phi = out_dphi_z + phi_vgl_stride; const size_t first_real = first_cplx + omptarget::min(nComplexBands_local, first_cplx); const size_t last_real = last_cplx + omptarget::min(nComplexBands_local, last_cplx); @@ -961,16 +967,16 @@ void SplineC2ROMPTarget::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL PRAGMA_OFFLOAD("omp parallel for reduction(+: ratio, grad_x, grad_y, grad_z)") for (size_t j = first_real; j < last_real; j++) { - out_phi_v[j] = psi[j]; - out_phi_l[j] = d2psi[j]; - out_phi_g[j * 3] = dpsi[j * 3]; - out_phi_g[j * 3 + 1] = dpsi[j * 3 + 1]; - out_phi_g[j * 3 + 2] = dpsi[j * 3 + 2]; + out_phi[j] = psi[j]; + out_dphi_x[j] = dpsi_x[j]; + out_dphi_y[j] = dpsi_y[j]; + out_dphi_z[j] = dpsi_z[j]; + out_d2phi[j] = d2psi[j]; ratio += psi[j] * invRow_iw_ptr[j]; - grad_x += dpsi[j * 3] * invRow_iw_ptr[j]; - grad_y += dpsi[j * 3 + 1] * invRow_iw_ptr[j]; - grad_z += dpsi[j * 3 + 2] * invRow_iw_ptr[j]; + grad_x += dpsi_x[j] * invRow_iw_ptr[j]; + grad_y += dpsi_y[j] * invRow_iw_ptr[j]; + grad_z += dpsi_z[j] * invRow_iw_ptr[j]; } rg_private_ptr[(iw * NumTeams + team_id) * 4] = ratio; diff --git a/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h b/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h index 7775b36441..6d37f5fc54 100644 --- a/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h +++ b/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h @@ -303,11 +303,11 @@ class MatrixDelayedUpdateCUDA const int lda = engine_leader.get_ref_psiMinv().cols(); mw_temp.resize(norb * n_accepted); mw_rcopy.resize(norb * n_accepted); - updateRow_buffer_H2D.resize((sizeof(Value*) * 8 + sizeof(Value)) * n_accepted); + updateRow_buffer_H2D.resize((sizeof(Value*) * 6 + sizeof(Value)) * n_accepted); // to handle T** of Ainv, psi_v, temp, rcopy - Matrix ptr_buffer(reinterpret_cast(updateRow_buffer_H2D.data()), 8, n_accepted); - Value* c_ratio_inv = reinterpret_cast(updateRow_buffer_H2D.data() + sizeof(Value*) * 8 * n_accepted); + Matrix ptr_buffer(reinterpret_cast(updateRow_buffer_H2D.data()), 6, n_accepted); + Value* c_ratio_inv = reinterpret_cast(updateRow_buffer_H2D.data() + sizeof(Value*) * 6 * n_accepted); for (int iw = 0, count = 0; iw < isAccepted.size(); iw++) if (isAccepted[iw]) { @@ -317,8 +317,6 @@ class MatrixDelayedUpdateCUDA ptr_buffer[3][count] = mw_rcopy.device_data() + norb * count; ptr_buffer[4][count] = psiM_g_list[count]; ptr_buffer[5][count] = psiM_l_list[count]; - ptr_buffer[6][count] = const_cast(phi_vgl_v_dev_ptr + phi_vgl_stride + norb * 3 * iw); - ptr_buffer[7][count] = const_cast(phi_vgl_v_dev_ptr + phi_vgl_stride * 4 + norb * iw); c_ratio_inv[count] = Value(-1) / ratios[iw]; count++; @@ -333,7 +331,8 @@ class MatrixDelayedUpdateCUDA { Value** Ainv_mw_ptr = reinterpret_cast(updateRow_buffer_H2D.device_data()); - Value** phiV_mw_ptr = reinterpret_cast(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted); + Value** phiVGL_mw_ptr = + reinterpret_cast(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted); Value** temp_mw_ptr = reinterpret_cast(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 2); Value** rcopy_mw_ptr = @@ -342,21 +341,18 @@ class MatrixDelayedUpdateCUDA reinterpret_cast(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 4); Value** d2psiM_mw_out = reinterpret_cast(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 5); - Value** dpsiM_mw_in = - reinterpret_cast(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 6); - Value** d2psiM_mw_in = - reinterpret_cast(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 7); Value* ratio_inv_mw = - reinterpret_cast(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 8); + reinterpret_cast(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 6); + // invoke the Fahy's variant of Sherman-Morrison update. cudaErrorCheck(cuBLAS_MFs::gemv_batched(hstream, 'T', norb, norb, cone_vec.device_data(), Ainv_mw_ptr, lda, - phiV_mw_ptr, 1, czero_vec.device_data(), temp_mw_ptr, 1, n_accepted), + phiVGL_mw_ptr, 1, czero_vec.device_data(), temp_mw_ptr, 1, n_accepted), "cuBLAS_MFs::gemv_batched failed!"); cudaErrorCheck(CUDA::copyAinvRow_saveGL_cuda(hstream, rowchanged, norb, Ainv_mw_ptr, lda, temp_mw_ptr, - rcopy_mw_ptr, dpsiM_mw_in, d2psiM_mw_in, dpsiM_mw_out, d2psiM_mw_out, - n_accepted), + rcopy_mw_ptr, phiVGL_mw_ptr, phi_vgl_stride, dpsiM_mw_out, + d2psiM_mw_out, n_accepted), "CUDA::copyAinvRow_saveGL_cuda failed!"); @@ -579,11 +575,11 @@ class MatrixDelayedUpdateCUDA const int lda = engine_leader.get_psiMinv().cols(); const int nw = engines.size(); const int n_accepted = psiM_g_list.size(); - accept_rejectRow_buffer_H2D.resize((sizeof(Value*) * 14 + sizeof(Value)) * nw); + accept_rejectRow_buffer_H2D.resize((sizeof(Value*) * 12 + sizeof(Value)) * nw); engine_leader.resize_fill_constant_arrays(nw); - Matrix ptr_buffer(reinterpret_cast(accept_rejectRow_buffer_H2D.data()), 14, nw); - Value* c_ratio_inv = reinterpret_cast(accept_rejectRow_buffer_H2D.data() + sizeof(Value*) * 14 * nw); + Matrix ptr_buffer(reinterpret_cast(accept_rejectRow_buffer_H2D.data()), 12, nw); + Value* c_ratio_inv = reinterpret_cast(accept_rejectRow_buffer_H2D.data() + sizeof(Value*) * 12 * nw); for (int iw = 0, count_accepted = 0, count_rejected = 0; iw < nw; iw++) { This_t& engine = engines[iw]; @@ -599,10 +595,8 @@ class MatrixDelayedUpdateCUDA ptr_buffer[7][count_accepted] = reinterpret_cast(engine.delay_list_gpu.data()); ptr_buffer[8][count_accepted] = engine.V_gpu.data() + norb * delay_count; ptr_buffer[9][count_accepted] = const_cast(phi_vgl_v_dev_ptr + norb * iw); - ptr_buffer[10][count_accepted] = const_cast(phi_vgl_v_dev_ptr + phi_vgl_stride + norb * 3 * iw); - ptr_buffer[11][count_accepted] = const_cast(phi_vgl_v_dev_ptr + phi_vgl_stride * 4 + norb * iw); - ptr_buffer[12][count_accepted] = psiM_g_list[count_accepted]; - ptr_buffer[13][count_accepted] = psiM_l_list[count_accepted]; + ptr_buffer[10][count_accepted] = psiM_g_list[count_accepted]; + ptr_buffer[11][count_accepted] = psiM_l_list[count_accepted]; c_ratio_inv[count_accepted] = Value(1) / ratios[iw]; count_accepted++; } @@ -640,18 +634,14 @@ class MatrixDelayedUpdateCUDA reinterpret_cast(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 7); Value** V_row_mw_ptr = reinterpret_cast(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 8); - Value** phiV_mw_ptr = + Value** phiVGL_mw_ptr = reinterpret_cast(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 9); - Value** dpsiM_mw_in = - reinterpret_cast(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 10); - Value** d2psiM_mw_in = - reinterpret_cast(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 11); Value** dpsiM_mw_out = - reinterpret_cast(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 12); + reinterpret_cast(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 10); Value** d2psiM_mw_out = - reinterpret_cast(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 13); + reinterpret_cast(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 11); Value* ratio_inv_mw_ptr = - reinterpret_cast(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 14); + reinterpret_cast(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 12); //std::copy_n(Ainv[rowchanged], norb, V[delay_count]); cudaErrorCheck(cuBLAS_MFs::copy_batched(hstream, norb, invRow_mw_ptr, 1, V_row_mw_ptr, 1, nw), @@ -660,7 +650,7 @@ class MatrixDelayedUpdateCUDA // the new Binv is [[X Y] [Z sigma]] //BLAS::gemv('T', norb, delay_count + 1, cminusone, V.data(), norb, psiV.data(), 1, czero, p.data(), 1); cudaErrorCheck(cuBLAS_MFs::gemv_batched(hstream, 'T', norb, delay_count, cminusone_vec.device_data(), V_mw_ptr, - norb, phiV_mw_ptr, 1, czero_vec.device_data(), p_mw_ptr, 1, n_accepted), + norb, phiVGL_mw_ptr, 1, czero_vec.device_data(), p_mw_ptr, 1, n_accepted), "cuBLAS_MFs::gemv_batched failed!"); // Y //BLAS::gemv('T', delay_count, delay_count, sigma, Binv.data(), lda_Binv, p.data(), 1, czero, Binv.data() + delay_count, @@ -677,8 +667,8 @@ class MatrixDelayedUpdateCUDA "cuBLAS_MFs::ger_batched failed!"); // sigma and Z cudaErrorCheck(CUDA::add_delay_list_save_sigma_VGL_batched(hstream, delay_list_mw_ptr, rowchanged, delay_count, - Binv_mw_ptr, lda_Binv, ratio_inv_mw_ptr, phiV_mw_ptr, - dpsiM_mw_in, d2psiM_mw_in, U_row_mw_ptr, dpsiM_mw_out, + Binv_mw_ptr, lda_Binv, ratio_inv_mw_ptr, phiVGL_mw_ptr, + phi_vgl_stride, U_row_mw_ptr, dpsiM_mw_out, d2psiM_mw_out, norb, n_accepted, nw), "CUDA::add_delay_list_save_y_VGL_batched failed!"); delay_count++; diff --git a/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h b/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h index 185bd0a530..8a5bea5f3f 100644 --- a/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h +++ b/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h @@ -261,9 +261,9 @@ class MatrixUpdateOMPTarget engine_leader.resize_scratch_arrays(norb, n_accepted); // to handle Value** of Ainv, psi_v, temp, rcopy - buffer_H2D.resize((sizeof(Value*) * 8 + sizeof(Value)) * n_accepted); - Matrix ptr_buffer(reinterpret_cast(buffer_H2D.data()), 8, n_accepted); - Value* c_ratio_inv = reinterpret_cast(buffer_H2D.data() + sizeof(Value*) * 8 * n_accepted); + buffer_H2D.resize((sizeof(Value*) * 6 + sizeof(Value)) * n_accepted); + Matrix ptr_buffer(reinterpret_cast(buffer_H2D.data()), 6, n_accepted); + Value* c_ratio_inv = reinterpret_cast(buffer_H2D.data() + sizeof(Value*) * 6 * n_accepted); for (int iw = 0, count = 0; iw < isAccepted.size(); iw++) if (isAccepted[iw]) { @@ -273,8 +273,6 @@ class MatrixUpdateOMPTarget ptr_buffer[3][count] = mw_rcopy.device_data() + norb * count; ptr_buffer[4][count] = psiM_g_list[count]; ptr_buffer[5][count] = psiM_l_list[count]; - ptr_buffer[6][count] = const_cast(phi_vgl_v_dev_ptr + phi_vgl_stride + norb * 3 * iw); - ptr_buffer[7][count] = const_cast(phi_vgl_v_dev_ptr + phi_vgl_stride * 4 + norb * iw); c_ratio_inv[count] = Value(-1) / ratios[iw]; count++; @@ -294,23 +292,21 @@ class MatrixUpdateOMPTarget use_device_ptr(buffer_H2D_ptr, cone_ptr, czero_ptr)") { Value** Ainv_mw_ptr = reinterpret_cast(buffer_H2D_ptr); - Value** phiV_mw_ptr = reinterpret_cast(buffer_H2D_ptr + sizeof(Value*) * n_accepted); + Value** phiVGL_mw_ptr = reinterpret_cast(buffer_H2D_ptr + sizeof(Value*) * n_accepted); Value** temp_mw_ptr = reinterpret_cast(buffer_H2D_ptr + sizeof(Value*) * n_accepted * 2); Value** rcopy_mw_ptr = reinterpret_cast(buffer_H2D_ptr + sizeof(Value*) * n_accepted * 3); Value** dpsiM_mw_out = reinterpret_cast(buffer_H2D_ptr + sizeof(Value*) * n_accepted * 4); Value** d2psiM_mw_out = reinterpret_cast(buffer_H2D_ptr + sizeof(Value*) * n_accepted * 5); - Value** dpsiM_mw_in = reinterpret_cast(buffer_H2D_ptr + sizeof(Value*) * n_accepted * 6); - Value** d2psiM_mw_in = reinterpret_cast(buffer_H2D_ptr + sizeof(Value*) * n_accepted * 7); - Value* ratio_inv_mw = reinterpret_cast(buffer_H2D_ptr + sizeof(Value*) * n_accepted * 8); + Value* ratio_inv_mw = reinterpret_cast(buffer_H2D_ptr + sizeof(Value*) * n_accepted * 6); // invoke the Fahy's variant of Sherman-Morrison update. - success = ompBLAS::gemv_batched(dummy_handle, 'T', norb, norb, cone_ptr, Ainv_mw_ptr, lda, phiV_mw_ptr, 1, + success = ompBLAS::gemv_batched(dummy_handle, 'T', norb, norb, cone_ptr, Ainv_mw_ptr, lda, phiVGL_mw_ptr, 1, czero_ptr, temp_mw_ptr, 1, n_accepted); if (success != 0) throw std::runtime_error("ompBLAS::gemv_batched failed."); PRAGMA_OFFLOAD("omp target teams distribute num_teams(n_accepted) is_device_ptr(Ainv_mw_ptr, temp_mw_ptr, \ - rcopy_mw_ptr, dpsiM_mw_out, d2psiM_mw_out, dpsiM_mw_in, d2psiM_mw_in)") + rcopy_mw_ptr, dpsiM_mw_out, d2psiM_mw_out, phiVGL_mw_ptr)") for (int iw = 0; iw < n_accepted; iw++) { Value* __restrict__ Ainv_ptr = Ainv_mw_ptr[iw]; @@ -318,8 +314,8 @@ class MatrixUpdateOMPTarget Value* __restrict__ rcopy_ptr = rcopy_mw_ptr[iw]; Value* __restrict__ dpsiM_out = dpsiM_mw_out[iw]; Value* __restrict__ d2psiM_out = d2psiM_mw_out[iw]; - Value* __restrict__ dpsiM_in = dpsiM_mw_in[iw]; - Value* __restrict__ d2psiM_in = d2psiM_mw_in[iw]; + Value* __restrict__ dpsiM_in = phiVGL_mw_ptr[iw] + phi_vgl_stride; + Value* __restrict__ d2psiM_in = phiVGL_mw_ptr[iw] + phi_vgl_stride * 4; temp_ptr[rowchanged] -= cone; PRAGMA_OFFLOAD("omp parallel for simd") @@ -328,9 +324,9 @@ class MatrixUpdateOMPTarget rcopy_ptr[i] = Ainv_ptr[rowchanged * lda + i]; // the following copying data on the device is not part of SM-1 // it is intended to copy dpsiM and d2psiM from temporary to final without a separate kernel. - dpsiM_out[i * 3] = dpsiM_in[i * 3]; - dpsiM_out[i * 3 + 1] = dpsiM_in[i * 3 + 1]; - dpsiM_out[i * 3 + 2] = dpsiM_in[i * 3 + 2]; + dpsiM_out[i * 3] = dpsiM_in[i]; + dpsiM_out[i * 3 + 1] = dpsiM_in[i + phi_vgl_stride]; + dpsiM_out[i * 3 + 2] = dpsiM_in[i + phi_vgl_stride * 2]; d2psiM_out[i] = d2psiM_in[i]; } } diff --git a/src/QMCWaveFunctions/SPOSet.cpp b/src/QMCWaveFunctions/SPOSet.cpp index 88292d3a97..d2919ade80 100644 --- a/src/QMCWaveFunctions/SPOSet.cpp +++ b/src/QMCWaveFunctions/SPOSet.cpp @@ -117,18 +117,27 @@ void SPOSet::mw_evaluateVGLandDetRatioGrads(const RefVectorWithLeader& s std::vector& grads) const { assert(this == &spo_list.getLeader()); + assert(OrbitalSetSize * nw == phi_vgl_v.size()); const size_t nw = spo_list.size(); const size_t norb_requested = phi_vgl_v.size() / nw; + GradVector dphi_v(norb_requested); #pragma omp parallel for for (int iw = 0; iw < nw; iw++) { ValueVector phi_v(phi_vgl_v.data() + norb_requested * iw, norb_requested); - GradVector dphi_v(reinterpret_cast(phi_vgl_v.data(1)) + norb_requested * iw, norb_requested); ValueVector d2phi_v(phi_vgl_v.data(4) + norb_requested * iw, norb_requested); spo_list[iw].evaluateVGL(P_list[iw], iat, phi_v, dphi_v, d2phi_v); ratios[iw] = simd::dot(invRow_ptr_list[iw], phi_v.data(), norb_requested); grads[iw] = simd::dot(invRow_ptr_list[iw], dphi_v.data(), norb_requested) / ratios[iw]; + + // transpose the array of gradients to SoA in phi_vgl_v + for (size_t idim = 0; idim < DIM; idim++) + { + ValueType* phi_g = phi_vgl_v.data(idim + 1) + norb_requested * iw; + for (size_t iorb = 0; iorb < OrbitalSetSize; iorb++) + phi_g[iorb] = dphi_v[iorb][idim]; + } } } diff --git a/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.cu b/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.cu index 238e03e3d4..e152d15439 100644 --- a/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.cu +++ b/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.cu @@ -21,7 +21,7 @@ namespace CUDA { using namespace thrust::cuda_cub::core; } -} +} // namespace qmcplusplus #else #include #include "ROCm/cuda2hip.h" @@ -44,19 +44,18 @@ __global__ void copyAinvRow_saveGL_kernel(const int rowchanged, const int lda, T* const temp[], T* const rcopy[], - const T* const dphi_in[], - const T* const d2phi_in[], + const T* const phi_vgl_in[], + const size_t phi_vgl_stride, T* const dphi_out[], T* const d2phi_out[]) { - const int iw = blockIdx.x; - const T* __restrict__ Ainv_iw = Ainv[iw]; - T* __restrict__ temp_iw = temp[iw]; - T* __restrict__ rcopy_iw = rcopy[iw]; - const T* __restrict__ dphi_in_iw = dphi_in[iw]; - const T* __restrict__ d2phi_in_iw = d2phi_in[iw]; - T* __restrict__ dphi_out_iw = dphi_out[iw]; - T* __restrict__ d2phi_out_iw = d2phi_out[iw]; + const int iw = blockIdx.x; + const T* __restrict__ Ainv_iw = Ainv[iw]; + T* __restrict__ temp_iw = temp[iw]; + T* __restrict__ rcopy_iw = rcopy[iw]; + const T* __restrict__ phi_in_iw = phi_vgl_in[iw]; + T* __restrict__ dphi_out_iw = dphi_out[iw]; + T* __restrict__ d2phi_out_iw = d2phi_out[iw]; const int tid = threadIdx.x; if (tid == 0) @@ -72,10 +71,10 @@ __global__ void copyAinvRow_saveGL_kernel(const int rowchanged, // the following copying data on the device is not part of SM-1 // it is intended to copy dphiV and d2phiV from temporary to final without a separate kernel. - dphi_out_iw[col_id * 3] = dphi_in_iw[col_id * 3]; - dphi_out_iw[col_id * 3 + 1] = dphi_in_iw[col_id * 3 + 1]; - dphi_out_iw[col_id * 3 + 2] = dphi_in_iw[col_id * 3 + 2]; - d2phi_out_iw[col_id] = d2phi_in_iw[col_id]; + dphi_out_iw[col_id * 3] = phi_in_iw[col_id + phi_vgl_stride]; + dphi_out_iw[col_id * 3 + 1] = phi_in_iw[col_id + phi_vgl_stride * 2]; + dphi_out_iw[col_id * 3 + 2] = phi_in_iw[col_id + phi_vgl_stride * 3]; + d2phi_out_iw[col_id] = phi_in_iw[col_id + phi_vgl_stride * 4]; } } } @@ -87,8 +86,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream, const int lda, float* const temp[], float* const rcopy[], - const float* const dphi_in[], - const float* const d2phi_in[], + const float* const phi_vgl_in[], + const size_t phi_vgl_stride, float* const dphi_out[], float* const d2phi_out[], const int batch_count) @@ -100,7 +99,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream, dim3 dimBlock(COLBS); dim3 dimGrid(batch_count); copyAinvRow_saveGL_kernel<<>>(rowchanged, n, Ainv, lda, temp, rcopy, - dphi_in, d2phi_in, dphi_out, d2phi_out); + phi_vgl_in, phi_vgl_stride, dphi_out, + d2phi_out); return cudaPeekAtLastError(); } @@ -111,8 +111,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream, const int lda, double* const temp[], double* const rcopy[], - const double* const dphi_in[], - const double* const d2phi_in[], + const double* const phi_vgl_in[], + const size_t phi_vgl_stride, double* const dphi_out[], double* const d2phi_out[], const int batch_count) @@ -124,7 +124,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream, dim3 dimBlock(COLBS); dim3 dimGrid(batch_count); copyAinvRow_saveGL_kernel<<>>(rowchanged, n, Ainv, lda, temp, rcopy, - dphi_in, d2phi_in, dphi_out, d2phi_out); + phi_vgl_in, phi_vgl_stride, dphi_out, + d2phi_out); return cudaPeekAtLastError(); } @@ -135,8 +136,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream, const int lda, std::complex* const temp[], std::complex* const rcopy[], - const std::complex* const dphi_in[], - const std::complex* const d2phi_in[], + const std::complex* const phi_vgl_in[], + const size_t phi_vgl_stride, std::complex* const dphi_out[], std::complex* const d2phi_out[], const int batch_count) @@ -147,8 +148,10 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream, const int COLBS = 64; dim3 dimBlock(COLBS); dim3 dimGrid(batch_count); - copyAinvRow_saveGL_kernel<<>>(rowchanged, n, (const cuComplex**)Ainv, lda, (cuComplex**)temp, (cuComplex**)rcopy, - (const cuComplex**)dphi_in, (const cuComplex**)d2phi_in, (cuComplex**)dphi_out, (cuComplex**)d2phi_out); + copyAinvRow_saveGL_kernel + <<>>(rowchanged, n, (const cuComplex**)Ainv, lda, (cuComplex**)temp, + (cuComplex**)rcopy, (const cuComplex**)phi_vgl_in, phi_vgl_stride, + (cuComplex**)dphi_out, (cuComplex**)d2phi_out); return cudaPeekAtLastError(); } @@ -159,8 +162,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream, const int lda, std::complex* const temp[], std::complex* const rcopy[], - const std::complex* const dphi_in[], - const std::complex* const d2phi_in[], + const std::complex* const phi_vgl_in[], + const size_t phi_vgl_stride, std::complex* const dphi_out[], std::complex* const d2phi_out[], const int batch_count) @@ -171,8 +174,10 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream, const int COLBS = 64; dim3 dimBlock(COLBS); dim3 dimGrid(batch_count); - copyAinvRow_saveGL_kernel<<>>(rowchanged, n, (const cuDoubleComplex**)Ainv, lda, (cuDoubleComplex**)temp, (cuDoubleComplex**)rcopy, - (const cuDoubleComplex**)dphi_in, (const cuDoubleComplex**)d2phi_in, (cuDoubleComplex**)dphi_out, (cuDoubleComplex**)d2phi_out); + copyAinvRow_saveGL_kernel + <<>>(rowchanged, n, (const cuDoubleComplex**)Ainv, lda, (cuDoubleComplex**)temp, + (cuDoubleComplex**)rcopy, (const cuDoubleComplex**)phi_vgl_in, phi_vgl_stride, + (cuDoubleComplex**)dphi_out, (cuDoubleComplex**)d2phi_out); return cudaPeekAtLastError(); } @@ -261,7 +266,9 @@ cudaError_t calcGradients_cuda(cudaStream_t& hstream, const int COLBS = 64; dim3 dimBlock(COLBS); dim3 dimGrid(batch_count); - calcGradients_kernel, COLBS><<>>(n, (const thrust::complex**)Ainvrow, (const thrust::complex**)dpsiMrow, (thrust::complex*)grads_now); + calcGradients_kernel, COLBS> + <<>>(n, (const thrust::complex**)Ainvrow, + (const thrust::complex**)dpsiMrow, (thrust::complex*)grads_now); return cudaPeekAtLastError(); } @@ -278,7 +285,10 @@ cudaError_t calcGradients_cuda(cudaStream_t& hstream, const int COLBS = 64; dim3 dimBlock(COLBS); dim3 dimGrid(batch_count); - calcGradients_kernel, COLBS><<>>(n, (const thrust::complex**)Ainvrow, (const thrust::complex**)dpsiMrow, (thrust::complex*)grads_now); + calcGradients_kernel, COLBS> + <<>>(n, (const thrust::complex**)Ainvrow, + (const thrust::complex**)dpsiMrow, + (thrust::complex*)grads_now); return cudaPeekAtLastError(); } @@ -289,9 +299,8 @@ __global__ void add_delay_list_save_sigma_VGL_kernel(int* const delay_list[], T* const binv[], const int binv_lda, const T* const ratio_inv, - const T* const phi_in[], - const T* const dphi_in[], - const T* const d2phi_in[], + const T* const phi_vgl_in[], + const size_t phi_vgl_stride, T* const phi_out[], T* const dphi_out[], T* const d2phi_out[], @@ -304,14 +313,12 @@ __global__ void add_delay_list_save_sigma_VGL_kernel(int* const delay_list[], if (iw < n_accepted) { // real accept, settle y and Z - int* __restrict__ delay_list_iw = delay_list[iw]; - T* __restrict__ binvrow_iw = binv[iw] + delay_count * binv_lda; - const T* __restrict__ phi_in_iw = phi_in[iw]; - const T* __restrict__ dphi_in_iw = dphi_in[iw]; - const T* __restrict__ d2phi_in_iw = d2phi_in[iw]; - T* __restrict__ phi_out_iw = phi_out[iw]; - T* __restrict__ dphi_out_iw = dphi_out[iw]; - T* __restrict__ d2phi_out_iw = d2phi_out[iw]; + int* __restrict__ delay_list_iw = delay_list[iw]; + T* __restrict__ binvrow_iw = binv[iw] + delay_count * binv_lda; + const T* __restrict__ phi_in_iw = phi_vgl_in[iw]; + T* __restrict__ phi_out_iw = phi_out[iw]; + T* __restrict__ dphi_out_iw = dphi_out[iw]; + T* __restrict__ d2phi_out_iw = d2phi_out[iw]; if (tid == 0) { @@ -335,10 +342,10 @@ __global__ void add_delay_list_save_sigma_VGL_kernel(int* const delay_list[], { // copy phiV, dphiV and d2phiV from temporary to final without a separate kernel. phi_out_iw[col_id] = phi_in_iw[col_id]; - dphi_out_iw[col_id * 3] = dphi_in_iw[col_id * 3]; - dphi_out_iw[col_id * 3 + 1] = dphi_in_iw[col_id * 3 + 1]; - dphi_out_iw[col_id * 3 + 2] = dphi_in_iw[col_id * 3 + 2]; - d2phi_out_iw[col_id] = d2phi_in_iw[col_id]; + dphi_out_iw[col_id * 3] = phi_in_iw[col_id + phi_vgl_stride]; + dphi_out_iw[col_id * 3 + 1] = phi_in_iw[col_id + phi_vgl_stride * 2]; + dphi_out_iw[col_id * 3 + 2] = phi_in_iw[col_id + phi_vgl_stride * 3]; + d2phi_out_iw[col_id] = phi_in_iw[col_id + phi_vgl_stride * 4]; } } } @@ -379,9 +386,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream, float* const binv[], const int binv_lda, const float* const ratio_inv, - const float* const phi_in[], - const float* const dphi_in[], - const float* const d2phi_in[], + const float* const phi_vgl_in[], + const size_t phi_vgl_stride, float* const phi_out[], float* const dphi_out[], float* const d2phi_out[], @@ -396,8 +402,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream, dim3 dimBlock(COLBS); dim3 dimGrid(batch_count); add_delay_list_save_sigma_VGL_kernel - <<>>(delay_list, rowchanged, delay_count, binv, binv_lda, ratio_inv, phi_in, - dphi_in, d2phi_in, phi_out, dphi_out, d2phi_out, norb, n_accepted); + <<>>(delay_list, rowchanged, delay_count, binv, binv_lda, ratio_inv, phi_vgl_in, + phi_vgl_stride, phi_out, dphi_out, d2phi_out, norb, n_accepted); return cudaPeekAtLastError(); } @@ -408,9 +414,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream, double* const binv[], const int binv_lda, const double* const ratio_inv, - const double* const phi_in[], - const double* const dphi_in[], - const double* const d2phi_in[], + const double* const phi_vgl_in[], + const size_t phi_vgl_stride, double* const phi_out[], double* const dphi_out[], double* const d2phi_out[], @@ -425,8 +430,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream, dim3 dimBlock(COLBS); dim3 dimGrid(batch_count); add_delay_list_save_sigma_VGL_kernel - <<>>(delay_list, rowchanged, delay_count, binv, binv_lda, ratio_inv, phi_in, - dphi_in, d2phi_in, phi_out, dphi_out, d2phi_out, norb, n_accepted); + <<>>(delay_list, rowchanged, delay_count, binv, binv_lda, ratio_inv, phi_vgl_in, + phi_vgl_stride, phi_out, dphi_out, d2phi_out, norb, n_accepted); return cudaPeekAtLastError(); } @@ -437,9 +442,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream, std::complex* const binv[], const int binv_lda, const std::complex* const ratio_inv, - const std::complex* const phi_in[], - const std::complex* const dphi_in[], - const std::complex* const d2phi_in[], + const std::complex* const phi_vgl_in[], + const size_t phi_vgl_stride, std::complex* const phi_out[], std::complex* const dphi_out[], std::complex* const d2phi_out[], @@ -456,11 +460,9 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream, add_delay_list_save_sigma_VGL_kernel, COLBS> <<>>(delay_list, rowchanged, delay_count, (thrust::complex**)binv, binv_lda, (const thrust::complex*)ratio_inv, - (const thrust::complex**)phi_in, - (const thrust::complex**)dphi_in, - (const thrust::complex**)d2phi_in, (thrust::complex**)phi_out, - (thrust::complex**)dphi_out, (thrust::complex**)d2phi_out, norb, - n_accepted); + (const thrust::complex**)phi_vgl_in, phi_vgl_stride, + (thrust::complex**)phi_out, (thrust::complex**)dphi_out, + (thrust::complex**)d2phi_out, norb, n_accepted); return cudaPeekAtLastError(); } @@ -471,9 +473,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream, std::complex* const binv[], const int binv_lda, const std::complex* const ratio_inv, - const std::complex* const phi_in[], - const std::complex* const dphi_in[], - const std::complex* const d2phi_in[], + const std::complex* const phi_vgl_in[], + const size_t phi_vgl_stride, std::complex* const phi_out[], std::complex* const dphi_out[], std::complex* const d2phi_out[], @@ -490,11 +491,9 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream, add_delay_list_save_sigma_VGL_kernel, COLBS> <<>>(delay_list, rowchanged, delay_count, (thrust::complex**)binv, binv_lda, (const thrust::complex*)ratio_inv, - (const thrust::complex**)phi_in, - (const thrust::complex**)dphi_in, - (const thrust::complex**)d2phi_in, (thrust::complex**)phi_out, - (thrust::complex**)dphi_out, (thrust::complex**)d2phi_out, - norb, n_accepted); + (const thrust::complex**)phi_vgl_in, phi_vgl_stride, + (thrust::complex**)phi_out, (thrust::complex**)dphi_out, + (thrust::complex**)d2phi_out, norb, n_accepted); return cudaPeekAtLastError(); } @@ -566,7 +565,8 @@ cudaError_t applyW_batched(cudaStream_t& hstream, const int COLBS = 32; dim3 dimBlock(COLBS); dim3 dimGrid(batch_count); - applyW_kernel<<>>(delay_list, delay_count, (cuComplex**)tempMat, lda); + applyW_kernel + <<>>(delay_list, delay_count, (cuComplex**)tempMat, lda); return cudaPeekAtLastError(); } @@ -583,7 +583,8 @@ cudaError_t applyW_batched(cudaStream_t& hstream, const int COLBS = 32; dim3 dimBlock(COLBS); dim3 dimGrid(batch_count); - applyW_kernel<<>>(delay_list, delay_count, (cuDoubleComplex**)tempMat, lda); + applyW_kernel + <<>>(delay_list, delay_count, (cuDoubleComplex**)tempMat, lda); return cudaPeekAtLastError(); } diff --git a/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.hpp b/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.hpp index 3a7e9a27eb..a6e5433808 100644 --- a/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.hpp +++ b/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.hpp @@ -40,8 +40,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream, const int lda, float* const temp[], float* const rcopy[], - const float* const dphi_in[], - const float* const d2phi_in[], + const float* const phi_vgl_in[], + const size_t phi_vgl_stride, float* const dphi_out[], float* const d2phi_out[], const int batch_count); @@ -53,8 +53,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream, const int lda, double* const temp[], double* const rcopy[], - const double* const dphi_in[], - const double* const d2phi_in[], + const double* const phi_vgl_in[], + const size_t phi_vgl_stride, double* const dphi_out[], double* const d2phi_out[], const int batch_count); @@ -66,8 +66,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream, const int lda, std::complex* const temp[], std::complex* const rcopy[], - const std::complex* const dphi_in[], - const std::complex* const d2phi_in[], + const std::complex* const phi_vgl_in[], + const size_t phi_vgl_stride, std::complex* const dphi_out[], std::complex* const d2phi_out[], const int batch_count); @@ -79,8 +79,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream, const int lda, std::complex* const temp[], std::complex* const rcopy[], - const std::complex* const dphi_in[], - const std::complex* const d2phi_in[], + const std::complex* const phi_vgl_in[], + const size_t phi_vgl_stride, std::complex* const dphi_out[], std::complex* const d2phi_out[], const int batch_count); @@ -121,9 +121,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream, float* const binv[], const int binv_lda, const float* const ratio_inv, - const float* const phi_in[], - const float* const dphi_in[], - const float* const d2phi_in[], + const float* const phi_vgl_in[], + const size_t phi_vgl_stride, float* const phi_out[], float* const dphi_out[], float* const d2phi_out[], @@ -138,9 +137,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream, double* const binv[], const int binv_lda, const double* const ratio_inv, - const double* const phi_in[], - const double* const dphi_in[], - const double* const d2phi_in[], + const double* const phi_vgl_in[], + const size_t phi_vgl_stride, double* const phi_out[], double* const dphi_out[], double* const d2phi_out[], @@ -155,9 +153,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream, std::complex* const binv[], const int binv_lda, const std::complex* const ratio_inv, - const std::complex* const phi_in[], - const std::complex* const dphi_in[], - const std::complex* const d2phi_in[], + const std::complex* const phi_vgl_in[], + const size_t phi_vgl_stride, std::complex* const phi_out[], std::complex* const dphi_out[], std::complex* const d2phi_out[], @@ -172,9 +169,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream, std::complex* const binv[], const int binv_lda, const std::complex* const ratio_inv, - const std::complex* const phi_in[], - const std::complex* const dphi_in[], - const std::complex* const d2phi_in[], + const std::complex* const phi_vgl_in[], + const size_t phi_vgl_stride, std::complex* const phi_out[], std::complex* const dphi_out[], std::complex* const d2phi_out[], From 4c96e70da5e144a2e70dafc01fde63f2889bf497 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Fri, 22 Apr 2022 20:31:41 -0500 Subject: [PATCH 16/36] Align SplineC2COMPTarget layout as SplineC2ROMPTarget --- .../BsplineFactory/SplineC2COMPTarget.cpp | 112 +++++++++--------- 1 file changed, 58 insertions(+), 54 deletions(-) diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.cpp index ceb6a9c756..736b944040 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.cpp +++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.cpp @@ -26,7 +26,6 @@ inline void assign_v(ST x, ST y, ST z, TT* restrict results_scratch_ptr, - size_t orb_size, const ST* restrict offload_scratch_ptr, const ST* restrict myKcart_ptr, size_t myKcart_padded_size, @@ -56,8 +55,8 @@ inline void assign_vgl(ST x, ST y, ST z, TT* restrict results_scratch_ptr, + size_t orb_padded_size, const ST* mKK_ptr, - size_t orb_size, const ST* restrict offload_scratch_ptr, size_t spline_padded_size, const ST symGGt[6], @@ -86,10 +85,6 @@ inline void assign_vgl(ST x, const ST* restrict h12 = offload_scratch_ptr + spline_padded_size * 8; const ST* restrict h22 = offload_scratch_ptr + spline_padded_size * 9; - TT* restrict psi = results_scratch_ptr; - TT* restrict dpsi = results_scratch_ptr + orb_size; - TT* restrict d2psi = results_scratch_ptr + orb_size * 4; - const size_t jr = index << 1; const size_t ji = jr + 1; @@ -125,12 +120,18 @@ inline void assign_vgl(ST x, const ST lap_r = lcart_r + mKK_ptr[index] * val_r + two * (kX * dX_i + kY * dY_i + kZ * dZ_i); const ST lap_i = lcart_i + mKK_ptr[index] * val_i - two * (kX * dX_r + kY * dY_r + kZ * dZ_r); - const size_t psiIndex = first_spo + index; - psi[psiIndex] = TT(c * val_r - s * val_i, c * val_i + s * val_r); - d2psi[psiIndex] = TT(c * lap_r - s * lap_i, c * lap_i + s * lap_r); - dpsi[psiIndex * 3] = TT(c * gX_r - s * gX_i, c * gX_i + s * gX_r); - dpsi[psiIndex * 3 + 1] = TT(c * gY_r - s * gY_i, c * gY_i + s * gY_r); - dpsi[psiIndex * 3 + 2] = TT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r); + TT* restrict psi = results_scratch_ptr; + TT* restrict dpsi_x = results_scratch_ptr + orb_padded_size; + TT* restrict dpsi_y = results_scratch_ptr + orb_padded_size * 2; + TT* restrict dpsi_z = results_scratch_ptr + orb_padded_size * 3; + TT* restrict d2psi = results_scratch_ptr + orb_padded_size * 4; + + const size_t psiIndex = first_spo + index; + psi[psiIndex] = TT(c * val_r - s * val_i, c * val_i + s * val_r); + d2psi[psiIndex] = TT(c * lap_r - s * lap_i, c * lap_i + s * lap_r); + dpsi_x[psiIndex] = TT(c * gX_r - s * gX_i, c * gX_i + s * gX_r); + dpsi_y[psiIndex] = TT(c * gY_r - s * gY_i, c * gY_i + s * gY_r); + dpsi_z[psiIndex] = TT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r); } } // namespace C2C @@ -236,7 +237,7 @@ void SplineC2COMPTarget::evaluateDetRatios(const VirtualParticleSet& VP, const auto padded_size = myV.size(); offload_scratch.resize(padded_size * nVP); const auto orb_size = psiinv.size(); - results_scratch.resize(orb_size * nVP); + results_scratch.resize(padded_size * nVP); // Ye: need to extract sizes and pointers before entering target region const auto* spline_ptr = SplineInst->getSplinePtr(); @@ -260,7 +261,7 @@ void SplineC2COMPTarget::evaluateDetRatios(const VirtualParticleSet& VP, const size_t last = omptarget::min(first + ChunkSizePerTeam, padded_size); auto* restrict offload_scratch_iat_ptr = offload_scratch_ptr + padded_size * iat; - auto* restrict psi_iat_ptr = results_scratch_ptr + orb_size * iat; + auto* restrict psi_iat_ptr = results_scratch_ptr + padded_size * iat; auto* restrict pos_scratch = reinterpret_cast(psiinv_ptr + orb_size); int ix, iy, iz; @@ -277,8 +278,7 @@ void SplineC2COMPTarget::evaluateDetRatios(const VirtualParticleSet& VP, PRAGMA_OFFLOAD("omp parallel for") for (int index = first_cplx; index < last_cplx; index++) C2C::assign_v(ST(pos_scratch[iat * 6]), ST(pos_scratch[iat * 6 + 1]), ST(pos_scratch[iat * 6 + 2]), - psi_iat_ptr, orb_size, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, - first_spo_local, index); + psi_iat_ptr, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local, index); ComplexT sum(0); PRAGMA_OFFLOAD("omp parallel for simd reduction(+:sum)") @@ -355,7 +355,7 @@ void SplineC2COMPTarget::mw_evaluateDetRatios(const RefVectorWithLeadergetSplinePtr(); @@ -379,7 +379,7 @@ void SplineC2COMPTarget::mw_evaluateDetRatios(const RefVectorWithLeader(buffer_H2D_ptr + nw * sizeof(ValueType*) + mw_nVP * 6 * sizeof(ST)); auto* restrict psiinv_ptr = reinterpret_cast(buffer_H2D_ptr)[ref_id_ptr[iat]]; auto* restrict pos_scratch = reinterpret_cast(buffer_H2D_ptr + nw * sizeof(ValueType*)); @@ -397,7 +397,7 @@ void SplineC2COMPTarget::mw_evaluateDetRatios(const RefVectorWithLeader::evaluateVGL(const ParticleSet& P, offload_scratch.resize(padded_size * 10); const auto orb_size = psi.size(); // for V(1)G(3)L(1) final result - results_scratch.resize(orb_size * 5); + results_scratch.resize(padded_size * 5); // Ye: need to extract sizes and pointers before entering target region const auto* spline_ptr = SplineInst->getSplinePtr(); @@ -523,7 +523,7 @@ void SplineC2COMPTarget::evaluateVGL(const ParticleSet& P, { ScopedTimer offload(offload_timer_); PRAGMA_OFFLOAD("omp target teams distribute num_teams(NumTeams) \ - map(always, from: results_scratch_ptr[0:orb_size*5])") + map(always, from: results_scratch_ptr[0:padded_size*5])") for (int team_id = 0; team_id < NumTeams; team_id++) { const size_t first = ChunkSizePerTeam * team_id; @@ -548,7 +548,7 @@ void SplineC2COMPTarget::evaluateVGL(const ParticleSet& P, const size_t last_cplx = omptarget::min(last / 2, orb_size); PRAGMA_OFFLOAD("omp parallel for") for (int index = first_cplx; index < last_cplx; index++) - C2C::assign_vgl(x, y, z, results_scratch_ptr, mKK_ptr, orb_size, offload_scratch_ptr, padded_size, symGGt, G, + C2C::assign_vgl(x, y, z, results_scratch_ptr, padded_size, mKK_ptr, offload_scratch_ptr, padded_size, symGGt, G, myKcart_ptr, myKcart_padded_size, first_spo_local, index); } } @@ -556,10 +556,10 @@ void SplineC2COMPTarget::evaluateVGL(const ParticleSet& P, for (size_t i = 0; i < orb_size; i++) { psi[i] = results_scratch[i]; - dpsi[i][0] = results_scratch[orb_size + i * 3]; - dpsi[i][1] = results_scratch[orb_size + i * 3 + 1]; - dpsi[i][2] = results_scratch[orb_size + i * 3 + 2]; - d2psi[i] = results_scratch[orb_size * 4 + i]; + dpsi[i][0] = results_scratch[i + padded_size]; + dpsi[i][1] = results_scratch[i + padded_size * 2]; + dpsi[i][2] = results_scratch[i + padded_size * 3]; + d2psi[i] = results_scratch[i + padded_size * 4]; } } @@ -579,7 +579,7 @@ void SplineC2COMPTarget::evaluateVGLMultiPos(const VectorgetSplinePtr(); @@ -597,7 +597,7 @@ void SplineC2COMPTarget::evaluateVGLMultiPos(const Vector::evaluateVGLMultiPos(const Vector::evaluateVGLMultiPos(const Vector::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL mw_offload_scratch.resize(padded_size * num_pos * 10); const auto orb_size = phi_vgl_v.size() / num_pos; // for V(1)G(3)L(1) final result - mw_results_scratch.resize(orb_size * num_pos * 5); + mw_results_scratch.resize(padded_size * num_pos * 5); // per team ratio and grads rg_private.resize(num_pos, NumTeams * 4); @@ -771,7 +771,7 @@ void SplineC2COMPTarget::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL const size_t last = omptarget::min(first + ChunkSizePerTeam, padded_size); auto* restrict offload_scratch_iw_ptr = offload_scratch_ptr + padded_size * iw * 10; - auto* restrict psi_iw_ptr = results_scratch_ptr + orb_size * iw * 5; + auto* restrict psi_iw_ptr = results_scratch_ptr + padded_size * iw * 5; const auto* restrict pos_iw_ptr = reinterpret_cast(buffer_H2D_ptr + buffer_H2D_stride * iw); const auto* restrict invRow_iw_ptr = *reinterpret_cast(buffer_H2D_ptr + buffer_H2D_stride * iw + sizeof(ST) * 6); @@ -798,17 +798,21 @@ void SplineC2COMPTarget::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL const size_t last_cplx = omptarget::min(last / 2, orb_size); PRAGMA_OFFLOAD("omp parallel for") for (int index = first_cplx; index < last_cplx; index++) - C2C::assign_vgl(pos_iw_ptr[0], pos_iw_ptr[1], pos_iw_ptr[2], psi_iw_ptr, mKK_ptr, orb_size, + C2C::assign_vgl(pos_iw_ptr[0], pos_iw_ptr[1], pos_iw_ptr[2], psi_iw_ptr, padded_size, mKK_ptr, offload_scratch_iw_ptr, padded_size, symGGt, G, myKcart_ptr, myKcart_padded_size, first_spo_local, index); - ValueType* restrict psi = psi_iw_ptr; - ValueType* restrict dpsi = psi_iw_ptr + orb_size; - ValueType* restrict d2psi = psi_iw_ptr + orb_size * 4; + ValueType* restrict psi = psi_iw_ptr; + ValueType* restrict dpsi_x = psi_iw_ptr + padded_size; + ValueType* restrict dpsi_y = psi_iw_ptr + padded_size * 2; + ValueType* restrict dpsi_z = psi_iw_ptr + padded_size * 3; + ValueType* restrict d2psi = psi_iw_ptr + padded_size * 4; - ValueType* restrict out_phi_v = phi_vgl_ptr + iw * orb_size; - ValueType* restrict out_phi_g = phi_vgl_ptr + phi_vgl_stride + iw * orb_size * 3; - ValueType* restrict out_phi_l = phi_vgl_ptr + phi_vgl_stride * 4 + iw * orb_size; + ValueType* restrict out_phi = phi_vgl_ptr + iw * orb_size; + ValueType* restrict out_dphi_x = out_phi + phi_vgl_stride; + ValueType* restrict out_dphi_y = out_dphi_x + phi_vgl_stride; + ValueType* restrict out_dphi_z = out_dphi_y + phi_vgl_stride; + ValueType* restrict out_d2phi = out_dphi_z + phi_vgl_stride; ValueType ratio(0), grad_x(0), grad_y(0), grad_z(0); PRAGMA_OFFLOAD("omp parallel for reduction(+: ratio, grad_x, grad_y, grad_z)") @@ -816,16 +820,16 @@ void SplineC2COMPTarget::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL { const size_t psiIndex = first_spo_local + j; - out_phi_v[psiIndex] = psi[psiIndex]; - out_phi_l[psiIndex] = d2psi[psiIndex]; - out_phi_g[psiIndex * 3] = dpsi[psiIndex * 3]; - out_phi_g[psiIndex * 3 + 1] = dpsi[psiIndex * 3 + 1]; - out_phi_g[psiIndex * 3 + 2] = dpsi[psiIndex * 3 + 2]; + out_phi[psiIndex] = psi[psiIndex]; + out_dphi_x[psiIndex] = dpsi_x[psiIndex]; + out_dphi_y[psiIndex] = dpsi_y[psiIndex]; + out_dphi_z[psiIndex] = dpsi_z[psiIndex]; + out_d2phi[psiIndex] = d2psi[psiIndex]; ratio += psi[psiIndex] * invRow_iw_ptr[psiIndex]; - grad_x += dpsi[psiIndex * 3] * invRow_iw_ptr[psiIndex]; - grad_y += dpsi[psiIndex * 3 + 1] * invRow_iw_ptr[psiIndex]; - grad_z += dpsi[psiIndex * 3 + 2] * invRow_iw_ptr[psiIndex]; + grad_x += dpsi_x[psiIndex] * invRow_iw_ptr[psiIndex]; + grad_y += dpsi_y[psiIndex] * invRow_iw_ptr[psiIndex]; + grad_z += dpsi_z[psiIndex] * invRow_iw_ptr[psiIndex]; } rg_private_ptr[(iw * NumTeams + team_id) * 4] = ratio; From 8ce4c90d893bf3aa0032f9ffd9fed3795f5713de Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Fri, 10 Jun 2022 21:54:49 -0500 Subject: [PATCH 17/36] Adopt OhmmsArray in SPOSet. --- src/Configuration.h | 3 +- .../BsplineFactory/SplineC2COMPTarget.cpp | 8 ++-- .../BsplineFactory/SplineC2COMPTarget.h | 2 +- .../BsplineFactory/SplineC2ROMPTarget.cpp | 20 +++++----- .../BsplineFactory/SplineC2ROMPTarget.h | 2 +- .../Fermion/DiracDeterminantBatched.cpp | 21 +++------- .../Fermion/DiracDeterminantBatched.h | 6 ++- .../Fermion/MatrixDelayedUpdateCUDA.h | 40 +++++++++---------- .../Fermion/MatrixUpdateOMPTarget.h | 34 ++++++++-------- src/QMCWaveFunctions/SPOSet.cpp | 17 ++++---- src/QMCWaveFunctions/SPOSet.h | 9 +++-- 11 files changed, 80 insertions(+), 82 deletions(-) diff --git a/src/Configuration.h b/src/Configuration.h index 42ba461299..44230d95a2 100644 --- a/src/Configuration.h +++ b/src/Configuration.h @@ -50,7 +50,8 @@ struct QMCTraits { enum { - DIM = OHMMS_DIM + DIM = OHMMS_DIM, + DIM_VGL = OHMMS_DIM + 2 }; using QTBase = QMCTypes; using QTFull = QMCTypes; diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.cpp index 736b944040..3844d07b37 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.cpp +++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.cpp @@ -699,7 +699,7 @@ void SplineC2COMPTarget::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL const RefVectorWithLeader& P_list, int iat, const std::vector& invRow_ptr_list, - VGLVector& phi_vgl_v, + OffloadMWVGLArray& phi_vgl_v, std::vector& ratios, std::vector& grads) const { @@ -732,12 +732,12 @@ void SplineC2COMPTarget::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL } const size_t num_pos = nwalkers; + const auto orb_size = phi_vgl_v.size(2); + const auto padded_size = myV.size(); const size_t ChunkSizePerTeam = 512; const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam; - const auto padded_size = myV.size(); // for V(1)G(3)H(6) intermediate result mw_offload_scratch.resize(padded_size * num_pos * 10); - const auto orb_size = phi_vgl_v.size() / num_pos; // for V(1)G(3)L(1) final result mw_results_scratch.resize(padded_size * num_pos * 5); // per team ratio and grads @@ -757,7 +757,7 @@ void SplineC2COMPTarget::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL auto* rg_private_ptr = rg_private.data(); const size_t buffer_H2D_stride = buffer_H2D.cols(); const size_t first_spo_local = first_spo; - const size_t phi_vgl_stride = phi_vgl_v.capacity(); + const size_t phi_vgl_stride = num_pos * orb_size; { ScopedTimer offload(offload_timer_); diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.h b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.h index 2a006b54d9..25e5d1c8d8 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.h +++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.h @@ -267,7 +267,7 @@ class SplineC2COMPTarget : public BsplineSet const RefVectorWithLeader& P_list, int iat, const std::vector& invRow_ptr_list, - VGLVector& phi_vgl_v, + OffloadMWVGLArray& phi_vgl_v, std::vector& ratios, std::vector& grads) const override; diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.cpp index 683703bc08..387d24a0ee 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.cpp +++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.cpp @@ -279,8 +279,8 @@ void SplineC2ROMPTarget::evaluateValue(const ParticleSet& P, const int iat, const size_t last_cplx = omptarget::min(last / 2, orb_size); PRAGMA_OFFLOAD("omp parallel for") for (int index = first_cplx; index < last_cplx; index++) - C2R::assign_v(x, y, z, psi_ptr, offload_scratch_ptr, myKcart_ptr, myKcart_padded_size, - first_spo_local, nComplexBands_local, index); + C2R::assign_v(x, y, z, psi_ptr, offload_scratch_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local, + nComplexBands_local, index); } } } @@ -360,8 +360,8 @@ void SplineC2ROMPTarget::evaluateDetRatios(const VirtualParticleSet& VP, PRAGMA_OFFLOAD("omp parallel for") for (int index = first_cplx; index < last_cplx; index++) C2R::assign_v(ST(pos_scratch[iat * 6]), ST(pos_scratch[iat * 6 + 1]), ST(pos_scratch[iat * 6 + 2]), - psi_iat_ptr, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, - first_spo_local, nComplexBands_local, index); + psi_iat_ptr, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local, + nComplexBands_local, index); const size_t first_real = first_cplx + omptarget::min(nComplexBands_local, first_cplx); const size_t last_real = last_cplx + omptarget::min(nComplexBands_local, last_cplx); @@ -484,8 +484,8 @@ void SplineC2ROMPTarget::mw_evaluateDetRatios(const RefVectorWithLeader::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL const RefVectorWithLeader& P_list, int iat, const std::vector& invRow_ptr_list, - VGLVector& phi_vgl_v, + OffloadMWVGLArray& phi_vgl_v, std::vector& ratios, std::vector& grads) const { @@ -878,12 +878,12 @@ void SplineC2ROMPTarget::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL } const size_t num_pos = nwalkers; + const auto orb_size = phi_vgl_v.size(2); + const auto padded_size = myV.size(); const size_t ChunkSizePerTeam = 512; const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam; - const auto padded_size = myV.size(); // for V(1)G(3)H(6) intermediate result mw_offload_scratch.resize(padded_size * num_pos * 10); - const auto orb_size = phi_vgl_v.size() / num_pos; // for V(1)G(3)L(1) final result mw_results_scratch.resize(padded_size * num_pos * 5); // per team ratio and grads @@ -903,7 +903,7 @@ void SplineC2ROMPTarget::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL auto* rg_private_ptr = rg_private.data(); const size_t buffer_H2D_stride = buffer_H2D.cols(); const size_t first_spo_local = first_spo; - const size_t phi_vgl_stride = phi_vgl_v.capacity(); + const size_t phi_vgl_stride = num_pos * orb_size; const size_t nComplexBands_local = nComplexBands; { diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.h b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.h index c1f311ee78..b183e6256c 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.h +++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.h @@ -276,7 +276,7 @@ class SplineC2ROMPTarget : public BsplineSet const RefVectorWithLeader& P_list, int iat, const std::vector& invRow_ptr_list, - VGLVector& phi_vgl_v, + OffloadMWVGLArray& phi_vgl_v, std::vector& ratios, std::vector& grads) const override; diff --git a/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.cpp b/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.cpp index b70f3431d0..fd877e858e 100644 --- a/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.cpp +++ b/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.cpp @@ -232,13 +232,11 @@ void DiracDeterminantBatched::mw_ratioGrad(const RefVectorWithLeader auto psiMinv_row_dev_ptr_list = DET_ENGINE::mw_getInvRow(engine_list, WorkingIndex, !Phi->isOMPoffload()); - phi_vgl_v.resize(NumOrbitals * wfc_list.size()); + phi_vgl_v.resize(DIM_VGL, wfc_list.size(), NumOrbitals); ratios_local.resize(wfc_list.size()); grad_new_local.resize(wfc_list.size()); - VectorSoaContainer phi_vgl_v_view(phi_vgl_v.data(), NumOrbitals * wfc_list.size(), - phi_vgl_v.capacity()); - wfc_leader.Phi->mw_evaluateVGLandDetRatioGrads(phi_list, p_list, iat, psiMinv_row_dev_ptr_list, phi_vgl_v_view, + wfc_leader.Phi->mw_evaluateVGLandDetRatioGrads(phi_list, p_list, iat, psiMinv_row_dev_ptr_list, phi_vgl_v, ratios_local, grad_new_local); } @@ -317,15 +315,8 @@ void DiracDeterminantBatched::mw_accept_rejectMove( det.curRatio = 1.0; } - if (!Phi->isOMPoffload() && n_accepted > 0) - { - auto* phi_vgl_v_ptr = phi_vgl_v.data(); - // transfer host to device, total size 5, v(1) + g(3) + l(1) - PRAGMA_OFFLOAD("omp target update to(phi_vgl_v_ptr[:phi_vgl_v.capacity()*5])") - } - DET_ENGINE::mw_accept_rejectRow(engine_list, WorkingIndex, psiM_g_dev_ptr_list, psiM_l_dev_ptr_list, isAccepted, - phi_vgl_v.device_data(), phi_vgl_v.capacity(), ratios_local); + phi_vgl_v, ratios_local); if (!safe_to_delay) DET_ENGINE::mw_updateInvMat(engine_list); @@ -595,15 +586,13 @@ void DiracDeterminantBatched::mw_calcRatio(const RefVectorWithLeader auto psiMinv_row_dev_ptr_list = DET_ENGINE::mw_getInvRow(engine_list, WorkingIndex, !Phi->isOMPoffload()); - phi_vgl_v.resize(NumOrbitals * wfc_list.size()); + phi_vgl_v.resize(DIM_VGL, wfc_list.size(), NumOrbitals); ratios_local.resize(wfc_list.size()); grad_new_local.resize(wfc_list.size()); - VectorSoaContainer phi_vgl_v_view(phi_vgl_v.data(), NumOrbitals * wfc_list.size(), - phi_vgl_v.capacity()); // calling Phi->mw_evaluateVGLandDetRatioGrads is a temporary workaround. // We may implement mw_evaluateVandDetRatio in the future. - wfc_leader.Phi->mw_evaluateVGLandDetRatioGrads(phi_list, p_list, iat, psiMinv_row_dev_ptr_list, phi_vgl_v_view, + wfc_leader.Phi->mw_evaluateVGLandDetRatioGrads(phi_list, p_list, iat, psiMinv_row_dev_ptr_list, phi_vgl_v, ratios_local, grad_new_local); } diff --git a/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.h b/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.h index 176a00db44..04c5de28f6 100644 --- a/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.h +++ b/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.h @@ -55,6 +55,8 @@ class DiracDeterminantBatched : public DiracDeterminantBase using DualMatrix = Matrix>; using DualVGLVector = VectorSoaContainer>; + using OffloadMWVGLArray = typename SPOSet::OffloadMWVGLArray; + struct DiracDeterminantBatchedMultiWalkerResource : public Resource { DiracDeterminantBatchedMultiWalkerResource() : Resource("DiracDeterminantBatched") {} @@ -64,8 +66,8 @@ class DiracDeterminantBatched : public DiracDeterminantBase Resource* makeClone() const override { return new DiracDeterminantBatchedMultiWalkerResource(*this); } DualVector log_values; - /// value, grads, laplacian of single-particle orbital for particle-by-particle update and multi walker [5][nw*norb] - DualVGLVector phi_vgl_v; + /// value, grads, laplacian of single-particle orbital for particle-by-particle update and multi walker [5][nw][norb] + OffloadMWVGLArray phi_vgl_v; // multi walker of ratio std::vector ratios_local; // multi walker of grads diff --git a/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h b/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h index 6d37f5fc54..e8837edebd 100644 --- a/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h +++ b/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h @@ -60,6 +60,8 @@ class MatrixDelayedUpdateCUDA using DualMatrix = Matrix>; template using DualVGLVector = VectorSoaContainer>; + template + using OffloadMWVGLArray = Array>; // [VGL, walker, Orbs] struct MatrixDelayedUpdateCUDAMultiWalkerMem : public Resource { @@ -269,8 +271,7 @@ class MatrixDelayedUpdateCUDA * \param[in] psiM_g_list device ptrs * \param[in] psiM_l_list device ptrs * \param[in] isAccepted bool but wait some lists are also filtered - * \param[in] phi_vgl_v_dev_ptr device ptr - * \param[in] phi_vgl_stride size of each "vector" in phi_vgl_v + * \param[in] phi_vgl_v multiple walker orbital VGL * \param[inout] ratios */ static void mw_updateRow(const RefVectorWithLeader& engines, @@ -278,8 +279,7 @@ class MatrixDelayedUpdateCUDA const std::vector& psiM_g_list, const std::vector& psiM_l_list, const std::vector& isAccepted, - const Value* phi_vgl_v_dev_ptr, - const size_t phi_vgl_stride, + const OffloadMWVGLArray& phi_vgl_v, const std::vector& ratios) { auto& engine_leader = engines.getLeader(); @@ -293,14 +293,16 @@ class MatrixDelayedUpdateCUDA if (n_accepted == 0) return; - auto& hstream = engine_leader.cuda_handles_->hstream; - auto& updateRow_buffer_H2D = engine_leader.mw_mem_->updateRow_buffer_H2D; - auto& mw_temp = engine_leader.mw_mem_->mw_temp; - auto& mw_rcopy = engine_leader.mw_mem_->mw_rcopy; - auto& cone_vec = engine_leader.mw_mem_->cone_vec; - auto& czero_vec = engine_leader.mw_mem_->czero_vec; - const int norb = engine_leader.get_ref_psiMinv().rows(); - const int lda = engine_leader.get_ref_psiMinv().cols(); + auto& hstream = engine_leader.cuda_handles_->hstream; + auto& updateRow_buffer_H2D = engine_leader.mw_mem_->updateRow_buffer_H2D; + auto& mw_temp = engine_leader.mw_mem_->mw_temp; + auto& mw_rcopy = engine_leader.mw_mem_->mw_rcopy; + auto& cone_vec = engine_leader.mw_mem_->cone_vec; + auto& czero_vec = engine_leader.mw_mem_->czero_vec; + const int norb = engine_leader.get_ref_psiMinv().rows(); + const int lda = engine_leader.get_ref_psiMinv().cols(); + const int nw = engines.size(); + const size_t phi_vgl_stride = nw * norb; mw_temp.resize(norb * n_accepted); mw_rcopy.resize(norb * n_accepted); updateRow_buffer_H2D.resize((sizeof(Value*) * 6 + sizeof(Value)) * n_accepted); @@ -312,7 +314,7 @@ class MatrixDelayedUpdateCUDA if (isAccepted[iw]) { ptr_buffer[0][count] = engines[iw].get_ref_psiMinv().device_data(); - ptr_buffer[1][count] = const_cast(phi_vgl_v_dev_ptr + norb * iw); + ptr_buffer[1][count] = const_cast(phi_vgl_v.device_data_at(0, iw, 0)); ptr_buffer[2][count] = mw_temp.device_data() + norb * count; ptr_buffer[3][count] = mw_rcopy.device_data() + norb * count; ptr_buffer[4][count] = psiM_g_list[count]; @@ -540,8 +542,7 @@ class MatrixDelayedUpdateCUDA * \param[in] psiM_g_list * \param[in] psiM_l_list * \param[in] isAccepted - * \param[in] phi_vgl_v_dev_ptr - * \param[in] phi_vgl_stride size of each "vector" in phi_vgl_v + * \param[in] phi_vgl_v multiple walker orbital VGL * \param[inout] ratios */ static void mw_accept_rejectRow(const RefVectorWithLeader& engines, @@ -549,8 +550,7 @@ class MatrixDelayedUpdateCUDA const std::vector& psiM_g_list, const std::vector& psiM_l_list, const std::vector& isAccepted, - const Value* phi_vgl_v_dev_ptr, - const size_t phi_vgl_stride, + const OffloadMWVGLArray& phi_vgl_v, const std::vector& ratios) { auto& engine_leader = engines.getLeader(); @@ -559,8 +559,7 @@ class MatrixDelayedUpdateCUDA if (engine_leader.isSM1()) { - mw_updateRow(engines, rowchanged, psiM_g_list, psiM_l_list, isAccepted, phi_vgl_v_dev_ptr, phi_vgl_stride, - ratios); + mw_updateRow(engines, rowchanged, psiM_g_list, psiM_l_list, isAccepted, phi_vgl_v, ratios); return; } @@ -575,6 +574,7 @@ class MatrixDelayedUpdateCUDA const int lda = engine_leader.get_psiMinv().cols(); const int nw = engines.size(); const int n_accepted = psiM_g_list.size(); + const size_t phi_vgl_stride = nw * norb; accept_rejectRow_buffer_H2D.resize((sizeof(Value*) * 12 + sizeof(Value)) * nw); engine_leader.resize_fill_constant_arrays(nw); @@ -594,7 +594,7 @@ class MatrixDelayedUpdateCUDA ptr_buffer[6][count_accepted] = engine.Binv_gpu.data() + delay_count; ptr_buffer[7][count_accepted] = reinterpret_cast(engine.delay_list_gpu.data()); ptr_buffer[8][count_accepted] = engine.V_gpu.data() + norb * delay_count; - ptr_buffer[9][count_accepted] = const_cast(phi_vgl_v_dev_ptr + norb * iw); + ptr_buffer[9][count_accepted] = const_cast(phi_vgl_v.device_data_at(0, iw, 0)); ptr_buffer[10][count_accepted] = psiM_g_list[count_accepted]; ptr_buffer[11][count_accepted] = psiM_l_list[count_accepted]; c_ratio_inv[count_accepted] = Value(1) / ratios[iw]; diff --git a/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h b/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h index 8a5bea5f3f..94ff60e448 100644 --- a/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h +++ b/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h @@ -54,6 +54,8 @@ class MatrixUpdateOMPTarget using OffloadMatrix = Matrix>; template using OffloadVGLVector = VectorSoaContainer>; + template + using OffloadMWVGLArray = Array>; // [VGL, walker, Orbs] struct MatrixUpdateOMPTargetMultiWalkerMem : public Resource { @@ -240,23 +242,24 @@ class MatrixUpdateOMPTarget const std::vector& psiM_g_list, const std::vector& psiM_l_list, const std::vector& isAccepted, - const Value* phi_vgl_v_dev_ptr, - const size_t phi_vgl_stride, + const OffloadMWVGLArray& phi_vgl_v, const std::vector& ratios) { const size_t n_accepted = psiM_g_list.size(); if (n_accepted == 0) return; - auto& engine_leader = engines.getLeader(); - auto& buffer_H2D = engine_leader.mw_mem_->buffer_H2D; - auto& grads_value_v = engine_leader.mw_mem_->grads_value_v; - auto& cone_vec = engine_leader.mw_mem_->cone_vec; - auto& czero_vec = engine_leader.mw_mem_->czero_vec; - auto& mw_temp = engine_leader.mw_mem_->mw_temp; - auto& mw_rcopy = engine_leader.mw_mem_->mw_rcopy; - const int norb = engine_leader.get_psiMinv().rows(); - const int lda = engine_leader.get_psiMinv().cols(); + auto& engine_leader = engines.getLeader(); + auto& buffer_H2D = engine_leader.mw_mem_->buffer_H2D; + auto& grads_value_v = engine_leader.mw_mem_->grads_value_v; + auto& cone_vec = engine_leader.mw_mem_->cone_vec; + auto& czero_vec = engine_leader.mw_mem_->czero_vec; + auto& mw_temp = engine_leader.mw_mem_->mw_temp; + auto& mw_rcopy = engine_leader.mw_mem_->mw_rcopy; + const int norb = engine_leader.get_psiMinv().rows(); + const int lda = engine_leader.get_psiMinv().cols(); + const size_t nw = isAccepted.size(); + const size_t phi_vgl_stride = nw * norb; engine_leader.resize_scratch_arrays(norb, n_accepted); @@ -264,11 +267,11 @@ class MatrixUpdateOMPTarget buffer_H2D.resize((sizeof(Value*) * 6 + sizeof(Value)) * n_accepted); Matrix ptr_buffer(reinterpret_cast(buffer_H2D.data()), 6, n_accepted); Value* c_ratio_inv = reinterpret_cast(buffer_H2D.data() + sizeof(Value*) * 6 * n_accepted); - for (int iw = 0, count = 0; iw < isAccepted.size(); iw++) + for (int iw = 0, count = 0; iw < nw; iw++) if (isAccepted[iw]) { ptr_buffer[0][count] = engines[iw].get_ref_psiMinv().device_data(); - ptr_buffer[1][count] = const_cast(phi_vgl_v_dev_ptr + norb * iw); + ptr_buffer[1][count] = const_cast(phi_vgl_v.device_data_at(0, iw, 0)); ptr_buffer[2][count] = mw_temp.device_data() + norb * count; ptr_buffer[3][count] = mw_rcopy.device_data() + norb * count; ptr_buffer[4][count] = psiM_g_list[count]; @@ -343,11 +346,10 @@ class MatrixUpdateOMPTarget const std::vector& psiM_g_list, const std::vector& psiM_l_list, const std::vector& isAccepted, - const Value* phi_vgl_v_dev_ptr, - const size_t phi_vgl_stride, + const OffloadMWVGLArray& phi_vgl_v, const std::vector& ratios) { - mw_updateRow(engines, rowchanged, psiM_g_list, psiM_l_list, isAccepted, phi_vgl_v_dev_ptr, phi_vgl_stride, ratios); + mw_updateRow(engines, rowchanged, psiM_g_list, psiM_l_list, isAccepted, phi_vgl_v, ratios); } /** update the full Ainv and reset delay_count diff --git a/src/QMCWaveFunctions/SPOSet.cpp b/src/QMCWaveFunctions/SPOSet.cpp index d2919ade80..93ade40335 100644 --- a/src/QMCWaveFunctions/SPOSet.cpp +++ b/src/QMCWaveFunctions/SPOSet.cpp @@ -112,20 +112,20 @@ void SPOSet::mw_evaluateVGLandDetRatioGrads(const RefVectorWithLeader& s const RefVectorWithLeader& P_list, int iat, const std::vector& invRow_ptr_list, - VGLVector& phi_vgl_v, + OffloadMWVGLArray& phi_vgl_v, std::vector& ratios, std::vector& grads) const { assert(this == &spo_list.getLeader()); - assert(OrbitalSetSize * nw == phi_vgl_v.size()); + assert(phi_vgl_v.size(0) == DIM_VGL); + assert(phi_vgl_v.size(1) == spo_list.size()); const size_t nw = spo_list.size(); - const size_t norb_requested = phi_vgl_v.size() / nw; + const size_t norb_requested = phi_vgl_v.size(2); GradVector dphi_v(norb_requested); -#pragma omp parallel for for (int iw = 0; iw < nw; iw++) { - ValueVector phi_v(phi_vgl_v.data() + norb_requested * iw, norb_requested); - ValueVector d2phi_v(phi_vgl_v.data(4) + norb_requested * iw, norb_requested); + ValueVector phi_v(phi_vgl_v.data_at(0, iw, 0), norb_requested); + ValueVector d2phi_v(phi_vgl_v.data_at(4, iw, 0), norb_requested); spo_list[iw].evaluateVGL(P_list[iw], iat, phi_v, dphi_v, d2phi_v); ratios[iw] = simd::dot(invRow_ptr_list[iw], phi_v.data(), norb_requested); @@ -134,11 +134,12 @@ void SPOSet::mw_evaluateVGLandDetRatioGrads(const RefVectorWithLeader& s // transpose the array of gradients to SoA in phi_vgl_v for (size_t idim = 0; idim < DIM; idim++) { - ValueType* phi_g = phi_vgl_v.data(idim + 1) + norb_requested * iw; - for (size_t iorb = 0; iorb < OrbitalSetSize; iorb++) + ValueType* phi_g = phi_vgl_v.data_at(idim + 1, iw, 0); + for (size_t iorb = 0; iorb < norb_requested; iorb++) phi_g[iorb] = dphi_v[iorb][idim]; } } + phi_vgl_v.updateTo(); } void SPOSet::evaluateThirdDeriv(const ParticleSet& P, int first, int last, GGGMatrix& grad_grad_grad_logdet) diff --git a/src/QMCWaveFunctions/SPOSet.h b/src/QMCWaveFunctions/SPOSet.h index bc031a4591..4c17b1bb58 100644 --- a/src/QMCWaveFunctions/SPOSet.h +++ b/src/QMCWaveFunctions/SPOSet.h @@ -27,6 +27,7 @@ #ifdef QMC_CUDA #include "type_traits/CUDATypes.h" #endif +#include "OMPTarget/OffloadAlignedAllocators.hpp" namespace qmcplusplus { @@ -56,6 +57,8 @@ class SPOSet : public QMCTraits using Walker_t = ParticleSet::Walker_t; using SPOPool_t = std::map; + using OffloadMWVGLArray = Array>; // [VGL, walker, Orbs] + /** constructor */ SPOSet(bool use_OMP_offload = false, bool ion_deriv = false, bool optimizable = false); @@ -283,8 +286,8 @@ class SPOSet : public QMCTraits const RefVector& d2psi_v_list, const RefVector& dspin_v_list) const; - /** evaluate the values, gradients and laplacians of this single-particle orbital sets - * and determinant ratio and grads of multiple walkers + /** evaluate the values, gradients and laplacians of this single-particle orbital sets and determinant ratio + * and grads of multiple walkers. Device data of phi_vgl_v must be up-to-date upon return * @param spo_list the list of SPOSet pointers in a walker batch * @param P_list the list of ParticleSet pointers in a walker batch * @param iat active particle @@ -295,7 +298,7 @@ class SPOSet : public QMCTraits const RefVectorWithLeader& P_list, int iat, const std::vector& invRow_ptr_list, - VGLVector& phi_vgl_v, + OffloadMWVGLArray& phi_vgl_v, std::vector& ratios, std::vector& grads) const; From 6ad3a904b5e095d52d261c9113c4d13cb90587e6 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Mon, 13 Jun 2022 23:05:33 -0500 Subject: [PATCH 18/36] Verbose magic number --- src/Configuration.h | 2 +- .../Fermion/MatrixDelayedUpdateCUDA.h | 37 ++++++++++++------- .../Fermion/MatrixUpdateOMPTarget.h | 16 ++++---- 3 files changed, 34 insertions(+), 21 deletions(-) diff --git a/src/Configuration.h b/src/Configuration.h index 44230d95a2..0244546422 100644 --- a/src/Configuration.h +++ b/src/Configuration.h @@ -51,7 +51,7 @@ struct QMCTraits enum { DIM = OHMMS_DIM, - DIM_VGL = OHMMS_DIM + 2 + DIM_VGL = OHMMS_DIM + 2 // Value(1) + Gradients(OHMMS_DIM) + Laplacian(1) }; using QTBase = QMCTypes; using QTFull = QMCTypes; diff --git a/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h b/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h index e8837edebd..55c187a4b9 100644 --- a/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h +++ b/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h @@ -206,11 +206,13 @@ class MatrixDelayedUpdateCUDA const int norb = engine_leader.get_psiMinv().rows(); const int nw = engines.size(); int& delay_count = engine_leader.delay_count; - prepare_inv_row_buffer_H2D.resize(sizeof(Value*) * 7 * nw); + + constexpr size_t num_ptrs_packed = 7; // it must match packing and unpacking + prepare_inv_row_buffer_H2D.resize(sizeof(Value*) * num_ptrs_packed * nw); engine_leader.resize_fill_constant_arrays(nw); const int lda_Binv = engine_leader.Binv_gpu.cols(); - Matrix ptr_buffer(reinterpret_cast(prepare_inv_row_buffer_H2D.data()), 7, nw); + Matrix ptr_buffer(reinterpret_cast(prepare_inv_row_buffer_H2D.data()), num_ptrs_packed, nw); for (int iw = 0; iw < nw; iw++) { This_t& engine = engines[iw]; @@ -305,11 +307,14 @@ class MatrixDelayedUpdateCUDA const size_t phi_vgl_stride = nw * norb; mw_temp.resize(norb * n_accepted); mw_rcopy.resize(norb * n_accepted); - updateRow_buffer_H2D.resize((sizeof(Value*) * 6 + sizeof(Value)) * n_accepted); + + constexpr size_t num_ptrs_packed = 6; // it must match packing and unpacking + updateRow_buffer_H2D.resize((sizeof(Value*) * num_ptrs_packed + sizeof(Value)) * n_accepted); // to handle T** of Ainv, psi_v, temp, rcopy - Matrix ptr_buffer(reinterpret_cast(updateRow_buffer_H2D.data()), 6, n_accepted); - Value* c_ratio_inv = reinterpret_cast(updateRow_buffer_H2D.data() + sizeof(Value*) * 6 * n_accepted); + Matrix ptr_buffer(reinterpret_cast(updateRow_buffer_H2D.data()), num_ptrs_packed, n_accepted); + Value* c_ratio_inv = + reinterpret_cast(updateRow_buffer_H2D.data() + sizeof(Value*) * num_ptrs_packed * n_accepted); for (int iw = 0, count = 0; iw < isAccepted.size(); iw++) if (isAccepted[iw]) { @@ -448,9 +453,10 @@ class MatrixDelayedUpdateCUDA auto& evalGrad_buffer_H2D = engine_leader.mw_mem_->evalGrad_buffer_H2D; auto& grads_value_v = engine_leader.mw_mem_->grads_value_v; - const int nw = engines.size(); - evalGrad_buffer_H2D.resize(sizeof(Value*) * 2 * nw); - Matrix ptr_buffer(reinterpret_cast(evalGrad_buffer_H2D.data()), 2, nw); + const int nw = engines.size(); + constexpr size_t num_ptrs_packed = 2; // it must match packing and unpacking + evalGrad_buffer_H2D.resize(sizeof(Value*) * num_ptrs_packed * nw); + Matrix ptr_buffer(reinterpret_cast(evalGrad_buffer_H2D.data()), num_ptrs_packed, nw); for (int iw = 0; iw < nw; iw++) { if (engine_leader.isSM1()) @@ -575,11 +581,14 @@ class MatrixDelayedUpdateCUDA const int nw = engines.size(); const int n_accepted = psiM_g_list.size(); const size_t phi_vgl_stride = nw * norb; - accept_rejectRow_buffer_H2D.resize((sizeof(Value*) * 12 + sizeof(Value)) * nw); + + constexpr size_t num_ptrs_packed = 12; // it must match packing and unpacking + accept_rejectRow_buffer_H2D.resize((sizeof(Value*) * num_ptrs_packed + sizeof(Value)) * nw); engine_leader.resize_fill_constant_arrays(nw); - Matrix ptr_buffer(reinterpret_cast(accept_rejectRow_buffer_H2D.data()), 12, nw); - Value* c_ratio_inv = reinterpret_cast(accept_rejectRow_buffer_H2D.data() + sizeof(Value*) * 12 * nw); + Matrix ptr_buffer(reinterpret_cast(accept_rejectRow_buffer_H2D.data()), num_ptrs_packed, nw); + Value* c_ratio_inv = + reinterpret_cast(accept_rejectRow_buffer_H2D.data() + sizeof(Value*) * num_ptrs_packed * nw); for (int iw = 0, count_accepted = 0, count_rejected = 0; iw < nw; iw++) { This_t& engine = engines[iw]; @@ -693,10 +702,12 @@ class MatrixDelayedUpdateCUDA const int norb = engine_leader.get_psiMinv().rows(); const int lda = engine_leader.get_psiMinv().cols(); const int nw = engines.size(); - updateInv_buffer_H2D.resize(sizeof(Value*) * 6 * nw); + + constexpr size_t num_ptrs_packed = 6; // it must match packing and unpacking + updateInv_buffer_H2D.resize(sizeof(Value*) * num_ptrs_packed * nw); engine_leader.resize_fill_constant_arrays(nw); - Matrix ptr_buffer(reinterpret_cast(updateInv_buffer_H2D.data()), 6, nw); + Matrix ptr_buffer(reinterpret_cast(updateInv_buffer_H2D.data()), num_ptrs_packed, nw); for (int iw = 0; iw < nw; iw++) { This_t& engine = engines[iw]; diff --git a/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h b/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h index 94ff60e448..6ce11595b1 100644 --- a/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h +++ b/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h @@ -157,10 +157,11 @@ class MatrixUpdateOMPTarget auto& buffer_H2D = engine_leader.mw_mem_->buffer_H2D; auto& grads_value_v = engine_leader.mw_mem_->grads_value_v; - const int norb = engine_leader.get_psiMinv().rows(); - const int nw = engines.size(); - buffer_H2D.resize(sizeof(Value*) * 2 * nw); - Matrix ptr_buffer(reinterpret_cast(buffer_H2D.data()), 2, nw); + const int norb = engine_leader.get_psiMinv().rows(); + const int nw = engines.size(); + constexpr size_t num_ptrs_packed = 2; // it must match packing and unpacking + buffer_H2D.resize(sizeof(Value*) * num_ptrs_packed * nw); + Matrix ptr_buffer(reinterpret_cast(buffer_H2D.data()), num_ptrs_packed, nw); for (int iw = 0; iw < nw; iw++) { ptr_buffer[0][iw] = engines[iw].get_psiMinv().device_data() + rowchanged * engine_leader.get_psiMinv().cols(); @@ -264,9 +265,10 @@ class MatrixUpdateOMPTarget engine_leader.resize_scratch_arrays(norb, n_accepted); // to handle Value** of Ainv, psi_v, temp, rcopy - buffer_H2D.resize((sizeof(Value*) * 6 + sizeof(Value)) * n_accepted); - Matrix ptr_buffer(reinterpret_cast(buffer_H2D.data()), 6, n_accepted); - Value* c_ratio_inv = reinterpret_cast(buffer_H2D.data() + sizeof(Value*) * 6 * n_accepted); + constexpr size_t num_ptrs_packed = 6; // it must match packing and unpacking + buffer_H2D.resize((sizeof(Value*) * num_ptrs_packed + sizeof(Value)) * n_accepted); + Matrix ptr_buffer(reinterpret_cast(buffer_H2D.data()), num_ptrs_packed, n_accepted); + Value* c_ratio_inv = reinterpret_cast(buffer_H2D.data() + sizeof(Value*) * num_ptrs_packed * n_accepted); for (int iw = 0, count = 0; iw < nw; iw++) if (isAccepted[iw]) { From 0d4d5e9db8dcda2e2f35b7d68dc89afe5981b47d Mon Sep 17 00:00:00 2001 From: Hyeondeok-Shin Date: Tue, 14 Jun 2022 11:49:53 -0500 Subject: [PATCH 19/36] Adjust tolerance for deterministic test for SOREP --- tests/solids/bccMo_2x1x1_SOREP/CMakeLists.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/solids/bccMo_2x1x1_SOREP/CMakeLists.txt b/tests/solids/bccMo_2x1x1_SOREP/CMakeLists.txt index fee8c897d9..c130bbde4b 100644 --- a/tests/solids/bccMo_2x1x1_SOREP/CMakeLists.txt +++ b/tests/solids/bccMo_2x1x1_SOREP/CMakeLists.txt @@ -80,11 +80,11 @@ if(QMC_COMPLEX) #Deterministic test if(NOT QMC_MIXED_PRECISION) - list(APPEND DET_BCC_MO_VMC_SCALARS "totenergy" "-136.89324595 0.000001") + list(APPEND DET_BCC_MO_VMC_SCALARS "totenergy" "-136.89324595 0.000002") list(APPEND DET_BCC_MO_VMC_SCALARS "kinetic" "48.19451127 0.000001") - list(APPEND DET_BCC_MO_VMC_SCALARS "potential" "-185.08775723 0.000001") + list(APPEND DET_BCC_MO_VMC_SCALARS "potential" "-185.08775723 0.000002") list(APPEND DET_BCC_MO_VMC_SCALARS "eeenergy" "0.73302788 0.000001") - list(APPEND DET_BCC_MO_VMC_SCALARS "ionion" "-119.82754227 0.000001") + list(APPEND DET_BCC_MO_VMC_SCALARS "ionion" "-119.82754227 0.000002") list(APPEND DET_BCC_MO_VMC_SCALARS "localecp" "-90.13681859 0.000001") list(APPEND DET_BCC_MO_VMC_SCALARS "nonlocalecp" "24.16055828 0.000001") list(APPEND DET_BCC_MO_VMC_SCALARS "soecp" "-0.01698252 0.000001") @@ -102,11 +102,11 @@ if(QMC_COMPLEX) DET_BCC_MO_VMC_SCALARS # VMC ) - list(APPEND DET_BCC_MO_DMC_SCALARS "totenergy" "-135.29589113 0.000001") + list(APPEND DET_BCC_MO_DMC_SCALARS "totenergy" "-135.29589113 0.000002") list(APPEND DET_BCC_MO_DMC_SCALARS "kinetic" "29.19088964 0.000001") - list(APPEND DET_BCC_MO_DMC_SCALARS "potential" "-164.48678078 0.000001") + list(APPEND DET_BCC_MO_DMC_SCALARS "potential" "-164.48678078 0.000002") list(APPEND DET_BCC_MO_DMC_SCALARS "eeenergy" "-2.19402623 0.000001") - list(APPEND DET_BCC_MO_DMC_SCALARS "ionion" "-119.82754227 0.000001") + list(APPEND DET_BCC_MO_DMC_SCALARS "ionion" "-119.82754227 0.000002") list(APPEND DET_BCC_MO_DMC_SCALARS "localecp" "-64.57264081 0.000001") list(APPEND DET_BCC_MO_DMC_SCALARS "nonlocalecp" "22.11861933 0.000001") list(APPEND DET_BCC_MO_DMC_SCALARS "soecp" "-0.01119080 0.000001") From 21c6bf1801df18a67fef2af25228ebb629c0ff86 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Wed, 15 Jun 2022 16:15:41 -0500 Subject: [PATCH 20/36] Fix legacy CUDA compilation issue. --- src/Particle/ParticleSet.cpp | 7 ------- src/Particle/Walker.h | 2 +- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/src/Particle/ParticleSet.cpp b/src/Particle/ParticleSet.cpp index 8e69f67fbc..b6c72606fc 100644 --- a/src/Particle/ParticleSet.cpp +++ b/src/Particle/ParticleSet.cpp @@ -33,13 +33,6 @@ namespace qmcplusplus { using WP = WalkerProperties::Indexes; -//using namespace particle_info; - -#ifdef QMC_CUDA -template<> -int ParticleSet::Walker_t::cuda_DataSize = 0; -#endif - enum PSetTimers { PS_newpos, diff --git a/src/Particle/Walker.h b/src/Particle/Walker.h index 03347145d5..91847fcaba 100644 --- a/src/Particle/Walker.h +++ b/src/Particle/Walker.h @@ -154,7 +154,7 @@ class Walker /// Data for GPU-vectorized versions #ifdef QMC_CUDA - static inline int cuda_DataSize; + static inline int cuda_DataSize = 0; using cuda_Buffer_t = gpu::device_vector; cuda_Buffer_t cuda_DataSet; // Note that R_GPU has size N+1. The last element contains the From a25bf089bca193b0f619827311cf6c5460c90ccd Mon Sep 17 00:00:00 2001 From: Hyeondeok-Shin Date: Wed, 15 Jun 2022 16:22:29 -0500 Subject: [PATCH 21/36] Adjust tolerance for unit_test_estimators for One-body densty matrices on mixed precision build --- src/Estimators/tests/test_OneBodyDensityMatrices.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Estimators/tests/test_OneBodyDensityMatrices.cpp b/src/Estimators/tests/test_OneBodyDensityMatrices.cpp index fc1e704a73..b448c5ef8c 100644 --- a/src/Estimators/tests/test_OneBodyDensityMatrices.cpp +++ b/src/Estimators/tests/test_OneBodyDensityMatrices.cpp @@ -140,7 +140,7 @@ class OneBodyDensityMatricesTests auto* test_data = reinterpret_cast*>(test_in); for (size_t id = 0; id < size; id += 2) #if defined(MIXED_PRECISION) - CHECK(ref_data[id] == ComplexApprox(test_data[id]).epsilon(1e-4)); + CHECK(ref_data[id] == ComplexApprox(test_data[id]).epsilon(6e-4)); #else CHECK(ref_data[id] == ComplexApprox(test_data[id])); #endif From e0df1eb58bbdff5ea324568f36a9de34944a54bc Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Wed, 15 Jun 2022 17:27:28 -0500 Subject: [PATCH 22/36] Derive OpenMP OFFLOAD_ARCH based on HIP_ARCH. --- CMake/ClangCompilers.cmake | 22 ++++++++++++++++++---- CMakeLists.txt | 2 +- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/CMake/ClangCompilers.cmake b/CMake/ClangCompilers.cmake index 00d6f28cb4..c8ba9ddf9b 100644 --- a/CMake/ClangCompilers.cmake +++ b/CMake/ClangCompilers.cmake @@ -32,10 +32,6 @@ if(QMC_OMP) set(OPENMP_OFFLOAD_COMPILE_OPTIONS "${OPENMP_OFFLOAD_COMPILE_OPTIONS} -Wno-linker-warnings") endif() - if(NOT DEFINED OFFLOAD_ARCH AND OFFLOAD_TARGET MATCHES "amdgcn") - set(OFFLOAD_ARCH gfx906) - endif() - if(NOT DEFINED OFFLOAD_ARCH AND OFFLOAD_TARGET MATCHES "nvptx64" AND DEFINED CMAKE_CUDA_ARCHITECTURES) @@ -51,6 +47,24 @@ if(QMC_OMP) endif() endif() + if(NOT DEFINED OFFLOAD_ARCH + AND OFFLOAD_TARGET MATCHES "amdgcn") + if (DEFINED HIP_ARCH) + list(LENGTH HIP_ARCH NUMBER_HIP_ARCHITECTURES) + if(NUMBER_HIP_ARCHITECTURES EQUAL "1") + set(OFFLOAD_ARCH ${HIP_ARCH}) + else() + message( + FATAL_ERROR + "LLVM does not yet support offload to multiple architectures! " + "Deriving OFFLOAD_ARCH from HIP_ARCH failed. " + "Please keep only one entry in HIP_ARCH or set OFFLOAD_ARCH.") + endif() + else() + set(OFFLOAD_ARCH gfx906) + endif() + endif() + if(DEFINED OFFLOAD_ARCH) set(OPENMP_OFFLOAD_COMPILE_OPTIONS "${OPENMP_OFFLOAD_COMPILE_OPTIONS} -Xopenmp-target=${OFFLOAD_TARGET} -march=${OFFLOAD_ARCH}") diff --git a/CMakeLists.txt b/CMakeLists.txt index 7517985f36..46f4eaeeab 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -779,7 +779,7 @@ if(ENABLE_ROCM) message(STATUS "ROCM_ROOT not provided. Searching for FindHIP.cmake file.") find_path( HIP_MODULE_FILE_DIR FindHIP.cmake - HINTS /opt/rocm + HINTS $ENV{ROCM_PATH} /opt/rocm PATH_SUFFIXES hip/cmake) if(HIP_MODULE_FILE_DIR) message(STATUS "Found FindHIP.cmake file. ROCM_ROOT will be derived.") From 1aed50d855c20d871fb62736fc5ad16c14c9e991 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Wed, 15 Jun 2022 18:12:48 -0500 Subject: [PATCH 23/36] Update OpenMP offload build instruction. --- docs/installation.rst | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/docs/installation.rst b/docs/installation.rst index fd7fa722ff..b5cd4fbe24 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -282,20 +282,20 @@ the path to the source directory. :: - QMC_CUDA Enable legacy CUDA code path for NVIDIA GPU acceleration (1:yes, 0:no) QMC_COMPLEX Build the complex (general twist/k-point) version (1:yes, 0:no) QMC_MIXED_PRECISION Build the mixed precision (mixing double/float) version (1:yes (QMC_CUDA=1 default), 0:no (QMC_CUDA=0 default)). Mixed precision calculations can be signifiantly faster but should be carefully checked validated against full double precision runs, particularly for large electron counts. + ENABLE_OFFLOAD ON/OFF(default). Enable OpenMP target offload for GPU acceleration. + QMC_CUDA Enable legacy CUDA code path for NVIDIA GPU acceleration (1:yes, 0:no) ENABLE_CUDA ON/OFF(default). Enable CUDA code path for NVIDIA GPU acceleration. - Production quality for AFQMC. Pre-production quality for real-space. + Production quality for AFQMC and real-space performance portable implementation. Use CMAKE_CUDA_ARCHITECTURES, default 70, to set the actual GPU architecture. - ENABLE_OFFLOAD ON/OFF(default). Enable OpenMP target offload for GPU acceleration. - ENABLE_TIMERS ON(default)/OFF. Enable fine-grained timers. Timers are on by default but at level coarse - to avoid potential slowdown in tiny systems. - For systems beyond tiny sizes (100+ electrons) there is no risk. + QMC_CUDA2HIP ON/OFF(default). To be set ON, it requires either QMC_CUDA or ENABLE_CUDA to be ON. + Compile CUDA source code as HIP and use ROCm libraries for AMD GPUs. + ENABLE_SYCL ON/OFF(default). Enable SYCL code path. Only support Intel GPUs and OneAPI compilers. - General build options @@ -327,6 +327,9 @@ the path to the source directory. :: + ENABLE_TIMERS ON(default)/OFF. Enable fine-grained timers. Timers are on by default but at level coarse + to avoid potential slowdown in tiny systems. + For systems beyond tiny sizes (100+ electrons) there is no risk. QE_BIN Location of Quantum ESPRESSO binaries including pw2qmcpack.x RMG_BIN Location of RMG binary (rmg-cpu) QMC_DATA Specify data directory for QMCPACK performance and integration tests @@ -412,7 +415,7 @@ and is not suitable for production. Additional implementation in QMCPACK as well as improvements in open-source and vendor compilers is required for production status to be reached. The following compilers have been verified: -- LLVM Clang 11. Support NVIDIA GPUs. +- LLVM Clang 14. Support NVIDIA GPUs. :: @@ -425,31 +428,43 @@ to be reached. The following compilers have been verified: OFFLOAD_TARGET for the offload target. default nvptx64-nvidia-cuda. OFFLOAD_ARCH for the target architecture (sm_80, gfx906, ...) if not using the compiler default. -- AMD AOMP Clang 11.8. Support AMD GPUs. +- AMD ROCm/AOMP LLVM-based compilers. Support AMD GPUs. :: -D ENABLE_OFFLOAD=ON -D OFFLOAD_TARGET=amdgcn-amd-amdhsa -D OFFLOAD_ARCH=gfx906 -- Intel oneAPI beta08. Support Intel GPUs. +- Intel oneAPI 2022.1.0 icx/icpx compilers. Support Intel GPUs. :: -D ENABLE_OFFLOAD=ON -D OFFLOAD_TARGET=spir64 -- HPE Cray 11. It is derived from Clang and supports NVIDIA and AMD GPUs. +- HPE Cray 13. It is derived from Clang and supports NVIDIA and AMD GPUs. :: -D ENABLE_OFFLOAD=ON -D OFFLOAD_TARGET=nvptx64-nvidia-cuda -D OFFLOAD_ARCH=sm_80 OpenMP offload features can be used together with vendor specific code paths to maximize QMCPACK performance. -Some new CUDA functionality has been implemented to improve efficiency on NVIDIA GPUs in conjunction with the Offload code paths: -For example, using Clang 11 on Summit. +Some new CUDA functionality has been implemented to improve performance on NVIDIA GPUs in conjunction with the offload code paths: +For example, using Clang 14 on Summit. :: - -D ENABLE_OFFLOAD=ON -D USE_OBJECT_TARGET=ON -D ENABLE_CUDA=ON -D CMAKE_CUDA_ARCHITECTURES=70 -D CMAKE_CUDA_HOST_COMPILER=`which gcc` + -D ENABLE_OFFLOAD=ON -D USE_OBJECT_TARGET=ON -D ENABLE_CUDA=ON -D CMAKE_CUDA_ARCHITECTURES=70 + +Similarly, HIP features can be enabled in conjunction with the offload code path to improve performance on AMD GPUs. + + :: + + -D ENABLE_OFFLOAD=ON -D ENABLE_CUDA=ON -D QMC_CUDA2HIP=ON -DHIP_ARCH=gfx906 + +Similarly, SYCL features can be enabled in conjunction with the offload code path to improve performance on Intel GPUs. + + :: + + -D ENABLE_OFFLOAD=ON -D ENABLE_SYCL=ON Installation from CMake From 19acb573dfa04913a00cff4af847a6b9308a9898 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Mon, 20 Jun 2022 09:07:19 -0500 Subject: [PATCH 24/36] Update nightly_anl_bora.sh --- .../nightly_test_scripts/nightly_anl_bora.sh | 80 ++++++++++++++----- 1 file changed, 60 insertions(+), 20 deletions(-) diff --git a/tests/test_automation/nightly_test_scripts/nightly_anl_bora.sh b/tests/test_automation/nightly_test_scripts/nightly_anl_bora.sh index 0e1731e8fa..0799667cbe 100755 --- a/tests/test_automation/nightly_test_scripts/nightly_anl_bora.sh +++ b/tests/test_automation/nightly_test_scripts/nightly_anl_bora.sh @@ -13,11 +13,14 @@ fi module load cmake module load intel-mkl -module load cuda/10.1 + +module use /nfs/gce/projects/QMCPACK_dev/spack/share/spack/lmod/linux-ubuntu18.04-x86_64/Core +module load cuda/11.2.2-lo3x6k7 export TEST_SITE_NAME=bora.alcf.anl.gov export N_PROCS=16 export N_PROCS_BUILD=16 +export N_CONCURRENT_TESTS=16 # run on socket 1 NUMA_ID=0 @@ -26,7 +29,7 @@ QE_BIN=/scratch/opt/qe-stable/qe-6.4.1/bin QMC_DATA=/scratch/opt/h5data #Must be an absolute path -place=/scratch/QMCPACK_CI_BUILDS_DO_NOT_REMOVE +place=/scratch2/QMCPACK_CI_BUILDS_DO_NOT_REMOVE if [ ! -e $place ]; then mkdir $place @@ -56,9 +59,11 @@ cd $entry git checkout $branch -for sys in ClangDev-Offload-CUDA-Real ClangDev-Offload-CUDA-Complex ClangDev-Offload-CUDA-Real-Mixed ClangDev-Offload-CUDA-Complex-Mixed \ - Intel18-Real Intel18-Real-Mixed Intel18-Complex Intel18-Complex-Mixed \ - Intel18-Real-Mixed-CUDA2 Intel18-Complex-Mixed-CUDA2 Intel18-Real-Mixed-legacy-CUDA Intel18-Complex-Mixed-legacy-CUDA +for sys in ROCm-CUDA2HIP-Real ROCm-CUDA2HIP-Complex ROCm-CUDA2HIP-Real-Mixed ROCm-CUDA2HIP-Complex-Mixed \ + Intel19-Real Intel19-Real-Mixed Intel19-Complex Intel19-Complex-Mixed \ + ClangDev-Offload-Real ClangDev-Offload-Complex ClangDev-Offload-Real-Mixed ClangDev-Offload-Complex-Mixed \ + ClangDev-Offload-CUDA-Real ClangDev-Offload-CUDA-Complex ClangDev-Offload-CUDA-Real-Mixed ClangDev-Offload-CUDA-Complex-Mixed \ + Intel19-CUDA2-Real-Mixed Intel19-CUDA2-Complex-Mixed Intel19-legacy-CUDA-Real-Mixed Intel19-legacy-CUDA-Complex-Mixed do echo --- Building for $sys `date` @@ -78,42 +83,70 @@ if [[ $sys == *"ClangDev"* ]]; then #define and load compiler module load llvm/master-latest module load openmpi/4.0.2-llvm - export CC=mpicc - export CXX=mpicxx - CTEST_FLAGS="$CTEST_FLAGS -DCMAKE_C_FLAGS=-march=skylake -DCMAKE_CXX_FLAGS=-march=skylake" if [[ $sys == *"Offload-CUDA"* ]]; then - CTEST_FLAGS="$CTEST_FLAGS -DQMC_OPTIONS='-DENABLE_OFFLOAD=ON;-DUSE_OBJECT_TARGET=ON;-DCMAKE_CUDA_HOST_COMPILER=`which gcc`;-DQMC_NIO_MAX_SIZE=16'" - CTEST_FLAGS="$CTEST_FLAGS -DENABLE_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES=61" - CTEST_LABELS="-L deterministic -LE unstable" - export N_CONCURRENT_TESTS=4 + CTEST_FLAGS="$CTEST_FLAGS -DQMC_OPTIONS='-DENABLE_OFFLOAD=ON;-DUSE_OBJECT_TARGET=ON;-DQMC_NIO_MAX_SIZE=8'" + CTEST_FLAGS="$CTEST_FLAGS -DENABLE_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=61 -DBUILD_AFQMC=ON" + elif [[ $sys == *"Offload"* ]]; then + CTEST_FLAGS="$CTEST_FLAGS -DQMC_OPTIONS='-DENABLE_OFFLOAD=ON;-DOFFLOAD_ARCH=sm_61;-DUSE_OBJECT_TARGET=ON;-DQMC_NIO_MAX_SIZE=8'" + CTEST_FLAGS="$CTEST_FLAGS -DBUILD_AFQMC=ON" else CTEST_FLAGS="$CTEST_FLAGS -DQE_BIN=$QE_BIN" - CTEST_LABELS="-L 'deterministic|performance' -LE unstable" - export N_CONCURRENT_TESTS=16 + fi + + export LIBOMP_USE_HIDDEN_HELPER_TASK=OFF + if [[ $sys == *"Real-Mixed"* ]]; then + #CTEST_LABELS="-E 'long|short-LiH|short-bccH_1x1x1|short-H2O_dimer|short-diamondC_1x1x1|short-diamondC_2x1x1_pp-dmc|short-li2_sto|short-NiO_a4_e48_pp'" + CTEST_LABELS="-E 'restart|save|long|developer|He_param|example|cpu_limit|short-LiH|short-bccH_1x1x1|short-H2O_dimer|short-diamondC_1x1x1|short-diamondC_2x1x1_pp-dmc|short-li2_sto|short-NiO_a4_e48_pp|short-C'" + else + CTEST_LABELS="-L 'deterministic|QMCPACK' -LE unstable -E long" + fi +elif [[ $sys == *"ROCm"* ]]; then + #define and load compiler + module load rocm/afar001-432 aomp/afar001-432 + module load openmpi/4.0.2-llvm + + if [[ $sys == *"Offload-CUDA2HIP"* ]]; then + CTEST_FLAGS="$CTEST_FLAGS -DQMC_OPTIONS='-DHIP_ARCH=gfx1030;-DENABLE_OFFLOAD=ON;-DQMC_NIO_MAX_SIZE=8'" + CTEST_FLAGS="$CTEST_FLAGS -DENABLE_CUDA=ON -DQMC_CUDA2HIP=ON" + elif [[ $sys == *"CUDA2HIP"* ]]; then + CTEST_FLAGS="$CTEST_FLAGS -DQMC_OPTIONS='-DHIP_ARCH=gfx1030;-DQMC_NIO_MAX_SIZE=8'" + CTEST_FLAGS="$CTEST_FLAGS -DENABLE_CUDA=ON -DQMC_CUDA2HIP=ON" + elif [[ $sys == *"Offload"* ]]; then + CTEST_FLAGS="$CTEST_FLAGS -DQMC_OPTIONS='-DENABLE_OFFLOAD=ON;-DOFFLOAD_TARGET=amdgcn-amd-amdhsa;-DOFFLOAD_ARCH=gfx1030;-DQMC_NIO_MAX_SIZE=8'" + else + CTEST_FLAGS="$CTEST_FLAGS -DQE_BIN=$QE_BIN" + fi + + if [[ $sys == *"Real-Mixed"* ]]; then + #CTEST_LABELS="-E 'long|short-LiH|short-bccH_1x1x1|short-H2O_dimer|short-diamondC_1x1x1|short-diamondC_2x1x1_pp-dmc|short-li2_sto|short-NiO_a4_e48_pp'" + CTEST_LABELS="-E 'restart|save|long|developer|He_param|example|cpu_limit|short-LiH|short-bccH_1x1x1|short-H2O_dimer|short-diamondC_1x1x1|short-diamondC_2x1x1_pp-dmc|short-li2_sto|short-NiO_a4_e48_pp|short-C'" + else + CTEST_LABELS="-L 'deterministic|QMCPACK' -LE unstable -E long" fi elif [[ $sys == *"Intel"* ]]; then #define and load compiler - module load intel/18.4 + module load intel/20.2 module load openmpi/4.0.2-intel - export CC=mpicc - export CXX=mpicxx CTEST_FLAGS="$CTEST_FLAGS -DCMAKE_C_FLAGS=-xCOMMON-AVX512 -DCMAKE_CXX_FLAGS=-xCOMMON-AVX512" if [[ $sys == *"-CUDA2"* ]]; then CTEST_FLAGS="$CTEST_FLAGS -DENABLE_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES=61 -DQMC_OPTIONS='-DQMC_NIO_MAX_SIZE=16'" - CTEST_LABELS="-L 'deterministic|performance' -LE unstable" + CTEST_LABELS="-L 'deterministic|performance' -LE unstable -E long" export N_CONCURRENT_TESTS=4 elif [[ $sys == *"-legacy-CUDA"* ]]; then CTEST_FLAGS="$CTEST_FLAGS -DQMC_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES=61 -DQMC_OPTIONS='-DQMC_NIO_MAX_SIZE=16'" - CTEST_LABELS="-L 'deterministic|performance' -LE unstable" + CTEST_LABELS="-L 'deterministic|performance' -LE unstable -E long" export N_CONCURRENT_TESTS=1 else CTEST_FLAGS="$CTEST_FLAGS -DQE_BIN=$QE_BIN" + CTEST_LABELS="-E long" export N_CONCURRENT_TESTS=16 fi fi +CTEST_FLAGS="-DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx $CTEST_FLAGS" + # compiler independent options if [[ $sys == *"-Complex"* ]]; then CTEST_FLAGS="$CTEST_FLAGS -DQMC_COMPLEX=1" @@ -132,8 +165,15 @@ fi mkdir $folder cd $folder +logfile=$place/log/$entry/$mydate/${QMCPACK_TEST_SUBMIT_NAME}.log +echo "$CTEST_FLAGS $CTEST_LABELS" | tee $logfile +echo PATH=$PATH >> $logfile +echo OMPI_CXX=${OMPI_CXX} >> $logfile +mpicxx -v >> $logfile +echo >> $logfile + numactl -N $NUMA_ID \ -ctest $CTEST_FLAGS $CTEST_LABELS -S $PWD/../CMake/ctest_script.cmake,release -VV -E 'long' --timeout 800 &> $place/log/$entry/$mydate/${QMCPACK_TEST_SUBMIT_NAME}.log +ctest $CTEST_FLAGS $CTEST_LABELS -S $PWD/../CMake/ctest_script.cmake,release -VV --timeout 800 &>> $logfile cd .. echo --- Finished $sys `date` From 4d21eb1bd418a4a9678da2ad92c02f0847705b99 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Mon, 20 Jun 2022 09:21:56 -0500 Subject: [PATCH 25/36] Update test scripts and remove unused scripts. --- tests/test_automation/Jenkinsfile_ornl_vm | 4 - tests/test_automation/README.md | 5 +- .../add_fortran_for_spack_clang.py | 26 --- tests/test_automation/jenkins_ornl_test.sh | 36 ---- .../nightly_alcf_hyperion.sh | 178 ------------------ .../spack_supported_package_versions.sh | 50 ----- tests/test_automation/start_spack_env.sh | 14 -- tests/test_automation/vm_setup.sh | 164 ---------------- tests/test_automation/vm_spack_env_cpu.sh | 25 --- 9 files changed, 1 insertion(+), 501 deletions(-) delete mode 100644 tests/test_automation/Jenkinsfile_ornl_vm delete mode 100755 tests/test_automation/add_fortran_for_spack_clang.py delete mode 100755 tests/test_automation/jenkins_ornl_test.sh delete mode 100755 tests/test_automation/nightly_test_scripts/nightly_alcf_hyperion.sh delete mode 100644 tests/test_automation/spack_supported_package_versions.sh delete mode 100644 tests/test_automation/start_spack_env.sh delete mode 100755 tests/test_automation/vm_setup.sh delete mode 100644 tests/test_automation/vm_spack_env_cpu.sh diff --git a/tests/test_automation/Jenkinsfile_ornl_vm b/tests/test_automation/Jenkinsfile_ornl_vm deleted file mode 100644 index 66ac751dff..0000000000 --- a/tests/test_automation/Jenkinsfile_ornl_vm +++ /dev/null @@ -1,4 +0,0 @@ -library 'qmcpack_shared_jenkins' - -common_pipeline(name:"cpu", spack_path:"/var/lib/jenkins/spack", prefix:"vm") - diff --git a/tests/test_automation/README.md b/tests/test_automation/README.md index 5eddb9409d..8ed2b715c3 100644 --- a/tests/test_automation/README.md +++ b/tests/test_automation/README.md @@ -18,7 +18,4 @@ correspond to exactly those used in production, since these are sometimes tweaked for runtime, minor software version updates etc. # CI configuration -Most of these files begin with jenkins. Scrutinize any updates to them -carefully there is basically no reason they should be touched except by maintainers. -They depend on a common jenkins pipeline library pulled from QMCPACK/qmcpack_shared_jenkins. -Updates to that library don't seem to propagate faster than ~10 minutes. +See github-actions subdirectory and /.github/workflows/ diff --git a/tests/test_automation/add_fortran_for_spack_clang.py b/tests/test_automation/add_fortran_for_spack_clang.py deleted file mode 100755 index 859b8ca772..0000000000 --- a/tests/test_automation/add_fortran_for_spack_clang.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env spack python - -import spack.config -# spack uses this as well so it must be there -import argparse -import os - -parser = argparse.ArgumentParser(description='For chosen spec of gcc add gfortran as fortran compiler for all clang compiler configs') -parser.add_argument('gcc_spec', type=str, help='spack spec to get gfortran from') -args = parser.parse_args() - -conf_scope = spack.config.ConfigScope('user/linux', os.path.join(os.environ['HOME'],'.spack','linux')) -compilers_config = conf_scope.get_section_filename('compilers') - -gcc_configs = [ c for c in conf_scope.get_section('compilers')['compilers'] if args.gcc_spec in c['compiler']['spec'] ] -if len(gcc_configs) != 1: - raise NameError('gcc version must evaluate to a single gcc spec') -gcc = gcc_configs[0] - -llvm_configs = [ c for c in conf_scope.get_section('compilers')['compilers'] if "clang" in c['compiler']['spec'] ] -for lconf in llvm_configs: - lconf['compiler']['paths']['f77'] = gcc['compiler']['paths']['f77'] - lconf['compiler']['paths']['fc'] = gcc['compiler']['paths']['fc'] - lconf['compiler']['modules'].append(args.gcc_spec) - -conf_scope.write_section('compilers') diff --git a/tests/test_automation/jenkins_ornl_test.sh b/tests/test_automation/jenkins_ornl_test.sh deleted file mode 100755 index ddd35608e6..0000000000 --- a/tests/test_automation/jenkins_ornl_test.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash -x - -exit_code=0 - -echo "starting spack using ${SPACK_ENV_FILE}" - -# this depends on SPACK_ROOT being set in Jenkinsfile_xxx -# it also supplies QMC_IMMUTABLE_FLAGS which makes it a bit more than the -# environment from the set of loaded spack packages. -. ${SPACK_ENV_FILE} - -cd build_${1}_${2} -BUILD_DIR=$(pwd) -echo "BUILD_DIR: ${BUILD_DIR}" - -export LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} - -module list - -# this keeps tee from eating the exit status -set -o pipefail - -# exclude failing hamiltonian test -EXCLUDE_FLAG='' -if [[ $2 == 'mixed' ]]; then - EXCLUDE_FLAG='-E unit_test_hamiltonian' -fi - - -ctest -j${JNK_THREADS} -L unit ${EXCLUDE_FLAG} --output-on-failure --timeout 120 2>&1 | tee ${1}_${2}_ctest.out -ret=$? -if [[ ${ret} -ne 0 ]] ; then - exit_code=${ret} -fi - -exit ${exit_code} diff --git a/tests/test_automation/nightly_test_scripts/nightly_alcf_hyperion.sh b/tests/test_automation/nightly_test_scripts/nightly_alcf_hyperion.sh deleted file mode 100755 index 3cee8e5e17..0000000000 --- a/tests/test_automation/nightly_test_scripts/nightly_alcf_hyperion.sh +++ /dev/null @@ -1,178 +0,0 @@ -#!/bin/bash -# source environment -# MPI wrappers, MKL, and Intel and GCC compiler -export PATH=/opt/cmake/current/bin:$PATH -source /opt/rh/devtoolset-6/enable -source /opt/intel/2018/parallel_studio_xe_2018/psxevars.sh intel64 - -export N_PROCS_BUILD=64 -export N_PROCS=64 - -# KNL NUMA + Memory Mode -# quit if in a hybrid mode, could run out of memory -if [grep -i hybrid /var/run/hwloc/knl_memoryside_cache]; then - echo Memory Mode equal Hybrid. Quitting. - exit 1 -else - cat /var/run/hwloc/knl_memoryside_cache -fi - -# timeout -timeout=1800 - -# topdir must exist, otherwise it fails -topdir=/data/ci -QMC_DATA=/data/ci/NiO -export BOOST_ROOT=/data/ci/boost_1_61_0 -testdir=${topdir}/scratch/QMCPACK_CI_BUILDS_DO_NOT_REMOVE - -if [ -e $topdir ]; then - -if [ ! -e $testdir ]; then -mkdir -p $testdir -fi - -if [ -e $testdir ]; then -cd $testdir - -# Minimize load of GitHub by maintaining a local cloned git used for all builds -if [ ! -e qmcpack ]; then -echo --- Cloning QMCPACK git `date` -git clone https://github.com/QMCPACK/qmcpack.git --depth 1 -else -cd qmcpack -echo --- Updating local QMCPACK git `date` -git pull -cd .. -fi - -# Sanity check cmake config file present -if [ -e qmcpack/CMakeLists.txt ]; then -echo --- QMCPACK git repo contains CMakeLists.txt - -# Build Quantum Espresso -# Compiled only once with the Intel Compiler -QE_VERSION=6.3 -QE_sysdir=${testdir}/intel2018_QE -QE_BIN=${QE_sysdir}/qe-${QE_VERSION}/bin -echo --- QE_BIN set to ${QE_BIN} - -# Always start from clean build, just in case we updated the QE patch. -if [ -e ${QE_sysdir} ]; then - rm -rf ${QE_sysdir} -fi - -mkdir ${QE_sysdir} - -# Download QE and patch it -cd ${QE_sysdir} -cp -p ../qmcpack/external_codes/quantum_espresso/*${QE_VERSION}* . -./download_and_patch_qe${QE_VERSION}.sh -cd qe-${QE_VERSION} - -# Hack to get QE build to build and link against proper libraries on KNL -# Eventually, Copy make.sys that is known to work. -cp /data/ci/auxfiles/configure-qe-knl-omp.sh . -cp /data/ci/auxfiles/configure-qe-libxsmm.mak . -cp /data/ci/auxfiles/configure-qe-tbbmalloc.mak . - -echo --- Configure QE ${QE_VERSION}$ -./configure-qe-knl-omp.sh -# HDF5 support in QE 6.3 is buggy, revert to older file I/O format -sed -i 's/D__HDF5/D__HDF5_C/' make.inc -echo --- Building QE ${QE_VERSION}$ -# make pwall # parallel build fails due to incorrect dependency -make -j 64 pw -make -j 64 pp - -# Make fault-tolerant, maybe QE did not download properly or there -# was a build failure -if [ -e ${QE_BIN}/pw.x ]; then - echo -- QE ${QE_VERSION} was built properly. -else - echo -- QE ${QE_VERSION} failed to build. - exit 1 -fi - - - -echo --- Starting test builds and tests - -for sys in build_gcc build_gcc_complex build_gcc_mixed build_gcc_complex_mixed build_intel2018 build_intel2018_complex build_intel2018_mixed build_intel2018_complex_mixed -do - -echo --- Building for $sys `date` - -cd ${testdir} - -if [ -e $sys ]; then -rm -rf $sys -fi -mkdir $sys -cd $sys - -# export PATH=/opt/local/bin:/opt/local/sbin:/usr/local/cuda/bin/:/usr/lib64/openmpi/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin -# export LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda-7.0/lib64 - -case $sys in - "build_gcc") - export QMCPACK_TEST_SUBMIT_NAME=GCC-MKL-SoA-Release - ctest -DQMC_COMPLEX=0 -DQMC_MIXED_PRECISION=0 -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -DQE_BIN=${QE_BIN} -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -E 'long' -VV --timeout $timeout - ;; - "build_gcc_complex") - export QMCPACK_TEST_SUBMIT_NAME=GCC-MKL-Complex-SoA-Release - ctest -DQMC_COMPLEX=1 -DQMC_MIXED_PRECISION=0 -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -DQE_BIN=${QE_BIN} -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -E 'long' -VV --timeout $timeout - ;; - "build_gcc_mixed") - export QMCPACK_TEST_SUBMIT_NAME=GCC-MKL-Mixed-SoA-Release - ctest -DQMC_COMPLEX=0 -DQMC_MIXED_PRECISION=1 -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -DQE_BIN=${QE_BIN} -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -E 'long' -VV --timeout $timeout - ;; - "build_gcc_complex_mixed") - export QMCPACK_TEST_SUBMIT_NAME=GCC-MKL-Complex-Mixed-SoA-Release - ctest -DQMC_COMPLEX=1 -DQMC_MIXED_PRECISION=1 -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -DQE_BIN=${QE_BIN} -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -E 'long' -VV --timeout $timeout - ;; - "build_intel2018") - # intel compiler should already be loaded - export QMCPACK_TEST_SUBMIT_NAME=Intel2018-SoA-Release - ctest -DQMC_COMPLEX=0 -DQMC_MIXED_PRECISION=0 -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=mpiicpc -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -DQE_BIN=${QE_BIN} -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -E 'long' -VV --timeout $timeout - ;; - "build_intel2018_complex") - # intel compiler should already be loaded - export QMCPACK_TEST_SUBMIT_NAME=Intel2018-Complex-SoA-Release - ctest -DQMC_COMPLEX=1 -DQMC_MIXED_PRECISION=0 -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=mpiicpc -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -DQE_BIN=${QE_BIN} -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -E 'long' -VV --timeout $timeout - ;; - "build_intel2018_mixed") - # intel compiler should already be loaded - export QMCPACK_TEST_SUBMIT_NAME=Intel2018-Mixed-SoA-Release - ctest -DQMC_COMPLEX=0 -DQMC_MIXED_PRECISION=1 -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=mpiicpc -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -DQE_BIN=${QE_BIN} -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -E 'long' -VV --timeout $timeout - ;; - "build_intel2018_complex_mixed") - # intel compiler should already be loaded - export QMCPACK_TEST_SUBMIT_NAME=Intel2018-Complex-Mixed-SoA-Release - ctest -DQMC_COMPLEX=1 -DQMC_MIXED_PRECISION=1 -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=mpiicpc -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -DQE_BIN=${QE_BIN} -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -E 'long' -VV --timeout $timeout - ;; - *) - echo "ERROR: Unknown build type $sys" - ;; -esac - -done # termination for $sys loop - -else - echo "ERROR: No CMakeLists. Bad git clone or update." - exit 1 -fi # termination for if block which checks CMakeLists.txt - - -else -echo "ERROR: Unable to make test directory ${testdir}" -exit 1 -fi # termination for if block which checks testing directory - -else -echo "ERROR: No top level directory" -exit 1 -fi # termination for if block which checks top level directory - - - diff --git a/tests/test_automation/spack_supported_package_versions.sh b/tests/test_automation/spack_supported_package_versions.sh deleted file mode 100644 index e16336bbc8..0000000000 --- a/tests/test_automation/spack_supported_package_versions.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash - -# GCC -# Dates at https://gcc.gnu.org/releases.html -gcc_vnew=9.2.0 # 2019-08-12 -gcc_vold=7.3.0 # 2018-01-25 - -#For Intel: -gcc_vintel=7.4.0 # 2018-12-06 - -#PGI 19.4 -# makelocalrc configured with 8.3.0 currently -gcc_vpgi=8.3.0 # 2019-02-22 - -# For CUDA toolkit compatibility -gcc_vcuda=8.3.0 # 2019-02-22 - -# LLVM -# Dates at http://releases.llvm.org/ -llvm_vnew=9.0.0 # 2019-09-19 -llvm_vold=5.0.1 # 2017-12-21 -# for CUDA 10.1 update 2 -llvm_vcuda=8.0.0 # 2019-03- - -# HDF5 -hdf5_vnew=1.10.5 # Releeased 2019-02-28 -hdf5_vold=1.8.19 # Released 2017-06-16 - -# CMake -# Dates at https://cmake.org/files/ -cmake_vnew=3.16.2 # Released 2019-12-19 -cmake_vold=3.10.2 # Released 2018-01-18 - -# OpenMPI -# Dates at https://www.open-mpi.org/software/ompi/v4.0/ -ompi_vnew=4.0.2 # Released 2019-10-07 -ompi_vold=2.1.2 # Released 2017-09-20 - -libxml2_vnew=2.9.9 # Released 2019-01-03 See http://xmlsoft.org/sources/ -libxml2_vold=2.9.1 # Released 2013-04-19 - -# FFTW -# Dates at http://www.fftw.org/release-notes.html -fftw_vnew=3.3.8 # Released 2018-05-28 -fftw_vold=3.3.4 # Released 2014-03-16 - -# BOOST -# Dates at https://www.boost.org/users/history/ -boost_vnew=1.70.0 # Released 2019-04-12 -boost_vold=1.65.1 # Released 2016-05-13 diff --git a/tests/test_automation/start_spack_env.sh b/tests/test_automation/start_spack_env.sh deleted file mode 100644 index 4dc0f19cd5..0000000000 --- a/tests/test_automation/start_spack_env.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -# This shared script fragment start spack and loads in QMCPACKS currently supported versions of compilers -# and libraries - -. $SPACK_ROOT/share/spack/setup-env.sh - -# This is to deal with the fact that we don't know the relationship between the working directory -# and checkout directory in a relative way. -# this doesn't deal with symlinks -SRC="${BASH_SOURCE[0]}" -SRC_DIR="$( cd -P "$( dirname "$SRC" )" >/dev/null 2>&1 && pwd )" - -. ${SRC_DIR}/spack_supported_package_versions.sh diff --git a/tests/test_automation/vm_setup.sh b/tests/test_automation/vm_setup.sh deleted file mode 100755 index 0d8ef048f1..0000000000 --- a/tests/test_automation/vm_setup.sh +++ /dev/null @@ -1,164 +0,0 @@ -#!/bin/bash - -# -# Installs compilers & libraries for QMCPACK testing via SPACK on a clean VM -# - assumes 32 jobs is reasonable -# -# ./vm_setup.sh Builds current CI requirements (use with caution specs could become ambiguous) -# ./vm_setup.sh clean *** DESTROYS EXISTING INSTALLATION *** -# -# - -module() { eval `/usr/bin/modulecmd bash $*`; } - -if [[ $1 == "clean" ]]; then - rm -r -f $HOME/spack $HOME/.spack - mkdir $HOME/.spack - cat >$HOME/.spack/config.yaml< -#Date: Fri Jan 3 15:52:59 2020 -0600 -# -# py-intervaltree: new package at 3.0.2 (#14277) - - cd bin - ./spack bootstrap -fi - -export SPACK_ROOT=$HOME/spack -. $SPACK_ROOT/share/spack/setup-env.sh - -echo --- Spack list -spack find -echo --- Spack compilers -spack compilers -echo --- Modules list -module list -echo --- End listings - -# -# Versions should be consistent with setup script -# - -# GCC -# Dates at https://gcc.gnu.org/releases.html -gcc_vnew=9.2.0 # 2019-08-12 -gcc_vold=7.3.0 # 2018-01-25 - -#For Intel: -gcc_vintel=7.4.0 # 2018-12-06 - -#PGI 19.4 -# makelocalrc configured with 8.3.0 currently -gcc_vpgi=8.3.0 # 2019-02-22 - -# For CUDA toolkit compatibility -gcc_vcuda=8.3.0 # 2019-02-22 - -# LLVM -# Dates at http://releases.llvm.org/ -llvm_vnew=9.0.0 # 2019-09-19 -llvm_vold=5.0.1 # 2017-12-21 -# for CUDA 10.1 update 2 -llvm_vcuda=8.0.0 # 2019-03- - -# HDF5 -hdf5_vnew=1.10.5 # Releeased 2019-02-28 -hdf5_vold=1.8.19 # Released 2017-06-16 - -# CMake -# Dates at https://cmake.org/files/ -cmake_vnew=3.16.2 # Released 2019-12-19 -cmake_vold=3.10.2 # Released 2018-01-18 - -# OpenMPI -# Dates at https://www.open-mpi.org/software/ompi/v4.0/ -ompi_vnew=4.0.2 # Released 2019-10-07 -ompi_vold=2.1.2 # Released 2017-09-20 - -libxml2_vnew=2.9.9 # Released 2019-01-03 See http://xmlsoft.org/sources/ -libxml2_vold=2.9.1 # Released 2013-04-19 - -# FFTW -# Dates at http://www.fftw.org/release-notes.html -fftw_vnew=3.3.8 # Released 2018-05-28 -fftw_vold=3.3.4 # Released 2014-03-16 - -# BOOST -# Dates at https://www.boost.org/users/history/ -boost_vnew=1.70.0 # Released 2019-04-12 -boost_vold=1.65.1 # Released 2016-05-13 - -echo --- START env `date` -echo --- gcc@${gcc_vold} -spack install gcc@${gcc_vold} -spack load gcc@${gcc_vold} -spack compiler find -echo --- Spack compilers -spack compilers - -if [[ $1 == "clean" ]]; then - cat >$HOME/.spack/packages.yaml</dev/null 2>&1 && pwd )" -. ${SRC_DIR}/start_spack_env.sh - -spack load gcc@$gcc_vnew%gcc@$gcc_vold -spack load boost@$boost_vnew%gcc@$gcc_vnew -spack load hdf5@$hdf5_vnew%gcc@$gcc_vnew~mpi -spack load openssl@1.1.1d%gcc@9.2.0+systemcerts -spack load cmake@$cmake_vnew%gcc@$gcc_vnew -spack load hwloc%gcc@$gcc_vnew -spack load libiconv%gcc@$gcc_vnew -spack load openmpi@$ompi_vnew%gcc@$gcc_vnew -spack load libxml2@$libxml2_vnew%gcc@$gcc_vnew -spack load fftw@$fftw_vnew%gcc@$gcc_vnew -spack load openblas%gcc@$gcc_vnew -spack load netlib-lapack%gcc@$gcc_vnew - -#if you've installed more than one python for the new compiler this will fail -spack load python%gcc@$gcc_vnew - -QMC_IMMUTABLE_FLAGS="-DBUILD_AFQMC=1" - -echo ${PATH} From e7027ae96a7b55e214896f5c149318f0b1196084 Mon Sep 17 00:00:00 2001 From: Mark Dewing Date: Mon, 20 Jun 2022 12:16:12 -0500 Subject: [PATCH 26/36] Orbital rotation test with legacy driver The gen_rotated_lcao_wf.py file is extended to generate QMC averages for the parameter gradients. A test for the legacy driver using one thread is introduced. --- .../tests/gen_rotated_lcao_wf.py | 136 +++++++++++++++++- src/QMCWaveFunctions/tests/stats.py | 43 ++++++ tests/molecules/He_param/CMakeLists.txt | 26 ++++ .../He_param/He_orb_rot_param_grad_legacy.xml | 100 +++++++++++++ 4 files changed, 303 insertions(+), 2 deletions(-) create mode 100644 src/QMCWaveFunctions/tests/stats.py create mode 100644 tests/molecules/He_param/He_orb_rot_param_grad_legacy.xml diff --git a/src/QMCWaveFunctions/tests/gen_rotated_lcao_wf.py b/src/QMCWaveFunctions/tests/gen_rotated_lcao_wf.py index 1563ad83e7..cb066cfbb5 100644 --- a/src/QMCWaveFunctions/tests/gen_rotated_lcao_wf.py +++ b/src/QMCWaveFunctions/tests/gen_rotated_lcao_wf.py @@ -5,8 +5,11 @@ # compute spatial and parameter derivatives import autograd.numpy as np from autograd import hessian,grad +from stats import averager + +# Point values used in test_RotatedSPOs_LCAO.cpp +# QMC values used to validate tests/molecules/He_param/He_orb_rot_param_grad_legacy -# Values used in test_RotatedSPOs_LCAO.cpp def mag(r): return np.sqrt(r[0]*r[0] + r[1]*r[1] + r[2]*r[2]) @@ -175,6 +178,10 @@ def print_wf_values(theta1=0.0, theta2=0.0, use_j=False, B=0.0): print("") + +# Generate the wavefunction values for a single set of electron positions +# used in test_RotatedSPOs_LCAO.cpp + def print_point_values(): r1 = np.array([1.0, 2.0, 3.0]) r2 = np.array([0.0, 1.1, 2.2]) @@ -185,7 +192,132 @@ def print_point_values(): print_wf_values(theta1=0.0, theta2=0.0, use_j=True, B=0.1) +def run_qmc(VP): + r1 = np.array([1.0, 2.0, 3.0]) + r2 = np.array([0.0, 1.1, 2.2]) + + # Outer loop for statistics + nblock = 20 + + # Loop to gather data + nstep = 100 + + # Loop for decorrelation + nsubstep = 10 + + nelectron = 2 + + # Step size for trial move + delta = 1.2 + + r = np.zeros((2,3)) + r[0,:] = r1 + r[1,:] = r2 + + r_old = np.zeros(3) + r_trial = np.zeros(3) + + block_dp_ave = averager() + total_en_ave = averager() + + # Outer loop for statistics + for nb in range(nblock): + + naccept = 0 + en_ave = averager() + dpsi_ave = averager() + deloc_ave = averager() + eloc_dpsi_ave = averager() + + # Loop to gather data + for ns in range(nstep): + # Loop for decorrelation + for nss in range(nsubstep): + # Loop over electrons + for ne in range(nelectron): + + wf_old = psi(r[0,:], r[1,:], VP) + r_old[:] = r[ne,:] + + change = delta*(np.random.rand(3)-0.5) + + r_trial[:] = r_old[:] + change + r[ne,:] = r_trial[:] + + wf_new = psi(r[0,:], r[1,:], VP) + + wf_ratio = (wf_new/wf_old)**2 + + if wf_ratio > np.random.random(): + naccept += 1 + else: + r[ne,:] = r_old[:] + + eloc = local_energy(r[0,:], r[1,:], VP) + en_ave.add_value(eloc) + + psi_val = psi(r[0,:], r[1,:], VP) + dpsi_val = dpsi(r[0,:], r[1,:], VP) + dpsi_ave.add_value(dpsi_val/psi_val) + + deloc_val = dlocal_energy(r[0,:], r[1,:], VP) + deloc_ave.add_value(deloc_val) + + eloc_dpsi_ave.add_value(eloc * dpsi_val/psi_val) + + + en = en_ave.average() + en_err = en_ave.error() + ar = naccept/(nstep*nsubstep*nelectron) + print('block = ',nb, ' energy = ',en,en_err,' acceptance ratio = ',ar) + + dp = dpsi_ave.average() + dp_err = dpsi_ave.error() + + deloc = deloc_ave.average() + deloc_err = deloc_ave.error() + + eloc_dpsi = eloc_dpsi_ave.average() + eloc_dpsi_err = eloc_dpsi_ave.error() + + # For the parameter derivative formula, see + # https://github.com/QMCPACK/qmc_algorithms/blob/master/Variational/Parameter_Optimization.ipynb + + dg = deloc + 2*eloc_dpsi - 2*en*dp + block_dp_ave.add_value(dg) + + dg = block_dp_ave.average() + dg_err = block_dp_ave.error() + print('parameter values = ',VP) + print('parameter derivatives = ',dg) + print('parameter derivative errors = ',dg_err) + + +def run_qmc_parameter_derivatives(): + global use_jastrow + use_jastrow=False + theta1 = 0.1 + theta2 = 0.1 + VP = np.array([theta1, theta2]) + + run_qmc(VP) + +# Some results from run_qmc_parameter_derivatives + +# Run took about 10 minutes on laptop +# nblock=20, nstep=1000, nsubstep=10 +# parameter values = [0.1 0.1] +# parameter derivatives = [-0.20164722 -0.18347461] +# parameter derivative errors = [0.01201481 0.01314164] + +# Run took about 40 minutes on laptop +# nblock=40, nstep=2000, nsubstep=10 +# parameter values = [0.1 0.1] +# parameter derivatives = [-0.2204924 -0.21471184] +# parameter derivative errors = [0.00493837 0.00571082] + if __name__=='__main__': - print_point_values() + #print_point_values() + run_qmc_parameter_derivatives() diff --git a/src/QMCWaveFunctions/tests/stats.py b/src/QMCWaveFunctions/tests/stats.py new file mode 100644 index 0000000000..61794a9a3c --- /dev/null +++ b/src/QMCWaveFunctions/tests/stats.py @@ -0,0 +1,43 @@ + +import math +import numpy as np + +class averager: + """Compute average, variance, and standard error for a set of data""" + def __init__(self): + self.sum = 0.0 + self.sum_sq = 0.0 + self.norm = 0 + + def add_value(self,v): + self.sum += v + self.sum_sq += v*v + self.norm += 1 + + def average(self): + if (self.norm == 0): + return 0.0 + else: + return self.sum/self.norm + + def variance(self): + var = 0.0 + if (self.norm != 0): + var = (self.sum_sq - self.sum*self.sum/self.norm)/self.norm + return var + + def std_dev(self): + return math.sqrt(self.variance()) + + def error(self): + err = 0.0 + if (self.norm > 1): + var = self.variance() + err = np.sqrt(var/(self.norm-1)) + return err + + def merge(self, other): + self.sum += other.sum + self.sum_sq += other.sum_sq + self.norm += other.norm + diff --git a/tests/molecules/He_param/CMakeLists.txt b/tests/molecules/He_param/CMakeLists.txt index 02affdc006..c77a43a599 100644 --- a/tests/molecules/He_param/CMakeLists.txt +++ b/tests/molecules/He_param/CMakeLists.txt @@ -78,6 +78,32 @@ if(NOT QMC_CUDA) endif() + # Orbital Rotation + + # Two atomic orbitals, no jastrow + + list(APPEND HE_ORB_ROT_PARAM spo-up_orb_rot_0000_0001 -0.218 0.01) # scalar name, value, error + list(APPEND HE_ORB_ROT_PARAM spo-down_orb_rot_0000_0001 -0.218 0.01) # scalar name, value, error + + qmc_run_and_check_custom_scalar( + BASE_NAME + He_orb_rot_param_grad_legacy + BASE_DIR + "${qmcpack_SOURCE_DIR}/tests/molecules/He_param" + PREFIX + He_orb_rot_param_grad_legacy.param + INPUT_FILE + He_orb_rot_param_grad_legacy.xml + PROCS + 1 + THREADS + 1 + SERIES + 0 + SCALAR_VALUES + HE_ORB_ROT_PARAM) + + else() message(VERBOSE "Skipping He_param tests because parameter output is not supported by mixed precison build (QMC_MIXED_PRECISION=1)") endif() diff --git a/tests/molecules/He_param/He_orb_rot_param_grad_legacy.xml b/tests/molecules/He_param/He_orb_rot_param_grad_legacy.xml new file mode 100644 index 0000000000..71b42b5b81 --- /dev/null +++ b/tests/molecules/He_param/He_orb_rot_param_grad_legacy.xml @@ -0,0 +1,100 @@ + + + + legacy + + + + + + + 2 + + + 0.0 0.0 0.0 + + + + + + + + -1 + + + -1 + + + + + + + + + + + + + + + + + + + + 0.1 + + 1.0 0.0 + 0.0 1.0 + + + + 0.1 + + 1.0 0.0 + 0.0 1.0 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + yes + + 100 + + 25 + 10 + 20 + 0.5 + 1000 + 1.0 + 0.00 + + + + + From 0a0d55acf2b433ff91f4ba3149bc9e2b8639f02a Mon Sep 17 00:00:00 2001 From: Jakub Kurzak Date: Mon, 20 Jun 2022 15:36:03 -0400 Subject: [PATCH 27/36] Do not use cudaDeviceProp maxTexture1D in HIP --- src/QMCHamiltonians/ECPComponentBuilder.2.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/QMCHamiltonians/ECPComponentBuilder.2.cpp b/src/QMCHamiltonians/ECPComponentBuilder.2.cpp index 7d28e17902..f4aaf6eeac 100644 --- a/src/QMCHamiltonians/ECPComponentBuilder.2.cpp +++ b/src/QMCHamiltonians/ECPComponentBuilder.2.cpp @@ -474,7 +474,7 @@ void ECPComponentBuilder::doBreakUp(const std::vector& angList, RealType rmax, mRealType Vprefactor) { -#ifdef QMC_CUDA +#if defined(QMC_CUDA) && !defined(QMC_CUDA2HIP) int device; cudaCheck(cudaGetDevice(&device)); cudaDeviceProp deviceProp; From cdf7430c50fa69794c49d52d5bdc0f61bfb41a4d Mon Sep 17 00:00:00 2001 From: William F Godoy Date: Wed, 22 Jun 2022 11:51:40 -0400 Subject: [PATCH 28/36] Add MPI support to ROCm legacy CI For CI running on nitrogen --- .../workflows/ci-github-actions-self-hosted.yaml | 8 ++++---- .../github-actions/ci/run_step.sh | 16 ++++++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci-github-actions-self-hosted.yaml b/.github/workflows/ci-github-actions-self-hosted.yaml index d2f890091a..5d4b4c3e9f 100644 --- a/.github/workflows/ci-github-actions-self-hosted.yaml +++ b/.github/workflows/ci-github-actions-self-hosted.yaml @@ -222,10 +222,10 @@ jobs: ROCm-Clang13-NoMPI-CUDA2HIP-Real, ROCm-Clang13-NoMPI-CUDA2HIP-Complex-Mixed, ROCm-Clang13-NoMPI-CUDA2HIP-Complex, - ROCm-Clang13-NoMPI-Legacy-CUDA2HIP-Real-Mixed, - ROCm-Clang13-NoMPI-Legacy-CUDA2HIP-Real, - ROCm-Clang13-NoMPI-Legacy-CUDA2HIP-Complex-Mixed, - ROCm-Clang13-NoMPI-Legacy-CUDA2HIP-Complex, + ROCm-Clang13-MPI-Legacy-CUDA2HIP-Real-Mixed, + ROCm-Clang13-MPI-Legacy-CUDA2HIP-Real, + ROCm-Clang13-MPI-Legacy-CUDA2HIP-Complex-Mixed, + ROCm-Clang13-MPI-Legacy-CUDA2HIP-Complex, ] steps: diff --git a/tests/test_automation/github-actions/ci/run_step.sh b/tests/test_automation/github-actions/ci/run_step.sh index 5d15188167..1ec85c84dc 100755 --- a/tests/test_automation/github-actions/ci/run_step.sh +++ b/tests/test_automation/github-actions/ci/run_step.sh @@ -231,13 +231,21 @@ case "$1" in -DQMC_DATA=$QMC_DATA_DIR \ ${GITHUB_WORKSPACE} ;; - *"ROCm-Clang13-NoMPI-Legacy-CUDA2HIP"*) + *"ROCm-Clang13-MPI-Legacy-CUDA2HIP"*) echo 'Configure for building CUDA2HIP with clang compilers shipped with ROCM on AMD hardware' + + export OMPI_CC=/opt/rocm/llvm/bin/clang + export OMPI_CXX=/opt/rocm/llvm/bin/clang++ + + # Make current environment variables available to subsequent steps + echo "OMPI_CC=/opt/rocm/llvm/bin/clang" >> $GITHUB_ENV + echo "OMPI_CXX=/opt/rocm/llvm/bin/clang++" >> $GITHUB_ENV + cmake -GNinja \ - -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \ - -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \ + -DCMAKE_C_COMPILER=/usr/lib64/openmpi/bin/mpicc \ + -DCMAKE_CXX_COMPILER=/usr/lib64/openmpi/bin/mpicxx \ + -DMPIEXEC_EXECUTABLE=/usr/lib64/openmpi/bin/mpirun \ -DQMC_CUDA=1 \ - -DQMC_MPI=0 \ -DQMC_CUDA2HIP=ON \ -DCMAKE_PREFIX_PATH="/opt/OpenBLAS/0.3.18" \ -DQMC_COMPLEX=$IS_COMPLEX \ From e0afba1c2427a17fb130f4e3aa5f18fd30d26f17 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Wed, 22 Jun 2022 20:26:28 -0500 Subject: [PATCH 29/36] Rewrite loop to avoid NVHPC hang. --- .../Fermion/MatrixDelayedUpdateCUDA.h | 12 +++++++----- src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h | 12 +++++++----- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h b/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h index 55c187a4b9..a2fbc37d80 100644 --- a/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h +++ b/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h @@ -526,13 +526,15 @@ class MatrixDelayedUpdateCUDA int success = ompBLAS::gemv(dummy_handle, 'T', norb, norb, cone, Ainv_ptr, lda, phiV_ptr, 1, czero, temp_ptr, 1); if (success != 0) throw std::runtime_error("ompBLAS::gemv failed."); - PRAGMA_OFFLOAD("omp target is_device_ptr(Ainv_ptr, temp_ptr, rcopy_ptr)") + + PRAGMA_OFFLOAD("omp target parallel for simd is_device_ptr(Ainv_ptr, temp_ptr, rcopy_ptr)") + for (int i = 0; i < norb; i++) { - temp_ptr[rowchanged] -= cone; - PRAGMA_OFFLOAD("omp parallel for simd") - for (int i = 0; i < norb; i++) - rcopy_ptr[i] = Ainv_ptr[rowchanged * lda + i]; + rcopy_ptr[i] = Ainv_ptr[rowchanged * lda + i]; + if (i == 0) + temp_ptr[rowchanged] -= cone; } + success = ompBLAS::ger(dummy_handle, norb, norb, static_cast(FullPrecValue(-1) / c_ratio_in), rcopy_ptr, 1, temp_ptr, 1, Ainv_ptr, lda); if (success != 0) diff --git a/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h b/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h index 6ce11595b1..cbb1efe06f 100644 --- a/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h +++ b/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h @@ -222,13 +222,15 @@ class MatrixUpdateOMPTarget success = ompBLAS::gemv(dummy_handle, 'T', norb, norb, cone, Ainv_ptr, lda, phiV_ptr, 1, czero, temp_ptr, 1); if (success != 0) throw std::runtime_error("ompBLAS::gemv failed."); - PRAGMA_OFFLOAD("omp target is_device_ptr(Ainv_ptr, temp_ptr, rcopy_ptr)") + + PRAGMA_OFFLOAD("omp target parallel for simd is_device_ptr(Ainv_ptr, temp_ptr, rcopy_ptr)") + for (int i = 0; i < norb; i++) { - temp_ptr[rowchanged] -= cone; - PRAGMA_OFFLOAD("omp parallel for simd") - for (int i = 0; i < norb; i++) - rcopy_ptr[i] = Ainv_ptr[rowchanged * lda + i]; + rcopy_ptr[i] = Ainv_ptr[rowchanged * lda + i]; + if (i == 0) + temp_ptr[rowchanged] -= cone; } + success = ompBLAS::ger(dummy_handle, norb, norb, static_cast(-1.0 / c_ratio_in), rcopy_ptr, 1, temp_ptr, 1, Ainv_ptr, lda); if (success != 0) From 49bfe4b19183443c2ef4ab08a01a1a49a47e67f2 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Thu, 23 Jun 2022 08:28:52 -0500 Subject: [PATCH 30/36] Remove omp parallel over walkers. --- src/Particle/DistanceTable.h | 5 ----- src/Particle/ParticleSet.cpp | 4 ---- src/Particle/SoaDistanceTableAAOMPTarget.h | 1 - src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp | 1 - src/QMCWaveFunctions/SPOSet.cpp | 5 ----- src/QMCWaveFunctions/SpinorSet.cpp | 2 -- src/QMCWaveFunctions/WaveFunctionComponent.cpp | 12 ------------ 7 files changed, 30 deletions(-) diff --git a/src/Particle/DistanceTable.h b/src/Particle/DistanceTable.h index c1fa5b07ac..3175be4596 100644 --- a/src/Particle/DistanceTable.h +++ b/src/Particle/DistanceTable.h @@ -103,7 +103,6 @@ class DistanceTable virtual void mw_evaluate(const RefVectorWithLeader& dt_list, const RefVectorWithLeader& p_list) const { -#pragma omp parallel for for (int iw = 0; iw < dt_list.size(); iw++) dt_list[iw].evaluate(p_list[iw]); } @@ -117,7 +116,6 @@ class DistanceTable const RefVectorWithLeader& p_list, const std::vector& recompute) const { -#pragma omp parallel for for (int iw = 0; iw < dt_list.size(); iw++) if (recompute[iw]) dt_list[iw].evaluate(p_list[iw]); @@ -145,7 +143,6 @@ class DistanceTable const IndexType iat, bool prepare_old = true) const { -#pragma omp parallel for for (int iw = 0; iw < dt_list.size(); iw++) dt_list[iw].move(p_list[iw], rnew_list[iw], iat, prepare_old); } @@ -174,7 +171,6 @@ class DistanceTable IndexType jat, const std::vector& from_temp) { -#pragma omp parallel for for (int iw = 0; iw < dt_list.size(); iw++) dt_list[iw].updatePartial(jat, from_temp[iw]); } @@ -192,7 +188,6 @@ class DistanceTable virtual void mw_finalizePbyP(const RefVectorWithLeader& dt_list, const RefVectorWithLeader& p_list) const { -#pragma omp parallel for for (int iw = 0; iw < dt_list.size(); iw++) dt_list[iw].finalizePbyP(p_list[iw]); } diff --git a/src/Particle/ParticleSet.cpp b/src/Particle/ParticleSet.cpp index b6c72606fc..14f5c1ee3f 100644 --- a/src/Particle/ParticleSet.cpp +++ b/src/Particle/ParticleSet.cpp @@ -380,11 +380,8 @@ void ParticleSet::mw_update(const RefVectorWithLeader& p_list, bool } if (!skipSK && p_leader.structure_factor_) - { -#pragma omp parallel for for (int iw = 0; iw < p_list.size(); iw++) p_list[iw].structure_factor_->updateAllPart(p_list[iw]); - } } void ParticleSet::makeMove(Index_t iat, const SingleParticlePos& displ, bool maybe_accept) @@ -852,7 +849,6 @@ void ParticleSet::mw_loadWalker(const RefVectorWithLeader& p_list, pset.spins = awalker.spins; pset.coordinates_->setAllParticlePos(pset.R); }; -#pragma omp parallel for for (int iw = 0; iw < p_list.size(); ++iw) if (recompute[iw]) loadWalkerConfig(p_list[iw], walkers[iw]); diff --git a/src/Particle/SoaDistanceTableAAOMPTarget.h b/src/Particle/SoaDistanceTableAAOMPTarget.h index 646b5085ee..c850488e6e 100644 --- a/src/Particle/SoaDistanceTableAAOMPTarget.h +++ b/src/Particle/SoaDistanceTableAAOMPTarget.h @@ -477,7 +477,6 @@ struct SoaDistanceTableAAOMPTarget : public DTD_BConds, public Distanc if (!(modes_ & DTModes::NEED_TEMP_DATA_ON_HOST)) return; -#pragma omp parallel for for (int iw = 0; iw < dt_list.size(); iw++) dt_list[iw].updatePartial(jat, from_temp[iw]); } diff --git a/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp index 4da7844190..b668e1c975 100644 --- a/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp +++ b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp @@ -715,7 +715,6 @@ void J2OMPTarget::mw_recompute(const RefVectorWithLeader>(); assert(this == &wfc_leader); -#pragma omp parallel for for (int iw = 0; iw < wfc_list.size(); iw++) if (recompute[iw]) wfc_list[iw].recompute(p_list[iw]); diff --git a/src/QMCWaveFunctions/SPOSet.cpp b/src/QMCWaveFunctions/SPOSet.cpp index 93ade40335..8a1d2a6ca1 100644 --- a/src/QMCWaveFunctions/SPOSet.cpp +++ b/src/QMCWaveFunctions/SPOSet.cpp @@ -52,7 +52,6 @@ void SPOSet::mw_evaluateDetRatios(const RefVectorWithLeader& spo_list, std::vector>& ratios_list) const { assert(this == &spo_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < spo_list.size(); iw++) { Vector invRow(const_cast(invRow_ptr_list[iw]), psi_list[iw].get().size()); @@ -78,7 +77,6 @@ void SPOSet::mw_evaluateVGL(const RefVectorWithLeader& spo_list, const RefVector& d2psi_v_list) const { assert(this == &spo_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < spo_list.size(); iw++) spo_list[iw].evaluateVGL(P_list[iw], iat, psi_v_list[iw], dpsi_v_list[iw], d2psi_v_list[iw]); } @@ -89,7 +87,6 @@ void SPOSet::mw_evaluateValue(const RefVectorWithLeader& spo_list, const RefVector& psi_v_list) const { assert(this == &spo_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < spo_list.size(); iw++) spo_list[iw].evaluateValue(P_list[iw], iat, psi_v_list[iw]); } @@ -103,7 +100,6 @@ void SPOSet::mw_evaluateVGLWithSpin(const RefVectorWithLeader& spo_list, const RefVector& dspin_v_list) const { assert(this == &spo_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < spo_list.size(); iw++) spo_list[iw].evaluateVGL_spin(P_list[iw], iat, psi_v_list[iw], dpsi_v_list[iw], d2psi_v_list[iw], dspin_v_list[iw]); } @@ -168,7 +164,6 @@ void SPOSet::mw_evaluate_notranspose(const RefVectorWithLeader& spo_list const RefVector& d2logdet_list) const { assert(this == &spo_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < spo_list.size(); iw++) spo_list[iw].evaluate_notranspose(P_list[iw], first, last, logdet_list[iw], dlogdet_list[iw], d2logdet_list[iw]); } diff --git a/src/QMCWaveFunctions/SpinorSet.cpp b/src/QMCWaveFunctions/SpinorSet.cpp index 7609800b5b..62b2fc4a47 100644 --- a/src/QMCWaveFunctions/SpinorSet.cpp +++ b/src/QMCWaveFunctions/SpinorSet.cpp @@ -195,7 +195,6 @@ void SpinorSet::mw_evaluateVGLWithSpin(const RefVectorWithLeader& spo_li up_spo_leader.mw_evaluateVGL(up_spo_list, P_list, iat, up_psi_v_list, up_dpsi_v_list, up_d2psi_v_list); dn_spo_leader.mw_evaluateVGL(dn_spo_list, P_list, iat, dn_psi_v_list, dn_dpsi_v_list, dn_d2psi_v_list); -#pragma omp parallel for for (int iw = 0; iw < nw; iw++) { ParticleSet::Scalar_t s = P_list[iw].activeSpin(iat); @@ -327,7 +326,6 @@ void SpinorSet::mw_evaluate_notranspose(const RefVectorWithLeader& spo_l dn_spo_leader.mw_evaluate_notranspose(dn_spo_list, P_list, first, last, dn_logdet_list, dn_dlogdet_list, dn_d2logdet_list); -#pragma omp parallel for for (int iw = 0; iw < nw; iw++) for (int iat = 0; iat < nelec; iat++) { diff --git a/src/QMCWaveFunctions/WaveFunctionComponent.cpp b/src/QMCWaveFunctions/WaveFunctionComponent.cpp index 284fe17996..880f67684e 100644 --- a/src/QMCWaveFunctions/WaveFunctionComponent.cpp +++ b/src/QMCWaveFunctions/WaveFunctionComponent.cpp @@ -44,7 +44,6 @@ void WaveFunctionComponent::mw_evaluateLog(const RefVectorWithLeader& L_list) const { assert(this == &wfc_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < wfc_list.size(); iw++) wfc_list[iw].evaluateLog(p_list[iw], G_list[iw], L_list[iw]); } @@ -62,7 +61,6 @@ void WaveFunctionComponent::mw_recompute(const RefVectorWithLeader& recompute) const { assert(this == &wfc_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < wfc_list.size(); iw++) if (recompute[iw]) wfc_list[iw].recompute(p_list[iw]); @@ -73,7 +71,6 @@ void WaveFunctionComponent::mw_prepareGroup(const RefVectorWithLeader& grad_now) const { assert(this == &wfc_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < wfc_list.size(); iw++) grad_now[iw] = wfc_list[iw].evalGrad(p_list[iw], iat); } @@ -108,7 +104,6 @@ void WaveFunctionComponent::mw_evalGradWithSpin(const RefVectorWithLeader& spingrad_now) const { assert(this == &wfc_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < wfc_list.size(); iw++) grad_now[iw] = wfc_list[iw].evalGradWithSpin(p_list[iw], iat, spingrad_now[iw]); } @@ -119,7 +114,6 @@ void WaveFunctionComponent::mw_calcRatio(const RefVectorWithLeader& ratios) const { assert(this == &wfc_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < wfc_list.size(); iw++) ratios[iw] = wfc_list[iw].ratio(p_list[iw], iat); } @@ -151,7 +145,6 @@ void WaveFunctionComponent::mw_ratioGrad(const RefVectorWithLeader& grad_new) const { assert(this == &wfc_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < wfc_list.size(); iw++) ratios[iw] = wfc_list[iw].ratioGrad(p_list[iw], iat, grad_new[iw]); } @@ -164,7 +157,6 @@ void WaveFunctionComponent::mw_ratioGradWithSpin(const RefVectorWithLeader& spingrad_new) const { assert(this == &wfc_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < wfc_list.size(); iw++) ratios[iw] = wfc_list[iw].ratioGradWithSpin(p_list[iw], iat, grad_new[iw], spingrad_new[iw]); } @@ -176,7 +168,6 @@ void WaveFunctionComponent::mw_accept_rejectMove(const RefVectorWithLeader& wfc_list) const { assert(this == &wfc_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < wfc_list.size(); iw++) wfc_list[iw].completeUpdates(); } @@ -207,7 +197,6 @@ void WaveFunctionComponent::mw_evaluateGL(const RefVectorWithLeader>& ratios) const { assert(this == &wfc_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < wfc_list.size(); iw++) wfc_list[iw].evaluateRatios(vp_list[iw], ratios[iw]); } From 608cdab860c98f405b02c937ee26e9d217ea1813 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Fri, 24 Jun 2022 01:44:50 -0500 Subject: [PATCH 31/36] Add MSD::mw_accept_rejectMove unit test --- .../Fermion/MultiSlaterDetTableMethod.cpp | 3 ++ .../tests/test_multi_slater_determinant.cpp | 37 +++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.cpp b/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.cpp index 4c8e73b905..9162e60ca7 100644 --- a/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.cpp +++ b/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.cpp @@ -1082,6 +1082,9 @@ void MultiSlaterDetTableMethod::mw_prepareGroup(const RefVectorWithLeader& p_list, int ig) const { + if (!use_pre_computing_) + return; + auto& det_leader = wfc_list.getCastedLeader(); const size_t nw = wfc_list.size(); assert(this == &det_leader); diff --git a/src/QMCWaveFunctions/tests/test_multi_slater_determinant.cpp b/src/QMCWaveFunctions/tests/test_multi_slater_determinant.cpp index f980fda7e9..8c222f6ca6 100644 --- a/src/QMCWaveFunctions/tests/test_multi_slater_determinant.cpp +++ b/src/QMCWaveFunctions/tests/test_multi_slater_determinant.cpp @@ -276,6 +276,43 @@ void test_LiH_msd(const std::string& spo_xml_string, CHECK(grad_new.grads_positions[1][0] == ValueApprox(-0.8633778143)); CHECK(grad_new.grads_positions[1][1] == ValueApprox(0.8245347691)); CHECK(grad_new.grads_positions[1][2] == ValueApprox(-5.1513380151)); + + std::vector isAccepted{false, true}; + TrialWaveFunction::mw_accept_rejectMove(wf_ref_list, p_ref_list, moved_elec_id, isAccepted); + ParticleSet::mw_accept_rejectMove(p_ref_list, moved_elec_id, isAccepted); + + CHECK(wf_ref_list[0].getLogPsi() == Approx(-7.803347327300152)); + CHECK(wf_ref_list[1].getLogPsi() == Approx(-7.321765331299484)); + + // move the next electron + TrialWaveFunction::mw_prepareGroup(wf_ref_list, p_ref_list, 1); + const int moved_elec_id_next = 2; + TrialWaveFunction::mw_evalGrad(wf_ref_list, p_ref_list, moved_elec_id_next, grad_old); + + CHECK(grad_old.grads_positions[0][0] == ValueApprox(1.3325558736)); + CHECK(grad_old.grads_positions[0][1] == ValueApprox(1.3327966725)); + CHECK(grad_old.grads_positions[0][2] == ValueApprox(1.2157689586)); + CHECK(grad_old.grads_positions[1][0] == ValueApprox(1.3222514142)); + CHECK(grad_old.grads_positions[1][1] == ValueApprox(1.3230108868)); + CHECK(grad_old.grads_positions[1][2] == ValueApprox(1.2035047435)); + + ParticleSet::mw_makeMove(p_ref_list, moved_elec_id_next, displ); + TrialWaveFunction::mw_calcRatio(wf_ref_list, p_ref_list, moved_elec_id_next, ratios); + + CHECK(ratios[0] == ValueApprox(PsiValueType(2.1080036144))); + CHECK(ratios[1] == ValueApprox(PsiValueType(0.4947158435))); + + TrialWaveFunction::mw_calcRatioGrad(wf_ref_list, p_ref_list, moved_elec_id_next, ratios, grad_new); + + CHECK(ratios[0] == ValueApprox(PsiValueType(2.1080036144))); + CHECK(ratios[1] == ValueApprox(PsiValueType(0.4947158435))); + + CHECK(grad_new.grads_positions[0][0] == ValueApprox(1.8412365668)); + CHECK(grad_new.grads_positions[0][1] == ValueApprox(1.3736370007)); + CHECK(grad_new.grads_positions[0][2] == ValueApprox(0.8043818454)); + CHECK(grad_new.grads_positions[1][0] == ValueApprox(1.3553132105)); + CHECK(grad_new.grads_positions[1][1] == ValueApprox(1.5552132255)); + CHECK(grad_new.grads_positions[1][2] == ValueApprox(0.804301246)); } } From 2fce4436bd2208d5403e8291b7c559f03f0e1fce Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Sun, 12 Jun 2022 21:54:12 -0500 Subject: [PATCH 32/36] Add mock-up mw_accept_rejectMove in MSD. --- .../Fermion/MultiDiracDeterminant.cpp | 13 +++++++++++ .../Fermion/MultiDiracDeterminant.h | 5 +++++ .../Fermion/MultiSlaterDetTableMethod.cpp | 22 +++++++++++++++++++ .../Fermion/MultiSlaterDetTableMethod.h | 6 +++++ 4 files changed, 46 insertions(+) diff --git a/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.cpp b/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.cpp index a42f88409b..9787e91d68 100644 --- a/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.cpp +++ b/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.cpp @@ -517,6 +517,19 @@ void MultiDiracDeterminant::restore(int iat) */ } +void MultiDiracDeterminant::mw_accept_rejectMove(const RefVectorWithLeader& wfc_list, + const RefVectorWithLeader& p_list, + int iat, + const std::vector& isAccepted) +{ + // TODO to be expanded to serve offload needs without relying on calling acceptMove and restore + for (int iw = 0; iw < wfc_list.size(); iw++) + if (isAccepted[iw]) + wfc_list[iw].acceptMove(p_list[iw], iat, false); + else + wfc_list[iw].restore(iat); +} + // this has been fixed MultiDiracDeterminant::MultiDiracDeterminant(const MultiDiracDeterminant& s) : WaveFunctionComponent(s), diff --git a/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.h b/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.h index 690ba02368..b42a61df73 100644 --- a/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.h +++ b/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.h @@ -201,6 +201,11 @@ class MultiDiracDeterminant : public WaveFunctionComponent */ void restore(int iat) override; + static void mw_accept_rejectMove(const RefVectorWithLeader& wfc_list, + const RefVectorWithLeader& p_list, + int iat, + const std::vector& isAccepted); + void createResource(ResourceCollection& collection) const override; void acquireResource(ResourceCollection& collection, const RefVectorWithLeader& wfc_list) const; diff --git a/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.cpp b/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.cpp index 9162e60ca7..c86af93eeb 100644 --- a/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.cpp +++ b/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.cpp @@ -669,6 +669,28 @@ void MultiSlaterDetTableMethod::restore(int iat) curRatio = 1.0; } +void MultiSlaterDetTableMethod::mw_accept_rejectMove(const RefVectorWithLeader& wfc_list, + const RefVectorWithLeader& p_list, + int iat, + const std::vector& isAccepted, + bool safe_to_delay) const +{ + ScopedTimer local_timer(AccRejTimer); + for (size_t iw = 0; iw < isAccepted.size(); iw++) + { + auto& det = wfc_list.getCastedElement(iw); + if (isAccepted[iw]) + { + det.psi_ratio_to_ref_det_ = det.new_psi_ratio_to_new_ref_det_; + det.log_value_ += convertValueToLog(det.curRatio); + } + det.curRatio = 1.0; + } + const auto det_id = getDetID(iat); + const auto det_list(extract_DetRef_list(wfc_list, det_id)); + Dets[det_id]->mw_accept_rejectMove(det_list, p_list, iat, isAccepted); +} + void MultiSlaterDetTableMethod::registerData(ParticleSet& P, WFBufferType& buf) { for (size_t id = 0; id < Dets.size(); id++) diff --git a/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.h b/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.h index 51c2a2b163..66d6699fb3 100644 --- a/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.h +++ b/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.h @@ -159,6 +159,12 @@ class MultiSlaterDetTableMethod : public WaveFunctionComponent void acceptMove(ParticleSet& P, int iat, bool safe_to_delay = false) override; void restore(int iat) override; + void mw_accept_rejectMove(const RefVectorWithLeader& wfc_list, + const RefVectorWithLeader& p_list, + int iat, + const std::vector& isAccepted, + bool safe_to_delay = false) const override; + void registerData(ParticleSet& P, WFBufferType& buf) override; LogValueType updateBuffer(ParticleSet& P, WFBufferType& buf, bool fromscratch = false) override; void copyFromBuffer(ParticleSet& P, WFBufferType& buf) override; From 17160d4425abf312fcc68806a0d9edebe0209c0b Mon Sep 17 00:00:00 2001 From: Peter Doak Date: Fri, 24 Jun 2022 10:05:42 -0400 Subject: [PATCH 33/36] drop standalone-debug for offload builds --- CMake/ClangCompilers.cmake | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/CMake/ClangCompilers.cmake b/CMake/ClangCompilers.cmake index c8ba9ddf9b..3688bb4d9d 100644 --- a/CMake/ClangCompilers.cmake +++ b/CMake/ClangCompilers.cmake @@ -104,8 +104,16 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math") # Set extra debug flags -set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -fno-omit-frame-pointer -fstandalone-debug") -set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fno-omit-frame-pointer -fstandalone-debug") +set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -fno-omit-frame-pointer") +set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fno-omit-frame-pointer") + +# unfortunately this removes standalone-debug altogether for offload builds +# but until we discover how to use the ${OPENMP_OFFLOAD_COMPILE_OPTIONS} more selectively +# this is the only way to avoid a warning per compilation unit that contains an omp symbol. +if (NOT OFFLOAD_TARGET MATCHES "nvptx64") + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -fstandalone-debug") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fstandalone-debug") +endif() #-------------------------------------- # Special architectural flags From 8019f7f21bb0e4db574f9c5cc42b82c4d867b476 Mon Sep 17 00:00:00 2001 From: Peter Doak Date: Fri, 24 Jun 2022 14:19:55 -0400 Subject: [PATCH 34/36] adding a message about -fstandalone-debug --- CMake/ClangCompilers.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/CMake/ClangCompilers.cmake b/CMake/ClangCompilers.cmake index 3688bb4d9d..ac439a4de8 100644 --- a/CMake/ClangCompilers.cmake +++ b/CMake/ClangCompilers.cmake @@ -111,6 +111,7 @@ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fno-omit-frame-pointer") # but until we discover how to use the ${OPENMP_OFFLOAD_COMPILE_OPTIONS} more selectively # this is the only way to avoid a warning per compilation unit that contains an omp symbol. if (NOT OFFLOAD_TARGET MATCHES "nvptx64") + message(STATUS "QMCPACK adds -fstandalone-debug for Debug builds") set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -fstandalone-debug") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fstandalone-debug") endif() From 179a2d0666751671967558590768e75da3624656 Mon Sep 17 00:00:00 2001 From: Paul Kent Date: Fri, 24 Jun 2022 15:56:27 -0400 Subject: [PATCH 35/36] Guard div by 0 --- nexus/tests/unit/test_structure.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nexus/tests/unit/test_structure.py b/nexus/tests/unit/test_structure.py index acdbd6798d..d7aef53980 100644 --- a/nexus/tests/unit/test_structure.py +++ b/nexus/tests/unit/test_structure.py @@ -1372,7 +1372,9 @@ def test_embed(): r = np.linalg.norm(dr,axis=1) dilation = 2*r*np.exp(-r) for i in range(npos): - gr.pos[i] += dilation[i]/r[i]*dr[i] + if r[i]>0: + gr.pos[i] += dilation[i]/r[i]*dr[i] + #end if #end for # Represent the unrelaxed large cell From b89a911fe2dcb19095286035c2d5c4f124ba3424 Mon Sep 17 00:00:00 2001 From: Jeongnim Kim Date: Mon, 11 Jul 2022 11:06:31 -0700 Subject: [PATCH 36/36] Fix to avoid double-free with latest compilers. --- src/Platforms/SYCL/SYCLDeviceManager.cpp | 11 +++++++---- src/Platforms/SYCL/SYCLDeviceManager.h | 7 ++++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/Platforms/SYCL/SYCLDeviceManager.cpp b/src/Platforms/SYCL/SYCLDeviceManager.cpp index 6372660deb..eb5082ac8c 100644 --- a/src/Platforms/SYCL/SYCLDeviceManager.cpp +++ b/src/Platforms/SYCL/SYCLDeviceManager.cpp @@ -41,11 +41,14 @@ SYCLDeviceManager::SYCLDeviceManager(int& default_device_num, int& num_devices, if(id == sycl_default_device_num) { int result; - default_device_queue.reset(static_cast (omp_get_interop_ptr(interop, omp_ipr_targetsync, &result))); + sycl::queue* omp_queue = static_cast(omp_get_interop_ptr(interop, omp_ipr_targetsync, &result)); if(result != omp_irc_success) throw std::runtime_error("SYCLDeviceManager::SYCLDeviceManager fail to obtain sycl::queue by interop"); + + default_device_queue.reset(new sycl::queue(*omp_queue)); + visible_devices[id].queue_ = omp_queue; } - visible_devices[id].interop=interop; + visible_devices[id].interop_ = interop; } #else @@ -92,8 +95,8 @@ SYCLDeviceManager::SYCLDeviceManager(int& default_device_num, int& num_devices, default_device_num = sycl_default_device_num; else if (default_device_num != sycl_default_device_num) throw std::runtime_error("Inconsistent assigned SYCL devices with the previous record!"); - default_device_queue = std::make_unique(visible_devices[sycl_default_device_num].context, - visible_devices[sycl_default_device_num].device); + default_device_queue = std::make_unique(visible_devices[sycl_default_device_num].context_, + visible_devices[sycl_default_device_num].device_); } #endif } diff --git a/src/Platforms/SYCL/SYCLDeviceManager.h b/src/Platforms/SYCL/SYCLDeviceManager.h index 7055c13c3a..7a54391b9f 100644 --- a/src/Platforms/SYCL/SYCLDeviceManager.h +++ b/src/Platforms/SYCL/SYCLDeviceManager.h @@ -24,9 +24,10 @@ namespace qmcplusplus { struct syclDeviceInfo { - sycl::context context; - sycl::device device; - omp_interop_t interop; + sycl::context context_; + sycl::device device_; + sycl::queue* queue_ = nullptr; + omp_interop_t interop_; }; /** SYCL device manager