diff --git a/.github/workflows/ci-github-actions-self-hosted.yaml b/.github/workflows/ci-github-actions-self-hosted.yaml index d2f890091a..5d4b4c3e9f 100644 --- a/.github/workflows/ci-github-actions-self-hosted.yaml +++ b/.github/workflows/ci-github-actions-self-hosted.yaml @@ -222,10 +222,10 @@ jobs: ROCm-Clang13-NoMPI-CUDA2HIP-Real, ROCm-Clang13-NoMPI-CUDA2HIP-Complex-Mixed, ROCm-Clang13-NoMPI-CUDA2HIP-Complex, - ROCm-Clang13-NoMPI-Legacy-CUDA2HIP-Real-Mixed, - ROCm-Clang13-NoMPI-Legacy-CUDA2HIP-Real, - ROCm-Clang13-NoMPI-Legacy-CUDA2HIP-Complex-Mixed, - ROCm-Clang13-NoMPI-Legacy-CUDA2HIP-Complex, + ROCm-Clang13-MPI-Legacy-CUDA2HIP-Real-Mixed, + ROCm-Clang13-MPI-Legacy-CUDA2HIP-Real, + ROCm-Clang13-MPI-Legacy-CUDA2HIP-Complex-Mixed, + ROCm-Clang13-MPI-Legacy-CUDA2HIP-Complex, ] steps: diff --git a/CMake/ClangCompilers.cmake b/CMake/ClangCompilers.cmake index 00d6f28cb4..ac439a4de8 100644 --- a/CMake/ClangCompilers.cmake +++ b/CMake/ClangCompilers.cmake @@ -32,10 +32,6 @@ if(QMC_OMP) set(OPENMP_OFFLOAD_COMPILE_OPTIONS "${OPENMP_OFFLOAD_COMPILE_OPTIONS} -Wno-linker-warnings") endif() - if(NOT DEFINED OFFLOAD_ARCH AND OFFLOAD_TARGET MATCHES "amdgcn") - set(OFFLOAD_ARCH gfx906) - endif() - if(NOT DEFINED OFFLOAD_ARCH AND OFFLOAD_TARGET MATCHES "nvptx64" AND DEFINED CMAKE_CUDA_ARCHITECTURES) @@ -51,6 +47,24 @@ if(QMC_OMP) endif() endif() + if(NOT DEFINED OFFLOAD_ARCH + AND OFFLOAD_TARGET MATCHES "amdgcn") + if (DEFINED HIP_ARCH) + list(LENGTH HIP_ARCH NUMBER_HIP_ARCHITECTURES) + if(NUMBER_HIP_ARCHITECTURES EQUAL "1") + set(OFFLOAD_ARCH ${HIP_ARCH}) + else() + message( + FATAL_ERROR + "LLVM does not yet support offload to multiple architectures! " + "Deriving OFFLOAD_ARCH from HIP_ARCH failed. " + "Please keep only one entry in HIP_ARCH or set OFFLOAD_ARCH.") + endif() + else() + set(OFFLOAD_ARCH gfx906) + endif() + endif() + if(DEFINED OFFLOAD_ARCH) set(OPENMP_OFFLOAD_COMPILE_OPTIONS "${OPENMP_OFFLOAD_COMPILE_OPTIONS} -Xopenmp-target=${OFFLOAD_TARGET} -march=${OFFLOAD_ARCH}") @@ -90,8 +104,17 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math") # Set extra debug flags -set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -fno-omit-frame-pointer -fstandalone-debug") -set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fno-omit-frame-pointer -fstandalone-debug") +set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -fno-omit-frame-pointer") +set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fno-omit-frame-pointer") + +# unfortunately this removes standalone-debug altogether for offload builds +# but until we discover how to use the ${OPENMP_OFFLOAD_COMPILE_OPTIONS} more selectively +# this is the only way to avoid a warning per compilation unit that contains an omp symbol. +if (NOT OFFLOAD_TARGET MATCHES "nvptx64") + message(STATUS "QMCPACK adds -fstandalone-debug for Debug builds") + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -fstandalone-debug") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fstandalone-debug") +endif() #-------------------------------------- # Special architectural flags diff --git a/CMakeLists.txt b/CMakeLists.txt index 7517985f36..79f57919ad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -779,7 +779,7 @@ if(ENABLE_ROCM) message(STATUS "ROCM_ROOT not provided. Searching for FindHIP.cmake file.") find_path( HIP_MODULE_FILE_DIR FindHIP.cmake - HINTS /opt/rocm + HINTS $ENV{ROCM_PATH} /opt/rocm PATH_SUFFIXES hip/cmake) if(HIP_MODULE_FILE_DIR) message(STATUS "Found FindHIP.cmake file. ROCM_ROOT will be derived.") @@ -856,7 +856,7 @@ if(ENABLE_SYCL) endif() add_library(SYCL::host INTERFACE IMPORTED) add_library(SYCL::device INTERFACE IMPORTED) - find_package(IntelDPCPP REQUIRED CONFIGS IntelDPCPPConfig-modified.cmake PATHS ${PROJECT_CMAKE}) + find_package(IntelDPCPP REQUIRED CONFIGS IntelDPCPPConfig-modified.cmake PATHS ${PROJECT_CMAKE} NO_DEFAULT_PATH) target_link_libraries(SYCL::host INTERFACE OneAPI::DPCPP-host) target_link_libraries(SYCL::device INTERFACE OneAPI::DPCPP-device) if(TARGET MKL::sycl) diff --git a/docs/installation.rst b/docs/installation.rst index fd7fa722ff..b5cd4fbe24 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -282,20 +282,20 @@ the path to the source directory. :: - QMC_CUDA Enable legacy CUDA code path for NVIDIA GPU acceleration (1:yes, 0:no) QMC_COMPLEX Build the complex (general twist/k-point) version (1:yes, 0:no) QMC_MIXED_PRECISION Build the mixed precision (mixing double/float) version (1:yes (QMC_CUDA=1 default), 0:no (QMC_CUDA=0 default)). Mixed precision calculations can be signifiantly faster but should be carefully checked validated against full double precision runs, particularly for large electron counts. + ENABLE_OFFLOAD ON/OFF(default). Enable OpenMP target offload for GPU acceleration. + QMC_CUDA Enable legacy CUDA code path for NVIDIA GPU acceleration (1:yes, 0:no) ENABLE_CUDA ON/OFF(default). Enable CUDA code path for NVIDIA GPU acceleration. - Production quality for AFQMC. Pre-production quality for real-space. + Production quality for AFQMC and real-space performance portable implementation. Use CMAKE_CUDA_ARCHITECTURES, default 70, to set the actual GPU architecture. - ENABLE_OFFLOAD ON/OFF(default). Enable OpenMP target offload for GPU acceleration. - ENABLE_TIMERS ON(default)/OFF. Enable fine-grained timers. Timers are on by default but at level coarse - to avoid potential slowdown in tiny systems. - For systems beyond tiny sizes (100+ electrons) there is no risk. + QMC_CUDA2HIP ON/OFF(default). To be set ON, it requires either QMC_CUDA or ENABLE_CUDA to be ON. + Compile CUDA source code as HIP and use ROCm libraries for AMD GPUs. + ENABLE_SYCL ON/OFF(default). Enable SYCL code path. Only support Intel GPUs and OneAPI compilers. - General build options @@ -327,6 +327,9 @@ the path to the source directory. :: + ENABLE_TIMERS ON(default)/OFF. Enable fine-grained timers. Timers are on by default but at level coarse + to avoid potential slowdown in tiny systems. + For systems beyond tiny sizes (100+ electrons) there is no risk. QE_BIN Location of Quantum ESPRESSO binaries including pw2qmcpack.x RMG_BIN Location of RMG binary (rmg-cpu) QMC_DATA Specify data directory for QMCPACK performance and integration tests @@ -412,7 +415,7 @@ and is not suitable for production. Additional implementation in QMCPACK as well as improvements in open-source and vendor compilers is required for production status to be reached. The following compilers have been verified: -- LLVM Clang 11. Support NVIDIA GPUs. +- LLVM Clang 14. Support NVIDIA GPUs. :: @@ -425,31 +428,43 @@ to be reached. The following compilers have been verified: OFFLOAD_TARGET for the offload target. default nvptx64-nvidia-cuda. OFFLOAD_ARCH for the target architecture (sm_80, gfx906, ...) if not using the compiler default. -- AMD AOMP Clang 11.8. Support AMD GPUs. +- AMD ROCm/AOMP LLVM-based compilers. Support AMD GPUs. :: -D ENABLE_OFFLOAD=ON -D OFFLOAD_TARGET=amdgcn-amd-amdhsa -D OFFLOAD_ARCH=gfx906 -- Intel oneAPI beta08. Support Intel GPUs. +- Intel oneAPI 2022.1.0 icx/icpx compilers. Support Intel GPUs. :: -D ENABLE_OFFLOAD=ON -D OFFLOAD_TARGET=spir64 -- HPE Cray 11. It is derived from Clang and supports NVIDIA and AMD GPUs. +- HPE Cray 13. It is derived from Clang and supports NVIDIA and AMD GPUs. :: -D ENABLE_OFFLOAD=ON -D OFFLOAD_TARGET=nvptx64-nvidia-cuda -D OFFLOAD_ARCH=sm_80 OpenMP offload features can be used together with vendor specific code paths to maximize QMCPACK performance. -Some new CUDA functionality has been implemented to improve efficiency on NVIDIA GPUs in conjunction with the Offload code paths: -For example, using Clang 11 on Summit. +Some new CUDA functionality has been implemented to improve performance on NVIDIA GPUs in conjunction with the offload code paths: +For example, using Clang 14 on Summit. :: - -D ENABLE_OFFLOAD=ON -D USE_OBJECT_TARGET=ON -D ENABLE_CUDA=ON -D CMAKE_CUDA_ARCHITECTURES=70 -D CMAKE_CUDA_HOST_COMPILER=`which gcc` + -D ENABLE_OFFLOAD=ON -D USE_OBJECT_TARGET=ON -D ENABLE_CUDA=ON -D CMAKE_CUDA_ARCHITECTURES=70 + +Similarly, HIP features can be enabled in conjunction with the offload code path to improve performance on AMD GPUs. + + :: + + -D ENABLE_OFFLOAD=ON -D ENABLE_CUDA=ON -D QMC_CUDA2HIP=ON -DHIP_ARCH=gfx906 + +Similarly, SYCL features can be enabled in conjunction with the offload code path to improve performance on Intel GPUs. + + :: + + -D ENABLE_OFFLOAD=ON -D ENABLE_SYCL=ON Installation from CMake diff --git a/nexus/tests/unit/test_structure.py b/nexus/tests/unit/test_structure.py index acdbd6798d..d7aef53980 100644 --- a/nexus/tests/unit/test_structure.py +++ b/nexus/tests/unit/test_structure.py @@ -1372,7 +1372,9 @@ def test_embed(): r = np.linalg.norm(dr,axis=1) dilation = 2*r*np.exp(-r) for i in range(npos): - gr.pos[i] += dilation[i]/r[i]*dr[i] + if r[i]>0: + gr.pos[i] += dilation[i]/r[i]*dr[i] + #end if #end for # Represent the unrelaxed large cell diff --git a/src/Configuration.h b/src/Configuration.h index 42ba461299..0244546422 100644 --- a/src/Configuration.h +++ b/src/Configuration.h @@ -50,7 +50,8 @@ struct QMCTraits { enum { - DIM = OHMMS_DIM + DIM = OHMMS_DIM, + DIM_VGL = OHMMS_DIM + 2 // Value(1) + Gradients(OHMMS_DIM) + Laplacian(1) }; using QTBase = QMCTypes; using QTFull = QMCTypes; diff --git a/src/Containers/OhmmsPETE/OhmmsArray.h b/src/Containers/OhmmsPETE/OhmmsArray.h index 0fca688529..02a36f0cff 100644 --- a/src/Containers/OhmmsPETE/OhmmsArray.h +++ b/src/Containers/OhmmsPETE/OhmmsArray.h @@ -82,6 +82,8 @@ class Array inline typename Container_t::const_iterator begin() const { return X.begin(); } inline typename Container_t::const_iterator end() const { return X.end(); } + ///@{ + /// access the container data pointer inline Type_t* data() { return X.data(); } inline const Type_t* data() const { return X.data(); } template> @@ -94,14 +96,70 @@ class Array { return X.device_data(); } + ///@} - inline const Type_t* first_address() const { return &(X[0]); } + ///@{ + /// access the data pointer at {index_1, ..., index_D} + template> + Type_t* data_at(const std::array& indices) + { + return X.data() + compute_offset(indices); + } + template> + const Type_t* data_at(const std::array& indices) const + { + return X.data() + compute_offset(indices); + } + template, + typename Allocator = ALLOC, + typename = qmcplusplus::IsDualSpace> + Type_t* device_data_at(const std::array& indices) + { + return X.device_data() + compute_offset(indices); + } + template, + typename Allocator = ALLOC, + typename = qmcplusplus::IsDualSpace> + const Type_t* device_data_at(const std::array& indices) const + { + return X.device_data() + compute_offset(indices); + } + + template + Type_t* data_at(Args... indices) + { + static_assert(sizeof...(Args) == D, "data arguments must match dimensionality of Array"); + return data_at({static_cast(std::forward(indices))...}); + } + template + const Type_t* data_at(Args... indices) const + { + static_assert(sizeof...(Args) == D, "data arguments must match dimensionality of Array"); + return data_at({static_cast(std::forward(indices))...}); + } + template> + Type_t* device_data_at(Args... indices) + { + static_assert(sizeof...(Args) == D, "device_data arguments must match dimensionality of Array"); + return device_data_at({static_cast(std::forward(indices))...}); + } + template> + const Type_t* device_data_at(Args... indices) const + { + static_assert(sizeof...(Args) == D, "device_data arguments must match dimensionality of Array"); + return device_data_at({static_cast(std::forward(indices))...}); + } + ///@} - inline const Type_t* last_address() const { return &(X[0]) + X.size(); } + inline const Type_t* first_address() const { return X.data(); } - inline Type_t* first_address() { return &(X[0]); } + inline const Type_t* last_address() const { return X.data() + X.size(); } - inline Type_t* last_address() { return &(X[0]) + X.size(); } + inline Type_t* first_address() { return X.data(); } + + inline Type_t* last_address() { return X.data() + X.size(); } This_t& operator=(const T& rhs) { @@ -127,22 +185,31 @@ class Array return *this; } - // Get and Set Operations - inline Type_t& operator()(size_t i) { return X[i]; } - - inline Type_t operator()(size_t i) const { return X[i]; } - inline Type_t& operator()(size_t i, size_t j) { return X[j + Length[1] * i]; } - inline Type_t operator()(size_t i, size_t j) const { return X[j + Length[1] * i]; } - inline Type_t& operator()(size_t i, size_t j, size_t k) { return X[k + Length[2] * (j + Length[1] * i)]; } - inline Type_t operator()(size_t i, size_t j, size_t k) const { return X[k + Length[2] * (j + Length[1] * i)]; } - inline Type_t& operator()(size_t i, size_t j, size_t k, size_t l) + ///@{ + /// access the element at {index_1, ..., index_D} + template> + Type_t& operator()(const std::array& indices) { - return X[l + Length[3] * (k + Length[2] * (j + Length[1] * i))]; + return X[compute_offset(indices)]; } - inline Type_t operator()(size_t i, size_t j, size_t k, size_t l) const + template> + const Type_t& operator()(const std::array& indices) const { - return X[l + Length[3] * (k + Length[2] * (j + Length[1] * i))]; + return X[compute_offset(indices)]; } + template + Type_t& operator()(Args... indices) + { + static_assert(sizeof...(Args) == D, "operator() arguments must match dimensionality of Array"); + return operator()({static_cast(std::forward(indices))...}); + } + template + const Type_t& operator()(Args... indices) const + { + static_assert(sizeof...(Args) == D, "operator() arguments must match dimensionality of Array"); + return operator()({static_cast(std::forward(indices))...}); + } + ///@} inline Type_t sum() const { @@ -152,6 +219,18 @@ class Array return s; } + // Abstract Dual Space Transfers + template> + void updateTo() + { + X.updateTo(); + } + template> + void updateFrom() + { + X.updateFrom(); + } + private: std::array Length; Container_t X; @@ -163,6 +242,15 @@ class Array total *= dims[i]; return total; } + + template> + SIZET compute_offset(const std::array& indices) const + { + SIZET offset = indices[0]; + for (int i = 1; i < indices.size(); i++) + offset = offset * Length[i] + indices[i]; + return offset; + } }; template diff --git a/src/Containers/OhmmsPETE/tests/test_Array.cpp b/src/Containers/OhmmsPETE/tests/test_Array.cpp index e4bf9cb3b1..5903feeb53 100644 --- a/src/Containers/OhmmsPETE/tests/test_Array.cpp +++ b/src/Containers/OhmmsPETE/tests/test_Array.cpp @@ -24,8 +24,8 @@ namespace qmcplusplus TEST_CASE("array", "[OhmmsPETE]") { using Array1D = Array; - Array1D A({3}); - Array1D B({3}); + Array1D A(3); + Array1D B(3); // iterator auto ia = A.begin(); @@ -73,6 +73,17 @@ TEST_CASE("array NestedContainers", "[OhmmsPETE]") CHECK(vec_copy(0).back() == 123); } +TEST_CASE("Array::data", "[OhmmsPETE]") +{ + Array tensor(2, 4, 5); + REQUIRE(tensor.size() == 40); + + CHECK(tensor.data() + 1 * 4 * 5 + 2 * 5 + 3 == tensor.data_at(1, 2, 3)); + + tensor(1, 2, 3) = 0.5f; + CHECK(*tensor.data_at(1, 2, 3) == 0.5f); +} + TEST_CASE("Array::dimension sizes constructor", "[OhmmsPETE]") { const int dim = 2; diff --git a/src/Estimators/TraceManager.h b/src/Estimators/TraceManager.h index cfb081a7a8..b676759a3e 100644 --- a/src/Estimators/TraceManager.h +++ b/src/Estimators/TraceManager.h @@ -1193,9 +1193,7 @@ struct TraceBuffer if (has_complex) complex_samples->combine_samples(); //collect data from all samples into the buffer row - int offset = current_row * row_size; { - int boffset; std::vector*>& ordered_samples = samples->ordered_samples; for (int s = 0; s < ordered_samples.size(); s++) { @@ -1203,17 +1201,13 @@ struct TraceBuffer if (tsample.write) { auto& sample = tsample.sample; - boffset = offset + tsample.buffer_start; for (int i = 0; i < sample.size(); ++i) - { - buffer(boffset + i) = sample[i]; - } + buffer(current_row, tsample.buffer_start + i) = sample[i]; } } } if (has_complex) { - int boffset; std::vector>*>& ordered_samples = complex_samples->ordered_samples; for (int s = 0; s < ordered_samples.size(); s++) { @@ -1221,11 +1215,10 @@ struct TraceBuffer if (tsample.write) { auto& sample = tsample.sample; - boffset = offset + tsample.buffer_start; for (int i = 0, ib = 0; i < sample.size(); ++i, ib += 2) { - buffer(boffset + ib) = sample[i].real(); - buffer(boffset + ib + 1) = sample[i].imag(); + buffer(current_row, tsample.buffer_start + ib) = sample[i].real(); + buffer(current_row, tsample.buffer_start + ib + 1) = sample[i].imag(); } } } diff --git a/src/Estimators/tests/test_OneBodyDensityMatrices.cpp b/src/Estimators/tests/test_OneBodyDensityMatrices.cpp index fc1e704a73..b448c5ef8c 100644 --- a/src/Estimators/tests/test_OneBodyDensityMatrices.cpp +++ b/src/Estimators/tests/test_OneBodyDensityMatrices.cpp @@ -140,7 +140,7 @@ class OneBodyDensityMatricesTests auto* test_data = reinterpret_cast*>(test_in); for (size_t id = 0; id < size; id += 2) #if defined(MIXED_PRECISION) - CHECK(ref_data[id] == ComplexApprox(test_data[id]).epsilon(1e-4)); + CHECK(ref_data[id] == ComplexApprox(test_data[id]).epsilon(6e-4)); #else CHECK(ref_data[id] == ComplexApprox(test_data[id])); #endif diff --git a/src/Particle/DistanceTable.h b/src/Particle/DistanceTable.h index c1fa5b07ac..3175be4596 100644 --- a/src/Particle/DistanceTable.h +++ b/src/Particle/DistanceTable.h @@ -103,7 +103,6 @@ class DistanceTable virtual void mw_evaluate(const RefVectorWithLeader& dt_list, const RefVectorWithLeader& p_list) const { -#pragma omp parallel for for (int iw = 0; iw < dt_list.size(); iw++) dt_list[iw].evaluate(p_list[iw]); } @@ -117,7 +116,6 @@ class DistanceTable const RefVectorWithLeader& p_list, const std::vector& recompute) const { -#pragma omp parallel for for (int iw = 0; iw < dt_list.size(); iw++) if (recompute[iw]) dt_list[iw].evaluate(p_list[iw]); @@ -145,7 +143,6 @@ class DistanceTable const IndexType iat, bool prepare_old = true) const { -#pragma omp parallel for for (int iw = 0; iw < dt_list.size(); iw++) dt_list[iw].move(p_list[iw], rnew_list[iw], iat, prepare_old); } @@ -174,7 +171,6 @@ class DistanceTable IndexType jat, const std::vector& from_temp) { -#pragma omp parallel for for (int iw = 0; iw < dt_list.size(); iw++) dt_list[iw].updatePartial(jat, from_temp[iw]); } @@ -192,7 +188,6 @@ class DistanceTable virtual void mw_finalizePbyP(const RefVectorWithLeader& dt_list, const RefVectorWithLeader& p_list) const { -#pragma omp parallel for for (int iw = 0; iw < dt_list.size(); iw++) dt_list[iw].finalizePbyP(p_list[iw]); } diff --git a/src/Particle/ParticleSet.cpp b/src/Particle/ParticleSet.cpp index 8e69f67fbc..14f5c1ee3f 100644 --- a/src/Particle/ParticleSet.cpp +++ b/src/Particle/ParticleSet.cpp @@ -33,13 +33,6 @@ namespace qmcplusplus { using WP = WalkerProperties::Indexes; -//using namespace particle_info; - -#ifdef QMC_CUDA -template<> -int ParticleSet::Walker_t::cuda_DataSize = 0; -#endif - enum PSetTimers { PS_newpos, @@ -387,11 +380,8 @@ void ParticleSet::mw_update(const RefVectorWithLeader& p_list, bool } if (!skipSK && p_leader.structure_factor_) - { -#pragma omp parallel for for (int iw = 0; iw < p_list.size(); iw++) p_list[iw].structure_factor_->updateAllPart(p_list[iw]); - } } void ParticleSet::makeMove(Index_t iat, const SingleParticlePos& displ, bool maybe_accept) @@ -859,7 +849,6 @@ void ParticleSet::mw_loadWalker(const RefVectorWithLeader& p_list, pset.spins = awalker.spins; pset.coordinates_->setAllParticlePos(pset.R); }; -#pragma omp parallel for for (int iw = 0; iw < p_list.size(); ++iw) if (recompute[iw]) loadWalkerConfig(p_list[iw], walkers[iw]); diff --git a/src/Particle/SoaDistanceTableAAOMPTarget.h b/src/Particle/SoaDistanceTableAAOMPTarget.h index 646b5085ee..c850488e6e 100644 --- a/src/Particle/SoaDistanceTableAAOMPTarget.h +++ b/src/Particle/SoaDistanceTableAAOMPTarget.h @@ -477,7 +477,6 @@ struct SoaDistanceTableAAOMPTarget : public DTD_BConds, public Distanc if (!(modes_ & DTModes::NEED_TEMP_DATA_ON_HOST)) return; -#pragma omp parallel for for (int iw = 0; iw < dt_list.size(); iw++) dt_list[iw].updatePartial(jat, from_temp[iw]); } diff --git a/src/Particle/Walker.h b/src/Particle/Walker.h index 03347145d5..91847fcaba 100644 --- a/src/Particle/Walker.h +++ b/src/Particle/Walker.h @@ -154,7 +154,7 @@ class Walker /// Data for GPU-vectorized versions #ifdef QMC_CUDA - static inline int cuda_DataSize; + static inline int cuda_DataSize = 0; using cuda_Buffer_t = gpu::device_vector; cuda_Buffer_t cuda_DataSet; // Note that R_GPU has size N+1. The last element contains the diff --git a/src/Platforms/SYCL/SYCLDeviceManager.cpp b/src/Platforms/SYCL/SYCLDeviceManager.cpp index e9478451b8..eb5082ac8c 100644 --- a/src/Platforms/SYCL/SYCLDeviceManager.cpp +++ b/src/Platforms/SYCL/SYCLDeviceManager.cpp @@ -17,36 +17,47 @@ #include #include #include "config.h" -#include "OutputManager.h" -#include "determineDefaultDeviceNum.h" -#if defined(_OPENMP) -#include -#include -#include -#include -#endif - +#include "Platforms/Host/OutputManager.h" +#include "Platforms/Host/determineDefaultDeviceNum.h" namespace qmcplusplus { -#if defined(_OPENMP) -/** create SYCL device/contexts from OpenMP owned ones to ensure interoperability. - * CUDA has the notion of primary context while SYCL requires explicitly sharing context. - */ -static std::vector xomp_get_sycl_devices(); -#endif + +std::unique_ptr SYCLDeviceManager::default_device_queue; SYCLDeviceManager::SYCLDeviceManager(int& default_device_num, int& num_devices, int local_rank, int local_size) : sycl_default_device_num(-1) { +#if defined(ENABLE_OFFLOAD) + const int sycl_device_count=omp_get_num_devices(); + sycl_default_device_num = determineDefaultDeviceNum(sycl_device_count, local_rank, local_size); + if (default_device_num < 0) + default_device_num = sycl_default_device_num; + + visible_devices.resize(sycl_device_count); + for (int id = 0; id < sycl_device_count; id++) + { + omp_interop_t interop; +#pragma omp interop device(id) init(prefer_type("sycl"),targetsync: interop) + if(id == sycl_default_device_num) + { + int result; + sycl::queue* omp_queue = static_cast(omp_get_interop_ptr(interop, omp_ipr_targetsync, &result)); + if(result != omp_irc_success) + throw std::runtime_error("SYCLDeviceManager::SYCLDeviceManager fail to obtain sycl::queue by interop"); + + default_device_queue.reset(new sycl::queue(*omp_queue)); + visible_devices[id].queue_ = omp_queue; + } + visible_devices[id].interop_ = interop; + } + +#else // Potentially multiple GPU platform. std::vector platforms = sycl::platform::get_platforms(); if (platforms.empty()) throw std::runtime_error("Cannot find SYCL platforms!"); - // find out devices from the first platform with GPUs. -#if defined(ENABLE_OFFLOAD) - visible_devices = xomp_get_sycl_devices(); -#else + // find out devices from the first platform with GPUs. std::vector devices; app_log() << "Visible SYCL platforms are :" << std::endl; for (auto& platform : platforms) @@ -67,7 +78,6 @@ SYCLDeviceManager::SYCLDeviceManager(int& default_device_num, int& num_devices, visible_devices.reserve(devices.size()); for (int id = 0; id < devices.size(); id++) visible_devices.push_back({sycl::context(devices[id]), devices[id]}); -#endif const size_t sycl_device_count = visible_devices.size(); if (num_devices == 0) @@ -85,14 +95,12 @@ SYCLDeviceManager::SYCLDeviceManager(int& default_device_num, int& num_devices, default_device_num = sycl_default_device_num; else if (default_device_num != sycl_default_device_num) throw std::runtime_error("Inconsistent assigned SYCL devices with the previous record!"); - - default_device_queue = std::make_unique(visible_devices[sycl_default_device_num].context, - visible_devices[sycl_default_device_num].device); + default_device_queue = std::make_unique(visible_devices[sycl_default_device_num].context_, + visible_devices[sycl_default_device_num].device_); } +#endif } -std::unique_ptr SYCLDeviceManager::default_device_queue; - sycl::queue& SYCLDeviceManager::getDefaultDeviceQueue() { if (!default_device_queue) @@ -100,87 +108,4 @@ sycl::queue& SYCLDeviceManager::getDefaultDeviceQueue() return *default_device_queue; } -#if defined(_OPENMP) -static std::vector xomp_get_sycl_devices() -{ - enum class Backend - { - UNKNOWN, - LEVEL_ZERO, - OPENCL - }; - - const auto num_omp_devices = omp_get_num_devices(); - Backend selected_backend = Backend::UNKNOWN; - std::vector devices(num_omp_devices); - for (int id = 0; id < num_omp_devices; id++) - { - omp_interop_t o = 0; - Backend my_backend = Backend::UNKNOWN; -#pragma omp interop init(prefer_type("sycl"), targetsync : o) device(id) - int err = -1; - - const std::string omp_backend(omp_get_interop_str(o, omp_ipr_fr_name, &err)); - assert(err >= 0 && "omp_get_interop_str(omp_ipr_fr_name)"); - - if (omp_backend.find("level_zero") == 0) - { - my_backend = Backend::LEVEL_ZERO; - - auto hPlatform = omp_get_interop_ptr(o, omp_ipr_platform, &err); - assert(err >= 0 && "omp_get_interop_ptr(omp_ipr_platform)"); - auto hContext = omp_get_interop_ptr(o, omp_ipr_device_context, &err); - assert(err >= 0 && "omp_get_interop_ptr(omp_ipr_device_context)"); - auto hDevice = omp_get_interop_ptr(o, omp_ipr_device, &err); - assert(err >= 0 && "omp_get_interop_ptr(omp_ipr_device)"); - - const sycl::platform sycl_platform = - sycl::ext::oneapi::level_zero::make_platform(reinterpret_cast(hPlatform)); - devices[id].device = - sycl::ext::oneapi::level_zero::make_device(sycl_platform, reinterpret_cast(hDevice)); - - devices[id].context = sycl::ext::oneapi::level_zero::make_context({devices[id].device}, - reinterpret_cast(hContext), - true /* keep the ownership, no transfer */); - } - else if (omp_backend.find("opencl") == 0) - { - my_backend = Backend::OPENCL; - /* - auto hContext = omp_get_interop_ptr(o, omp_ipr_device_context, &err); - assert (err >= 0 && "omp_get_interop_ptr(omp_ipr_device_context)"); - auto hDevice = omp_get_interop_ptr(o, omp_ipr_device, &err); - assert (err >= 0 && "omp_get_interop_ptr(omp_ipr_device)"); - devices[id].device = sycl::make_device(static_cast(hDevice)); - devices[id].context = sycl::make_context(static_cast(hContext)); -*/ - } - -#pragma omp interop destroy(o) - - if (selected_backend == Backend::UNKNOWN) - selected_backend = my_backend; - else if (selected_backend != my_backend) - throw std::runtime_error("Inconsistent backends detected among OpenMP devices."); - } - - if (devices.size() > 0) - switch (selected_backend) - { - case Backend::LEVEL_ZERO: - app_log() << "SYCL adopts the Level Zero backend chosen by OpenMP." << std::endl; - break; - case Backend::OPENCL: - throw std::runtime_error("OpenMP has chosen the OpenCL backend. " - "We have not yet worked out its interoperability with SYCL. " - "Please set the Level Zero backend in OpenMP!"); - break; - default: - throw std::runtime_error("Failed in extracting OpenMP backend supported by SYCL."); - } - - return devices; -} -#endif - } // namespace qmcplusplus diff --git a/src/Platforms/SYCL/SYCLDeviceManager.h b/src/Platforms/SYCL/SYCLDeviceManager.h index 3b5dff0c33..7a54391b9f 100644 --- a/src/Platforms/SYCL/SYCLDeviceManager.h +++ b/src/Platforms/SYCL/SYCLDeviceManager.h @@ -18,13 +18,16 @@ #include #include #include +#include namespace qmcplusplus { struct syclDeviceInfo { - sycl::context context; - sycl::device device; + sycl::context context_; + sycl::device device_; + sycl::queue* queue_ = nullptr; + omp_interop_t interop_; }; /** SYCL device manager diff --git a/src/Platforms/SYCL/syclBLAS.cpp b/src/Platforms/SYCL/syclBLAS.cpp index 31cc35e89b..83ee3c6b4d 100644 --- a/src/Platforms/SYCL/syclBLAS.cpp +++ b/src/Platforms/SYCL/syclBLAS.cpp @@ -113,11 +113,11 @@ sycl::event gemm(sycl::queue& handle, const int ldc, const std::vector& events) { - return oneapi::mkl::blas::gemm(handle, convertTransEnum(tA), convertTransEnum(tB), m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, events); + return oneapi::mkl::blas::gemm(handle, convertTransEnum(tA), convertTransEnum(tB), m, n, k, alpha, A, lda, B, ldb, + beta, C, ldc, events); } - template sycl::event gemm(sycl::queue& handle, const char tA, const char tB, @@ -182,6 +182,109 @@ template sycl::event gemm(sycl::queue& handle, const int ldc, const std::vector& events); +//transpose +template +sycl::event transpose(sycl::queue& q, + const T1* restrict in, + int m, + int lda, + T2* restrict out, + int n, + int ldb, + const std::vector& events) +{ + constexpr size_t tile_size = 16; + const size_t m_max = ((m + tile_size - 1) / tile_size) * tile_size; + const size_t n_max = ((n + tile_size - 1) / tile_size) * tile_size; + + return q.submit([&](sycl::handler& cgh) { + cgh.depends_on(events); + sycl::accessor tile(sycl::range<2>(tile_size, + tile_size + 1), + cgh); + + cgh.parallel_for(sycl::nd_range<2>{{m_max, n_max}, {tile_size, tile_size}}, [=](sycl::nd_item<2> item) { + unsigned x = item.get_global_id(1); + unsigned y = item.get_global_id(0); + unsigned xth = item.get_local_id(1); + unsigned yth = item.get_local_id(0); + + if (x < n && y < m) + tile[yth][xth] = in[(y)*lda + x]; + item.barrier(sycl::access::fence_space::local_space); + + x = item.get_group(0) * tile_size + xth; + y = item.get_group(1) * tile_size + yth; + if (x < m && y < n) + out[(y)*ldb + x] = tile[xth][yth]; + }); + }); +} + +template sycl::event transpose(sycl::queue& q, + const float* restrict in, + int m, + int lda, + double* restrict out, + int n, + int ldb, + const std::vector& events); + +template sycl::event transpose(sycl::queue& q, + const double* restrict in, + int m, + int lda, + double* restrict out, + int n, + int ldb, + const std::vector& events); + +template sycl::event transpose(sycl::queue& q, + const std::complex* restrict in, + int m, + int lda, + std::complex* restrict out, + int n, + int ldb, + const std::vector& events); + +template sycl::event transpose(sycl::queue& q, + const std::complex* restrict in, + int m, + int lda, + std::complex* restrict out, + int n, + int ldb, + const std::vector& events); + +//copy_n for mixed precision +template +sycl::event copy_n(sycl::queue& aq, + const T1* restrict VA, + size_t array_size, + T2* restrict VC, + const std::vector& events) +{ + constexpr size_t tile_size = 64; + const size_t a_max = ((array_size + tile_size - 1) / tile_size) * tile_size; + return aq.parallel_for(sycl::range<1>{a_max}, events, [=](sycl::id<1> id) { + if (id < array_size) + VC[id] = static_cast(VA[id]); + }); +} + +template sycl::event copy_n(sycl::queue& aq, + const double* restrict VA, + size_t array_size, + float* restrict VC, + const std::vector& events); + +template sycl::event copy_n(sycl::queue& aq, + const std::complex* restrict VA, + size_t array_size, + std::complex* restrict VC, + const std::vector& events); + } // namespace syclBLAS } // namespace qmcplusplus diff --git a/src/Platforms/SYCL/syclBLAS.hpp b/src/Platforms/SYCL/syclBLAS.hpp index 2952dfa3b3..3ecf85dcb1 100644 --- a/src/Platforms/SYCL/syclBLAS.hpp +++ b/src/Platforms/SYCL/syclBLAS.hpp @@ -56,6 +56,23 @@ sycl::event gemm(sycl::queue& handle, const int ldc, const std::vector& events = {}); +template +sycl::event transpose(sycl::queue& q, + const T1* in, + int m, + int lda, + T2* out, + int n, + int ldb, + const std::vector& events = {}); + +template +sycl::event copy_n(sycl::queue& aq, + const T1* VA, + size_t array_size, + T2* VC, + const std::vector& events = {}); + } // namespace syclBLAS } // namespace qmcplusplus diff --git a/src/Platforms/SYCL/syclSolver.hpp b/src/Platforms/SYCL/syclSolver.hpp new file mode 100644 index 0000000000..fe18f8a7bd --- /dev/null +++ b/src/Platforms/SYCL/syclSolver.hpp @@ -0,0 +1,25 @@ +////////////////////////////////////////////////////////////////////////////////////// +// This file is distributed under the University of Illinois/NCSA Open Source License. +// See LICENSE file in top directory for details. +// +// Copyright (c) 2022 QMCPACK developers. +// +// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory +// +// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory +////////////////////////////////////////////////////////////////////////////////////// + +#ifndef QMCPLUSPLUS_SYCL_MKL_SOLVER_H +#define QMCPLUSPLUS_SYCL_MKL_SOLVER_H + +#include "oneapi/mkl/lapack.hpp" + +namespace qmcplusplus +{ +namespace syclSolver +{ +using namespace oneapi::mkl::lapack; +} +} // namespace qmcplusplus + +#endif diff --git a/src/QMCHamiltonians/ECPComponentBuilder.2.cpp b/src/QMCHamiltonians/ECPComponentBuilder.2.cpp index 7d28e17902..f4aaf6eeac 100644 --- a/src/QMCHamiltonians/ECPComponentBuilder.2.cpp +++ b/src/QMCHamiltonians/ECPComponentBuilder.2.cpp @@ -474,7 +474,7 @@ void ECPComponentBuilder::doBreakUp(const std::vector& angList, RealType rmax, mRealType Vprefactor) { -#ifdef QMC_CUDA +#if defined(QMC_CUDA) && !defined(QMC_CUDA2HIP) int device; cudaCheck(cudaGetDevice(&device)); cudaDeviceProp deviceProp; diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.cpp index ceb6a9c756..3844d07b37 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.cpp +++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.cpp @@ -26,7 +26,6 @@ inline void assign_v(ST x, ST y, ST z, TT* restrict results_scratch_ptr, - size_t orb_size, const ST* restrict offload_scratch_ptr, const ST* restrict myKcart_ptr, size_t myKcart_padded_size, @@ -56,8 +55,8 @@ inline void assign_vgl(ST x, ST y, ST z, TT* restrict results_scratch_ptr, + size_t orb_padded_size, const ST* mKK_ptr, - size_t orb_size, const ST* restrict offload_scratch_ptr, size_t spline_padded_size, const ST symGGt[6], @@ -86,10 +85,6 @@ inline void assign_vgl(ST x, const ST* restrict h12 = offload_scratch_ptr + spline_padded_size * 8; const ST* restrict h22 = offload_scratch_ptr + spline_padded_size * 9; - TT* restrict psi = results_scratch_ptr; - TT* restrict dpsi = results_scratch_ptr + orb_size; - TT* restrict d2psi = results_scratch_ptr + orb_size * 4; - const size_t jr = index << 1; const size_t ji = jr + 1; @@ -125,12 +120,18 @@ inline void assign_vgl(ST x, const ST lap_r = lcart_r + mKK_ptr[index] * val_r + two * (kX * dX_i + kY * dY_i + kZ * dZ_i); const ST lap_i = lcart_i + mKK_ptr[index] * val_i - two * (kX * dX_r + kY * dY_r + kZ * dZ_r); - const size_t psiIndex = first_spo + index; - psi[psiIndex] = TT(c * val_r - s * val_i, c * val_i + s * val_r); - d2psi[psiIndex] = TT(c * lap_r - s * lap_i, c * lap_i + s * lap_r); - dpsi[psiIndex * 3] = TT(c * gX_r - s * gX_i, c * gX_i + s * gX_r); - dpsi[psiIndex * 3 + 1] = TT(c * gY_r - s * gY_i, c * gY_i + s * gY_r); - dpsi[psiIndex * 3 + 2] = TT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r); + TT* restrict psi = results_scratch_ptr; + TT* restrict dpsi_x = results_scratch_ptr + orb_padded_size; + TT* restrict dpsi_y = results_scratch_ptr + orb_padded_size * 2; + TT* restrict dpsi_z = results_scratch_ptr + orb_padded_size * 3; + TT* restrict d2psi = results_scratch_ptr + orb_padded_size * 4; + + const size_t psiIndex = first_spo + index; + psi[psiIndex] = TT(c * val_r - s * val_i, c * val_i + s * val_r); + d2psi[psiIndex] = TT(c * lap_r - s * lap_i, c * lap_i + s * lap_r); + dpsi_x[psiIndex] = TT(c * gX_r - s * gX_i, c * gX_i + s * gX_r); + dpsi_y[psiIndex] = TT(c * gY_r - s * gY_i, c * gY_i + s * gY_r); + dpsi_z[psiIndex] = TT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r); } } // namespace C2C @@ -236,7 +237,7 @@ void SplineC2COMPTarget::evaluateDetRatios(const VirtualParticleSet& VP, const auto padded_size = myV.size(); offload_scratch.resize(padded_size * nVP); const auto orb_size = psiinv.size(); - results_scratch.resize(orb_size * nVP); + results_scratch.resize(padded_size * nVP); // Ye: need to extract sizes and pointers before entering target region const auto* spline_ptr = SplineInst->getSplinePtr(); @@ -260,7 +261,7 @@ void SplineC2COMPTarget::evaluateDetRatios(const VirtualParticleSet& VP, const size_t last = omptarget::min(first + ChunkSizePerTeam, padded_size); auto* restrict offload_scratch_iat_ptr = offload_scratch_ptr + padded_size * iat; - auto* restrict psi_iat_ptr = results_scratch_ptr + orb_size * iat; + auto* restrict psi_iat_ptr = results_scratch_ptr + padded_size * iat; auto* restrict pos_scratch = reinterpret_cast(psiinv_ptr + orb_size); int ix, iy, iz; @@ -277,8 +278,7 @@ void SplineC2COMPTarget::evaluateDetRatios(const VirtualParticleSet& VP, PRAGMA_OFFLOAD("omp parallel for") for (int index = first_cplx; index < last_cplx; index++) C2C::assign_v(ST(pos_scratch[iat * 6]), ST(pos_scratch[iat * 6 + 1]), ST(pos_scratch[iat * 6 + 2]), - psi_iat_ptr, orb_size, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, - first_spo_local, index); + psi_iat_ptr, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local, index); ComplexT sum(0); PRAGMA_OFFLOAD("omp parallel for simd reduction(+:sum)") @@ -355,7 +355,7 @@ void SplineC2COMPTarget::mw_evaluateDetRatios(const RefVectorWithLeadergetSplinePtr(); @@ -379,7 +379,7 @@ void SplineC2COMPTarget::mw_evaluateDetRatios(const RefVectorWithLeader(buffer_H2D_ptr + nw * sizeof(ValueType*) + mw_nVP * 6 * sizeof(ST)); auto* restrict psiinv_ptr = reinterpret_cast(buffer_H2D_ptr)[ref_id_ptr[iat]]; auto* restrict pos_scratch = reinterpret_cast(buffer_H2D_ptr + nw * sizeof(ValueType*)); @@ -397,7 +397,7 @@ void SplineC2COMPTarget::mw_evaluateDetRatios(const RefVectorWithLeader::evaluateVGL(const ParticleSet& P, offload_scratch.resize(padded_size * 10); const auto orb_size = psi.size(); // for V(1)G(3)L(1) final result - results_scratch.resize(orb_size * 5); + results_scratch.resize(padded_size * 5); // Ye: need to extract sizes and pointers before entering target region const auto* spline_ptr = SplineInst->getSplinePtr(); @@ -523,7 +523,7 @@ void SplineC2COMPTarget::evaluateVGL(const ParticleSet& P, { ScopedTimer offload(offload_timer_); PRAGMA_OFFLOAD("omp target teams distribute num_teams(NumTeams) \ - map(always, from: results_scratch_ptr[0:orb_size*5])") + map(always, from: results_scratch_ptr[0:padded_size*5])") for (int team_id = 0; team_id < NumTeams; team_id++) { const size_t first = ChunkSizePerTeam * team_id; @@ -548,7 +548,7 @@ void SplineC2COMPTarget::evaluateVGL(const ParticleSet& P, const size_t last_cplx = omptarget::min(last / 2, orb_size); PRAGMA_OFFLOAD("omp parallel for") for (int index = first_cplx; index < last_cplx; index++) - C2C::assign_vgl(x, y, z, results_scratch_ptr, mKK_ptr, orb_size, offload_scratch_ptr, padded_size, symGGt, G, + C2C::assign_vgl(x, y, z, results_scratch_ptr, padded_size, mKK_ptr, offload_scratch_ptr, padded_size, symGGt, G, myKcart_ptr, myKcart_padded_size, first_spo_local, index); } } @@ -556,10 +556,10 @@ void SplineC2COMPTarget::evaluateVGL(const ParticleSet& P, for (size_t i = 0; i < orb_size; i++) { psi[i] = results_scratch[i]; - dpsi[i][0] = results_scratch[orb_size + i * 3]; - dpsi[i][1] = results_scratch[orb_size + i * 3 + 1]; - dpsi[i][2] = results_scratch[orb_size + i * 3 + 2]; - d2psi[i] = results_scratch[orb_size * 4 + i]; + dpsi[i][0] = results_scratch[i + padded_size]; + dpsi[i][1] = results_scratch[i + padded_size * 2]; + dpsi[i][2] = results_scratch[i + padded_size * 3]; + d2psi[i] = results_scratch[i + padded_size * 4]; } } @@ -579,7 +579,7 @@ void SplineC2COMPTarget::evaluateVGLMultiPos(const VectorgetSplinePtr(); @@ -597,7 +597,7 @@ void SplineC2COMPTarget::evaluateVGLMultiPos(const Vector::evaluateVGLMultiPos(const Vector::evaluateVGLMultiPos(const Vector::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL const RefVectorWithLeader& P_list, int iat, const std::vector& invRow_ptr_list, - VGLVector& phi_vgl_v, + OffloadMWVGLArray& phi_vgl_v, std::vector& ratios, std::vector& grads) const { @@ -732,14 +732,14 @@ void SplineC2COMPTarget::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL } const size_t num_pos = nwalkers; + const auto orb_size = phi_vgl_v.size(2); + const auto padded_size = myV.size(); const size_t ChunkSizePerTeam = 512; const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam; - const auto padded_size = myV.size(); // for V(1)G(3)H(6) intermediate result mw_offload_scratch.resize(padded_size * num_pos * 10); - const auto orb_size = phi_vgl_v.size() / num_pos; // for V(1)G(3)L(1) final result - mw_results_scratch.resize(orb_size * num_pos * 5); + mw_results_scratch.resize(padded_size * num_pos * 5); // per team ratio and grads rg_private.resize(num_pos, NumTeams * 4); @@ -757,7 +757,7 @@ void SplineC2COMPTarget::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL auto* rg_private_ptr = rg_private.data(); const size_t buffer_H2D_stride = buffer_H2D.cols(); const size_t first_spo_local = first_spo; - const size_t phi_vgl_stride = phi_vgl_v.capacity(); + const size_t phi_vgl_stride = num_pos * orb_size; { ScopedTimer offload(offload_timer_); @@ -771,7 +771,7 @@ void SplineC2COMPTarget::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL const size_t last = omptarget::min(first + ChunkSizePerTeam, padded_size); auto* restrict offload_scratch_iw_ptr = offload_scratch_ptr + padded_size * iw * 10; - auto* restrict psi_iw_ptr = results_scratch_ptr + orb_size * iw * 5; + auto* restrict psi_iw_ptr = results_scratch_ptr + padded_size * iw * 5; const auto* restrict pos_iw_ptr = reinterpret_cast(buffer_H2D_ptr + buffer_H2D_stride * iw); const auto* restrict invRow_iw_ptr = *reinterpret_cast(buffer_H2D_ptr + buffer_H2D_stride * iw + sizeof(ST) * 6); @@ -798,17 +798,21 @@ void SplineC2COMPTarget::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL const size_t last_cplx = omptarget::min(last / 2, orb_size); PRAGMA_OFFLOAD("omp parallel for") for (int index = first_cplx; index < last_cplx; index++) - C2C::assign_vgl(pos_iw_ptr[0], pos_iw_ptr[1], pos_iw_ptr[2], psi_iw_ptr, mKK_ptr, orb_size, + C2C::assign_vgl(pos_iw_ptr[0], pos_iw_ptr[1], pos_iw_ptr[2], psi_iw_ptr, padded_size, mKK_ptr, offload_scratch_iw_ptr, padded_size, symGGt, G, myKcart_ptr, myKcart_padded_size, first_spo_local, index); - ValueType* restrict psi = psi_iw_ptr; - ValueType* restrict dpsi = psi_iw_ptr + orb_size; - ValueType* restrict d2psi = psi_iw_ptr + orb_size * 4; + ValueType* restrict psi = psi_iw_ptr; + ValueType* restrict dpsi_x = psi_iw_ptr + padded_size; + ValueType* restrict dpsi_y = psi_iw_ptr + padded_size * 2; + ValueType* restrict dpsi_z = psi_iw_ptr + padded_size * 3; + ValueType* restrict d2psi = psi_iw_ptr + padded_size * 4; - ValueType* restrict out_phi_v = phi_vgl_ptr + iw * orb_size; - ValueType* restrict out_phi_g = phi_vgl_ptr + phi_vgl_stride + iw * orb_size * 3; - ValueType* restrict out_phi_l = phi_vgl_ptr + phi_vgl_stride * 4 + iw * orb_size; + ValueType* restrict out_phi = phi_vgl_ptr + iw * orb_size; + ValueType* restrict out_dphi_x = out_phi + phi_vgl_stride; + ValueType* restrict out_dphi_y = out_dphi_x + phi_vgl_stride; + ValueType* restrict out_dphi_z = out_dphi_y + phi_vgl_stride; + ValueType* restrict out_d2phi = out_dphi_z + phi_vgl_stride; ValueType ratio(0), grad_x(0), grad_y(0), grad_z(0); PRAGMA_OFFLOAD("omp parallel for reduction(+: ratio, grad_x, grad_y, grad_z)") @@ -816,16 +820,16 @@ void SplineC2COMPTarget::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL { const size_t psiIndex = first_spo_local + j; - out_phi_v[psiIndex] = psi[psiIndex]; - out_phi_l[psiIndex] = d2psi[psiIndex]; - out_phi_g[psiIndex * 3] = dpsi[psiIndex * 3]; - out_phi_g[psiIndex * 3 + 1] = dpsi[psiIndex * 3 + 1]; - out_phi_g[psiIndex * 3 + 2] = dpsi[psiIndex * 3 + 2]; + out_phi[psiIndex] = psi[psiIndex]; + out_dphi_x[psiIndex] = dpsi_x[psiIndex]; + out_dphi_y[psiIndex] = dpsi_y[psiIndex]; + out_dphi_z[psiIndex] = dpsi_z[psiIndex]; + out_d2phi[psiIndex] = d2psi[psiIndex]; ratio += psi[psiIndex] * invRow_iw_ptr[psiIndex]; - grad_x += dpsi[psiIndex * 3] * invRow_iw_ptr[psiIndex]; - grad_y += dpsi[psiIndex * 3 + 1] * invRow_iw_ptr[psiIndex]; - grad_z += dpsi[psiIndex * 3 + 2] * invRow_iw_ptr[psiIndex]; + grad_x += dpsi_x[psiIndex] * invRow_iw_ptr[psiIndex]; + grad_y += dpsi_y[psiIndex] * invRow_iw_ptr[psiIndex]; + grad_z += dpsi_z[psiIndex] * invRow_iw_ptr[psiIndex]; } rg_private_ptr[(iw * NumTeams + team_id) * 4] = ratio; diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.h b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.h index 2a006b54d9..25e5d1c8d8 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.h +++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.h @@ -267,7 +267,7 @@ class SplineC2COMPTarget : public BsplineSet const RefVectorWithLeader& P_list, int iat, const std::vector& invRow_ptr_list, - VGLVector& phi_vgl_v, + OffloadMWVGLArray& phi_vgl_v, std::vector& ratios, std::vector& grads) const override; diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.cpp index 60c0c7d5fe..387d24a0ee 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.cpp +++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.cpp @@ -25,7 +25,6 @@ inline void assign_v(ST x, ST y, ST z, TT* restrict results_scratch_ptr, - size_t orb_size, const ST* restrict offload_scratch_ptr, const ST* restrict myKcart_ptr, size_t myKcart_padded_size, @@ -61,8 +60,8 @@ inline void assign_vgl(ST x, ST y, ST z, TT* restrict results_scratch_ptr, + size_t orb_padded_size, const ST* mKK_ptr, - size_t orb_size, const ST* restrict offload_scratch_ptr, size_t spline_padded_size, const ST symGGt[6], @@ -92,10 +91,6 @@ inline void assign_vgl(ST x, const ST* restrict h12 = offload_scratch_ptr + spline_padded_size * 8; const ST* restrict h22 = offload_scratch_ptr + spline_padded_size * 9; - TT* restrict psi = results_scratch_ptr; - TT* restrict dpsi = results_scratch_ptr + orb_size; - TT* restrict d2psi = results_scratch_ptr + orb_size * 4; - const size_t jr = index << 1; const size_t ji = jr + 1; @@ -131,20 +126,27 @@ inline void assign_vgl(ST x, const ST lap_r = lcart_r + mKK_ptr[index] * val_r + two * (kX * dX_i + kY * dY_i + kZ * dZ_i); const ST lap_i = lcart_i + mKK_ptr[index] * val_i - two * (kX * dX_r + kY * dY_r + kZ * dZ_r); - const size_t psiIndex = first_spo + index + omptarget::min(index, nComplexBands); - psi[psiIndex] = c * val_r - s * val_i; - d2psi[psiIndex] = c * lap_r - s * lap_i; - dpsi[psiIndex * 3] = c * gX_r - s * gX_i; - dpsi[psiIndex * 3 + 1] = c * gY_r - s * gY_i; - dpsi[psiIndex * 3 + 2] = c * gZ_r - s * gZ_i; + TT* restrict psi = results_scratch_ptr; + TT* restrict dpsi_x = results_scratch_ptr + orb_padded_size; + TT* restrict dpsi_y = results_scratch_ptr + orb_padded_size * 2; + TT* restrict dpsi_z = results_scratch_ptr + orb_padded_size * 3; + TT* restrict d2psi = results_scratch_ptr + orb_padded_size * 4; + + const size_t psiIndex = first_spo + index + omptarget::min(index, nComplexBands); + + psi[psiIndex] = c * val_r - s * val_i; + d2psi[psiIndex] = c * lap_r - s * lap_i; + dpsi_x[psiIndex] = c * gX_r - s * gX_i; + dpsi_y[psiIndex] = c * gY_r - s * gY_i; + dpsi_z[psiIndex] = c * gZ_r - s * gZ_i; if (index < nComplexBands) { - psi[psiIndex + 1] = c * val_i + s * val_r; - d2psi[psiIndex + 1] = c * lap_i + s * lap_r; - dpsi[psiIndex * 3 + 3] = c * gX_i + s * gX_r; - dpsi[psiIndex * 3 + 4] = c * gY_i + s * gY_r; - dpsi[psiIndex * 3 + 5] = c * gZ_i + s * gZ_r; + psi[psiIndex + 1] = c * val_i + s * val_r; + d2psi[psiIndex + 1] = c * lap_i + s * lap_r; + dpsi_x[psiIndex + 1] = c * gX_i + s * gX_r; + dpsi_y[psiIndex + 1] = c * gY_i + s * gY_r; + dpsi_z[psiIndex + 1] = c * gZ_i + s * gZ_r; } } } // namespace C2R @@ -277,8 +279,8 @@ void SplineC2ROMPTarget::evaluateValue(const ParticleSet& P, const int iat, const size_t last_cplx = omptarget::min(last / 2, orb_size); PRAGMA_OFFLOAD("omp parallel for") for (int index = first_cplx; index < last_cplx; index++) - C2R::assign_v(x, y, z, psi_ptr, orb_size, offload_scratch_ptr, myKcart_ptr, myKcart_padded_size, - first_spo_local, nComplexBands_local, index); + C2R::assign_v(x, y, z, psi_ptr, offload_scratch_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local, + nComplexBands_local, index); } } } @@ -316,7 +318,7 @@ void SplineC2ROMPTarget::evaluateDetRatios(const VirtualParticleSet& VP, const auto padded_size = myV.size(); offload_scratch.resize(padded_size * nVP); const auto orb_size = psiinv.size(); - results_scratch.resize(orb_size * nVP); + results_scratch.resize(padded_size * nVP); // Ye: need to extract sizes and pointers before entering target region const auto* spline_ptr = SplineInst->getSplinePtr(); @@ -341,7 +343,7 @@ void SplineC2ROMPTarget::evaluateDetRatios(const VirtualParticleSet& VP, const size_t last = omptarget::min(first + ChunkSizePerTeam, padded_size); auto* restrict offload_scratch_iat_ptr = offload_scratch_ptr + padded_size * iat; - auto* restrict psi_iat_ptr = results_scratch_ptr + orb_size * iat; + auto* restrict psi_iat_ptr = results_scratch_ptr + padded_size * iat; auto* restrict pos_scratch = psiinv_ptr + orb_size; int ix, iy, iz; @@ -358,8 +360,8 @@ void SplineC2ROMPTarget::evaluateDetRatios(const VirtualParticleSet& VP, PRAGMA_OFFLOAD("omp parallel for") for (int index = first_cplx; index < last_cplx; index++) C2R::assign_v(ST(pos_scratch[iat * 6]), ST(pos_scratch[iat * 6 + 1]), ST(pos_scratch[iat * 6 + 2]), - psi_iat_ptr, orb_size, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, - first_spo_local, nComplexBands_local, index); + psi_iat_ptr, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local, + nComplexBands_local, index); const size_t first_real = first_cplx + omptarget::min(nComplexBands_local, first_cplx); const size_t last_real = last_cplx + omptarget::min(nComplexBands_local, last_cplx); @@ -438,7 +440,7 @@ void SplineC2ROMPTarget::mw_evaluateDetRatios(const RefVectorWithLeadergetSplinePtr(); @@ -463,7 +465,7 @@ void SplineC2ROMPTarget::mw_evaluateDetRatios(const RefVectorWithLeader(buffer_H2D_ptr + nw * sizeof(ValueType*) + mw_nVP * 6 * sizeof(TT)); auto* restrict psiinv_ptr = reinterpret_cast(buffer_H2D_ptr)[ref_id_ptr[iat]]; auto* restrict pos_scratch = reinterpret_cast(buffer_H2D_ptr + nw * sizeof(ValueType*)); @@ -482,8 +484,8 @@ void SplineC2ROMPTarget::mw_evaluateDetRatios(const RefVectorWithLeader::evaluateVGL(const ParticleSet& P, offload_scratch.resize(padded_size * 10); const auto orb_size = psi.size(); // for V(1)G(3)L(1) final result - results_scratch.resize(orb_size * 5); + results_scratch.resize(padded_size * 5); // Ye: need to extract sizes and pointers before entering target region const auto* spline_ptr = SplineInst->getSplinePtr(); @@ -666,7 +668,7 @@ void SplineC2ROMPTarget::evaluateVGL(const ParticleSet& P, { ScopedTimer offload(offload_timer_); PRAGMA_OFFLOAD("omp target teams distribute num_teams(NumTeams) \ - map(always, from: results_scratch_ptr[0:orb_size*5])") + map(always, from: results_scratch_ptr[0:padded_size*5])") for (int team_id = 0; team_id < NumTeams; team_id++) { const size_t first = ChunkSizePerTeam * team_id; @@ -691,7 +693,7 @@ void SplineC2ROMPTarget::evaluateVGL(const ParticleSet& P, const size_t last_cplx = omptarget::min(last / 2, orb_size); PRAGMA_OFFLOAD("omp parallel for") for (int index = first_cplx; index < last_cplx; index++) - C2R::assign_vgl(x, y, z, results_scratch_ptr, mKK_ptr, orb_size, offload_scratch_ptr, padded_size, symGGt, G, + C2R::assign_vgl(x, y, z, results_scratch_ptr, padded_size, mKK_ptr, offload_scratch_ptr, padded_size, symGGt, G, myKcart_ptr, myKcart_padded_size, first_spo_local, nComplexBands_local, index); } } @@ -699,10 +701,10 @@ void SplineC2ROMPTarget::evaluateVGL(const ParticleSet& P, for (size_t i = 0; i < orb_size; i++) { psi[i] = results_scratch[i]; - dpsi[i][0] = results_scratch[orb_size + i * 3]; - dpsi[i][1] = results_scratch[orb_size + i * 3 + 1]; - dpsi[i][2] = results_scratch[orb_size + i * 3 + 2]; - d2psi[i] = results_scratch[orb_size * 4 + i]; + dpsi[i][0] = results_scratch[i + padded_size * 1]; + dpsi[i][1] = results_scratch[i + padded_size * 2]; + dpsi[i][2] = results_scratch[i + padded_size * 3]; + d2psi[i] = results_scratch[i + padded_size * 4]; } } @@ -722,7 +724,7 @@ void SplineC2ROMPTarget::evaluateVGLMultiPos(const VectorgetSplinePtr(); @@ -741,7 +743,7 @@ void SplineC2ROMPTarget::evaluateVGLMultiPos(const Vector::evaluateVGLMultiPos(const Vector::evaluateVGLMultiPos(const Vector::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL const RefVectorWithLeader& P_list, int iat, const std::vector& invRow_ptr_list, - VGLVector& phi_vgl_v, + OffloadMWVGLArray& phi_vgl_v, std::vector& ratios, std::vector& grads) const { @@ -876,14 +878,14 @@ void SplineC2ROMPTarget::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL } const size_t num_pos = nwalkers; + const auto orb_size = phi_vgl_v.size(2); + const auto padded_size = myV.size(); const size_t ChunkSizePerTeam = 512; const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam; - const auto padded_size = myV.size(); // for V(1)G(3)H(6) intermediate result mw_offload_scratch.resize(padded_size * num_pos * 10); - const auto orb_size = phi_vgl_v.size() / num_pos; // for V(1)G(3)L(1) final result - mw_results_scratch.resize(orb_size * num_pos * 5); + mw_results_scratch.resize(padded_size * num_pos * 5); // per team ratio and grads rg_private.resize(num_pos, NumTeams * 4); @@ -901,7 +903,7 @@ void SplineC2ROMPTarget::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL auto* rg_private_ptr = rg_private.data(); const size_t buffer_H2D_stride = buffer_H2D.cols(); const size_t first_spo_local = first_spo; - const size_t phi_vgl_stride = phi_vgl_v.capacity(); + const size_t phi_vgl_stride = num_pos * orb_size; const size_t nComplexBands_local = nComplexBands; { @@ -916,7 +918,7 @@ void SplineC2ROMPTarget::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL const size_t last = omptarget::min(first + ChunkSizePerTeam, padded_size); auto* restrict offload_scratch_iw_ptr = offload_scratch_ptr + padded_size * iw * 10; - auto* restrict psi_iw_ptr = results_scratch_ptr + orb_size * iw * 5; + auto* restrict psi_iw_ptr = results_scratch_ptr + padded_size * iw * 5; const auto* restrict pos_iw_ptr = reinterpret_cast(buffer_H2D_ptr + buffer_H2D_stride * iw); const auto* restrict invRow_iw_ptr = *reinterpret_cast(buffer_H2D_ptr + buffer_H2D_stride * iw + sizeof(ST) * 6); @@ -943,17 +945,21 @@ void SplineC2ROMPTarget::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL const size_t last_cplx = omptarget::min(last / 2, orb_size); PRAGMA_OFFLOAD("omp parallel for") for (int index = first_cplx; index < last_cplx; index++) - C2R::assign_vgl(pos_iw_ptr[0], pos_iw_ptr[1], pos_iw_ptr[2], psi_iw_ptr, mKK_ptr, orb_size, + C2R::assign_vgl(pos_iw_ptr[0], pos_iw_ptr[1], pos_iw_ptr[2], psi_iw_ptr, padded_size, mKK_ptr, offload_scratch_iw_ptr, padded_size, symGGt, G, myKcart_ptr, myKcart_padded_size, first_spo_local, nComplexBands_local, index); - ValueType* restrict psi = psi_iw_ptr; - ValueType* restrict dpsi = psi_iw_ptr + orb_size; - ValueType* restrict d2psi = psi_iw_ptr + orb_size * 4; + ValueType* restrict psi = psi_iw_ptr; + ValueType* restrict dpsi_x = psi_iw_ptr + padded_size; + ValueType* restrict dpsi_y = psi_iw_ptr + padded_size * 2; + ValueType* restrict dpsi_z = psi_iw_ptr + padded_size * 3; + ValueType* restrict d2psi = psi_iw_ptr + padded_size * 4; - ValueType* restrict out_phi_v = phi_vgl_ptr + iw * orb_size; - ValueType* restrict out_phi_g = phi_vgl_ptr + phi_vgl_stride + iw * orb_size * 3; - ValueType* restrict out_phi_l = phi_vgl_ptr + phi_vgl_stride * 4 + iw * orb_size; + ValueType* restrict out_phi = phi_vgl_ptr + iw * orb_size; + ValueType* restrict out_dphi_x = out_phi + phi_vgl_stride; + ValueType* restrict out_dphi_y = out_dphi_x + phi_vgl_stride; + ValueType* restrict out_dphi_z = out_dphi_y + phi_vgl_stride; + ValueType* restrict out_d2phi = out_dphi_z + phi_vgl_stride; const size_t first_real = first_cplx + omptarget::min(nComplexBands_local, first_cplx); const size_t last_real = last_cplx + omptarget::min(nComplexBands_local, last_cplx); @@ -961,16 +967,16 @@ void SplineC2ROMPTarget::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL PRAGMA_OFFLOAD("omp parallel for reduction(+: ratio, grad_x, grad_y, grad_z)") for (size_t j = first_real; j < last_real; j++) { - out_phi_v[j] = psi[j]; - out_phi_l[j] = d2psi[j]; - out_phi_g[j * 3] = dpsi[j * 3]; - out_phi_g[j * 3 + 1] = dpsi[j * 3 + 1]; - out_phi_g[j * 3 + 2] = dpsi[j * 3 + 2]; + out_phi[j] = psi[j]; + out_dphi_x[j] = dpsi_x[j]; + out_dphi_y[j] = dpsi_y[j]; + out_dphi_z[j] = dpsi_z[j]; + out_d2phi[j] = d2psi[j]; ratio += psi[j] * invRow_iw_ptr[j]; - grad_x += dpsi[j * 3] * invRow_iw_ptr[j]; - grad_y += dpsi[j * 3 + 1] * invRow_iw_ptr[j]; - grad_z += dpsi[j * 3 + 2] * invRow_iw_ptr[j]; + grad_x += dpsi_x[j] * invRow_iw_ptr[j]; + grad_y += dpsi_y[j] * invRow_iw_ptr[j]; + grad_z += dpsi_z[j] * invRow_iw_ptr[j]; } rg_private_ptr[(iw * NumTeams + team_id) * 4] = ratio; diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.h b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.h index c1f311ee78..b183e6256c 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.h +++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.h @@ -276,7 +276,7 @@ class SplineC2ROMPTarget : public BsplineSet const RefVectorWithLeader& P_list, int iat, const std::vector& invRow_ptr_list, - VGLVector& phi_vgl_v, + OffloadMWVGLArray& phi_vgl_v, std::vector& ratios, std::vector& grads) const override; diff --git a/src/QMCWaveFunctions/Fermion/BackflowTransformation.cpp b/src/QMCWaveFunctions/Fermion/BackflowTransformation.cpp index 62f8e259cf..7417bd09f5 100644 --- a/src/QMCWaveFunctions/Fermion/BackflowTransformation.cpp +++ b/src/QMCWaveFunctions/Fermion/BackflowTransformation.cpp @@ -388,8 +388,7 @@ void BackflowTransformation::evaluateDerivatives(const ParticleSet& P) Bmat_full = 0.0; Cmat = 0.0; Ymat = 0.0; - for (int i = 0; i < Xmat.size(); i++) - Xmat(i) = 0; + std::fill_n(Xmat.data(), Xmat.size(), 0); for (int i = 0; i < NumTargets; i++) { QP.R[i] = P.R[i]; @@ -414,9 +413,7 @@ void BackflowTransformation::testDeriv(const ParticleSet& P) Bmat_full = 0.0; Cmat = 0.0; Ymat = 0.0; - // Xmat=DummyHess; - for (int i = 0; i < Xmat.size(); i++) - Xmat(i) = 0; + std::fill_n(Xmat.data(), Xmat.size(), 0); for (int i = 0; i < NumTargets; i++) { QP.R[i] = P.R[i]; diff --git a/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h b/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h index ee3582e3b5..aa0cc1eb0f 100644 --- a/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h +++ b/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h @@ -20,11 +20,12 @@ #include "QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp" #include "DiracMatrix.h" #include "PrefetchedRange.h" +#include "syclSolverInverter.hpp" + //#define SYCL_BLOCKING namespace qmcplusplus { - /** implements delayed update on Intel GPU using SYCL * @tparam T base precision for most computation * @tparam T_FP high precision for matrix inversion, T_FP >= T @@ -50,7 +51,7 @@ class DelayedUpdateSYCL /// current number of delays, increase one for each acceptance, reset to 0 after updating Ainv int delay_count; - DiracMatrix host_inverter_; + syclSolverInverter sycl_inverter_; // the range of prefetched_Ainv_rows PrefetchedRange prefetched_range; @@ -102,8 +103,9 @@ class DelayedUpdateSYCL template void invert_transpose(const Matrix& logdetT, Matrix& Ainv, std::complex& log_value) { - host_inverter_.invert_transpose(logdetT, Ainv, log_value); - initializeInv(Ainv); + clearDelayCount(); + + sycl_inverter_.invert_transpose(logdetT, Ainv, Ainv_gpu, log_value, m_queue_); } /** initialize internal objects when Ainv is refreshed @@ -215,11 +217,11 @@ class DelayedUpdateSYCL #ifdef SYCL_BLOCKING syclBLAS::gemm(m_queue_, 'N', 'N', norb, norb, delay_count, -cone, U_gpu.data(), norb, temp_gpu.data(), lda_Binv, - cone, Ainv_gpu.data(), norb, {u_event, temp_v_event}) + cone, Ainv_gpu.data(), norb, {u_event}) .wait(); #else ainv_event_ = syclBLAS::gemm(m_queue_, 'N', 'N', norb, norb, delay_count, -cone, U_gpu.data(), norb, - temp_gpu.data(), lda_Binv, cone, Ainv_gpu.data(), norb, {u_event, temp_v_event}); + temp_gpu.data(), lda_Binv, cone, Ainv_gpu.data(), norb, {u_event}); #endif clearDelayCount(); diff --git a/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.cpp b/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.cpp index b70f3431d0..fd877e858e 100644 --- a/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.cpp +++ b/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.cpp @@ -232,13 +232,11 @@ void DiracDeterminantBatched::mw_ratioGrad(const RefVectorWithLeader auto psiMinv_row_dev_ptr_list = DET_ENGINE::mw_getInvRow(engine_list, WorkingIndex, !Phi->isOMPoffload()); - phi_vgl_v.resize(NumOrbitals * wfc_list.size()); + phi_vgl_v.resize(DIM_VGL, wfc_list.size(), NumOrbitals); ratios_local.resize(wfc_list.size()); grad_new_local.resize(wfc_list.size()); - VectorSoaContainer phi_vgl_v_view(phi_vgl_v.data(), NumOrbitals * wfc_list.size(), - phi_vgl_v.capacity()); - wfc_leader.Phi->mw_evaluateVGLandDetRatioGrads(phi_list, p_list, iat, psiMinv_row_dev_ptr_list, phi_vgl_v_view, + wfc_leader.Phi->mw_evaluateVGLandDetRatioGrads(phi_list, p_list, iat, psiMinv_row_dev_ptr_list, phi_vgl_v, ratios_local, grad_new_local); } @@ -317,15 +315,8 @@ void DiracDeterminantBatched::mw_accept_rejectMove( det.curRatio = 1.0; } - if (!Phi->isOMPoffload() && n_accepted > 0) - { - auto* phi_vgl_v_ptr = phi_vgl_v.data(); - // transfer host to device, total size 5, v(1) + g(3) + l(1) - PRAGMA_OFFLOAD("omp target update to(phi_vgl_v_ptr[:phi_vgl_v.capacity()*5])") - } - DET_ENGINE::mw_accept_rejectRow(engine_list, WorkingIndex, psiM_g_dev_ptr_list, psiM_l_dev_ptr_list, isAccepted, - phi_vgl_v.device_data(), phi_vgl_v.capacity(), ratios_local); + phi_vgl_v, ratios_local); if (!safe_to_delay) DET_ENGINE::mw_updateInvMat(engine_list); @@ -595,15 +586,13 @@ void DiracDeterminantBatched::mw_calcRatio(const RefVectorWithLeader auto psiMinv_row_dev_ptr_list = DET_ENGINE::mw_getInvRow(engine_list, WorkingIndex, !Phi->isOMPoffload()); - phi_vgl_v.resize(NumOrbitals * wfc_list.size()); + phi_vgl_v.resize(DIM_VGL, wfc_list.size(), NumOrbitals); ratios_local.resize(wfc_list.size()); grad_new_local.resize(wfc_list.size()); - VectorSoaContainer phi_vgl_v_view(phi_vgl_v.data(), NumOrbitals * wfc_list.size(), - phi_vgl_v.capacity()); // calling Phi->mw_evaluateVGLandDetRatioGrads is a temporary workaround. // We may implement mw_evaluateVandDetRatio in the future. - wfc_leader.Phi->mw_evaluateVGLandDetRatioGrads(phi_list, p_list, iat, psiMinv_row_dev_ptr_list, phi_vgl_v_view, + wfc_leader.Phi->mw_evaluateVGLandDetRatioGrads(phi_list, p_list, iat, psiMinv_row_dev_ptr_list, phi_vgl_v, ratios_local, grad_new_local); } diff --git a/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.h b/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.h index 176a00db44..04c5de28f6 100644 --- a/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.h +++ b/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.h @@ -55,6 +55,8 @@ class DiracDeterminantBatched : public DiracDeterminantBase using DualMatrix = Matrix>; using DualVGLVector = VectorSoaContainer>; + using OffloadMWVGLArray = typename SPOSet::OffloadMWVGLArray; + struct DiracDeterminantBatchedMultiWalkerResource : public Resource { DiracDeterminantBatchedMultiWalkerResource() : Resource("DiracDeterminantBatched") {} @@ -64,8 +66,8 @@ class DiracDeterminantBatched : public DiracDeterminantBase Resource* makeClone() const override { return new DiracDeterminantBatchedMultiWalkerResource(*this); } DualVector log_values; - /// value, grads, laplacian of single-particle orbital for particle-by-particle update and multi walker [5][nw*norb] - DualVGLVector phi_vgl_v; + /// value, grads, laplacian of single-particle orbital for particle-by-particle update and multi walker [5][nw][norb] + OffloadMWVGLArray phi_vgl_v; // multi walker of ratio std::vector ratios_local; // multi walker of grads diff --git a/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h b/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h index 7775b36441..a2fbc37d80 100644 --- a/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h +++ b/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h @@ -60,6 +60,8 @@ class MatrixDelayedUpdateCUDA using DualMatrix = Matrix>; template using DualVGLVector = VectorSoaContainer>; + template + using OffloadMWVGLArray = Array>; // [VGL, walker, Orbs] struct MatrixDelayedUpdateCUDAMultiWalkerMem : public Resource { @@ -204,11 +206,13 @@ class MatrixDelayedUpdateCUDA const int norb = engine_leader.get_psiMinv().rows(); const int nw = engines.size(); int& delay_count = engine_leader.delay_count; - prepare_inv_row_buffer_H2D.resize(sizeof(Value*) * 7 * nw); + + constexpr size_t num_ptrs_packed = 7; // it must match packing and unpacking + prepare_inv_row_buffer_H2D.resize(sizeof(Value*) * num_ptrs_packed * nw); engine_leader.resize_fill_constant_arrays(nw); const int lda_Binv = engine_leader.Binv_gpu.cols(); - Matrix ptr_buffer(reinterpret_cast(prepare_inv_row_buffer_H2D.data()), 7, nw); + Matrix ptr_buffer(reinterpret_cast(prepare_inv_row_buffer_H2D.data()), num_ptrs_packed, nw); for (int iw = 0; iw < nw; iw++) { This_t& engine = engines[iw]; @@ -269,8 +273,7 @@ class MatrixDelayedUpdateCUDA * \param[in] psiM_g_list device ptrs * \param[in] psiM_l_list device ptrs * \param[in] isAccepted bool but wait some lists are also filtered - * \param[in] phi_vgl_v_dev_ptr device ptr - * \param[in] phi_vgl_stride size of each "vector" in phi_vgl_v + * \param[in] phi_vgl_v multiple walker orbital VGL * \param[inout] ratios */ static void mw_updateRow(const RefVectorWithLeader& engines, @@ -278,8 +281,7 @@ class MatrixDelayedUpdateCUDA const std::vector& psiM_g_list, const std::vector& psiM_l_list, const std::vector& isAccepted, - const Value* phi_vgl_v_dev_ptr, - const size_t phi_vgl_stride, + const OffloadMWVGLArray& phi_vgl_v, const std::vector& ratios) { auto& engine_leader = engines.getLeader(); @@ -293,32 +295,35 @@ class MatrixDelayedUpdateCUDA if (n_accepted == 0) return; - auto& hstream = engine_leader.cuda_handles_->hstream; - auto& updateRow_buffer_H2D = engine_leader.mw_mem_->updateRow_buffer_H2D; - auto& mw_temp = engine_leader.mw_mem_->mw_temp; - auto& mw_rcopy = engine_leader.mw_mem_->mw_rcopy; - auto& cone_vec = engine_leader.mw_mem_->cone_vec; - auto& czero_vec = engine_leader.mw_mem_->czero_vec; - const int norb = engine_leader.get_ref_psiMinv().rows(); - const int lda = engine_leader.get_ref_psiMinv().cols(); + auto& hstream = engine_leader.cuda_handles_->hstream; + auto& updateRow_buffer_H2D = engine_leader.mw_mem_->updateRow_buffer_H2D; + auto& mw_temp = engine_leader.mw_mem_->mw_temp; + auto& mw_rcopy = engine_leader.mw_mem_->mw_rcopy; + auto& cone_vec = engine_leader.mw_mem_->cone_vec; + auto& czero_vec = engine_leader.mw_mem_->czero_vec; + const int norb = engine_leader.get_ref_psiMinv().rows(); + const int lda = engine_leader.get_ref_psiMinv().cols(); + const int nw = engines.size(); + const size_t phi_vgl_stride = nw * norb; mw_temp.resize(norb * n_accepted); mw_rcopy.resize(norb * n_accepted); - updateRow_buffer_H2D.resize((sizeof(Value*) * 8 + sizeof(Value)) * n_accepted); + + constexpr size_t num_ptrs_packed = 6; // it must match packing and unpacking + updateRow_buffer_H2D.resize((sizeof(Value*) * num_ptrs_packed + sizeof(Value)) * n_accepted); // to handle T** of Ainv, psi_v, temp, rcopy - Matrix ptr_buffer(reinterpret_cast(updateRow_buffer_H2D.data()), 8, n_accepted); - Value* c_ratio_inv = reinterpret_cast(updateRow_buffer_H2D.data() + sizeof(Value*) * 8 * n_accepted); + Matrix ptr_buffer(reinterpret_cast(updateRow_buffer_H2D.data()), num_ptrs_packed, n_accepted); + Value* c_ratio_inv = + reinterpret_cast(updateRow_buffer_H2D.data() + sizeof(Value*) * num_ptrs_packed * n_accepted); for (int iw = 0, count = 0; iw < isAccepted.size(); iw++) if (isAccepted[iw]) { ptr_buffer[0][count] = engines[iw].get_ref_psiMinv().device_data(); - ptr_buffer[1][count] = const_cast(phi_vgl_v_dev_ptr + norb * iw); + ptr_buffer[1][count] = const_cast(phi_vgl_v.device_data_at(0, iw, 0)); ptr_buffer[2][count] = mw_temp.device_data() + norb * count; ptr_buffer[3][count] = mw_rcopy.device_data() + norb * count; ptr_buffer[4][count] = psiM_g_list[count]; ptr_buffer[5][count] = psiM_l_list[count]; - ptr_buffer[6][count] = const_cast(phi_vgl_v_dev_ptr + phi_vgl_stride + norb * 3 * iw); - ptr_buffer[7][count] = const_cast(phi_vgl_v_dev_ptr + phi_vgl_stride * 4 + norb * iw); c_ratio_inv[count] = Value(-1) / ratios[iw]; count++; @@ -333,7 +338,8 @@ class MatrixDelayedUpdateCUDA { Value** Ainv_mw_ptr = reinterpret_cast(updateRow_buffer_H2D.device_data()); - Value** phiV_mw_ptr = reinterpret_cast(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted); + Value** phiVGL_mw_ptr = + reinterpret_cast(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted); Value** temp_mw_ptr = reinterpret_cast(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 2); Value** rcopy_mw_ptr = @@ -342,21 +348,18 @@ class MatrixDelayedUpdateCUDA reinterpret_cast(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 4); Value** d2psiM_mw_out = reinterpret_cast(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 5); - Value** dpsiM_mw_in = - reinterpret_cast(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 6); - Value** d2psiM_mw_in = - reinterpret_cast(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 7); Value* ratio_inv_mw = - reinterpret_cast(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 8); + reinterpret_cast(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 6); + // invoke the Fahy's variant of Sherman-Morrison update. cudaErrorCheck(cuBLAS_MFs::gemv_batched(hstream, 'T', norb, norb, cone_vec.device_data(), Ainv_mw_ptr, lda, - phiV_mw_ptr, 1, czero_vec.device_data(), temp_mw_ptr, 1, n_accepted), + phiVGL_mw_ptr, 1, czero_vec.device_data(), temp_mw_ptr, 1, n_accepted), "cuBLAS_MFs::gemv_batched failed!"); cudaErrorCheck(CUDA::copyAinvRow_saveGL_cuda(hstream, rowchanged, norb, Ainv_mw_ptr, lda, temp_mw_ptr, - rcopy_mw_ptr, dpsiM_mw_in, d2psiM_mw_in, dpsiM_mw_out, d2psiM_mw_out, - n_accepted), + rcopy_mw_ptr, phiVGL_mw_ptr, phi_vgl_stride, dpsiM_mw_out, + d2psiM_mw_out, n_accepted), "CUDA::copyAinvRow_saveGL_cuda failed!"); @@ -450,9 +453,10 @@ class MatrixDelayedUpdateCUDA auto& evalGrad_buffer_H2D = engine_leader.mw_mem_->evalGrad_buffer_H2D; auto& grads_value_v = engine_leader.mw_mem_->grads_value_v; - const int nw = engines.size(); - evalGrad_buffer_H2D.resize(sizeof(Value*) * 2 * nw); - Matrix ptr_buffer(reinterpret_cast(evalGrad_buffer_H2D.data()), 2, nw); + const int nw = engines.size(); + constexpr size_t num_ptrs_packed = 2; // it must match packing and unpacking + evalGrad_buffer_H2D.resize(sizeof(Value*) * num_ptrs_packed * nw); + Matrix ptr_buffer(reinterpret_cast(evalGrad_buffer_H2D.data()), num_ptrs_packed, nw); for (int iw = 0; iw < nw; iw++) { if (engine_leader.isSM1()) @@ -522,13 +526,15 @@ class MatrixDelayedUpdateCUDA int success = ompBLAS::gemv(dummy_handle, 'T', norb, norb, cone, Ainv_ptr, lda, phiV_ptr, 1, czero, temp_ptr, 1); if (success != 0) throw std::runtime_error("ompBLAS::gemv failed."); - PRAGMA_OFFLOAD("omp target is_device_ptr(Ainv_ptr, temp_ptr, rcopy_ptr)") + + PRAGMA_OFFLOAD("omp target parallel for simd is_device_ptr(Ainv_ptr, temp_ptr, rcopy_ptr)") + for (int i = 0; i < norb; i++) { - temp_ptr[rowchanged] -= cone; - PRAGMA_OFFLOAD("omp parallel for simd") - for (int i = 0; i < norb; i++) - rcopy_ptr[i] = Ainv_ptr[rowchanged * lda + i]; + rcopy_ptr[i] = Ainv_ptr[rowchanged * lda + i]; + if (i == 0) + temp_ptr[rowchanged] -= cone; } + success = ompBLAS::ger(dummy_handle, norb, norb, static_cast(FullPrecValue(-1) / c_ratio_in), rcopy_ptr, 1, temp_ptr, 1, Ainv_ptr, lda); if (success != 0) @@ -544,8 +550,7 @@ class MatrixDelayedUpdateCUDA * \param[in] psiM_g_list * \param[in] psiM_l_list * \param[in] isAccepted - * \param[in] phi_vgl_v_dev_ptr - * \param[in] phi_vgl_stride size of each "vector" in phi_vgl_v + * \param[in] phi_vgl_v multiple walker orbital VGL * \param[inout] ratios */ static void mw_accept_rejectRow(const RefVectorWithLeader& engines, @@ -553,8 +558,7 @@ class MatrixDelayedUpdateCUDA const std::vector& psiM_g_list, const std::vector& psiM_l_list, const std::vector& isAccepted, - const Value* phi_vgl_v_dev_ptr, - const size_t phi_vgl_stride, + const OffloadMWVGLArray& phi_vgl_v, const std::vector& ratios) { auto& engine_leader = engines.getLeader(); @@ -563,8 +567,7 @@ class MatrixDelayedUpdateCUDA if (engine_leader.isSM1()) { - mw_updateRow(engines, rowchanged, psiM_g_list, psiM_l_list, isAccepted, phi_vgl_v_dev_ptr, phi_vgl_stride, - ratios); + mw_updateRow(engines, rowchanged, psiM_g_list, psiM_l_list, isAccepted, phi_vgl_v, ratios); return; } @@ -579,11 +582,15 @@ class MatrixDelayedUpdateCUDA const int lda = engine_leader.get_psiMinv().cols(); const int nw = engines.size(); const int n_accepted = psiM_g_list.size(); - accept_rejectRow_buffer_H2D.resize((sizeof(Value*) * 14 + sizeof(Value)) * nw); + const size_t phi_vgl_stride = nw * norb; + + constexpr size_t num_ptrs_packed = 12; // it must match packing and unpacking + accept_rejectRow_buffer_H2D.resize((sizeof(Value*) * num_ptrs_packed + sizeof(Value)) * nw); engine_leader.resize_fill_constant_arrays(nw); - Matrix ptr_buffer(reinterpret_cast(accept_rejectRow_buffer_H2D.data()), 14, nw); - Value* c_ratio_inv = reinterpret_cast(accept_rejectRow_buffer_H2D.data() + sizeof(Value*) * 14 * nw); + Matrix ptr_buffer(reinterpret_cast(accept_rejectRow_buffer_H2D.data()), num_ptrs_packed, nw); + Value* c_ratio_inv = + reinterpret_cast(accept_rejectRow_buffer_H2D.data() + sizeof(Value*) * num_ptrs_packed * nw); for (int iw = 0, count_accepted = 0, count_rejected = 0; iw < nw; iw++) { This_t& engine = engines[iw]; @@ -598,11 +605,9 @@ class MatrixDelayedUpdateCUDA ptr_buffer[6][count_accepted] = engine.Binv_gpu.data() + delay_count; ptr_buffer[7][count_accepted] = reinterpret_cast(engine.delay_list_gpu.data()); ptr_buffer[8][count_accepted] = engine.V_gpu.data() + norb * delay_count; - ptr_buffer[9][count_accepted] = const_cast(phi_vgl_v_dev_ptr + norb * iw); - ptr_buffer[10][count_accepted] = const_cast(phi_vgl_v_dev_ptr + phi_vgl_stride + norb * 3 * iw); - ptr_buffer[11][count_accepted] = const_cast(phi_vgl_v_dev_ptr + phi_vgl_stride * 4 + norb * iw); - ptr_buffer[12][count_accepted] = psiM_g_list[count_accepted]; - ptr_buffer[13][count_accepted] = psiM_l_list[count_accepted]; + ptr_buffer[9][count_accepted] = const_cast(phi_vgl_v.device_data_at(0, iw, 0)); + ptr_buffer[10][count_accepted] = psiM_g_list[count_accepted]; + ptr_buffer[11][count_accepted] = psiM_l_list[count_accepted]; c_ratio_inv[count_accepted] = Value(1) / ratios[iw]; count_accepted++; } @@ -640,18 +645,14 @@ class MatrixDelayedUpdateCUDA reinterpret_cast(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 7); Value** V_row_mw_ptr = reinterpret_cast(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 8); - Value** phiV_mw_ptr = + Value** phiVGL_mw_ptr = reinterpret_cast(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 9); - Value** dpsiM_mw_in = - reinterpret_cast(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 10); - Value** d2psiM_mw_in = - reinterpret_cast(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 11); Value** dpsiM_mw_out = - reinterpret_cast(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 12); + reinterpret_cast(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 10); Value** d2psiM_mw_out = - reinterpret_cast(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 13); + reinterpret_cast(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 11); Value* ratio_inv_mw_ptr = - reinterpret_cast(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 14); + reinterpret_cast(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 12); //std::copy_n(Ainv[rowchanged], norb, V[delay_count]); cudaErrorCheck(cuBLAS_MFs::copy_batched(hstream, norb, invRow_mw_ptr, 1, V_row_mw_ptr, 1, nw), @@ -660,7 +661,7 @@ class MatrixDelayedUpdateCUDA // the new Binv is [[X Y] [Z sigma]] //BLAS::gemv('T', norb, delay_count + 1, cminusone, V.data(), norb, psiV.data(), 1, czero, p.data(), 1); cudaErrorCheck(cuBLAS_MFs::gemv_batched(hstream, 'T', norb, delay_count, cminusone_vec.device_data(), V_mw_ptr, - norb, phiV_mw_ptr, 1, czero_vec.device_data(), p_mw_ptr, 1, n_accepted), + norb, phiVGL_mw_ptr, 1, czero_vec.device_data(), p_mw_ptr, 1, n_accepted), "cuBLAS_MFs::gemv_batched failed!"); // Y //BLAS::gemv('T', delay_count, delay_count, sigma, Binv.data(), lda_Binv, p.data(), 1, czero, Binv.data() + delay_count, @@ -677,8 +678,8 @@ class MatrixDelayedUpdateCUDA "cuBLAS_MFs::ger_batched failed!"); // sigma and Z cudaErrorCheck(CUDA::add_delay_list_save_sigma_VGL_batched(hstream, delay_list_mw_ptr, rowchanged, delay_count, - Binv_mw_ptr, lda_Binv, ratio_inv_mw_ptr, phiV_mw_ptr, - dpsiM_mw_in, d2psiM_mw_in, U_row_mw_ptr, dpsiM_mw_out, + Binv_mw_ptr, lda_Binv, ratio_inv_mw_ptr, phiVGL_mw_ptr, + phi_vgl_stride, U_row_mw_ptr, dpsiM_mw_out, d2psiM_mw_out, norb, n_accepted, nw), "CUDA::add_delay_list_save_y_VGL_batched failed!"); delay_count++; @@ -703,10 +704,12 @@ class MatrixDelayedUpdateCUDA const int norb = engine_leader.get_psiMinv().rows(); const int lda = engine_leader.get_psiMinv().cols(); const int nw = engines.size(); - updateInv_buffer_H2D.resize(sizeof(Value*) * 6 * nw); + + constexpr size_t num_ptrs_packed = 6; // it must match packing and unpacking + updateInv_buffer_H2D.resize(sizeof(Value*) * num_ptrs_packed * nw); engine_leader.resize_fill_constant_arrays(nw); - Matrix ptr_buffer(reinterpret_cast(updateInv_buffer_H2D.data()), 6, nw); + Matrix ptr_buffer(reinterpret_cast(updateInv_buffer_H2D.data()), num_ptrs_packed, nw); for (int iw = 0; iw < nw; iw++) { This_t& engine = engines[iw]; diff --git a/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h b/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h index 185bd0a530..cbb1efe06f 100644 --- a/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h +++ b/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h @@ -54,6 +54,8 @@ class MatrixUpdateOMPTarget using OffloadMatrix = Matrix>; template using OffloadVGLVector = VectorSoaContainer>; + template + using OffloadMWVGLArray = Array>; // [VGL, walker, Orbs] struct MatrixUpdateOMPTargetMultiWalkerMem : public Resource { @@ -155,10 +157,11 @@ class MatrixUpdateOMPTarget auto& buffer_H2D = engine_leader.mw_mem_->buffer_H2D; auto& grads_value_v = engine_leader.mw_mem_->grads_value_v; - const int norb = engine_leader.get_psiMinv().rows(); - const int nw = engines.size(); - buffer_H2D.resize(sizeof(Value*) * 2 * nw); - Matrix ptr_buffer(reinterpret_cast(buffer_H2D.data()), 2, nw); + const int norb = engine_leader.get_psiMinv().rows(); + const int nw = engines.size(); + constexpr size_t num_ptrs_packed = 2; // it must match packing and unpacking + buffer_H2D.resize(sizeof(Value*) * num_ptrs_packed * nw); + Matrix ptr_buffer(reinterpret_cast(buffer_H2D.data()), num_ptrs_packed, nw); for (int iw = 0; iw < nw; iw++) { ptr_buffer[0][iw] = engines[iw].get_psiMinv().device_data() + rowchanged * engine_leader.get_psiMinv().cols(); @@ -219,13 +222,15 @@ class MatrixUpdateOMPTarget success = ompBLAS::gemv(dummy_handle, 'T', norb, norb, cone, Ainv_ptr, lda, phiV_ptr, 1, czero, temp_ptr, 1); if (success != 0) throw std::runtime_error("ompBLAS::gemv failed."); - PRAGMA_OFFLOAD("omp target is_device_ptr(Ainv_ptr, temp_ptr, rcopy_ptr)") + + PRAGMA_OFFLOAD("omp target parallel for simd is_device_ptr(Ainv_ptr, temp_ptr, rcopy_ptr)") + for (int i = 0; i < norb; i++) { - temp_ptr[rowchanged] -= cone; - PRAGMA_OFFLOAD("omp parallel for simd") - for (int i = 0; i < norb; i++) - rcopy_ptr[i] = Ainv_ptr[rowchanged * lda + i]; + rcopy_ptr[i] = Ainv_ptr[rowchanged * lda + i]; + if (i == 0) + temp_ptr[rowchanged] -= cone; } + success = ompBLAS::ger(dummy_handle, norb, norb, static_cast(-1.0 / c_ratio_in), rcopy_ptr, 1, temp_ptr, 1, Ainv_ptr, lda); if (success != 0) @@ -240,41 +245,41 @@ class MatrixUpdateOMPTarget const std::vector& psiM_g_list, const std::vector& psiM_l_list, const std::vector& isAccepted, - const Value* phi_vgl_v_dev_ptr, - const size_t phi_vgl_stride, + const OffloadMWVGLArray& phi_vgl_v, const std::vector& ratios) { const size_t n_accepted = psiM_g_list.size(); if (n_accepted == 0) return; - auto& engine_leader = engines.getLeader(); - auto& buffer_H2D = engine_leader.mw_mem_->buffer_H2D; - auto& grads_value_v = engine_leader.mw_mem_->grads_value_v; - auto& cone_vec = engine_leader.mw_mem_->cone_vec; - auto& czero_vec = engine_leader.mw_mem_->czero_vec; - auto& mw_temp = engine_leader.mw_mem_->mw_temp; - auto& mw_rcopy = engine_leader.mw_mem_->mw_rcopy; - const int norb = engine_leader.get_psiMinv().rows(); - const int lda = engine_leader.get_psiMinv().cols(); + auto& engine_leader = engines.getLeader(); + auto& buffer_H2D = engine_leader.mw_mem_->buffer_H2D; + auto& grads_value_v = engine_leader.mw_mem_->grads_value_v; + auto& cone_vec = engine_leader.mw_mem_->cone_vec; + auto& czero_vec = engine_leader.mw_mem_->czero_vec; + auto& mw_temp = engine_leader.mw_mem_->mw_temp; + auto& mw_rcopy = engine_leader.mw_mem_->mw_rcopy; + const int norb = engine_leader.get_psiMinv().rows(); + const int lda = engine_leader.get_psiMinv().cols(); + const size_t nw = isAccepted.size(); + const size_t phi_vgl_stride = nw * norb; engine_leader.resize_scratch_arrays(norb, n_accepted); // to handle Value** of Ainv, psi_v, temp, rcopy - buffer_H2D.resize((sizeof(Value*) * 8 + sizeof(Value)) * n_accepted); - Matrix ptr_buffer(reinterpret_cast(buffer_H2D.data()), 8, n_accepted); - Value* c_ratio_inv = reinterpret_cast(buffer_H2D.data() + sizeof(Value*) * 8 * n_accepted); - for (int iw = 0, count = 0; iw < isAccepted.size(); iw++) + constexpr size_t num_ptrs_packed = 6; // it must match packing and unpacking + buffer_H2D.resize((sizeof(Value*) * num_ptrs_packed + sizeof(Value)) * n_accepted); + Matrix ptr_buffer(reinterpret_cast(buffer_H2D.data()), num_ptrs_packed, n_accepted); + Value* c_ratio_inv = reinterpret_cast(buffer_H2D.data() + sizeof(Value*) * num_ptrs_packed * n_accepted); + for (int iw = 0, count = 0; iw < nw; iw++) if (isAccepted[iw]) { ptr_buffer[0][count] = engines[iw].get_ref_psiMinv().device_data(); - ptr_buffer[1][count] = const_cast(phi_vgl_v_dev_ptr + norb * iw); + ptr_buffer[1][count] = const_cast(phi_vgl_v.device_data_at(0, iw, 0)); ptr_buffer[2][count] = mw_temp.device_data() + norb * count; ptr_buffer[3][count] = mw_rcopy.device_data() + norb * count; ptr_buffer[4][count] = psiM_g_list[count]; ptr_buffer[5][count] = psiM_l_list[count]; - ptr_buffer[6][count] = const_cast(phi_vgl_v_dev_ptr + phi_vgl_stride + norb * 3 * iw); - ptr_buffer[7][count] = const_cast(phi_vgl_v_dev_ptr + phi_vgl_stride * 4 + norb * iw); c_ratio_inv[count] = Value(-1) / ratios[iw]; count++; @@ -294,23 +299,21 @@ class MatrixUpdateOMPTarget use_device_ptr(buffer_H2D_ptr, cone_ptr, czero_ptr)") { Value** Ainv_mw_ptr = reinterpret_cast(buffer_H2D_ptr); - Value** phiV_mw_ptr = reinterpret_cast(buffer_H2D_ptr + sizeof(Value*) * n_accepted); + Value** phiVGL_mw_ptr = reinterpret_cast(buffer_H2D_ptr + sizeof(Value*) * n_accepted); Value** temp_mw_ptr = reinterpret_cast(buffer_H2D_ptr + sizeof(Value*) * n_accepted * 2); Value** rcopy_mw_ptr = reinterpret_cast(buffer_H2D_ptr + sizeof(Value*) * n_accepted * 3); Value** dpsiM_mw_out = reinterpret_cast(buffer_H2D_ptr + sizeof(Value*) * n_accepted * 4); Value** d2psiM_mw_out = reinterpret_cast(buffer_H2D_ptr + sizeof(Value*) * n_accepted * 5); - Value** dpsiM_mw_in = reinterpret_cast(buffer_H2D_ptr + sizeof(Value*) * n_accepted * 6); - Value** d2psiM_mw_in = reinterpret_cast(buffer_H2D_ptr + sizeof(Value*) * n_accepted * 7); - Value* ratio_inv_mw = reinterpret_cast(buffer_H2D_ptr + sizeof(Value*) * n_accepted * 8); + Value* ratio_inv_mw = reinterpret_cast(buffer_H2D_ptr + sizeof(Value*) * n_accepted * 6); // invoke the Fahy's variant of Sherman-Morrison update. - success = ompBLAS::gemv_batched(dummy_handle, 'T', norb, norb, cone_ptr, Ainv_mw_ptr, lda, phiV_mw_ptr, 1, + success = ompBLAS::gemv_batched(dummy_handle, 'T', norb, norb, cone_ptr, Ainv_mw_ptr, lda, phiVGL_mw_ptr, 1, czero_ptr, temp_mw_ptr, 1, n_accepted); if (success != 0) throw std::runtime_error("ompBLAS::gemv_batched failed."); PRAGMA_OFFLOAD("omp target teams distribute num_teams(n_accepted) is_device_ptr(Ainv_mw_ptr, temp_mw_ptr, \ - rcopy_mw_ptr, dpsiM_mw_out, d2psiM_mw_out, dpsiM_mw_in, d2psiM_mw_in)") + rcopy_mw_ptr, dpsiM_mw_out, d2psiM_mw_out, phiVGL_mw_ptr)") for (int iw = 0; iw < n_accepted; iw++) { Value* __restrict__ Ainv_ptr = Ainv_mw_ptr[iw]; @@ -318,8 +321,8 @@ class MatrixUpdateOMPTarget Value* __restrict__ rcopy_ptr = rcopy_mw_ptr[iw]; Value* __restrict__ dpsiM_out = dpsiM_mw_out[iw]; Value* __restrict__ d2psiM_out = d2psiM_mw_out[iw]; - Value* __restrict__ dpsiM_in = dpsiM_mw_in[iw]; - Value* __restrict__ d2psiM_in = d2psiM_mw_in[iw]; + Value* __restrict__ dpsiM_in = phiVGL_mw_ptr[iw] + phi_vgl_stride; + Value* __restrict__ d2psiM_in = phiVGL_mw_ptr[iw] + phi_vgl_stride * 4; temp_ptr[rowchanged] -= cone; PRAGMA_OFFLOAD("omp parallel for simd") @@ -328,9 +331,9 @@ class MatrixUpdateOMPTarget rcopy_ptr[i] = Ainv_ptr[rowchanged * lda + i]; // the following copying data on the device is not part of SM-1 // it is intended to copy dpsiM and d2psiM from temporary to final without a separate kernel. - dpsiM_out[i * 3] = dpsiM_in[i * 3]; - dpsiM_out[i * 3 + 1] = dpsiM_in[i * 3 + 1]; - dpsiM_out[i * 3 + 2] = dpsiM_in[i * 3 + 2]; + dpsiM_out[i * 3] = dpsiM_in[i]; + dpsiM_out[i * 3 + 1] = dpsiM_in[i + phi_vgl_stride]; + dpsiM_out[i * 3 + 2] = dpsiM_in[i + phi_vgl_stride * 2]; d2psiM_out[i] = d2psiM_in[i]; } } @@ -347,11 +350,10 @@ class MatrixUpdateOMPTarget const std::vector& psiM_g_list, const std::vector& psiM_l_list, const std::vector& isAccepted, - const Value* phi_vgl_v_dev_ptr, - const size_t phi_vgl_stride, + const OffloadMWVGLArray& phi_vgl_v, const std::vector& ratios) { - mw_updateRow(engines, rowchanged, psiM_g_list, psiM_l_list, isAccepted, phi_vgl_v_dev_ptr, phi_vgl_stride, ratios); + mw_updateRow(engines, rowchanged, psiM_g_list, psiM_l_list, isAccepted, phi_vgl_v, ratios); } /** update the full Ainv and reset delay_count diff --git a/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.cpp b/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.cpp index a42f88409b..9787e91d68 100644 --- a/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.cpp +++ b/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.cpp @@ -517,6 +517,19 @@ void MultiDiracDeterminant::restore(int iat) */ } +void MultiDiracDeterminant::mw_accept_rejectMove(const RefVectorWithLeader& wfc_list, + const RefVectorWithLeader& p_list, + int iat, + const std::vector& isAccepted) +{ + // TODO to be expanded to serve offload needs without relying on calling acceptMove and restore + for (int iw = 0; iw < wfc_list.size(); iw++) + if (isAccepted[iw]) + wfc_list[iw].acceptMove(p_list[iw], iat, false); + else + wfc_list[iw].restore(iat); +} + // this has been fixed MultiDiracDeterminant::MultiDiracDeterminant(const MultiDiracDeterminant& s) : WaveFunctionComponent(s), diff --git a/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.h b/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.h index 690ba02368..b42a61df73 100644 --- a/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.h +++ b/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.h @@ -201,6 +201,11 @@ class MultiDiracDeterminant : public WaveFunctionComponent */ void restore(int iat) override; + static void mw_accept_rejectMove(const RefVectorWithLeader& wfc_list, + const RefVectorWithLeader& p_list, + int iat, + const std::vector& isAccepted); + void createResource(ResourceCollection& collection) const override; void acquireResource(ResourceCollection& collection, const RefVectorWithLeader& wfc_list) const; diff --git a/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.cpp b/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.cpp index 4c8e73b905..c86af93eeb 100644 --- a/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.cpp +++ b/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.cpp @@ -669,6 +669,28 @@ void MultiSlaterDetTableMethod::restore(int iat) curRatio = 1.0; } +void MultiSlaterDetTableMethod::mw_accept_rejectMove(const RefVectorWithLeader& wfc_list, + const RefVectorWithLeader& p_list, + int iat, + const std::vector& isAccepted, + bool safe_to_delay) const +{ + ScopedTimer local_timer(AccRejTimer); + for (size_t iw = 0; iw < isAccepted.size(); iw++) + { + auto& det = wfc_list.getCastedElement(iw); + if (isAccepted[iw]) + { + det.psi_ratio_to_ref_det_ = det.new_psi_ratio_to_new_ref_det_; + det.log_value_ += convertValueToLog(det.curRatio); + } + det.curRatio = 1.0; + } + const auto det_id = getDetID(iat); + const auto det_list(extract_DetRef_list(wfc_list, det_id)); + Dets[det_id]->mw_accept_rejectMove(det_list, p_list, iat, isAccepted); +} + void MultiSlaterDetTableMethod::registerData(ParticleSet& P, WFBufferType& buf) { for (size_t id = 0; id < Dets.size(); id++) @@ -1082,6 +1104,9 @@ void MultiSlaterDetTableMethod::mw_prepareGroup(const RefVectorWithLeader& p_list, int ig) const { + if (!use_pre_computing_) + return; + auto& det_leader = wfc_list.getCastedLeader(); const size_t nw = wfc_list.size(); assert(this == &det_leader); diff --git a/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.h b/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.h index 51c2a2b163..66d6699fb3 100644 --- a/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.h +++ b/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.h @@ -159,6 +159,12 @@ class MultiSlaterDetTableMethod : public WaveFunctionComponent void acceptMove(ParticleSet& P, int iat, bool safe_to_delay = false) override; void restore(int iat) override; + void mw_accept_rejectMove(const RefVectorWithLeader& wfc_list, + const RefVectorWithLeader& p_list, + int iat, + const std::vector& isAccepted, + bool safe_to_delay = false) const override; + void registerData(ParticleSet& P, WFBufferType& buf) override; LogValueType updateBuffer(ParticleSet& P, WFBufferType& buf, bool fromscratch = false) override; void copyFromBuffer(ParticleSet& P, WFBufferType& buf) override; diff --git a/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp b/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp new file mode 100644 index 0000000000..694ed9a2e3 --- /dev/null +++ b/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp @@ -0,0 +1,135 @@ +////////////////////////////////////////////////////////////////////////////////////// +// This file is distributed under the University of Illinois/NCSA Open Source License. +// See LICENSE file in top directory for details. +// +// Copyright (c) 2019 QMCPACK developers. +// +// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory +// +// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory +////////////////////////////////////////////////////////////////////////////////////// + +#ifndef QMCPLUSPLUS_SYCL_MKL_SOLVERINVERTOR_H +#define QMCPLUSPLUS_SYCL_MKL_SOLVERINVERTOR_H + +#include "OhmmsPETE/OhmmsVector.h" +#include "OhmmsPETE/OhmmsMatrix.h" +#include "SYCL/SYCLallocator.hpp" +#include "SYCL/syclBLAS.hpp" +#include "SYCL/syclSolver.hpp" +#include "QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp" + +namespace qmcplusplus +{ +/** implements matrix inversion via cuSolverDN + * @tparam T_FP high precision for matrix inversion, T_FP >= T + */ +template +class syclSolverInverter +{ + /// scratch memory for cusolverDN + Matrix> Mat1_gpu; + /// pivot array + info + Vector> ipiv; + /// workspace + Vector> workspace; + std::int64_t getrf_ws = 0; + std::int64_t getri_ws = 0; + + /** resize the internal storage + * @param norb number of electrons/orbitals + * @param delay, maximum delay 0(m_queue, norb, norb, norb); + getri_ws = syclSolver::getri_scratchpad_size(m_queue, norb, norb); + workspace.resize(std::max(getrf_ws, getri_ws)); + } + } + +public: + /** compute the inverse of the transpose of matrix A and its determinant value in log + * when T_FP and TMAT are the same + * @tparam TREAL real type + */ + template::value>> + std::enable_if_t::value> invert_transpose(const Matrix& logdetT, + Matrix& Ainv, + Matrix>& Ainv_gpu, + std::complex& log_value, + sycl::queue& m_queue) + { + const int norb = logdetT.rows(); + resize(norb, m_queue); + + auto c_event = m_queue.memcpy(Mat1_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT)); + auto t_event = syclBLAS::transpose(m_queue, Mat1_gpu.data(), norb, Mat1_gpu.cols(), Ainv_gpu.data(), norb, + Ainv_gpu.cols(), {c_event}); + try + { + syclSolver::getrf(m_queue, norb, norb, Ainv_gpu.data(), norb, ipiv.data(), workspace.data(), getrf_ws, + {t_event}).wait(); + } + catch (cl::sycl::exception const& ex) + { + std::cout << "\t\tCaught synchronous SYCL exception during getrf:\n" + << ex.what() << " status: " << ex.code() << std::endl; + abort(); + } + + log_value = computeLogDet_sycl(m_queue, norb, Ainv_gpu.cols(), Ainv_gpu.data(), ipiv.data()); + + c_event = syclSolver::getri(m_queue, norb, Ainv_gpu.data(), norb, ipiv.data(), workspace.data(), getri_ws); + + m_queue.memcpy(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), {c_event}).wait(); + } + + /** compute the inverse of the transpose of matrix A and its determinant value in log + * when T_FP and TMAT are not the same + * @tparam TREAL real type + */ + template::value>> + std::enable_if_t::value> invert_transpose(const Matrix& logdetT, + Matrix& Ainv, + Matrix>& Ainv_gpu, + std::complex& log_value, + sycl::queue& m_queue) + { + const int norb = logdetT.rows(); + resize(norb, m_queue); + //use Ainv_gpu for transpose + auto c_event = m_queue.memcpy(Ainv_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT)); + //transpose + auto t_event = syclBLAS::transpose(m_queue, Ainv_gpu.data(), norb, Ainv_gpu.cols(), Mat1_gpu.data(), norb, + Mat1_gpu.cols(), {c_event}); + + //getrf (LU) -> getri (inverse) + try + { + syclSolver::getrf(m_queue, norb, norb, Mat1_gpu.data(), norb, ipiv.data(), workspace.data(), getrf_ws, + {t_event}).wait(); + } + catch (cl::sycl::exception const& ex) + { + std::cout << "\t\tCaught synchronous SYCL exception during getrf:\n" + << ex.what() << " status: " << ex.code() << std::endl; + abort(); + } + + log_value = computeLogDet_sycl(m_queue, norb, Mat1_gpu.cols(), Mat1_gpu.data(), ipiv.data()); + + c_event = syclSolver::getri(m_queue, norb, Mat1_gpu.data(), norb, ipiv.data(), workspace.data(), getri_ws); + + t_event = syclBLAS::copy_n(m_queue, Mat1_gpu.data(), Mat1_gpu.size(), Ainv_gpu.data(), {c_event}); + + m_queue.memcpy(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), {t_event}).wait(); + } +}; +} // namespace qmcplusplus + +#endif // QMCPLUSPLUS_CUSOLVERINVERTOR_H diff --git a/src/QMCWaveFunctions/HarmonicOscillator/SHOSet.cpp b/src/QMCWaveFunctions/HarmonicOscillator/SHOSet.cpp index dbb7f941a1..28b87f6fad 100644 --- a/src/QMCWaveFunctions/HarmonicOscillator/SHOSet.cpp +++ b/src/QMCWaveFunctions/HarmonicOscillator/SHOSet.cpp @@ -137,22 +137,21 @@ void SHOSet::evaluate_vgl(PosType r, ValueVector& psi, GradVector& dpsi, ValueVe void SHOSet::evaluate_hermite(const PosType& xpos) { - int shift = 0; - for (int d = 0; d < DIM; ++d, shift += nmax) + for (int d = 0; d < DIM; ++d) { int nh = qn_max[d]; if (nh > 0) { - RealType x = xpos[d]; - hermite(0 + shift) = 1.0; - RealType Hnm2 = 0.0; - RealType Hnm1 = 1.0; + RealType x = xpos[d]; + hermite(d, 0) = 1.0; + RealType Hnm2 = 0.0; + RealType Hnm1 = 1.0; for (int n = 1; n < nh; ++n) { - RealType Hn = 2 * (x * Hnm1 - (n - 1) * Hnm2); - hermite(n + shift) = Hn; - Hnm2 = Hnm1; - Hnm1 = Hn; + RealType Hn = 2 * (x * Hnm1 - (n - 1) * Hnm2); + hermite(d, n) = Hn; + Hnm2 = Hnm1; + Hnm1 = Hn; } } } @@ -162,24 +161,21 @@ void SHOSet::evaluate_hermite(const PosType& xpos) void SHOSet::evaluate_d0(const PosType& xpos, ValueVector& psi) { using std::exp; - int shift = 0; - for (int d = 0; d < DIM; ++d, shift += nmax) + for (int d = 0; d < DIM; ++d) { RealType x = xpos[d]; RealType g = exp(-.5 * x * x); for (int n = 0; n < qn_max[d]; ++n) { - int ns = n + shift; - bvalues(ns) = prefactors[n] * g * hermite(ns); + bvalues(d, n) = prefactors[n] * g * hermite(d, n); } } for (int s = 0; s < state_info.size(); ++s) { const SHOState& state = state_info[s]; RealType phi = 1.0; - int shift = 0; - for (int d = 0; d < DIM; ++d, shift += nmax) - phi *= bvalues(shift + state.quantum_number[d]); + for (int d = 0; d < DIM; ++d) + phi *= bvalues(d, state.quantum_number[d]); psi[s] = phi; } } @@ -188,26 +184,23 @@ void SHOSet::evaluate_d0(const PosType& xpos, ValueVector& psi) void SHOSet::evaluate_d1(const PosType& xpos, ValueVector& psi, GradVector& dpsi) { RealType ol = 1.0 / length; - int shift = 0; - for (int d = 0; d < DIM; ++d, shift += nmax) + for (int d = 0; d < DIM; ++d) { RealType x = xpos[d]; RealType Hnm1 = 0.0; for (int n = 0; n < qn_max[d]; ++n) { - int ns = n + shift; - RealType Hn = hermite(ns); - bvalues(ns) = (-x + 2 * n * Hnm1 / Hn) * ol; - Hnm1 = Hn; + RealType Hn = hermite(d, n); + bvalues(d, n) = (-x + 2 * n * Hnm1 / Hn) * ol; + Hnm1 = Hn; } } for (int s = 0; s < state_info.size(); ++s) { const SHOState& state = state_info[s]; TinyVector dphi; - int shift = 0; - for (int d = 0; d < DIM; ++d, shift += nmax) - dphi[d] = bvalues(shift + state.quantum_number[d]); + for (int d = 0; d < DIM; ++d) + dphi[d] = bvalues(d, state.quantum_number[d]); dphi *= psi[s]; dpsi[s] = dphi; } @@ -217,24 +210,21 @@ void SHOSet::evaluate_d1(const PosType& xpos, ValueVector& psi, GradVector& dpsi void SHOSet::evaluate_d2(const PosType& xpos, ValueVector& psi, ValueVector& d2psi) { RealType ol2 = 1.0 / (length * length); - int shift = 0; - for (int d = 0; d < DIM; ++d, shift += nmax) + for (int d = 0; d < DIM; ++d) { RealType x = xpos[d]; RealType x2 = x * x; for (int n = 0; n < qn_max[d]; ++n) { - int ns = n + shift; - bvalues(ns) = (-1.0 + x2 - 2 * n) * ol2; + bvalues(d, n) = (-1.0 + x2 - 2 * n) * ol2; } } for (int s = 0; s < state_info.size(); ++s) { const SHOState& state = state_info[s]; ValueType d2phi = 0.0; - int shift = 0; - for (int d = 0; d < DIM; ++d, shift += nmax) - d2phi += bvalues(shift + state.quantum_number[d]); + for (int d = 0; d < DIM; ++d) + d2phi += bvalues(d, state.quantum_number[d]); d2phi *= psi[s]; d2psi[s] = d2phi; } diff --git a/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp index 4da7844190..b668e1c975 100644 --- a/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp +++ b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp @@ -715,7 +715,6 @@ void J2OMPTarget::mw_recompute(const RefVectorWithLeader>(); assert(this == &wfc_leader); -#pragma omp parallel for for (int iw = 0; iw < wfc_list.size(); iw++) if (recompute[iw]) wfc_list[iw].recompute(p_list[iw]); diff --git a/src/QMCWaveFunctions/SPOSet.cpp b/src/QMCWaveFunctions/SPOSet.cpp index 88292d3a97..8a1d2a6ca1 100644 --- a/src/QMCWaveFunctions/SPOSet.cpp +++ b/src/QMCWaveFunctions/SPOSet.cpp @@ -52,7 +52,6 @@ void SPOSet::mw_evaluateDetRatios(const RefVectorWithLeader& spo_list, std::vector>& ratios_list) const { assert(this == &spo_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < spo_list.size(); iw++) { Vector invRow(const_cast(invRow_ptr_list[iw]), psi_list[iw].get().size()); @@ -78,7 +77,6 @@ void SPOSet::mw_evaluateVGL(const RefVectorWithLeader& spo_list, const RefVector& d2psi_v_list) const { assert(this == &spo_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < spo_list.size(); iw++) spo_list[iw].evaluateVGL(P_list[iw], iat, psi_v_list[iw], dpsi_v_list[iw], d2psi_v_list[iw]); } @@ -89,7 +87,6 @@ void SPOSet::mw_evaluateValue(const RefVectorWithLeader& spo_list, const RefVector& psi_v_list) const { assert(this == &spo_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < spo_list.size(); iw++) spo_list[iw].evaluateValue(P_list[iw], iat, psi_v_list[iw]); } @@ -103,7 +100,6 @@ void SPOSet::mw_evaluateVGLWithSpin(const RefVectorWithLeader& spo_list, const RefVector& dspin_v_list) const { assert(this == &spo_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < spo_list.size(); iw++) spo_list[iw].evaluateVGL_spin(P_list[iw], iat, psi_v_list[iw], dpsi_v_list[iw], d2psi_v_list[iw], dspin_v_list[iw]); } @@ -112,24 +108,34 @@ void SPOSet::mw_evaluateVGLandDetRatioGrads(const RefVectorWithLeader& s const RefVectorWithLeader& P_list, int iat, const std::vector& invRow_ptr_list, - VGLVector& phi_vgl_v, + OffloadMWVGLArray& phi_vgl_v, std::vector& ratios, std::vector& grads) const { assert(this == &spo_list.getLeader()); + assert(phi_vgl_v.size(0) == DIM_VGL); + assert(phi_vgl_v.size(1) == spo_list.size()); const size_t nw = spo_list.size(); - const size_t norb_requested = phi_vgl_v.size() / nw; -#pragma omp parallel for + const size_t norb_requested = phi_vgl_v.size(2); + GradVector dphi_v(norb_requested); for (int iw = 0; iw < nw; iw++) { - ValueVector phi_v(phi_vgl_v.data() + norb_requested * iw, norb_requested); - GradVector dphi_v(reinterpret_cast(phi_vgl_v.data(1)) + norb_requested * iw, norb_requested); - ValueVector d2phi_v(phi_vgl_v.data(4) + norb_requested * iw, norb_requested); + ValueVector phi_v(phi_vgl_v.data_at(0, iw, 0), norb_requested); + ValueVector d2phi_v(phi_vgl_v.data_at(4, iw, 0), norb_requested); spo_list[iw].evaluateVGL(P_list[iw], iat, phi_v, dphi_v, d2phi_v); ratios[iw] = simd::dot(invRow_ptr_list[iw], phi_v.data(), norb_requested); grads[iw] = simd::dot(invRow_ptr_list[iw], dphi_v.data(), norb_requested) / ratios[iw]; + + // transpose the array of gradients to SoA in phi_vgl_v + for (size_t idim = 0; idim < DIM; idim++) + { + ValueType* phi_g = phi_vgl_v.data_at(idim + 1, iw, 0); + for (size_t iorb = 0; iorb < norb_requested; iorb++) + phi_g[iorb] = dphi_v[iorb][idim]; + } } + phi_vgl_v.updateTo(); } void SPOSet::evaluateThirdDeriv(const ParticleSet& P, int first, int last, GGGMatrix& grad_grad_grad_logdet) @@ -158,7 +164,6 @@ void SPOSet::mw_evaluate_notranspose(const RefVectorWithLeader& spo_list const RefVector& d2logdet_list) const { assert(this == &spo_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < spo_list.size(); iw++) spo_list[iw].evaluate_notranspose(P_list[iw], first, last, logdet_list[iw], dlogdet_list[iw], d2logdet_list[iw]); } diff --git a/src/QMCWaveFunctions/SPOSet.h b/src/QMCWaveFunctions/SPOSet.h index bc031a4591..4c17b1bb58 100644 --- a/src/QMCWaveFunctions/SPOSet.h +++ b/src/QMCWaveFunctions/SPOSet.h @@ -27,6 +27,7 @@ #ifdef QMC_CUDA #include "type_traits/CUDATypes.h" #endif +#include "OMPTarget/OffloadAlignedAllocators.hpp" namespace qmcplusplus { @@ -56,6 +57,8 @@ class SPOSet : public QMCTraits using Walker_t = ParticleSet::Walker_t; using SPOPool_t = std::map; + using OffloadMWVGLArray = Array>; // [VGL, walker, Orbs] + /** constructor */ SPOSet(bool use_OMP_offload = false, bool ion_deriv = false, bool optimizable = false); @@ -283,8 +286,8 @@ class SPOSet : public QMCTraits const RefVector& d2psi_v_list, const RefVector& dspin_v_list) const; - /** evaluate the values, gradients and laplacians of this single-particle orbital sets - * and determinant ratio and grads of multiple walkers + /** evaluate the values, gradients and laplacians of this single-particle orbital sets and determinant ratio + * and grads of multiple walkers. Device data of phi_vgl_v must be up-to-date upon return * @param spo_list the list of SPOSet pointers in a walker batch * @param P_list the list of ParticleSet pointers in a walker batch * @param iat active particle @@ -295,7 +298,7 @@ class SPOSet : public QMCTraits const RefVectorWithLeader& P_list, int iat, const std::vector& invRow_ptr_list, - VGLVector& phi_vgl_v, + OffloadMWVGLArray& phi_vgl_v, std::vector& ratios, std::vector& grads) const; diff --git a/src/QMCWaveFunctions/SpinorSet.cpp b/src/QMCWaveFunctions/SpinorSet.cpp index 7609800b5b..62b2fc4a47 100644 --- a/src/QMCWaveFunctions/SpinorSet.cpp +++ b/src/QMCWaveFunctions/SpinorSet.cpp @@ -195,7 +195,6 @@ void SpinorSet::mw_evaluateVGLWithSpin(const RefVectorWithLeader& spo_li up_spo_leader.mw_evaluateVGL(up_spo_list, P_list, iat, up_psi_v_list, up_dpsi_v_list, up_d2psi_v_list); dn_spo_leader.mw_evaluateVGL(dn_spo_list, P_list, iat, dn_psi_v_list, dn_dpsi_v_list, dn_d2psi_v_list); -#pragma omp parallel for for (int iw = 0; iw < nw; iw++) { ParticleSet::Scalar_t s = P_list[iw].activeSpin(iat); @@ -327,7 +326,6 @@ void SpinorSet::mw_evaluate_notranspose(const RefVectorWithLeader& spo_l dn_spo_leader.mw_evaluate_notranspose(dn_spo_list, P_list, first, last, dn_logdet_list, dn_dlogdet_list, dn_d2logdet_list); -#pragma omp parallel for for (int iw = 0; iw < nw; iw++) for (int iat = 0; iat < nelec; iat++) { diff --git a/src/QMCWaveFunctions/WaveFunctionComponent.cpp b/src/QMCWaveFunctions/WaveFunctionComponent.cpp index 284fe17996..880f67684e 100644 --- a/src/QMCWaveFunctions/WaveFunctionComponent.cpp +++ b/src/QMCWaveFunctions/WaveFunctionComponent.cpp @@ -44,7 +44,6 @@ void WaveFunctionComponent::mw_evaluateLog(const RefVectorWithLeader& L_list) const { assert(this == &wfc_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < wfc_list.size(); iw++) wfc_list[iw].evaluateLog(p_list[iw], G_list[iw], L_list[iw]); } @@ -62,7 +61,6 @@ void WaveFunctionComponent::mw_recompute(const RefVectorWithLeader& recompute) const { assert(this == &wfc_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < wfc_list.size(); iw++) if (recompute[iw]) wfc_list[iw].recompute(p_list[iw]); @@ -73,7 +71,6 @@ void WaveFunctionComponent::mw_prepareGroup(const RefVectorWithLeader& grad_now) const { assert(this == &wfc_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < wfc_list.size(); iw++) grad_now[iw] = wfc_list[iw].evalGrad(p_list[iw], iat); } @@ -108,7 +104,6 @@ void WaveFunctionComponent::mw_evalGradWithSpin(const RefVectorWithLeader& spingrad_now) const { assert(this == &wfc_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < wfc_list.size(); iw++) grad_now[iw] = wfc_list[iw].evalGradWithSpin(p_list[iw], iat, spingrad_now[iw]); } @@ -119,7 +114,6 @@ void WaveFunctionComponent::mw_calcRatio(const RefVectorWithLeader& ratios) const { assert(this == &wfc_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < wfc_list.size(); iw++) ratios[iw] = wfc_list[iw].ratio(p_list[iw], iat); } @@ -151,7 +145,6 @@ void WaveFunctionComponent::mw_ratioGrad(const RefVectorWithLeader& grad_new) const { assert(this == &wfc_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < wfc_list.size(); iw++) ratios[iw] = wfc_list[iw].ratioGrad(p_list[iw], iat, grad_new[iw]); } @@ -164,7 +157,6 @@ void WaveFunctionComponent::mw_ratioGradWithSpin(const RefVectorWithLeader& spingrad_new) const { assert(this == &wfc_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < wfc_list.size(); iw++) ratios[iw] = wfc_list[iw].ratioGradWithSpin(p_list[iw], iat, grad_new[iw], spingrad_new[iw]); } @@ -176,7 +168,6 @@ void WaveFunctionComponent::mw_accept_rejectMove(const RefVectorWithLeader& wfc_list) const { assert(this == &wfc_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < wfc_list.size(); iw++) wfc_list[iw].completeUpdates(); } @@ -207,7 +197,6 @@ void WaveFunctionComponent::mw_evaluateGL(const RefVectorWithLeader>& ratios) const { assert(this == &wfc_list.getLeader()); -#pragma omp parallel for for (int iw = 0; iw < wfc_list.size(); iw++) wfc_list[iw].evaluateRatios(vp_list[iw], ratios[iw]); } diff --git a/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.cu b/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.cu index 238e03e3d4..e152d15439 100644 --- a/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.cu +++ b/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.cu @@ -21,7 +21,7 @@ namespace CUDA { using namespace thrust::cuda_cub::core; } -} +} // namespace qmcplusplus #else #include #include "ROCm/cuda2hip.h" @@ -44,19 +44,18 @@ __global__ void copyAinvRow_saveGL_kernel(const int rowchanged, const int lda, T* const temp[], T* const rcopy[], - const T* const dphi_in[], - const T* const d2phi_in[], + const T* const phi_vgl_in[], + const size_t phi_vgl_stride, T* const dphi_out[], T* const d2phi_out[]) { - const int iw = blockIdx.x; - const T* __restrict__ Ainv_iw = Ainv[iw]; - T* __restrict__ temp_iw = temp[iw]; - T* __restrict__ rcopy_iw = rcopy[iw]; - const T* __restrict__ dphi_in_iw = dphi_in[iw]; - const T* __restrict__ d2phi_in_iw = d2phi_in[iw]; - T* __restrict__ dphi_out_iw = dphi_out[iw]; - T* __restrict__ d2phi_out_iw = d2phi_out[iw]; + const int iw = blockIdx.x; + const T* __restrict__ Ainv_iw = Ainv[iw]; + T* __restrict__ temp_iw = temp[iw]; + T* __restrict__ rcopy_iw = rcopy[iw]; + const T* __restrict__ phi_in_iw = phi_vgl_in[iw]; + T* __restrict__ dphi_out_iw = dphi_out[iw]; + T* __restrict__ d2phi_out_iw = d2phi_out[iw]; const int tid = threadIdx.x; if (tid == 0) @@ -72,10 +71,10 @@ __global__ void copyAinvRow_saveGL_kernel(const int rowchanged, // the following copying data on the device is not part of SM-1 // it is intended to copy dphiV and d2phiV from temporary to final without a separate kernel. - dphi_out_iw[col_id * 3] = dphi_in_iw[col_id * 3]; - dphi_out_iw[col_id * 3 + 1] = dphi_in_iw[col_id * 3 + 1]; - dphi_out_iw[col_id * 3 + 2] = dphi_in_iw[col_id * 3 + 2]; - d2phi_out_iw[col_id] = d2phi_in_iw[col_id]; + dphi_out_iw[col_id * 3] = phi_in_iw[col_id + phi_vgl_stride]; + dphi_out_iw[col_id * 3 + 1] = phi_in_iw[col_id + phi_vgl_stride * 2]; + dphi_out_iw[col_id * 3 + 2] = phi_in_iw[col_id + phi_vgl_stride * 3]; + d2phi_out_iw[col_id] = phi_in_iw[col_id + phi_vgl_stride * 4]; } } } @@ -87,8 +86,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream, const int lda, float* const temp[], float* const rcopy[], - const float* const dphi_in[], - const float* const d2phi_in[], + const float* const phi_vgl_in[], + const size_t phi_vgl_stride, float* const dphi_out[], float* const d2phi_out[], const int batch_count) @@ -100,7 +99,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream, dim3 dimBlock(COLBS); dim3 dimGrid(batch_count); copyAinvRow_saveGL_kernel<<>>(rowchanged, n, Ainv, lda, temp, rcopy, - dphi_in, d2phi_in, dphi_out, d2phi_out); + phi_vgl_in, phi_vgl_stride, dphi_out, + d2phi_out); return cudaPeekAtLastError(); } @@ -111,8 +111,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream, const int lda, double* const temp[], double* const rcopy[], - const double* const dphi_in[], - const double* const d2phi_in[], + const double* const phi_vgl_in[], + const size_t phi_vgl_stride, double* const dphi_out[], double* const d2phi_out[], const int batch_count) @@ -124,7 +124,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream, dim3 dimBlock(COLBS); dim3 dimGrid(batch_count); copyAinvRow_saveGL_kernel<<>>(rowchanged, n, Ainv, lda, temp, rcopy, - dphi_in, d2phi_in, dphi_out, d2phi_out); + phi_vgl_in, phi_vgl_stride, dphi_out, + d2phi_out); return cudaPeekAtLastError(); } @@ -135,8 +136,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream, const int lda, std::complex* const temp[], std::complex* const rcopy[], - const std::complex* const dphi_in[], - const std::complex* const d2phi_in[], + const std::complex* const phi_vgl_in[], + const size_t phi_vgl_stride, std::complex* const dphi_out[], std::complex* const d2phi_out[], const int batch_count) @@ -147,8 +148,10 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream, const int COLBS = 64; dim3 dimBlock(COLBS); dim3 dimGrid(batch_count); - copyAinvRow_saveGL_kernel<<>>(rowchanged, n, (const cuComplex**)Ainv, lda, (cuComplex**)temp, (cuComplex**)rcopy, - (const cuComplex**)dphi_in, (const cuComplex**)d2phi_in, (cuComplex**)dphi_out, (cuComplex**)d2phi_out); + copyAinvRow_saveGL_kernel + <<>>(rowchanged, n, (const cuComplex**)Ainv, lda, (cuComplex**)temp, + (cuComplex**)rcopy, (const cuComplex**)phi_vgl_in, phi_vgl_stride, + (cuComplex**)dphi_out, (cuComplex**)d2phi_out); return cudaPeekAtLastError(); } @@ -159,8 +162,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream, const int lda, std::complex* const temp[], std::complex* const rcopy[], - const std::complex* const dphi_in[], - const std::complex* const d2phi_in[], + const std::complex* const phi_vgl_in[], + const size_t phi_vgl_stride, std::complex* const dphi_out[], std::complex* const d2phi_out[], const int batch_count) @@ -171,8 +174,10 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream, const int COLBS = 64; dim3 dimBlock(COLBS); dim3 dimGrid(batch_count); - copyAinvRow_saveGL_kernel<<>>(rowchanged, n, (const cuDoubleComplex**)Ainv, lda, (cuDoubleComplex**)temp, (cuDoubleComplex**)rcopy, - (const cuDoubleComplex**)dphi_in, (const cuDoubleComplex**)d2phi_in, (cuDoubleComplex**)dphi_out, (cuDoubleComplex**)d2phi_out); + copyAinvRow_saveGL_kernel + <<>>(rowchanged, n, (const cuDoubleComplex**)Ainv, lda, (cuDoubleComplex**)temp, + (cuDoubleComplex**)rcopy, (const cuDoubleComplex**)phi_vgl_in, phi_vgl_stride, + (cuDoubleComplex**)dphi_out, (cuDoubleComplex**)d2phi_out); return cudaPeekAtLastError(); } @@ -261,7 +266,9 @@ cudaError_t calcGradients_cuda(cudaStream_t& hstream, const int COLBS = 64; dim3 dimBlock(COLBS); dim3 dimGrid(batch_count); - calcGradients_kernel, COLBS><<>>(n, (const thrust::complex**)Ainvrow, (const thrust::complex**)dpsiMrow, (thrust::complex*)grads_now); + calcGradients_kernel, COLBS> + <<>>(n, (const thrust::complex**)Ainvrow, + (const thrust::complex**)dpsiMrow, (thrust::complex*)grads_now); return cudaPeekAtLastError(); } @@ -278,7 +285,10 @@ cudaError_t calcGradients_cuda(cudaStream_t& hstream, const int COLBS = 64; dim3 dimBlock(COLBS); dim3 dimGrid(batch_count); - calcGradients_kernel, COLBS><<>>(n, (const thrust::complex**)Ainvrow, (const thrust::complex**)dpsiMrow, (thrust::complex*)grads_now); + calcGradients_kernel, COLBS> + <<>>(n, (const thrust::complex**)Ainvrow, + (const thrust::complex**)dpsiMrow, + (thrust::complex*)grads_now); return cudaPeekAtLastError(); } @@ -289,9 +299,8 @@ __global__ void add_delay_list_save_sigma_VGL_kernel(int* const delay_list[], T* const binv[], const int binv_lda, const T* const ratio_inv, - const T* const phi_in[], - const T* const dphi_in[], - const T* const d2phi_in[], + const T* const phi_vgl_in[], + const size_t phi_vgl_stride, T* const phi_out[], T* const dphi_out[], T* const d2phi_out[], @@ -304,14 +313,12 @@ __global__ void add_delay_list_save_sigma_VGL_kernel(int* const delay_list[], if (iw < n_accepted) { // real accept, settle y and Z - int* __restrict__ delay_list_iw = delay_list[iw]; - T* __restrict__ binvrow_iw = binv[iw] + delay_count * binv_lda; - const T* __restrict__ phi_in_iw = phi_in[iw]; - const T* __restrict__ dphi_in_iw = dphi_in[iw]; - const T* __restrict__ d2phi_in_iw = d2phi_in[iw]; - T* __restrict__ phi_out_iw = phi_out[iw]; - T* __restrict__ dphi_out_iw = dphi_out[iw]; - T* __restrict__ d2phi_out_iw = d2phi_out[iw]; + int* __restrict__ delay_list_iw = delay_list[iw]; + T* __restrict__ binvrow_iw = binv[iw] + delay_count * binv_lda; + const T* __restrict__ phi_in_iw = phi_vgl_in[iw]; + T* __restrict__ phi_out_iw = phi_out[iw]; + T* __restrict__ dphi_out_iw = dphi_out[iw]; + T* __restrict__ d2phi_out_iw = d2phi_out[iw]; if (tid == 0) { @@ -335,10 +342,10 @@ __global__ void add_delay_list_save_sigma_VGL_kernel(int* const delay_list[], { // copy phiV, dphiV and d2phiV from temporary to final without a separate kernel. phi_out_iw[col_id] = phi_in_iw[col_id]; - dphi_out_iw[col_id * 3] = dphi_in_iw[col_id * 3]; - dphi_out_iw[col_id * 3 + 1] = dphi_in_iw[col_id * 3 + 1]; - dphi_out_iw[col_id * 3 + 2] = dphi_in_iw[col_id * 3 + 2]; - d2phi_out_iw[col_id] = d2phi_in_iw[col_id]; + dphi_out_iw[col_id * 3] = phi_in_iw[col_id + phi_vgl_stride]; + dphi_out_iw[col_id * 3 + 1] = phi_in_iw[col_id + phi_vgl_stride * 2]; + dphi_out_iw[col_id * 3 + 2] = phi_in_iw[col_id + phi_vgl_stride * 3]; + d2phi_out_iw[col_id] = phi_in_iw[col_id + phi_vgl_stride * 4]; } } } @@ -379,9 +386,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream, float* const binv[], const int binv_lda, const float* const ratio_inv, - const float* const phi_in[], - const float* const dphi_in[], - const float* const d2phi_in[], + const float* const phi_vgl_in[], + const size_t phi_vgl_stride, float* const phi_out[], float* const dphi_out[], float* const d2phi_out[], @@ -396,8 +402,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream, dim3 dimBlock(COLBS); dim3 dimGrid(batch_count); add_delay_list_save_sigma_VGL_kernel - <<>>(delay_list, rowchanged, delay_count, binv, binv_lda, ratio_inv, phi_in, - dphi_in, d2phi_in, phi_out, dphi_out, d2phi_out, norb, n_accepted); + <<>>(delay_list, rowchanged, delay_count, binv, binv_lda, ratio_inv, phi_vgl_in, + phi_vgl_stride, phi_out, dphi_out, d2phi_out, norb, n_accepted); return cudaPeekAtLastError(); } @@ -408,9 +414,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream, double* const binv[], const int binv_lda, const double* const ratio_inv, - const double* const phi_in[], - const double* const dphi_in[], - const double* const d2phi_in[], + const double* const phi_vgl_in[], + const size_t phi_vgl_stride, double* const phi_out[], double* const dphi_out[], double* const d2phi_out[], @@ -425,8 +430,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream, dim3 dimBlock(COLBS); dim3 dimGrid(batch_count); add_delay_list_save_sigma_VGL_kernel - <<>>(delay_list, rowchanged, delay_count, binv, binv_lda, ratio_inv, phi_in, - dphi_in, d2phi_in, phi_out, dphi_out, d2phi_out, norb, n_accepted); + <<>>(delay_list, rowchanged, delay_count, binv, binv_lda, ratio_inv, phi_vgl_in, + phi_vgl_stride, phi_out, dphi_out, d2phi_out, norb, n_accepted); return cudaPeekAtLastError(); } @@ -437,9 +442,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream, std::complex* const binv[], const int binv_lda, const std::complex* const ratio_inv, - const std::complex* const phi_in[], - const std::complex* const dphi_in[], - const std::complex* const d2phi_in[], + const std::complex* const phi_vgl_in[], + const size_t phi_vgl_stride, std::complex* const phi_out[], std::complex* const dphi_out[], std::complex* const d2phi_out[], @@ -456,11 +460,9 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream, add_delay_list_save_sigma_VGL_kernel, COLBS> <<>>(delay_list, rowchanged, delay_count, (thrust::complex**)binv, binv_lda, (const thrust::complex*)ratio_inv, - (const thrust::complex**)phi_in, - (const thrust::complex**)dphi_in, - (const thrust::complex**)d2phi_in, (thrust::complex**)phi_out, - (thrust::complex**)dphi_out, (thrust::complex**)d2phi_out, norb, - n_accepted); + (const thrust::complex**)phi_vgl_in, phi_vgl_stride, + (thrust::complex**)phi_out, (thrust::complex**)dphi_out, + (thrust::complex**)d2phi_out, norb, n_accepted); return cudaPeekAtLastError(); } @@ -471,9 +473,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream, std::complex* const binv[], const int binv_lda, const std::complex* const ratio_inv, - const std::complex* const phi_in[], - const std::complex* const dphi_in[], - const std::complex* const d2phi_in[], + const std::complex* const phi_vgl_in[], + const size_t phi_vgl_stride, std::complex* const phi_out[], std::complex* const dphi_out[], std::complex* const d2phi_out[], @@ -490,11 +491,9 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream, add_delay_list_save_sigma_VGL_kernel, COLBS> <<>>(delay_list, rowchanged, delay_count, (thrust::complex**)binv, binv_lda, (const thrust::complex*)ratio_inv, - (const thrust::complex**)phi_in, - (const thrust::complex**)dphi_in, - (const thrust::complex**)d2phi_in, (thrust::complex**)phi_out, - (thrust::complex**)dphi_out, (thrust::complex**)d2phi_out, - norb, n_accepted); + (const thrust::complex**)phi_vgl_in, phi_vgl_stride, + (thrust::complex**)phi_out, (thrust::complex**)dphi_out, + (thrust::complex**)d2phi_out, norb, n_accepted); return cudaPeekAtLastError(); } @@ -566,7 +565,8 @@ cudaError_t applyW_batched(cudaStream_t& hstream, const int COLBS = 32; dim3 dimBlock(COLBS); dim3 dimGrid(batch_count); - applyW_kernel<<>>(delay_list, delay_count, (cuComplex**)tempMat, lda); + applyW_kernel + <<>>(delay_list, delay_count, (cuComplex**)tempMat, lda); return cudaPeekAtLastError(); } @@ -583,7 +583,8 @@ cudaError_t applyW_batched(cudaStream_t& hstream, const int COLBS = 32; dim3 dimBlock(COLBS); dim3 dimGrid(batch_count); - applyW_kernel<<>>(delay_list, delay_count, (cuDoubleComplex**)tempMat, lda); + applyW_kernel + <<>>(delay_list, delay_count, (cuDoubleComplex**)tempMat, lda); return cudaPeekAtLastError(); } diff --git a/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.hpp b/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.hpp index 3a7e9a27eb..a6e5433808 100644 --- a/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.hpp +++ b/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.hpp @@ -40,8 +40,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream, const int lda, float* const temp[], float* const rcopy[], - const float* const dphi_in[], - const float* const d2phi_in[], + const float* const phi_vgl_in[], + const size_t phi_vgl_stride, float* const dphi_out[], float* const d2phi_out[], const int batch_count); @@ -53,8 +53,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream, const int lda, double* const temp[], double* const rcopy[], - const double* const dphi_in[], - const double* const d2phi_in[], + const double* const phi_vgl_in[], + const size_t phi_vgl_stride, double* const dphi_out[], double* const d2phi_out[], const int batch_count); @@ -66,8 +66,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream, const int lda, std::complex* const temp[], std::complex* const rcopy[], - const std::complex* const dphi_in[], - const std::complex* const d2phi_in[], + const std::complex* const phi_vgl_in[], + const size_t phi_vgl_stride, std::complex* const dphi_out[], std::complex* const d2phi_out[], const int batch_count); @@ -79,8 +79,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream, const int lda, std::complex* const temp[], std::complex* const rcopy[], - const std::complex* const dphi_in[], - const std::complex* const d2phi_in[], + const std::complex* const phi_vgl_in[], + const size_t phi_vgl_stride, std::complex* const dphi_out[], std::complex* const d2phi_out[], const int batch_count); @@ -121,9 +121,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream, float* const binv[], const int binv_lda, const float* const ratio_inv, - const float* const phi_in[], - const float* const dphi_in[], - const float* const d2phi_in[], + const float* const phi_vgl_in[], + const size_t phi_vgl_stride, float* const phi_out[], float* const dphi_out[], float* const d2phi_out[], @@ -138,9 +137,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream, double* const binv[], const int binv_lda, const double* const ratio_inv, - const double* const phi_in[], - const double* const dphi_in[], - const double* const d2phi_in[], + const double* const phi_vgl_in[], + const size_t phi_vgl_stride, double* const phi_out[], double* const dphi_out[], double* const d2phi_out[], @@ -155,9 +153,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream, std::complex* const binv[], const int binv_lda, const std::complex* const ratio_inv, - const std::complex* const phi_in[], - const std::complex* const dphi_in[], - const std::complex* const d2phi_in[], + const std::complex* const phi_vgl_in[], + const size_t phi_vgl_stride, std::complex* const phi_out[], std::complex* const dphi_out[], std::complex* const d2phi_out[], @@ -172,9 +169,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream, std::complex* const binv[], const int binv_lda, const std::complex* const ratio_inv, - const std::complex* const phi_in[], - const std::complex* const dphi_in[], - const std::complex* const d2phi_in[], + const std::complex* const phi_vgl_in[], + const size_t phi_vgl_stride, std::complex* const phi_out[], std::complex* const dphi_out[], std::complex* const d2phi_out[], diff --git a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp index 7ce7a9d3dc..20711c1ff4 100644 --- a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp +++ b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp @@ -86,4 +86,52 @@ template sycl::event applyW_stageV_sycl(sycl::queue& aq, const int ndelay, std::complex* restrict V_gpu, const std::complex* restrict Ainv); + + +template +std::complex computeLogDet_sycl(sycl::queue& aq, + int n, + int lda, + const TMAT* restrict a, + const INDEX* restrict pivot, + const std::vector& dependencies) +{ + constexpr size_t COLBS = 128; + + std::complex result{}; + { + sycl::buffer, 1> abuff(&result, {1}); + aq.submit([&](sycl::handler& cgh) { + cgh.depends_on(dependencies); + + size_t n_max = ((n + COLBS - 1) / COLBS) * COLBS; + sycl::global_ptr A{a}; + sycl::global_ptr Pivot{pivot}; + cgh.parallel_for(sycl::range<1>{n_max}, sycl::reduction(abuff, cgh, {T{}, T{}}, std::plus>()), + [=](sycl::id<1> i, auto& sum) { + std::complex val{}; + //if(i((Pivot[i] == i + 1) ? A[i*lda+i] : -A[i*lda+i])); + if (i < n) + val = (Pivot[i] == i + 1) ? std::log(std::complex(A[i * lda + i])) + : std::log(std::complex(-A[i * lda + i])); + sum.combine(val); + }); + }); + } //synchronous + return result; +} + +template std::complex computeLogDet_sycl(sycl::queue& aq, + int n, + int lda, + const double* restrict a, + const std::int64_t* restrict pivot, + const std::vector& dependencies); + +template std::complex computeLogDet_sycl(sycl::queue& aq, + int n, + int lda, + const std::complex* restrict a, + const std::int64_t* restrict pivot, + const std::vector& dependencies); } // namespace qmcplusplus diff --git a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp index 7baec66af7..23bcaa774a 100644 --- a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp +++ b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp @@ -20,7 +20,6 @@ namespace qmcplusplus { - template sycl::event applyW_stageV_sycl(sycl::queue& aq, const std::vector& dependencies, @@ -31,5 +30,14 @@ sycl::event applyW_stageV_sycl(sycl::queue& aq, const int ndelay, T* V_gpu, const T* Ainv); -} + +template +std::complex computeLogDet_sycl(sycl::queue& aq, + int n, + int lda, + const TMAT* a, + const INDEX* pivot, + const std::vector& dependencies = {}); + +} // namespace qmcplusplus #endif diff --git a/src/QMCWaveFunctions/tests/CMakeLists.txt b/src/QMCWaveFunctions/tests/CMakeLists.txt index 97a6d20717..c0d82bce66 100644 --- a/src/QMCWaveFunctions/tests/CMakeLists.txt +++ b/src/QMCWaveFunctions/tests/CMakeLists.txt @@ -59,7 +59,7 @@ endforeach() if(NOT QMC_CUDA) if(NOT QMC_COMPLEX) - set(MO_SRCS test_MO.cpp test_multiquintic_spline.cpp test_cartesian_ao.cpp) + set(MO_SRCS test_MO.cpp test_multiquintic_spline.cpp test_cartesian_ao.cpp test_RotatedSPOs_LCAO.cpp) if(NOT QMC_MIXED_PRECISION) set(MO_SRCS ${MO_SRCS} test_soa_cusp_corr.cpp) endif() diff --git a/src/QMCWaveFunctions/tests/gen_rotated_lcao_wf.py b/src/QMCWaveFunctions/tests/gen_rotated_lcao_wf.py new file mode 100644 index 0000000000..cb066cfbb5 --- /dev/null +++ b/src/QMCWaveFunctions/tests/gen_rotated_lcao_wf.py @@ -0,0 +1,323 @@ + +# Helium atom with a combination of two orbitals and simple jastrow factor + +# Uses automatic differentiation via the autograd package to +# compute spatial and parameter derivatives +import autograd.numpy as np +from autograd import hessian,grad +from stats import averager + +# Point values used in test_RotatedSPOs_LCAO.cpp +# QMC values used to validate tests/molecules/He_param/He_orb_rot_param_grad_legacy + + +def mag(r): + return np.sqrt(r[0]*r[0] + r[1]*r[1] + r[2]*r[2]) + +# normalized STO's correspond to the 'normalized="no"' part of the input +# + + +def sto_norm1(zeta): + return 2*np.sqrt(zeta**3) + +def sto_norm2(zeta): + return 2*np.sqrt(3)*np.sqrt(zeta**5)/3 + + +def orb1(R): + r = mag(R) + Z = 2.0 + y00 = 1/np.sqrt(4 * np.pi) + snorm1 = sto_norm1(Z) + return y00 * snorm1 * np.exp(-Z*r) + +def orb2(R): + r = mag(R) + zeta = 1.0 + y00 = 1/np.sqrt(4*np.pi) + snorm2 = sto_norm2(zeta) + return snorm2* y00 * r* np.exp(-zeta*r) + +def rot_orb(R, theta): + return orb1(R) * np.cos(theta) + orb2(R) * np.sin(theta) + + +def jastrow(r12, B): + A = 0.5 + return np.exp(A*r12/(1.0 + B*r12) - A/B) + +def psi_no_jastrow(r1, r2, VP): + theta1 = VP[0] + theta2 = VP[1] + o1 = rot_orb(r1,theta1) + o2 = rot_orb(r2,theta2) + return o1*o2 + +def psi_with_jastrow(r1, r2, VP): + theta1 = VP[0] + theta2 = VP[1] + B = VP[2] + o1 = rot_orb(r1,theta1) + o2 = rot_orb(r2,theta2) + r12 = r2 - r1 + j = jastrow(r12, B) + return o1*o2*j + +def psi(r1, r2, VP): + global use_jastrow + theta1 = VP[0] + theta2 = VP[1] + j = 1.0 + if use_jastrow: + B = VP[2] + r12 = mag(r2 - r1) + j = jastrow(r12, B) + + o1 = rot_orb(r1,theta1) + o2 = rot_orb(r2,theta2) + return o1*o2*j + +def log_psi(r1, r2, B): + return np.log(psi(r1, r2, B)) + + +# Spatial derivatives +hess0 = hessian(psi, 0) +hess1 = hessian(psi, 1) + + +hess_log_0 = hessian(log_psi, 0) +hess_log_1 = hessian(log_psi, 1) + +grad0 = grad(psi, 0) +grad1 = grad(psi, 1) + +# Derivative wrt parameters +dpsi = grad(psi, 2) + +def lap0(r1, r2, VP): + h0 = np.sum(np.diag(hess_log_0(r1, r2, VP))) + return h0 + +def lap1(r1, r2, VP): + h1 = np.sum(np.diag(hess_log_1(r1, r2, VP))) + return h1 + +def lap(r1, r2, VP): + h0 = np.sum(np.diag(hess0(r1, r2, VP))) + h1 = np.sum(np.diag(hess1(r1, r2, VP))) + return h0 + h1 + +def en_pot(r1, r2): + r1_mag = mag(r1) + r2_mag = mag(r2) + Z = 2.0 + return -Z/r1_mag - Z/r2_mag + +def ee_pot(r1, r2): + r12 = r2 - r1 + r12_mag = mag(r12) + return 1.0/r12_mag + + +def local_energy(r1, r2, VP): + pot = en_pot(r1, r2) + ee_pot(r1, r2) + psi_val = psi(r1, r2, VP) + lapl = lap(r1, r2, VP) + + h = -0.5*lapl/psi_val + pot + return h + +dlocal_energy = grad(local_energy, 2) + + + +def print_wf_values(theta1=0.0, theta2=0.0, use_j=False, B=0.0): + global use_jastrow + + # Adjust numpy output so arrays are printed with higher precision + float_formatter = "{:.15g}".format + np.set_printoptions(formatter={'float_kind':float_formatter}) + + if use_j: + use_jastrow = True + VP = np.array([theta1, theta2, B]) + print("Values for theta = ",theta1,theta2," and jastrow B = ",B) + else: + use_jastrow = False + VP = np.array([theta1, theta2]) + print("Values for theta = ",theta1,theta2," and no jastrow") + + + + r1 = np.array([1.0, 2.0, 3.0]) + r2 = np.array([0.0, 1.1, 2.2]) + + psi_val = psi(r1, r2, VP) + print(" wf = ",psi_val," log wf = ",np.log(np.abs(psi_val))) + + g0 = grad0(r1, r2, VP)/psi_val + print(" grad/psi for particle 0 = ",g0[0],g0[1],g0[2]) + + # Using the laplacian of log psi to match internal QMCPACK values + lap_0 = lap0(r1, r2, VP) + print(" laplacian of log psi for particle 0 = ",lap_0) + + lap_1 = lap1(r1, r2, VP) + print(" laplacian for log psi particle 1 = ",lap_1) + + eloc = local_energy(r1, r2, VP) + print(" local energy = ",eloc) + + dp = dpsi(r1, r2, VP) + print(" parameter derivative of log psi = ",dp / psi_val) + + deloc = dlocal_energy(r1, r2, VP) + print(" parameter derivative of local energy = ",deloc) + + print("") + + +# Generate the wavefunction values for a single set of electron positions +# used in test_RotatedSPOs_LCAO.cpp + +def print_point_values(): + r1 = np.array([1.0, 2.0, 3.0]) + r2 = np.array([0.0, 1.1, 2.2]) + + print_wf_values(theta1=0.1, theta2=0.2) + + print_wf_values(theta1=0.0, theta2=0.0) + + print_wf_values(theta1=0.0, theta2=0.0, use_j=True, B=0.1) + +def run_qmc(VP): + r1 = np.array([1.0, 2.0, 3.0]) + r2 = np.array([0.0, 1.1, 2.2]) + + # Outer loop for statistics + nblock = 20 + + # Loop to gather data + nstep = 100 + + # Loop for decorrelation + nsubstep = 10 + + nelectron = 2 + + # Step size for trial move + delta = 1.2 + + r = np.zeros((2,3)) + r[0,:] = r1 + r[1,:] = r2 + + r_old = np.zeros(3) + r_trial = np.zeros(3) + + block_dp_ave = averager() + total_en_ave = averager() + + # Outer loop for statistics + for nb in range(nblock): + + naccept = 0 + en_ave = averager() + dpsi_ave = averager() + deloc_ave = averager() + eloc_dpsi_ave = averager() + + # Loop to gather data + for ns in range(nstep): + # Loop for decorrelation + for nss in range(nsubstep): + # Loop over electrons + for ne in range(nelectron): + + wf_old = psi(r[0,:], r[1,:], VP) + r_old[:] = r[ne,:] + + change = delta*(np.random.rand(3)-0.5) + + r_trial[:] = r_old[:] + change + r[ne,:] = r_trial[:] + + wf_new = psi(r[0,:], r[1,:], VP) + + wf_ratio = (wf_new/wf_old)**2 + + if wf_ratio > np.random.random(): + naccept += 1 + else: + r[ne,:] = r_old[:] + + eloc = local_energy(r[0,:], r[1,:], VP) + en_ave.add_value(eloc) + + psi_val = psi(r[0,:], r[1,:], VP) + dpsi_val = dpsi(r[0,:], r[1,:], VP) + dpsi_ave.add_value(dpsi_val/psi_val) + + deloc_val = dlocal_energy(r[0,:], r[1,:], VP) + deloc_ave.add_value(deloc_val) + + eloc_dpsi_ave.add_value(eloc * dpsi_val/psi_val) + + + en = en_ave.average() + en_err = en_ave.error() + ar = naccept/(nstep*nsubstep*nelectron) + print('block = ',nb, ' energy = ',en,en_err,' acceptance ratio = ',ar) + + dp = dpsi_ave.average() + dp_err = dpsi_ave.error() + + deloc = deloc_ave.average() + deloc_err = deloc_ave.error() + + eloc_dpsi = eloc_dpsi_ave.average() + eloc_dpsi_err = eloc_dpsi_ave.error() + + # For the parameter derivative formula, see + # https://github.com/QMCPACK/qmc_algorithms/blob/master/Variational/Parameter_Optimization.ipynb + + dg = deloc + 2*eloc_dpsi - 2*en*dp + block_dp_ave.add_value(dg) + + dg = block_dp_ave.average() + dg_err = block_dp_ave.error() + print('parameter values = ',VP) + print('parameter derivatives = ',dg) + print('parameter derivative errors = ',dg_err) + + +def run_qmc_parameter_derivatives(): + global use_jastrow + use_jastrow=False + theta1 = 0.1 + theta2 = 0.1 + VP = np.array([theta1, theta2]) + + run_qmc(VP) + +# Some results from run_qmc_parameter_derivatives + +# Run took about 10 minutes on laptop +# nblock=20, nstep=1000, nsubstep=10 +# parameter values = [0.1 0.1] +# parameter derivatives = [-0.20164722 -0.18347461] +# parameter derivative errors = [0.01201481 0.01314164] + +# Run took about 40 minutes on laptop +# nblock=40, nstep=2000, nsubstep=10 +# parameter values = [0.1 0.1] +# parameter derivatives = [-0.2204924 -0.21471184] +# parameter derivative errors = [0.00493837 0.00571082] + + +if __name__=='__main__': + #print_point_values() + run_qmc_parameter_derivatives() + diff --git a/src/QMCWaveFunctions/tests/stats.py b/src/QMCWaveFunctions/tests/stats.py new file mode 100644 index 0000000000..61794a9a3c --- /dev/null +++ b/src/QMCWaveFunctions/tests/stats.py @@ -0,0 +1,43 @@ + +import math +import numpy as np + +class averager: + """Compute average, variance, and standard error for a set of data""" + def __init__(self): + self.sum = 0.0 + self.sum_sq = 0.0 + self.norm = 0 + + def add_value(self,v): + self.sum += v + self.sum_sq += v*v + self.norm += 1 + + def average(self): + if (self.norm == 0): + return 0.0 + else: + return self.sum/self.norm + + def variance(self): + var = 0.0 + if (self.norm != 0): + var = (self.sum_sq - self.sum*self.sum/self.norm)/self.norm + return var + + def std_dev(self): + return math.sqrt(self.variance()) + + def error(self): + err = 0.0 + if (self.norm > 1): + var = self.variance() + err = np.sqrt(var/(self.norm-1)) + return err + + def merge(self, other): + self.sum += other.sum + self.sum_sq += other.sum_sq + self.norm += other.norm + diff --git a/src/QMCWaveFunctions/tests/test_RotatedSPOs_LCAO.cpp b/src/QMCWaveFunctions/tests/test_RotatedSPOs_LCAO.cpp new file mode 100644 index 0000000000..d68ad67bac --- /dev/null +++ b/src/QMCWaveFunctions/tests/test_RotatedSPOs_LCAO.cpp @@ -0,0 +1,364 @@ +////////////////////////////////////////////////////////////////////////////////////// +// This file is distributed under the University of Illinois/NCSA Open Source License. +// See LICENSE file in top directory for details. +// +// Copyright (c) 2022 QMCPACK developers. +// +// File developed by: Mark Dewing, mdewing@anl.gov Argonne National Laboratory +// +// File created by: Mark Dewing, mdewing@anl.gov Argonne National Laboratory +////////////////////////////////////////////////////////////////////////////////////// + + +#include "catch.hpp" + + +#include "Message/Communicate.h" +#include "OhmmsData/Libxml2Doc.h" +#include "Particle/ParticleSetPool.h" +#include "QMCWaveFunctions/WaveFunctionPool.h" + + +#include +#include +#include + + +// Reference values from gen_rotated_lcao_wf.py + + +namespace qmcplusplus +{ +void setupParticleSetPool(ParticleSetPool& pp) +{ + // See ParticleIO/tests/test_xml_io.cpp for particle parsing + const char* particles = R"( + + + 2 + + + 0.0 0.0 0.0 + + + + + + -1 + + 1.0 2.0 3.0 + + + + -1 + + 0.0 1.1 2.2 + + + +)"; + Libxml2Document doc; + + bool okay = doc.parseFromString(particles); + REQUIRE(okay); + + xmlNodePtr root = doc.getRoot(); + + xmlNodePtr part_ion = xmlFirstElementChild(root); + pp.put(part_ion); + xmlNodePtr part_elec = xmlNextElementSibling(part_ion); + pp.put(part_elec); +} + + +// No Jastrow, rotation angle theta1=0.1 and theta2=0.2 +TEST_CASE("Rotated LCAO WF1", "[qmcapp]") +{ + Communicate* c; + c = OHMMS::Controller; + + ParticleSetPool pp(c); + setupParticleSetPool(pp); + + WaveFunctionPool wp(pp, c); + + REQUIRE(wp.empty() == true); + + + const char* wf_input = R"( + + + + + + + + + + + + + + 0.1 + + 1.0 0.0 + 0.0 1.0 + + + + 0.2 + + 1.0 0.0 + 0.0 1.0 + + + + + + + + + + )"; + + Libxml2Document doc; + bool okay = doc.parseFromString(wf_input); + REQUIRE(okay); + + xmlNodePtr root = doc.getRoot(); + + wp.put(root); + + TrialWaveFunction* psi = wp.getWaveFunction("psi0"); + REQUIRE(psi != nullptr); + REQUIRE(psi->getOrbitals().size() == 1); + + opt_variables_type opt_vars; + psi->checkInVariables(opt_vars); + psi->checkOutVariables(opt_vars); + psi->resetParameters(opt_vars); + + ParticleSet* elec = pp.getParticleSet("e"); + elec->update(); + + + double logval = psi->evaluateLog(*elec); + CHECK(logval == Approx(-9.26625670653773)); + + CHECK(elec->G[0][0] == ValueApprox(-0.2758747113720909)); + CHECK(elec->L[0] == ValueApprox(-0.316459652026054)); + CHECK(elec->L[1] == ValueApprox(-0.6035591598540904)); + + using ValueType = QMCTraits::ValueType; + std::vector dlogpsi(2); + std::vector dhpsioverpsi(2); + psi->evaluateDerivatives(*elec, opt_vars, dlogpsi, dhpsioverpsi); + + + CHECK(dlogpsi[0] == ValueApprox(7.58753078998516)); + CHECK(dlogpsi[1] == ValueApprox(2.58896036829191)); + CHECK(dhpsioverpsi[0] == ValueApprox(2.59551625714144)); + CHECK(dhpsioverpsi[1] == ValueApprox(1.70071425070404)); +} + +// No Jastrow, rotation angle of 0 +TEST_CASE("Rotated LCAO WF zero angle", "[qmcapp]") +{ + Communicate* c; + c = OHMMS::Controller; + + ParticleSetPool pp(c); + setupParticleSetPool(pp); + + WaveFunctionPool wp(pp, c); + + REQUIRE(wp.empty() == true); + + + const char* wf_input = R"( + + + + + + + + + + + + + + + 1.0 0.0 + 0.0 1.0 + + + + + 1.0 0.0 + 0.0 1.0 + + + + + + + + + + )"; + + Libxml2Document doc; + bool okay = doc.parseFromString(wf_input); + REQUIRE(okay); + + xmlNodePtr root = doc.getRoot(); + + wp.put(root); + + TrialWaveFunction* psi = wp.getWaveFunction("psi0"); + REQUIRE(psi != nullptr); + REQUIRE(psi->getOrbitals().size() == 1); + + opt_variables_type opt_vars; + psi->checkInVariables(opt_vars); + opt_vars.resetIndex(); + psi->checkOutVariables(opt_vars); + psi->resetParameters(opt_vars); + + ParticleSet* elec = pp.getParticleSet("e"); + elec->update(); + + + double logval = psi->evaluateLog(*elec); + CHECK(logval == Approx(-11.467952668216984)); + + CHECK(elec->G[0][0] == ValueApprox(-0.5345224838248487)); + CHECK(elec->L[0] == ValueApprox(-1.0690449676496974)); + CHECK(elec->L[1] == ValueApprox(-1.626231256363484)); + + using ValueType = QMCTraits::ValueType; + std::vector dlogpsi(2); + std::vector dhpsioverpsi(2); + psi->evaluateDerivatives(*elec, opt_vars, dlogpsi, dhpsioverpsi); + + CHECK(dlogpsi[0] == ValueApprox(32.2062050179872)); + CHECK(dlogpsi[1] == ValueApprox(5.87482925187464)); + CHECK(dhpsioverpsi[0] == ValueApprox(46.0088643114103)); + CHECK(dhpsioverpsi[1] == ValueApprox(7.84119772047731)); + + RefVectorWithLeader wf_list(*psi, {*psi}); + RefVectorWithLeader p_list(*elec, {*elec}); + + // Test list with one wavefunction + + int nparam = 2; + int nentry = 1; + RecordArray dlogpsi_list(nparam, nentry); + RecordArray dhpsi_over_psi_list(nparam, nentry); + + TrialWaveFunction::mw_evaluateParameterDerivatives(wf_list, p_list, opt_vars, dlogpsi_list, dhpsi_over_psi_list); + + CHECK(dlogpsi_list.getValue(0, 0) == ValueApprox(32.2062050179872)); + CHECK(dlogpsi_list.getValue(1, 0) == ValueApprox(5.87482925187464)); + CHECK(dhpsi_over_psi_list.getValue(0, 0) == ValueApprox(46.0088643114103)); + CHECK(dhpsi_over_psi_list.getValue(1, 0) == ValueApprox(7.84119772047731)); +} + +// Rotation angle of 0 and add Jastrow factory +TEST_CASE("Rotated LCAO WF with jastrow", "[qmcapp]") +{ + Communicate* c; + c = OHMMS::Controller; + + ParticleSetPool pp(c); + setupParticleSetPool(pp); + + WaveFunctionPool wp(pp, c); + + REQUIRE(wp.empty() == true); + + + const char* wf_input = R"( + + + + + + + + + + + + + + + + 1.0 0.0 + 0.0 1.0 + + + + + 1.0 0.0 + 0.0 1.0 + + + + + + + + + + + + 0.1 + + + )"; + + Libxml2Document doc; + bool okay = doc.parseFromString(wf_input); + REQUIRE(okay); + + xmlNodePtr root = doc.getRoot(); + + wp.put(root); + + TrialWaveFunction* psi = wp.getWaveFunction("psi0"); + REQUIRE(psi != nullptr); + REQUIRE(psi->getOrbitals().size() == 2); + + opt_variables_type opt_vars; + psi->checkInVariables(opt_vars); + opt_vars.resetIndex(); + psi->checkOutVariables(opt_vars); + psi->resetParameters(opt_vars); + + ParticleSet* elec = pp.getParticleSet("e"); + elec->update(); + + + double logval = psi->evaluateLog(*elec); + CHECK(logval == Approx(-15.791249652199634)); + + CHECK(elec->G[0][0] == ValueApprox(-0.2956989647881321)); + CHECK(elec->L[0] == ValueApprox(-0.6560429678274734)); + CHECK(elec->L[1] == ValueApprox(-1.2132292565412577)); + + using ValueType = QMCTraits::ValueType; + std::vector dlogpsi(3); + std::vector dhpsioverpsi(3); + psi->evaluateDerivatives(*elec, opt_vars, dlogpsi, dhpsioverpsi); + + CHECK(dlogpsi[0] == ValueApprox(32.206205017987166)); + CHECK(dlogpsi[1] == ValueApprox(5.874829251874641)); + CHECK(dlogpsi[2] == ValueApprox(49.08414605622605)); + CHECK(dhpsioverpsi[0] == ValueApprox(32.462519534916666)); + CHECK(dhpsioverpsi[1] == ValueApprox(10.047601212881027)); + CHECK(dhpsioverpsi[2] == ValueApprox(2.0820644399551895)); +} +} // namespace qmcplusplus diff --git a/src/QMCWaveFunctions/tests/test_multi_slater_determinant.cpp b/src/QMCWaveFunctions/tests/test_multi_slater_determinant.cpp index f980fda7e9..8c222f6ca6 100644 --- a/src/QMCWaveFunctions/tests/test_multi_slater_determinant.cpp +++ b/src/QMCWaveFunctions/tests/test_multi_slater_determinant.cpp @@ -276,6 +276,43 @@ void test_LiH_msd(const std::string& spo_xml_string, CHECK(grad_new.grads_positions[1][0] == ValueApprox(-0.8633778143)); CHECK(grad_new.grads_positions[1][1] == ValueApprox(0.8245347691)); CHECK(grad_new.grads_positions[1][2] == ValueApprox(-5.1513380151)); + + std::vector isAccepted{false, true}; + TrialWaveFunction::mw_accept_rejectMove(wf_ref_list, p_ref_list, moved_elec_id, isAccepted); + ParticleSet::mw_accept_rejectMove(p_ref_list, moved_elec_id, isAccepted); + + CHECK(wf_ref_list[0].getLogPsi() == Approx(-7.803347327300152)); + CHECK(wf_ref_list[1].getLogPsi() == Approx(-7.321765331299484)); + + // move the next electron + TrialWaveFunction::mw_prepareGroup(wf_ref_list, p_ref_list, 1); + const int moved_elec_id_next = 2; + TrialWaveFunction::mw_evalGrad(wf_ref_list, p_ref_list, moved_elec_id_next, grad_old); + + CHECK(grad_old.grads_positions[0][0] == ValueApprox(1.3325558736)); + CHECK(grad_old.grads_positions[0][1] == ValueApprox(1.3327966725)); + CHECK(grad_old.grads_positions[0][2] == ValueApprox(1.2157689586)); + CHECK(grad_old.grads_positions[1][0] == ValueApprox(1.3222514142)); + CHECK(grad_old.grads_positions[1][1] == ValueApprox(1.3230108868)); + CHECK(grad_old.grads_positions[1][2] == ValueApprox(1.2035047435)); + + ParticleSet::mw_makeMove(p_ref_list, moved_elec_id_next, displ); + TrialWaveFunction::mw_calcRatio(wf_ref_list, p_ref_list, moved_elec_id_next, ratios); + + CHECK(ratios[0] == ValueApprox(PsiValueType(2.1080036144))); + CHECK(ratios[1] == ValueApprox(PsiValueType(0.4947158435))); + + TrialWaveFunction::mw_calcRatioGrad(wf_ref_list, p_ref_list, moved_elec_id_next, ratios, grad_new); + + CHECK(ratios[0] == ValueApprox(PsiValueType(2.1080036144))); + CHECK(ratios[1] == ValueApprox(PsiValueType(0.4947158435))); + + CHECK(grad_new.grads_positions[0][0] == ValueApprox(1.8412365668)); + CHECK(grad_new.grads_positions[0][1] == ValueApprox(1.3736370007)); + CHECK(grad_new.grads_positions[0][2] == ValueApprox(0.8043818454)); + CHECK(grad_new.grads_positions[1][0] == ValueApprox(1.3553132105)); + CHECK(grad_new.grads_positions[1][1] == ValueApprox(1.5552132255)); + CHECK(grad_new.grads_positions[1][2] == ValueApprox(0.804301246)); } } diff --git a/src/QMCWaveFunctions/tests/test_syclSolverInverter.cpp b/src/QMCWaveFunctions/tests/test_syclSolverInverter.cpp new file mode 100644 index 0000000000..0c6ecfcc20 --- /dev/null +++ b/src/QMCWaveFunctions/tests/test_syclSolverInverter.cpp @@ -0,0 +1,65 @@ +////////////////////////////////////////////////////////////////////////////////////// +// This file is distributed under the University of Illinois/NCSA Open Source License. +// See LICENSE file in top directory for details. +// +// Copyright (c) 2021 QMCPACK developers. +// +// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory +// +// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory +////////////////////////////////////////////////////////////////////////////////////// + +#include "catch.hpp" + +#include "Configuration.h" +#include "CPU/SIMD/aligned_allocator.hpp" +#include "QMCWaveFunctions/Fermion/syclSolverInverter.hpp" +#include "QMCWaveFunctions/Fermion/DiracMatrix.h" +#include "Utilities/for_testing/checkMatrix.hpp" +#include "Containers/tests/makeRngSpdMatrix.hpp" + +namespace qmcplusplus +{ +template +void test_inverse(const std::int64_t N) +{ + sycl::queue m_queue = getSYCLDefaultDeviceDefaultQueue(); + + syclSolverInverter solver; + + Matrix m(N, N); + Matrix m_invT(N, N); + Matrix m_invT_CPU(N, N); + Matrix> m_invGPU; + m_invGPU.resize(N, N); + + std::complex log_value, log_value_cpu; + testing::MakeRngSpdMatrix makeRngSpdMatrix{}; + makeRngSpdMatrix(m); + + solver.invert_transpose(m, m_invT, m_invGPU, log_value, m_queue); + m_queue.wait(); + + DiracMatrix dmat; + dmat.invert_transpose(m, m_invT_CPU, log_value_cpu); + + REQUIRE(log_value == ComplexApprox(log_value_cpu)); + + auto check_matrix_result = checkMatrix(m_invT, m_invT_CPU); + CHECKED_ELSE(check_matrix_result.result) { FAIL(check_matrix_result.result_message); } +} + +TEST_CASE("OmpSYCL syclSolverInverter", "[SYCL]") +{ + const int N = 911; + +#ifdef MIXED_PRECISION + std::cout << "Testing Inverse for mixed precision " << std::endl; + test_inverse(N); +#else + std::cout << "Testing Inverse for double double " << std::endl; + test_inverse(N); +#endif +} + +} // namespace qmcplusplus diff --git a/tests/molecules/He_param/CMakeLists.txt b/tests/molecules/He_param/CMakeLists.txt index 02affdc006..c77a43a599 100644 --- a/tests/molecules/He_param/CMakeLists.txt +++ b/tests/molecules/He_param/CMakeLists.txt @@ -78,6 +78,32 @@ if(NOT QMC_CUDA) endif() + # Orbital Rotation + + # Two atomic orbitals, no jastrow + + list(APPEND HE_ORB_ROT_PARAM spo-up_orb_rot_0000_0001 -0.218 0.01) # scalar name, value, error + list(APPEND HE_ORB_ROT_PARAM spo-down_orb_rot_0000_0001 -0.218 0.01) # scalar name, value, error + + qmc_run_and_check_custom_scalar( + BASE_NAME + He_orb_rot_param_grad_legacy + BASE_DIR + "${qmcpack_SOURCE_DIR}/tests/molecules/He_param" + PREFIX + He_orb_rot_param_grad_legacy.param + INPUT_FILE + He_orb_rot_param_grad_legacy.xml + PROCS + 1 + THREADS + 1 + SERIES + 0 + SCALAR_VALUES + HE_ORB_ROT_PARAM) + + else() message(VERBOSE "Skipping He_param tests because parameter output is not supported by mixed precison build (QMC_MIXED_PRECISION=1)") endif() diff --git a/tests/molecules/He_param/He_orb_rot_param_grad_legacy.xml b/tests/molecules/He_param/He_orb_rot_param_grad_legacy.xml new file mode 100644 index 0000000000..71b42b5b81 --- /dev/null +++ b/tests/molecules/He_param/He_orb_rot_param_grad_legacy.xml @@ -0,0 +1,100 @@ + + + + legacy + + + + + + + 2 + + + 0.0 0.0 0.0 + + + + + + + + -1 + + + -1 + + + + + + + + + + + + + + + + + + + + 0.1 + + 1.0 0.0 + 0.0 1.0 + + + + 0.1 + + 1.0 0.0 + 0.0 1.0 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + yes + + 100 + + 25 + 10 + 20 + 0.5 + 1000 + 1.0 + 0.00 + + + + + diff --git a/tests/solids/bccMo_2x1x1_SOREP/CMakeLists.txt b/tests/solids/bccMo_2x1x1_SOREP/CMakeLists.txt index fee8c897d9..c130bbde4b 100644 --- a/tests/solids/bccMo_2x1x1_SOREP/CMakeLists.txt +++ b/tests/solids/bccMo_2x1x1_SOREP/CMakeLists.txt @@ -80,11 +80,11 @@ if(QMC_COMPLEX) #Deterministic test if(NOT QMC_MIXED_PRECISION) - list(APPEND DET_BCC_MO_VMC_SCALARS "totenergy" "-136.89324595 0.000001") + list(APPEND DET_BCC_MO_VMC_SCALARS "totenergy" "-136.89324595 0.000002") list(APPEND DET_BCC_MO_VMC_SCALARS "kinetic" "48.19451127 0.000001") - list(APPEND DET_BCC_MO_VMC_SCALARS "potential" "-185.08775723 0.000001") + list(APPEND DET_BCC_MO_VMC_SCALARS "potential" "-185.08775723 0.000002") list(APPEND DET_BCC_MO_VMC_SCALARS "eeenergy" "0.73302788 0.000001") - list(APPEND DET_BCC_MO_VMC_SCALARS "ionion" "-119.82754227 0.000001") + list(APPEND DET_BCC_MO_VMC_SCALARS "ionion" "-119.82754227 0.000002") list(APPEND DET_BCC_MO_VMC_SCALARS "localecp" "-90.13681859 0.000001") list(APPEND DET_BCC_MO_VMC_SCALARS "nonlocalecp" "24.16055828 0.000001") list(APPEND DET_BCC_MO_VMC_SCALARS "soecp" "-0.01698252 0.000001") @@ -102,11 +102,11 @@ if(QMC_COMPLEX) DET_BCC_MO_VMC_SCALARS # VMC ) - list(APPEND DET_BCC_MO_DMC_SCALARS "totenergy" "-135.29589113 0.000001") + list(APPEND DET_BCC_MO_DMC_SCALARS "totenergy" "-135.29589113 0.000002") list(APPEND DET_BCC_MO_DMC_SCALARS "kinetic" "29.19088964 0.000001") - list(APPEND DET_BCC_MO_DMC_SCALARS "potential" "-164.48678078 0.000001") + list(APPEND DET_BCC_MO_DMC_SCALARS "potential" "-164.48678078 0.000002") list(APPEND DET_BCC_MO_DMC_SCALARS "eeenergy" "-2.19402623 0.000001") - list(APPEND DET_BCC_MO_DMC_SCALARS "ionion" "-119.82754227 0.000001") + list(APPEND DET_BCC_MO_DMC_SCALARS "ionion" "-119.82754227 0.000002") list(APPEND DET_BCC_MO_DMC_SCALARS "localecp" "-64.57264081 0.000001") list(APPEND DET_BCC_MO_DMC_SCALARS "nonlocalecp" "22.11861933 0.000001") list(APPEND DET_BCC_MO_DMC_SCALARS "soecp" "-0.01119080 0.000001") diff --git a/tests/test_automation/Jenkinsfile_ornl_vm b/tests/test_automation/Jenkinsfile_ornl_vm deleted file mode 100644 index 66ac751dff..0000000000 --- a/tests/test_automation/Jenkinsfile_ornl_vm +++ /dev/null @@ -1,4 +0,0 @@ -library 'qmcpack_shared_jenkins' - -common_pipeline(name:"cpu", spack_path:"/var/lib/jenkins/spack", prefix:"vm") - diff --git a/tests/test_automation/README.md b/tests/test_automation/README.md index 5eddb9409d..8ed2b715c3 100644 --- a/tests/test_automation/README.md +++ b/tests/test_automation/README.md @@ -18,7 +18,4 @@ correspond to exactly those used in production, since these are sometimes tweaked for runtime, minor software version updates etc. # CI configuration -Most of these files begin with jenkins. Scrutinize any updates to them -carefully there is basically no reason they should be touched except by maintainers. -They depend on a common jenkins pipeline library pulled from QMCPACK/qmcpack_shared_jenkins. -Updates to that library don't seem to propagate faster than ~10 minutes. +See github-actions subdirectory and /.github/workflows/ diff --git a/tests/test_automation/add_fortran_for_spack_clang.py b/tests/test_automation/add_fortran_for_spack_clang.py deleted file mode 100755 index 859b8ca772..0000000000 --- a/tests/test_automation/add_fortran_for_spack_clang.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env spack python - -import spack.config -# spack uses this as well so it must be there -import argparse -import os - -parser = argparse.ArgumentParser(description='For chosen spec of gcc add gfortran as fortran compiler for all clang compiler configs') -parser.add_argument('gcc_spec', type=str, help='spack spec to get gfortran from') -args = parser.parse_args() - -conf_scope = spack.config.ConfigScope('user/linux', os.path.join(os.environ['HOME'],'.spack','linux')) -compilers_config = conf_scope.get_section_filename('compilers') - -gcc_configs = [ c for c in conf_scope.get_section('compilers')['compilers'] if args.gcc_spec in c['compiler']['spec'] ] -if len(gcc_configs) != 1: - raise NameError('gcc version must evaluate to a single gcc spec') -gcc = gcc_configs[0] - -llvm_configs = [ c for c in conf_scope.get_section('compilers')['compilers'] if "clang" in c['compiler']['spec'] ] -for lconf in llvm_configs: - lconf['compiler']['paths']['f77'] = gcc['compiler']['paths']['f77'] - lconf['compiler']['paths']['fc'] = gcc['compiler']['paths']['fc'] - lconf['compiler']['modules'].append(args.gcc_spec) - -conf_scope.write_section('compilers') diff --git a/tests/test_automation/github-actions/ci/run_step.sh b/tests/test_automation/github-actions/ci/run_step.sh index 5d15188167..1ec85c84dc 100755 --- a/tests/test_automation/github-actions/ci/run_step.sh +++ b/tests/test_automation/github-actions/ci/run_step.sh @@ -231,13 +231,21 @@ case "$1" in -DQMC_DATA=$QMC_DATA_DIR \ ${GITHUB_WORKSPACE} ;; - *"ROCm-Clang13-NoMPI-Legacy-CUDA2HIP"*) + *"ROCm-Clang13-MPI-Legacy-CUDA2HIP"*) echo 'Configure for building CUDA2HIP with clang compilers shipped with ROCM on AMD hardware' + + export OMPI_CC=/opt/rocm/llvm/bin/clang + export OMPI_CXX=/opt/rocm/llvm/bin/clang++ + + # Make current environment variables available to subsequent steps + echo "OMPI_CC=/opt/rocm/llvm/bin/clang" >> $GITHUB_ENV + echo "OMPI_CXX=/opt/rocm/llvm/bin/clang++" >> $GITHUB_ENV + cmake -GNinja \ - -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \ - -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \ + -DCMAKE_C_COMPILER=/usr/lib64/openmpi/bin/mpicc \ + -DCMAKE_CXX_COMPILER=/usr/lib64/openmpi/bin/mpicxx \ + -DMPIEXEC_EXECUTABLE=/usr/lib64/openmpi/bin/mpirun \ -DQMC_CUDA=1 \ - -DQMC_MPI=0 \ -DQMC_CUDA2HIP=ON \ -DCMAKE_PREFIX_PATH="/opt/OpenBLAS/0.3.18" \ -DQMC_COMPLEX=$IS_COMPLEX \ diff --git a/tests/test_automation/jenkins_ornl_test.sh b/tests/test_automation/jenkins_ornl_test.sh deleted file mode 100755 index ddd35608e6..0000000000 --- a/tests/test_automation/jenkins_ornl_test.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash -x - -exit_code=0 - -echo "starting spack using ${SPACK_ENV_FILE}" - -# this depends on SPACK_ROOT being set in Jenkinsfile_xxx -# it also supplies QMC_IMMUTABLE_FLAGS which makes it a bit more than the -# environment from the set of loaded spack packages. -. ${SPACK_ENV_FILE} - -cd build_${1}_${2} -BUILD_DIR=$(pwd) -echo "BUILD_DIR: ${BUILD_DIR}" - -export LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} - -module list - -# this keeps tee from eating the exit status -set -o pipefail - -# exclude failing hamiltonian test -EXCLUDE_FLAG='' -if [[ $2 == 'mixed' ]]; then - EXCLUDE_FLAG='-E unit_test_hamiltonian' -fi - - -ctest -j${JNK_THREADS} -L unit ${EXCLUDE_FLAG} --output-on-failure --timeout 120 2>&1 | tee ${1}_${2}_ctest.out -ret=$? -if [[ ${ret} -ne 0 ]] ; then - exit_code=${ret} -fi - -exit ${exit_code} diff --git a/tests/test_automation/nightly_test_scripts/nightly_alcf_hyperion.sh b/tests/test_automation/nightly_test_scripts/nightly_alcf_hyperion.sh deleted file mode 100755 index 3cee8e5e17..0000000000 --- a/tests/test_automation/nightly_test_scripts/nightly_alcf_hyperion.sh +++ /dev/null @@ -1,178 +0,0 @@ -#!/bin/bash -# source environment -# MPI wrappers, MKL, and Intel and GCC compiler -export PATH=/opt/cmake/current/bin:$PATH -source /opt/rh/devtoolset-6/enable -source /opt/intel/2018/parallel_studio_xe_2018/psxevars.sh intel64 - -export N_PROCS_BUILD=64 -export N_PROCS=64 - -# KNL NUMA + Memory Mode -# quit if in a hybrid mode, could run out of memory -if [grep -i hybrid /var/run/hwloc/knl_memoryside_cache]; then - echo Memory Mode equal Hybrid. Quitting. - exit 1 -else - cat /var/run/hwloc/knl_memoryside_cache -fi - -# timeout -timeout=1800 - -# topdir must exist, otherwise it fails -topdir=/data/ci -QMC_DATA=/data/ci/NiO -export BOOST_ROOT=/data/ci/boost_1_61_0 -testdir=${topdir}/scratch/QMCPACK_CI_BUILDS_DO_NOT_REMOVE - -if [ -e $topdir ]; then - -if [ ! -e $testdir ]; then -mkdir -p $testdir -fi - -if [ -e $testdir ]; then -cd $testdir - -# Minimize load of GitHub by maintaining a local cloned git used for all builds -if [ ! -e qmcpack ]; then -echo --- Cloning QMCPACK git `date` -git clone https://github.com/QMCPACK/qmcpack.git --depth 1 -else -cd qmcpack -echo --- Updating local QMCPACK git `date` -git pull -cd .. -fi - -# Sanity check cmake config file present -if [ -e qmcpack/CMakeLists.txt ]; then -echo --- QMCPACK git repo contains CMakeLists.txt - -# Build Quantum Espresso -# Compiled only once with the Intel Compiler -QE_VERSION=6.3 -QE_sysdir=${testdir}/intel2018_QE -QE_BIN=${QE_sysdir}/qe-${QE_VERSION}/bin -echo --- QE_BIN set to ${QE_BIN} - -# Always start from clean build, just in case we updated the QE patch. -if [ -e ${QE_sysdir} ]; then - rm -rf ${QE_sysdir} -fi - -mkdir ${QE_sysdir} - -# Download QE and patch it -cd ${QE_sysdir} -cp -p ../qmcpack/external_codes/quantum_espresso/*${QE_VERSION}* . -./download_and_patch_qe${QE_VERSION}.sh -cd qe-${QE_VERSION} - -# Hack to get QE build to build and link against proper libraries on KNL -# Eventually, Copy make.sys that is known to work. -cp /data/ci/auxfiles/configure-qe-knl-omp.sh . -cp /data/ci/auxfiles/configure-qe-libxsmm.mak . -cp /data/ci/auxfiles/configure-qe-tbbmalloc.mak . - -echo --- Configure QE ${QE_VERSION}$ -./configure-qe-knl-omp.sh -# HDF5 support in QE 6.3 is buggy, revert to older file I/O format -sed -i 's/D__HDF5/D__HDF5_C/' make.inc -echo --- Building QE ${QE_VERSION}$ -# make pwall # parallel build fails due to incorrect dependency -make -j 64 pw -make -j 64 pp - -# Make fault-tolerant, maybe QE did not download properly or there -# was a build failure -if [ -e ${QE_BIN}/pw.x ]; then - echo -- QE ${QE_VERSION} was built properly. -else - echo -- QE ${QE_VERSION} failed to build. - exit 1 -fi - - - -echo --- Starting test builds and tests - -for sys in build_gcc build_gcc_complex build_gcc_mixed build_gcc_complex_mixed build_intel2018 build_intel2018_complex build_intel2018_mixed build_intel2018_complex_mixed -do - -echo --- Building for $sys `date` - -cd ${testdir} - -if [ -e $sys ]; then -rm -rf $sys -fi -mkdir $sys -cd $sys - -# export PATH=/opt/local/bin:/opt/local/sbin:/usr/local/cuda/bin/:/usr/lib64/openmpi/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin -# export LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda-7.0/lib64 - -case $sys in - "build_gcc") - export QMCPACK_TEST_SUBMIT_NAME=GCC-MKL-SoA-Release - ctest -DQMC_COMPLEX=0 -DQMC_MIXED_PRECISION=0 -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -DQE_BIN=${QE_BIN} -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -E 'long' -VV --timeout $timeout - ;; - "build_gcc_complex") - export QMCPACK_TEST_SUBMIT_NAME=GCC-MKL-Complex-SoA-Release - ctest -DQMC_COMPLEX=1 -DQMC_MIXED_PRECISION=0 -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -DQE_BIN=${QE_BIN} -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -E 'long' -VV --timeout $timeout - ;; - "build_gcc_mixed") - export QMCPACK_TEST_SUBMIT_NAME=GCC-MKL-Mixed-SoA-Release - ctest -DQMC_COMPLEX=0 -DQMC_MIXED_PRECISION=1 -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -DQE_BIN=${QE_BIN} -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -E 'long' -VV --timeout $timeout - ;; - "build_gcc_complex_mixed") - export QMCPACK_TEST_SUBMIT_NAME=GCC-MKL-Complex-Mixed-SoA-Release - ctest -DQMC_COMPLEX=1 -DQMC_MIXED_PRECISION=1 -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -DQE_BIN=${QE_BIN} -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -E 'long' -VV --timeout $timeout - ;; - "build_intel2018") - # intel compiler should already be loaded - export QMCPACK_TEST_SUBMIT_NAME=Intel2018-SoA-Release - ctest -DQMC_COMPLEX=0 -DQMC_MIXED_PRECISION=0 -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=mpiicpc -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -DQE_BIN=${QE_BIN} -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -E 'long' -VV --timeout $timeout - ;; - "build_intel2018_complex") - # intel compiler should already be loaded - export QMCPACK_TEST_SUBMIT_NAME=Intel2018-Complex-SoA-Release - ctest -DQMC_COMPLEX=1 -DQMC_MIXED_PRECISION=0 -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=mpiicpc -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -DQE_BIN=${QE_BIN} -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -E 'long' -VV --timeout $timeout - ;; - "build_intel2018_mixed") - # intel compiler should already be loaded - export QMCPACK_TEST_SUBMIT_NAME=Intel2018-Mixed-SoA-Release - ctest -DQMC_COMPLEX=0 -DQMC_MIXED_PRECISION=1 -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=mpiicpc -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -DQE_BIN=${QE_BIN} -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -E 'long' -VV --timeout $timeout - ;; - "build_intel2018_complex_mixed") - # intel compiler should already be loaded - export QMCPACK_TEST_SUBMIT_NAME=Intel2018-Complex-Mixed-SoA-Release - ctest -DQMC_COMPLEX=1 -DQMC_MIXED_PRECISION=1 -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=mpiicpc -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -DQE_BIN=${QE_BIN} -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -E 'long' -VV --timeout $timeout - ;; - *) - echo "ERROR: Unknown build type $sys" - ;; -esac - -done # termination for $sys loop - -else - echo "ERROR: No CMakeLists. Bad git clone or update." - exit 1 -fi # termination for if block which checks CMakeLists.txt - - -else -echo "ERROR: Unable to make test directory ${testdir}" -exit 1 -fi # termination for if block which checks testing directory - -else -echo "ERROR: No top level directory" -exit 1 -fi # termination for if block which checks top level directory - - - diff --git a/tests/test_automation/nightly_test_scripts/nightly_anl_bora.sh b/tests/test_automation/nightly_test_scripts/nightly_anl_bora.sh index 0e1731e8fa..0799667cbe 100755 --- a/tests/test_automation/nightly_test_scripts/nightly_anl_bora.sh +++ b/tests/test_automation/nightly_test_scripts/nightly_anl_bora.sh @@ -13,11 +13,14 @@ fi module load cmake module load intel-mkl -module load cuda/10.1 + +module use /nfs/gce/projects/QMCPACK_dev/spack/share/spack/lmod/linux-ubuntu18.04-x86_64/Core +module load cuda/11.2.2-lo3x6k7 export TEST_SITE_NAME=bora.alcf.anl.gov export N_PROCS=16 export N_PROCS_BUILD=16 +export N_CONCURRENT_TESTS=16 # run on socket 1 NUMA_ID=0 @@ -26,7 +29,7 @@ QE_BIN=/scratch/opt/qe-stable/qe-6.4.1/bin QMC_DATA=/scratch/opt/h5data #Must be an absolute path -place=/scratch/QMCPACK_CI_BUILDS_DO_NOT_REMOVE +place=/scratch2/QMCPACK_CI_BUILDS_DO_NOT_REMOVE if [ ! -e $place ]; then mkdir $place @@ -56,9 +59,11 @@ cd $entry git checkout $branch -for sys in ClangDev-Offload-CUDA-Real ClangDev-Offload-CUDA-Complex ClangDev-Offload-CUDA-Real-Mixed ClangDev-Offload-CUDA-Complex-Mixed \ - Intel18-Real Intel18-Real-Mixed Intel18-Complex Intel18-Complex-Mixed \ - Intel18-Real-Mixed-CUDA2 Intel18-Complex-Mixed-CUDA2 Intel18-Real-Mixed-legacy-CUDA Intel18-Complex-Mixed-legacy-CUDA +for sys in ROCm-CUDA2HIP-Real ROCm-CUDA2HIP-Complex ROCm-CUDA2HIP-Real-Mixed ROCm-CUDA2HIP-Complex-Mixed \ + Intel19-Real Intel19-Real-Mixed Intel19-Complex Intel19-Complex-Mixed \ + ClangDev-Offload-Real ClangDev-Offload-Complex ClangDev-Offload-Real-Mixed ClangDev-Offload-Complex-Mixed \ + ClangDev-Offload-CUDA-Real ClangDev-Offload-CUDA-Complex ClangDev-Offload-CUDA-Real-Mixed ClangDev-Offload-CUDA-Complex-Mixed \ + Intel19-CUDA2-Real-Mixed Intel19-CUDA2-Complex-Mixed Intel19-legacy-CUDA-Real-Mixed Intel19-legacy-CUDA-Complex-Mixed do echo --- Building for $sys `date` @@ -78,42 +83,70 @@ if [[ $sys == *"ClangDev"* ]]; then #define and load compiler module load llvm/master-latest module load openmpi/4.0.2-llvm - export CC=mpicc - export CXX=mpicxx - CTEST_FLAGS="$CTEST_FLAGS -DCMAKE_C_FLAGS=-march=skylake -DCMAKE_CXX_FLAGS=-march=skylake" if [[ $sys == *"Offload-CUDA"* ]]; then - CTEST_FLAGS="$CTEST_FLAGS -DQMC_OPTIONS='-DENABLE_OFFLOAD=ON;-DUSE_OBJECT_TARGET=ON;-DCMAKE_CUDA_HOST_COMPILER=`which gcc`;-DQMC_NIO_MAX_SIZE=16'" - CTEST_FLAGS="$CTEST_FLAGS -DENABLE_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES=61" - CTEST_LABELS="-L deterministic -LE unstable" - export N_CONCURRENT_TESTS=4 + CTEST_FLAGS="$CTEST_FLAGS -DQMC_OPTIONS='-DENABLE_OFFLOAD=ON;-DUSE_OBJECT_TARGET=ON;-DQMC_NIO_MAX_SIZE=8'" + CTEST_FLAGS="$CTEST_FLAGS -DENABLE_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=61 -DBUILD_AFQMC=ON" + elif [[ $sys == *"Offload"* ]]; then + CTEST_FLAGS="$CTEST_FLAGS -DQMC_OPTIONS='-DENABLE_OFFLOAD=ON;-DOFFLOAD_ARCH=sm_61;-DUSE_OBJECT_TARGET=ON;-DQMC_NIO_MAX_SIZE=8'" + CTEST_FLAGS="$CTEST_FLAGS -DBUILD_AFQMC=ON" else CTEST_FLAGS="$CTEST_FLAGS -DQE_BIN=$QE_BIN" - CTEST_LABELS="-L 'deterministic|performance' -LE unstable" - export N_CONCURRENT_TESTS=16 + fi + + export LIBOMP_USE_HIDDEN_HELPER_TASK=OFF + if [[ $sys == *"Real-Mixed"* ]]; then + #CTEST_LABELS="-E 'long|short-LiH|short-bccH_1x1x1|short-H2O_dimer|short-diamondC_1x1x1|short-diamondC_2x1x1_pp-dmc|short-li2_sto|short-NiO_a4_e48_pp'" + CTEST_LABELS="-E 'restart|save|long|developer|He_param|example|cpu_limit|short-LiH|short-bccH_1x1x1|short-H2O_dimer|short-diamondC_1x1x1|short-diamondC_2x1x1_pp-dmc|short-li2_sto|short-NiO_a4_e48_pp|short-C'" + else + CTEST_LABELS="-L 'deterministic|QMCPACK' -LE unstable -E long" + fi +elif [[ $sys == *"ROCm"* ]]; then + #define and load compiler + module load rocm/afar001-432 aomp/afar001-432 + module load openmpi/4.0.2-llvm + + if [[ $sys == *"Offload-CUDA2HIP"* ]]; then + CTEST_FLAGS="$CTEST_FLAGS -DQMC_OPTIONS='-DHIP_ARCH=gfx1030;-DENABLE_OFFLOAD=ON;-DQMC_NIO_MAX_SIZE=8'" + CTEST_FLAGS="$CTEST_FLAGS -DENABLE_CUDA=ON -DQMC_CUDA2HIP=ON" + elif [[ $sys == *"CUDA2HIP"* ]]; then + CTEST_FLAGS="$CTEST_FLAGS -DQMC_OPTIONS='-DHIP_ARCH=gfx1030;-DQMC_NIO_MAX_SIZE=8'" + CTEST_FLAGS="$CTEST_FLAGS -DENABLE_CUDA=ON -DQMC_CUDA2HIP=ON" + elif [[ $sys == *"Offload"* ]]; then + CTEST_FLAGS="$CTEST_FLAGS -DQMC_OPTIONS='-DENABLE_OFFLOAD=ON;-DOFFLOAD_TARGET=amdgcn-amd-amdhsa;-DOFFLOAD_ARCH=gfx1030;-DQMC_NIO_MAX_SIZE=8'" + else + CTEST_FLAGS="$CTEST_FLAGS -DQE_BIN=$QE_BIN" + fi + + if [[ $sys == *"Real-Mixed"* ]]; then + #CTEST_LABELS="-E 'long|short-LiH|short-bccH_1x1x1|short-H2O_dimer|short-diamondC_1x1x1|short-diamondC_2x1x1_pp-dmc|short-li2_sto|short-NiO_a4_e48_pp'" + CTEST_LABELS="-E 'restart|save|long|developer|He_param|example|cpu_limit|short-LiH|short-bccH_1x1x1|short-H2O_dimer|short-diamondC_1x1x1|short-diamondC_2x1x1_pp-dmc|short-li2_sto|short-NiO_a4_e48_pp|short-C'" + else + CTEST_LABELS="-L 'deterministic|QMCPACK' -LE unstable -E long" fi elif [[ $sys == *"Intel"* ]]; then #define and load compiler - module load intel/18.4 + module load intel/20.2 module load openmpi/4.0.2-intel - export CC=mpicc - export CXX=mpicxx CTEST_FLAGS="$CTEST_FLAGS -DCMAKE_C_FLAGS=-xCOMMON-AVX512 -DCMAKE_CXX_FLAGS=-xCOMMON-AVX512" if [[ $sys == *"-CUDA2"* ]]; then CTEST_FLAGS="$CTEST_FLAGS -DENABLE_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES=61 -DQMC_OPTIONS='-DQMC_NIO_MAX_SIZE=16'" - CTEST_LABELS="-L 'deterministic|performance' -LE unstable" + CTEST_LABELS="-L 'deterministic|performance' -LE unstable -E long" export N_CONCURRENT_TESTS=4 elif [[ $sys == *"-legacy-CUDA"* ]]; then CTEST_FLAGS="$CTEST_FLAGS -DQMC_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES=61 -DQMC_OPTIONS='-DQMC_NIO_MAX_SIZE=16'" - CTEST_LABELS="-L 'deterministic|performance' -LE unstable" + CTEST_LABELS="-L 'deterministic|performance' -LE unstable -E long" export N_CONCURRENT_TESTS=1 else CTEST_FLAGS="$CTEST_FLAGS -DQE_BIN=$QE_BIN" + CTEST_LABELS="-E long" export N_CONCURRENT_TESTS=16 fi fi +CTEST_FLAGS="-DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx $CTEST_FLAGS" + # compiler independent options if [[ $sys == *"-Complex"* ]]; then CTEST_FLAGS="$CTEST_FLAGS -DQMC_COMPLEX=1" @@ -132,8 +165,15 @@ fi mkdir $folder cd $folder +logfile=$place/log/$entry/$mydate/${QMCPACK_TEST_SUBMIT_NAME}.log +echo "$CTEST_FLAGS $CTEST_LABELS" | tee $logfile +echo PATH=$PATH >> $logfile +echo OMPI_CXX=${OMPI_CXX} >> $logfile +mpicxx -v >> $logfile +echo >> $logfile + numactl -N $NUMA_ID \ -ctest $CTEST_FLAGS $CTEST_LABELS -S $PWD/../CMake/ctest_script.cmake,release -VV -E 'long' --timeout 800 &> $place/log/$entry/$mydate/${QMCPACK_TEST_SUBMIT_NAME}.log +ctest $CTEST_FLAGS $CTEST_LABELS -S $PWD/../CMake/ctest_script.cmake,release -VV --timeout 800 &>> $logfile cd .. echo --- Finished $sys `date` diff --git a/tests/test_automation/spack_supported_package_versions.sh b/tests/test_automation/spack_supported_package_versions.sh deleted file mode 100644 index e16336bbc8..0000000000 --- a/tests/test_automation/spack_supported_package_versions.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash - -# GCC -# Dates at https://gcc.gnu.org/releases.html -gcc_vnew=9.2.0 # 2019-08-12 -gcc_vold=7.3.0 # 2018-01-25 - -#For Intel: -gcc_vintel=7.4.0 # 2018-12-06 - -#PGI 19.4 -# makelocalrc configured with 8.3.0 currently -gcc_vpgi=8.3.0 # 2019-02-22 - -# For CUDA toolkit compatibility -gcc_vcuda=8.3.0 # 2019-02-22 - -# LLVM -# Dates at http://releases.llvm.org/ -llvm_vnew=9.0.0 # 2019-09-19 -llvm_vold=5.0.1 # 2017-12-21 -# for CUDA 10.1 update 2 -llvm_vcuda=8.0.0 # 2019-03- - -# HDF5 -hdf5_vnew=1.10.5 # Releeased 2019-02-28 -hdf5_vold=1.8.19 # Released 2017-06-16 - -# CMake -# Dates at https://cmake.org/files/ -cmake_vnew=3.16.2 # Released 2019-12-19 -cmake_vold=3.10.2 # Released 2018-01-18 - -# OpenMPI -# Dates at https://www.open-mpi.org/software/ompi/v4.0/ -ompi_vnew=4.0.2 # Released 2019-10-07 -ompi_vold=2.1.2 # Released 2017-09-20 - -libxml2_vnew=2.9.9 # Released 2019-01-03 See http://xmlsoft.org/sources/ -libxml2_vold=2.9.1 # Released 2013-04-19 - -# FFTW -# Dates at http://www.fftw.org/release-notes.html -fftw_vnew=3.3.8 # Released 2018-05-28 -fftw_vold=3.3.4 # Released 2014-03-16 - -# BOOST -# Dates at https://www.boost.org/users/history/ -boost_vnew=1.70.0 # Released 2019-04-12 -boost_vold=1.65.1 # Released 2016-05-13 diff --git a/tests/test_automation/start_spack_env.sh b/tests/test_automation/start_spack_env.sh deleted file mode 100644 index 4dc0f19cd5..0000000000 --- a/tests/test_automation/start_spack_env.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -# This shared script fragment start spack and loads in QMCPACKS currently supported versions of compilers -# and libraries - -. $SPACK_ROOT/share/spack/setup-env.sh - -# This is to deal with the fact that we don't know the relationship between the working directory -# and checkout directory in a relative way. -# this doesn't deal with symlinks -SRC="${BASH_SOURCE[0]}" -SRC_DIR="$( cd -P "$( dirname "$SRC" )" >/dev/null 2>&1 && pwd )" - -. ${SRC_DIR}/spack_supported_package_versions.sh diff --git a/tests/test_automation/vm_setup.sh b/tests/test_automation/vm_setup.sh deleted file mode 100755 index 0d8ef048f1..0000000000 --- a/tests/test_automation/vm_setup.sh +++ /dev/null @@ -1,164 +0,0 @@ -#!/bin/bash - -# -# Installs compilers & libraries for QMCPACK testing via SPACK on a clean VM -# - assumes 32 jobs is reasonable -# -# ./vm_setup.sh Builds current CI requirements (use with caution specs could become ambiguous) -# ./vm_setup.sh clean *** DESTROYS EXISTING INSTALLATION *** -# -# - -module() { eval `/usr/bin/modulecmd bash $*`; } - -if [[ $1 == "clean" ]]; then - rm -r -f $HOME/spack $HOME/.spack - mkdir $HOME/.spack - cat >$HOME/.spack/config.yaml< -#Date: Fri Jan 3 15:52:59 2020 -0600 -# -# py-intervaltree: new package at 3.0.2 (#14277) - - cd bin - ./spack bootstrap -fi - -export SPACK_ROOT=$HOME/spack -. $SPACK_ROOT/share/spack/setup-env.sh - -echo --- Spack list -spack find -echo --- Spack compilers -spack compilers -echo --- Modules list -module list -echo --- End listings - -# -# Versions should be consistent with setup script -# - -# GCC -# Dates at https://gcc.gnu.org/releases.html -gcc_vnew=9.2.0 # 2019-08-12 -gcc_vold=7.3.0 # 2018-01-25 - -#For Intel: -gcc_vintel=7.4.0 # 2018-12-06 - -#PGI 19.4 -# makelocalrc configured with 8.3.0 currently -gcc_vpgi=8.3.0 # 2019-02-22 - -# For CUDA toolkit compatibility -gcc_vcuda=8.3.0 # 2019-02-22 - -# LLVM -# Dates at http://releases.llvm.org/ -llvm_vnew=9.0.0 # 2019-09-19 -llvm_vold=5.0.1 # 2017-12-21 -# for CUDA 10.1 update 2 -llvm_vcuda=8.0.0 # 2019-03- - -# HDF5 -hdf5_vnew=1.10.5 # Releeased 2019-02-28 -hdf5_vold=1.8.19 # Released 2017-06-16 - -# CMake -# Dates at https://cmake.org/files/ -cmake_vnew=3.16.2 # Released 2019-12-19 -cmake_vold=3.10.2 # Released 2018-01-18 - -# OpenMPI -# Dates at https://www.open-mpi.org/software/ompi/v4.0/ -ompi_vnew=4.0.2 # Released 2019-10-07 -ompi_vold=2.1.2 # Released 2017-09-20 - -libxml2_vnew=2.9.9 # Released 2019-01-03 See http://xmlsoft.org/sources/ -libxml2_vold=2.9.1 # Released 2013-04-19 - -# FFTW -# Dates at http://www.fftw.org/release-notes.html -fftw_vnew=3.3.8 # Released 2018-05-28 -fftw_vold=3.3.4 # Released 2014-03-16 - -# BOOST -# Dates at https://www.boost.org/users/history/ -boost_vnew=1.70.0 # Released 2019-04-12 -boost_vold=1.65.1 # Released 2016-05-13 - -echo --- START env `date` -echo --- gcc@${gcc_vold} -spack install gcc@${gcc_vold} -spack load gcc@${gcc_vold} -spack compiler find -echo --- Spack compilers -spack compilers - -if [[ $1 == "clean" ]]; then - cat >$HOME/.spack/packages.yaml</dev/null 2>&1 && pwd )" -. ${SRC_DIR}/start_spack_env.sh - -spack load gcc@$gcc_vnew%gcc@$gcc_vold -spack load boost@$boost_vnew%gcc@$gcc_vnew -spack load hdf5@$hdf5_vnew%gcc@$gcc_vnew~mpi -spack load openssl@1.1.1d%gcc@9.2.0+systemcerts -spack load cmake@$cmake_vnew%gcc@$gcc_vnew -spack load hwloc%gcc@$gcc_vnew -spack load libiconv%gcc@$gcc_vnew -spack load openmpi@$ompi_vnew%gcc@$gcc_vnew -spack load libxml2@$libxml2_vnew%gcc@$gcc_vnew -spack load fftw@$fftw_vnew%gcc@$gcc_vnew -spack load openblas%gcc@$gcc_vnew -spack load netlib-lapack%gcc@$gcc_vnew - -#if you've installed more than one python for the new compiler this will fail -spack load python%gcc@$gcc_vnew - -QMC_IMMUTABLE_FLAGS="-DBUILD_AFQMC=1" - -echo ${PATH}