ye-luo · jngkim · May 10, 2022 · May 10, 2022 · May 11, 2022 · May 11, 2022
diff --git a/.github/workflows/ci-github-actions-self-hosted.yaml b/.github/workflows/ci-github-actions-self-hosted.yaml
@@ -222,10 +222,10 @@ jobs:
             ROCm-Clang13-NoMPI-CUDA2HIP-Real,
             ROCm-Clang13-NoMPI-CUDA2HIP-Complex-Mixed,
             ROCm-Clang13-NoMPI-CUDA2HIP-Complex,
-            ROCm-Clang13-NoMPI-Legacy-CUDA2HIP-Real-Mixed,
-            ROCm-Clang13-NoMPI-Legacy-CUDA2HIP-Real,
-            ROCm-Clang13-NoMPI-Legacy-CUDA2HIP-Complex-Mixed,
-            ROCm-Clang13-NoMPI-Legacy-CUDA2HIP-Complex,
+            ROCm-Clang13-MPI-Legacy-CUDA2HIP-Real-Mixed,
+            ROCm-Clang13-MPI-Legacy-CUDA2HIP-Real,
+            ROCm-Clang13-MPI-Legacy-CUDA2HIP-Complex-Mixed,
+            ROCm-Clang13-MPI-Legacy-CUDA2HIP-Complex,
           ]
 
     steps:

diff --git a/CMake/ClangCompilers.cmake b/CMake/ClangCompilers.cmake
@@ -32,10 +32,6 @@ if(QMC_OMP)
       set(OPENMP_OFFLOAD_COMPILE_OPTIONS "${OPENMP_OFFLOAD_COMPILE_OPTIONS} -Wno-linker-warnings")
     endif()
 
-    if(NOT DEFINED OFFLOAD_ARCH AND OFFLOAD_TARGET MATCHES "amdgcn")
-      set(OFFLOAD_ARCH gfx906)
-    endif()
-
     if(NOT DEFINED OFFLOAD_ARCH
        AND OFFLOAD_TARGET MATCHES "nvptx64"
        AND DEFINED CMAKE_CUDA_ARCHITECTURES)
@@ -51,6 +47,24 @@ if(QMC_OMP)
       endif()
     endif()
 
+    # For AMD GPU offload (amdgcn), derive OFFLOAD_ARCH when it was not set explicitly:
+    # reuse HIP_ARCH if it holds exactly one entry, otherwise fall back to gfx906
+    # when HIP_ARCH is not defined at all.
+    if(NOT DEFINED OFFLOAD_ARCH
+       AND OFFLOAD_TARGET MATCHES "amdgcn")
+      if(DEFINED HIP_ARCH)
+        list(LENGTH HIP_ARCH NUMBER_HIP_ARCHITECTURES)
+        if(NUMBER_HIP_ARCHITECTURES EQUAL 1)
+          set(OFFLOAD_ARCH "${HIP_ARCH}")
+        elseif(NUMBER_HIP_ARCHITECTURES EQUAL 0)
+          # HIP_ARCH is defined but empty: report that instead of "multiple architectures".
+          message(
+            FATAL_ERROR
+              "HIP_ARCH is defined but empty. "
+              "Deriving OFFLOAD_ARCH from HIP_ARCH failed. "
+              "Please set exactly one entry in HIP_ARCH or set OFFLOAD_ARCH.")
+        else()
+          message(
+            FATAL_ERROR
+              "LLVM does not yet support offload to multiple architectures! "
+              "Deriving OFFLOAD_ARCH from HIP_ARCH failed. "
+              "Please keep only one entry in HIP_ARCH or set OFFLOAD_ARCH.")
+        endif()
+      else()
+        set(OFFLOAD_ARCH gfx906)
+      endif()
+    endif()
+
     if(DEFINED OFFLOAD_ARCH)
       set(OPENMP_OFFLOAD_COMPILE_OPTIONS
           "${OPENMP_OFFLOAD_COMPILE_OPTIONS} -Xopenmp-target=${OFFLOAD_TARGET} -march=${OFFLOAD_ARCH}")
@@ -90,8 +104,17 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
 
 # Set extra debug flags
-set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -fno-omit-frame-pointer -fstandalone-debug")
-set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fno-omit-frame-pointer -fstandalone-debug")
+set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -fno-omit-frame-pointer")
+set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fno-omit-frame-pointer")
+
+# Unfortunately, this removes -fstandalone-debug altogether for nvptx64 offload builds,
+# but until we discover how to use ${OPENMP_OFFLOAD_COMPILE_OPTIONS} more selectively,
+# this is the only way to avoid a warning per compilation unit that contains an omp symbol.
+if (NOT OFFLOAD_TARGET MATCHES "nvptx64")
+  message(STATUS "QMCPACK adds -fstandalone-debug for Debug builds")
+  set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -fstandalone-debug")
+  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fstandalone-debug")
+endif()
 
 #--------------------------------------
 # Special architectural flags

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -779,7 +779,7 @@ if(ENABLE_ROCM)
     message(STATUS "ROCM_ROOT not provided. Searching for FindHIP.cmake file.")
     find_path(
       HIP_MODULE_FILE_DIR FindHIP.cmake
-      HINTS /opt/rocm
+      HINTS $ENV{ROCM_PATH} /opt/rocm
       PATH_SUFFIXES hip/cmake)
     if(HIP_MODULE_FILE_DIR)
       message(STATUS "Found FindHIP.cmake file. ROCM_ROOT will be derived.")
@@ -856,7 +856,7 @@ if(ENABLE_SYCL)
   endif()
   add_library(SYCL::host INTERFACE IMPORTED)
   add_library(SYCL::device INTERFACE IMPORTED)
-  find_package(IntelDPCPP REQUIRED CONFIGS IntelDPCPPConfig-modified.cmake PATHS ${PROJECT_CMAKE})
+  find_package(IntelDPCPP REQUIRED CONFIGS IntelDPCPPConfig-modified.cmake PATHS ${PROJECT_CMAKE} NO_DEFAULT_PATH)
   target_link_libraries(SYCL::host INTERFACE OneAPI::DPCPP-host)
   target_link_libraries(SYCL::device INTERFACE OneAPI::DPCPP-device)
   if(TARGET MKL::sycl)

diff --git a/docs/installation.rst b/docs/installation.rst
@@ -282,20 +282,20 @@ the path to the source directory.
 
   ::
 
-    QMC_CUDA              Enable legacy CUDA code path for NVIDIA GPU acceleration (1:yes, 0:no)
     QMC_COMPLEX           Build the complex (general twist/k-point) version (1:yes, 0:no)
     QMC_MIXED_PRECISION   Build the mixed precision (mixing double/float) version
                           (1:yes (QMC_CUDA=1 default), 0:no (QMC_CUDA=0 default)).
                           Mixed precision calculations can be significantly faster but should be
                           carefully validated against full double precision runs,
                           particularly for large electron counts.
+    ENABLE_OFFLOAD        ON/OFF(default). Enable OpenMP target offload for GPU acceleration.
+    QMC_CUDA              Enable legacy CUDA code path for NVIDIA GPU acceleration (1:yes, 0:no)
     ENABLE_CUDA           ON/OFF(default). Enable CUDA code path for NVIDIA GPU acceleration.
-                          Production quality for AFQMC. Pre-production quality for real-space.
+                          Production quality for AFQMC and real-space performance portable implementation.
                           Use CMAKE_CUDA_ARCHITECTURES, default 70, to set the actual GPU architecture.
-    ENABLE_OFFLOAD        ON/OFF(default). Enable OpenMP target offload for GPU acceleration.
-    ENABLE_TIMERS         ON(default)/OFF. Enable fine-grained timers. Timers are on by default but at level coarse
-                          to avoid potential slowdown in tiny systems.
-                          For systems beyond tiny sizes (100+ electrons) there is no risk.
+    QMC_CUDA2HIP          ON/OFF(default). To be set ON, it requires either QMC_CUDA or ENABLE_CUDA to be ON.
+                          Compile CUDA source code as HIP and use ROCm libraries for AMD GPUs.
+    ENABLE_SYCL           ON/OFF(default). Enable SYCL code path. Only supports Intel GPUs and oneAPI compilers.
 
 - General build options
 
@@ -327,6 +327,9 @@ the path to the source directory.
 
   ::
 
+    ENABLE_TIMERS          ON(default)/OFF. Enable fine-grained timers. Timers are on by default but at level coarse
+                           to avoid potential slowdown in tiny systems.
+                           For systems beyond tiny sizes (100+ electrons) there is no risk.
     QE_BIN                 Location of Quantum ESPRESSO binaries including pw2qmcpack.x
     RMG_BIN                Location of RMG binary (rmg-cpu)
     QMC_DATA               Specify data directory for QMCPACK performance and integration tests
@@ -412,7 +415,7 @@ and is not suitable for production. Additional implementation in QMCPACK as
 well as improvements in open-source and vendor compilers is required for production status 
 to be reached. The following compilers have been verified:
 
-- LLVM Clang 11. Support NVIDIA GPUs.
+- LLVM Clang 14. Support NVIDIA GPUs.
 
   ::
 
@@ -425,31 +428,43 @@ to be reached. The following compilers have been verified:
     OFFLOAD_TARGET for the offload target. default nvptx64-nvidia-cuda.
     OFFLOAD_ARCH for the target architecture (sm_80, gfx906, ...) if not using the compiler default.
 
-- AMD AOMP Clang 11.8. Support AMD GPUs.
+- AMD ROCm/AOMP LLVM-based compilers. Support AMD GPUs.
 
   ::
 
     -D ENABLE_OFFLOAD=ON -D OFFLOAD_TARGET=amdgcn-amd-amdhsa -D OFFLOAD_ARCH=gfx906
 
-- Intel oneAPI beta08. Support Intel GPUs.
+- Intel oneAPI 2022.1.0 icx/icpx compilers. Support Intel GPUs.
 
   ::
 
     -D ENABLE_OFFLOAD=ON -D OFFLOAD_TARGET=spir64
 
-- HPE Cray 11. It is derived from Clang and supports NVIDIA and AMD GPUs.
+- HPE Cray 13. It is derived from Clang and supports NVIDIA and AMD GPUs.
 
   ::
 
     -D ENABLE_OFFLOAD=ON -D OFFLOAD_TARGET=nvptx64-nvidia-cuda -D OFFLOAD_ARCH=sm_80
 
 OpenMP offload features can be used together with vendor specific code paths to maximize QMCPACK performance.
-Some new CUDA functionality has been implemented to improve efficiency on NVIDIA GPUs in conjunction with the Offload code paths:
-For example, using Clang 11 on Summit.
+Some new CUDA functionality has been implemented to improve performance on NVIDIA GPUs in conjunction with the offload code paths.
+For example, using Clang 14 on Summit:
 
   ::
 
-    -D ENABLE_OFFLOAD=ON -D USE_OBJECT_TARGET=ON -D ENABLE_CUDA=ON -D CMAKE_CUDA_ARCHITECTURES=70 -D CMAKE_CUDA_HOST_COMPILER=`which gcc`
+    -D ENABLE_OFFLOAD=ON -D USE_OBJECT_TARGET=ON -D ENABLE_CUDA=ON -D CMAKE_CUDA_ARCHITECTURES=70
+
+Similarly, HIP features can be enabled in conjunction with the offload code path to improve performance on AMD GPUs.
+
+  ::
+
+    -D ENABLE_OFFLOAD=ON -D ENABLE_CUDA=ON -D QMC_CUDA2HIP=ON -DHIP_ARCH=gfx906
+
+Similarly, SYCL features can be enabled in conjunction with the offload code path to improve performance on Intel GPUs.
+
+  ::
+
+    -D ENABLE_OFFLOAD=ON -D ENABLE_SYCL=ON
 
 
 Installation from CMake

diff --git a/nexus/tests/unit/test_structure.py b/nexus/tests/unit/test_structure.py
@@ -1372,7 +1372,9 @@ def test_embed():
         r = np.linalg.norm(dr,axis=1)
         dilation = 2*r*np.exp(-r)
         for i in range(npos):
-            gr.pos[i] += dilation[i]/r[i]*dr[i]
+            if r[i]>0:
+                gr.pos[i] += dilation[i]/r[i]*dr[i]
+            #end if
         #end for
 
         # Represent the unrelaxed large cell

diff --git a/src/Configuration.h b/src/Configuration.h
@@ -50,7 +50,8 @@ struct QMCTraits
 {
   enum
   {
-    DIM = OHMMS_DIM
+    DIM = OHMMS_DIM,
+    DIM_VGL = OHMMS_DIM + 2 // Value(1) + Gradients(OHMMS_DIM) + Laplacian(1)
   };
   using QTBase      = QMCTypes<OHMMS_PRECISION, DIM>;
   using QTFull      = QMCTypes<OHMMS_PRECISION_FULL, DIM>;

diff --git a/src/Containers/OhmmsPETE/OhmmsArray.h b/src/Containers/OhmmsPETE/OhmmsArray.h
@@ -82,6 +82,8 @@ class Array
   inline typename Container_t::const_iterator begin() const { return X.begin(); }
   inline typename Container_t::const_iterator end() const { return X.end(); }
 
+  ///@{
+  /// access the container data pointer
   inline Type_t* data() { return X.data(); }
   inline const Type_t* data() const { return X.data(); }
   template<typename Allocator = ALLOC, typename = qmcplusplus::IsDualSpace<Allocator>>
@@ -94,14 +96,70 @@ class Array
   {
     return X.device_data();
   }
+  ///@}
 
-  inline const Type_t* first_address() const { return &(X[0]); }
+  ///@{
+  /// access the data pointer at {index_1, ..., index_D}
+  template<typename SIZET = size_t, typename = std::is_integral<SIZET>>
+  Type_t* data_at(const std::array<SIZET, D>& indices)
+  {
+    return X.data() + compute_offset(indices);
+  }
+  template<typename SIZET = size_t, typename = std::is_integral<SIZET>>
+  const Type_t* data_at(const std::array<SIZET, D>& indices) const
+  {
+    return X.data() + compute_offset(indices);
+  }
+  template<typename SIZET     = size_t,
+           typename           = std::is_integral<SIZET>,
+           typename Allocator = ALLOC,
+           typename           = qmcplusplus::IsDualSpace<Allocator>>
+  Type_t* device_data_at(const std::array<SIZET, D>& indices)
+  {
+    return X.device_data() + compute_offset(indices);
+  }
+  template<typename SIZET     = size_t,
+           typename           = std::is_integral<SIZET>,
+           typename Allocator = ALLOC,
+           typename           = qmcplusplus::IsDualSpace<Allocator>>
+  const Type_t* device_data_at(const std::array<SIZET, D>& indices) const
+  {
+    return X.device_data() + compute_offset(indices);
+  }
+
+  template<typename... Args>
+  Type_t* data_at(Args... indices)
+  {
+    static_assert(sizeof...(Args) == D, "data arguments must match dimensionality of Array");
+    return data_at({static_cast<std::size_t>(std::forward<Args>(indices))...});
+  }
+  template<typename... Args>
+  const Type_t* data_at(Args... indices) const
+  {
+    static_assert(sizeof...(Args) == D, "data arguments must match dimensionality of Array");
+    return data_at({static_cast<std::size_t>(std::forward<Args>(indices))...});
+  }
+  template<typename... Args, typename Allocator = ALLOC, typename = qmcplusplus::IsDualSpace<Allocator>>
+  Type_t* device_data_at(Args... indices)
+  {
+    static_assert(sizeof...(Args) == D, "device_data arguments must match dimensionality of Array");
+    return device_data_at({static_cast<std::size_t>(std::forward<Args>(indices))...});
+  }
+  template<typename... Args, typename Allocator = ALLOC, typename = qmcplusplus::IsDualSpace<Allocator>>
+  const Type_t* device_data_at(Args... indices) const
+  {
+    static_assert(sizeof...(Args) == D, "device_data arguments must match dimensionality of Array");
+    return device_data_at({static_cast<std::size_t>(std::forward<Args>(indices))...});
+  }
+  ///@}
 
-  inline const Type_t* last_address() const { return &(X[0]) + X.size(); }
+  inline const Type_t* first_address() const { return X.data(); }
 
-  inline Type_t* first_address() { return &(X[0]); }
+  inline const Type_t* last_address() const { return X.data() + X.size(); }
 
-  inline Type_t* last_address() { return &(X[0]) + X.size(); }
+  inline Type_t* first_address() { return X.data(); }
+
+  inline Type_t* last_address() { return X.data() + X.size(); }
 
   This_t& operator=(const T& rhs)
   {
@@ -127,22 +185,31 @@ class Array
     return *this;
   }
 
-  // Get and Set Operations
-  inline Type_t& operator()(size_t i) { return X[i]; }
-
-  inline Type_t operator()(size_t i) const { return X[i]; }
-  inline Type_t& operator()(size_t i, size_t j) { return X[j + Length[1] * i]; }
-  inline Type_t operator()(size_t i, size_t j) const { return X[j + Length[1] * i]; }
-  inline Type_t& operator()(size_t i, size_t j, size_t k) { return X[k + Length[2] * (j + Length[1] * i)]; }
-  inline Type_t operator()(size_t i, size_t j, size_t k) const { return X[k + Length[2] * (j + Length[1] * i)]; }
-  inline Type_t& operator()(size_t i, size_t j, size_t k, size_t l)
+  ///@{
+  /// access the element at {index_1, ..., index_D}
+  template<typename SIZET = size_t, typename = std::is_integral<SIZET>>
+  Type_t& operator()(const std::array<SIZET, D>& indices)
   {
-    return X[l + Length[3] * (k + Length[2] * (j + Length[1] * i))];
+    return X[compute_offset(indices)];
   }
-  inline Type_t operator()(size_t i, size_t j, size_t k, size_t l) const
+  template<typename SIZET = size_t, typename = std::is_integral<SIZET>>
+  const Type_t& operator()(const std::array<SIZET, D>& indices) const
   {
-    return X[l + Length[3] * (k + Length[2] * (j + Length[1] * i))];
+    return X[compute_offset(indices)];
   }
+  template<typename... Args>
+  Type_t& operator()(Args... indices)
+  {
+    static_assert(sizeof...(Args) == D, "operator() arguments must match dimensionality of Array");
+    return operator()({static_cast<std::size_t>(std::forward<Args>(indices))...});
+  }
+  template<typename... Args>
+  const Type_t& operator()(Args... indices) const
+  {
+    static_assert(sizeof...(Args) == D, "operator() arguments must match dimensionality of Array");
+    return operator()({static_cast<std::size_t>(std::forward<Args>(indices))...});
+  }
+  ///@}
 
   inline Type_t sum() const
   {
@@ -152,6 +219,18 @@ class Array
     return s;
   }
 
+  // Abstract Dual Space Transfers
+  template<typename Allocator = ALLOC, typename = qmcplusplus::IsDualSpace<Allocator>>
+  void updateTo()
+  {
+    X.updateTo();
+  }
+  template<typename Allocator = ALLOC, typename = qmcplusplus::IsDualSpace<Allocator>>
+  void updateFrom()
+  {
+    X.updateFrom();
+  }
+
 private:
   std::array<size_t, D> Length;
   Container_t X;
@@ -163,6 +242,15 @@ class Array
       total *= dims[i];
     return total;
   }
+
+  /** compute the flat offset into X of the element at {index_1, ..., index_D}
+   *
+   * The last index runs fastest:
+   * offset = (...(index_1 * Length[1] + index_2) * Length[2] + ...) + index_D.
+   * No bounds checking is performed.
+   * @tparam SIZET integral type of the indices
+   * @param indices one index per dimension
+   * @return offset in units of elements, expressed as SIZET
+   */
+  template<typename SIZET = size_t, typename = std::is_integral<SIZET>>
+  SIZET compute_offset(const std::array<SIZET, D>& indices) const
+  {
+    // the defaulted template argument above never rejects a type by itself; enforce the intent here
+    static_assert(std::is_integral<SIZET>::value, "Array indices must be of an integral type");
+    SIZET offset = indices[0];
+    // unsigned loop variable avoids comparing a signed int against size()
+    for (std::size_t i = 1; i < indices.size(); i++)
+      offset = offset * Length[i] + indices[i];
+    return offset;
+  }
 };
 
 template<class T, unsigned D, class Alloc>

diff --git a/src/Containers/OhmmsPETE/tests/test_Array.cpp b/src/Containers/OhmmsPETE/tests/test_Array.cpp
@@ -24,8 +24,8 @@ namespace qmcplusplus
 TEST_CASE("array", "[OhmmsPETE]")
 {
   using Array1D = Array<double, 1>;
-  Array1D A({3});
-  Array1D B({3});
+  Array1D A(3);
+  Array1D B(3);
 
   // iterator
   auto ia = A.begin();
@@ -73,6 +73,17 @@ TEST_CASE("array NestedContainers", "[OhmmsPETE]")
   CHECK(vec_copy(0).back() == 123);
 }
 
+TEST_CASE("Array::data", "[OhmmsPETE]")
+{
+  // 2 x 4 x 5 tensor, 40 elements in total
+  Array<float, 3> tensor(2, 4, 5);
+  REQUIRE(tensor.size() == 40);
+
+  // data_at(i, j, k) must point (i * 4 + j) * 5 + k elements past data()
+  float* const element_123 = tensor.data() + (1 * 4 + 2) * 5 + 3;
+  CHECK(element_123 == tensor.data_at(1, 2, 3));
+
+  // a write through operator() must be visible through the pointer returned by data_at
+  tensor(1, 2, 3) = 0.5f;
+  CHECK(*tensor.data_at(1, 2, 3) == 0.5f);
+}
+
 TEST_CASE("Array::dimension sizes constructor", "[OhmmsPETE]")
 {
   const int dim = 2;