From 27390ec80f06cb12b4cc529cbeac1e5a94f87ebc Mon Sep 17 00:00:00 2001
From: Jeongnim Kim <jeongnim.kim@gmail.com>
Date: Tue, 10 May 2022 14:23:13 -0700
Subject: [PATCH 01/36] Add syclSolverInverter routines.

---
 src/Platforms/SYCL/syclBLAS.cpp               |  79 +++++++++++
 src/Platforms/SYCL/syclBLAS.hpp               |  17 +++
 src/Platforms/SYCL/syclSolver.hpp             |  25 ++++
 .../Fermion/syclSolverInverter.hpp            | 130 ++++++++++++++++++
 .../detail/SYCL/sycl_determinant_helper.cpp   |  43 ++++++
 .../detail/SYCL/sycl_determinant_helper.hpp   |   9 ++
 .../tests/test_syclSolverInverter.cpp         |  66 +++++++++
 7 files changed, 369 insertions(+)
 create mode 100644 src/Platforms/SYCL/syclSolver.hpp
 create mode 100644 src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp
 create mode 100644 src/QMCWaveFunctions/tests/test_syclSolverInverter.cpp
diff --git a/src/Platforms/SYCL/syclBLAS.cpp b/src/Platforms/SYCL/syclBLAS.cpp
index 31cc35e89b..b8a9f87028 100644
--- a/src/Platforms/SYCL/syclBLAS.cpp
+++ b/src/Platforms/SYCL/syclBLAS.cpp
@@ -182,6 +182,85 @@ template sycl::event gemm(sycl::queue& handle,
                           const int ldc,
                           const std::vector<sycl::event>& events);
 
+//transpose 
+template<typename T1, typename T2>
+sycl::event transpose(sycl::queue& q,
+                      const T1* restrict in,
+                      int m,
+                      int lda,
+                      T2* restrict out,
+                      int n,
+                      int ldb,
+                      const std::vector<sycl::event>& events)
+{
+    constexpr size_t tile_size=16;
+    const size_t m_max=((m+tile_size-1)/tile_size)*tile_size;
+    const size_t n_max=((n+tile_size-1)/tile_size)*tile_size;
+
+    return q.submit([&](sycl::handler& cgh) {
+
+            cgh.depends_on(events);
+            sycl::accessor<T2, 2, sycl::access::mode::write, sycl::access::target::local> 
+            tile(sycl::range<2>(tile_size,tile_size+1), cgh);
+
+            cgh.parallel_for(sycl::nd_range<2>{{m_max,n_max},{tile_size,tile_size}},
+                    [=](sycl::nd_item<2> item) {
+                    unsigned x = item.get_global_id(1);
+                    unsigned y = item.get_global_id(0);
+                    unsigned xth=item.get_local_id(1);
+                    unsigned yth=item.get_local_id(0);
+
+                    if(x<n && y<m) tile[yth][xth] = in[(y)*lda + x];
+                    item.barrier(sycl::access::fence_space::local_space);
+
+                    x = item.get_group(0)*tile_size + xth;
+                    y = item.get_group(1)*tile_size + yth;
+                    if(x<m && y<n)   out[(y)*ldb + x] = tile[xth][yth];
+                    });
+            });
+
+}
+
+template sycl::event transpose(sycl::queue& q,
+                              const float* restrict in,
+                              int m,
+                              int lda,
+                              double* restrict out,
+                              int n,
+                              int ldb,
+                              const std::vector<sycl::event>& events);
+
+template sycl::event transpose(sycl::queue& q,
+                              const double* restrict in,
+                              int m,
+                              int lda,
+                              double* restrict out,
+                              int n,
+                              int ldb,
+                              const std::vector<sycl::event>& events);
+
+//copy_n for mixed precision
+template <typename T1, typename T2>
+sycl::event copy_n(sycl::queue &aq,
+                   const T1* restrict VA,
+                   size_t array_size,
+                   T2* restrict VC,
+                   const std::vector<sycl::event>& events)
+{
+    constexpr size_t tile_size=64;
+    const size_t a_max=((array_size+tile_size-1)/tile_size)*tile_size;
+    return aq.parallel_for(sycl::range<1>{a_max}, events, [=](sycl::id<1> id) {
+        if(id<array_size)
+            VC[id] = static_cast<T2>(VA[id]);
+    });
+}
+
+template sycl::event copy_n(sycl::queue &aq,
+                            const double* restrict VA,
+                            size_t array_size,
+                            float* restrict VC,
+                            const std::vector<sycl::event>& events);
+
 } // namespace syclBLAS
 
 } // namespace qmcplusplus
diff --git a/src/Platforms/SYCL/syclBLAS.hpp b/src/Platforms/SYCL/syclBLAS.hpp
index 2952dfa3b3..36ac05e3a3 100644
--- a/src/Platforms/SYCL/syclBLAS.hpp
+++ b/src/Platforms/SYCL/syclBLAS.hpp
@@ -56,6 +56,23 @@ sycl::event gemm(sycl::queue& handle,
                  const int ldc,
                  const std::vector<sycl::event>& events = {});
 
+template<typename T1, typename T2>
+sycl::event transpose(sycl::queue& q,
+                      const T1* restrict in,
+                      int m,
+                      int lda,
+                      T2* restrict out,
+                      int n,
+                      int ldb,
+                      const std::vector<sycl::event>& events = {});
+
+template <typename T1, typename T2>
+sycl::event copy_n(sycl::queue &aq,
+                   const T1* restrict VA,
+                   size_t array_size,
+                   T2* restrict VC,
+                   const std::vector<sycl::event>& events = {});
+
 } // namespace syclBLAS
 
 } // namespace qmcplusplus
diff --git a/src/Platforms/SYCL/syclSolver.hpp b/src/Platforms/SYCL/syclSolver.hpp
new file mode 100644
index 0000000000..66bdc781d9
--- /dev/null
+++ b/src/Platforms/SYCL/syclSolver.hpp
@@ -0,0 +1,25 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source License.
+// See LICENSE file in top directory for details.
+//
+// Copyright (c) 2022 QMCPACK developers.
+//
+// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+//
+// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+//////////////////////////////////////////////////////////////////////////////////////
+
+#ifndef QMCPLUSPLUS_SYCL_MKL_SOLVER_H
+#define QMCPLUSPLUS_SYCL_MKL_SOLVER_H
+
+#include "oneapi/mkl/lapack.hpp"
+
+namespace qmcplusplus
+{
+    namespace syclSolver
+    {
+        using namespace oneapi::mkl::lapack;
+    }
+}
+
+#endif
diff --git a/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp b/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp
new file mode 100644
index 0000000000..7300c0d6f2
--- /dev/null
+++ b/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp
@@ -0,0 +1,130 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source License.
+// See LICENSE file in top directory for details.
+//
+// Copyright (c) 2019 QMCPACK developers.
+//
+// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+//
+// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+//////////////////////////////////////////////////////////////////////////////////////
+
+#ifndef QMCPLUSPLUS_SYCL_MKL_SOLVERINVERTOR_H
+#define QMCPLUSPLUS_SYCL_MKL_SOLVERINVERTOR_H
+
+#include "OhmmsPETE/OhmmsVector.h"
+#include "OhmmsPETE/OhmmsMatrix.h"
+#include "SYCL/SYCLallocator.hpp"
+#include "SYCL/syclBLAS.hpp"
+#include "SYCL/syclSolver.hpp"
+#include "QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp"
+
+namespace qmcplusplus
+{
+/** implements matrix inversion via cuSolverDN
+ * @tparam T_FP high precision for matrix inversion, T_FP >= T
+ */
+template<typename T_FP>
+class syclSolverInverter
+{
+  /// scratch memory for cusolverDN
+  Matrix<T_FP, SYCLAllocator<T_FP>> Mat1_gpu;
+  /// pivot array + info
+  Vector<std::int64_t, SYCLHostAllocator<std::int64_t>> ipiv;
+  /// workspace
+  Vector<T_FP, SYCLAllocator<T_FP>> workspace;
+  std::int64_t getrf_ws=0;
+  std::int64_t getri_ws=0;
+
+  /** resize the internal storage
+   * @param norb number of electrons/orbitals
+   * @param delay, maximum delay 0<delay<=norb
+   */
+  inline void resize(int norb, sycl::queue& m_queue)
+  {
+    if (Mat1_gpu.rows() != norb)
+    {
+      Mat1_gpu.resize(norb, norb);
+      ipiv.resize(norb);
+      getrf_ws=syclSolver::getrf_scratchpad_size<T_FP>(m_queue,norb,norb,norb);
+      getri_ws=syclSolver::getri_scratchpad_size<T_FP>(m_queue,norb,norb);
+      workspace.resize(std::max(getrf_ws,getri_ws));
+    }
+  }
+
+public:
+
+  /** compute the inverse of the transpose of matrix A and its determinant value in log
+   * when T_FP and TMAT are the same
+   * @tparam TREAL real type
+   */
+  template<typename TMAT, typename TREAL, typename = std::enable_if_t<std::is_same<TMAT, T_FP>::value>>
+  std::enable_if_t<std::is_same<TMAT, T_FP>::value> invert_transpose(const Matrix<TMAT>& logdetT,
+                                                                     Matrix<TMAT>& Ainv,
+                                                                     Matrix<TMAT, SYCLAllocator<TMAT>>& Ainv_gpu,
+                                                                     std::complex<TREAL>& log_value,
+                                                                     sycl::queue& m_queue)
+  {
+    const int norb = logdetT.rows();
+    resize(norb, m_queue);
+
+    auto c_event = m_queue.memcpy(Mat1_gpu.data(),logdetT.data(),logdetT.size()*sizeof(TMAT));
+    auto t_event = syclBLAS::transpose(m_queue,Mat1_gpu.data(),norb,Mat1_gpu.cols(),Ainv_gpu.data(),norb,Ainv_gpu.cols(), {c_event});
+    try
+    {
+        c_event = syclSolver::getrf(m_queue,norb,norb,Ainv_gpu.data(),norb, ipiv.data(), workspace.data(), getrf_ws, {t_event});
+    }
+    catch(cl::sycl::exception const& ex) {
+        std::cout << "\t\tCaught synchronous SYCL exception during getrf:\n"
+            << ex.what() << "  status: " << ex.code() << std::endl;
+        abort();
+    }        
+
+    log_value = computeLogDet_sycl<TREAL>(m_queue, norb, Ainv_gpu.cols(), Ainv_gpu.data(), ipiv.data(), {c_event});
+
+    c_event = syclSolver::getri(m_queue,norb,Ainv_gpu.data(),norb,ipiv.data(), workspace.data(), getri_ws);
+
+    m_queue.memcpy(Ainv.data(),Ainv_gpu.data(),Ainv.size()*sizeof(TMAT), {c_event}).wait();
+  }
+
+  /** compute the inverse of the transpose of matrix A and its determinant value in log
+   * when T_FP and TMAT are not the same
+   * @tparam TREAL real type
+   */
+  template<typename TMAT, typename TREAL, typename = std::enable_if_t<!std::is_same<TMAT, T_FP>::value>>
+  std::enable_if_t<!std::is_same<TMAT, T_FP>::value> invert_transpose(const Matrix<TMAT>& logdetT,
+                                                                      Matrix<TMAT>& Ainv,
+                                                                      Matrix<TMAT, SYCLAllocator<TMAT>>& Ainv_gpu,
+                                                                      std::complex<TREAL>& log_value,
+                                                                      sycl::queue& m_queue)
+  {
+    const int norb = logdetT.rows();
+    resize(norb, m_queue);
+    //use Ainv_gpu for transpose
+    auto c_event = m_queue.memcpy(Ainv_gpu.data(),logdetT.data(),logdetT.size()*sizeof(TMAT));
+    //transpose
+    auto t_event = syclBLAS::transpose(m_queue,Ainv_gpu.data(),norb,Ainv_gpu.cols(),Mat1_gpu.data(),norb,Mat1_gpu.cols(), {c_event});
+
+    //getrf (LU) -> getri (inverse)
+    try
+    {
+        c_event = syclSolver::getrf(m_queue,norb,norb,Mat1_gpu.data(),norb, ipiv.data(), workspace.data(), getrf_ws, {t_event});
+    }
+    catch(cl::sycl::exception const& ex) {
+        std::cout << "\t\tCaught synchronous SYCL exception during getrf:\n"
+            << ex.what() << "  status: " << ex.code() << std::endl;
+        abort();
+    }        
+
+    log_value = computeLogDet_sycl<TREAL>(m_queue, norb, Mat1_gpu.cols(), Mat1_gpu.data(), ipiv.data(), {c_event});
+
+    c_event = syclSolver::getri(m_queue,norb,Mat1_gpu.data(),norb,ipiv.data(), workspace.data(), getri_ws);
+
+    t_event = syclBLAS::copy_n(m_queue,Mat1_gpu.data(),Mat1_gpu.size(),Ainv_gpu.data(), {c_event} );
+
+    m_queue.memcpy(Ainv.data(),Ainv_gpu.data(),Ainv.size()*sizeof(TMAT), {t_event}).wait();
+  }
+};
+} // namespace qmcplusplus
+
+#endif // QMCPLUSPLUS_CUSOLVERINVERTOR_H
diff --git a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp
index 7ce7a9d3dc..84fb6cada0 100644
--- a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp
+++ b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp
@@ -86,4 +86,47 @@ template sycl::event applyW_stageV_sycl(sycl::queue& aq,
                                         const int ndelay,
                                         std::complex<double>* restrict V_gpu,
                                         const std::complex<double>* restrict Ainv);
+
+
+template<typename T, typename TMAT, typename Index_t>
+std::complex<T> computeLogDet_sycl(sycl::queue& aq,
+                                   int n,
+                                   int lda,
+                                   const TMAT* restrict a,
+                                   const Index_t* restrict pivot,
+                                   const std::vector<cl::sycl::event>& dependencies)
+  {
+    constexpr size_t COLBS=32;
+
+    std::complex<T> result{};
+    {
+      sycl::buffer<std::complex<T>,1> abuff(&result,{1});
+      aq.submit([&](sycl::handler& cgh) {
+          cgh.depends_on(dependencies);
+
+          size_t n_max=((n+COLBS-1)/COLBS)*COLBS;
+          sycl::global_ptr<const TMAT>  A{a};
+          sycl::global_ptr<const Index_t>  Pivot{pivot};
+          cgh.parallel_for(sycl::range<1>{n_max},
+              sycl::reduction(abuff,cgh,{T{},T{}},std::plus<std::complex<T>>()),
+              [=](sycl::id<1> i, auto& sum)
+              {
+                  std::complex<T> val{};
+                  //if(i<n) val= std::log(std::complex<T>((Pivot[i] == i + 1) ? A[i*lda+i] : -A[i*lda+i]));
+                  if(i<n)
+                  val = (Pivot[i] == i + 1) ?
+                    std::log(std::complex<T>(A[i*lda+i])) : std::log(std::complex<T>(-A[i*lda+i]));
+                  sum.combine(val);
+              });
+          });
+    } //synchronous
+    return  result;
+  }
+
+template std::complex<double> computeLogDet_sycl(sycl::queue& aq,
+                                                 int n,
+                                                 int lda,
+                                                 const double* restrict a,
+                                                 const std::int64_t* restrict pivot,
+                                                 const std::vector<cl::sycl::event>& dependencies);
 } // namespace qmcplusplus
diff --git a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp
index 7baec66af7..4cca93bc83 100644
--- a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp
+++ b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp
@@ -31,5 +31,14 @@ sycl::event applyW_stageV_sycl(sycl::queue& aq,
                                const int ndelay,
                                T* V_gpu,
                                const T* Ainv);
+
+template<typename T, typename TMAT, typename Index_t>
+std::complex<T> computeLogDet_sycl(sycl::queue& aq,
+                                   int n,
+                                   int lda,
+                                   const TMAT* restrict a,
+                                   const Index_t* restrict pivot,
+                                   const std::vector<cl::sycl::event>& dependencies={});
+
 }
 #endif
diff --git a/src/QMCWaveFunctions/tests/test_syclSolverInverter.cpp b/src/QMCWaveFunctions/tests/test_syclSolverInverter.cpp
new file mode 100644
index 0000000000..8ee4b65612
--- /dev/null
+++ b/src/QMCWaveFunctions/tests/test_syclSolverInverter.cpp
@@ -0,0 +1,66 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source License.
+// See LICENSE file in top directory for details.
+//
+// Copyright (c) 2021 QMCPACK developers.
+//
+// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+//
+// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+//////////////////////////////////////////////////////////////////////////////////////
+
+#include "catch.hpp"
+
+#include "Configuration.h"
+#include "CPU/SIMD/aligned_allocator.hpp"
+#include "QMCWaveFunctions/Fermion/syclSolverInverter.hpp"
+#include "QMCWaveFunctions/Fermion/DiracMatrix.h"
+#include "Utilities/for_testing/checkMatrix.hpp"
+#include "Containers/tests/makeRngSpdMatrix.hpp"
+
+namespace qmcplusplus
+{
+
+template<typename T, typename T_FP>
+void test_inverse(const std::int64_t N)
+{
+  sycl::queue m_queue = getSYCLDefaultDeviceDefaultQueue();
+
+  syclSolverInverter<T_FP> solver;
+
+  Matrix<T> m(N, N);
+  Matrix<T> m_invT(N, N);
+  Matrix<T> m_invT_CPU(N, N);
+  Matrix<T, SYCLAllocator<T>> m_invGPU;
+  m_invGPU.resize(N,N);
+
+  std::complex<double> log_value, log_value_cpu;
+  testing::MakeRngSpdMatrix<T> makeRngSpdMatrix{};
+  makeRngSpdMatrix(m);
+
+  solver.invert_transpose(m, m_invT, m_invGPU, log_value, m_queue);
+  m_queue.wait();
+
+  DiracMatrix<T_FP> dmat;
+  dmat.invert_transpose(m, m_invT_CPU, log_value_cpu);
+
+  REQUIRE(log_value == ComplexApprox(log_value_cpu));
+
+  auto check_matrix_result = checkMatrix(m_invT, m_invT_CPU);
+  CHECKED_ELSE(check_matrix_result.result) { FAIL(check_matrix_result.result_message); }
+}
+
+TEST_CASE("OmpSYCL syclSolverInverter", "[SYCL]")
+{
+  const int N           = 911;
+
+#ifdef MIXED_PRECISION
+  std::cout << "Testing Inverse for mixed precision " << std::endl;
+  test_inverse<float,double>(N);
+#else
+  std::cout << "Testing Inverse for double double " << std::endl;
+  test_inverse<double,double>(N);
+#endif
+}
+
+} // namespace qmcplusplus

From ead985717cbead25681c211cab82d3c571f9cf53 Mon Sep 17 00:00:00 2001
From: Jeongnim Kim <jeongnim.kim@gmail.com>
Date: Tue, 10 May 2022 16:09:14 -0700
Subject: [PATCH 02/36] Merge DelayedUpdateSYCL.h

---
 .../Fermion/DelayedUpdateSYCL.h               | 21 +++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h b/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h
index ee3582e3b5..e453294f98 100644
--- a/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h
+++ b/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h
@@ -20,6 +20,8 @@
 #include "QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp"
 #include "DiracMatrix.h"
 #include "PrefetchedRange.h"
+#include "syclSolverInverter.hpp"
+
 //#define SYCL_BLOCKING
 
 namespace qmcplusplus
@@ -52,6 +54,8 @@ class DelayedUpdateSYCL
 
   DiracMatrix<T_FP> host_inverter_;
 
+  syclSolverInverter<T_FP> sycl_inverter_;
+
   // the range of prefetched_Ainv_rows
   PrefetchedRange prefetched_range;
   // Ainv prefetch buffer
@@ -102,8 +106,17 @@ class DelayedUpdateSYCL
   template<typename TREAL>
   void invert_transpose(const Matrix<T>& logdetT, Matrix<T>& Ainv, std::complex<TREAL>& log_value)
   {
-    host_inverter_.invert_transpose(logdetT, Ainv, log_value);
-    initializeInv(Ainv);
+    clearDelayCount();
+
+    if(true)
+    {
+        host_inverter_.invert_transpose(logdetT, Ainv, log_value);
+        m_queue_.memcpy(Ainv_gpu.data(), Ainv.data(), Ainv.size() * sizeof(T)).wait();
+    }
+    else
+    {
+        sycl_inverter_.invert_transpose(logdetT, Ainv, Ainv_gpu, log_value, m_queue_);
+    }
   }
 
   /** initialize internal objects when Ainv is refreshed
@@ -215,11 +228,11 @@ class DelayedUpdateSYCL
 
 #ifdef SYCL_BLOCKING
       syclBLAS::gemm(m_queue_, 'N', 'N', norb, norb, delay_count, -cone, U_gpu.data(), norb, temp_gpu.data(), lda_Binv,
-                     cone, Ainv_gpu.data(), norb, {u_event, temp_v_event})
+                     cone, Ainv_gpu.data(), norb, {u_event})
           .wait();
 #else
       ainv_event_ = syclBLAS::gemm(m_queue_, 'N', 'N', norb, norb, delay_count, -cone, U_gpu.data(), norb,
-                                   temp_gpu.data(), lda_Binv, cone, Ainv_gpu.data(), norb, {u_event, temp_v_event});
+                                   temp_gpu.data(), lda_Binv, cone, Ainv_gpu.data(), norb, {u_event});
 #endif
 
       clearDelayCount();

From 2e3012db8eda7a753501268783ba2ca7e74e8a38 Mon Sep 17 00:00:00 2001
From: Jeongnim Kim <jeongnim.kim@gmail.com>
Date: Tue, 10 May 2022 17:11:01 -0700
Subject: [PATCH 03/36] Use syclSolverInverter in DelayedUpdateSYCL.h

---
 CMakeLists.txt                                       |  2 +-
 src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h     | 12 +-----------
 .../detail/SYCL/sycl_determinant_helper.cpp          |  2 +-
 3 files changed, 3 insertions(+), 13 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 259f2f5aa1..73253a0315 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -851,7 +851,7 @@ if(ENABLE_SYCL)
   endif()
   add_library(SYCL::host INTERFACE IMPORTED)
   add_library(SYCL::device INTERFACE IMPORTED)
-  find_package(IntelDPCPP REQUIRED CONFIGS IntelDPCPPConfig-modified.cmake PATHS ${PROJECT_CMAKE})
+  find_package(IntelDPCPP REQUIRED CONFIGS IntelDPCPPConfig-modified.cmake PATHS ${PROJECT_CMAKE} NO_DEFAULT_PATH)
   target_link_libraries(SYCL::host INTERFACE OneAPI::DPCPP-host)
   target_link_libraries(SYCL::device INTERFACE OneAPI::DPCPP-device)
   if(TARGET MKL::sycl)
diff --git a/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h b/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h
index e453294f98..8fdb6b362c 100644
--- a/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h
+++ b/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h
@@ -52,8 +52,6 @@ class DelayedUpdateSYCL
   /// current number of delays, increase one for each acceptance, reset to 0 after updating Ainv
   int delay_count;
 
-  DiracMatrix<T_FP> host_inverter_;
-
   syclSolverInverter<T_FP> sycl_inverter_;
 
   // the range of prefetched_Ainv_rows
@@ -108,15 +106,7 @@ class DelayedUpdateSYCL
   {
     clearDelayCount();
 
-    if(true)
-    {
-        host_inverter_.invert_transpose(logdetT, Ainv, log_value);
-        m_queue_.memcpy(Ainv_gpu.data(), Ainv.data(), Ainv.size() * sizeof(T)).wait();
-    }
-    else
-    {
-        sycl_inverter_.invert_transpose(logdetT, Ainv, Ainv_gpu, log_value, m_queue_);
-    }
+    sycl_inverter_.invert_transpose(logdetT, Ainv, Ainv_gpu, log_value, m_queue_);
   }
 
   /** initialize internal objects when Ainv is refreshed
diff --git a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp
index 84fb6cada0..2593d34f19 100644
--- a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp
+++ b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp
@@ -96,7 +96,7 @@ std::complex<T> computeLogDet_sycl(sycl::queue& aq,
                                    const Index_t* restrict pivot,
                                    const std::vector<cl::sycl::event>& dependencies)
   {
-    constexpr size_t COLBS=32;
+    constexpr size_t COLBS=128;
 
     std::complex<T> result{};
     {

From b5fb457d8a3fcbebe41a2624effcb123ff861839 Mon Sep 17 00:00:00 2001
From: Ye Luo <yeluo@anl.gov>
Date: Tue, 17 May 2022 15:52:05 -0500
Subject: [PATCH 04/36] Fix complex compilation.

---
 src/Platforms/SYCL/syclBLAS.cpp               | 24 +++++++++++++++++++
 src/Platforms/SYCL/syclBLAS.hpp               |  8 +++----
 .../detail/SYCL/sycl_determinant_helper.cpp   | 13 +++++++---
 .../detail/SYCL/sycl_determinant_helper.hpp   |  6 ++---
 4 files changed, 41 insertions(+), 10 deletions(-)

diff --git a/src/Platforms/SYCL/syclBLAS.cpp b/src/Platforms/SYCL/syclBLAS.cpp
index b8a9f87028..3e0e3396f8 100644
--- a/src/Platforms/SYCL/syclBLAS.cpp
+++ b/src/Platforms/SYCL/syclBLAS.cpp
@@ -239,6 +239,24 @@ template sycl::event transpose(sycl::queue& q,
                               int ldb,
                               const std::vector<sycl::event>& events);
 
+template sycl::event transpose(sycl::queue& q,
+                              const std::complex<float>* restrict in,
+                              int m,
+                              int lda,
+                              std::complex<double>* restrict out,
+                              int n,
+                              int ldb,
+                              const std::vector<sycl::event>& events);
+
+template sycl::event transpose(sycl::queue& q,
+                              const std::complex<double>* restrict in,
+                              int m,
+                              int lda,
+                              std::complex<double>* restrict out,
+                              int n,
+                              int ldb,
+                              const std::vector<sycl::event>& events);
+
 //copy_n for mixed precision
 template <typename T1, typename T2>
 sycl::event copy_n(sycl::queue &aq,
@@ -261,6 +279,12 @@ template sycl::event copy_n(sycl::queue &aq,
                             float* restrict VC,
                             const std::vector<sycl::event>& events);
 
+template sycl::event copy_n(sycl::queue &aq,
+                            const std::complex<double>* restrict VA,
+                            size_t array_size,
+                            std::complex<float>* restrict VC,
+                            const std::vector<sycl::event>& events);
+
 } // namespace syclBLAS
 
 } // namespace qmcplusplus
diff --git a/src/Platforms/SYCL/syclBLAS.hpp b/src/Platforms/SYCL/syclBLAS.hpp
index 36ac05e3a3..370c035417 100644
--- a/src/Platforms/SYCL/syclBLAS.hpp
+++ b/src/Platforms/SYCL/syclBLAS.hpp
@@ -58,19 +58,19 @@ sycl::event gemm(sycl::queue& handle,
 
 template<typename T1, typename T2>
 sycl::event transpose(sycl::queue& q,
-                      const T1* restrict in,
+                      const T1* in,
                       int m,
                       int lda,
-                      T2* restrict out,
+                      T2* out,
                       int n,
                       int ldb,
                       const std::vector<sycl::event>& events = {});
 
 template <typename T1, typename T2>
 sycl::event copy_n(sycl::queue &aq,
-                   const T1* restrict VA,
+                   const T1* VA,
                    size_t array_size,
-                   T2* restrict VC,
+                   T2* VC,
                    const std::vector<sycl::event>& events = {});
 
 } // namespace syclBLAS
diff --git a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp
index 2593d34f19..46b534c0ac 100644
--- a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp
+++ b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp
@@ -88,12 +88,12 @@ template sycl::event applyW_stageV_sycl(sycl::queue& aq,
                                         const std::complex<double>* restrict Ainv);
 
 
-template<typename T, typename TMAT, typename Index_t>
+template<typename T, typename TMAT, typename INDEX>
 std::complex<T> computeLogDet_sycl(sycl::queue& aq,
                                    int n,
                                    int lda,
                                    const TMAT* restrict a,
-                                   const Index_t* restrict pivot,
+                                   const INDEX* restrict pivot,
                                    const std::vector<cl::sycl::event>& dependencies)
   {
     constexpr size_t COLBS=128;
@@ -106,7 +106,7 @@ std::complex<T> computeLogDet_sycl(sycl::queue& aq,
 
           size_t n_max=((n+COLBS-1)/COLBS)*COLBS;
           sycl::global_ptr<const TMAT>  A{a};
-          sycl::global_ptr<const Index_t>  Pivot{pivot};
+          sycl::global_ptr<const INDEX>  Pivot{pivot};
           cgh.parallel_for(sycl::range<1>{n_max},
               sycl::reduction(abuff,cgh,{T{},T{}},std::plus<std::complex<T>>()),
               [=](sycl::id<1> i, auto& sum)
@@ -129,4 +129,11 @@ template std::complex<double> computeLogDet_sycl(sycl::queue& aq,
                                                  const double* restrict a,
                                                  const std::int64_t* restrict pivot,
                                                  const std::vector<cl::sycl::event>& dependencies);
+
+template std::complex<double> computeLogDet_sycl(sycl::queue& aq,
+                                                 int n,
+                                                 int lda,
+                                                 const std::complex<double>* restrict a,
+                                                 const std::int64_t* restrict pivot,
+                                                 const std::vector<cl::sycl::event>& dependencies);
 } // namespace qmcplusplus
diff --git a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp
index 4cca93bc83..6a7fb06933 100644
--- a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp
+++ b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp
@@ -32,12 +32,12 @@ sycl::event applyW_stageV_sycl(sycl::queue& aq,
                                T* V_gpu,
                                const T* Ainv);
 
-template<typename T, typename TMAT, typename Index_t>
+template<typename T, typename TMAT, typename INDEX>
 std::complex<T> computeLogDet_sycl(sycl::queue& aq,
                                    int n,
                                    int lda,
-                                   const TMAT* restrict a,
-                                   const Index_t* restrict pivot,
+                                   const TMAT* a,
+                                   const INDEX* pivot,
                                    const std::vector<cl::sycl::event>& dependencies={});
 
 }

From 64fcaff9e6addb07958d3b4aa2cd16dc34f04439 Mon Sep 17 00:00:00 2001
From: Ye Luo <yeluo@anl.gov>
Date: Tue, 17 May 2022 16:04:57 -0500
Subject: [PATCH 05/36] Formatting

---
 src/Platforms/SYCL/syclBLAS.cpp               | 126 +++++++++---------
 src/Platforms/SYCL/syclBLAS.hpp               |   4 +-
 src/Platforms/SYCL/syclSolver.hpp             |   8 +-
 .../Fermion/DelayedUpdateSYCL.h               |   1 -
 .../Fermion/syclSolverInverter.hpp            |  59 ++++----
 .../detail/SYCL/sycl_determinant_helper.cpp   |  50 ++++---
 .../detail/SYCL/sycl_determinant_helper.hpp   |   5 +-
 .../tests/test_syclSolverInverter.cpp         |   9 +-
 8 files changed, 131 insertions(+), 131 deletions(-)

diff --git a/src/Platforms/SYCL/syclBLAS.cpp b/src/Platforms/SYCL/syclBLAS.cpp
index 3e0e3396f8..83ee3c6b4d 100644
--- a/src/Platforms/SYCL/syclBLAS.cpp
+++ b/src/Platforms/SYCL/syclBLAS.cpp
@@ -113,11 +113,11 @@ sycl::event gemm(sycl::queue& handle,
                  const int ldc,
                  const std::vector<sycl::event>& events)
 {
-   return oneapi::mkl::blas::gemm(handle, convertTransEnum(tA), convertTransEnum(tB), m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, events);
+  return oneapi::mkl::blas::gemm(handle, convertTransEnum(tA), convertTransEnum(tB), m, n, k, alpha, A, lda, B, ldb,
+                                 beta, C, ldc, events);
 }
 
 
-
 template sycl::event gemm(sycl::queue& handle,
                           const char tA,
                           const char tB,
@@ -182,7 +182,7 @@ template sycl::event gemm(sycl::queue& handle,
                           const int ldc,
                           const std::vector<sycl::event>& events);
 
-//transpose 
+//transpose
 template<typename T1, typename T2>
 sycl::event transpose(sycl::queue& q,
                       const T1* restrict in,
@@ -193,93 +193,93 @@ sycl::event transpose(sycl::queue& q,
                       int ldb,
                       const std::vector<sycl::event>& events)
 {
-    constexpr size_t tile_size=16;
-    const size_t m_max=((m+tile_size-1)/tile_size)*tile_size;
-    const size_t n_max=((n+tile_size-1)/tile_size)*tile_size;
-
-    return q.submit([&](sycl::handler& cgh) {
-
-            cgh.depends_on(events);
-            sycl::accessor<T2, 2, sycl::access::mode::write, sycl::access::target::local> 
-            tile(sycl::range<2>(tile_size,tile_size+1), cgh);
+  constexpr size_t tile_size = 16;
+  const size_t m_max         = ((m + tile_size - 1) / tile_size) * tile_size;
+  const size_t n_max         = ((n + tile_size - 1) / tile_size) * tile_size;
 
-            cgh.parallel_for(sycl::nd_range<2>{{m_max,n_max},{tile_size,tile_size}},
-                    [=](sycl::nd_item<2> item) {
-                    unsigned x = item.get_global_id(1);
-                    unsigned y = item.get_global_id(0);
-                    unsigned xth=item.get_local_id(1);
-                    unsigned yth=item.get_local_id(0);
+  return q.submit([&](sycl::handler& cgh) {
+    cgh.depends_on(events);
+    sycl::accessor<T2, 2, sycl::access::mode::write, sycl::access::target::local> tile(sycl::range<2>(tile_size,
+                                                                                                      tile_size + 1),
+                                                                                       cgh);
 
-                    if(x<n && y<m) tile[yth][xth] = in[(y)*lda + x];
-                    item.barrier(sycl::access::fence_space::local_space);
+    cgh.parallel_for(sycl::nd_range<2>{{m_max, n_max}, {tile_size, tile_size}}, [=](sycl::nd_item<2> item) {
+      unsigned x   = item.get_global_id(1);
+      unsigned y   = item.get_global_id(0);
+      unsigned xth = item.get_local_id(1);
+      unsigned yth = item.get_local_id(0);
 
-                    x = item.get_group(0)*tile_size + xth;
-                    y = item.get_group(1)*tile_size + yth;
-                    if(x<m && y<n)   out[(y)*ldb + x] = tile[xth][yth];
-                    });
-            });
+      if (x < n && y < m)
+        tile[yth][xth] = in[(y)*lda + x];
+      item.barrier(sycl::access::fence_space::local_space);
 
+      x = item.get_group(0) * tile_size + xth;
+      y = item.get_group(1) * tile_size + yth;
+      if (x < m && y < n)
+        out[(y)*ldb + x] = tile[xth][yth];
+    });
+  });
 }
 
 template sycl::event transpose(sycl::queue& q,
-                              const float* restrict in,
-                              int m,
-                              int lda,
-                              double* restrict out,
-                              int n,
-                              int ldb,
-                              const std::vector<sycl::event>& events);
+                               const float* restrict in,
+                               int m,
+                               int lda,
+                               double* restrict out,
+                               int n,
+                               int ldb,
+                               const std::vector<sycl::event>& events);
 
 template sycl::event transpose(sycl::queue& q,
-                              const double* restrict in,
-                              int m,
-                              int lda,
-                              double* restrict out,
-                              int n,
-                              int ldb,
-                              const std::vector<sycl::event>& events);
+                               const double* restrict in,
+                               int m,
+                               int lda,
+                               double* restrict out,
+                               int n,
+                               int ldb,
+                               const std::vector<sycl::event>& events);
 
 template sycl::event transpose(sycl::queue& q,
-                              const std::complex<float>* restrict in,
-                              int m,
-                              int lda,
-                              std::complex<double>* restrict out,
-                              int n,
-                              int ldb,
-                              const std::vector<sycl::event>& events);
+                               const std::complex<float>* restrict in,
+                               int m,
+                               int lda,
+                               std::complex<double>* restrict out,
+                               int n,
+                               int ldb,
+                               const std::vector<sycl::event>& events);
 
 template sycl::event transpose(sycl::queue& q,
-                              const std::complex<double>* restrict in,
-                              int m,
-                              int lda,
-                              std::complex<double>* restrict out,
-                              int n,
-                              int ldb,
-                              const std::vector<sycl::event>& events);
+                               const std::complex<double>* restrict in,
+                               int m,
+                               int lda,
+                               std::complex<double>* restrict out,
+                               int n,
+                               int ldb,
+                               const std::vector<sycl::event>& events);
 
 //copy_n for mixed precision
-template <typename T1, typename T2>
-sycl::event copy_n(sycl::queue &aq,
+template<typename T1, typename T2>
+sycl::event copy_n(sycl::queue& aq,
                    const T1* restrict VA,
                    size_t array_size,
                    T2* restrict VC,
                    const std::vector<sycl::event>& events)
 {
-    constexpr size_t tile_size=64;
-    const size_t a_max=((array_size+tile_size-1)/tile_size)*tile_size;
-    return aq.parallel_for(sycl::range<1>{a_max}, events, [=](sycl::id<1> id) {
-        if(id<array_size)
-            VC[id] = static_cast<T2>(VA[id]);
-    });
+  constexpr size_t tile_size = 64;
+  const size_t a_max         = ((array_size + tile_size - 1) / tile_size) * tile_size;
+  return aq.parallel_for(sycl::range<1>{a_max}, events, [=](sycl::id<1> id) {
+    if (id < array_size)
+      VC[id] = static_cast<T2>(VA[id]);
+  });
 }
 
-template sycl::event copy_n(sycl::queue &aq,
+template sycl::event copy_n(sycl::queue& aq,
                             const double* restrict VA,
                             size_t array_size,
                             float* restrict VC,
                             const std::vector<sycl::event>& events);
 
-template sycl::event copy_n(sycl::queue &aq,
+template sycl::event copy_n(sycl::queue& aq,
                             const std::complex<double>* restrict VA,
                             size_t array_size,
                             std::complex<float>* restrict VC,
diff --git a/src/Platforms/SYCL/syclBLAS.hpp b/src/Platforms/SYCL/syclBLAS.hpp
index 370c035417..3ecf85dcb1 100644
--- a/src/Platforms/SYCL/syclBLAS.hpp
+++ b/src/Platforms/SYCL/syclBLAS.hpp
@@ -66,8 +66,8 @@ sycl::event transpose(sycl::queue& q,
                       int ldb,
                       const std::vector<sycl::event>& events = {});
 
-template <typename T1, typename T2>
-sycl::event copy_n(sycl::queue &aq,
+template<typename T1, typename T2>
+sycl::event copy_n(sycl::queue& aq,
                    const T1* VA,
                    size_t array_size,
                    T2* VC,
diff --git a/src/Platforms/SYCL/syclSolver.hpp b/src/Platforms/SYCL/syclSolver.hpp
index 66bdc781d9..fe18f8a7bd 100644
--- a/src/Platforms/SYCL/syclSolver.hpp
+++ b/src/Platforms/SYCL/syclSolver.hpp
@@ -16,10 +16,10 @@
 
 namespace qmcplusplus
 {
-    namespace syclSolver
-    {
-        using namespace oneapi::mkl::lapack;
-    }
+namespace syclSolver
+{
+using namespace oneapi::mkl::lapack;
 }
+} // namespace qmcplusplus
 
 #endif
diff --git a/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h b/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h
index 8fdb6b362c..aa0cc1eb0f 100644
--- a/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h
+++ b/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h
@@ -26,7 +26,6 @@
 
 namespace qmcplusplus
 {
-
 /** implements delayed update on Intel GPU using SYCL
  * @tparam T base precision for most computation
  * @tparam T_FP high precision for matrix inversion, T_FP >= T
diff --git a/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp b/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp
index 7300c0d6f2..16731cf431 100644
--- a/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp
+++ b/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp
@@ -33,8 +33,8 @@ class syclSolverInverter
   Vector<std::int64_t, SYCLHostAllocator<std::int64_t>> ipiv;
   /// workspace
   Vector<T_FP, SYCLAllocator<T_FP>> workspace;
-  std::int64_t getrf_ws=0;
-  std::int64_t getri_ws=0;
+  std::int64_t getrf_ws = 0;
+  std::int64_t getri_ws = 0;
 
   /** resize the internal storage
    * @param norb number of electrons/orbitals
@@ -46,14 +46,13 @@ class syclSolverInverter
     {
       Mat1_gpu.resize(norb, norb);
       ipiv.resize(norb);
-      getrf_ws=syclSolver::getrf_scratchpad_size<T_FP>(m_queue,norb,norb,norb);
-      getri_ws=syclSolver::getri_scratchpad_size<T_FP>(m_queue,norb,norb);
-      workspace.resize(std::max(getrf_ws,getri_ws));
+      getrf_ws = syclSolver::getrf_scratchpad_size<T_FP>(m_queue, norb, norb, norb);
+      getri_ws = syclSolver::getri_scratchpad_size<T_FP>(m_queue, norb, norb);
+      workspace.resize(std::max(getrf_ws, getri_ws));
     }
   }
 
 public:
-
   /** compute the inverse of the transpose of matrix A and its determinant value in log
    * when T_FP and TMAT are the same
    * @tparam TREAL real type
@@ -68,23 +67,26 @@ class syclSolverInverter
     const int norb = logdetT.rows();
     resize(norb, m_queue);
 
-    auto c_event = m_queue.memcpy(Mat1_gpu.data(),logdetT.data(),logdetT.size()*sizeof(TMAT));
-    auto t_event = syclBLAS::transpose(m_queue,Mat1_gpu.data(),norb,Mat1_gpu.cols(),Ainv_gpu.data(),norb,Ainv_gpu.cols(), {c_event});
+    auto c_event = m_queue.memcpy(Mat1_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT));
+    auto t_event = syclBLAS::transpose(m_queue, Mat1_gpu.data(), norb, Mat1_gpu.cols(), Ainv_gpu.data(), norb,
+                                       Ainv_gpu.cols(), {c_event});
     try
     {
-        c_event = syclSolver::getrf(m_queue,norb,norb,Ainv_gpu.data(),norb, ipiv.data(), workspace.data(), getrf_ws, {t_event});
+      c_event = syclSolver::getrf(m_queue, norb, norb, Ainv_gpu.data(), norb, ipiv.data(), workspace.data(), getrf_ws,
+                                  {t_event});
+    }
+    catch (cl::sycl::exception const& ex)
+    {
+      std::cout << "\t\tCaught synchronous SYCL exception during getrf:\n"
+                << ex.what() << "  status: " << ex.code() << std::endl;
+      abort();
     }
-    catch(cl::sycl::exception const& ex) {
-        std::cout << "\t\tCaught synchronous SYCL exception during getrf:\n"
-            << ex.what() << "  status: " << ex.code() << std::endl;
-        abort();
-    }        
 
     log_value = computeLogDet_sycl<TREAL>(m_queue, norb, Ainv_gpu.cols(), Ainv_gpu.data(), ipiv.data(), {c_event});
 
-    c_event = syclSolver::getri(m_queue,norb,Ainv_gpu.data(),norb,ipiv.data(), workspace.data(), getri_ws);
+    c_event = syclSolver::getri(m_queue, norb, Ainv_gpu.data(), norb, ipiv.data(), workspace.data(), getri_ws);
 
-    m_queue.memcpy(Ainv.data(),Ainv_gpu.data(),Ainv.size()*sizeof(TMAT), {c_event}).wait();
+    m_queue.memcpy(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), {c_event}).wait();
   }
 
   /** compute the inverse of the transpose of matrix A and its determinant value in log
@@ -101,28 +103,31 @@ class syclSolverInverter
     const int norb = logdetT.rows();
     resize(norb, m_queue);
     //use Ainv_gpu for transpose
-    auto c_event = m_queue.memcpy(Ainv_gpu.data(),logdetT.data(),logdetT.size()*sizeof(TMAT));
+    auto c_event = m_queue.memcpy(Ainv_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT));
     //transpose
-    auto t_event = syclBLAS::transpose(m_queue,Ainv_gpu.data(),norb,Ainv_gpu.cols(),Mat1_gpu.data(),norb,Mat1_gpu.cols(), {c_event});
+    auto t_event = syclBLAS::transpose(m_queue, Ainv_gpu.data(), norb, Ainv_gpu.cols(), Mat1_gpu.data(), norb,
+                                       Mat1_gpu.cols(), {c_event});
 
     //getrf (LU) -> getri (inverse)
     try
     {
-        c_event = syclSolver::getrf(m_queue,norb,norb,Mat1_gpu.data(),norb, ipiv.data(), workspace.data(), getrf_ws, {t_event});
+      c_event = syclSolver::getrf(m_queue, norb, norb, Mat1_gpu.data(), norb, ipiv.data(), workspace.data(), getrf_ws,
+                                  {t_event});
+    }
+    catch (cl::sycl::exception const& ex)
+    {
+      std::cout << "\t\tCaught synchronous SYCL exception during getrf:\n"
+                << ex.what() << "  status: " << ex.code() << std::endl;
+      abort();
     }
-    catch(cl::sycl::exception const& ex) {
-        std::cout << "\t\tCaught synchronous SYCL exception during getrf:\n"
-            << ex.what() << "  status: " << ex.code() << std::endl;
-        abort();
-    }        
 
     log_value = computeLogDet_sycl<TREAL>(m_queue, norb, Mat1_gpu.cols(), Mat1_gpu.data(), ipiv.data(), {c_event});
 
-    c_event = syclSolver::getri(m_queue,norb,Mat1_gpu.data(),norb,ipiv.data(), workspace.data(), getri_ws);
+    c_event = syclSolver::getri(m_queue, norb, Mat1_gpu.data(), norb, ipiv.data(), workspace.data(), getri_ws);
 
-    t_event = syclBLAS::copy_n(m_queue,Mat1_gpu.data(),Mat1_gpu.size(),Ainv_gpu.data(), {c_event} );
+    t_event = syclBLAS::copy_n(m_queue, Mat1_gpu.data(), Mat1_gpu.size(), Ainv_gpu.data(), {c_event});
 
-    m_queue.memcpy(Ainv.data(),Ainv_gpu.data(),Ainv.size()*sizeof(TMAT), {t_event}).wait();
+    m_queue.memcpy(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), {t_event}).wait();
   }
 };
 } // namespace qmcplusplus
diff --git a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp
index 46b534c0ac..20711c1ff4 100644
--- a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp
+++ b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp
@@ -95,33 +95,31 @@ std::complex<T> computeLogDet_sycl(sycl::queue& aq,
                                    const TMAT* restrict a,
                                    const INDEX* restrict pivot,
                                    const std::vector<cl::sycl::event>& dependencies)
-  {
-    constexpr size_t COLBS=128;
+{
+  constexpr size_t COLBS = 128;
 
-    std::complex<T> result{};
-    {
-      sycl::buffer<std::complex<T>,1> abuff(&result,{1});
-      aq.submit([&](sycl::handler& cgh) {
-          cgh.depends_on(dependencies);
-
-          size_t n_max=((n+COLBS-1)/COLBS)*COLBS;
-          sycl::global_ptr<const TMAT>  A{a};
-          sycl::global_ptr<const INDEX>  Pivot{pivot};
-          cgh.parallel_for(sycl::range<1>{n_max},
-              sycl::reduction(abuff,cgh,{T{},T{}},std::plus<std::complex<T>>()),
-              [=](sycl::id<1> i, auto& sum)
-              {
-                  std::complex<T> val{};
-                  //if(i<n) val= std::log(std::complex<T>((Pivot[i] == i + 1) ? A[i*lda+i] : -A[i*lda+i]));
-                  if(i<n)
-                  val = (Pivot[i] == i + 1) ?
-                    std::log(std::complex<T>(A[i*lda+i])) : std::log(std::complex<T>(-A[i*lda+i]));
-                  sum.combine(val);
-              });
-          });
-    } //synchronous
-    return  result;
-  }
+  std::complex<T> result{};
+  {
+    sycl::buffer<std::complex<T>, 1> abuff(&result, {1});
+    aq.submit([&](sycl::handler& cgh) {
+      cgh.depends_on(dependencies);
+
+      size_t n_max = ((n + COLBS - 1) / COLBS) * COLBS;
+      sycl::global_ptr<const TMAT> A{a};
+      sycl::global_ptr<const INDEX> Pivot{pivot};
+      cgh.parallel_for(sycl::range<1>{n_max}, sycl::reduction(abuff, cgh, {T{}, T{}}, std::plus<std::complex<T>>()),
+                       [=](sycl::id<1> i, auto& sum) {
+                         std::complex<T> val{};
+                         //if(i<n) val= std::log(std::complex<T>((Pivot[i] == i + 1) ? A[i*lda+i] : -A[i*lda+i]));
+                         if (i < n)
+                           val = (Pivot[i] == i + 1) ? std::log(std::complex<T>(A[i * lda + i]))
+                                                     : std::log(std::complex<T>(-A[i * lda + i]));
+                         sum.combine(val);
+                       });
+    });
+  } //synchronous
+  return result;
+}
 
 template std::complex<double> computeLogDet_sycl(sycl::queue& aq,
                                                  int n,
diff --git a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp
index 6a7fb06933..23bcaa774a 100644
--- a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp
+++ b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp
@@ -20,7 +20,6 @@
 
 namespace qmcplusplus
 {
-
 template<typename T>
 sycl::event applyW_stageV_sycl(sycl::queue& aq,
                                const std::vector<cl::sycl::event>& dependencies,
@@ -38,7 +37,7 @@ std::complex<T> computeLogDet_sycl(sycl::queue& aq,
                                    int lda,
                                    const TMAT* a,
                                    const INDEX* pivot,
-                                   const std::vector<cl::sycl::event>& dependencies={});
+                                   const std::vector<cl::sycl::event>& dependencies = {});
 
-}
+} // namespace qmcplusplus
 #endif
diff --git a/src/QMCWaveFunctions/tests/test_syclSolverInverter.cpp b/src/QMCWaveFunctions/tests/test_syclSolverInverter.cpp
index 8ee4b65612..0c6ecfcc20 100644
--- a/src/QMCWaveFunctions/tests/test_syclSolverInverter.cpp
+++ b/src/QMCWaveFunctions/tests/test_syclSolverInverter.cpp
@@ -20,7 +20,6 @@
 
 namespace qmcplusplus
 {
-
 template<typename T, typename T_FP>
 void test_inverse(const std::int64_t N)
 {
@@ -32,7 +31,7 @@ void test_inverse(const std::int64_t N)
   Matrix<T> m_invT(N, N);
   Matrix<T> m_invT_CPU(N, N);
   Matrix<T, SYCLAllocator<T>> m_invGPU;
-  m_invGPU.resize(N,N);
+  m_invGPU.resize(N, N);
 
   std::complex<double> log_value, log_value_cpu;
   testing::MakeRngSpdMatrix<T> makeRngSpdMatrix{};
@@ -52,14 +51,14 @@ void test_inverse(const std::int64_t N)
 
 TEST_CASE("OmpSYCL syclSolverInverter", "[SYCL]")
 {
-  const int N           = 911;
+  const int N = 911;
 
 #ifdef MIXED_PRECISION
   std::cout << "Testing Inverse for mixed precision " << std::endl;
-  test_inverse<float,double>(N);
+  test_inverse<float, double>(N);
 #else
   std::cout << "Testing Inverse for double double " << std::endl;
-  test_inverse<double,double>(N);
+  test_inverse<double, double>(N);
 #endif
 }
 

From c6ee3641fc9ad3454b7b50b7c72c8241da18064a Mon Sep 17 00:00:00 2001
From: Jeongnim Kim <jeongnim.kim@gmail.com>
Date: Thu, 9 Jun 2022 16:02:25 -0700
Subject: [PATCH 06/36] Update with interop and add waits.

---
 src/Platforms/SYCL/SYCLDeviceManager.cpp      | 134 ++++--------------
 src/Platforms/SYCL/SYCLDeviceManager.h        |   2 +
 .../Fermion/syclSolverInverter.hpp            |  14 +-
 3 files changed, 37 insertions(+), 113 deletions(-)

diff --git a/src/Platforms/SYCL/SYCLDeviceManager.cpp b/src/Platforms/SYCL/SYCLDeviceManager.cpp
index e9478451b8..6372660deb 100644
--- a/src/Platforms/SYCL/SYCLDeviceManager.cpp
+++ b/src/Platforms/SYCL/SYCLDeviceManager.cpp
@@ -17,36 +17,44 @@
 #include <string>
 #include <algorithm>
 #include "config.h"
-#include "OutputManager.h"
-#include "determineDefaultDeviceNum.h"
-#if defined(_OPENMP)
-#include <omp.h>
-#include <CL/sycl/backend.hpp>
-#include <level_zero/ze_api.h>
-#include <CL/sycl/backend/level_zero.hpp>
-#endif
-
+#include "Platforms/Host/OutputManager.h"
+#include "Platforms/Host/determineDefaultDeviceNum.h"
 namespace qmcplusplus
 {
-#if defined(_OPENMP)
-/** create SYCL device/contexts from OpenMP owned ones to ensure interoperability.
- * CUDA has the notion of primary context while SYCL requires explicitly sharing context.
- */
-static std::vector<struct syclDeviceInfo> xomp_get_sycl_devices();
-#endif
+
+std::unique_ptr<sycl::queue> SYCLDeviceManager::default_device_queue;
 
 SYCLDeviceManager::SYCLDeviceManager(int& default_device_num, int& num_devices, int local_rank, int local_size)
     : sycl_default_device_num(-1)
 {
+#if defined(ENABLE_OFFLOAD)
+  const int sycl_device_count=omp_get_num_devices();
+  sycl_default_device_num = determineDefaultDeviceNum(sycl_device_count, local_rank, local_size);
+  if (default_device_num < 0)
+    default_device_num = sycl_default_device_num;
+
+  visible_devices.resize(sycl_device_count);
+  for (int id = 0; id < sycl_device_count; id++)
+  {
+    omp_interop_t interop;
+#pragma omp interop device(id) init(prefer_type("sycl"),targetsync: interop)
+    if(id == sycl_default_device_num)
+    {
+      int result;
+      default_device_queue.reset(static_cast<sycl::queue *> (omp_get_interop_ptr(interop, omp_ipr_targetsync, &result)));
+      if(result != omp_irc_success)
+        throw std::runtime_error("SYCLDeviceManager::SYCLDeviceManager fail to obtain sycl::queue by interop");
+    }
+    visible_devices[id].interop=interop;
+  } 
+
+#else
   // Potentially multiple GPU platform.
   std::vector<sycl::platform> platforms = sycl::platform::get_platforms();
   if (platforms.empty())
     throw std::runtime_error("Cannot find SYCL platforms!");
 
-    // find out devices from the first platform with GPUs.
-#if defined(ENABLE_OFFLOAD)
-  visible_devices = xomp_get_sycl_devices();
-#else
+  // find out devices from the first platform with GPUs.
   std::vector<sycl::device> devices;
   app_log() << "Visible SYCL platforms are :" << std::endl;
   for (auto& platform : platforms)
@@ -67,7 +75,6 @@ SYCLDeviceManager::SYCLDeviceManager(int& default_device_num, int& num_devices,
   visible_devices.reserve(devices.size());
   for (int id = 0; id < devices.size(); id++)
     visible_devices.push_back({sycl::context(devices[id]), devices[id]});
-#endif
 
   const size_t sycl_device_count = visible_devices.size();
   if (num_devices == 0)
@@ -85,14 +92,12 @@ SYCLDeviceManager::SYCLDeviceManager(int& default_device_num, int& num_devices,
       default_device_num = sycl_default_device_num;
     else if (default_device_num != sycl_default_device_num)
       throw std::runtime_error("Inconsistent assigned SYCL devices with the previous record!");
-
     default_device_queue = std::make_unique<sycl::queue>(visible_devices[sycl_default_device_num].context,
                                                          visible_devices[sycl_default_device_num].device);
   }
+#endif
 }
 
-std::unique_ptr<sycl::queue> SYCLDeviceManager::default_device_queue;
-
 sycl::queue& SYCLDeviceManager::getDefaultDeviceQueue()
 {
   if (!default_device_queue)
@@ -100,87 +105,4 @@ sycl::queue& SYCLDeviceManager::getDefaultDeviceQueue()
   return *default_device_queue;
 }
 
-#if defined(_OPENMP)
-static std::vector<struct syclDeviceInfo> xomp_get_sycl_devices()
-{
-  enum class Backend
-  {
-    UNKNOWN,
-    LEVEL_ZERO,
-    OPENCL
-  };
-
-  const auto num_omp_devices = omp_get_num_devices();
-  Backend selected_backend   = Backend::UNKNOWN;
-  std::vector<struct syclDeviceInfo> devices(num_omp_devices);
-  for (int id = 0; id < num_omp_devices; id++)
-  {
-    omp_interop_t o    = 0;
-    Backend my_backend = Backend::UNKNOWN;
-#pragma omp interop init(prefer_type("sycl"), targetsync : o) device(id)
-    int err = -1;
-
-    const std::string omp_backend(omp_get_interop_str(o, omp_ipr_fr_name, &err));
-    assert(err >= 0 && "omp_get_interop_str(omp_ipr_fr_name)");
-
-    if (omp_backend.find("level_zero") == 0)
-    {
-      my_backend = Backend::LEVEL_ZERO;
-
-      auto hPlatform = omp_get_interop_ptr(o, omp_ipr_platform, &err);
-      assert(err >= 0 && "omp_get_interop_ptr(omp_ipr_platform)");
-      auto hContext = omp_get_interop_ptr(o, omp_ipr_device_context, &err);
-      assert(err >= 0 && "omp_get_interop_ptr(omp_ipr_device_context)");
-      auto hDevice = omp_get_interop_ptr(o, omp_ipr_device, &err);
-      assert(err >= 0 && "omp_get_interop_ptr(omp_ipr_device)");
-
-      const sycl::platform sycl_platform =
-          sycl::ext::oneapi::level_zero::make_platform(reinterpret_cast<pi_native_handle>(hPlatform));
-      devices[id].device =
-          sycl::ext::oneapi::level_zero::make_device(sycl_platform, reinterpret_cast<pi_native_handle>(hDevice));
-
-      devices[id].context = sycl::ext::oneapi::level_zero::make_context({devices[id].device},
-                                                                        reinterpret_cast<pi_native_handle>(hContext),
-                                                                        true /* keep the ownership, no transfer */);
-    }
-    else if (omp_backend.find("opencl") == 0)
-    {
-      my_backend = Backend::OPENCL;
-      /*
-                auto hContext = omp_get_interop_ptr(o, omp_ipr_device_context, &err);
-                assert (err >= 0 && "omp_get_interop_ptr(omp_ipr_device_context)");
-                auto hDevice =  omp_get_interop_ptr(o, omp_ipr_device, &err);
-                assert (err >= 0 && "omp_get_interop_ptr(omp_ipr_device)");
-          devices[id].device = sycl::make_device<sycl::backend::opencl>(static_cast<cl_device>(hDevice));
-          devices[id].context = sycl::make_context<sycl::backend::opencl>(static_cast<cl_context>(hContext));
-*/
-    }
-
-#pragma omp interop destroy(o)
-
-    if (selected_backend == Backend::UNKNOWN)
-      selected_backend = my_backend;
-    else if (selected_backend != my_backend)
-      throw std::runtime_error("Inconsistent backends detected among OpenMP devices.");
-  }
-
-  if (devices.size() > 0)
-    switch (selected_backend)
-    {
-    case Backend::LEVEL_ZERO:
-      app_log() << "SYCL adopts the Level Zero backend chosen by OpenMP." << std::endl;
-      break;
-    case Backend::OPENCL:
-      throw std::runtime_error("OpenMP has chosen the OpenCL backend. "
-                               "We have not yet worked out its interoperability with SYCL. "
-                               "Please set the Level Zero backend in OpenMP!");
-      break;
-    default:
-      throw std::runtime_error("Failed in extracting OpenMP backend supported by SYCL.");
-    }
-
-  return devices;
-}
-#endif
-
 } // namespace qmcplusplus
diff --git a/src/Platforms/SYCL/SYCLDeviceManager.h b/src/Platforms/SYCL/SYCLDeviceManager.h
index 3b5dff0c33..7055c13c3a 100644
--- a/src/Platforms/SYCL/SYCLDeviceManager.h
+++ b/src/Platforms/SYCL/SYCLDeviceManager.h
@@ -18,6 +18,7 @@
 #include <vector>
 #include <memory>
 #include <CL/sycl.hpp>
+#include <omp.h>
 
 namespace qmcplusplus
 {
@@ -25,6 +26,7 @@ struct syclDeviceInfo
 {
   sycl::context context;
   sycl::device device;
+  omp_interop_t interop;
 };
 
 /** SYCL device manager
diff --git a/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp b/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp
index 16731cf431..694ed9a2e3 100644
--- a/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp
+++ b/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp
@@ -30,7 +30,7 @@ class syclSolverInverter
   /// scratch memory for cusolverDN
   Matrix<T_FP, SYCLAllocator<T_FP>> Mat1_gpu;
   /// pivot array + info
-  Vector<std::int64_t, SYCLHostAllocator<std::int64_t>> ipiv;
+  Vector<std::int64_t, SYCLAllocator<std::int64_t>> ipiv;
   /// workspace
   Vector<T_FP, SYCLAllocator<T_FP>> workspace;
   std::int64_t getrf_ws = 0;
@@ -72,8 +72,8 @@ class syclSolverInverter
                                        Ainv_gpu.cols(), {c_event});
     try
     {
-      c_event = syclSolver::getrf(m_queue, norb, norb, Ainv_gpu.data(), norb, ipiv.data(), workspace.data(), getrf_ws,
-                                  {t_event});
+      syclSolver::getrf(m_queue, norb, norb, Ainv_gpu.data(), norb, ipiv.data(), workspace.data(), getrf_ws,
+                        {t_event}).wait();
     }
     catch (cl::sycl::exception const& ex)
     {
@@ -82,7 +82,7 @@ class syclSolverInverter
       abort();
     }
 
-    log_value = computeLogDet_sycl<TREAL>(m_queue, norb, Ainv_gpu.cols(), Ainv_gpu.data(), ipiv.data(), {c_event});
+    log_value = computeLogDet_sycl<TREAL>(m_queue, norb, Ainv_gpu.cols(), Ainv_gpu.data(), ipiv.data());
 
     c_event = syclSolver::getri(m_queue, norb, Ainv_gpu.data(), norb, ipiv.data(), workspace.data(), getri_ws);
 
@@ -111,8 +111,8 @@ class syclSolverInverter
     //getrf (LU) -> getri (inverse)
     try
     {
-      c_event = syclSolver::getrf(m_queue, norb, norb, Mat1_gpu.data(), norb, ipiv.data(), workspace.data(), getrf_ws,
-                                  {t_event});
+      syclSolver::getrf(m_queue, norb, norb, Mat1_gpu.data(), norb, ipiv.data(), workspace.data(), getrf_ws,
+                        {t_event}).wait();
     }
     catch (cl::sycl::exception const& ex)
     {
@@ -121,7 +121,7 @@ class syclSolverInverter
       abort();
     }
 
-    log_value = computeLogDet_sycl<TREAL>(m_queue, norb, Mat1_gpu.cols(), Mat1_gpu.data(), ipiv.data(), {c_event});
+    log_value = computeLogDet_sycl<TREAL>(m_queue, norb, Mat1_gpu.cols(), Mat1_gpu.data(), ipiv.data());
 
     c_event = syclSolver::getri(m_queue, norb, Mat1_gpu.data(), norb, ipiv.data(), workspace.data(), getri_ws);
 

From 10c90b270ff34ca8867e3ffea1de9cc662fbc0cf Mon Sep 17 00:00:00 2001
From: Mark Dewing <markdewing@gmail.com>
Date: Mon, 13 Jun 2022 14:56:42 -0500
Subject: [PATCH 07/36] Add tests for Rotated SPOs using LCAO

A python script (using autograd) is used to generate reference values
for the values of the wavefunction and derivatives at one point in
space.
The test system is a helium atom with two orbitals and optionally a
Jastrow factor.
---
 src/QMCWaveFunctions/tests/CMakeLists.txt     |   2 +-
 .../tests/gen_rotated_lcao_wf.py              | 191 +++++++++
 .../tests/test_RotatedSPOs_LCAO.cpp           | 364 ++++++++++++++++++
 3 files changed, 556 insertions(+), 1 deletion(-)
 create mode 100644 src/QMCWaveFunctions/tests/gen_rotated_lcao_wf.py
 create mode 100644 src/QMCWaveFunctions/tests/test_RotatedSPOs_LCAO.cpp

diff --git a/src/QMCWaveFunctions/tests/CMakeLists.txt b/src/QMCWaveFunctions/tests/CMakeLists.txt
index 97a6d20717..c0d82bce66 100644
--- a/src/QMCWaveFunctions/tests/CMakeLists.txt
+++ b/src/QMCWaveFunctions/tests/CMakeLists.txt
@@ -59,7 +59,7 @@ endforeach()
 
 if(NOT QMC_CUDA)
   if(NOT QMC_COMPLEX)
-    set(MO_SRCS test_MO.cpp test_multiquintic_spline.cpp test_cartesian_ao.cpp)
+    set(MO_SRCS test_MO.cpp test_multiquintic_spline.cpp test_cartesian_ao.cpp test_RotatedSPOs_LCAO.cpp)
     if(NOT QMC_MIXED_PRECISION)
       set(MO_SRCS ${MO_SRCS} test_soa_cusp_corr.cpp)
     endif()
diff --git a/src/QMCWaveFunctions/tests/gen_rotated_lcao_wf.py b/src/QMCWaveFunctions/tests/gen_rotated_lcao_wf.py
new file mode 100644
index 0000000000..1563ad83e7
--- /dev/null
+++ b/src/QMCWaveFunctions/tests/gen_rotated_lcao_wf.py
@@ -0,0 +1,191 @@
+
+# Helium atom with a combination of two orbitals and simple jastrow factor
+
+# Uses automatic differentiation via the autograd package to
+#  compute spatial and parameter derivatives
+import autograd.numpy as np
+from autograd import hessian,grad
+
+# Values used in test_RotatedSPOs_LCAO.cpp
+
+def mag(r):
+    return np.sqrt(r[0]*r[0] + r[1]*r[1] + r[2]*r[2])
+
+# normalized STO's correspond to the 'normalized="no"' part of the input
+#     <atomicBasisSet type="STO" elementType="He" normalized="no">
+
+
+def sto_norm1(zeta):
+  return 2*np.sqrt(zeta**3)
+
+def sto_norm2(zeta):
+  return 2*np.sqrt(3)*np.sqrt(zeta**5)/3
+
+
+def orb1(R):
+    r = mag(R)
+    Z = 2.0
+    y00 = 1/np.sqrt(4 * np.pi)
+    snorm1 = sto_norm1(Z)
+    return y00 * snorm1 * np.exp(-Z*r)
+
+def orb2(R):
+    r = mag(R)
+    zeta = 1.0
+    y00 = 1/np.sqrt(4*np.pi)
+    snorm2 = sto_norm2(zeta)
+    return snorm2* y00 * r* np.exp(-zeta*r)
+
+def rot_orb(R, theta):
+    return orb1(R) * np.cos(theta) + orb2(R) * np.sin(theta)
+
+
+def jastrow(r12, B):
+    A = 0.5
+    return np.exp(A*r12/(1.0 + B*r12) - A/B)
+
+def psi_no_jastrow(r1, r2, VP):
+    theta1 = VP[0]
+    theta2 = VP[1]
+    o1 = rot_orb(r1,theta1)
+    o2 = rot_orb(r2,theta2)
+    return o1*o2
+
+def psi_with_jastrow(r1, r2, VP):
+    theta1 = VP[0]
+    theta2 = VP[1]
+    B = VP[2]
+    o1 = rot_orb(r1,theta1)
+    o2 = rot_orb(r2,theta2)
+    r12 = r2 - r1
+    j = jastrow(r12, B)
+    return o1*o2*j
+
+def psi(r1, r2, VP):
+    global use_jastrow
+    theta1 = VP[0]
+    theta2 = VP[1]
+    j = 1.0
+    if use_jastrow:
+        B = VP[2]
+        r12 = mag(r2 - r1)
+        j = jastrow(r12, B)
+
+    o1 = rot_orb(r1,theta1)
+    o2 = rot_orb(r2,theta2)
+    return o1*o2*j
+
+def log_psi(r1, r2, B):
+    return np.log(psi(r1, r2, B))
+
+
+# Spatial derivatives
+hess0 = hessian(psi, 0)
+hess1 = hessian(psi, 1)
+
+
+hess_log_0 = hessian(log_psi, 0)
+hess_log_1 = hessian(log_psi, 1)
+
+grad0 = grad(psi, 0)
+grad1 = grad(psi, 1)
+
+# Derivative wrt parameters
+dpsi = grad(psi, 2)
+
+def lap0(r1, r2, VP):
+    h0 = np.sum(np.diag(hess_log_0(r1, r2, VP)))
+    return h0
+
+def lap1(r1, r2, VP):
+    h1 = np.sum(np.diag(hess_log_1(r1, r2, VP)))
+    return h1
+
+def lap(r1, r2, VP):
+    h0 = np.sum(np.diag(hess0(r1, r2, VP)))
+    h1 = np.sum(np.diag(hess1(r1, r2, VP)))
+    return h0 + h1
+
+def en_pot(r1, r2):
+    r1_mag = mag(r1)
+    r2_mag = mag(r2)
+    Z = 2.0
+    return -Z/r1_mag - Z/r2_mag
+
+def ee_pot(r1, r2):
+    r12 = r2 - r1
+    r12_mag = mag(r12)
+    return 1.0/r12_mag
+
+
+def local_energy(r1, r2, VP):
+    pot = en_pot(r1, r2) + ee_pot(r1, r2)
+    psi_val = psi(r1, r2, VP)
+    lapl = lap(r1, r2, VP)
+
+    h = -0.5*lapl/psi_val + pot
+    return h
+
+dlocal_energy = grad(local_energy, 2)
+
+
+
+def print_wf_values(theta1=0.0, theta2=0.0,  use_j=False, B=0.0):
+    global use_jastrow
+
+    # Adjust numpy output so arrays are printed with higher precision
+    float_formatter = "{:.15g}".format
+    np.set_printoptions(formatter={'float_kind':float_formatter})
+
+    if use_j:
+        use_jastrow = True
+        VP = np.array([theta1, theta2, B])
+        print("Values for theta = ",theta1,theta2," and jastrow B = ",B)
+    else:
+        use_jastrow = False
+        VP = np.array([theta1, theta2])
+        print("Values for theta = ",theta1,theta2," and no jastrow")
+
+
+
+    r1 = np.array([1.0, 2.0, 3.0])
+    r2 = np.array([0.0, 1.1, 2.2])
+
+    psi_val = psi(r1, r2, VP)
+    print("  wf = ",psi_val," log wf = ",np.log(np.abs(psi_val)))
+
+    g0 = grad0(r1, r2, VP)/psi_val
+    print("  grad/psi for particle 0 = ",g0[0],g0[1],g0[2])
+
+    # Using the laplacian of log psi to match internal QMCPACK values
+    lap_0 = lap0(r1, r2, VP)
+    print(" laplacian of log psi for particle 0 = ",lap_0)
+
+    lap_1 = lap1(r1, r2, VP)
+    print(" laplacian for log psi particle 1 = ",lap_1)
+
+    eloc = local_energy(r1, r2, VP)
+    print("  local energy = ",eloc)
+
+    dp = dpsi(r1, r2, VP)
+    print("  parameter derivative of log psi = ",dp / psi_val)
+
+    deloc = dlocal_energy(r1, r2, VP)
+    print("  parameter derivative of local energy = ",deloc)
+
+    print("")
+
+def print_point_values():
+    r1 = np.array([1.0, 2.0, 3.0])
+    r2 = np.array([0.0, 1.1, 2.2])
+
+    print_wf_values(theta1=0.1, theta2=0.2)
+
+    print_wf_values(theta1=0.0, theta2=0.0)
+
+    print_wf_values(theta1=0.0, theta2=0.0, use_j=True, B=0.1)
+
+
+if __name__=='__main__':
+    print_point_values()
+
diff --git a/src/QMCWaveFunctions/tests/test_RotatedSPOs_LCAO.cpp b/src/QMCWaveFunctions/tests/test_RotatedSPOs_LCAO.cpp
new file mode 100644
index 0000000000..d68ad67bac
--- /dev/null
+++ b/src/QMCWaveFunctions/tests/test_RotatedSPOs_LCAO.cpp
@@ -0,0 +1,364 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source License.
+// See LICENSE file in top directory for details.
+//
+// Copyright (c) 2022 QMCPACK developers.
+//
+// File developed by:  Mark Dewing, mdewing@anl.gov Argonne National Laboratory
+//
+// File created by: Mark Dewing, mdewing@anl.gov Argonne National Laboratory
+//////////////////////////////////////////////////////////////////////////////////////
+
+
+#include "catch.hpp"
+
+
+#include "Message/Communicate.h"
+#include "OhmmsData/Libxml2Doc.h"
+#include "Particle/ParticleSetPool.h"
+#include "QMCWaveFunctions/WaveFunctionPool.h"
+
+
+#include <stdio.h>
+#include <string>
+#include <sstream>
+
+
+// Reference values from gen_rotated_lcao_wf.py
+
+
+namespace qmcplusplus
+{
+void setupParticleSetPool(ParticleSetPool& pp)
+{
+  // See ParticleIO/tests/test_xml_io.cpp for particle parsing
+  const char* particles = R"(<tmp>
+  <particleset name="ion0" size="1">
+    <group name="He">
+      <parameter name="charge">2</parameter>
+    </group>
+    <attrib name="position" datatype="posArray">
+      0.0 0.0 0.0
+    </attrib>
+  </particleset>
+
+  <particleset name="e" random="no">
+    <group name="u" size="1">
+      <parameter name="charge">-1</parameter>
+    <attrib name="position" datatype="posArray">
+      1.0 2.0 3.0
+    </attrib>
+    </group>
+    <group name="d" size="1">
+      <parameter name="charge">-1</parameter>
+    <attrib name="position" datatype="posArray">
+      0.0 1.1 2.2
+    </attrib>
+    </group>
+  </particleset>
+</tmp>)";
+  Libxml2Document doc;
+
+  bool okay = doc.parseFromString(particles);
+  REQUIRE(okay);
+
+  xmlNodePtr root = doc.getRoot();
+
+  xmlNodePtr part_ion = xmlFirstElementChild(root);
+  pp.put(part_ion);
+  xmlNodePtr part_elec = xmlNextElementSibling(part_ion);
+  pp.put(part_elec);
+}
+
+
+// No Jastrow, rotation angle theta1=0.1 and theta2=0.2
+TEST_CASE("Rotated LCAO WF1", "[qmcapp]")
+{
+  Communicate* c;
+  c = OHMMS::Controller;
+
+  ParticleSetPool pp(c);
+  setupParticleSetPool(pp);
+
+  WaveFunctionPool wp(pp, c);
+
+  REQUIRE(wp.empty() == true);
+
+
+  const char* wf_input = R"(<wavefunction target='e'>
+    <sposet_collection type="MolecularOrbital">
+      <!-- Use a single Slater Type Orbital (STO) for the basis. Cusp condition is correct. -->
+      <basisset keyword="STO" transform="no">
+        <atomicBasisSet type="STO" elementType="He" normalized="no">
+          <basisGroup rid="R0" l="0" m="0" type="Slater">
+             <radfunc n="1" exponent="2.0"/>
+          </basisGroup>
+          <basisGroup rid="R1" l="0" m="0" type="Slater">
+             <radfunc n="2" exponent="1.0"/>
+          </basisGroup>
+        </atomicBasisSet>
+      </basisset>
+      <sposet basisset="LCAOBSet" name="spo-up" size="2" optimize="yes">
+           <opt_vars>0.1</opt_vars>
+          <coefficient id="updetC" type="Array" size="2">
+            1.0 0.0
+            0.0 1.0
+          </coefficient>
+      </sposet>
+      <sposet basisset="LCAOBSet" name="spo-down" size="2" optimize="yes">
+           <opt_vars>0.2</opt_vars>
+          <coefficient id="downdetC" type="Array" size="2">
+            1.0 0.0
+            0.0 1.0
+          </coefficient>
+      </sposet>
+    </sposet_collection>
+    <determinantset type="MO" key="STO" transform="no" source="ion0">
+      <slaterdeterminant>
+        <determinant id="spo-up" spin="1" size="2"/>
+        <determinant id="spo-down" spin="-1" size="2"/>
+      </slaterdeterminant>
+    </determinantset>
+   </wavefunction>)";
+
+  Libxml2Document doc;
+  bool okay = doc.parseFromString(wf_input);
+  REQUIRE(okay);
+
+  xmlNodePtr root = doc.getRoot();
+
+  wp.put(root);
+
+  TrialWaveFunction* psi = wp.getWaveFunction("psi0");
+  REQUIRE(psi != nullptr);
+  REQUIRE(psi->getOrbitals().size() == 1);
+
+  opt_variables_type opt_vars;
+  psi->checkInVariables(opt_vars);
+  psi->checkOutVariables(opt_vars);
+  psi->resetParameters(opt_vars);
+
+  ParticleSet* elec = pp.getParticleSet("e");
+  elec->update();
+
+
+  double logval = psi->evaluateLog(*elec);
+  CHECK(logval == Approx(-9.26625670653773));
+
+  CHECK(elec->G[0][0] == ValueApprox(-0.2758747113720909));
+  CHECK(elec->L[0] == ValueApprox(-0.316459652026054));
+  CHECK(elec->L[1] == ValueApprox(-0.6035591598540904));
+
+  using ValueType = QMCTraits::ValueType;
+  std::vector<ValueType> dlogpsi(2);
+  std::vector<ValueType> dhpsioverpsi(2);
+  psi->evaluateDerivatives(*elec, opt_vars, dlogpsi, dhpsioverpsi);
+
+
+  CHECK(dlogpsi[0] == ValueApprox(7.58753078998516));
+  CHECK(dlogpsi[1] == ValueApprox(2.58896036829191));
+  CHECK(dhpsioverpsi[0] == ValueApprox(2.59551625714144));
+  CHECK(dhpsioverpsi[1] == ValueApprox(1.70071425070404));
+}
+
+// No Jastrow, rotation angle of 0
+TEST_CASE("Rotated LCAO WF zero angle", "[qmcapp]")
+{
+  Communicate* c;
+  c = OHMMS::Controller;
+
+  ParticleSetPool pp(c);
+  setupParticleSetPool(pp);
+
+  WaveFunctionPool wp(pp, c);
+
+  REQUIRE(wp.empty() == true);
+
+
+  const char* wf_input = R"(<wavefunction target='e'>
+    <sposet_collection type="MolecularOrbital">
+      <!-- Use a single Slater Type Orbital (STO) for the basis. Cusp condition is correct. -->
+      <basisset keyword="STO" transform="no">
+        <atomicBasisSet type="STO" elementType="He" normalized="no">
+          <basisGroup rid="R0" l="0" m="0" type="Slater">
+             <radfunc n="1" exponent="2.0"/>
+          </basisGroup>
+          <basisGroup rid="R1" l="0" m="0" type="Slater">
+             <radfunc n="2" exponent="1.0"/>
+          </basisGroup>
+        </atomicBasisSet>
+      </basisset>
+      <sposet basisset="LCAOBSet" name="spo-up" size="2" optimize="yes">
+          <coefficient id="updetC" type="Array" size="2">
+            1.0 0.0
+            0.0 1.0
+          </coefficient>
+      </sposet>
+      <sposet basisset="LCAOBSet" name="spo-down" size="2" optimize="yes">
+          <coefficient id="downdetC" type="Array" size="2">
+            1.0 0.0
+            0.0 1.0
+          </coefficient>
+      </sposet>
+    </sposet_collection>
+    <determinantset type="MO" key="STO" transform="no" source="ion0">
+      <slaterdeterminant>
+        <determinant id="spo-up" spin="1" size="2"/>
+        <determinant id="spo-down" spin="-1" size="2"/>
+      </slaterdeterminant>
+    </determinantset>
+   </wavefunction>)";
+
+  Libxml2Document doc;
+  bool okay = doc.parseFromString(wf_input);
+  REQUIRE(okay);
+
+  xmlNodePtr root = doc.getRoot();
+
+  wp.put(root);
+
+  TrialWaveFunction* psi = wp.getWaveFunction("psi0");
+  REQUIRE(psi != nullptr);
+  REQUIRE(psi->getOrbitals().size() == 1);
+
+  opt_variables_type opt_vars;
+  psi->checkInVariables(opt_vars);
+  opt_vars.resetIndex();
+  psi->checkOutVariables(opt_vars);
+  psi->resetParameters(opt_vars);
+
+  ParticleSet* elec = pp.getParticleSet("e");
+  elec->update();
+
+
+  double logval = psi->evaluateLog(*elec);
+  CHECK(logval == Approx(-11.467952668216984));
+
+  CHECK(elec->G[0][0] == ValueApprox(-0.5345224838248487));
+  CHECK(elec->L[0] == ValueApprox(-1.0690449676496974));
+  CHECK(elec->L[1] == ValueApprox(-1.626231256363484));
+
+  using ValueType = QMCTraits::ValueType;
+  std::vector<ValueType> dlogpsi(2);
+  std::vector<ValueType> dhpsioverpsi(2);
+  psi->evaluateDerivatives(*elec, opt_vars, dlogpsi, dhpsioverpsi);
+
+  CHECK(dlogpsi[0] == ValueApprox(32.2062050179872));
+  CHECK(dlogpsi[1] == ValueApprox(5.87482925187464));
+  CHECK(dhpsioverpsi[0] == ValueApprox(46.0088643114103));
+  CHECK(dhpsioverpsi[1] == ValueApprox(7.84119772047731));
+
+  RefVectorWithLeader<TrialWaveFunction> wf_list(*psi, {*psi});
+  RefVectorWithLeader<ParticleSet> p_list(*elec, {*elec});
+
+  // Test list with one wavefunction
+
+  int nparam = 2;
+  int nentry = 1;
+  RecordArray<ValueType> dlogpsi_list(nparam, nentry);
+  RecordArray<ValueType> dhpsi_over_psi_list(nparam, nentry);
+
+  TrialWaveFunction::mw_evaluateParameterDerivatives(wf_list, p_list, opt_vars, dlogpsi_list, dhpsi_over_psi_list);
+
+  CHECK(dlogpsi_list.getValue(0, 0) == ValueApprox(32.2062050179872));
+  CHECK(dlogpsi_list.getValue(1, 0) == ValueApprox(5.87482925187464));
+  CHECK(dhpsi_over_psi_list.getValue(0, 0) == ValueApprox(46.0088643114103));
+  CHECK(dhpsi_over_psi_list.getValue(1, 0) == ValueApprox(7.84119772047731));
+}
+
+// Rotation angle of 0 and add Jastrow factory
+TEST_CASE("Rotated LCAO WF with jastrow", "[qmcapp]")
+{
+  Communicate* c;
+  c = OHMMS::Controller;
+
+  ParticleSetPool pp(c);
+  setupParticleSetPool(pp);
+
+  WaveFunctionPool wp(pp, c);
+
+  REQUIRE(wp.empty() == true);
+
+
+  const char* wf_input = R"(<wavefunction target='e'>
+
+     <sposet_collection type="MolecularOrbital">
+      <!-- Use a single Slater Type Orbital (STO) for the basis. Cusp condition is correct. -->
+      <basisset keyword="STO" transform="no">
+        <atomicBasisSet type="STO" elementType="He" normalized="no">
+          <basisGroup rid="R0" l="0" m="0" type="Slater">
+             <radfunc n="1" exponent="2.0"/>
+          </basisGroup>
+          <basisGroup rid="R1" l="0" m="0" type="Slater">
+             <radfunc n="2" exponent="1.0"/>
+          </basisGroup>
+        </atomicBasisSet>
+      </basisset>
+      <sposet basisset="LCAOBSet" name="spo-up" size="2" optimize="yes">
+          <coefficient id="updetC" type="Array" size="2">
+            1.0 0.0
+            0.0 1.0
+          </coefficient>
+      </sposet>
+      <sposet basisset="LCAOBSet" name="spo-down" size="2" optimize="yes">
+          <coefficient id="updetC" type="Array" size="2">
+            1.0 0.0
+            0.0 1.0
+          </coefficient>
+      </sposet>
+    </sposet_collection>
+    <determinantset type="MO" key="STO" transform="no" source="ion0">
+      <slaterdeterminant>
+        <determinant id="spo-up" spin="1" size="2"/>
+        <determinant id="spo-down" spin="-1" size="2"/>
+      </slaterdeterminant>
+    </determinantset>
+    <jastrow name="Jee" type="Two-Body" function="pade">
+      <correlation speciesA="u" speciesB="d">
+        <var id="jud_b" name="B">0.1</var>
+      </correlation>
+    </jastrow>
+   </wavefunction>)";
+
+  Libxml2Document doc;
+  bool okay = doc.parseFromString(wf_input);
+  REQUIRE(okay);
+
+  xmlNodePtr root = doc.getRoot();
+
+  wp.put(root);
+
+  TrialWaveFunction* psi = wp.getWaveFunction("psi0");
+  REQUIRE(psi != nullptr);
+  REQUIRE(psi->getOrbitals().size() == 2);
+
+  opt_variables_type opt_vars;
+  psi->checkInVariables(opt_vars);
+  opt_vars.resetIndex();
+  psi->checkOutVariables(opt_vars);
+  psi->resetParameters(opt_vars);
+
+  ParticleSet* elec = pp.getParticleSet("e");
+  elec->update();
+
+
+  double logval = psi->evaluateLog(*elec);
+  CHECK(logval == Approx(-15.791249652199634));
+
+  CHECK(elec->G[0][0] == ValueApprox(-0.2956989647881321));
+  CHECK(elec->L[0] == ValueApprox(-0.6560429678274734));
+  CHECK(elec->L[1] == ValueApprox(-1.2132292565412577));
+
+  using ValueType = QMCTraits::ValueType;
+  std::vector<ValueType> dlogpsi(3);
+  std::vector<ValueType> dhpsioverpsi(3);
+  psi->evaluateDerivatives(*elec, opt_vars, dlogpsi, dhpsioverpsi);
+
+  CHECK(dlogpsi[0] == ValueApprox(32.206205017987166));
+  CHECK(dlogpsi[1] == ValueApprox(5.874829251874641));
+  CHECK(dlogpsi[2] == ValueApprox(49.08414605622605));
+  CHECK(dhpsioverpsi[0] == ValueApprox(32.462519534916666));
+  CHECK(dhpsioverpsi[1] == ValueApprox(10.047601212881027));
+  CHECK(dhpsioverpsi[2] == ValueApprox(2.0820644399551895));
+}
+} // namespace qmcplusplus

From 58f01adf1c6b05eb65e626e305175175ea2543d6 Mon Sep 17 00:00:00 2001
From: Ye Luo <yeluo@anl.gov>
Date: Fri, 10 Jun 2022 22:01:10 -0500
Subject: [PATCH 08/36] Add data access APIs in OhmmsArray

---
 src/Containers/OhmmsPETE/OhmmsArray.h         | 82 +++++++++++++++++++
 src/Containers/OhmmsPETE/tests/test_Array.cpp |  6 ++
 2 files changed, 88 insertions(+)

diff --git a/src/Containers/OhmmsPETE/OhmmsArray.h b/src/Containers/OhmmsPETE/OhmmsArray.h
index 0fca688529..3054120d20 100644
--- a/src/Containers/OhmmsPETE/OhmmsArray.h
+++ b/src/Containers/OhmmsPETE/OhmmsArray.h
@@ -82,7 +82,9 @@ class Array
   inline typename Container_t::const_iterator begin() const { return X.begin(); }
   inline typename Container_t::const_iterator end() const { return X.end(); }
 
+  /// access data pointer
   inline Type_t* data() { return X.data(); }
+  /// const access data pointer
   inline const Type_t* data() const { return X.data(); }
   template<typename Allocator = ALLOC, typename = qmcplusplus::IsDualSpace<Allocator>>
   inline Type_t* device_data()
@@ -95,6 +97,74 @@ class Array
     return X.device_data();
   }
 
+  template<typename SIZET = size_t, typename = std::is_integral<SIZET>>
+  Type_t* data(const std::array<SIZET, D>& dims)
+  {
+    size_t offset = dims[0];
+    for (int i = 1; i < dims.size(); i++)
+      offset = offset * Length[i] + dims[i];
+    return X.data() + offset;
+  }
+  template<typename SIZET = size_t, typename = std::is_integral<SIZET>>
+  const Type_t* data(const std::array<SIZET, D>& dims) const
+  {
+    size_t offset = dims[0];
+    for (int i = 1; i < dims.size(); i++)
+      offset = offset * Length[i] + dims[i];
+    return X.data() + offset;
+  }
+  template<typename SIZET     = size_t,
+           typename           = std::is_integral<SIZET>,
+           typename Allocator = ALLOC,
+           typename           = qmcplusplus::IsDualSpace<Allocator>>
+  Type_t* device_data(const std::array<SIZET, D>& dims)
+  {
+    size_t offset = dims[0];
+    for (int i = 1; i < dims.size(); i++)
+      offset = offset * Length[i] + dims[i];
+    return X.device_data() + offset;
+  }
+  template<typename SIZET     = size_t,
+           typename           = std::is_integral<SIZET>,
+           typename Allocator = ALLOC,
+           typename           = qmcplusplus::IsDualSpace<Allocator>>
+  const Type_t* device_data(const std::array<SIZET, D>& dims) const
+  {
+    size_t offset = dims[0];
+    for (int i = 1; i < dims.size(); i++)
+      offset = offset * Length[i] + dims[i];
+    return X.device_data() + offset;
+  }
+
+  /// access data pointer at (offset_1, ..., offset_D})
+  template<typename... Args>
+  Type_t* data(Args... sizes)
+  {
+    static_assert(sizeof...(Args) == D, "resize arguments must match dimensionality of Array");
+    return data({static_cast<std::size_t>(std::forward<Args>(sizes))...});
+  }
+  /// const access data pointer at (offset_1, ..., offset_D})
+  template<typename... Args>
+  const Type_t* data(Args... sizes) const
+  {
+    static_assert(sizeof...(Args) == D, "resize arguments must match dimensionality of Array");
+    return data({static_cast<std::size_t>(std::forward<Args>(sizes))...});
+  }
+  template<typename... Args, typename Allocator = ALLOC, typename = qmcplusplus::IsDualSpace<Allocator>>
+  Type_t* device_data(Args... sizes)
+  {
+    static_assert(sizeof...(Args) == D, "resize arguments must match dimensionality of Array");
+    return device_data({static_cast<std::size_t>(std::forward<Args>(sizes))...});
+  }
+  /// const access data pointer at (offset_1, ..., offset_D})
+  template<typename... Args, typename Allocator = ALLOC, typename = qmcplusplus::IsDualSpace<Allocator>>
+  const Type_t* device_data(Args... sizes) const
+  {
+    static_assert(sizeof...(Args) == D, "resize arguments must match dimensionality of Array");
+    return device_data({static_cast<std::size_t>(std::forward<Args>(sizes))...});
+  }
+
+
   inline const Type_t* first_address() const { return &(X[0]); }
 
   inline const Type_t* last_address() const { return &(X[0]) + X.size(); }
@@ -152,6 +222,18 @@ class Array
     return s;
   }
 
+  // Abstract Dual Space Transfers
+  template<typename Allocator = ALLOC, typename = qmcplusplus::IsDualSpace<Allocator>>
+  void updateTo()
+  {
+    X.updateTo();
+  }
+  template<typename Allocator = ALLOC, typename = qmcplusplus::IsDualSpace<Allocator>>
+  void updateFrom()
+  {
+    X.updateFrom();
+  }
+
 private:
   std::array<size_t, D> Length;
   Container_t X;
diff --git a/src/Containers/OhmmsPETE/tests/test_Array.cpp b/src/Containers/OhmmsPETE/tests/test_Array.cpp
index e4bf9cb3b1..36b1c0d2d5 100644
--- a/src/Containers/OhmmsPETE/tests/test_Array.cpp
+++ b/src/Containers/OhmmsPETE/tests/test_Array.cpp
@@ -73,6 +73,12 @@ TEST_CASE("array NestedContainers", "[OhmmsPETE]")
   CHECK(vec_copy(0).back() == 123);
 }
 
+TEST_CASE("Array::data", "[OhmmsPETE]")
+{
+  Array<double, 3> tensor(2, 4, 5);
+  CHECK(tensor.data() + 1 * 4 * 5 + 2 * 5 + 3 == tensor.data(1, 2, 3));
+}
+
 TEST_CASE("Array::dimension sizes constructor", "[OhmmsPETE]")
 {
   const int dim = 2;

From 6685f2f07535159b42cfead01799a2e06c5b0d1b Mon Sep 17 00:00:00 2001
From: Ye Luo <yeluo@anl.gov>
Date: Fri, 10 Jun 2022 22:07:55 -0500
Subject: [PATCH 09/36] Renaming variables.

---
 src/Containers/OhmmsPETE/OhmmsArray.h | 48 +++++++++++++--------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/src/Containers/OhmmsPETE/OhmmsArray.h b/src/Containers/OhmmsPETE/OhmmsArray.h
index 3054120d20..f27398d10c 100644
--- a/src/Containers/OhmmsPETE/OhmmsArray.h
+++ b/src/Containers/OhmmsPETE/OhmmsArray.h
@@ -98,70 +98,70 @@ class Array
   }
 
   template<typename SIZET = size_t, typename = std::is_integral<SIZET>>
-  Type_t* data(const std::array<SIZET, D>& dims)
+  Type_t* data(const std::array<SIZET, D>& offsets)
   {
-    size_t offset = dims[0];
-    for (int i = 1; i < dims.size(); i++)
-      offset = offset * Length[i] + dims[i];
+    size_t offset = offsets[0];
+    for (int i = 1; i < offsets.size(); i++)
+      offset = offset * Length[i] + offsets[i];
     return X.data() + offset;
   }
   template<typename SIZET = size_t, typename = std::is_integral<SIZET>>
-  const Type_t* data(const std::array<SIZET, D>& dims) const
+  const Type_t* data(const std::array<SIZET, D>& offsets) const
   {
-    size_t offset = dims[0];
-    for (int i = 1; i < dims.size(); i++)
-      offset = offset * Length[i] + dims[i];
+    size_t offset = offsets[0];
+    for (int i = 1; i < offsets.size(); i++)
+      offset = offset * Length[i] + offsets[i];
     return X.data() + offset;
   }
   template<typename SIZET     = size_t,
            typename           = std::is_integral<SIZET>,
            typename Allocator = ALLOC,
            typename           = qmcplusplus::IsDualSpace<Allocator>>
-  Type_t* device_data(const std::array<SIZET, D>& dims)
+  Type_t* device_data(const std::array<SIZET, D>& offsets)
   {
-    size_t offset = dims[0];
-    for (int i = 1; i < dims.size(); i++)
-      offset = offset * Length[i] + dims[i];
+    size_t offset = offsets[0];
+    for (int i = 1; i < offsets.size(); i++)
+      offset = offset * Length[i] + offsets[i];
     return X.device_data() + offset;
   }
   template<typename SIZET     = size_t,
            typename           = std::is_integral<SIZET>,
            typename Allocator = ALLOC,
            typename           = qmcplusplus::IsDualSpace<Allocator>>
-  const Type_t* device_data(const std::array<SIZET, D>& dims) const
+  const Type_t* device_data(const std::array<SIZET, D>& offsets) const
   {
-    size_t offset = dims[0];
-    for (int i = 1; i < dims.size(); i++)
-      offset = offset * Length[i] + dims[i];
+    size_t offset = offsets[0];
+    for (int i = 1; i < offsets.size(); i++)
+      offset = offset * Length[i] + offsets[i];
     return X.device_data() + offset;
   }
 
   /// access data pointer at (offset_1, ..., offset_D})
   template<typename... Args>
-  Type_t* data(Args... sizes)
+  Type_t* data(Args... offsets)
   {
     static_assert(sizeof...(Args) == D, "resize arguments must match dimensionality of Array");
-    return data({static_cast<std::size_t>(std::forward<Args>(sizes))...});
+    return data({static_cast<std::size_t>(std::forward<Args>(offsets))...});
   }
   /// const access data pointer at (offset_1, ..., offset_D})
   template<typename... Args>
-  const Type_t* data(Args... sizes) const
+  const Type_t* data(Args... offsets) const
   {
     static_assert(sizeof...(Args) == D, "resize arguments must match dimensionality of Array");
-    return data({static_cast<std::size_t>(std::forward<Args>(sizes))...});
+    return data({static_cast<std::size_t>(std::forward<Args>(offsets))...});
   }
   template<typename... Args, typename Allocator = ALLOC, typename = qmcplusplus::IsDualSpace<Allocator>>
-  Type_t* device_data(Args... sizes)
+  Type_t* device_data(Args... offsets)
   {
     static_assert(sizeof...(Args) == D, "resize arguments must match dimensionality of Array");
-    return device_data({static_cast<std::size_t>(std::forward<Args>(sizes))...});
+    return device_data({static_cast<std::size_t>(std::forward<Args>(offsets))...});
   }
   /// const access data pointer at (offset_1, ..., offset_D})
   template<typename... Args, typename Allocator = ALLOC, typename = qmcplusplus::IsDualSpace<Allocator>>
-  const Type_t* device_data(Args... sizes) const
+  const Type_t* device_data(Args... offsets) const
   {
     static_assert(sizeof...(Args) == D, "resize arguments must match dimensionality of Array");
-    return device_data({static_cast<std::size_t>(std::forward<Args>(sizes))...});
+    return device_data({static_cast<std::size_t>(std::forward<Args>(offsets))...});
   }
 
 

From 7f77018c8f1d5b936a4c3d3703aeacb9edfc0ca7 Mon Sep 17 00:00:00 2001
From: Ye Luo <yeluo@anl.gov>
Date: Mon, 13 Jun 2022 19:43:49 -0500
Subject: [PATCH 10/36] Replace operator() with parameter packing.

---
 src/Containers/OhmmsPETE/OhmmsArray.h         | 43 +++++++++-----
 src/Containers/OhmmsPETE/tests/test_Array.cpp |  4 +-
 src/Estimators/TraceManager.h                 | 13 +----
 .../Fermion/BackflowTransformation.cpp        |  7 +--
 .../HarmonicOscillator/SHOSet.cpp             | 56 ++++++++-----------
 5 files changed, 58 insertions(+), 65 deletions(-)

diff --git a/src/Containers/OhmmsPETE/OhmmsArray.h b/src/Containers/OhmmsPETE/OhmmsArray.h
index f27398d10c..8744c1b789 100644
--- a/src/Containers/OhmmsPETE/OhmmsArray.h
+++ b/src/Containers/OhmmsPETE/OhmmsArray.h
@@ -140,27 +140,27 @@ class Array
   template<typename... Args>
   Type_t* data(Args... offsets)
   {
-    static_assert(sizeof...(Args) == D, "resize arguments must match dimensionality of Array");
+    static_assert(sizeof...(Args) == D, "data arguments must match dimensionality of Array");
     return data({static_cast<std::size_t>(std::forward<Args>(offsets))...});
   }
   /// const access data pointer at (offset_1, ..., offset_D})
   template<typename... Args>
   const Type_t* data(Args... offsets) const
   {
-    static_assert(sizeof...(Args) == D, "resize arguments must match dimensionality of Array");
+    static_assert(sizeof...(Args) == D, "data arguments must match dimensionality of Array");
     return data({static_cast<std::size_t>(std::forward<Args>(offsets))...});
   }
   template<typename... Args, typename Allocator = ALLOC, typename = qmcplusplus::IsDualSpace<Allocator>>
   Type_t* device_data(Args... offsets)
   {
-    static_assert(sizeof...(Args) == D, "resize arguments must match dimensionality of Array");
+    static_assert(sizeof...(Args) == D, "device_data arguments must match dimensionality of Array");
     return device_data({static_cast<std::size_t>(std::forward<Args>(offsets))...});
   }
   /// const access data pointer at (offset_1, ..., offset_D})
   template<typename... Args, typename Allocator = ALLOC, typename = qmcplusplus::IsDualSpace<Allocator>>
   const Type_t* device_data(Args... offsets) const
   {
-    static_assert(sizeof...(Args) == D, "resize arguments must match dimensionality of Array");
+    static_assert(sizeof...(Args) == D, "device_data arguments must match dimensionality of Array");
     return device_data({static_cast<std::size_t>(std::forward<Args>(offsets))...});
   }
 
@@ -198,20 +198,33 @@ class Array
   }
 
   // Get and Set Operations
-  inline Type_t& operator()(size_t i) { return X[i]; }
-
-  inline Type_t operator()(size_t i) const { return X[i]; }
-  inline Type_t& operator()(size_t i, size_t j) { return X[j + Length[1] * i]; }
-  inline Type_t operator()(size_t i, size_t j) const { return X[j + Length[1] * i]; }
-  inline Type_t& operator()(size_t i, size_t j, size_t k) { return X[k + Length[2] * (j + Length[1] * i)]; }
-  inline Type_t operator()(size_t i, size_t j, size_t k) const { return X[k + Length[2] * (j + Length[1] * i)]; }
-  inline Type_t& operator()(size_t i, size_t j, size_t k, size_t l)
+  template<typename SIZET = size_t, typename = std::is_integral<SIZET>>
+  Type_t& operator()(const std::array<SIZET, D>& indices)
   {
-    return X[l + Length[3] * (k + Length[2] * (j + Length[1] * i))];
+    size_t offset = indices[0];
+    for (int i = 1; i < indices.size(); i++)
+      offset = offset * Length[i] + indices[i];
+    return X[offset];
   }
-  inline Type_t operator()(size_t i, size_t j, size_t k, size_t l) const
+  template<typename SIZET = size_t, typename = std::is_integral<SIZET>>
+  const Type_t& operator()(const std::array<SIZET, D>& indices) const
+  {
+    size_t offset = indices[0];
+    for (int i = 1; i < indices.size(); i++)
+      offset = offset * Length[i] + indices[i];
+    return X[offset];
+  }
+  template<typename... Args>
+  Type_t& operator()(Args... offsets)
+  {
+    static_assert(sizeof...(Args) == D, "operator() arguments must match dimensionality of Array");
+    return operator()({static_cast<std::size_t>(std::forward<Args>(offsets))...});
+  }
+  template<typename... Args>
+  const Type_t& operator()(Args... offsets) const
   {
-    return X[l + Length[3] * (k + Length[2] * (j + Length[1] * i))];
+    static_assert(sizeof...(Args) == D, "operator() arguments must match dimensionality of Array");
+    return operator()({static_cast<std::size_t>(std::forward<Args>(offsets))...});
   }
 
   inline Type_t sum() const
diff --git a/src/Containers/OhmmsPETE/tests/test_Array.cpp b/src/Containers/OhmmsPETE/tests/test_Array.cpp
index 36b1c0d2d5..87bd969eb2 100644
--- a/src/Containers/OhmmsPETE/tests/test_Array.cpp
+++ b/src/Containers/OhmmsPETE/tests/test_Array.cpp
@@ -24,8 +24,8 @@ namespace qmcplusplus
 TEST_CASE("array", "[OhmmsPETE]")
 {
   using Array1D = Array<double, 1>;
-  Array1D A({3});
-  Array1D B({3});
+  Array1D A(3);
+  Array1D B(3);
 
   // iterator
   auto ia = A.begin();
diff --git a/src/Estimators/TraceManager.h b/src/Estimators/TraceManager.h
index cfb081a7a8..b676759a3e 100644
--- a/src/Estimators/TraceManager.h
+++ b/src/Estimators/TraceManager.h
@@ -1193,9 +1193,7 @@ struct TraceBuffer
       if (has_complex)
         complex_samples->combine_samples();
       //collect data from all samples into the buffer row
-      int offset = current_row * row_size;
       {
-        int boffset;
         std::vector<TraceSample<T>*>& ordered_samples = samples->ordered_samples;
         for (int s = 0; s < ordered_samples.size(); s++)
         {
@@ -1203,17 +1201,13 @@ struct TraceBuffer
           if (tsample.write)
           {
             auto& sample = tsample.sample;
-            boffset      = offset + tsample.buffer_start;
             for (int i = 0; i < sample.size(); ++i)
-            {
-              buffer(boffset + i) = sample[i];
-            }
+              buffer(current_row, tsample.buffer_start + i) = sample[i];
           }
         }
       }
       if (has_complex)
       {
-        int boffset;
         std::vector<TraceSample<std::complex<T>>*>& ordered_samples = complex_samples->ordered_samples;
         for (int s = 0; s < ordered_samples.size(); s++)
         {
@@ -1221,11 +1215,10 @@ struct TraceBuffer
           if (tsample.write)
           {
             auto& sample = tsample.sample;
-            boffset      = offset + tsample.buffer_start;
             for (int i = 0, ib = 0; i < sample.size(); ++i, ib += 2)
             {
-              buffer(boffset + ib)     = sample[i].real();
-              buffer(boffset + ib + 1) = sample[i].imag();
+              buffer(current_row, tsample.buffer_start + ib)     = sample[i].real();
+              buffer(current_row, tsample.buffer_start + ib + 1) = sample[i].imag();
             }
           }
         }
diff --git a/src/QMCWaveFunctions/Fermion/BackflowTransformation.cpp b/src/QMCWaveFunctions/Fermion/BackflowTransformation.cpp
index 62f8e259cf..7417bd09f5 100644
--- a/src/QMCWaveFunctions/Fermion/BackflowTransformation.cpp
+++ b/src/QMCWaveFunctions/Fermion/BackflowTransformation.cpp
@@ -388,8 +388,7 @@ void BackflowTransformation::evaluateDerivatives(const ParticleSet& P)
   Bmat_full = 0.0;
   Cmat      = 0.0;
   Ymat      = 0.0;
-  for (int i = 0; i < Xmat.size(); i++)
-    Xmat(i) = 0;
+  std::fill_n(Xmat.data(), Xmat.size(), 0);
   for (int i = 0; i < NumTargets; i++)
   {
     QP.R[i] = P.R[i];
@@ -414,9 +413,7 @@ void BackflowTransformation::testDeriv(const ParticleSet& P)
   Bmat_full = 0.0;
   Cmat      = 0.0;
   Ymat      = 0.0;
-  //       Xmat=DummyHess;
-  for (int i = 0; i < Xmat.size(); i++)
-    Xmat(i) = 0;
+  std::fill_n(Xmat.data(), Xmat.size(), 0);
   for (int i = 0; i < NumTargets; i++)
   {
     QP.R[i] = P.R[i];
diff --git a/src/QMCWaveFunctions/HarmonicOscillator/SHOSet.cpp b/src/QMCWaveFunctions/HarmonicOscillator/SHOSet.cpp
index dbb7f941a1..28b87f6fad 100644
--- a/src/QMCWaveFunctions/HarmonicOscillator/SHOSet.cpp
+++ b/src/QMCWaveFunctions/HarmonicOscillator/SHOSet.cpp
@@ -137,22 +137,21 @@ void SHOSet::evaluate_vgl(PosType r, ValueVector& psi, GradVector& dpsi, ValueVe
 
 void SHOSet::evaluate_hermite(const PosType& xpos)
 {
-  int shift = 0;
-  for (int d = 0; d < DIM; ++d, shift += nmax)
+  for (int d = 0; d < DIM; ++d)
   {
     int nh = qn_max[d];
     if (nh > 0)
     {
-      RealType x         = xpos[d];
-      hermite(0 + shift) = 1.0;
-      RealType Hnm2      = 0.0;
-      RealType Hnm1      = 1.0;
+      RealType x    = xpos[d];
+      hermite(d, 0) = 1.0;
+      RealType Hnm2 = 0.0;
+      RealType Hnm1 = 1.0;
       for (int n = 1; n < nh; ++n)
       {
-        RealType Hn        = 2 * (x * Hnm1 - (n - 1) * Hnm2);
-        hermite(n + shift) = Hn;
-        Hnm2               = Hnm1;
-        Hnm1               = Hn;
+        RealType Hn   = 2 * (x * Hnm1 - (n - 1) * Hnm2);
+        hermite(d, n) = Hn;
+        Hnm2          = Hnm1;
+        Hnm1          = Hn;
       }
     }
   }
@@ -162,24 +161,21 @@ void SHOSet::evaluate_hermite(const PosType& xpos)
 void SHOSet::evaluate_d0(const PosType& xpos, ValueVector& psi)
 {
   using std::exp;
-  int shift = 0;
-  for (int d = 0; d < DIM; ++d, shift += nmax)
+  for (int d = 0; d < DIM; ++d)
   {
     RealType x = xpos[d];
     RealType g = exp(-.5 * x * x);
     for (int n = 0; n < qn_max[d]; ++n)
     {
-      int ns      = n + shift;
-      bvalues(ns) = prefactors[n] * g * hermite(ns);
+      bvalues(d, n) = prefactors[n] * g * hermite(d, n);
     }
   }
   for (int s = 0; s < state_info.size(); ++s)
   {
     const SHOState& state = state_info[s];
     RealType phi          = 1.0;
-    int shift             = 0;
-    for (int d = 0; d < DIM; ++d, shift += nmax)
-      phi *= bvalues(shift + state.quantum_number[d]);
+    for (int d = 0; d < DIM; ++d)
+      phi *= bvalues(d, state.quantum_number[d]);
     psi[s] = phi;
   }
 }
@@ -188,26 +184,23 @@ void SHOSet::evaluate_d0(const PosType& xpos, ValueVector& psi)
 void SHOSet::evaluate_d1(const PosType& xpos, ValueVector& psi, GradVector& dpsi)
 {
   RealType ol = 1.0 / length;
-  int shift   = 0;
-  for (int d = 0; d < DIM; ++d, shift += nmax)
+  for (int d = 0; d < DIM; ++d)
   {
     RealType x    = xpos[d];
     RealType Hnm1 = 0.0;
     for (int n = 0; n < qn_max[d]; ++n)
     {
-      int ns      = n + shift;
-      RealType Hn = hermite(ns);
-      bvalues(ns) = (-x + 2 * n * Hnm1 / Hn) * ol;
-      Hnm1        = Hn;
+      RealType Hn   = hermite(d, n);
+      bvalues(d, n) = (-x + 2 * n * Hnm1 / Hn) * ol;
+      Hnm1          = Hn;
     }
   }
   for (int s = 0; s < state_info.size(); ++s)
   {
     const SHOState& state = state_info[s];
     TinyVector<ValueType, DIM> dphi;
-    int shift = 0;
-    for (int d = 0; d < DIM; ++d, shift += nmax)
-      dphi[d] = bvalues(shift + state.quantum_number[d]);
+    for (int d = 0; d < DIM; ++d)
+      dphi[d] = bvalues(d, state.quantum_number[d]);
     dphi *= psi[s];
     dpsi[s] = dphi;
   }
@@ -217,24 +210,21 @@ void SHOSet::evaluate_d1(const PosType& xpos, ValueVector& psi, GradVector& dpsi
 void SHOSet::evaluate_d2(const PosType& xpos, ValueVector& psi, ValueVector& d2psi)
 {
   RealType ol2 = 1.0 / (length * length);
-  int shift    = 0;
-  for (int d = 0; d < DIM; ++d, shift += nmax)
+  for (int d = 0; d < DIM; ++d)
   {
     RealType x  = xpos[d];
     RealType x2 = x * x;
     for (int n = 0; n < qn_max[d]; ++n)
     {
-      int ns      = n + shift;
-      bvalues(ns) = (-1.0 + x2 - 2 * n) * ol2;
+      bvalues(d, n) = (-1.0 + x2 - 2 * n) * ol2;
     }
   }
   for (int s = 0; s < state_info.size(); ++s)
   {
     const SHOState& state = state_info[s];
     ValueType d2phi       = 0.0;
-    int shift             = 0;
-    for (int d = 0; d < DIM; ++d, shift += nmax)
-      d2phi += bvalues(shift + state.quantum_number[d]);
+    for (int d = 0; d < DIM; ++d)
+      d2phi += bvalues(d, state.quantum_number[d]);
     d2phi *= psi[s];
     d2psi[s] = d2phi;
   }

From c2105272523d444c7c698a0e544d3a29303c7a41 Mon Sep 17 00:00:00 2001
From: Ye Luo <yeluo@anl.gov>
Date: Mon, 13 Jun 2022 19:53:22 -0500
Subject: [PATCH 11/36] Rename data(offset) to data_at(indices)

---
 src/Containers/OhmmsPETE/OhmmsArray.h         | 64 +++++++++----------
 src/Containers/OhmmsPETE/tests/test_Array.cpp |  2 +-
 2 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/src/Containers/OhmmsPETE/OhmmsArray.h b/src/Containers/OhmmsPETE/OhmmsArray.h
index 8744c1b789..7f222af7e8 100644
--- a/src/Containers/OhmmsPETE/OhmmsArray.h
+++ b/src/Containers/OhmmsPETE/OhmmsArray.h
@@ -98,73 +98,73 @@ class Array
   }
 
   template<typename SIZET = size_t, typename = std::is_integral<SIZET>>
-  Type_t* data(const std::array<SIZET, D>& offsets)
+  Type_t* data_at(const std::array<SIZET, D>& indices)
   {
-    size_t offset = offsets[0];
-    for (int i = 1; i < offsets.size(); i++)
-      offset = offset * Length[i] + offsets[i];
+    size_t offset = indices[0];
+    for (int i = 1; i < indices.size(); i++)
+      offset = offset * Length[i] + indices[i];
     return X.data() + offset;
   }
   template<typename SIZET = size_t, typename = std::is_integral<SIZET>>
-  const Type_t* data(const std::array<SIZET, D>& offsets) const
+  const Type_t* data_at(const std::array<SIZET, D>& indices) const
   {
-    size_t offset = offsets[0];
-    for (int i = 1; i < offsets.size(); i++)
-      offset = offset * Length[i] + offsets[i];
+    size_t offset = indices[0];
+    for (int i = 1; i < indices.size(); i++)
+      offset = offset * Length[i] + indices[i];
     return X.data() + offset;
   }
   template<typename SIZET     = size_t,
            typename           = std::is_integral<SIZET>,
            typename Allocator = ALLOC,
            typename           = qmcplusplus::IsDualSpace<Allocator>>
-  Type_t* device_data(const std::array<SIZET, D>& offsets)
+  Type_t* device_data_at(const std::array<SIZET, D>& indices)
   {
-    size_t offset = offsets[0];
-    for (int i = 1; i < offsets.size(); i++)
-      offset = offset * Length[i] + offsets[i];
+    size_t offset = indices[0];
+    for (int i = 1; i < indices.size(); i++)
+      offset = offset * Length[i] + indices[i];
     return X.device_data() + offset;
   }
   template<typename SIZET     = size_t,
            typename           = std::is_integral<SIZET>,
            typename Allocator = ALLOC,
            typename           = qmcplusplus::IsDualSpace<Allocator>>
-  const Type_t* device_data(const std::array<SIZET, D>& offsets) const
+  const Type_t* device_data_at(const std::array<SIZET, D>& indices) const
   {
-    size_t offset = offsets[0];
-    for (int i = 1; i < offsets.size(); i++)
-      offset = offset * Length[i] + offsets[i];
+    size_t offset = indices[0];
+    for (int i = 1; i < indices.size(); i++)
+      offset = offset * Length[i] + indices[i];
     return X.device_data() + offset;
   }
 
-  /// access data pointer at (offset_1, ..., offset_D})
+  /// access data pointer at {index_1, ..., index_D}
   template<typename... Args>
-  Type_t* data(Args... offsets)
+  Type_t* data_at(Args... indices)
   {
     static_assert(sizeof...(Args) == D, "data arguments must match dimensionality of Array");
-    return data({static_cast<std::size_t>(std::forward<Args>(offsets))...});
+    return data_at({static_cast<std::size_t>(std::forward<Args>(indices))...});
   }
-  /// const access data pointer at (offset_1, ..., offset_D})
+  /// const access data pointer at {index_1, ..., index_D}
   template<typename... Args>
-  const Type_t* data(Args... offsets) const
+  const Type_t* data_at(Args... indices) const
   {
     static_assert(sizeof...(Args) == D, "data arguments must match dimensionality of Array");
-    return data({static_cast<std::size_t>(std::forward<Args>(offsets))...});
+    return data_at({static_cast<std::size_t>(std::forward<Args>(indices))...});
   }
+  /// access data pointer at {index_1, ..., index_D}
   template<typename... Args, typename Allocator = ALLOC, typename = qmcplusplus::IsDualSpace<Allocator>>
-  Type_t* device_data(Args... offsets)
+  Type_t* device_data_at(Args... indices)
   {
     static_assert(sizeof...(Args) == D, "device_data arguments must match dimensionality of Array");
-    return device_data({static_cast<std::size_t>(std::forward<Args>(offsets))...});
+    return device_data_at({static_cast<std::size_t>(std::forward<Args>(indices))...});
   }
-  /// const access data pointer at (offset_1, ..., offset_D})
+  /// const access data pointer at {index_1, ..., index_D}
   template<typename... Args, typename Allocator = ALLOC, typename = qmcplusplus::IsDualSpace<Allocator>>
-  const Type_t* device_data(Args... offsets) const
+  const Type_t* device_data_at(Args... indices) const
   {
     static_assert(sizeof...(Args) == D, "device_data arguments must match dimensionality of Array");
-    return device_data({static_cast<std::size_t>(std::forward<Args>(offsets))...});
+    return device_data_at({static_cast<std::size_t>(std::forward<Args>(indices))...});
   }
 
-
   inline const Type_t* first_address() const { return &(X[0]); }
 
   inline const Type_t* last_address() const { return &(X[0]) + X.size(); }
@@ -215,16 +215,16 @@ class Array
     return X[offset];
   }
   template<typename... Args>
-  Type_t& operator()(Args... offsets)
+  Type_t& operator()(Args... indices)
   {
     static_assert(sizeof...(Args) == D, "operator() arguments must match dimensionality of Array");
-    return operator()({static_cast<std::size_t>(std::forward<Args>(offsets))...});
+    return operator()({static_cast<std::size_t>(std::forward<Args>(indices))...});
   }
   template<typename... Args>
-  const Type_t& operator()(Args... offsets) const
+  const Type_t& operator()(Args... indices) const
   {
     static_assert(sizeof...(Args) == D, "operator() arguments must match dimensionality of Array");
-    return operator()({static_cast<std::size_t>(std::forward<Args>(offsets))...});
+    return operator()({static_cast<std::size_t>(std::forward<Args>(indices))...});
   }
 
   inline Type_t sum() const
diff --git a/src/Containers/OhmmsPETE/tests/test_Array.cpp b/src/Containers/OhmmsPETE/tests/test_Array.cpp
index 87bd969eb2..ca41aa5fcc 100644
--- a/src/Containers/OhmmsPETE/tests/test_Array.cpp
+++ b/src/Containers/OhmmsPETE/tests/test_Array.cpp
@@ -76,7 +76,7 @@ TEST_CASE("array NestedContainers", "[OhmmsPETE]")
 TEST_CASE("Array::data", "[OhmmsPETE]")
 {
   Array<double, 3> tensor(2, 4, 5);
-  CHECK(tensor.data() + 1 * 4 * 5 + 2 * 5 + 3 == tensor.data(1, 2, 3));
+  CHECK(tensor.data() + 1 * 4 * 5 + 2 * 5 + 3 == tensor.data_at(1, 2, 3));
 }
 
 TEST_CASE("Array::dimension sizes constructor", "[OhmmsPETE]")

From ba4363bdb63a36ef699480c6ea39588f87ed5b04 Mon Sep 17 00:00:00 2001
From: Ye Luo <yeluo@anl.gov>
Date: Mon, 13 Jun 2022 20:06:53 -0500
Subject: [PATCH 12/36] Replace redundant code with a function.

---
 src/Containers/OhmmsPETE/OhmmsArray.h | 47 +++++++++++----------------
 1 file changed, 19 insertions(+), 28 deletions(-)

diff --git a/src/Containers/OhmmsPETE/OhmmsArray.h b/src/Containers/OhmmsPETE/OhmmsArray.h
index 7f222af7e8..e730e2449a 100644
--- a/src/Containers/OhmmsPETE/OhmmsArray.h
+++ b/src/Containers/OhmmsPETE/OhmmsArray.h
@@ -100,18 +100,12 @@ class Array
   template<typename SIZET = size_t, typename = std::is_integral<SIZET>>
   Type_t* data_at(const std::array<SIZET, D>& indices)
   {
-    size_t offset = indices[0];
-    for (int i = 1; i < indices.size(); i++)
-      offset = offset * Length[i] + indices[i];
-    return X.data() + offset;
+    return X.data() + compute_offset(indices);
   }
   template<typename SIZET = size_t, typename = std::is_integral<SIZET>>
   const Type_t* data_at(const std::array<SIZET, D>& indices) const
   {
-    size_t offset = indices[0];
-    for (int i = 1; i < indices.size(); i++)
-      offset = offset * Length[i] + indices[i];
-    return X.data() + offset;
+    return X.data() + compute_offset(indices);
   }
   template<typename SIZET     = size_t,
            typename           = std::is_integral<SIZET>,
@@ -119,10 +113,7 @@ class Array
            typename           = qmcplusplus::IsDualSpace<Allocator>>
   Type_t* device_data_at(const std::array<SIZET, D>& indices)
   {
-    size_t offset = indices[0];
-    for (int i = 1; i < indices.size(); i++)
-      offset = offset * Length[i] + indices[i];
-    return X.device_data() + offset;
+    return X.device_data() + compute_offset(indices);
   }
   template<typename SIZET     = size_t,
            typename           = std::is_integral<SIZET>,
@@ -130,10 +121,7 @@ class Array
            typename           = qmcplusplus::IsDualSpace<Allocator>>
   const Type_t* device_data_at(const std::array<SIZET, D>& indices) const
   {
-    size_t offset = indices[0];
-    for (int i = 1; i < indices.size(); i++)
-      offset = offset * Length[i] + indices[i];
-    return X.device_data() + offset;
+    return X.device_data() + compute_offset(indices);
   }
 
   /// access data pointer at {index_1, ..., index_D}
@@ -165,13 +153,13 @@ class Array
     return device_data_at({static_cast<std::size_t>(std::forward<Args>(indices))...});
   }
 
-  inline const Type_t* first_address() const { return &(X[0]); }
+  inline const Type_t* first_address() const { return X.data(); }
 
-  inline const Type_t* last_address() const { return &(X[0]) + X.size(); }
+  inline const Type_t* last_address() const { return X.data() + X.size(); }
 
-  inline Type_t* first_address() { return &(X[0]); }
+  inline Type_t* first_address() { return X.data(); }
 
-  inline Type_t* last_address() { return &(X[0]) + X.size(); }
+  inline Type_t* last_address() { return X.data() + X.size(); }
 
   This_t& operator=(const T& rhs)
   {
@@ -201,18 +189,12 @@ class Array
   template<typename SIZET = size_t, typename = std::is_integral<SIZET>>
   Type_t& operator()(const std::array<SIZET, D>& indices)
   {
-    size_t offset = indices[0];
-    for (int i = 1; i < indices.size(); i++)
-      offset = offset * Length[i] + indices[i];
-    return X[offset];
+    return X[compute_offset(indices)];
   }
   template<typename SIZET = size_t, typename = std::is_integral<SIZET>>
   const Type_t& operator()(const std::array<SIZET, D>& indices) const
   {
-    size_t offset = indices[0];
-    for (int i = 1; i < indices.size(); i++)
-      offset = offset * Length[i] + indices[i];
-    return X[offset];
+    return X[compute_offset(indices)];
   }
   template<typename... Args>
   Type_t& operator()(Args... indices)
@@ -258,6 +240,15 @@ class Array
       total *= dims[i];
     return total;
   }
+
+  template<typename SIZET = size_t, typename = std::is_integral<SIZET>>
+  SIZET compute_offset(const std::array<SIZET, D>& indices) const
+  {
+    SIZET offset = indices[0];
+    for (int i = 1; i < indices.size(); i++)
+      offset = offset * Length[i] + indices[i];
+    return offset;
+  }
 };
 
 template<class T, unsigned D, class Alloc>

From 036d6196c8d998466c4cf7ff5515f64f3b0debd2 Mon Sep 17 00:00:00 2001
From: Ye Luo <yeluo@anl.gov>
Date: Mon, 13 Jun 2022 20:22:45 -0500
Subject: [PATCH 13/36] Grouping doxygen comments.

---
 src/Containers/OhmmsPETE/OhmmsArray.h | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/Containers/OhmmsPETE/OhmmsArray.h b/src/Containers/OhmmsPETE/OhmmsArray.h
index e730e2449a..02a36f0cff 100644
--- a/src/Containers/OhmmsPETE/OhmmsArray.h
+++ b/src/Containers/OhmmsPETE/OhmmsArray.h
@@ -82,9 +82,9 @@ class Array
   inline typename Container_t::const_iterator begin() const { return X.begin(); }
   inline typename Container_t::const_iterator end() const { return X.end(); }
 
-  /// access data pointer
+  ///@{
+  /// access the container data pointer
   inline Type_t* data() { return X.data(); }
-  /// const access data pointer
   inline const Type_t* data() const { return X.data(); }
   template<typename Allocator = ALLOC, typename = qmcplusplus::IsDualSpace<Allocator>>
   inline Type_t* device_data()
@@ -96,7 +96,10 @@ class Array
   {
     return X.device_data();
   }
+  ///@}
 
+  ///@{
+  /// access the data pointer at {index_1, ..., index_D}
   template<typename SIZET = size_t, typename = std::is_integral<SIZET>>
   Type_t* data_at(const std::array<SIZET, D>& indices)
   {
@@ -124,34 +127,31 @@ class Array
     return X.device_data() + compute_offset(indices);
   }
 
-  /// access data pointer at {index_1, ..., index_D}
   template<typename... Args>
   Type_t* data_at(Args... indices)
   {
     static_assert(sizeof...(Args) == D, "data arguments must match dimensionality of Array");
     return data_at({static_cast<std::size_t>(std::forward<Args>(indices))...});
   }
-  /// const access data pointer at {index_1, ..., index_D}
   template<typename... Args>
   const Type_t* data_at(Args... indices) const
   {
     static_assert(sizeof...(Args) == D, "data arguments must match dimensionality of Array");
     return data_at({static_cast<std::size_t>(std::forward<Args>(indices))...});
   }
-  /// access data pointer at {index_1, ..., index_D}
   template<typename... Args, typename Allocator = ALLOC, typename = qmcplusplus::IsDualSpace<Allocator>>
   Type_t* device_data_at(Args... indices)
   {
     static_assert(sizeof...(Args) == D, "device_data arguments must match dimensionality of Array");
     return device_data_at({static_cast<std::size_t>(std::forward<Args>(indices))...});
   }
-  /// const access data pointer at {index_1, ..., index_D}
   template<typename... Args, typename Allocator = ALLOC, typename = qmcplusplus::IsDualSpace<Allocator>>
   const Type_t* device_data_at(Args... indices) const
   {
     static_assert(sizeof...(Args) == D, "device_data arguments must match dimensionality of Array");
     return device_data_at({static_cast<std::size_t>(std::forward<Args>(indices))...});
   }
+  ///@}
 
   inline const Type_t* first_address() const { return X.data(); }
 
@@ -185,7 +185,8 @@ class Array
     return *this;
   }
 
-  // Get and Set Operations
+  ///@{
+  /// access the element at {index_1, ..., index_D}
   template<typename SIZET = size_t, typename = std::is_integral<SIZET>>
   Type_t& operator()(const std::array<SIZET, D>& indices)
   {
@@ -208,6 +209,7 @@ class Array
     static_assert(sizeof...(Args) == D, "operator() arguments must match dimensionality of Array");
     return operator()({static_cast<std::size_t>(std::forward<Args>(indices))...});
   }
+  ///@}
 
   inline Type_t sum() const
   {

From 908f40ed7ea893a4d431a42efcc3be58f9c01b6e Mon Sep 17 00:00:00 2001
From: Ye Luo <yeluo@anl.gov>
Date: Mon, 13 Jun 2022 20:31:42 -0500
Subject: [PATCH 14/36] Expand a bit unit test.

---
 src/Containers/OhmmsPETE/tests/test_Array.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/Containers/OhmmsPETE/tests/test_Array.cpp b/src/Containers/OhmmsPETE/tests/test_Array.cpp
index ca41aa5fcc..5903feeb53 100644
--- a/src/Containers/OhmmsPETE/tests/test_Array.cpp
+++ b/src/Containers/OhmmsPETE/tests/test_Array.cpp
@@ -75,8 +75,13 @@ TEST_CASE("array NestedContainers", "[OhmmsPETE]")
 
 TEST_CASE("Array::data", "[OhmmsPETE]")
 {
-  Array<double, 3> tensor(2, 4, 5);
+  Array<float, 3> tensor(2, 4, 5);
+  REQUIRE(tensor.size() == 40);
+
   CHECK(tensor.data() + 1 * 4 * 5 + 2 * 5 + 3 == tensor.data_at(1, 2, 3));
+
+  tensor(1, 2, 3) = 0.5f;
+  CHECK(*tensor.data_at(1, 2, 3) == 0.5f);
 }
 
 TEST_CASE("Array::dimension sizes constructor", "[OhmmsPETE]")

From d362348457f2d56dac7e61d4faced8b7d3a65e8c Mon Sep 17 00:00:00 2001
From: Ye Luo <yeluo@anl.gov>
Date: Sat, 19 Mar 2022 14:50:58 -0500
Subject: [PATCH 15/36] Adjust phi_vgl layout.

---
 .../BsplineFactory/SplineC2ROMPTarget.cpp     | 124 ++++++++-------
 .../Fermion/MatrixDelayedUpdateCUDA.h         |  54 +++----
 .../Fermion/MatrixUpdateOMPTarget.h           |  28 ++--
 src/QMCWaveFunctions/SPOSet.cpp               |  11 +-
 .../detail/CUDA/matrix_update_helper.cu       | 149 +++++++++---------
 .../detail/CUDA/matrix_update_helper.hpp      |  36 ++---
 6 files changed, 200 insertions(+), 202 deletions(-)

diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.cpp
index 60c0c7d5fe..683703bc08 100644
--- a/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.cpp
+++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.cpp
@@ -25,7 +25,6 @@ inline void assign_v(ST x,
                      ST y,
                      ST z,
                      TT* restrict results_scratch_ptr,
-                     size_t orb_size,
                      const ST* restrict offload_scratch_ptr,
                      const ST* restrict myKcart_ptr,
                      size_t myKcart_padded_size,
@@ -61,8 +60,8 @@ inline void assign_vgl(ST x,
                        ST y,
                        ST z,
                        TT* restrict results_scratch_ptr,
+                       size_t orb_padded_size,
                        const ST* mKK_ptr,
-                       size_t orb_size,
                        const ST* restrict offload_scratch_ptr,
                        size_t spline_padded_size,
                        const ST symGGt[6],
@@ -92,10 +91,6 @@ inline void assign_vgl(ST x,
   const ST* restrict h12 = offload_scratch_ptr + spline_padded_size * 8;
   const ST* restrict h22 = offload_scratch_ptr + spline_padded_size * 9;
 
-  TT* restrict psi   = results_scratch_ptr;
-  TT* restrict dpsi  = results_scratch_ptr + orb_size;
-  TT* restrict d2psi = results_scratch_ptr + orb_size * 4;
-
   const size_t jr = index << 1;
   const size_t ji = jr + 1;
 
@@ -131,20 +126,27 @@ inline void assign_vgl(ST x,
   const ST lap_r   = lcart_r + mKK_ptr[index] * val_r + two * (kX * dX_i + kY * dY_i + kZ * dZ_i);
   const ST lap_i   = lcart_i + mKK_ptr[index] * val_i - two * (kX * dX_r + kY * dY_r + kZ * dZ_r);
 
-  const size_t psiIndex  = first_spo + index + omptarget::min(index, nComplexBands);
-  psi[psiIndex]          = c * val_r - s * val_i;
-  d2psi[psiIndex]        = c * lap_r - s * lap_i;
-  dpsi[psiIndex * 3]     = c * gX_r - s * gX_i;
-  dpsi[psiIndex * 3 + 1] = c * gY_r - s * gY_i;
-  dpsi[psiIndex * 3 + 2] = c * gZ_r - s * gZ_i;
+  TT* restrict psi    = results_scratch_ptr;
+  TT* restrict dpsi_x = results_scratch_ptr + orb_padded_size;
+  TT* restrict dpsi_y = results_scratch_ptr + orb_padded_size * 2;
+  TT* restrict dpsi_z = results_scratch_ptr + orb_padded_size * 3;
+  TT* restrict d2psi  = results_scratch_ptr + orb_padded_size * 4;
+
+  const size_t psiIndex = first_spo + index + omptarget::min(index, nComplexBands);
+
+  psi[psiIndex]    = c * val_r - s * val_i;
+  d2psi[psiIndex]  = c * lap_r - s * lap_i;
+  dpsi_x[psiIndex] = c * gX_r - s * gX_i;
+  dpsi_y[psiIndex] = c * gY_r - s * gY_i;
+  dpsi_z[psiIndex] = c * gZ_r - s * gZ_i;
 
   if (index < nComplexBands)
   {
-    psi[psiIndex + 1]      = c * val_i + s * val_r;
-    d2psi[psiIndex + 1]    = c * lap_i + s * lap_r;
-    dpsi[psiIndex * 3 + 3] = c * gX_i + s * gX_r;
-    dpsi[psiIndex * 3 + 4] = c * gY_i + s * gY_r;
-    dpsi[psiIndex * 3 + 5] = c * gZ_i + s * gZ_r;
+    psi[psiIndex + 1]    = c * val_i + s * val_r;
+    d2psi[psiIndex + 1]  = c * lap_i + s * lap_r;
+    dpsi_x[psiIndex + 1] = c * gX_i + s * gX_r;
+    dpsi_y[psiIndex + 1] = c * gY_i + s * gY_r;
+    dpsi_z[psiIndex + 1] = c * gZ_i + s * gZ_r;
   }
 }
 } // namespace C2R
@@ -277,7 +279,7 @@ void SplineC2ROMPTarget<ST>::evaluateValue(const ParticleSet& P, const int iat,
         const size_t last_cplx  = omptarget::min(last / 2, orb_size);
         PRAGMA_OFFLOAD("omp parallel for")
         for (int index = first_cplx; index < last_cplx; index++)
-          C2R::assign_v(x, y, z, psi_ptr, orb_size, offload_scratch_ptr, myKcart_ptr, myKcart_padded_size,
+          C2R::assign_v(x, y, z, psi_ptr, offload_scratch_ptr, myKcart_ptr, myKcart_padded_size,
                         first_spo_local, nComplexBands_local, index);
       }
     }
@@ -316,7 +318,7 @@ void SplineC2ROMPTarget<ST>::evaluateDetRatios(const VirtualParticleSet& VP,
   const auto padded_size = myV.size();
   offload_scratch.resize(padded_size * nVP);
   const auto orb_size = psiinv.size();
-  results_scratch.resize(orb_size * nVP);
+  results_scratch.resize(padded_size * nVP);
 
   // Ye: need to extract sizes and pointers before entering target region
   const auto* spline_ptr           = SplineInst->getSplinePtr();
@@ -341,7 +343,7 @@ void SplineC2ROMPTarget<ST>::evaluateDetRatios(const VirtualParticleSet& VP,
         const size_t last  = omptarget::min(first + ChunkSizePerTeam, padded_size);
 
         auto* restrict offload_scratch_iat_ptr = offload_scratch_ptr + padded_size * iat;
-        auto* restrict psi_iat_ptr             = results_scratch_ptr + orb_size * iat;
+        auto* restrict psi_iat_ptr             = results_scratch_ptr + padded_size * iat;
         auto* restrict pos_scratch             = psiinv_ptr + orb_size;
 
         int ix, iy, iz;
@@ -358,7 +360,7 @@ void SplineC2ROMPTarget<ST>::evaluateDetRatios(const VirtualParticleSet& VP,
         PRAGMA_OFFLOAD("omp parallel for")
         for (int index = first_cplx; index < last_cplx; index++)
           C2R::assign_v(ST(pos_scratch[iat * 6]), ST(pos_scratch[iat * 6 + 1]), ST(pos_scratch[iat * 6 + 2]),
-                        psi_iat_ptr, orb_size, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size,
+                        psi_iat_ptr, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size,
                         first_spo_local, nComplexBands_local, index);
 
         const size_t first_real = first_cplx + omptarget::min(nComplexBands_local, first_cplx);
@@ -438,7 +440,7 @@ void SplineC2ROMPTarget<ST>::mw_evaluateDetRatios(const RefVectorWithLeader<SPOS
   mw_ratios_private.resize(mw_nVP, NumTeams);
   const auto padded_size = myV.size();
   mw_offload_scratch.resize(padded_size * mw_nVP);
-  mw_results_scratch.resize(orb_size * mw_nVP);
+  mw_results_scratch.resize(padded_size * mw_nVP);
 
   // Ye: need to extract sizes and pointers before entering target region
   const auto* spline_ptr           = SplineInst->getSplinePtr();
@@ -463,7 +465,7 @@ void SplineC2ROMPTarget<ST>::mw_evaluateDetRatios(const RefVectorWithLeader<SPOS
         const size_t last  = omptarget::min(first + ChunkSizePerTeam, padded_size);
 
         auto* restrict offload_scratch_iat_ptr = offload_scratch_ptr + padded_size * iat;
-        auto* restrict psi_iat_ptr             = results_scratch_ptr + orb_size * iat;
+        auto* restrict psi_iat_ptr             = results_scratch_ptr + padded_size * iat;
         auto* ref_id_ptr = reinterpret_cast<int*>(buffer_H2D_ptr + nw * sizeof(ValueType*) + mw_nVP * 6 * sizeof(TT));
         auto* restrict psiinv_ptr  = reinterpret_cast<const ValueType**>(buffer_H2D_ptr)[ref_id_ptr[iat]];
         auto* restrict pos_scratch = reinterpret_cast<TT*>(buffer_H2D_ptr + nw * sizeof(ValueType*));
@@ -482,7 +484,7 @@ void SplineC2ROMPTarget<ST>::mw_evaluateDetRatios(const RefVectorWithLeader<SPOS
         PRAGMA_OFFLOAD("omp parallel for")
         for (int index = first_cplx; index < last_cplx; index++)
           C2R::assign_v(ST(pos_scratch[iat * 6]), ST(pos_scratch[iat * 6 + 1]), ST(pos_scratch[iat * 6 + 2]),
-                        psi_iat_ptr, orb_size, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size,
+                        psi_iat_ptr, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size,
                         first_spo_local, nComplexBands_local, index);
 
         const size_t first_real = first_cplx + omptarget::min(nComplexBands_local, first_cplx);
@@ -647,7 +649,7 @@ void SplineC2ROMPTarget<ST>::evaluateVGL(const ParticleSet& P,
   offload_scratch.resize(padded_size * 10);
   const auto orb_size = psi.size();
   // for V(1)G(3)L(1) final result
-  results_scratch.resize(orb_size * 5);
+  results_scratch.resize(padded_size * 5);
 
   // Ye: need to extract sizes and pointers before entering target region
   const auto* spline_ptr    = SplineInst->getSplinePtr();
@@ -666,7 +668,7 @@ void SplineC2ROMPTarget<ST>::evaluateVGL(const ParticleSet& P,
   {
     ScopedTimer offload(offload_timer_);
     PRAGMA_OFFLOAD("omp target teams distribute num_teams(NumTeams) \
-                map(always, from: results_scratch_ptr[0:orb_size*5])")
+                map(always, from: results_scratch_ptr[0:padded_size*5])")
     for (int team_id = 0; team_id < NumTeams; team_id++)
     {
       const size_t first = ChunkSizePerTeam * team_id;
@@ -691,7 +693,7 @@ void SplineC2ROMPTarget<ST>::evaluateVGL(const ParticleSet& P,
       const size_t last_cplx  = omptarget::min(last / 2, orb_size);
       PRAGMA_OFFLOAD("omp parallel for")
       for (int index = first_cplx; index < last_cplx; index++)
-        C2R::assign_vgl(x, y, z, results_scratch_ptr, mKK_ptr, orb_size, offload_scratch_ptr, padded_size, symGGt, G,
+        C2R::assign_vgl(x, y, z, results_scratch_ptr, padded_size, mKK_ptr, offload_scratch_ptr, padded_size, symGGt, G,
                         myKcart_ptr, myKcart_padded_size, first_spo_local, nComplexBands_local, index);
     }
   }
@@ -699,10 +701,10 @@ void SplineC2ROMPTarget<ST>::evaluateVGL(const ParticleSet& P,
   for (size_t i = 0; i < orb_size; i++)
   {
     psi[i]     = results_scratch[i];
-    dpsi[i][0] = results_scratch[orb_size + i * 3];
-    dpsi[i][1] = results_scratch[orb_size + i * 3 + 1];
-    dpsi[i][2] = results_scratch[orb_size + i * 3 + 2];
-    d2psi[i]   = results_scratch[orb_size * 4 + i];
+    dpsi[i][0] = results_scratch[i + padded_size * 1];
+    dpsi[i][1] = results_scratch[i + padded_size * 2];
+    dpsi[i][2] = results_scratch[i + padded_size * 3];
+    d2psi[i]   = results_scratch[i + padded_size * 4];
   }
 }
 
@@ -722,7 +724,7 @@ void SplineC2ROMPTarget<ST>::evaluateVGLMultiPos(const Vector<ST, OffloadPinnedA
   offload_scratch.resize(padded_size * num_pos * 10);
   const auto orb_size = psi_v_list[0].get().size();
   // for V(1)G(3)L(1) final result
-  results_scratch.resize(orb_size * num_pos * 5);
+  results_scratch.resize(padded_size * num_pos * 5);
 
   // Ye: need to extract sizes and pointers before entering target region
   const auto* spline_ptr           = SplineInst->getSplinePtr();
@@ -741,7 +743,7 @@ void SplineC2ROMPTarget<ST>::evaluateVGLMultiPos(const Vector<ST, OffloadPinnedA
     ScopedTimer offload(offload_timer_);
     PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(NumTeams*num_pos) \
                     map(always, to: pos_copy_ptr[0:num_pos*6]) \
-                    map(always, from: results_scratch_ptr[0:orb_size*num_pos*5])")
+                    map(always, from: results_scratch_ptr[0:padded_size*num_pos*5])")
     for (int iw = 0; iw < num_pos; iw++)
       for (int team_id = 0; team_id < NumTeams; team_id++)
       {
@@ -749,7 +751,7 @@ void SplineC2ROMPTarget<ST>::evaluateVGLMultiPos(const Vector<ST, OffloadPinnedA
         const size_t last  = omptarget::min(first + ChunkSizePerTeam, padded_size);
 
         auto* restrict offload_scratch_iw_ptr = offload_scratch_ptr + padded_size * iw * 10;
-        auto* restrict psi_iw_ptr             = results_scratch_ptr + orb_size * iw * 5;
+        auto* restrict psi_iw_ptr             = results_scratch_ptr + padded_size * iw * 5;
 
         int ix, iy, iz;
         ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
@@ -773,25 +775,25 @@ void SplineC2ROMPTarget<ST>::evaluateVGLMultiPos(const Vector<ST, OffloadPinnedA
         const size_t last_cplx  = omptarget::min(last / 2, orb_size);
         PRAGMA_OFFLOAD("omp parallel for")
         for (int index = first_cplx; index < last_cplx; index++)
-          C2R::assign_vgl(pos_copy_ptr[iw * 6], pos_copy_ptr[iw * 6 + 1], pos_copy_ptr[iw * 6 + 2], psi_iw_ptr, mKK_ptr,
-                          orb_size, offload_scratch_iw_ptr, padded_size, symGGt, G, myKcart_ptr, myKcart_padded_size,
-                          first_spo_local, nComplexBands_local, index);
+          C2R::assign_vgl(pos_copy_ptr[iw * 6], pos_copy_ptr[iw * 6 + 1], pos_copy_ptr[iw * 6 + 2], psi_iw_ptr,
+                          padded_size, mKK_ptr, offload_scratch_iw_ptr, padded_size, symGGt, G, myKcart_ptr,
+                          myKcart_padded_size, first_spo_local, nComplexBands_local, index);
       }
   }
 
   for (int iw = 0; iw < num_pos; ++iw)
   {
-    auto* restrict results_iw_ptr = results_scratch_ptr + orb_size * iw * 5;
+    auto* restrict results_iw_ptr = results_scratch_ptr + padded_size * iw * 5;
     ValueVector& psi_v(psi_v_list[iw]);
     GradVector& dpsi_v(dpsi_v_list[iw]);
     ValueVector& d2psi_v(d2psi_v_list[iw]);
     for (size_t i = 0; i < orb_size; i++)
     {
       psi_v[i]     = results_iw_ptr[i];
-      dpsi_v[i][0] = results_iw_ptr[orb_size + i * 3];
-      dpsi_v[i][1] = results_iw_ptr[orb_size + i * 3 + 1];
-      dpsi_v[i][2] = results_iw_ptr[orb_size + i * 3 + 2];
-      d2psi_v[i]   = results_iw_ptr[orb_size * 4 + i];
+      dpsi_v[i][0] = results_iw_ptr[i + padded_size];
+      dpsi_v[i][1] = results_iw_ptr[i + padded_size * 2];
+      dpsi_v[i][2] = results_iw_ptr[i + padded_size * 3];
+      d2psi_v[i]   = results_iw_ptr[i + padded_size * 4];
     }
   }
 }
@@ -883,7 +885,7 @@ void SplineC2ROMPTarget<ST>::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL
   mw_offload_scratch.resize(padded_size * num_pos * 10);
   const auto orb_size = phi_vgl_v.size() / num_pos;
   // for V(1)G(3)L(1) final result
-  mw_results_scratch.resize(orb_size * num_pos * 5);
+  mw_results_scratch.resize(padded_size * num_pos * 5);
   // per team ratio and grads
   rg_private.resize(num_pos, NumTeams * 4);
 
@@ -916,7 +918,7 @@ void SplineC2ROMPTarget<ST>::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL
         const size_t last  = omptarget::min(first + ChunkSizePerTeam, padded_size);
 
         auto* restrict offload_scratch_iw_ptr = offload_scratch_ptr + padded_size * iw * 10;
-        auto* restrict psi_iw_ptr             = results_scratch_ptr + orb_size * iw * 5;
+        auto* restrict psi_iw_ptr             = results_scratch_ptr + padded_size * iw * 5;
         const auto* restrict pos_iw_ptr       = reinterpret_cast<ST*>(buffer_H2D_ptr + buffer_H2D_stride * iw);
         const auto* restrict invRow_iw_ptr =
             *reinterpret_cast<ValueType**>(buffer_H2D_ptr + buffer_H2D_stride * iw + sizeof(ST) * 6);
@@ -943,17 +945,21 @@ void SplineC2ROMPTarget<ST>::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL
         const size_t last_cplx  = omptarget::min(last / 2, orb_size);
         PRAGMA_OFFLOAD("omp parallel for")
         for (int index = first_cplx; index < last_cplx; index++)
-          C2R::assign_vgl(pos_iw_ptr[0], pos_iw_ptr[1], pos_iw_ptr[2], psi_iw_ptr, mKK_ptr, orb_size,
+          C2R::assign_vgl(pos_iw_ptr[0], pos_iw_ptr[1], pos_iw_ptr[2], psi_iw_ptr, padded_size, mKK_ptr,
                           offload_scratch_iw_ptr, padded_size, symGGt, G, myKcart_ptr, myKcart_padded_size,
                           first_spo_local, nComplexBands_local, index);
 
-        ValueType* restrict psi   = psi_iw_ptr;
-        ValueType* restrict dpsi  = psi_iw_ptr + orb_size;
-        ValueType* restrict d2psi = psi_iw_ptr + orb_size * 4;
+        ValueType* restrict psi    = psi_iw_ptr;
+        ValueType* restrict dpsi_x = psi_iw_ptr + padded_size;
+        ValueType* restrict dpsi_y = psi_iw_ptr + padded_size * 2;
+        ValueType* restrict dpsi_z = psi_iw_ptr + padded_size * 3;
+        ValueType* restrict d2psi  = psi_iw_ptr + padded_size * 4;
 
-        ValueType* restrict out_phi_v = phi_vgl_ptr + iw * orb_size;
-        ValueType* restrict out_phi_g = phi_vgl_ptr + phi_vgl_stride + iw * orb_size * 3;
-        ValueType* restrict out_phi_l = phi_vgl_ptr + phi_vgl_stride * 4 + iw * orb_size;
+        ValueType* restrict out_phi    = phi_vgl_ptr + iw * orb_size;
+        ValueType* restrict out_dphi_x = out_phi + phi_vgl_stride;
+        ValueType* restrict out_dphi_y = out_dphi_x + phi_vgl_stride;
+        ValueType* restrict out_dphi_z = out_dphi_y + phi_vgl_stride;
+        ValueType* restrict out_d2phi  = out_dphi_z + phi_vgl_stride;
 
         const size_t first_real = first_cplx + omptarget::min(nComplexBands_local, first_cplx);
         const size_t last_real  = last_cplx + omptarget::min(nComplexBands_local, last_cplx);
@@ -961,16 +967,16 @@ void SplineC2ROMPTarget<ST>::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL
         PRAGMA_OFFLOAD("omp parallel for reduction(+: ratio, grad_x, grad_y, grad_z)")
         for (size_t j = first_real; j < last_real; j++)
         {
-          out_phi_v[j]         = psi[j];
-          out_phi_l[j]         = d2psi[j];
-          out_phi_g[j * 3]     = dpsi[j * 3];
-          out_phi_g[j * 3 + 1] = dpsi[j * 3 + 1];
-          out_phi_g[j * 3 + 2] = dpsi[j * 3 + 2];
+          out_phi[j]    = psi[j];
+          out_dphi_x[j] = dpsi_x[j];
+          out_dphi_y[j] = dpsi_y[j];
+          out_dphi_z[j] = dpsi_z[j];
+          out_d2phi[j]  = d2psi[j];
 
           ratio += psi[j] * invRow_iw_ptr[j];
-          grad_x += dpsi[j * 3] * invRow_iw_ptr[j];
-          grad_y += dpsi[j * 3 + 1] * invRow_iw_ptr[j];
-          grad_z += dpsi[j * 3 + 2] * invRow_iw_ptr[j];
+          grad_x += dpsi_x[j] * invRow_iw_ptr[j];
+          grad_y += dpsi_y[j] * invRow_iw_ptr[j];
+          grad_z += dpsi_z[j] * invRow_iw_ptr[j];
         }
 
         rg_private_ptr[(iw * NumTeams + team_id) * 4]     = ratio;
diff --git a/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h b/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h
index 7775b36441..6d37f5fc54 100644
--- a/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h
+++ b/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h
@@ -303,11 +303,11 @@ class MatrixDelayedUpdateCUDA
     const int lda              = engine_leader.get_ref_psiMinv().cols();
     mw_temp.resize(norb * n_accepted);
     mw_rcopy.resize(norb * n_accepted);
-    updateRow_buffer_H2D.resize((sizeof(Value*) * 8 + sizeof(Value)) * n_accepted);
+    updateRow_buffer_H2D.resize((sizeof(Value*) * 6 + sizeof(Value)) * n_accepted);
 
     // to handle T** of Ainv, psi_v, temp, rcopy
-    Matrix<Value*> ptr_buffer(reinterpret_cast<Value**>(updateRow_buffer_H2D.data()), 8, n_accepted);
-    Value* c_ratio_inv = reinterpret_cast<Value*>(updateRow_buffer_H2D.data() + sizeof(Value*) * 8 * n_accepted);
+    Matrix<Value*> ptr_buffer(reinterpret_cast<Value**>(updateRow_buffer_H2D.data()), 6, n_accepted);
+    Value* c_ratio_inv = reinterpret_cast<Value*>(updateRow_buffer_H2D.data() + sizeof(Value*) * 6 * n_accepted);
     for (int iw = 0, count = 0; iw < isAccepted.size(); iw++)
       if (isAccepted[iw])
       {
@@ -317,8 +317,6 @@ class MatrixDelayedUpdateCUDA
         ptr_buffer[3][count] = mw_rcopy.device_data() + norb * count;
         ptr_buffer[4][count] = psiM_g_list[count];
         ptr_buffer[5][count] = psiM_l_list[count];
-        ptr_buffer[6][count] = const_cast<Value*>(phi_vgl_v_dev_ptr + phi_vgl_stride + norb * 3 * iw);
-        ptr_buffer[7][count] = const_cast<Value*>(phi_vgl_v_dev_ptr + phi_vgl_stride * 4 + norb * iw);
 
         c_ratio_inv[count] = Value(-1) / ratios[iw];
         count++;
@@ -333,7 +331,8 @@ class MatrixDelayedUpdateCUDA
 
     {
       Value** Ainv_mw_ptr = reinterpret_cast<Value**>(updateRow_buffer_H2D.device_data());
-      Value** phiV_mw_ptr = reinterpret_cast<Value**>(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted);
+      Value** phiVGL_mw_ptr =
+          reinterpret_cast<Value**>(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted);
       Value** temp_mw_ptr =
           reinterpret_cast<Value**>(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 2);
       Value** rcopy_mw_ptr =
@@ -342,21 +341,18 @@ class MatrixDelayedUpdateCUDA
           reinterpret_cast<Value**>(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 4);
       Value** d2psiM_mw_out =
           reinterpret_cast<Value**>(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 5);
-      Value** dpsiM_mw_in =
-          reinterpret_cast<Value**>(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 6);
-      Value** d2psiM_mw_in =
-          reinterpret_cast<Value**>(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 7);
       Value* ratio_inv_mw =
-          reinterpret_cast<Value*>(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 8);
+          reinterpret_cast<Value*>(updateRow_buffer_H2D.device_data() + sizeof(Value*) * n_accepted * 6);
+
 
       // invoke the Fahy's variant of Sherman-Morrison update.
       cudaErrorCheck(cuBLAS_MFs::gemv_batched(hstream, 'T', norb, norb, cone_vec.device_data(), Ainv_mw_ptr, lda,
-                                              phiV_mw_ptr, 1, czero_vec.device_data(), temp_mw_ptr, 1, n_accepted),
+                                              phiVGL_mw_ptr, 1, czero_vec.device_data(), temp_mw_ptr, 1, n_accepted),
                      "cuBLAS_MFs::gemv_batched failed!");
 
       cudaErrorCheck(CUDA::copyAinvRow_saveGL_cuda(hstream, rowchanged, norb, Ainv_mw_ptr, lda, temp_mw_ptr,
-                                                   rcopy_mw_ptr, dpsiM_mw_in, d2psiM_mw_in, dpsiM_mw_out, d2psiM_mw_out,
-                                                   n_accepted),
+                                                   rcopy_mw_ptr, phiVGL_mw_ptr, phi_vgl_stride, dpsiM_mw_out,
+                                                   d2psiM_mw_out, n_accepted),
                      "CUDA::copyAinvRow_saveGL_cuda failed!");
 
 
@@ -579,11 +575,11 @@ class MatrixDelayedUpdateCUDA
     const int lda                     = engine_leader.get_psiMinv().cols();
     const int nw                      = engines.size();
     const int n_accepted              = psiM_g_list.size();
-    accept_rejectRow_buffer_H2D.resize((sizeof(Value*) * 14 + sizeof(Value)) * nw);
+    accept_rejectRow_buffer_H2D.resize((sizeof(Value*) * 12 + sizeof(Value)) * nw);
     engine_leader.resize_fill_constant_arrays(nw);
 
-    Matrix<Value*> ptr_buffer(reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.data()), 14, nw);
-    Value* c_ratio_inv = reinterpret_cast<Value*>(accept_rejectRow_buffer_H2D.data() + sizeof(Value*) * 14 * nw);
+    Matrix<Value*> ptr_buffer(reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.data()), 12, nw);
+    Value* c_ratio_inv = reinterpret_cast<Value*>(accept_rejectRow_buffer_H2D.data() + sizeof(Value*) * 12 * nw);
     for (int iw = 0, count_accepted = 0, count_rejected = 0; iw < nw; iw++)
     {
       This_t& engine = engines[iw];
@@ -599,10 +595,8 @@ class MatrixDelayedUpdateCUDA
         ptr_buffer[7][count_accepted]  = reinterpret_cast<Value*>(engine.delay_list_gpu.data());
         ptr_buffer[8][count_accepted]  = engine.V_gpu.data() + norb * delay_count;
         ptr_buffer[9][count_accepted]  = const_cast<Value*>(phi_vgl_v_dev_ptr + norb * iw);
-        ptr_buffer[10][count_accepted] = const_cast<Value*>(phi_vgl_v_dev_ptr + phi_vgl_stride + norb * 3 * iw);
-        ptr_buffer[11][count_accepted] = const_cast<Value*>(phi_vgl_v_dev_ptr + phi_vgl_stride * 4 + norb * iw);
-        ptr_buffer[12][count_accepted] = psiM_g_list[count_accepted];
-        ptr_buffer[13][count_accepted] = psiM_l_list[count_accepted];
+        ptr_buffer[10][count_accepted] = psiM_g_list[count_accepted];
+        ptr_buffer[11][count_accepted] = psiM_l_list[count_accepted];
         c_ratio_inv[count_accepted]    = Value(1) / ratios[iw];
         count_accepted++;
       }
@@ -640,18 +634,14 @@ class MatrixDelayedUpdateCUDA
         reinterpret_cast<int**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 7);
     Value** V_row_mw_ptr =
         reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 8);
-    Value** phiV_mw_ptr =
+    Value** phiVGL_mw_ptr =
         reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 9);
-    Value** dpsiM_mw_in =
-        reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 10);
-    Value** d2psiM_mw_in =
-        reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 11);
     Value** dpsiM_mw_out =
-        reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 12);
+        reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 10);
     Value** d2psiM_mw_out =
-        reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 13);
+        reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 11);
     Value* ratio_inv_mw_ptr =
-        reinterpret_cast<Value*>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 14);
+        reinterpret_cast<Value*>(accept_rejectRow_buffer_H2D.device_data() + sizeof(Value*) * nw * 12);
 
     //std::copy_n(Ainv[rowchanged], norb, V[delay_count]);
     cudaErrorCheck(cuBLAS_MFs::copy_batched(hstream, norb, invRow_mw_ptr, 1, V_row_mw_ptr, 1, nw),
@@ -660,7 +650,7 @@ class MatrixDelayedUpdateCUDA
     // the new Binv is [[X Y] [Z sigma]]
     //BLAS::gemv('T', norb, delay_count + 1, cminusone, V.data(), norb, psiV.data(), 1, czero, p.data(), 1);
     cudaErrorCheck(cuBLAS_MFs::gemv_batched(hstream, 'T', norb, delay_count, cminusone_vec.device_data(), V_mw_ptr,
-                                            norb, phiV_mw_ptr, 1, czero_vec.device_data(), p_mw_ptr, 1, n_accepted),
+                                            norb, phiVGL_mw_ptr, 1, czero_vec.device_data(), p_mw_ptr, 1, n_accepted),
                    "cuBLAS_MFs::gemv_batched failed!");
     // Y
     //BLAS::gemv('T', delay_count, delay_count, sigma, Binv.data(), lda_Binv, p.data(), 1, czero, Binv.data() + delay_count,
@@ -677,8 +667,8 @@ class MatrixDelayedUpdateCUDA
                    "cuBLAS_MFs::ger_batched failed!");
     // sigma and Z
     cudaErrorCheck(CUDA::add_delay_list_save_sigma_VGL_batched(hstream, delay_list_mw_ptr, rowchanged, delay_count,
-                                                               Binv_mw_ptr, lda_Binv, ratio_inv_mw_ptr, phiV_mw_ptr,
-                                                               dpsiM_mw_in, d2psiM_mw_in, U_row_mw_ptr, dpsiM_mw_out,
+                                                               Binv_mw_ptr, lda_Binv, ratio_inv_mw_ptr, phiVGL_mw_ptr,
+                                                               phi_vgl_stride, U_row_mw_ptr, dpsiM_mw_out,
                                                                d2psiM_mw_out, norb, n_accepted, nw),
                    "CUDA::add_delay_list_save_y_VGL_batched failed!");
     delay_count++;
diff --git a/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h b/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h
index 185bd0a530..8a5bea5f3f 100644
--- a/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h
+++ b/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h
@@ -261,9 +261,9 @@ class MatrixUpdateOMPTarget
     engine_leader.resize_scratch_arrays(norb, n_accepted);
 
     // to handle Value** of Ainv, psi_v, temp, rcopy
-    buffer_H2D.resize((sizeof(Value*) * 8 + sizeof(Value)) * n_accepted);
-    Matrix<Value*> ptr_buffer(reinterpret_cast<Value**>(buffer_H2D.data()), 8, n_accepted);
-    Value* c_ratio_inv = reinterpret_cast<Value*>(buffer_H2D.data() + sizeof(Value*) * 8 * n_accepted);
+    buffer_H2D.resize((sizeof(Value*) * 6 + sizeof(Value)) * n_accepted);
+    Matrix<Value*> ptr_buffer(reinterpret_cast<Value**>(buffer_H2D.data()), 6, n_accepted);
+    Value* c_ratio_inv = reinterpret_cast<Value*>(buffer_H2D.data() + sizeof(Value*) * 6 * n_accepted);
     for (int iw = 0, count = 0; iw < isAccepted.size(); iw++)
       if (isAccepted[iw])
       {
@@ -273,8 +273,6 @@ class MatrixUpdateOMPTarget
         ptr_buffer[3][count] = mw_rcopy.device_data() + norb * count;
         ptr_buffer[4][count] = psiM_g_list[count];
         ptr_buffer[5][count] = psiM_l_list[count];
-        ptr_buffer[6][count] = const_cast<Value*>(phi_vgl_v_dev_ptr + phi_vgl_stride + norb * 3 * iw);
-        ptr_buffer[7][count] = const_cast<Value*>(phi_vgl_v_dev_ptr + phi_vgl_stride * 4 + norb * iw);
 
         c_ratio_inv[count] = Value(-1) / ratios[iw];
         count++;
@@ -294,23 +292,21 @@ class MatrixUpdateOMPTarget
                     use_device_ptr(buffer_H2D_ptr, cone_ptr, czero_ptr)")
     {
       Value** Ainv_mw_ptr   = reinterpret_cast<Value**>(buffer_H2D_ptr);
-      Value** phiV_mw_ptr   = reinterpret_cast<Value**>(buffer_H2D_ptr + sizeof(Value*) * n_accepted);
+      Value** phiVGL_mw_ptr = reinterpret_cast<Value**>(buffer_H2D_ptr + sizeof(Value*) * n_accepted);
       Value** temp_mw_ptr   = reinterpret_cast<Value**>(buffer_H2D_ptr + sizeof(Value*) * n_accepted * 2);
       Value** rcopy_mw_ptr  = reinterpret_cast<Value**>(buffer_H2D_ptr + sizeof(Value*) * n_accepted * 3);
       Value** dpsiM_mw_out  = reinterpret_cast<Value**>(buffer_H2D_ptr + sizeof(Value*) * n_accepted * 4);
       Value** d2psiM_mw_out = reinterpret_cast<Value**>(buffer_H2D_ptr + sizeof(Value*) * n_accepted * 5);
-      Value** dpsiM_mw_in   = reinterpret_cast<Value**>(buffer_H2D_ptr + sizeof(Value*) * n_accepted * 6);
-      Value** d2psiM_mw_in  = reinterpret_cast<Value**>(buffer_H2D_ptr + sizeof(Value*) * n_accepted * 7);
-      Value* ratio_inv_mw   = reinterpret_cast<Value*>(buffer_H2D_ptr + sizeof(Value*) * n_accepted * 8);
+      Value* ratio_inv_mw   = reinterpret_cast<Value*>(buffer_H2D_ptr + sizeof(Value*) * n_accepted * 6);
 
       // invoke the Fahy's variant of Sherman-Morrison update.
-      success = ompBLAS::gemv_batched(dummy_handle, 'T', norb, norb, cone_ptr, Ainv_mw_ptr, lda, phiV_mw_ptr, 1,
+      success = ompBLAS::gemv_batched(dummy_handle, 'T', norb, norb, cone_ptr, Ainv_mw_ptr, lda, phiVGL_mw_ptr, 1,
                                       czero_ptr, temp_mw_ptr, 1, n_accepted);
       if (success != 0)
         throw std::runtime_error("ompBLAS::gemv_batched failed.");
 
       PRAGMA_OFFLOAD("omp target teams distribute num_teams(n_accepted) is_device_ptr(Ainv_mw_ptr, temp_mw_ptr, \
-                     rcopy_mw_ptr, dpsiM_mw_out, d2psiM_mw_out, dpsiM_mw_in, d2psiM_mw_in)")
+                     rcopy_mw_ptr, dpsiM_mw_out, d2psiM_mw_out, phiVGL_mw_ptr)")
       for (int iw = 0; iw < n_accepted; iw++)
       {
         Value* __restrict__ Ainv_ptr   = Ainv_mw_ptr[iw];
@@ -318,8 +314,8 @@ class MatrixUpdateOMPTarget
         Value* __restrict__ rcopy_ptr  = rcopy_mw_ptr[iw];
         Value* __restrict__ dpsiM_out  = dpsiM_mw_out[iw];
         Value* __restrict__ d2psiM_out = d2psiM_mw_out[iw];
-        Value* __restrict__ dpsiM_in   = dpsiM_mw_in[iw];
-        Value* __restrict__ d2psiM_in  = d2psiM_mw_in[iw];
+        Value* __restrict__ dpsiM_in   = phiVGL_mw_ptr[iw] + phi_vgl_stride;
+        Value* __restrict__ d2psiM_in  = phiVGL_mw_ptr[iw] + phi_vgl_stride * 4;
 
         temp_ptr[rowchanged] -= cone;
         PRAGMA_OFFLOAD("omp parallel for simd")
@@ -328,9 +324,9 @@ class MatrixUpdateOMPTarget
           rcopy_ptr[i] = Ainv_ptr[rowchanged * lda + i];
           // the following copying data on the device is not part of SM-1
           // it is intended to copy dpsiM and d2psiM from temporary to final without a separate kernel.
-          dpsiM_out[i * 3]     = dpsiM_in[i * 3];
-          dpsiM_out[i * 3 + 1] = dpsiM_in[i * 3 + 1];
-          dpsiM_out[i * 3 + 2] = dpsiM_in[i * 3 + 2];
+          dpsiM_out[i * 3]     = dpsiM_in[i];
+          dpsiM_out[i * 3 + 1] = dpsiM_in[i + phi_vgl_stride];
+          dpsiM_out[i * 3 + 2] = dpsiM_in[i + phi_vgl_stride * 2];
           d2psiM_out[i]        = d2psiM_in[i];
         }
       }
diff --git a/src/QMCWaveFunctions/SPOSet.cpp b/src/QMCWaveFunctions/SPOSet.cpp
index 88292d3a97..d2919ade80 100644
--- a/src/QMCWaveFunctions/SPOSet.cpp
+++ b/src/QMCWaveFunctions/SPOSet.cpp
@@ -117,18 +117,27 @@ void SPOSet::mw_evaluateVGLandDetRatioGrads(const RefVectorWithLeader<SPOSet>& s
                                             std::vector<GradType>& grads) const
 {
   assert(this == &spo_list.getLeader());
+  assert(OrbitalSetSize * nw == phi_vgl_v.size());
   const size_t nw             = spo_list.size();
   const size_t norb_requested = phi_vgl_v.size() / nw;
+  GradVector dphi_v(norb_requested);
 #pragma omp parallel for
   for (int iw = 0; iw < nw; iw++)
   {
     ValueVector phi_v(phi_vgl_v.data() + norb_requested * iw, norb_requested);
-    GradVector dphi_v(reinterpret_cast<GradType*>(phi_vgl_v.data(1)) + norb_requested * iw, norb_requested);
     ValueVector d2phi_v(phi_vgl_v.data(4) + norb_requested * iw, norb_requested);
     spo_list[iw].evaluateVGL(P_list[iw], iat, phi_v, dphi_v, d2phi_v);
 
     ratios[iw] = simd::dot(invRow_ptr_list[iw], phi_v.data(), norb_requested);
     grads[iw]  = simd::dot(invRow_ptr_list[iw], dphi_v.data(), norb_requested) / ratios[iw];
+
+    // transpose the array of gradients to SoA in phi_vgl_v
+    for (size_t idim = 0; idim < DIM; idim++)
+    {
+      ValueType* phi_g = phi_vgl_v.data(idim + 1) + norb_requested * iw;
+      for (size_t iorb = 0; iorb < OrbitalSetSize; iorb++)
+        phi_g[iorb] = dphi_v[iorb][idim];
+    }
   }
 }
 
diff --git a/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.cu b/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.cu
index 238e03e3d4..e152d15439 100644
--- a/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.cu
+++ b/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.cu
@@ -21,7 +21,7 @@ namespace CUDA
 {
 using namespace thrust::cuda_cub::core;
 }
-}
+} // namespace qmcplusplus
 #else
 #include <hip/hip_complex.h>
 #include "ROCm/cuda2hip.h"
@@ -44,19 +44,18 @@ __global__ void copyAinvRow_saveGL_kernel(const int rowchanged,
                                           const int lda,
                                           T* const temp[],
                                           T* const rcopy[],
-                                          const T* const dphi_in[],
-                                          const T* const d2phi_in[],
+                                          const T* const phi_vgl_in[],
+                                          const size_t phi_vgl_stride,
                                           T* const dphi_out[],
                                           T* const d2phi_out[])
 {
-  const int iw                      = blockIdx.x;
-  const T* __restrict__ Ainv_iw     = Ainv[iw];
-  T* __restrict__ temp_iw           = temp[iw];
-  T* __restrict__ rcopy_iw          = rcopy[iw];
-  const T* __restrict__ dphi_in_iw  = dphi_in[iw];
-  const T* __restrict__ d2phi_in_iw = d2phi_in[iw];
-  T* __restrict__ dphi_out_iw       = dphi_out[iw];
-  T* __restrict__ d2phi_out_iw      = d2phi_out[iw];
+  const int iw                    = blockIdx.x;
+  const T* __restrict__ Ainv_iw   = Ainv[iw];
+  T* __restrict__ temp_iw         = temp[iw];
+  T* __restrict__ rcopy_iw        = rcopy[iw];
+  const T* __restrict__ phi_in_iw = phi_vgl_in[iw];
+  T* __restrict__ dphi_out_iw     = dphi_out[iw];
+  T* __restrict__ d2phi_out_iw    = d2phi_out[iw];
 
   const int tid = threadIdx.x;
   if (tid == 0)
@@ -72,10 +71,10 @@ __global__ void copyAinvRow_saveGL_kernel(const int rowchanged,
 
       // the following copying data on the device is not part of SM-1
       // it is intended to copy dphiV and d2phiV from temporary to final without a separate kernel.
-      dphi_out_iw[col_id * 3]     = dphi_in_iw[col_id * 3];
-      dphi_out_iw[col_id * 3 + 1] = dphi_in_iw[col_id * 3 + 1];
-      dphi_out_iw[col_id * 3 + 2] = dphi_in_iw[col_id * 3 + 2];
-      d2phi_out_iw[col_id]        = d2phi_in_iw[col_id];
+      dphi_out_iw[col_id * 3]     = phi_in_iw[col_id + phi_vgl_stride];
+      dphi_out_iw[col_id * 3 + 1] = phi_in_iw[col_id + phi_vgl_stride * 2];
+      dphi_out_iw[col_id * 3 + 2] = phi_in_iw[col_id + phi_vgl_stride * 3];
+      d2phi_out_iw[col_id]        = phi_in_iw[col_id + phi_vgl_stride * 4];
     }
   }
 }
@@ -87,8 +86,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream,
                                     const int lda,
                                     float* const temp[],
                                     float* const rcopy[],
-                                    const float* const dphi_in[],
-                                    const float* const d2phi_in[],
+                                    const float* const phi_vgl_in[],
+                                    const size_t phi_vgl_stride,
                                     float* const dphi_out[],
                                     float* const d2phi_out[],
                                     const int batch_count)
@@ -100,7 +99,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream,
   dim3 dimBlock(COLBS);
   dim3 dimGrid(batch_count);
   copyAinvRow_saveGL_kernel<float, COLBS><<<dimGrid, dimBlock, 0, hstream>>>(rowchanged, n, Ainv, lda, temp, rcopy,
-                                                                             dphi_in, d2phi_in, dphi_out, d2phi_out);
+                                                                             phi_vgl_in, phi_vgl_stride, dphi_out,
+                                                                             d2phi_out);
   return cudaPeekAtLastError();
 }
 
@@ -111,8 +111,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream,
                                     const int lda,
                                     double* const temp[],
                                     double* const rcopy[],
-                                    const double* const dphi_in[],
-                                    const double* const d2phi_in[],
+                                    const double* const phi_vgl_in[],
+                                    const size_t phi_vgl_stride,
                                     double* const dphi_out[],
                                     double* const d2phi_out[],
                                     const int batch_count)
@@ -124,7 +124,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream,
   dim3 dimBlock(COLBS);
   dim3 dimGrid(batch_count);
   copyAinvRow_saveGL_kernel<double, COLBS><<<dimGrid, dimBlock, 0, hstream>>>(rowchanged, n, Ainv, lda, temp, rcopy,
-                                                                              dphi_in, d2phi_in, dphi_out, d2phi_out);
+                                                                              phi_vgl_in, phi_vgl_stride, dphi_out,
+                                                                              d2phi_out);
   return cudaPeekAtLastError();
 }
 
@@ -135,8 +136,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream,
                                     const int lda,
                                     std::complex<float>* const temp[],
                                     std::complex<float>* const rcopy[],
-                                    const std::complex<float>* const dphi_in[],
-                                    const std::complex<float>* const d2phi_in[],
+                                    const std::complex<float>* const phi_vgl_in[],
+                                    const size_t phi_vgl_stride,
                                     std::complex<float>* const dphi_out[],
                                     std::complex<float>* const d2phi_out[],
                                     const int batch_count)
@@ -147,8 +148,10 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream,
   const int COLBS = 64;
   dim3 dimBlock(COLBS);
   dim3 dimGrid(batch_count);
-  copyAinvRow_saveGL_kernel<cuComplex, COLBS><<<dimGrid, dimBlock, 0, hstream>>>(rowchanged, n, (const cuComplex**)Ainv, lda, (cuComplex**)temp, (cuComplex**)rcopy,
-                                                                             (const cuComplex**)dphi_in, (const cuComplex**)d2phi_in, (cuComplex**)dphi_out, (cuComplex**)d2phi_out);
+  copyAinvRow_saveGL_kernel<cuComplex, COLBS>
+      <<<dimGrid, dimBlock, 0, hstream>>>(rowchanged, n, (const cuComplex**)Ainv, lda, (cuComplex**)temp,
+                                          (cuComplex**)rcopy, (const cuComplex**)phi_vgl_in, phi_vgl_stride,
+                                          (cuComplex**)dphi_out, (cuComplex**)d2phi_out);
   return cudaPeekAtLastError();
 }
 
@@ -159,8 +162,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream,
                                     const int lda,
                                     std::complex<double>* const temp[],
                                     std::complex<double>* const rcopy[],
-                                    const std::complex<double>* const dphi_in[],
-                                    const std::complex<double>* const d2phi_in[],
+                                    const std::complex<double>* const phi_vgl_in[],
+                                    const size_t phi_vgl_stride,
                                     std::complex<double>* const dphi_out[],
                                     std::complex<double>* const d2phi_out[],
                                     const int batch_count)
@@ -171,8 +174,10 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream,
   const int COLBS = 64;
   dim3 dimBlock(COLBS);
   dim3 dimGrid(batch_count);
-  copyAinvRow_saveGL_kernel<cuDoubleComplex, COLBS><<<dimGrid, dimBlock, 0, hstream>>>(rowchanged, n, (const cuDoubleComplex**)Ainv, lda, (cuDoubleComplex**)temp, (cuDoubleComplex**)rcopy,
-                                                                              (const cuDoubleComplex**)dphi_in, (const cuDoubleComplex**)d2phi_in, (cuDoubleComplex**)dphi_out, (cuDoubleComplex**)d2phi_out);
+  copyAinvRow_saveGL_kernel<cuDoubleComplex, COLBS>
+      <<<dimGrid, dimBlock, 0, hstream>>>(rowchanged, n, (const cuDoubleComplex**)Ainv, lda, (cuDoubleComplex**)temp,
+                                          (cuDoubleComplex**)rcopy, (const cuDoubleComplex**)phi_vgl_in, phi_vgl_stride,
+                                          (cuDoubleComplex**)dphi_out, (cuDoubleComplex**)d2phi_out);
   return cudaPeekAtLastError();
 }
 
@@ -261,7 +266,9 @@ cudaError_t calcGradients_cuda(cudaStream_t& hstream,
   const int COLBS = 64;
   dim3 dimBlock(COLBS);
   dim3 dimGrid(batch_count);
-  calcGradients_kernel<thrust::complex<float>, COLBS><<<dimGrid, dimBlock, 0, hstream>>>(n, (const thrust::complex<float>**)Ainvrow, (const thrust::complex<float>**)dpsiMrow, (thrust::complex<float>*)grads_now);
+  calcGradients_kernel<thrust::complex<float>, COLBS>
+      <<<dimGrid, dimBlock, 0, hstream>>>(n, (const thrust::complex<float>**)Ainvrow,
+                                          (const thrust::complex<float>**)dpsiMrow, (thrust::complex<float>*)grads_now);
   return cudaPeekAtLastError();
 }
 
@@ -278,7 +285,10 @@ cudaError_t calcGradients_cuda(cudaStream_t& hstream,
   const int COLBS = 64;
   dim3 dimBlock(COLBS);
   dim3 dimGrid(batch_count);
-  calcGradients_kernel<thrust::complex<double>, COLBS><<<dimGrid, dimBlock, 0, hstream>>>(n, (const thrust::complex<double>**)Ainvrow, (const thrust::complex<double>**)dpsiMrow, (thrust::complex<double>*)grads_now);
+  calcGradients_kernel<thrust::complex<double>, COLBS>
+      <<<dimGrid, dimBlock, 0, hstream>>>(n, (const thrust::complex<double>**)Ainvrow,
+                                          (const thrust::complex<double>**)dpsiMrow,
+                                          (thrust::complex<double>*)grads_now);
   return cudaPeekAtLastError();
 }
 
@@ -289,9 +299,8 @@ __global__ void add_delay_list_save_sigma_VGL_kernel(int* const delay_list[],
                                                      T* const binv[],
                                                      const int binv_lda,
                                                      const T* const ratio_inv,
-                                                     const T* const phi_in[],
-                                                     const T* const dphi_in[],
-                                                     const T* const d2phi_in[],
+                                                     const T* const phi_vgl_in[],
+                                                     const size_t phi_vgl_stride,
                                                      T* const phi_out[],
                                                      T* const dphi_out[],
                                                      T* const d2phi_out[],
@@ -304,14 +313,12 @@ __global__ void add_delay_list_save_sigma_VGL_kernel(int* const delay_list[],
   if (iw < n_accepted)
   {
     // real accept, settle y and Z
-    int* __restrict__ delay_list_iw   = delay_list[iw];
-    T* __restrict__ binvrow_iw        = binv[iw] + delay_count * binv_lda;
-    const T* __restrict__ phi_in_iw   = phi_in[iw];
-    const T* __restrict__ dphi_in_iw  = dphi_in[iw];
-    const T* __restrict__ d2phi_in_iw = d2phi_in[iw];
-    T* __restrict__ phi_out_iw        = phi_out[iw];
-    T* __restrict__ dphi_out_iw       = dphi_out[iw];
-    T* __restrict__ d2phi_out_iw      = d2phi_out[iw];
+    int* __restrict__ delay_list_iw = delay_list[iw];
+    T* __restrict__ binvrow_iw      = binv[iw] + delay_count * binv_lda;
+    const T* __restrict__ phi_in_iw = phi_vgl_in[iw];
+    T* __restrict__ phi_out_iw      = phi_out[iw];
+    T* __restrict__ dphi_out_iw     = dphi_out[iw];
+    T* __restrict__ d2phi_out_iw    = d2phi_out[iw];
 
     if (tid == 0)
     {
@@ -335,10 +342,10 @@ __global__ void add_delay_list_save_sigma_VGL_kernel(int* const delay_list[],
       {
         // copy phiV, dphiV and d2phiV from temporary to final without a separate kernel.
         phi_out_iw[col_id]          = phi_in_iw[col_id];
-        dphi_out_iw[col_id * 3]     = dphi_in_iw[col_id * 3];
-        dphi_out_iw[col_id * 3 + 1] = dphi_in_iw[col_id * 3 + 1];
-        dphi_out_iw[col_id * 3 + 2] = dphi_in_iw[col_id * 3 + 2];
-        d2phi_out_iw[col_id]        = d2phi_in_iw[col_id];
+        dphi_out_iw[col_id * 3]     = phi_in_iw[col_id + phi_vgl_stride];
+        dphi_out_iw[col_id * 3 + 1] = phi_in_iw[col_id + phi_vgl_stride * 2];
+        dphi_out_iw[col_id * 3 + 2] = phi_in_iw[col_id + phi_vgl_stride * 3];
+        d2phi_out_iw[col_id]        = phi_in_iw[col_id + phi_vgl_stride * 4];
       }
     }
   }
@@ -379,9 +386,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream,
                                                   float* const binv[],
                                                   const int binv_lda,
                                                   const float* const ratio_inv,
-                                                  const float* const phi_in[],
-                                                  const float* const dphi_in[],
-                                                  const float* const d2phi_in[],
+                                                  const float* const phi_vgl_in[],
+                                                  const size_t phi_vgl_stride,
                                                   float* const phi_out[],
                                                   float* const dphi_out[],
                                                   float* const d2phi_out[],
@@ -396,8 +402,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream,
   dim3 dimBlock(COLBS);
   dim3 dimGrid(batch_count);
   add_delay_list_save_sigma_VGL_kernel<float, COLBS>
-      <<<dimGrid, dimBlock, 0, hstream>>>(delay_list, rowchanged, delay_count, binv, binv_lda, ratio_inv, phi_in,
-                                          dphi_in, d2phi_in, phi_out, dphi_out, d2phi_out, norb, n_accepted);
+      <<<dimGrid, dimBlock, 0, hstream>>>(delay_list, rowchanged, delay_count, binv, binv_lda, ratio_inv, phi_vgl_in,
+                                          phi_vgl_stride, phi_out, dphi_out, d2phi_out, norb, n_accepted);
   return cudaPeekAtLastError();
 }
 
@@ -408,9 +414,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream,
                                                   double* const binv[],
                                                   const int binv_lda,
                                                   const double* const ratio_inv,
-                                                  const double* const phi_in[],
-                                                  const double* const dphi_in[],
-                                                  const double* const d2phi_in[],
+                                                  const double* const phi_vgl_in[],
+                                                  const size_t phi_vgl_stride,
                                                   double* const phi_out[],
                                                   double* const dphi_out[],
                                                   double* const d2phi_out[],
@@ -425,8 +430,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream,
   dim3 dimBlock(COLBS);
   dim3 dimGrid(batch_count);
   add_delay_list_save_sigma_VGL_kernel<double, COLBS>
-      <<<dimGrid, dimBlock, 0, hstream>>>(delay_list, rowchanged, delay_count, binv, binv_lda, ratio_inv, phi_in,
-                                          dphi_in, d2phi_in, phi_out, dphi_out, d2phi_out, norb, n_accepted);
+      <<<dimGrid, dimBlock, 0, hstream>>>(delay_list, rowchanged, delay_count, binv, binv_lda, ratio_inv, phi_vgl_in,
+                                          phi_vgl_stride, phi_out, dphi_out, d2phi_out, norb, n_accepted);
   return cudaPeekAtLastError();
 }
 
@@ -437,9 +442,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream,
                                                   std::complex<float>* const binv[],
                                                   const int binv_lda,
                                                   const std::complex<float>* const ratio_inv,
-                                                  const std::complex<float>* const phi_in[],
-                                                  const std::complex<float>* const dphi_in[],
-                                                  const std::complex<float>* const d2phi_in[],
+                                                  const std::complex<float>* const phi_vgl_in[],
+                                                  const size_t phi_vgl_stride,
                                                   std::complex<float>* const phi_out[],
                                                   std::complex<float>* const dphi_out[],
                                                   std::complex<float>* const d2phi_out[],
@@ -456,11 +460,9 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream,
   add_delay_list_save_sigma_VGL_kernel<thrust::complex<float>, COLBS>
       <<<dimGrid, dimBlock, 0, hstream>>>(delay_list, rowchanged, delay_count, (thrust::complex<float>**)binv, binv_lda,
                                           (const thrust::complex<float>*)ratio_inv,
-                                          (const thrust::complex<float>**)phi_in,
-                                          (const thrust::complex<float>**)dphi_in,
-                                          (const thrust::complex<float>**)d2phi_in, (thrust::complex<float>**)phi_out,
-                                          (thrust::complex<float>**)dphi_out, (thrust::complex<float>**)d2phi_out, norb,
-                                          n_accepted);
+                                          (const thrust::complex<float>**)phi_vgl_in, phi_vgl_stride,
+                                          (thrust::complex<float>**)phi_out, (thrust::complex<float>**)dphi_out,
+                                          (thrust::complex<float>**)d2phi_out, norb, n_accepted);
   return cudaPeekAtLastError();
 }
 
@@ -471,9 +473,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream,
                                                   std::complex<double>* const binv[],
                                                   const int binv_lda,
                                                   const std::complex<double>* const ratio_inv,
-                                                  const std::complex<double>* const phi_in[],
-                                                  const std::complex<double>* const dphi_in[],
-                                                  const std::complex<double>* const d2phi_in[],
+                                                  const std::complex<double>* const phi_vgl_in[],
+                                                  const size_t phi_vgl_stride,
                                                   std::complex<double>* const phi_out[],
                                                   std::complex<double>* const dphi_out[],
                                                   std::complex<double>* const d2phi_out[],
@@ -490,11 +491,9 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream,
   add_delay_list_save_sigma_VGL_kernel<thrust::complex<double>, COLBS>
       <<<dimGrid, dimBlock, 0, hstream>>>(delay_list, rowchanged, delay_count, (thrust::complex<double>**)binv,
                                           binv_lda, (const thrust::complex<double>*)ratio_inv,
-                                          (const thrust::complex<double>**)phi_in,
-                                          (const thrust::complex<double>**)dphi_in,
-                                          (const thrust::complex<double>**)d2phi_in, (thrust::complex<double>**)phi_out,
-                                          (thrust::complex<double>**)dphi_out, (thrust::complex<double>**)d2phi_out,
-                                          norb, n_accepted);
+                                          (const thrust::complex<double>**)phi_vgl_in, phi_vgl_stride,
+                                          (thrust::complex<double>**)phi_out, (thrust::complex<double>**)dphi_out,
+                                          (thrust::complex<double>**)d2phi_out, norb, n_accepted);
   return cudaPeekAtLastError();
 }
 
@@ -566,7 +565,8 @@ cudaError_t applyW_batched(cudaStream_t& hstream,
   const int COLBS = 32;
   dim3 dimBlock(COLBS);
   dim3 dimGrid(batch_count);
-  applyW_kernel<cuComplex, COLBS><<<dimGrid, dimBlock, 0, hstream>>>(delay_list, delay_count, (cuComplex**)tempMat, lda);
+  applyW_kernel<cuComplex, COLBS>
+      <<<dimGrid, dimBlock, 0, hstream>>>(delay_list, delay_count, (cuComplex**)tempMat, lda);
   return cudaPeekAtLastError();
 }
 
@@ -583,7 +583,8 @@ cudaError_t applyW_batched(cudaStream_t& hstream,
   const int COLBS = 32;
   dim3 dimBlock(COLBS);
   dim3 dimGrid(batch_count);
-  applyW_kernel<cuDoubleComplex, COLBS><<<dimGrid, dimBlock, 0, hstream>>>(delay_list, delay_count, (cuDoubleComplex**)tempMat, lda);
+  applyW_kernel<cuDoubleComplex, COLBS>
+      <<<dimGrid, dimBlock, 0, hstream>>>(delay_list, delay_count, (cuDoubleComplex**)tempMat, lda);
   return cudaPeekAtLastError();
 }
 
diff --git a/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.hpp b/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.hpp
index 3a7e9a27eb..a6e5433808 100644
--- a/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.hpp
+++ b/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.hpp
@@ -40,8 +40,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream,
                                     const int lda,
                                     float* const temp[],
                                     float* const rcopy[],
-                                    const float* const dphi_in[],
-                                    const float* const d2phi_in[],
+                                    const float* const phi_vgl_in[],
+                                    const size_t phi_vgl_stride,
                                     float* const dphi_out[],
                                     float* const d2phi_out[],
                                     const int batch_count);
@@ -53,8 +53,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream,
                                     const int lda,
                                     double* const temp[],
                                     double* const rcopy[],
-                                    const double* const dphi_in[],
-                                    const double* const d2phi_in[],
+                                    const double* const phi_vgl_in[],
+                                    const size_t phi_vgl_stride,
                                     double* const dphi_out[],
                                     double* const d2phi_out[],
                                     const int batch_count);
@@ -66,8 +66,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream,
                                     const int lda,
                                     std::complex<float>* const temp[],
                                     std::complex<float>* const rcopy[],
-                                    const std::complex<float>* const dphi_in[],
-                                    const std::complex<float>* const d2phi_in[],
+                                    const std::complex<float>* const phi_vgl_in[],
+                                    const size_t phi_vgl_stride,
                                     std::complex<float>* const dphi_out[],
                                     std::complex<float>* const d2phi_out[],
                                     const int batch_count);
@@ -79,8 +79,8 @@ cudaError_t copyAinvRow_saveGL_cuda(cudaStream_t& hstream,
                                     const int lda,
                                     std::complex<double>* const temp[],
                                     std::complex<double>* const rcopy[],
-                                    const std::complex<double>* const dphi_in[],
-                                    const std::complex<double>* const d2phi_in[],
+                                    const std::complex<double>* const phi_vgl_in[],
+                                    const size_t phi_vgl_stride,
                                     std::complex<double>* const dphi_out[],
                                     std::complex<double>* const d2phi_out[],
                                     const int batch_count);
@@ -121,9 +121,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream,
                                                   float* const binv[],
                                                   const int binv_lda,
                                                   const float* const ratio_inv,
-                                                  const float* const phi_in[],
-                                                  const float* const dphi_in[],
-                                                  const float* const d2phi_in[],
+                                                  const float* const phi_vgl_in[],
+                                                  const size_t phi_vgl_stride,
                                                   float* const phi_out[],
                                                   float* const dphi_out[],
                                                   float* const d2phi_out[],
@@ -138,9 +137,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream,
                                                   double* const binv[],
                                                   const int binv_lda,
                                                   const double* const ratio_inv,
-                                                  const double* const phi_in[],
-                                                  const double* const dphi_in[],
-                                                  const double* const d2phi_in[],
+                                                  const double* const phi_vgl_in[],
+                                                  const size_t phi_vgl_stride,
                                                   double* const phi_out[],
                                                   double* const dphi_out[],
                                                   double* const d2phi_out[],
@@ -155,9 +153,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream,
                                                   std::complex<float>* const binv[],
                                                   const int binv_lda,
                                                   const std::complex<float>* const ratio_inv,
-                                                  const std::complex<float>* const phi_in[],
-                                                  const std::complex<float>* const dphi_in[],
-                                                  const std::complex<float>* const d2phi_in[],
+                                                  const std::complex<float>* const phi_vgl_in[],
+                                                  const size_t phi_vgl_stride,
                                                   std::complex<float>* const phi_out[],
                                                   std::complex<float>* const dphi_out[],
                                                   std::complex<float>* const d2phi_out[],
@@ -172,9 +169,8 @@ cudaError_t add_delay_list_save_sigma_VGL_batched(cudaStream_t& hstream,
                                                   std::complex<double>* const binv[],
                                                   const int binv_lda,
                                                   const std::complex<double>* const ratio_inv,
-                                                  const std::complex<double>* const phi_in[],
-                                                  const std::complex<double>* const dphi_in[],
-                                                  const std::complex<double>* const d2phi_in[],
+                                                  const std::complex<double>* const phi_vgl_in[],
+                                                  const size_t phi_vgl_stride,
                                                   std::complex<double>* const phi_out[],
                                                   std::complex<double>* const dphi_out[],
                                                   std::complex<double>* const d2phi_out[],

From 4c96e70da5e144a2e70dafc01fde63f2889bf497 Mon Sep 17 00:00:00 2001
From: Ye Luo <yeluo@anl.gov>
Date: Fri, 22 Apr 2022 20:31:41 -0500
Subject: [PATCH 16/36] Align SplineC2COMPTarget layout as SplineC2ROMPTarget

---
 .../BsplineFactory/SplineC2COMPTarget.cpp     | 112 +++++++++---------
 1 file changed, 58 insertions(+), 54 deletions(-)

diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.cpp
index ceb6a9c756..736b944040 100644
--- a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.cpp
+++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.cpp
@@ -26,7 +26,6 @@ inline void assign_v(ST x,
                      ST y,
                      ST z,
                      TT* restrict results_scratch_ptr,
-                     size_t orb_size,
                      const ST* restrict offload_scratch_ptr,
                      const ST* restrict myKcart_ptr,
                      size_t myKcart_padded_size,
@@ -56,8 +55,8 @@ inline void assign_vgl(ST x,
                        ST y,
                        ST z,
                        TT* restrict results_scratch_ptr,
+                       size_t orb_padded_size,
                        const ST* mKK_ptr,
-                       size_t orb_size,
                        const ST* restrict offload_scratch_ptr,
                        size_t spline_padded_size,
                        const ST symGGt[6],
@@ -86,10 +85,6 @@ inline void assign_vgl(ST x,
   const ST* restrict h12 = offload_scratch_ptr + spline_padded_size * 8;
   const ST* restrict h22 = offload_scratch_ptr + spline_padded_size * 9;
 
-  TT* restrict psi   = results_scratch_ptr;
-  TT* restrict dpsi  = results_scratch_ptr + orb_size;
-  TT* restrict d2psi = results_scratch_ptr + orb_size * 4;
-
   const size_t jr = index << 1;
   const size_t ji = jr + 1;
 
@@ -125,12 +120,18 @@ inline void assign_vgl(ST x,
   const ST lap_r   = lcart_r + mKK_ptr[index] * val_r + two * (kX * dX_i + kY * dY_i + kZ * dZ_i);
   const ST lap_i   = lcart_i + mKK_ptr[index] * val_i - two * (kX * dX_r + kY * dY_r + kZ * dZ_r);
 
-  const size_t psiIndex  = first_spo + index;
-  psi[psiIndex]          = TT(c * val_r - s * val_i, c * val_i + s * val_r);
-  d2psi[psiIndex]        = TT(c * lap_r - s * lap_i, c * lap_i + s * lap_r);
-  dpsi[psiIndex * 3]     = TT(c * gX_r - s * gX_i, c * gX_i + s * gX_r);
-  dpsi[psiIndex * 3 + 1] = TT(c * gY_r - s * gY_i, c * gY_i + s * gY_r);
-  dpsi[psiIndex * 3 + 2] = TT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r);
+  TT* restrict psi    = results_scratch_ptr;
+  TT* restrict dpsi_x = results_scratch_ptr + orb_padded_size;
+  TT* restrict dpsi_y = results_scratch_ptr + orb_padded_size * 2;
+  TT* restrict dpsi_z = results_scratch_ptr + orb_padded_size * 3;
+  TT* restrict d2psi  = results_scratch_ptr + orb_padded_size * 4;
+
+  const size_t psiIndex = first_spo + index;
+  psi[psiIndex]         = TT(c * val_r - s * val_i, c * val_i + s * val_r);
+  d2psi[psiIndex]       = TT(c * lap_r - s * lap_i, c * lap_i + s * lap_r);
+  dpsi_x[psiIndex]      = TT(c * gX_r - s * gX_i, c * gX_i + s * gX_r);
+  dpsi_y[psiIndex]      = TT(c * gY_r - s * gY_i, c * gY_i + s * gY_r);
+  dpsi_z[psiIndex]      = TT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r);
 }
 } // namespace C2C
 
@@ -236,7 +237,7 @@ void SplineC2COMPTarget<ST>::evaluateDetRatios(const VirtualParticleSet& VP,
   const auto padded_size = myV.size();
   offload_scratch.resize(padded_size * nVP);
   const auto orb_size = psiinv.size();
-  results_scratch.resize(orb_size * nVP);
+  results_scratch.resize(padded_size * nVP);
 
   // Ye: need to extract sizes and pointers before entering target region
   const auto* spline_ptr         = SplineInst->getSplinePtr();
@@ -260,7 +261,7 @@ void SplineC2COMPTarget<ST>::evaluateDetRatios(const VirtualParticleSet& VP,
         const size_t last  = omptarget::min(first + ChunkSizePerTeam, padded_size);
 
         auto* restrict offload_scratch_iat_ptr = offload_scratch_ptr + padded_size * iat;
-        auto* restrict psi_iat_ptr             = results_scratch_ptr + orb_size * iat;
+        auto* restrict psi_iat_ptr             = results_scratch_ptr + padded_size * iat;
         auto* restrict pos_scratch             = reinterpret_cast<RealType*>(psiinv_ptr + orb_size);
 
         int ix, iy, iz;
@@ -277,8 +278,7 @@ void SplineC2COMPTarget<ST>::evaluateDetRatios(const VirtualParticleSet& VP,
         PRAGMA_OFFLOAD("omp parallel for")
         for (int index = first_cplx; index < last_cplx; index++)
           C2C::assign_v(ST(pos_scratch[iat * 6]), ST(pos_scratch[iat * 6 + 1]), ST(pos_scratch[iat * 6 + 2]),
-                        psi_iat_ptr, orb_size, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size,
-                        first_spo_local, index);
+                        psi_iat_ptr, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local, index);
 
         ComplexT sum(0);
         PRAGMA_OFFLOAD("omp parallel for simd reduction(+:sum)")
@@ -355,7 +355,7 @@ void SplineC2COMPTarget<ST>::mw_evaluateDetRatios(const RefVectorWithLeader<SPOS
   mw_ratios_private.resize(mw_nVP, NumTeams);
   const auto padded_size = myV.size();
   mw_offload_scratch.resize(padded_size * mw_nVP);
-  mw_results_scratch.resize(orb_size * mw_nVP);
+  mw_results_scratch.resize(padded_size * mw_nVP);
 
   // Ye: need to extract sizes and pointers before entering target region
   const auto* spline_ptr         = SplineInst->getSplinePtr();
@@ -379,7 +379,7 @@ void SplineC2COMPTarget<ST>::mw_evaluateDetRatios(const RefVectorWithLeader<SPOS
         const size_t last  = omptarget::min(first + ChunkSizePerTeam, padded_size);
 
         auto* restrict offload_scratch_iat_ptr = offload_scratch_ptr + padded_size * iat;
-        auto* restrict psi_iat_ptr             = results_scratch_ptr + orb_size * iat;
+        auto* restrict psi_iat_ptr             = results_scratch_ptr + padded_size * iat;
         auto* ref_id_ptr = reinterpret_cast<int*>(buffer_H2D_ptr + nw * sizeof(ValueType*) + mw_nVP * 6 * sizeof(ST));
         auto* restrict psiinv_ptr  = reinterpret_cast<const ValueType**>(buffer_H2D_ptr)[ref_id_ptr[iat]];
         auto* restrict pos_scratch = reinterpret_cast<ST*>(buffer_H2D_ptr + nw * sizeof(ValueType*));
@@ -397,7 +397,7 @@ void SplineC2COMPTarget<ST>::mw_evaluateDetRatios(const RefVectorWithLeader<SPOS
         const size_t last_cplx  = omptarget::min(last / 2, orb_size);
         PRAGMA_OFFLOAD("omp parallel for")
         for (int index = first_cplx; index < last_cplx; index++)
-          C2C::assign_v(pos_scratch[iat * 6], pos_scratch[iat * 6 + 1], pos_scratch[iat * 6 + 2], psi_iat_ptr, orb_size,
+          C2C::assign_v(pos_scratch[iat * 6], pos_scratch[iat * 6 + 1], pos_scratch[iat * 6 + 2], psi_iat_ptr,
                         offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local, index);
 
         ComplexT sum(0);
@@ -505,7 +505,7 @@ void SplineC2COMPTarget<ST>::evaluateVGL(const ParticleSet& P,
   offload_scratch.resize(padded_size * 10);
   const auto orb_size = psi.size();
   // for V(1)G(3)L(1) final result
-  results_scratch.resize(orb_size * 5);
+  results_scratch.resize(padded_size * 5);
 
   // Ye: need to extract sizes and pointers before entering target region
   const auto* spline_ptr    = SplineInst->getSplinePtr();
@@ -523,7 +523,7 @@ void SplineC2COMPTarget<ST>::evaluateVGL(const ParticleSet& P,
   {
     ScopedTimer offload(offload_timer_);
     PRAGMA_OFFLOAD("omp target teams distribute num_teams(NumTeams) \
-                map(always, from: results_scratch_ptr[0:orb_size*5])")
+                map(always, from: results_scratch_ptr[0:padded_size*5])")
     for (int team_id = 0; team_id < NumTeams; team_id++)
     {
       const size_t first = ChunkSizePerTeam * team_id;
@@ -548,7 +548,7 @@ void SplineC2COMPTarget<ST>::evaluateVGL(const ParticleSet& P,
       const size_t last_cplx  = omptarget::min(last / 2, orb_size);
       PRAGMA_OFFLOAD("omp parallel for")
       for (int index = first_cplx; index < last_cplx; index++)
-        C2C::assign_vgl(x, y, z, results_scratch_ptr, mKK_ptr, orb_size, offload_scratch_ptr, padded_size, symGGt, G,
+        C2C::assign_vgl(x, y, z, results_scratch_ptr, padded_size, mKK_ptr, offload_scratch_ptr, padded_size, symGGt, G,
                         myKcart_ptr, myKcart_padded_size, first_spo_local, index);
     }
   }
@@ -556,10 +556,10 @@ void SplineC2COMPTarget<ST>::evaluateVGL(const ParticleSet& P,
   for (size_t i = 0; i < orb_size; i++)
   {
     psi[i]     = results_scratch[i];
-    dpsi[i][0] = results_scratch[orb_size + i * 3];
-    dpsi[i][1] = results_scratch[orb_size + i * 3 + 1];
-    dpsi[i][2] = results_scratch[orb_size + i * 3 + 2];
-    d2psi[i]   = results_scratch[orb_size * 4 + i];
+    dpsi[i][0] = results_scratch[i + padded_size];
+    dpsi[i][1] = results_scratch[i + padded_size * 2];
+    dpsi[i][2] = results_scratch[i + padded_size * 3];
+    d2psi[i]   = results_scratch[i + padded_size * 4];
   }
 }
 
@@ -579,7 +579,7 @@ void SplineC2COMPTarget<ST>::evaluateVGLMultiPos(const Vector<ST, OffloadPinnedA
   offload_scratch.resize(padded_size * num_pos * 10);
   const auto orb_size = psi_v_list[0].get().size();
   // for V(1)G(3)L(1) final result
-  results_scratch.resize(orb_size * num_pos * 5);
+  results_scratch.resize(padded_size * num_pos * 5);
 
   // Ye: need to extract sizes and pointers before entering target region
   const auto* spline_ptr         = SplineInst->getSplinePtr();
@@ -597,7 +597,7 @@ void SplineC2COMPTarget<ST>::evaluateVGLMultiPos(const Vector<ST, OffloadPinnedA
     ScopedTimer offload(offload_timer_);
     PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(NumTeams*num_pos) \
                     map(always, to: pos_copy_ptr[0:num_pos*6]) \
-                    map(always, from: results_scratch_ptr[0:orb_size*num_pos*5])")
+                    map(always, from: results_scratch_ptr[0:padded_size*num_pos*5])")
     for (int iw = 0; iw < num_pos; iw++)
       for (int team_id = 0; team_id < NumTeams; team_id++)
       {
@@ -605,7 +605,7 @@ void SplineC2COMPTarget<ST>::evaluateVGLMultiPos(const Vector<ST, OffloadPinnedA
         const size_t last  = omptarget::min(first + ChunkSizePerTeam, padded_size);
 
         auto* restrict offload_scratch_iw_ptr = offload_scratch_ptr + padded_size * iw * 10;
-        auto* restrict psi_iw_ptr             = results_scratch_ptr + orb_size * iw * 5;
+        auto* restrict psi_iw_ptr             = results_scratch_ptr + padded_size * iw * 5;
 
         int ix, iy, iz;
         ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
@@ -629,25 +629,25 @@ void SplineC2COMPTarget<ST>::evaluateVGLMultiPos(const Vector<ST, OffloadPinnedA
         const size_t last_cplx  = omptarget::min(last / 2, orb_size);
         PRAGMA_OFFLOAD("omp parallel for")
         for (int index = first_cplx; index < last_cplx; index++)
-          C2C::assign_vgl(pos_copy_ptr[iw * 6], pos_copy_ptr[iw * 6 + 1], pos_copy_ptr[iw * 6 + 2], psi_iw_ptr, mKK_ptr,
-                          orb_size, offload_scratch_iw_ptr, padded_size, symGGt, G, myKcart_ptr, myKcart_padded_size,
-                          first_spo_local, index);
+          C2C::assign_vgl(pos_copy_ptr[iw * 6], pos_copy_ptr[iw * 6 + 1], pos_copy_ptr[iw * 6 + 2], psi_iw_ptr,
+                          padded_size, mKK_ptr, offload_scratch_iw_ptr, padded_size, symGGt, G, myKcart_ptr,
+                          myKcart_padded_size, first_spo_local, index);
       }
   }
 
   for (int iw = 0; iw < num_pos; ++iw)
   {
-    auto* restrict results_iw_ptr = results_scratch_ptr + orb_size * iw * 5;
+    auto* restrict results_iw_ptr = results_scratch_ptr + padded_size * iw * 5;
     ValueVector& psi_v(psi_v_list[iw]);
     GradVector& dpsi_v(dpsi_v_list[iw]);
     ValueVector& d2psi_v(d2psi_v_list[iw]);
     for (size_t i = 0; i < orb_size; i++)
     {
       psi_v[i]     = results_iw_ptr[i];
-      dpsi_v[i][0] = results_iw_ptr[orb_size + i * 3];
-      dpsi_v[i][1] = results_iw_ptr[orb_size + i * 3 + 1];
-      dpsi_v[i][2] = results_iw_ptr[orb_size + i * 3 + 2];
-      d2psi_v[i]   = results_iw_ptr[orb_size * 4 + i];
+      dpsi_v[i][0] = results_iw_ptr[i + padded_size];
+      dpsi_v[i][1] = results_iw_ptr[i + padded_size * 2];
+      dpsi_v[i][2] = results_iw_ptr[i + padded_size * 3];
+      d2psi_v[i]   = results_iw_ptr[i + padded_size * 4];
     }
   }
 }
@@ -739,7 +739,7 @@ void SplineC2COMPTarget<ST>::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL
   mw_offload_scratch.resize(padded_size * num_pos * 10);
   const auto orb_size = phi_vgl_v.size() / num_pos;
   // for V(1)G(3)L(1) final result
-  mw_results_scratch.resize(orb_size * num_pos * 5);
+  mw_results_scratch.resize(padded_size * num_pos * 5);
   // per team ratio and grads
   rg_private.resize(num_pos, NumTeams * 4);
 
@@ -771,7 +771,7 @@ void SplineC2COMPTarget<ST>::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL
         const size_t last  = omptarget::min(first + ChunkSizePerTeam, padded_size);
 
         auto* restrict offload_scratch_iw_ptr = offload_scratch_ptr + padded_size * iw * 10;
-        auto* restrict psi_iw_ptr             = results_scratch_ptr + orb_size * iw * 5;
+        auto* restrict psi_iw_ptr             = results_scratch_ptr + padded_size * iw * 5;
         const auto* restrict pos_iw_ptr       = reinterpret_cast<ST*>(buffer_H2D_ptr + buffer_H2D_stride * iw);
         const auto* restrict invRow_iw_ptr =
             *reinterpret_cast<ValueType**>(buffer_H2D_ptr + buffer_H2D_stride * iw + sizeof(ST) * 6);
@@ -798,17 +798,21 @@ void SplineC2COMPTarget<ST>::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL
         const size_t last_cplx  = omptarget::min(last / 2, orb_size);
         PRAGMA_OFFLOAD("omp parallel for")
         for (int index = first_cplx; index < last_cplx; index++)
-          C2C::assign_vgl(pos_iw_ptr[0], pos_iw_ptr[1], pos_iw_ptr[2], psi_iw_ptr, mKK_ptr, orb_size,
+          C2C::assign_vgl(pos_iw_ptr[0], pos_iw_ptr[1], pos_iw_ptr[2], psi_iw_ptr, padded_size, mKK_ptr,
                           offload_scratch_iw_ptr, padded_size, symGGt, G, myKcart_ptr, myKcart_padded_size,
                           first_spo_local, index);
 
-        ValueType* restrict psi   = psi_iw_ptr;
-        ValueType* restrict dpsi  = psi_iw_ptr + orb_size;
-        ValueType* restrict d2psi = psi_iw_ptr + orb_size * 4;
+        ValueType* restrict psi    = psi_iw_ptr;
+        ValueType* restrict dpsi_x = psi_iw_ptr + padded_size;
+        ValueType* restrict dpsi_y = psi_iw_ptr + padded_size * 2;
+        ValueType* restrict dpsi_z = psi_iw_ptr + padded_size * 3;
+        ValueType* restrict d2psi  = psi_iw_ptr + padded_size * 4;
 
-        ValueType* restrict out_phi_v = phi_vgl_ptr + iw * orb_size;
-        ValueType* restrict out_phi_g = phi_vgl_ptr + phi_vgl_stride + iw * orb_size * 3;
-        ValueType* restrict out_phi_l = phi_vgl_ptr + phi_vgl_stride * 4 + iw * orb_size;
+        ValueType* restrict out_phi    = phi_vgl_ptr + iw * orb_size;
+        ValueType* restrict out_dphi_x = out_phi + phi_vgl_stride;
+        ValueType* restrict out_dphi_y = out_dphi_x + phi_vgl_stride;
+        ValueType* restrict out_dphi_z = out_dphi_y + phi_vgl_stride;
+        ValueType* restrict out_d2phi  = out_dphi_z + phi_vgl_stride;
 
         ValueType ratio(0), grad_x(0), grad_y(0), grad_z(0);
         PRAGMA_OFFLOAD("omp parallel for reduction(+: ratio, grad_x, grad_y, grad_z)")
@@ -816,16 +820,16 @@ void SplineC2COMPTarget<ST>::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL
         {
           const size_t psiIndex = first_spo_local + j;
 
-          out_phi_v[psiIndex]         = psi[psiIndex];
-          out_phi_l[psiIndex]         = d2psi[psiIndex];
-          out_phi_g[psiIndex * 3]     = dpsi[psiIndex * 3];
-          out_phi_g[psiIndex * 3 + 1] = dpsi[psiIndex * 3 + 1];
-          out_phi_g[psiIndex * 3 + 2] = dpsi[psiIndex * 3 + 2];
+          out_phi[psiIndex]    = psi[psiIndex];
+          out_dphi_x[psiIndex] = dpsi_x[psiIndex];
+          out_dphi_y[psiIndex] = dpsi_y[psiIndex];
+          out_dphi_z[psiIndex] = dpsi_z[psiIndex];
+          out_d2phi[psiIndex]  = d2psi[psiIndex];
 
           ratio += psi[psiIndex] * invRow_iw_ptr[psiIndex];
-          grad_x += dpsi[psiIndex * 3] * invRow_iw_ptr[psiIndex];
-          grad_y += dpsi[psiIndex * 3 + 1] * invRow_iw_ptr[psiIndex];
-          grad_z += dpsi[psiIndex * 3 + 2] * invRow_iw_ptr[psiIndex];
+          grad_x += dpsi_x[psiIndex] * invRow_iw_ptr[psiIndex];
+          grad_y += dpsi_y[psiIndex] * invRow_iw_ptr[psiIndex];
+          grad_z += dpsi_z[psiIndex] * invRow_iw_ptr[psiIndex];
         }
 
         rg_private_ptr[(iw * NumTeams + team_id) * 4]     = ratio;

From 8ce4c90d893bf3aa0032f9ffd9fed3795f5713de Mon Sep 17 00:00:00 2001
From: Ye Luo <yeluo@anl.gov>
Date: Fri, 10 Jun 2022 21:54:49 -0500
Subject: [PATCH 17/36] Adopt OhmmsArray in SPOSet.

---
 src/Configuration.h                           |  3 +-
 .../BsplineFactory/SplineC2COMPTarget.cpp     |  8 ++--
 .../BsplineFactory/SplineC2COMPTarget.h       |  2 +-
 .../BsplineFactory/SplineC2ROMPTarget.cpp     | 20 +++++-----
 .../BsplineFactory/SplineC2ROMPTarget.h       |  2 +-
 .../Fermion/DiracDeterminantBatched.cpp       | 21 +++-------
 .../Fermion/DiracDeterminantBatched.h         |  6 ++-
 .../Fermion/MatrixDelayedUpdateCUDA.h         | 40 +++++++++----------
 .../Fermion/MatrixUpdateOMPTarget.h           | 34 ++++++++--------
 src/QMCWaveFunctions/SPOSet.cpp               | 17 ++++----
 src/QMCWaveFunctions/SPOSet.h                 |  9 +++--
 11 files changed, 80 insertions(+), 82 deletions(-)

diff --git a/src/Configuration.h b/src/Configuration.h
index 42ba461299..44230d95a2 100644
--- a/src/Configuration.h
+++ b/src/Configuration.h
@@ -50,7 +50,8 @@ struct QMCTraits
 {
   enum
   {
-    DIM = OHMMS_DIM
+    DIM = OHMMS_DIM,
+    DIM_VGL = OHMMS_DIM + 2
   };
   using QTBase      = QMCTypes<OHMMS_PRECISION, DIM>;
   using QTFull      = QMCTypes<OHMMS_PRECISION_FULL, DIM>;
diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.cpp
index 736b944040..3844d07b37 100644
--- a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.cpp
+++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.cpp
@@ -699,7 +699,7 @@ void SplineC2COMPTarget<ST>::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL
                                                             const RefVectorWithLeader<ParticleSet>& P_list,
                                                             int iat,
                                                             const std::vector<const ValueType*>& invRow_ptr_list,
-                                                            VGLVector& phi_vgl_v,
+                                                            OffloadMWVGLArray& phi_vgl_v,
                                                             std::vector<ValueType>& ratios,
                                                             std::vector<GradType>& grads) const
 {
@@ -732,12 +732,12 @@ void SplineC2COMPTarget<ST>::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL
   }
 
   const size_t num_pos          = nwalkers;
+  const auto orb_size           = phi_vgl_v.size(2);
+  const auto padded_size        = myV.size();
   const size_t ChunkSizePerTeam = 512;
   const int NumTeams            = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
-  const auto padded_size        = myV.size();
   // for V(1)G(3)H(6) intermediate result
   mw_offload_scratch.resize(padded_size * num_pos * 10);
-  const auto orb_size = phi_vgl_v.size() / num_pos;
   // for V(1)G(3)L(1) final result
   mw_results_scratch.resize(padded_size * num_pos * 5);
   // per team ratio and grads
@@ -757,7 +757,7 @@ void SplineC2COMPTarget<ST>::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL
   auto* rg_private_ptr           = rg_private.data();
   const size_t buffer_H2D_stride = buffer_H2D.cols();
   const size_t first_spo_local   = first_spo;
-  const size_t phi_vgl_stride    = phi_vgl_v.capacity();
+  const size_t phi_vgl_stride    = num_pos * orb_size;
 
   {
     ScopedTimer offload(offload_timer_);
diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.h b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.h
index 2a006b54d9..25e5d1c8d8 100644
--- a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.h
+++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.h
@@ -267,7 +267,7 @@ class SplineC2COMPTarget : public BsplineSet
                                               const RefVectorWithLeader<ParticleSet>& P_list,
                                               int iat,
                                               const std::vector<const ValueType*>& invRow_ptr_list,
-                                              VGLVector& phi_vgl_v,
+                                              OffloadMWVGLArray& phi_vgl_v,
                                               std::vector<ValueType>& ratios,
                                               std::vector<GradType>& grads) const override;
 
diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.cpp
index 683703bc08..387d24a0ee 100644
--- a/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.cpp
+++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.cpp
@@ -279,8 +279,8 @@ void SplineC2ROMPTarget<ST>::evaluateValue(const ParticleSet& P, const int iat,
         const size_t last_cplx  = omptarget::min(last / 2, orb_size);
         PRAGMA_OFFLOAD("omp parallel for")
         for (int index = first_cplx; index < last_cplx; index++)
-          C2R::assign_v(x, y, z, psi_ptr, offload_scratch_ptr, myKcart_ptr, myKcart_padded_size,
-                        first_spo_local, nComplexBands_local, index);
+          C2R::assign_v(x, y, z, psi_ptr, offload_scratch_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local,
+                        nComplexBands_local, index);
       }
     }
   }
@@ -360,8 +360,8 @@ void SplineC2ROMPTarget<ST>::evaluateDetRatios(const VirtualParticleSet& VP,
         PRAGMA_OFFLOAD("omp parallel for")
         for (int index = first_cplx; index < last_cplx; index++)
           C2R::assign_v(ST(pos_scratch[iat * 6]), ST(pos_scratch[iat * 6 + 1]), ST(pos_scratch[iat * 6 + 2]),
-                        psi_iat_ptr, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size,
-                        first_spo_local, nComplexBands_local, index);
+                        psi_iat_ptr, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local,
+                        nComplexBands_local, index);
 
         const size_t first_real = first_cplx + omptarget::min(nComplexBands_local, first_cplx);
         const size_t last_real  = last_cplx + omptarget::min(nComplexBands_local, last_cplx);
@@ -484,8 +484,8 @@ void SplineC2ROMPTarget<ST>::mw_evaluateDetRatios(const RefVectorWithLeader<SPOS
         PRAGMA_OFFLOAD("omp parallel for")
         for (int index = first_cplx; index < last_cplx; index++)
           C2R::assign_v(ST(pos_scratch[iat * 6]), ST(pos_scratch[iat * 6 + 1]), ST(pos_scratch[iat * 6 + 2]),
-                        psi_iat_ptr, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size,
-                        first_spo_local, nComplexBands_local, index);
+                        psi_iat_ptr, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local,
+                        nComplexBands_local, index);
 
         const size_t first_real = first_cplx + omptarget::min(nComplexBands_local, first_cplx);
         const size_t last_real  = last_cplx + omptarget::min(nComplexBands_local, last_cplx);
@@ -845,7 +845,7 @@ void SplineC2ROMPTarget<ST>::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL
                                                             const RefVectorWithLeader<ParticleSet>& P_list,
                                                             int iat,
                                                             const std::vector<const ValueType*>& invRow_ptr_list,
-                                                            VGLVector& phi_vgl_v,
+                                                            OffloadMWVGLArray& phi_vgl_v,
                                                             std::vector<ValueType>& ratios,
                                                             std::vector<GradType>& grads) const
 {
@@ -878,12 +878,12 @@ void SplineC2ROMPTarget<ST>::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL
   }
 
   const size_t num_pos          = nwalkers;
+  const auto orb_size           = phi_vgl_v.size(2);
+  const auto padded_size        = myV.size();
   const size_t ChunkSizePerTeam = 512;
   const int NumTeams            = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
-  const auto padded_size        = myV.size();
   // for V(1)G(3)H(6) intermediate result
   mw_offload_scratch.resize(padded_size * num_pos * 10);
-  const auto orb_size = phi_vgl_v.size() / num_pos;
   // for V(1)G(3)L(1) final result
   mw_results_scratch.resize(padded_size * num_pos * 5);
   // per team ratio and grads
@@ -903,7 +903,7 @@ void SplineC2ROMPTarget<ST>::mw_evaluateVGLandDetRatioGrads(const RefVectorWithL
   auto* rg_private_ptr             = rg_private.data();
   const size_t buffer_H2D_stride   = buffer_H2D.cols();
   const size_t first_spo_local     = first_spo;
-  const size_t phi_vgl_stride      = phi_vgl_v.capacity();
+  const size_t phi_vgl_stride      = num_pos * orb_size;
   const size_t nComplexBands_local = nComplexBands;
 
   {
diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.h b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.h
index c1f311ee78..b183e6256c 100644
--- a/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.h
+++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.h
@@ -276,7 +276,7 @@ class SplineC2ROMPTarget : public BsplineSet
                                               const RefVectorWithLeader<ParticleSet>& P_list,
                                               int iat,
                                               const std::vector<const ValueType*>& invRow_ptr_list,
-                                              VGLVector& phi_vgl_v,
+                                              OffloadMWVGLArray& phi_vgl_v,
                                               std::vector<ValueType>& ratios,
                                               std::vector<GradType>& grads) const override;
 
diff --git a/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.cpp b/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.cpp
index b70f3431d0..fd877e858e 100644
--- a/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.cpp
+++ b/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.cpp
@@ -232,13 +232,11 @@ void DiracDeterminantBatched<DET_ENGINE>::mw_ratioGrad(const RefVectorWithLeader
 
     auto psiMinv_row_dev_ptr_list = DET_ENGINE::mw_getInvRow(engine_list, WorkingIndex, !Phi->isOMPoffload());
 
-    phi_vgl_v.resize(NumOrbitals * wfc_list.size());
+    phi_vgl_v.resize(DIM_VGL, wfc_list.size(), NumOrbitals);
     ratios_local.resize(wfc_list.size());
     grad_new_local.resize(wfc_list.size());
 
-    VectorSoaContainer<Value, DIM + 2> phi_vgl_v_view(phi_vgl_v.data(), NumOrbitals * wfc_list.size(),
-                                                      phi_vgl_v.capacity());
-    wfc_leader.Phi->mw_evaluateVGLandDetRatioGrads(phi_list, p_list, iat, psiMinv_row_dev_ptr_list, phi_vgl_v_view,
+    wfc_leader.Phi->mw_evaluateVGLandDetRatioGrads(phi_list, p_list, iat, psiMinv_row_dev_ptr_list, phi_vgl_v,
                                                    ratios_local, grad_new_local);
   }
 
@@ -317,15 +315,8 @@ void DiracDeterminantBatched<DET_ENGINE>::mw_accept_rejectMove(
     det.curRatio = 1.0;
   }
 
-  if (!Phi->isOMPoffload() && n_accepted > 0)
-  {
-    auto* phi_vgl_v_ptr = phi_vgl_v.data();
-    // transfer host to device, total size 5, v(1) + g(3) + l(1)
-    PRAGMA_OFFLOAD("omp target update to(phi_vgl_v_ptr[:phi_vgl_v.capacity()*5])")
-  }
-
   DET_ENGINE::mw_accept_rejectRow(engine_list, WorkingIndex, psiM_g_dev_ptr_list, psiM_l_dev_ptr_list, isAccepted,
-                                  phi_vgl_v.device_data(), phi_vgl_v.capacity(), ratios_local);
+                                  phi_vgl_v, ratios_local);
 
   if (!safe_to_delay)
     DET_ENGINE::mw_updateInvMat(engine_list);
@@ -595,15 +586,13 @@ void DiracDeterminantBatched<DET_ENGINE>::mw_calcRatio(const RefVectorWithLeader
 
     auto psiMinv_row_dev_ptr_list = DET_ENGINE::mw_getInvRow(engine_list, WorkingIndex, !Phi->isOMPoffload());
 
-    phi_vgl_v.resize(NumOrbitals * wfc_list.size());
+    phi_vgl_v.resize(DIM_VGL, wfc_list.size(), NumOrbitals);
     ratios_local.resize(wfc_list.size());
     grad_new_local.resize(wfc_list.size());
 
-    VectorSoaContainer<Value, DIM + 2> phi_vgl_v_view(phi_vgl_v.data(), NumOrbitals * wfc_list.size(),
-                                                      phi_vgl_v.capacity());
     // calling Phi->mw_evaluateVGLandDetRatioGrads is a temporary workaround.
     // We may implement mw_evaluateVandDetRatio in the future.
-    wfc_leader.Phi->mw_evaluateVGLandDetRatioGrads(phi_list, p_list, iat, psiMinv_row_dev_ptr_list, phi_vgl_v_view,
+    wfc_leader.Phi->mw_evaluateVGLandDetRatioGrads(phi_list, p_list, iat, psiMinv_row_dev_ptr_list, phi_vgl_v,
                                                    ratios_local, grad_new_local);
   }
 
diff --git a/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.h b/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.h
index 176a00db44..04c5de28f6 100644
--- a/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.h
+++ b/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.h
@@ -55,6 +55,8 @@ class DiracDeterminantBatched : public DiracDeterminantBase
   using DualMatrix    = Matrix<DT, PinnedDualAllocator<DT>>;
   using DualVGLVector = VectorSoaContainer<Value, DIM + 2, PinnedDualAllocator<Value>>;
 
+  using OffloadMWVGLArray = typename SPOSet::OffloadMWVGLArray;
+
   struct DiracDeterminantBatchedMultiWalkerResource : public Resource
   {
     DiracDeterminantBatchedMultiWalkerResource() : Resource("DiracDeterminantBatched") {}
@@ -64,8 +66,8 @@ class DiracDeterminantBatched : public DiracDeterminantBase
 
     Resource* makeClone() const override { return new DiracDeterminantBatchedMultiWalkerResource(*this); }
     DualVector<LogValue> log_values;
-    /// value, grads, laplacian of single-particle orbital for particle-by-particle update and multi walker [5][nw*norb]
-    DualVGLVector phi_vgl_v;
+    /// value, grads, laplacian of single-particle orbital for particle-by-particle update and multi walker [5][nw][norb]
+    OffloadMWVGLArray phi_vgl_v;
     // multi walker of ratio
     std::vector<Value> ratios_local;
     // multi walker of grads
diff --git a/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h b/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h
index 6d37f5fc54..e8837edebd 100644
--- a/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h
+++ b/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h
@@ -60,6 +60,8 @@ class MatrixDelayedUpdateCUDA
   using DualMatrix = Matrix<DT, PinnedDualAllocator<DT>>;
   template<typename DT>
   using DualVGLVector = VectorSoaContainer<DT, QMCTraits::DIM + 2, PinnedDualAllocator<DT>>;
+  template<typename DT>
+  using OffloadMWVGLArray = Array<DT, 3, OffloadPinnedAllocator<DT>>; // [VGL, walker, Orbs]
 
   struct MatrixDelayedUpdateCUDAMultiWalkerMem : public Resource
   {
@@ -269,8 +271,7 @@ class MatrixDelayedUpdateCUDA
    *  \param[in] psiM_g_list        device ptrs
    *  \param[in] psiM_l_list        device ptrs
    *  \param[in] isAccepted         bool but wait some lists are also filtered
-   *  \param[in] phi_vgl_v_dev_ptr  device ptr
-   *  \param[in] phi_vgl_stride     size of each "vector" in phi_vgl_v
+   *  \param[in] phi_vgl_v          multiple walker orbital VGL
    *  \param[inout] ratios
    */
   static void mw_updateRow(const RefVectorWithLeader<This_t>& engines,
@@ -278,8 +279,7 @@ class MatrixDelayedUpdateCUDA
                            const std::vector<Value*>& psiM_g_list,
                            const std::vector<Value*>& psiM_l_list,
                            const std::vector<bool>& isAccepted,
-                           const Value* phi_vgl_v_dev_ptr,
-                           const size_t phi_vgl_stride,
+                           const OffloadMWVGLArray<Value>& phi_vgl_v,
                            const std::vector<Value>& ratios)
   {
     auto& engine_leader = engines.getLeader();
@@ -293,14 +293,16 @@ class MatrixDelayedUpdateCUDA
     if (n_accepted == 0)
       return;
 
-    auto& hstream              = engine_leader.cuda_handles_->hstream;
-    auto& updateRow_buffer_H2D = engine_leader.mw_mem_->updateRow_buffer_H2D;
-    auto& mw_temp              = engine_leader.mw_mem_->mw_temp;
-    auto& mw_rcopy             = engine_leader.mw_mem_->mw_rcopy;
-    auto& cone_vec             = engine_leader.mw_mem_->cone_vec;
-    auto& czero_vec            = engine_leader.mw_mem_->czero_vec;
-    const int norb             = engine_leader.get_ref_psiMinv().rows();
-    const int lda              = engine_leader.get_ref_psiMinv().cols();
+    auto& hstream               = engine_leader.cuda_handles_->hstream;
+    auto& updateRow_buffer_H2D  = engine_leader.mw_mem_->updateRow_buffer_H2D;
+    auto& mw_temp               = engine_leader.mw_mem_->mw_temp;
+    auto& mw_rcopy              = engine_leader.mw_mem_->mw_rcopy;
+    auto& cone_vec              = engine_leader.mw_mem_->cone_vec;
+    auto& czero_vec             = engine_leader.mw_mem_->czero_vec;
+    const int norb              = engine_leader.get_ref_psiMinv().rows();
+    const int lda               = engine_leader.get_ref_psiMinv().cols();
+    const int nw                = engines.size();
+    const size_t phi_vgl_stride = nw * norb;
     mw_temp.resize(norb * n_accepted);
     mw_rcopy.resize(norb * n_accepted);
     updateRow_buffer_H2D.resize((sizeof(Value*) * 6 + sizeof(Value)) * n_accepted);
@@ -312,7 +314,7 @@ class MatrixDelayedUpdateCUDA
       if (isAccepted[iw])
       {
         ptr_buffer[0][count] = engines[iw].get_ref_psiMinv().device_data();
-        ptr_buffer[1][count] = const_cast<Value*>(phi_vgl_v_dev_ptr + norb * iw);
+        ptr_buffer[1][count] = const_cast<Value*>(phi_vgl_v.device_data_at(0, iw, 0));
         ptr_buffer[2][count] = mw_temp.device_data() + norb * count;
         ptr_buffer[3][count] = mw_rcopy.device_data() + norb * count;
         ptr_buffer[4][count] = psiM_g_list[count];
@@ -540,8 +542,7 @@ class MatrixDelayedUpdateCUDA
    *  \param[in] psiM_g_list
    *  \param[in] psiM_l_list
    *  \param[in] isAccepted
-   *  \param[in] phi_vgl_v_dev_ptr
-   *  \param[in] phi_vgl_stride     size of each "vector" in phi_vgl_v
+   *  \param[in] phi_vgl_v          multiple walker orbital VGL
    *  \param[inout] ratios
    */
   static void mw_accept_rejectRow(const RefVectorWithLeader<This_t>& engines,
@@ -549,8 +550,7 @@ class MatrixDelayedUpdateCUDA
                                   const std::vector<Value*>& psiM_g_list,
                                   const std::vector<Value*>& psiM_l_list,
                                   const std::vector<bool>& isAccepted,
-                                  const Value* phi_vgl_v_dev_ptr,
-                                  const size_t phi_vgl_stride,
+                                  const OffloadMWVGLArray<Value>& phi_vgl_v,
                                   const std::vector<Value>& ratios)
   {
     auto& engine_leader = engines.getLeader();
@@ -559,8 +559,7 @@ class MatrixDelayedUpdateCUDA
 
     if (engine_leader.isSM1())
     {
-      mw_updateRow(engines, rowchanged, psiM_g_list, psiM_l_list, isAccepted, phi_vgl_v_dev_ptr, phi_vgl_stride,
-                   ratios);
+      mw_updateRow(engines, rowchanged, psiM_g_list, psiM_l_list, isAccepted, phi_vgl_v, ratios);
       return;
     }
 
@@ -575,6 +574,7 @@ class MatrixDelayedUpdateCUDA
     const int lda                     = engine_leader.get_psiMinv().cols();
     const int nw                      = engines.size();
     const int n_accepted              = psiM_g_list.size();
+    const size_t phi_vgl_stride       = nw * norb;
     accept_rejectRow_buffer_H2D.resize((sizeof(Value*) * 12 + sizeof(Value)) * nw);
     engine_leader.resize_fill_constant_arrays(nw);
 
@@ -594,7 +594,7 @@ class MatrixDelayedUpdateCUDA
         ptr_buffer[6][count_accepted]  = engine.Binv_gpu.data() + delay_count;
         ptr_buffer[7][count_accepted]  = reinterpret_cast<Value*>(engine.delay_list_gpu.data());
         ptr_buffer[8][count_accepted]  = engine.V_gpu.data() + norb * delay_count;
-        ptr_buffer[9][count_accepted]  = const_cast<Value*>(phi_vgl_v_dev_ptr + norb * iw);
+        ptr_buffer[9][count_accepted]  = const_cast<Value*>(phi_vgl_v.device_data_at(0, iw, 0));
         ptr_buffer[10][count_accepted] = psiM_g_list[count_accepted];
         ptr_buffer[11][count_accepted] = psiM_l_list[count_accepted];
         c_ratio_inv[count_accepted]    = Value(1) / ratios[iw];
diff --git a/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h b/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h
index 8a5bea5f3f..94ff60e448 100644
--- a/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h
+++ b/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h
@@ -54,6 +54,8 @@ class MatrixUpdateOMPTarget
   using OffloadMatrix = Matrix<DT, OffloadPinnedAllocator<DT>>;
   template<typename DT>
   using OffloadVGLVector = VectorSoaContainer<DT, QMCTraits::DIM + 2, OffloadPinnedAllocator<DT>>;
+  template<typename DT>
+  using OffloadMWVGLArray = Array<DT, 3, OffloadPinnedAllocator<DT>>; // [VGL, walker, Orbs]
 
   struct MatrixUpdateOMPTargetMultiWalkerMem : public Resource
   {
@@ -240,23 +242,24 @@ class MatrixUpdateOMPTarget
                            const std::vector<Value*>& psiM_g_list,
                            const std::vector<Value*>& psiM_l_list,
                            const std::vector<bool>& isAccepted,
-                           const Value* phi_vgl_v_dev_ptr,
-                           const size_t phi_vgl_stride,
+                           const OffloadMWVGLArray<Value>& phi_vgl_v,
                            const std::vector<Value>& ratios)
   {
     const size_t n_accepted = psiM_g_list.size();
     if (n_accepted == 0)
       return;
 
-    auto& engine_leader = engines.getLeader();
-    auto& buffer_H2D    = engine_leader.mw_mem_->buffer_H2D;
-    auto& grads_value_v = engine_leader.mw_mem_->grads_value_v;
-    auto& cone_vec      = engine_leader.mw_mem_->cone_vec;
-    auto& czero_vec     = engine_leader.mw_mem_->czero_vec;
-    auto& mw_temp       = engine_leader.mw_mem_->mw_temp;
-    auto& mw_rcopy      = engine_leader.mw_mem_->mw_rcopy;
-    const int norb      = engine_leader.get_psiMinv().rows();
-    const int lda       = engine_leader.get_psiMinv().cols();
+    auto& engine_leader         = engines.getLeader();
+    auto& buffer_H2D            = engine_leader.mw_mem_->buffer_H2D;
+    auto& grads_value_v         = engine_leader.mw_mem_->grads_value_v;
+    auto& cone_vec              = engine_leader.mw_mem_->cone_vec;
+    auto& czero_vec             = engine_leader.mw_mem_->czero_vec;
+    auto& mw_temp               = engine_leader.mw_mem_->mw_temp;
+    auto& mw_rcopy              = engine_leader.mw_mem_->mw_rcopy;
+    const int norb              = engine_leader.get_psiMinv().rows();
+    const int lda               = engine_leader.get_psiMinv().cols();
+    const size_t nw             = isAccepted.size();
+    const size_t phi_vgl_stride = nw * norb;
 
     engine_leader.resize_scratch_arrays(norb, n_accepted);
 
@@ -264,11 +267,11 @@ class MatrixUpdateOMPTarget
     buffer_H2D.resize((sizeof(Value*) * 6 + sizeof(Value)) * n_accepted);
     Matrix<Value*> ptr_buffer(reinterpret_cast<Value**>(buffer_H2D.data()), 6, n_accepted);
     Value* c_ratio_inv = reinterpret_cast<Value*>(buffer_H2D.data() + sizeof(Value*) * 6 * n_accepted);
-    for (int iw = 0, count = 0; iw < isAccepted.size(); iw++)
+    for (int iw = 0, count = 0; iw < nw; iw++)
       if (isAccepted[iw])
       {
         ptr_buffer[0][count] = engines[iw].get_ref_psiMinv().device_data();
-        ptr_buffer[1][count] = const_cast<Value*>(phi_vgl_v_dev_ptr + norb * iw);
+        ptr_buffer[1][count] = const_cast<Value*>(phi_vgl_v.device_data_at(0, iw, 0));
         ptr_buffer[2][count] = mw_temp.device_data() + norb * count;
         ptr_buffer[3][count] = mw_rcopy.device_data() + norb * count;
         ptr_buffer[4][count] = psiM_g_list[count];
@@ -343,11 +346,10 @@ class MatrixUpdateOMPTarget
                                   const std::vector<Value*>& psiM_g_list,
                                   const std::vector<Value*>& psiM_l_list,
                                   const std::vector<bool>& isAccepted,
-                                  const Value* phi_vgl_v_dev_ptr,
-                                  const size_t phi_vgl_stride,
+                                  const OffloadMWVGLArray<Value>& phi_vgl_v,
                                   const std::vector<Value>& ratios)
   {
-    mw_updateRow(engines, rowchanged, psiM_g_list, psiM_l_list, isAccepted, phi_vgl_v_dev_ptr, phi_vgl_stride, ratios);
+    mw_updateRow(engines, rowchanged, psiM_g_list, psiM_l_list, isAccepted, phi_vgl_v, ratios);
   }
 
   /** update the full Ainv and reset delay_count
diff --git a/src/QMCWaveFunctions/SPOSet.cpp b/src/QMCWaveFunctions/SPOSet.cpp
index d2919ade80..93ade40335 100644
--- a/src/QMCWaveFunctions/SPOSet.cpp
+++ b/src/QMCWaveFunctions/SPOSet.cpp
@@ -112,20 +112,20 @@ void SPOSet::mw_evaluateVGLandDetRatioGrads(const RefVectorWithLeader<SPOSet>& s
                                             const RefVectorWithLeader<ParticleSet>& P_list,
                                             int iat,
                                             const std::vector<const ValueType*>& invRow_ptr_list,
-                                            VGLVector& phi_vgl_v,
+                                            OffloadMWVGLArray& phi_vgl_v,
                                             std::vector<ValueType>& ratios,
                                             std::vector<GradType>& grads) const
 {
   assert(this == &spo_list.getLeader());
-  assert(OrbitalSetSize * nw == phi_vgl_v.size());
+  assert(phi_vgl_v.size(0) == DIM_VGL);
+  assert(phi_vgl_v.size(1) == spo_list.size());
   const size_t nw             = spo_list.size();
-  const size_t norb_requested = phi_vgl_v.size() / nw;
+  const size_t norb_requested = phi_vgl_v.size(2);
   GradVector dphi_v(norb_requested);
-#pragma omp parallel for
   for (int iw = 0; iw < nw; iw++)
   {
-    ValueVector phi_v(phi_vgl_v.data() + norb_requested * iw, norb_requested);
-    ValueVector d2phi_v(phi_vgl_v.data(4) + norb_requested * iw, norb_requested);
+    ValueVector phi_v(phi_vgl_v.data_at(0, iw, 0), norb_requested);
+    ValueVector d2phi_v(phi_vgl_v.data_at(4, iw, 0), norb_requested);
     spo_list[iw].evaluateVGL(P_list[iw], iat, phi_v, dphi_v, d2phi_v);
 
     ratios[iw] = simd::dot(invRow_ptr_list[iw], phi_v.data(), norb_requested);
@@ -134,11 +134,12 @@ void SPOSet::mw_evaluateVGLandDetRatioGrads(const RefVectorWithLeader<SPOSet>& s
     // transpose the array of gradients to SoA in phi_vgl_v
     for (size_t idim = 0; idim < DIM; idim++)
     {
-      ValueType* phi_g = phi_vgl_v.data(idim + 1) + norb_requested * iw;
-      for (size_t iorb = 0; iorb < OrbitalSetSize; iorb++)
+      ValueType* phi_g = phi_vgl_v.data_at(idim + 1, iw, 0);
+      for (size_t iorb = 0; iorb < norb_requested; iorb++)
         phi_g[iorb] = dphi_v[iorb][idim];
     }
   }
+  phi_vgl_v.updateTo();
 }
 
 void SPOSet::evaluateThirdDeriv(const ParticleSet& P, int first, int last, GGGMatrix& grad_grad_grad_logdet)
diff --git a/src/QMCWaveFunctions/SPOSet.h b/src/QMCWaveFunctions/SPOSet.h
index bc031a4591..4c17b1bb58 100644
--- a/src/QMCWaveFunctions/SPOSet.h
+++ b/src/QMCWaveFunctions/SPOSet.h
@@ -27,6 +27,7 @@
 #ifdef QMC_CUDA
 #include "type_traits/CUDATypes.h"
 #endif
+#include "OMPTarget/OffloadAlignedAllocators.hpp"
 
 namespace qmcplusplus
 {
@@ -56,6 +57,8 @@ class SPOSet : public QMCTraits
   using Walker_t    = ParticleSet::Walker_t;
   using SPOPool_t   = std::map<std::string, SPOSet*>;
 
+  using OffloadMWVGLArray = Array<ValueType, 3, OffloadPinnedAllocator<ValueType>>; // [VGL, walker, Orbs]
+
   /** constructor */
   SPOSet(bool use_OMP_offload = false, bool ion_deriv = false, bool optimizable = false);
 
@@ -283,8 +286,8 @@ class SPOSet : public QMCTraits
                                       const RefVector<ValueVector>& d2psi_v_list,
                                       const RefVector<ValueVector>& dspin_v_list) const;
 
-  /** evaluate the values, gradients and laplacians of this single-particle orbital sets
-   *  and determinant ratio and grads of multiple walkers
+  /** evaluate the values, gradients and laplacians of this single-particle orbital sets and determinant ratio
+   *  and grads of multiple walkers. Device data of phi_vgl_v must be up-to-date upon return
    * @param spo_list the list of SPOSet pointers in a walker batch
    * @param P_list the list of ParticleSet pointers in a walker batch
    * @param iat active particle
@@ -295,7 +298,7 @@ class SPOSet : public QMCTraits
                                               const RefVectorWithLeader<ParticleSet>& P_list,
                                               int iat,
                                               const std::vector<const ValueType*>& invRow_ptr_list,
-                                              VGLVector& phi_vgl_v,
+                                              OffloadMWVGLArray& phi_vgl_v,
                                               std::vector<ValueType>& ratios,
                                               std::vector<GradType>& grads) const;
 

From 6ad3a904b5e095d52d261c9113c4d13cb90587e6 Mon Sep 17 00:00:00 2001
From: Ye Luo <yeluo@anl.gov>
Date: Mon, 13 Jun 2022 23:05:33 -0500
Subject: [PATCH 18/36] Verbose magic number

---
 src/Configuration.h                           |  2 +-
 .../Fermion/MatrixDelayedUpdateCUDA.h         | 37 ++++++++++++-------
 .../Fermion/MatrixUpdateOMPTarget.h           | 16 ++++----
 3 files changed, 34 insertions(+), 21 deletions(-)

diff --git a/src/Configuration.h b/src/Configuration.h
index 44230d95a2..0244546422 100644
--- a/src/Configuration.h
+++ b/src/Configuration.h
@@ -51,7 +51,7 @@ struct QMCTraits
   enum
   {
     DIM = OHMMS_DIM,
-    DIM_VGL = OHMMS_DIM + 2
+    DIM_VGL = OHMMS_DIM + 2 // Value(1) + Gradients(OHMMS_DIM) + Laplacian(1)
   };
   using QTBase      = QMCTypes<OHMMS_PRECISION, DIM>;
   using QTFull      = QMCTypes<OHMMS_PRECISION_FULL, DIM>;
diff --git a/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h b/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h
index e8837edebd..55c187a4b9 100644
--- a/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h
+++ b/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h
@@ -206,11 +206,13 @@ class MatrixDelayedUpdateCUDA
     const int norb                   = engine_leader.get_psiMinv().rows();
     const int nw                     = engines.size();
     int& delay_count                 = engine_leader.delay_count;
-    prepare_inv_row_buffer_H2D.resize(sizeof(Value*) * 7 * nw);
+
+    constexpr size_t num_ptrs_packed = 7; // it must match packing and unpacking
+    prepare_inv_row_buffer_H2D.resize(sizeof(Value*) * num_ptrs_packed * nw);
     engine_leader.resize_fill_constant_arrays(nw);
 
     const int lda_Binv = engine_leader.Binv_gpu.cols();
-    Matrix<Value*> ptr_buffer(reinterpret_cast<Value**>(prepare_inv_row_buffer_H2D.data()), 7, nw);
+    Matrix<Value*> ptr_buffer(reinterpret_cast<Value**>(prepare_inv_row_buffer_H2D.data()), num_ptrs_packed, nw);
     for (int iw = 0; iw < nw; iw++)
     {
       This_t& engine    = engines[iw];
@@ -305,11 +307,14 @@ class MatrixDelayedUpdateCUDA
     const size_t phi_vgl_stride = nw * norb;
     mw_temp.resize(norb * n_accepted);
     mw_rcopy.resize(norb * n_accepted);
-    updateRow_buffer_H2D.resize((sizeof(Value*) * 6 + sizeof(Value)) * n_accepted);
+
+    constexpr size_t num_ptrs_packed = 6; // it must match packing and unpacking
+    updateRow_buffer_H2D.resize((sizeof(Value*) * num_ptrs_packed + sizeof(Value)) * n_accepted);
 
     // to handle T** of Ainv, psi_v, temp, rcopy
-    Matrix<Value*> ptr_buffer(reinterpret_cast<Value**>(updateRow_buffer_H2D.data()), 6, n_accepted);
-    Value* c_ratio_inv = reinterpret_cast<Value*>(updateRow_buffer_H2D.data() + sizeof(Value*) * 6 * n_accepted);
+    Matrix<Value*> ptr_buffer(reinterpret_cast<Value**>(updateRow_buffer_H2D.data()), num_ptrs_packed, n_accepted);
+    Value* c_ratio_inv =
+        reinterpret_cast<Value*>(updateRow_buffer_H2D.data() + sizeof(Value*) * num_ptrs_packed * n_accepted);
     for (int iw = 0, count = 0; iw < isAccepted.size(); iw++)
       if (isAccepted[iw])
       {
@@ -448,9 +453,10 @@ class MatrixDelayedUpdateCUDA
     auto& evalGrad_buffer_H2D = engine_leader.mw_mem_->evalGrad_buffer_H2D;
     auto& grads_value_v       = engine_leader.mw_mem_->grads_value_v;
 
-    const int nw = engines.size();
-    evalGrad_buffer_H2D.resize(sizeof(Value*) * 2 * nw);
-    Matrix<const Value*> ptr_buffer(reinterpret_cast<const Value**>(evalGrad_buffer_H2D.data()), 2, nw);
+    const int nw                     = engines.size();
+    constexpr size_t num_ptrs_packed = 2; // it must match packing and unpacking
+    evalGrad_buffer_H2D.resize(sizeof(Value*) * num_ptrs_packed * nw);
+    Matrix<const Value*> ptr_buffer(reinterpret_cast<const Value**>(evalGrad_buffer_H2D.data()), num_ptrs_packed, nw);
     for (int iw = 0; iw < nw; iw++)
     {
       if (engine_leader.isSM1())
@@ -575,11 +581,14 @@ class MatrixDelayedUpdateCUDA
     const int nw                      = engines.size();
     const int n_accepted              = psiM_g_list.size();
     const size_t phi_vgl_stride       = nw * norb;
-    accept_rejectRow_buffer_H2D.resize((sizeof(Value*) * 12 + sizeof(Value)) * nw);
+
+    constexpr size_t num_ptrs_packed = 12; // it must match packing and unpacking
+    accept_rejectRow_buffer_H2D.resize((sizeof(Value*) * num_ptrs_packed + sizeof(Value)) * nw);
     engine_leader.resize_fill_constant_arrays(nw);
 
-    Matrix<Value*> ptr_buffer(reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.data()), 12, nw);
-    Value* c_ratio_inv = reinterpret_cast<Value*>(accept_rejectRow_buffer_H2D.data() + sizeof(Value*) * 12 * nw);
+    Matrix<Value*> ptr_buffer(reinterpret_cast<Value**>(accept_rejectRow_buffer_H2D.data()), num_ptrs_packed, nw);
+    Value* c_ratio_inv =
+        reinterpret_cast<Value*>(accept_rejectRow_buffer_H2D.data() + sizeof(Value*) * num_ptrs_packed * nw);
     for (int iw = 0, count_accepted = 0, count_rejected = 0; iw < nw; iw++)
     {
       This_t& engine = engines[iw];
@@ -693,10 +702,12 @@ class MatrixDelayedUpdateCUDA
     const int norb             = engine_leader.get_psiMinv().rows();
     const int lda              = engine_leader.get_psiMinv().cols();
     const int nw               = engines.size();
-    updateInv_buffer_H2D.resize(sizeof(Value*) * 6 * nw);
+
+    constexpr size_t num_ptrs_packed = 6; // it must match packing and unpacking
+    updateInv_buffer_H2D.resize(sizeof(Value*) * num_ptrs_packed * nw);
     engine_leader.resize_fill_constant_arrays(nw);
 
-    Matrix<Value*> ptr_buffer(reinterpret_cast<Value**>(updateInv_buffer_H2D.data()), 6, nw);
+    Matrix<Value*> ptr_buffer(reinterpret_cast<Value**>(updateInv_buffer_H2D.data()), num_ptrs_packed, nw);
     for (int iw = 0; iw < nw; iw++)
     {
       This_t& engine    = engines[iw];
diff --git a/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h b/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h
index 94ff60e448..6ce11595b1 100644
--- a/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h
+++ b/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h
@@ -157,10 +157,11 @@ class MatrixUpdateOMPTarget
     auto& buffer_H2D    = engine_leader.mw_mem_->buffer_H2D;
     auto& grads_value_v = engine_leader.mw_mem_->grads_value_v;
 
-    const int norb = engine_leader.get_psiMinv().rows();
-    const int nw   = engines.size();
-    buffer_H2D.resize(sizeof(Value*) * 2 * nw);
-    Matrix<const Value*> ptr_buffer(reinterpret_cast<const Value**>(buffer_H2D.data()), 2, nw);
+    const int norb                   = engine_leader.get_psiMinv().rows();
+    const int nw                     = engines.size();
+    constexpr size_t num_ptrs_packed = 2; // it must match packing and unpacking
+    buffer_H2D.resize(sizeof(Value*) * num_ptrs_packed * nw);
+    Matrix<const Value*> ptr_buffer(reinterpret_cast<const Value**>(buffer_H2D.data()), num_ptrs_packed, nw);
     for (int iw = 0; iw < nw; iw++)
     {
       ptr_buffer[0][iw] = engines[iw].get_psiMinv().device_data() + rowchanged * engine_leader.get_psiMinv().cols();
@@ -264,9 +265,10 @@ class MatrixUpdateOMPTarget
     engine_leader.resize_scratch_arrays(norb, n_accepted);
 
     // to handle Value** of Ainv, psi_v, temp, rcopy
-    buffer_H2D.resize((sizeof(Value*) * 6 + sizeof(Value)) * n_accepted);
-    Matrix<Value*> ptr_buffer(reinterpret_cast<Value**>(buffer_H2D.data()), 6, n_accepted);
-    Value* c_ratio_inv = reinterpret_cast<Value*>(buffer_H2D.data() + sizeof(Value*) * 6 * n_accepted);
+    constexpr size_t num_ptrs_packed = 6; // it must match packing and unpacking
+    buffer_H2D.resize((sizeof(Value*) * num_ptrs_packed + sizeof(Value)) * n_accepted);
+    Matrix<Value*> ptr_buffer(reinterpret_cast<Value**>(buffer_H2D.data()), num_ptrs_packed, n_accepted);
+    Value* c_ratio_inv = reinterpret_cast<Value*>(buffer_H2D.data() + sizeof(Value*) * num_ptrs_packed * n_accepted);
     for (int iw = 0, count = 0; iw < nw; iw++)
       if (isAccepted[iw])
       {

From 0d4d5e9db8dcda2e2f35b7d68dc89afe5981b47d Mon Sep 17 00:00:00 2001
From: Hyeondeok-Shin <hyeondeok.shin@gmail.com>
Date: Tue, 14 Jun 2022 11:49:53 -0500
Subject: [PATCH 19/36] Adjust tolerance for deterministic test for SOREP

---
 tests/solids/bccMo_2x1x1_SOREP/CMakeLists.txt | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/solids/bccMo_2x1x1_SOREP/CMakeLists.txt b/tests/solids/bccMo_2x1x1_SOREP/CMakeLists.txt
index fee8c897d9..c130bbde4b 100644
--- a/tests/solids/bccMo_2x1x1_SOREP/CMakeLists.txt
+++ b/tests/solids/bccMo_2x1x1_SOREP/CMakeLists.txt
@@ -80,11 +80,11 @@ if(QMC_COMPLEX)
 #Deterministic test
 
   if(NOT QMC_MIXED_PRECISION)
-    list(APPEND DET_BCC_MO_VMC_SCALARS "totenergy" "-136.89324595 0.000001")
+    list(APPEND DET_BCC_MO_VMC_SCALARS "totenergy" "-136.89324595 0.000002")
     list(APPEND DET_BCC_MO_VMC_SCALARS "kinetic" "48.19451127 0.000001")
-    list(APPEND DET_BCC_MO_VMC_SCALARS "potential" "-185.08775723 0.000001")
+    list(APPEND DET_BCC_MO_VMC_SCALARS "potential" "-185.08775723 0.000002")
     list(APPEND DET_BCC_MO_VMC_SCALARS "eeenergy" "0.73302788 0.000001")
-    list(APPEND DET_BCC_MO_VMC_SCALARS "ionion" "-119.82754227 0.000001")
+    list(APPEND DET_BCC_MO_VMC_SCALARS "ionion" "-119.82754227 0.000002")
     list(APPEND DET_BCC_MO_VMC_SCALARS "localecp" "-90.13681859 0.000001")
     list(APPEND DET_BCC_MO_VMC_SCALARS "nonlocalecp" "24.16055828 0.000001")
     list(APPEND DET_BCC_MO_VMC_SCALARS "soecp" "-0.01698252 0.000001")
@@ -102,11 +102,11 @@ if(QMC_COMPLEX)
       DET_BCC_MO_VMC_SCALARS # VMC
     )
 
-    list(APPEND DET_BCC_MO_DMC_SCALARS "totenergy" "-135.29589113 0.000001")
+    list(APPEND DET_BCC_MO_DMC_SCALARS "totenergy" "-135.29589113 0.000002")
     list(APPEND DET_BCC_MO_DMC_SCALARS "kinetic" "29.19088964 0.000001")
-    list(APPEND DET_BCC_MO_DMC_SCALARS "potential" "-164.48678078 0.000001")
+    list(APPEND DET_BCC_MO_DMC_SCALARS "potential" "-164.48678078 0.000002")
     list(APPEND DET_BCC_MO_DMC_SCALARS "eeenergy" "-2.19402623 0.000001")
-    list(APPEND DET_BCC_MO_DMC_SCALARS "ionion" "-119.82754227 0.000001")
+    list(APPEND DET_BCC_MO_DMC_SCALARS "ionion" "-119.82754227 0.000002")
     list(APPEND DET_BCC_MO_DMC_SCALARS "localecp" "-64.57264081 0.000001")
     list(APPEND DET_BCC_MO_DMC_SCALARS "nonlocalecp" "22.11861933 0.000001")
     list(APPEND DET_BCC_MO_DMC_SCALARS "soecp" "-0.01119080 0.000001")

From 21c6bf1801df18a67fef2af25228ebb629c0ff86 Mon Sep 17 00:00:00 2001
From: Ye Luo <yeluo@anl.gov>
Date: Wed, 15 Jun 2022 16:15:41 -0500
Subject: [PATCH 20/36] Fix legacy CUDA compilation issue.

---
 src/Particle/ParticleSet.cpp | 7 -------
 src/Particle/Walker.h        | 2 +-
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/src/Particle/ParticleSet.cpp b/src/Particle/ParticleSet.cpp
index 8e69f67fbc..b6c72606fc 100644
--- a/src/Particle/ParticleSet.cpp
+++ b/src/Particle/ParticleSet.cpp
@@ -33,13 +33,6 @@ namespace qmcplusplus
 {
 using WP = WalkerProperties::Indexes;
 
-//using namespace particle_info;
-
-#ifdef QMC_CUDA
-template<>
-int ParticleSet::Walker_t::cuda_DataSize = 0;
-#endif
-
 enum PSetTimers
 {
   PS_newpos,
diff --git a/src/Particle/Walker.h b/src/Particle/Walker.h
index 03347145d5..91847fcaba 100644
--- a/src/Particle/Walker.h
+++ b/src/Particle/Walker.h
@@ -154,7 +154,7 @@ class Walker
 
   /// Data for GPU-vectorized versions
 #ifdef QMC_CUDA
-  static inline int cuda_DataSize;
+  static inline int cuda_DataSize = 0;
   using cuda_Buffer_t = gpu::device_vector<CTS::ValueType>;
   cuda_Buffer_t cuda_DataSet;
   // Note that R_GPU has size N+1.  The last element contains the

From a25bf089bca193b0f619827311cf6c5460c90ccd Mon Sep 17 00:00:00 2001
From: Hyeondeok-Shin <hyeondeok.shin@gmail.com>
Date: Wed, 15 Jun 2022 16:22:29 -0500
Subject: [PATCH 21/36] Adjust tolerance for unit_test_estimators for One-body
 densty matrices on mixed precision build

---
 src/Estimators/tests/test_OneBodyDensityMatrices.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Estimators/tests/test_OneBodyDensityMatrices.cpp b/src/Estimators/tests/test_OneBodyDensityMatrices.cpp
index fc1e704a73..b448c5ef8c 100644
--- a/src/Estimators/tests/test_OneBodyDensityMatrices.cpp
+++ b/src/Estimators/tests/test_OneBodyDensityMatrices.cpp
@@ -140,7 +140,7 @@ class OneBodyDensityMatricesTests
       auto* test_data = reinterpret_cast<std::complex<Real>*>(test_in);
       for (size_t id = 0; id < size; id += 2)
 #if defined(MIXED_PRECISION)
-        CHECK(ref_data[id] == ComplexApprox(test_data[id]).epsilon(1e-4));
+        CHECK(ref_data[id] == ComplexApprox(test_data[id]).epsilon(6e-4));
 #else
         CHECK(ref_data[id] == ComplexApprox(test_data[id]));
 #endif

From e0df1eb58bbdff5ea324568f36a9de34944a54bc Mon Sep 17 00:00:00 2001
From: Ye Luo <yeluo@anl.gov>
Date: Wed, 15 Jun 2022 17:27:28 -0500
Subject: [PATCH 22/36] Derive OpenMP OFFLOAD_ARCH based on HIP_ARCH.

---
 CMake/ClangCompilers.cmake | 22 ++++++++++++++++++----
 CMakeLists.txt             |  2 +-
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/CMake/ClangCompilers.cmake b/CMake/ClangCompilers.cmake
index 00d6f28cb4..c8ba9ddf9b 100644
--- a/CMake/ClangCompilers.cmake
+++ b/CMake/ClangCompilers.cmake
@@ -32,10 +32,6 @@ if(QMC_OMP)
       set(OPENMP_OFFLOAD_COMPILE_OPTIONS "${OPENMP_OFFLOAD_COMPILE_OPTIONS} -Wno-linker-warnings")
     endif()
 
-    if(NOT DEFINED OFFLOAD_ARCH AND OFFLOAD_TARGET MATCHES "amdgcn")
-      set(OFFLOAD_ARCH gfx906)
-    endif()
-
     if(NOT DEFINED OFFLOAD_ARCH
        AND OFFLOAD_TARGET MATCHES "nvptx64"
        AND DEFINED CMAKE_CUDA_ARCHITECTURES)
@@ -51,6 +47,24 @@ if(QMC_OMP)
       endif()
     endif()
 
+    if(NOT DEFINED OFFLOAD_ARCH
+       AND OFFLOAD_TARGET MATCHES "amdgcn")
+      if (DEFINED HIP_ARCH)
+        list(LENGTH HIP_ARCH NUMBER_HIP_ARCHITECTURES)
+        if(NUMBER_HIP_ARCHITECTURES EQUAL "1")
+          set(OFFLOAD_ARCH ${HIP_ARCH})
+        else()
+          message(
+            FATAL_ERROR
+              "LLVM does not yet support offload to multiple architectures! "
+              "Deriving OFFLOAD_ARCH from HIP_ARCH failed. "
+              "Please keep only one entry in HIP_ARCH or set OFFLOAD_ARCH.")
+        endif()
+      else()
+        set(OFFLOAD_ARCH gfx906)
+      endif()
+    endif()
+
     if(DEFINED OFFLOAD_ARCH)
       set(OPENMP_OFFLOAD_COMPILE_OPTIONS
           "${OPENMP_OFFLOAD_COMPILE_OPTIONS} -Xopenmp-target=${OFFLOAD_TARGET} -march=${OFFLOAD_ARCH}")
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7517985f36..46f4eaeeab 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -779,7 +779,7 @@ if(ENABLE_ROCM)
     message(STATUS "ROCM_ROOT not provided. Searching for FindHIP.cmake file.")
     find_path(
       HIP_MODULE_FILE_DIR FindHIP.cmake
-      HINTS /opt/rocm
+      HINTS $ENV{ROCM_PATH} /opt/rocm
       PATH_SUFFIXES hip/cmake)
     if(HIP_MODULE_FILE_DIR)
       message(STATUS "Found FindHIP.cmake file. ROCM_ROOT will be derived.")

From 1aed50d855c20d871fb62736fc5ad16c14c9e991 Mon Sep 17 00:00:00 2001
From: Ye Luo <yeluo@anl.gov>
Date: Wed, 15 Jun 2022 18:12:48 -0500
Subject: [PATCH 23/36] Update OpenMP offload build instruction.

---
 docs/installation.rst | 41 ++++++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/docs/installation.rst b/docs/installation.rst
index fd7fa722ff..b5cd4fbe24 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -282,20 +282,20 @@ the path to the source directory.
 
   ::
 
-    QMC_CUDA              Enable legacy CUDA code path for NVIDIA GPU acceleration (1:yes, 0:no)
     QMC_COMPLEX           Build the complex (general twist/k-point) version (1:yes, 0:no)
     QMC_MIXED_PRECISION   Build the mixed precision (mixing double/float) version
                           (1:yes (QMC_CUDA=1 default), 0:no (QMC_CUDA=0 default)).
                           Mixed precision calculations can be signifiantly faster but should be
                           carefully checked validated against full double precision runs,
                           particularly for large electron counts.
+    ENABLE_OFFLOAD        ON/OFF(default). Enable OpenMP target offload for GPU acceleration.
+    QMC_CUDA              Enable legacy CUDA code path for NVIDIA GPU acceleration (1:yes, 0:no)
     ENABLE_CUDA           ON/OFF(default). Enable CUDA code path for NVIDIA GPU acceleration.
-                          Production quality for AFQMC. Pre-production quality for real-space.
+                          Production quality for AFQMC and real-space performance portable implementation.
                           Use CMAKE_CUDA_ARCHITECTURES, default 70, to set the actual GPU architecture.
-    ENABLE_OFFLOAD        ON/OFF(default). Enable OpenMP target offload for GPU acceleration.
-    ENABLE_TIMERS         ON(default)/OFF. Enable fine-grained timers. Timers are on by default but at level coarse
-                          to avoid potential slowdown in tiny systems.
-                          For systems beyond tiny sizes (100+ electrons) there is no risk.
+    QMC_CUDA2HIP          ON/OFF(default). To be set ON, it requires either QMC_CUDA or ENABLE_CUDA to be ON.
+                          Compile CUDA source code as HIP and use ROCm libraries for AMD GPUs.
+    ENABLE_SYCL           ON/OFF(default). Enable SYCL code path. Only support Intel GPUs and OneAPI compilers.
 
 - General build options
 
@@ -327,6 +327,9 @@ the path to the source directory.
 
   ::
 
+    ENABLE_TIMERS          ON(default)/OFF. Enable fine-grained timers. Timers are on by default but at level coarse
+                           to avoid potential slowdown in tiny systems.
+                           For systems beyond tiny sizes (100+ electrons) there is no risk.
     QE_BIN                 Location of Quantum ESPRESSO binaries including pw2qmcpack.x
     RMG_BIN                Location of RMG binary (rmg-cpu)
     QMC_DATA               Specify data directory for QMCPACK performance and integration tests
@@ -412,7 +415,7 @@ and is not suitable for production. Additional implementation in QMCPACK as
 well as improvements in open-source and vendor compilers is required for production status 
 to be reached. The following compilers have been verified:
 
-- LLVM Clang 11. Support NVIDIA GPUs.
+- LLVM Clang 14. Support NVIDIA GPUs.
 
   ::
 
@@ -425,31 +428,43 @@ to be reached. The following compilers have been verified:
     OFFLOAD_TARGET for the offload target. default nvptx64-nvidia-cuda.
     OFFLOAD_ARCH for the target architecture (sm_80, gfx906, ...) if not using the compiler default.
 
-- AMD AOMP Clang 11.8. Support AMD GPUs.
+- AMD ROCm/AOMP LLVM-based compilers. Support AMD GPUs.
   
   ::
   
     -D ENABLE_OFFLOAD=ON -D OFFLOAD_TARGET=amdgcn-amd-amdhsa -D OFFLOAD_ARCH=gfx906
 
-- Intel oneAPI beta08. Support Intel GPUs.
+- Intel oneAPI 2022.1.0 icx/icpx compilers. Support Intel GPUs.
   
   ::
   
     -D ENABLE_OFFLOAD=ON -D OFFLOAD_TARGET=spir64
 
-- HPE Cray 11. It is derived from Clang and supports NVIDIA and AMD GPUs.
+- HPE Cray 13. It is derived from Clang and supports NVIDIA and AMD GPUs.
   
   ::
   
     -D ENABLE_OFFLOAD=ON -D OFFLOAD_TARGET=nvptx64-nvidia-cuda -D OFFLOAD_ARCH=sm_80
 
 OpenMP offload features can be used together with vendor specific code paths to maximize QMCPACK performance.
-Some new CUDA functionality has been implemented to improve efficiency on NVIDIA GPUs in conjunction with the Offload code paths:
-For example, using Clang 11 on Summit.
+Some new CUDA functionality has been implemented to improve performance on NVIDIA GPUs in conjunction with the offload code paths:
+For example, using Clang 14 on Summit.
 
   ::
   
-    -D ENABLE_OFFLOAD=ON -D USE_OBJECT_TARGET=ON -D ENABLE_CUDA=ON -D CMAKE_CUDA_ARCHITECTURES=70 -D CMAKE_CUDA_HOST_COMPILER=`which gcc`
+    -D ENABLE_OFFLOAD=ON -D USE_OBJECT_TARGET=ON -D ENABLE_CUDA=ON -D CMAKE_CUDA_ARCHITECTURES=70
+
+Similarly, HIP features can be enabled in conjunction with the offload code path to improve performance on AMD GPUs.
+
+  ::
+
+    -D ENABLE_OFFLOAD=ON -D ENABLE_CUDA=ON -D QMC_CUDA2HIP=ON -DHIP_ARCH=gfx906
+
+Similarly, SYCL features can be enabled in conjunction with the offload code path to improve performance on Intel GPUs.
+
+  ::
+
+    -D ENABLE_OFFLOAD=ON -D ENABLE_SYCL=ON
 
 
 Installation from CMake

From 19acb573dfa04913a00cff4af847a6b9308a9898 Mon Sep 17 00:00:00 2001
From: Ye Luo <yeluo@anl.gov>
Date: Mon, 20 Jun 2022 09:07:19 -0500
Subject: [PATCH 24/36] Update nightly_anl_bora.sh

---
 .../nightly_test_scripts/nightly_anl_bora.sh  | 80 ++++++++++++++-----
 1 file changed, 60 insertions(+), 20 deletions(-)

diff --git a/tests/test_automation/nightly_test_scripts/nightly_anl_bora.sh b/tests/test_automation/nightly_test_scripts/nightly_anl_bora.sh
index 0e1731e8fa..0799667cbe 100755
--- a/tests/test_automation/nightly_test_scripts/nightly_anl_bora.sh
+++ b/tests/test_automation/nightly_test_scripts/nightly_anl_bora.sh
@@ -13,11 +13,14 @@ fi
 
 module load cmake
 module load intel-mkl
-module load cuda/10.1
+
+module use /nfs/gce/projects/QMCPACK_dev/spack/share/spack/lmod/linux-ubuntu18.04-x86_64/Core
+module load cuda/11.2.2-lo3x6k7
 
 export TEST_SITE_NAME=bora.alcf.anl.gov
 export N_PROCS=16
 export N_PROCS_BUILD=16
+export N_CONCURRENT_TESTS=16
 
 # run on socket 1
 NUMA_ID=0
@@ -26,7 +29,7 @@ QE_BIN=/scratch/opt/qe-stable/qe-6.4.1/bin
 QMC_DATA=/scratch/opt/h5data
 
 #Must be an absolute path
-place=/scratch/QMCPACK_CI_BUILDS_DO_NOT_REMOVE
+place=/scratch2/QMCPACK_CI_BUILDS_DO_NOT_REMOVE
 
 if [ ! -e $place ]; then
 mkdir $place
@@ -56,9 +59,11 @@ cd $entry
 
 git checkout $branch
 
-for sys in ClangDev-Offload-CUDA-Real ClangDev-Offload-CUDA-Complex ClangDev-Offload-CUDA-Real-Mixed ClangDev-Offload-CUDA-Complex-Mixed \
-           Intel18-Real Intel18-Real-Mixed Intel18-Complex Intel18-Complex-Mixed \
-           Intel18-Real-Mixed-CUDA2 Intel18-Complex-Mixed-CUDA2 Intel18-Real-Mixed-legacy-CUDA Intel18-Complex-Mixed-legacy-CUDA
+for sys in ROCm-CUDA2HIP-Real ROCm-CUDA2HIP-Complex ROCm-CUDA2HIP-Real-Mixed ROCm-CUDA2HIP-Complex-Mixed \
+           Intel19-Real Intel19-Real-Mixed Intel19-Complex Intel19-Complex-Mixed \
+           ClangDev-Offload-Real ClangDev-Offload-Complex ClangDev-Offload-Real-Mixed ClangDev-Offload-Complex-Mixed \
+           ClangDev-Offload-CUDA-Real ClangDev-Offload-CUDA-Complex ClangDev-Offload-CUDA-Real-Mixed ClangDev-Offload-CUDA-Complex-Mixed \
+           Intel19-CUDA2-Real-Mixed Intel19-CUDA2-Complex-Mixed Intel19-legacy-CUDA-Real-Mixed Intel19-legacy-CUDA-Complex-Mixed
 do
 
 echo --- Building for $sys `date`
@@ -78,42 +83,70 @@ if [[ $sys == *"ClangDev"* ]]; then
   #define and load compiler
   module load llvm/master-latest
   module load openmpi/4.0.2-llvm
-  export CC=mpicc
-  export CXX=mpicxx
 
-  CTEST_FLAGS="$CTEST_FLAGS -DCMAKE_C_FLAGS=-march=skylake -DCMAKE_CXX_FLAGS=-march=skylake"
   if [[ $sys == *"Offload-CUDA"* ]]; then
-    CTEST_FLAGS="$CTEST_FLAGS -DQMC_OPTIONS='-DENABLE_OFFLOAD=ON;-DUSE_OBJECT_TARGET=ON;-DCMAKE_CUDA_HOST_COMPILER=`which gcc`;-DQMC_NIO_MAX_SIZE=16'"
-    CTEST_FLAGS="$CTEST_FLAGS -DENABLE_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES=61"
-    CTEST_LABELS="-L deterministic -LE unstable"
-    export N_CONCURRENT_TESTS=4
+    CTEST_FLAGS="$CTEST_FLAGS -DQMC_OPTIONS='-DENABLE_OFFLOAD=ON;-DUSE_OBJECT_TARGET=ON;-DQMC_NIO_MAX_SIZE=8'"
+    CTEST_FLAGS="$CTEST_FLAGS -DENABLE_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=61 -DBUILD_AFQMC=ON"
+  elif [[ $sys == *"Offload"* ]]; then
+    CTEST_FLAGS="$CTEST_FLAGS -DQMC_OPTIONS='-DENABLE_OFFLOAD=ON;-DOFFLOAD_ARCH=sm_61;-DUSE_OBJECT_TARGET=ON;-DQMC_NIO_MAX_SIZE=8'"
+    CTEST_FLAGS="$CTEST_FLAGS -DBUILD_AFQMC=ON"
   else
     CTEST_FLAGS="$CTEST_FLAGS -DQE_BIN=$QE_BIN"
-    CTEST_LABELS="-L 'deterministic|performance' -LE unstable"
-    export N_CONCURRENT_TESTS=16
+  fi
+
+  export LIBOMP_USE_HIDDEN_HELPER_TASK=OFF
+  if [[ $sys == *"Real-Mixed"* ]]; then
+    #CTEST_LABELS="-E 'long|short-LiH|short-bccH_1x1x1|short-H2O_dimer|short-diamondC_1x1x1|short-diamondC_2x1x1_pp-dmc|short-li2_sto|short-NiO_a4_e48_pp'"
+    CTEST_LABELS="-E 'restart|save|long|developer|He_param|example|cpu_limit|short-LiH|short-bccH_1x1x1|short-H2O_dimer|short-diamondC_1x1x1|short-diamondC_2x1x1_pp-dmc|short-li2_sto|short-NiO_a4_e48_pp|short-C'"
+  else
+    CTEST_LABELS="-L 'deterministic|QMCPACK' -LE unstable -E long"
+  fi
+elif [[ $sys == *"ROCm"* ]]; then
+  #define and load compiler
+  module load rocm/afar001-432 aomp/afar001-432
+  module load openmpi/4.0.2-llvm
+
+  if [[ $sys == *"Offload-CUDA2HIP"* ]]; then
+    CTEST_FLAGS="$CTEST_FLAGS -DQMC_OPTIONS='-DHIP_ARCH=gfx1030;-DENABLE_OFFLOAD=ON;-DQMC_NIO_MAX_SIZE=8'"
+    CTEST_FLAGS="$CTEST_FLAGS -DENABLE_CUDA=ON -DQMC_CUDA2HIP=ON"
+  elif [[ $sys == *"CUDA2HIP"* ]]; then
+    CTEST_FLAGS="$CTEST_FLAGS -DQMC_OPTIONS='-DHIP_ARCH=gfx1030;-DQMC_NIO_MAX_SIZE=8'"
+    CTEST_FLAGS="$CTEST_FLAGS -DENABLE_CUDA=ON -DQMC_CUDA2HIP=ON"
+  elif [[ $sys == *"Offload"* ]]; then
+    CTEST_FLAGS="$CTEST_FLAGS -DQMC_OPTIONS='-DENABLE_OFFLOAD=ON;-DOFFLOAD_TARGET=amdgcn-amd-amdhsa;-DOFFLOAD_ARCH=gfx1030;-DQMC_NIO_MAX_SIZE=8'"
+  else
+    CTEST_FLAGS="$CTEST_FLAGS -DQE_BIN=$QE_BIN"
+  fi
+
+  if [[ $sys == *"Real-Mixed"* ]]; then
+    #CTEST_LABELS="-E 'long|short-LiH|short-bccH_1x1x1|short-H2O_dimer|short-diamondC_1x1x1|short-diamondC_2x1x1_pp-dmc|short-li2_sto|short-NiO_a4_e48_pp'"
+    CTEST_LABELS="-E 'restart|save|long|developer|He_param|example|cpu_limit|short-LiH|short-bccH_1x1x1|short-H2O_dimer|short-diamondC_1x1x1|short-diamondC_2x1x1_pp-dmc|short-li2_sto|short-NiO_a4_e48_pp|short-C'"
+  else
+    CTEST_LABELS="-L 'deterministic|QMCPACK' -LE unstable -E long"
   fi
 elif [[ $sys == *"Intel"* ]]; then
   #define and load compiler
-  module load intel/18.4
+  module load intel/20.2
   module load openmpi/4.0.2-intel
-  export CC=mpicc
-  export CXX=mpicxx
 
   CTEST_FLAGS="$CTEST_FLAGS -DCMAKE_C_FLAGS=-xCOMMON-AVX512 -DCMAKE_CXX_FLAGS=-xCOMMON-AVX512"
   if [[ $sys == *"-CUDA2"* ]]; then
     CTEST_FLAGS="$CTEST_FLAGS -DENABLE_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES=61 -DQMC_OPTIONS='-DQMC_NIO_MAX_SIZE=16'"
-    CTEST_LABELS="-L 'deterministic|performance' -LE unstable"
+    CTEST_LABELS="-L 'deterministic|performance' -LE unstable -E long"
     export N_CONCURRENT_TESTS=4
   elif [[ $sys == *"-legacy-CUDA"* ]]; then
     CTEST_FLAGS="$CTEST_FLAGS -DQMC_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES=61 -DQMC_OPTIONS='-DQMC_NIO_MAX_SIZE=16'"
-    CTEST_LABELS="-L 'deterministic|performance' -LE unstable"
+    CTEST_LABELS="-L 'deterministic|performance' -LE unstable -E long"
     export N_CONCURRENT_TESTS=1
   else
     CTEST_FLAGS="$CTEST_FLAGS -DQE_BIN=$QE_BIN"
+    CTEST_LABELS="-E long"
     export N_CONCURRENT_TESTS=16
   fi
 fi
 
+CTEST_FLAGS="-DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx $CTEST_FLAGS"
+
 # compiler independent options
 if [[ $sys == *"-Complex"* ]]; then
   CTEST_FLAGS="$CTEST_FLAGS -DQMC_COMPLEX=1"
@@ -132,8 +165,15 @@ fi
 mkdir $folder
 cd $folder
 
+logfile=$place/log/$entry/$mydate/${QMCPACK_TEST_SUBMIT_NAME}.log
+echo "$CTEST_FLAGS $CTEST_LABELS" | tee $logfile
+echo PATH=$PATH >> $logfile
+echo OMPI_CXX=${OMPI_CXX} >> $logfile
+mpicxx -v >> $logfile
+echo >> $logfile
+
 numactl -N $NUMA_ID \
-ctest $CTEST_FLAGS $CTEST_LABELS -S $PWD/../CMake/ctest_script.cmake,release -VV -E 'long' --timeout 800 &> $place/log/$entry/$mydate/${QMCPACK_TEST_SUBMIT_NAME}.log
+ctest $CTEST_FLAGS $CTEST_LABELS -S $PWD/../CMake/ctest_script.cmake,release -VV --timeout 800 &>> $logfile
 
 cd ..
 echo --- Finished $sys `date`

From 4d21eb1bd418a4a9678da2ad92c02f0847705b99 Mon Sep 17 00:00:00 2001
From: Ye Luo <yeluo@anl.gov>
Date: Mon, 20 Jun 2022 09:21:56 -0500
Subject: [PATCH 25/36] Update test scripts and remove unused scripts.

---
 tests/test_automation/Jenkinsfile_ornl_vm     |   4 -
 tests/test_automation/README.md               |   5 +-
 .../add_fortran_for_spack_clang.py            |  26 ---
 tests/test_automation/jenkins_ornl_test.sh    |  36 ----
 .../nightly_alcf_hyperion.sh                  | 178 ------------------
 .../spack_supported_package_versions.sh       |  50 -----
 tests/test_automation/start_spack_env.sh      |  14 --
 tests/test_automation/vm_setup.sh             | 164 ----------------
 tests/test_automation/vm_spack_env_cpu.sh     |  25 ---
 9 files changed, 1 insertion(+), 501 deletions(-)
 delete mode 100644 tests/test_automation/Jenkinsfile_ornl_vm
 delete mode 100755 tests/test_automation/add_fortran_for_spack_clang.py
 delete mode 100755 tests/test_automation/jenkins_ornl_test.sh
 delete mode 100755 tests/test_automation/nightly_test_scripts/nightly_alcf_hyperion.sh
 delete mode 100644 tests/test_automation/spack_supported_package_versions.sh
 delete mode 100644 tests/test_automation/start_spack_env.sh
 delete mode 100755 tests/test_automation/vm_setup.sh
 delete mode 100644 tests/test_automation/vm_spack_env_cpu.sh

diff --git a/tests/test_automation/Jenkinsfile_ornl_vm b/tests/test_automation/Jenkinsfile_ornl_vm
deleted file mode 100644
index 66ac751dff..0000000000
--- a/tests/test_automation/Jenkinsfile_ornl_vm
+++ /dev/null
@@ -1,4 +0,0 @@
-library 'qmcpack_shared_jenkins'
-
-common_pipeline(name:"cpu", spack_path:"/var/lib/jenkins/spack", prefix:"vm")
-
diff --git a/tests/test_automation/README.md b/tests/test_automation/README.md
index 5eddb9409d..8ed2b715c3 100644
--- a/tests/test_automation/README.md
+++ b/tests/test_automation/README.md
@@ -18,7 +18,4 @@ correspond to exactly those used in production, since these are
 sometimes tweaked for runtime, minor software version updates etc.
 
 # CI configuration
-Most of these files begin with jenkins. Scrutinize any updates to them
-carefully there is basically no reason they should be touched except by maintainers.
-They depend on a common jenkins pipeline library pulled from QMCPACK/qmcpack_shared_jenkins.
-Updates to that library don't seem to propagate faster than ~10 minutes.
+See github-actions subdirectory and <QMCPACK_ROOT>/.github/workflows/
diff --git a/tests/test_automation/add_fortran_for_spack_clang.py b/tests/test_automation/add_fortran_for_spack_clang.py
deleted file mode 100755
index 859b8ca772..0000000000
--- a/tests/test_automation/add_fortran_for_spack_clang.py
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/usr/bin/env spack python
-
-import spack.config
-# spack uses this as well so it must be there
-import argparse
-import os
-
-parser = argparse.ArgumentParser(description='For chosen spec of gcc add gfortran as fortran compiler for all clang compiler configs')
-parser.add_argument('gcc_spec', type=str, help='spack spec to get gfortran from')
-args = parser.parse_args()
-
-conf_scope = spack.config.ConfigScope('user/linux', os.path.join(os.environ['HOME'],'.spack','linux'))
-compilers_config = conf_scope.get_section_filename('compilers')
-
-gcc_configs = [ c for c in conf_scope.get_section('compilers')['compilers'] if args.gcc_spec in c['compiler']['spec'] ]
-if len(gcc_configs) != 1:
-    raise NameError('gcc version must evaluate to a single gcc spec')
-gcc = gcc_configs[0]
-
-llvm_configs = [ c for c in conf_scope.get_section('compilers')['compilers'] if "clang" in c['compiler']['spec'] ]
-for lconf in llvm_configs:
-    lconf['compiler']['paths']['f77'] = gcc['compiler']['paths']['f77']
-    lconf['compiler']['paths']['fc'] = gcc['compiler']['paths']['fc']
-    lconf['compiler']['modules'].append(args.gcc_spec)
-
-conf_scope.write_section('compilers')
diff --git a/tests/test_automation/jenkins_ornl_test.sh b/tests/test_automation/jenkins_ornl_test.sh
deleted file mode 100755
index ddd35608e6..0000000000
--- a/tests/test_automation/jenkins_ornl_test.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash -x
-
-exit_code=0
-
-echo "starting spack using ${SPACK_ENV_FILE}"
-
-# this depends on SPACK_ROOT being set in Jenkinsfile_xxx
-# it also supplies QMC_IMMUTABLE_FLAGS which makes it a bit more than the
-# environment from the set of loaded spack packages.
-. ${SPACK_ENV_FILE}
-
-cd build_${1}_${2}
-BUILD_DIR=$(pwd)
-echo "BUILD_DIR: ${BUILD_DIR}"
-
-export LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH}
-
-module list
-
-# this keeps tee from eating the exit status
-set -o pipefail
-
-# exclude failing hamiltonian test
-EXCLUDE_FLAG=''
-if [[ $2 == 'mixed' ]]; then
-    EXCLUDE_FLAG='-E unit_test_hamiltonian'
-fi
-
-
-ctest -j${JNK_THREADS} -L unit ${EXCLUDE_FLAG} --output-on-failure --timeout 120 2>&1 | tee ${1}_${2}_ctest.out
-ret=$?
-if [[ ${ret} -ne 0 ]] ; then
-  exit_code=${ret}
-fi
-
-exit ${exit_code}
diff --git a/tests/test_automation/nightly_test_scripts/nightly_alcf_hyperion.sh b/tests/test_automation/nightly_test_scripts/nightly_alcf_hyperion.sh
deleted file mode 100755
index 3cee8e5e17..0000000000
--- a/tests/test_automation/nightly_test_scripts/nightly_alcf_hyperion.sh
+++ /dev/null
@@ -1,178 +0,0 @@
-#!/bin/bash
-# source environment
-# MPI wrappers, MKL, and Intel and GCC compiler
-export PATH=/opt/cmake/current/bin:$PATH
-source /opt/rh/devtoolset-6/enable
-source /opt/intel/2018/parallel_studio_xe_2018/psxevars.sh intel64
-
-export N_PROCS_BUILD=64
-export N_PROCS=64
-
-# KNL NUMA + Memory Mode
-# quit if in a hybrid mode, could run out of memory
-if [grep -i hybrid /var/run/hwloc/knl_memoryside_cache]; then
-    echo Memory Mode equal Hybrid. Quitting.
-    exit 1
-else
-    cat /var/run/hwloc/knl_memoryside_cache
-fi
-
-# timeout
-timeout=1800
-
-# topdir must exist, otherwise it fails
-topdir=/data/ci
-QMC_DATA=/data/ci/NiO
-export BOOST_ROOT=/data/ci/boost_1_61_0
-testdir=${topdir}/scratch/QMCPACK_CI_BUILDS_DO_NOT_REMOVE
-
-if [ -e $topdir ]; then
-
-if [ ! -e $testdir ]; then
-mkdir -p $testdir
-fi
-
-if [ -e $testdir ]; then
-cd $testdir
-
-# Minimize load of GitHub by maintaining a local cloned git used for all builds
-if [ ! -e qmcpack ]; then
-echo --- Cloning QMCPACK git `date`
-git clone https://github.com/QMCPACK/qmcpack.git --depth 1
-else
-cd qmcpack
-echo --- Updating local QMCPACK git `date`
-git pull
-cd ..
-fi
-
-# Sanity check cmake config file present
-if [ -e qmcpack/CMakeLists.txt ]; then
-echo --- QMCPACK git repo contains CMakeLists.txt
-
-# Build Quantum Espresso
-# Compiled only once with the Intel Compiler
-QE_VERSION=6.3
-QE_sysdir=${testdir}/intel2018_QE
-QE_BIN=${QE_sysdir}/qe-${QE_VERSION}/bin
-echo --- QE_BIN set to ${QE_BIN}
-
-# Always start from clean build, just in case we updated the QE patch.
-if [ -e ${QE_sysdir} ]; then
-    rm -rf ${QE_sysdir}
-fi
-
-mkdir ${QE_sysdir}
-
-# Download QE and patch it		
-cd ${QE_sysdir}
-cp -p ../qmcpack/external_codes/quantum_espresso/*${QE_VERSION}* .
-./download_and_patch_qe${QE_VERSION}.sh
-cd qe-${QE_VERSION}
-
-# Hack to get QE build to build and link against proper libraries on KNL
-# Eventually, Copy make.sys that is known to work. 
-cp /data/ci/auxfiles/configure-qe-knl-omp.sh .
-cp /data/ci/auxfiles/configure-qe-libxsmm.mak .
-cp /data/ci/auxfiles/configure-qe-tbbmalloc.mak .
-
-echo --- Configure QE ${QE_VERSION}$
-./configure-qe-knl-omp.sh
-# HDF5 support in QE 6.3 is buggy, revert to older file I/O format
-sed -i 's/D__HDF5/D__HDF5_C/' make.inc 
-echo --- Building QE ${QE_VERSION}$
-# make pwall # parallel build fails due to incorrect dependency
-make -j 64 pw
-make -j 64 pp
-
-# Make fault-tolerant, maybe QE did not download properly or there
-# was a build failure
-if [ -e ${QE_BIN}/pw.x ]; then
-    echo -- QE ${QE_VERSION} was built properly.
-else
-    echo -- QE ${QE_VERSION} failed to build.
-    exit 1
-fi
-
-
-
-echo --- Starting test builds and tests
-
-for sys in build_gcc build_gcc_complex build_gcc_mixed build_gcc_complex_mixed build_intel2018 build_intel2018_complex build_intel2018_mixed build_intel2018_complex_mixed
-do
-
-echo --- Building for $sys `date`
-
-cd ${testdir}
-
-if [ -e $sys ]; then
-rm -rf $sys
-fi
-mkdir $sys
-cd $sys
-
-# export PATH=/opt/local/bin:/opt/local/sbin:/usr/local/cuda/bin/:/usr/lib64/openmpi/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin
-# export LD_LIBRARY_PATH=/usr/lib64/openmpi/lib:/usr/local/cuda-7.0/lib64
-
-case $sys in
-    "build_gcc")
-	export QMCPACK_TEST_SUBMIT_NAME=GCC-MKL-SoA-Release
-	ctest -DQMC_COMPLEX=0 -DQMC_MIXED_PRECISION=0 -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -DQE_BIN=${QE_BIN} -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -E 'long' -VV --timeout $timeout
-	;;
-    "build_gcc_complex")
-	export QMCPACK_TEST_SUBMIT_NAME=GCC-MKL-Complex-SoA-Release
-	ctest -DQMC_COMPLEX=1 -DQMC_MIXED_PRECISION=0 -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -DQE_BIN=${QE_BIN} -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -E 'long' -VV --timeout $timeout
-	;;
-    "build_gcc_mixed")
-	export QMCPACK_TEST_SUBMIT_NAME=GCC-MKL-Mixed-SoA-Release
-	ctest -DQMC_COMPLEX=0 -DQMC_MIXED_PRECISION=1 -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -DQE_BIN=${QE_BIN} -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -E 'long' -VV --timeout $timeout
-	;;
-    "build_gcc_complex_mixed")
-	export QMCPACK_TEST_SUBMIT_NAME=GCC-MKL-Complex-Mixed-SoA-Release
-	ctest -DQMC_COMPLEX=1 -DQMC_MIXED_PRECISION=1 -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -DQE_BIN=${QE_BIN} -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -E 'long' -VV --timeout $timeout
-	;;
-    "build_intel2018")
-	# intel compiler should already be loaded
-	export QMCPACK_TEST_SUBMIT_NAME=Intel2018-SoA-Release
-	ctest -DQMC_COMPLEX=0 -DQMC_MIXED_PRECISION=0 -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=mpiicpc -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -DQE_BIN=${QE_BIN} -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -E 'long' -VV --timeout $timeout
-	;;
-    "build_intel2018_complex")
-	# intel compiler should already be loaded
-	export QMCPACK_TEST_SUBMIT_NAME=Intel2018-Complex-SoA-Release
-	ctest -DQMC_COMPLEX=1 -DQMC_MIXED_PRECISION=0 -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=mpiicpc -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -DQE_BIN=${QE_BIN} -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -E 'long' -VV --timeout $timeout
-	;;
-    "build_intel2018_mixed")
-	# intel compiler should already be loaded
-	export QMCPACK_TEST_SUBMIT_NAME=Intel2018-Mixed-SoA-Release
-	ctest -DQMC_COMPLEX=0 -DQMC_MIXED_PRECISION=1 -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=mpiicpc -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -DQE_BIN=${QE_BIN} -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -E 'long' -VV --timeout $timeout
-	;;
-    "build_intel2018_complex_mixed")
-	# intel compiler should already be loaded
-	export QMCPACK_TEST_SUBMIT_NAME=Intel2018-Complex-Mixed-SoA-Release
-	ctest -DQMC_COMPLEX=1 -DQMC_MIXED_PRECISION=1 -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=mpiicpc -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -DQE_BIN=${QE_BIN} -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -E 'long' -VV --timeout $timeout
-	;;
-    *)
-	echo "ERROR: Unknown build type $sys"
-	;;
-esac
-
-done # termination for $sys loop
-
-else
-    echo  "ERROR: No CMakeLists. Bad git clone or update."
-    exit 1
-fi # termination for if block which checks CMakeLists.txt
-
-
-else
-echo "ERROR: Unable to make test directory ${testdir}"
-exit 1
-fi # termination for if block which checks testing directory
-
-else
-echo "ERROR: No top level directory"
-exit 1
-fi # termination for if block which checks top level directory
-
-
-
diff --git a/tests/test_automation/spack_supported_package_versions.sh b/tests/test_automation/spack_supported_package_versions.sh
deleted file mode 100644
index e16336bbc8..0000000000
--- a/tests/test_automation/spack_supported_package_versions.sh
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/bin/bash
-
-# GCC
-# Dates at https://gcc.gnu.org/releases.html
-gcc_vnew=9.2.0 # 2019-08-12
-gcc_vold=7.3.0 # 2018-01-25
-
-#For Intel:
-gcc_vintel=7.4.0 # 2018-12-06
-
-#PGI 19.4
-# makelocalrc configured with 8.3.0 currently
-gcc_vpgi=8.3.0 # 2019-02-22
-
-# For CUDA toolkit compatibility
-gcc_vcuda=8.3.0 #  2019-02-22
-
-# LLVM 
-# Dates at http://releases.llvm.org/
-llvm_vnew=9.0.0 # 2019-09-19
-llvm_vold=5.0.1 # 2017-12-21
-# for CUDA 10.1 update 2
-llvm_vcuda=8.0.0 # 2019-03-
-
-# HDF5
-hdf5_vnew=1.10.5 # Releeased 2019-02-28
-hdf5_vold=1.8.19 # Released 2017-06-16
-
-# CMake 
-# Dates at https://cmake.org/files/
-cmake_vnew=3.16.2 # Released 2019-12-19
-cmake_vold=3.10.2 # Released 2018-01-18
-
-# OpenMPI
-# Dates at https://www.open-mpi.org/software/ompi/v4.0/
-ompi_vnew=4.0.2 # Released 2019-10-07
-ompi_vold=2.1.2 # Released 2017-09-20
-
-libxml2_vnew=2.9.9 # Released 2019-01-03 See http://xmlsoft.org/sources/
-libxml2_vold=2.9.1 # Released 2013-04-19
-
-# FFTW
-# Dates at http://www.fftw.org/release-notes.html
-fftw_vnew=3.3.8 # Released 2018-05-28
-fftw_vold=3.3.4 # Released 2014-03-16
-
-# BOOST
-# Dates at https://www.boost.org/users/history/
-boost_vnew=1.70.0 # Released 2019-04-12
-boost_vold=1.65.1 # Released 2016-05-13
diff --git a/tests/test_automation/start_spack_env.sh b/tests/test_automation/start_spack_env.sh
deleted file mode 100644
index 4dc0f19cd5..0000000000
--- a/tests/test_automation/start_spack_env.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-# This shared script fragment start spack and loads in QMCPACKS currently supported versions of compilers
-# and libraries
-
-. $SPACK_ROOT/share/spack/setup-env.sh
-
-# This is to deal with the fact that we don't know the relationship between the working directory
-# and checkout directory in a relative way.
-# this doesn't deal with symlinks
-SRC="${BASH_SOURCE[0]}"
-SRC_DIR="$( cd -P "$( dirname "$SRC" )" >/dev/null 2>&1 && pwd )"
-
-. ${SRC_DIR}/spack_supported_package_versions.sh
diff --git a/tests/test_automation/vm_setup.sh b/tests/test_automation/vm_setup.sh
deleted file mode 100755
index 0d8ef048f1..0000000000
--- a/tests/test_automation/vm_setup.sh
+++ /dev/null
@@ -1,164 +0,0 @@
-#!/bin/bash
-
-#
-# Installs compilers & libraries for QMCPACK testing via SPACK on a clean VM
-#   - assumes 32 jobs is reasonable 
-#
-#  ./vm_setup.sh        Builds current CI requirements (use with caution specs could become ambiguous)
-#  ./vm_setup.sh clean  *** DESTROYS EXISTING INSTALLATION ***
-#
-#
-
-module() { eval `/usr/bin/modulecmd bash $*`; }
-
-if [[ $1 == "clean" ]]; then
-   rm -r -f $HOME/spack $HOME/.spack
-   mkdir $HOME/.spack
-   cat >$HOME/.spack/config.yaml<<EOF
-config:
-  build_jobs: 32
-  shared_linking: 'rpath'
-EOF
-
-  cd $HOME
-  git clone https://github.com/spack/spack.git
-  cd spack
-# For reproducibility, use a specific version of Spack
-# Use tagged releases https://github.com/spack/spack/releases
-# git checkout v0.13.3
-  git checkout b9dc263801ab8b9ce46e83adec8002c299fe2e44
-#Author: Justin S <3630356+codeandkey@users.noreply.github.com>
-#Date:   Fri Jan 3 15:52:59 2020 -0600
-#
-#    py-intervaltree: new package at 3.0.2 (#14277)
-
-  cd bin
-  ./spack bootstrap
-fi
-
-export SPACK_ROOT=$HOME/spack
-. $SPACK_ROOT/share/spack/setup-env.sh
-
-echo --- Spack list
-spack find
-echo --- Spack compilers
-spack compilers
-echo --- Modules list
-module list
-echo --- End listings
-
-#
-# Versions should be consistent with setup script
-#
-
-# GCC
-# Dates at https://gcc.gnu.org/releases.html
-gcc_vnew=9.2.0 # 2019-08-12
-gcc_vold=7.3.0 # 2018-01-25
-
-#For Intel:
-gcc_vintel=7.4.0 # 2018-12-06
-
-#PGI 19.4
-# makelocalrc configured with 8.3.0 currently
-gcc_vpgi=8.3.0 # 2019-02-22
-
-# For CUDA toolkit compatibility
-gcc_vcuda=8.3.0 #  2019-02-22
-
-# LLVM 
-# Dates at http://releases.llvm.org/
-llvm_vnew=9.0.0 # 2019-09-19
-llvm_vold=5.0.1 # 2017-12-21
-# for CUDA 10.1 update 2
-llvm_vcuda=8.0.0 # 2019-03-
-
-# HDF5
-hdf5_vnew=1.10.5 # Releeased 2019-02-28
-hdf5_vold=1.8.19 # Released 2017-06-16
-
-# CMake 
-# Dates at https://cmake.org/files/
-cmake_vnew=3.16.2 # Released 2019-12-19
-cmake_vold=3.10.2 # Released 2018-01-18
-
-# OpenMPI
-# Dates at https://www.open-mpi.org/software/ompi/v4.0/
-ompi_vnew=4.0.2 # Released 2019-10-07
-ompi_vold=2.1.2 # Released 2017-09-20
-
-libxml2_vnew=2.9.9 # Released 2019-01-03 See http://xmlsoft.org/sources/
-libxml2_vold=2.9.1 # Released 2013-04-19
-
-# FFTW
-# Dates at http://www.fftw.org/release-notes.html
-fftw_vnew=3.3.8 # Released 2018-05-28
-fftw_vold=3.3.4 # Released 2014-03-16
-
-# BOOST
-# Dates at https://www.boost.org/users/history/
-boost_vnew=1.70.0 # Released 2019-04-12
-boost_vold=1.65.1 # Released 2016-05-13
-
-echo --- START env `date`
-echo --- gcc@${gcc_vold}
-spack install gcc@${gcc_vold}
-spack load gcc@${gcc_vold}
-spack compiler find
-echo --- Spack compilers
-spack compilers
-
-if [[ $1 == "clean" ]]; then
-   cat >$HOME/.spack/packages.yaml<<EOF
-packages:
-  all:
-    compiler: [gcc@${gcc_vold}]
-EOF
-fi
-
-spack install --no-checksum libxml2@${libxml2_vold}%gcc@${gcc_vold}
-spack install cmake@${cmake_vold}%gcc@${gcc_vold}
-spack install boost@${boost_vold}%gcc@${gcc_vold}
-spack install openmpi@${ompi_vold}%gcc@${gcc_vold}
-spack install hdf5@${hdf5_vold}~mpi %gcc@${gcc_vold}
-spack install fftw@${fftw_vold}%gcc@${gcc_vold}
-spack unload gcc@${gcc_vold}
-echo --- gcc@${gcc_vnew}
-spack install gcc@${gcc_vnew}
-spack load gcc@${gcc_vnew}
-spack compiler find
-spack install --no-checksum libxml2@${libxml2_vnew}%gcc@${gcc_vnew}
-#spack install --no-checksum cmake@${cmake_vnew}%gcc@${gcc_vnew}
-# BUILD LATEST CMAKE WITH OLD GCC DUE TO BUILD FAILURE WITH GCC_VNEW
-spack install --no-checksum cmake@${cmake_vnew}%gcc@${gcc_vold}
-spack install boost@${boost_vnew}%gcc@${gcc_vnew}
-spack install openmpi@${ompi_vnew}%gcc@${gcc_vnew}^libxml2@${libxml2_vnew}%gcc@${gcc_vnew}
-#spack install hdf5@${hdf5_vnew}%gcc@${gcc_vnew}^openmpi@${ompi_vnew}%gcc@${gcc_vnew}
-spack install hdf5@${hdf5_vnew}~mpi %gcc@${gcc_vnew}
-spack install fftw@${fftw_vnew}%gcc@${gcc_vnew}
-spack unload gcc@${gcc_vnew}
-echo --- llvm@${llvm_vnew}
-spack install llvm@${llvm_vnew}
-spack load llvm@${llvm_vnew}
-spack compiler find
-spack unload llvm@${llvm_vnew}
-
-# before you can build openmpi you need to define a fortran compiler for clang
-# we do this with a python script since we need to modify relatively free form yaml.
-# which btw we assume is in the same directory
-spack python ./add_fortran_for_spack_clang.py gcc@${gcc_vnew}
-spack install openmpi@${ompi_vnew}%clang@${llvm_vnew}^libxml2@${libxml2_vnew}%gcc@${gcc_vnew}
-
-echo --- Convenience
-spack install git
-echo --- Python setup for NEXUS
-spack install py-numpy@1.18.0%gcc@${gcc_vnew}
-spack install py-h5py^py-numpy@1.18.0%gcc@${gcc_vnew}
-spack install py-pandas@0.25.1^py-numpy@1.18.0%gcc@${gcc_vnew}
-spack install py-scipy@1.4.1^py-numpy@1.18.0%gcc@${gcc_vnew}
-spack activate py-numpy@1.18.0%gcc@${gcc_vnew}
-spack activate py-h5py^py-numpy@1.18.0%gcc@${gcc_vnew}
-spack activate py-pandas@0.25.1^py-numpy@1.18.0%gcc@${gcc_vnew}
-spack activate py-scipy@1.4.1^py-numpy@1.18.0%gcc@${gcc_vnew}
-
-echo --- FINISH `date`
diff --git a/tests/test_automation/vm_spack_env_cpu.sh b/tests/test_automation/vm_spack_env_cpu.sh
deleted file mode 100644
index 4befb842f2..0000000000
--- a/tests/test_automation/vm_spack_env_cpu.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-SRC="${BASH_SOURCE[0]}"
-SRC_DIR="$( cd -P "$( dirname "$SRC" )" >/dev/null 2>&1 && pwd )"
-. ${SRC_DIR}/start_spack_env.sh
-
-spack load gcc@$gcc_vnew%gcc@$gcc_vold
-spack load boost@$boost_vnew%gcc@$gcc_vnew
-spack load hdf5@$hdf5_vnew%gcc@$gcc_vnew~mpi
-spack load openssl@1.1.1d%gcc@9.2.0+systemcerts
-spack load cmake@$cmake_vnew%gcc@$gcc_vnew
-spack load hwloc%gcc@$gcc_vnew
-spack load libiconv%gcc@$gcc_vnew
-spack load openmpi@$ompi_vnew%gcc@$gcc_vnew
-spack load libxml2@$libxml2_vnew%gcc@$gcc_vnew
-spack load fftw@$fftw_vnew%gcc@$gcc_vnew
-spack load openblas%gcc@$gcc_vnew
-spack load netlib-lapack%gcc@$gcc_vnew
-
-#if you've installed more than one python for the new compiler this will fail
-spack load python%gcc@$gcc_vnew
-
-QMC_IMMUTABLE_FLAGS="-DBUILD_AFQMC=1"
-
-echo ${PATH}

From e7027ae96a7b55e214896f5c149318f0b1196084 Mon Sep 17 00:00:00 2001
From: Mark Dewing <markdewing@gmail.com>
Date: Mon, 20 Jun 2022 12:16:12 -0500
Subject: [PATCH 26/36] Orbital rotation test with legacy driver

The gen_rotated_lcao_wf.py file is extended to generate QMC averages for
the parameter gradients.

A test for the legacy driver using one thread is introduced.
---
 .../tests/gen_rotated_lcao_wf.py              | 136 +++++++++++++++++-
 src/QMCWaveFunctions/tests/stats.py           |  43 ++++++
 tests/molecules/He_param/CMakeLists.txt       |  26 ++++
 .../He_param/He_orb_rot_param_grad_legacy.xml | 100 +++++++++++++
 4 files changed, 303 insertions(+), 2 deletions(-)
 create mode 100644 src/QMCWaveFunctions/tests/stats.py
 create mode 100644 tests/molecules/He_param/He_orb_rot_param_grad_legacy.xml

diff --git a/src/QMCWaveFunctions/tests/gen_rotated_lcao_wf.py b/src/QMCWaveFunctions/tests/gen_rotated_lcao_wf.py
index 1563ad83e7..cb066cfbb5 100644
--- a/src/QMCWaveFunctions/tests/gen_rotated_lcao_wf.py
+++ b/src/QMCWaveFunctions/tests/gen_rotated_lcao_wf.py
@@ -5,8 +5,11 @@
 #  compute spatial and parameter derivatives
 import autograd.numpy as np
 from autograd import hessian,grad
+from stats import averager
+
+# Point values used in test_RotatedSPOs_LCAO.cpp
+# QMC values used to validate tests/molecules/He_param/He_orb_rot_param_grad_legacy
 
-# Values used in test_RotatedSPOs_LCAO.cpp
 
 def mag(r):
     return np.sqrt(r[0]*r[0] + r[1]*r[1] + r[2]*r[2])
@@ -175,6 +178,10 @@ def print_wf_values(theta1=0.0, theta2=0.0,  use_j=False, B=0.0):
 
     print("")
 
+
+# Generate the wavefunction values for a single set of electron positions
+# used in test_RotatedSPOs_LCAO.cpp
+
 def print_point_values():
     r1 = np.array([1.0, 2.0, 3.0])
     r2 = np.array([0.0, 1.1, 2.2])
@@ -185,7 +192,132 @@ def print_point_values():
 
     print_wf_values(theta1=0.0, theta2=0.0, use_j=True, B=0.1)
 
+def run_qmc(VP):
+    r1 = np.array([1.0, 2.0, 3.0])
+    r2 = np.array([0.0, 1.1, 2.2])
+
+    # Outer loop for statistics
+    nblock = 20
+
+    # Loop to gather data
+    nstep = 100
+
+    # Loop for decorrelation
+    nsubstep = 10
+
+    nelectron = 2
+
+    # Step size for trial move
+    delta = 1.2
+
+    r = np.zeros((2,3))
+    r[0,:] = r1
+    r[1,:] = r2
+
+    r_old = np.zeros(3)
+    r_trial = np.zeros(3)
+
+    block_dp_ave = averager()
+    total_en_ave = averager()
+
+    # Outer loop for statistics
+    for nb in range(nblock):
+
+        naccept = 0
+        en_ave = averager()
+        dpsi_ave = averager()
+        deloc_ave = averager()
+        eloc_dpsi_ave = averager()
+
+        # Loop to gather data
+        for ns in range(nstep):
+            # Loop for decorrelation
+            for nss in range(nsubstep):
+                # Loop over electrons
+                for ne in range(nelectron):
+
+                    wf_old = psi(r[0,:], r[1,:], VP)
+                    r_old[:] = r[ne,:]
+
+                    change = delta*(np.random.rand(3)-0.5)
+
+                    r_trial[:] = r_old[:] + change
+                    r[ne,:] = r_trial[:]
+
+                    wf_new = psi(r[0,:], r[1,:], VP)
+
+                    wf_ratio = (wf_new/wf_old)**2
+
+                    if wf_ratio > np.random.random():
+                        naccept += 1
+                    else:
+                        r[ne,:] = r_old[:]
+
+            eloc = local_energy(r[0,:], r[1,:], VP)
+            en_ave.add_value(eloc)
+
+            psi_val = psi(r[0,:], r[1,:], VP)
+            dpsi_val = dpsi(r[0,:], r[1,:], VP)
+            dpsi_ave.add_value(dpsi_val/psi_val)
+
+            deloc_val = dlocal_energy(r[0,:], r[1,:], VP)
+            deloc_ave.add_value(deloc_val)
+
+            eloc_dpsi_ave.add_value(eloc * dpsi_val/psi_val)
+
+
+        en = en_ave.average()
+        en_err = en_ave.error()
+        ar = naccept/(nstep*nsubstep*nelectron)
+        print('block = ',nb, ' energy = ',en,en_err,' acceptance ratio = ',ar)
+
+        dp = dpsi_ave.average()
+        dp_err = dpsi_ave.error()
+
+        deloc = deloc_ave.average()
+        deloc_err = deloc_ave.error()
+
+        eloc_dpsi = eloc_dpsi_ave.average()
+        eloc_dpsi_err = eloc_dpsi_ave.error()
+
+        # For the parameter derivative formula, see
+        # https://github.com/QMCPACK/qmc_algorithms/blob/master/Variational/Parameter_Optimization.ipynb
+
+        dg = deloc + 2*eloc_dpsi - 2*en*dp
+        block_dp_ave.add_value(dg)
+
+    dg = block_dp_ave.average()
+    dg_err = block_dp_ave.error()
+    print('parameter values = ',VP)
+    print('parameter derivatives = ',dg)
+    print('parameter derivative errors = ',dg_err)
+
+
+def run_qmc_parameter_derivatives():
+    global use_jastrow
+    use_jastrow=False
+    theta1 = 0.1
+    theta2 = 0.1
+    VP = np.array([theta1, theta2])
+
+    run_qmc(VP)
+
+# Some results from run_qmc_parameter_derivatives
+
+# Run took about 10 minutes on laptop
+# nblock=20, nstep=1000, nsubstep=10
+# parameter values =  [0.1 0.1]
+# parameter derivatives =  [-0.20164722 -0.18347461]
+# parameter derivative errors =  [0.01201481 0.01314164]
+
+# Run took about 40 minutes on laptop
+# nblock=40, nstep=2000, nsubstep=10
+# parameter values =  [0.1 0.1]
+# parameter derivatives =  [-0.2204924  -0.21471184]
+# parameter derivative errors =  [0.00493837 0.00571082]
+
 
 if __name__=='__main__':
-    print_point_values()
+    #print_point_values()
+    run_qmc_parameter_derivatives()
 
diff --git a/src/QMCWaveFunctions/tests/stats.py b/src/QMCWaveFunctions/tests/stats.py
new file mode 100644
index 0000000000..61794a9a3c
--- /dev/null
+++ b/src/QMCWaveFunctions/tests/stats.py
@@ -0,0 +1,43 @@
+
+import math
+import numpy as np
+
+class averager:
+  """Compute average, variance, and standard error for a set of data"""
+  def __init__(self):
+    self.sum = 0.0
+    self.sum_sq = 0.0
+    self.norm = 0
+
+  def add_value(self,v):
+    self.sum += v
+    self.sum_sq += v*v
+    self.norm += 1
+
+  def average(self):
+    if (self.norm == 0):
+      return 0.0
+    else:
+      return self.sum/self.norm
+
+  def variance(self):
+    var = 0.0
+    if (self.norm != 0):
+      var = (self.sum_sq - self.sum*self.sum/self.norm)/self.norm
+    return var
+
+  def std_dev(self):
+    return math.sqrt(self.variance())
+
+  def error(self):
+    err = 0.0
+    if (self.norm > 1):
+      var = self.variance()
+      err = np.sqrt(var/(self.norm-1))
+    return err
+
+  def merge(self, other):
+      self.sum += other.sum
+      self.sum_sq += other.sum_sq
+      self.norm += other.norm
+
diff --git a/tests/molecules/He_param/CMakeLists.txt b/tests/molecules/He_param/CMakeLists.txt
index 02affdc006..c77a43a599 100644
--- a/tests/molecules/He_param/CMakeLists.txt
+++ b/tests/molecules/He_param/CMakeLists.txt
@@ -78,6 +78,32 @@ if(NOT QMC_CUDA)
   endif()
 
 
+  # Orbital Rotation
+
+  # Two atomic orbitals, no jastrow
+
+  list(APPEND HE_ORB_ROT_PARAM spo-up_orb_rot_0000_0001  -0.218  0.01) # scalar name, value, error
+  list(APPEND HE_ORB_ROT_PARAM spo-down_orb_rot_0000_0001  -0.218  0.01) # scalar name, value, error
+
+  qmc_run_and_check_custom_scalar(
+    BASE_NAME
+    He_orb_rot_param_grad_legacy
+    BASE_DIR
+    "${qmcpack_SOURCE_DIR}/tests/molecules/He_param"
+    PREFIX
+    He_orb_rot_param_grad_legacy.param
+    INPUT_FILE
+    He_orb_rot_param_grad_legacy.xml
+    PROCS
+    1
+    THREADS
+    1
+    SERIES
+    0
+    SCALAR_VALUES
+    HE_ORB_ROT_PARAM)
+
+
   else()
     message(VERBOSE "Skipping He_param tests because parameter output is not supported by mixed precison build (QMC_MIXED_PRECISION=1)")
   endif()
diff --git a/tests/molecules/He_param/He_orb_rot_param_grad_legacy.xml b/tests/molecules/He_param/He_orb_rot_param_grad_legacy.xml
new file mode 100644
index 0000000000..71b42b5b81
--- /dev/null
+++ b/tests/molecules/He_param/He_orb_rot_param_grad_legacy.xml
@@ -0,0 +1,100 @@
+<?xml version="1.0"?>
+<simulation>
+  <project id="He_orb_rot_param_grad_legacy" series="0">
+         <parameter name="driver_version">legacy</parameter>
+  </project>
+
+  <!-- Location of atoms -->
+
+  <particleset name="ion0" size="1">
+    <group name="He">
+      <parameter name="charge">2</parameter>
+    </group>
+    <attrib name="position" datatype="posArray">
+      0.0 0.0 0.0
+    </attrib>
+  </particleset>
+
+  <!-- Randomly create electrons around the atomic position -->
+
+  <particleset name="e" random="yes" randomsrc="ion0">
+    <group name="u" size="1">
+      <parameter name="charge">-1</parameter>
+    </group>
+    <group name="d" size="1">
+      <parameter name="charge">-1</parameter>
+    </group>
+  </particleset>
+
+  <!-- Trial wavefunction - use Slater determinant multiplied by a Jastrow factor -->
+
+  <wavefunction name="psi0" target="e">
+    <sposet_collection type="MolecularOrbital">
+      <!-- Use a single Slater Type Orbital (STO) for the basis. Cusp condition is correct. -->
+      <basisset keyword="STO" transform="no">
+        <atomicBasisSet type="STO" elementType="He" normalized="no">
+          <basisGroup rid="R0" l="0" m="0" type="Slater">
+             <radfunc n="1" exponent="2.0"/>
+          </basisGroup>
+          <basisGroup rid="R1" l="0" m="0" type="Slater">
+             <radfunc n="2" exponent="1.0"/>
+          </basisGroup>
+        </atomicBasisSet>
+      </basisset>
+      <sposet basisset="LCAOBSet" name="spo-up" size="2" optimize="yes">
+          <opt_vars>0.1</opt_vars>
+          <coefficient id="updetC" type="Array" size="2">
+            1.0 0.0
+            0.0 1.0
+          </coefficient>
+      </sposet>
+      <sposet basisset="LCAOBSet" name="spo-down" size="2" optimize="yes">
+          <opt_vars>0.1</opt_vars>
+          <coefficient id="downdetC" type="Array" size="2">
+            1.0 0.0
+            0.0 1.0
+          </coefficient>
+      </sposet>
+    </sposet_collection>
+    <determinantset type="MO" key="STO" transform="no" source="ion0">
+      <slaterdeterminant>
+        <determinant id="spo-up" spin="1" size="2"/>
+        <determinant id="spo-down" spin="-1" size="1"/>
+      </slaterdeterminant>
+    </determinantset>
+
+
+  </wavefunction>
+
+  <!-- Hamiltonian - the energy of interactions between particles -->
+
+  <hamiltonian name="h0" type="generic" target="e">
+    <!-- Electon-electron -->
+    <pairpot name="ElecElec" type="coulomb" source="e" target="e"/>
+    <!-- Electon-ion -->
+    <pairpot name="Coulomb" type="coulomb" source="ion0" target="e"/>
+    <!-- Ion-ion (not needed for a single atom) -->
+    <!--<constant name="IonIon" type="coulomb" source="ion0" target="ion0"/>-->
+  </hamiltonian>
+
+  <!-- QMC method(s) to run -->
+
+  <loop max="10">
+    <qmc method="linear" move="pbyp" checkpoint="-1" gpu="no">
+      <optimize method="gradient_test">
+        <parameter name="output_param_file">yes</parameter>
+      </optimize>
+      <parameter name="blocks">     100  </parameter>
+
+      <parameter name="warmupsteps"> 25 </parameter>
+      <parameter name="steps"> 10 </parameter>
+      <parameter name="substeps"> 20 </parameter>
+      <parameter name="timestep"> 0.5 </parameter>
+      <parameter name="samples"> 1000 </parameter>
+      <cost name="energy">                   1.0 </cost>
+      <cost name="reweightedvariance">       0.00 </cost>
+    </qmc>
+  </loop>
+
+
+</simulation>

From 0a0d55acf2b433ff91f4ba3149bc9e2b8639f02a Mon Sep 17 00:00:00 2001
From: Jakub Kurzak <jakurzak@amd.com>
Date: Mon, 20 Jun 2022 15:36:03 -0400
Subject: [PATCH 27/36] Do not use cudaDeviceProp maxTexture1D in HIP

---
 src/QMCHamiltonians/ECPComponentBuilder.2.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/QMCHamiltonians/ECPComponentBuilder.2.cpp b/src/QMCHamiltonians/ECPComponentBuilder.2.cpp
index 7d28e17902..f4aaf6eeac 100644
--- a/src/QMCHamiltonians/ECPComponentBuilder.2.cpp
+++ b/src/QMCHamiltonians/ECPComponentBuilder.2.cpp
@@ -474,7 +474,7 @@ void ECPComponentBuilder::doBreakUp(const std::vector<int>& angList,
                                     RealType rmax,
                                     mRealType Vprefactor)
 {
-#ifdef QMC_CUDA
+#if defined(QMC_CUDA) && !defined(QMC_CUDA2HIP)
   int device;
   cudaCheck(cudaGetDevice(&device));
   cudaDeviceProp deviceProp;

From cdf7430c50fa69794c49d52d5bdc0f61bfb41a4d Mon Sep 17 00:00:00 2001
From: William F Godoy <williamfgc@yahoo.com>
Date: Wed, 22 Jun 2022 11:51:40 -0400
Subject: [PATCH 28/36] Add MPI support to ROCm legacy CI

For CI running on nitrogen
---
 .../workflows/ci-github-actions-self-hosted.yaml |  8 ++++----
 .../github-actions/ci/run_step.sh                | 16 ++++++++++++----
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/ci-github-actions-self-hosted.yaml b/.github/workflows/ci-github-actions-self-hosted.yaml
index d2f890091a..5d4b4c3e9f 100644
--- a/.github/workflows/ci-github-actions-self-hosted.yaml
+++ b/.github/workflows/ci-github-actions-self-hosted.yaml
@@ -222,10 +222,10 @@ jobs:
             ROCm-Clang13-NoMPI-CUDA2HIP-Real,
             ROCm-Clang13-NoMPI-CUDA2HIP-Complex-Mixed,
             ROCm-Clang13-NoMPI-CUDA2HIP-Complex,
-            ROCm-Clang13-NoMPI-Legacy-CUDA2HIP-Real-Mixed,
-            ROCm-Clang13-NoMPI-Legacy-CUDA2HIP-Real,
-            ROCm-Clang13-NoMPI-Legacy-CUDA2HIP-Complex-Mixed,
-            ROCm-Clang13-NoMPI-Legacy-CUDA2HIP-Complex,
+            ROCm-Clang13-MPI-Legacy-CUDA2HIP-Real-Mixed,
+            ROCm-Clang13-MPI-Legacy-CUDA2HIP-Real,
+            ROCm-Clang13-MPI-Legacy-CUDA2HIP-Complex-Mixed,
+            ROCm-Clang13-MPI-Legacy-CUDA2HIP-Complex,
           ]
 
     steps:
diff --git a/tests/test_automation/github-actions/ci/run_step.sh b/tests/test_automation/github-actions/ci/run_step.sh
index 5d15188167..1ec85c84dc 100755
--- a/tests/test_automation/github-actions/ci/run_step.sh
+++ b/tests/test_automation/github-actions/ci/run_step.sh
@@ -231,13 +231,21 @@ case "$1" in
               -DQMC_DATA=$QMC_DATA_DIR \
               ${GITHUB_WORKSPACE}
       ;;
-      *"ROCm-Clang13-NoMPI-Legacy-CUDA2HIP"*)
+      *"ROCm-Clang13-MPI-Legacy-CUDA2HIP"*)
         echo 'Configure for building CUDA2HIP with clang compilers shipped with ROCM on AMD hardware'
+
+        export OMPI_CC=/opt/rocm/llvm/bin/clang
+        export OMPI_CXX=/opt/rocm/llvm/bin/clang++
+
+        # Make current environment variables available to subsequent steps
+        echo "OMPI_CC=/opt/rocm/llvm/bin/clang" >> $GITHUB_ENV
+        echo "OMPI_CXX=/opt/rocm/llvm/bin/clang++" >> $GITHUB_ENV
+
         cmake -GNinja \
-              -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \
-              -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
+              -DCMAKE_C_COMPILER=/usr/lib64/openmpi/bin/mpicc \
+              -DCMAKE_CXX_COMPILER=/usr/lib64/openmpi/bin/mpicxx \
+              -DMPIEXEC_EXECUTABLE=/usr/lib64/openmpi/bin/mpirun \
               -DQMC_CUDA=1 \
-              -DQMC_MPI=0 \
               -DQMC_CUDA2HIP=ON \
               -DCMAKE_PREFIX_PATH="/opt/OpenBLAS/0.3.18" \
               -DQMC_COMPLEX=$IS_COMPLEX \

From e0afba1c2427a17fb130f4e3aa5f18fd30d26f17 Mon Sep 17 00:00:00 2001
From: Ye Luo <yeluo@anl.gov>
Date: Wed, 22 Jun 2022 20:26:28 -0500
Subject: [PATCH 29/36] Rewrite loop to avoid NVHPC hang.

---
 .../Fermion/MatrixDelayedUpdateCUDA.h                | 12 +++++++-----
 src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h | 12 +++++++-----
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h b/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h
index 55c187a4b9..a2fbc37d80 100644
--- a/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h
+++ b/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h
@@ -526,13 +526,15 @@ class MatrixDelayedUpdateCUDA
       int success = ompBLAS::gemv(dummy_handle, 'T', norb, norb, cone, Ainv_ptr, lda, phiV_ptr, 1, czero, temp_ptr, 1);
       if (success != 0)
         throw std::runtime_error("ompBLAS::gemv failed.");
-      PRAGMA_OFFLOAD("omp target is_device_ptr(Ainv_ptr, temp_ptr, rcopy_ptr)")
+
+      PRAGMA_OFFLOAD("omp target parallel for simd is_device_ptr(Ainv_ptr, temp_ptr, rcopy_ptr)")
+      for (int i = 0; i < norb; i++)
       {
-        temp_ptr[rowchanged] -= cone;
-        PRAGMA_OFFLOAD("omp parallel for simd")
-        for (int i = 0; i < norb; i++)
-          rcopy_ptr[i] = Ainv_ptr[rowchanged * lda + i];
+        rcopy_ptr[i] = Ainv_ptr[rowchanged * lda + i];
+        if (i == 0)
+          temp_ptr[rowchanged] -= cone;
       }
+
       success = ompBLAS::ger(dummy_handle, norb, norb, static_cast<Value>(FullPrecValue(-1) / c_ratio_in), rcopy_ptr, 1,
                              temp_ptr, 1, Ainv_ptr, lda);
       if (success != 0)
diff --git a/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h b/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h
index 6ce11595b1..cbb1efe06f 100644
--- a/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h
+++ b/src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h
@@ -222,13 +222,15 @@ class MatrixUpdateOMPTarget
       success = ompBLAS::gemv(dummy_handle, 'T', norb, norb, cone, Ainv_ptr, lda, phiV_ptr, 1, czero, temp_ptr, 1);
       if (success != 0)
         throw std::runtime_error("ompBLAS::gemv failed.");
-      PRAGMA_OFFLOAD("omp target is_device_ptr(Ainv_ptr, temp_ptr, rcopy_ptr)")
+
+      PRAGMA_OFFLOAD("omp target parallel for simd is_device_ptr(Ainv_ptr, temp_ptr, rcopy_ptr)")
+      for (int i = 0; i < norb; i++)
       {
-        temp_ptr[rowchanged] -= cone;
-        PRAGMA_OFFLOAD("omp parallel for simd")
-        for (int i = 0; i < norb; i++)
-          rcopy_ptr[i] = Ainv_ptr[rowchanged * lda + i];
+        rcopy_ptr[i] = Ainv_ptr[rowchanged * lda + i];
+        if (i == 0)
+          temp_ptr[rowchanged] -= cone;
       }
+
       success = ompBLAS::ger(dummy_handle, norb, norb, static_cast<Value>(-1.0 / c_ratio_in), rcopy_ptr, 1, temp_ptr, 1,
                              Ainv_ptr, lda);
       if (success != 0)

From 49bfe4b19183443c2ef4ab08a01a1a49a47e67f2 Mon Sep 17 00:00:00 2001
From: Ye Luo <yeluo@anl.gov>
Date: Thu, 23 Jun 2022 08:28:52 -0500
Subject: [PATCH 30/36] Remove omp parallel over walkers.

---
 src/Particle/DistanceTable.h                   |  5 -----
 src/Particle/ParticleSet.cpp                   |  4 ----
 src/Particle/SoaDistanceTableAAOMPTarget.h     |  1 -
 src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp   |  1 -
 src/QMCWaveFunctions/SPOSet.cpp                |  5 -----
 src/QMCWaveFunctions/SpinorSet.cpp             |  2 --
 src/QMCWaveFunctions/WaveFunctionComponent.cpp | 12 ------------
 7 files changed, 30 deletions(-)

diff --git a/src/Particle/DistanceTable.h b/src/Particle/DistanceTable.h
index c1fa5b07ac..3175be4596 100644
--- a/src/Particle/DistanceTable.h
+++ b/src/Particle/DistanceTable.h
@@ -103,7 +103,6 @@ class DistanceTable
   virtual void mw_evaluate(const RefVectorWithLeader<DistanceTable>& dt_list,
                            const RefVectorWithLeader<ParticleSet>& p_list) const
   {
-#pragma omp parallel for
     for (int iw = 0; iw < dt_list.size(); iw++)
       dt_list[iw].evaluate(p_list[iw]);
   }
@@ -117,7 +116,6 @@ class DistanceTable
                             const RefVectorWithLeader<ParticleSet>& p_list,
                             const std::vector<bool>& recompute) const
   {
-#pragma omp parallel for
     for (int iw = 0; iw < dt_list.size(); iw++)
       if (recompute[iw])
         dt_list[iw].evaluate(p_list[iw]);
@@ -145,7 +143,6 @@ class DistanceTable
                        const IndexType iat,
                        bool prepare_old = true) const
   {
-#pragma omp parallel for
     for (int iw = 0; iw < dt_list.size(); iw++)
       dt_list[iw].move(p_list[iw], rnew_list[iw], iat, prepare_old);
   }
@@ -174,7 +171,6 @@ class DistanceTable
                                 IndexType jat,
                                 const std::vector<bool>& from_temp)
   {
-#pragma omp parallel for
     for (int iw = 0; iw < dt_list.size(); iw++)
       dt_list[iw].updatePartial(jat, from_temp[iw]);
   }
@@ -192,7 +188,6 @@ class DistanceTable
   virtual void mw_finalizePbyP(const RefVectorWithLeader<DistanceTable>& dt_list,
                                const RefVectorWithLeader<ParticleSet>& p_list) const
   {
-#pragma omp parallel for
     for (int iw = 0; iw < dt_list.size(); iw++)
       dt_list[iw].finalizePbyP(p_list[iw]);
   }
diff --git a/src/Particle/ParticleSet.cpp b/src/Particle/ParticleSet.cpp
index b6c72606fc..14f5c1ee3f 100644
--- a/src/Particle/ParticleSet.cpp
+++ b/src/Particle/ParticleSet.cpp
@@ -380,11 +380,8 @@ void ParticleSet::mw_update(const RefVectorWithLeader<ParticleSet>& p_list, bool
   }
 
   if (!skipSK && p_leader.structure_factor_)
-  {
-#pragma omp parallel for
     for (int iw = 0; iw < p_list.size(); iw++)
       p_list[iw].structure_factor_->updateAllPart(p_list[iw]);
-  }
 }
 
 void ParticleSet::makeMove(Index_t iat, const SingleParticlePos& displ, bool maybe_accept)
@@ -852,7 +849,6 @@ void ParticleSet::mw_loadWalker(const RefVectorWithLeader<ParticleSet>& p_list,
     pset.spins = awalker.spins;
     pset.coordinates_->setAllParticlePos(pset.R);
   };
-#pragma omp parallel for
   for (int iw = 0; iw < p_list.size(); ++iw)
     if (recompute[iw])
       loadWalkerConfig(p_list[iw], walkers[iw]);
diff --git a/src/Particle/SoaDistanceTableAAOMPTarget.h b/src/Particle/SoaDistanceTableAAOMPTarget.h
index 646b5085ee..c850488e6e 100644
--- a/src/Particle/SoaDistanceTableAAOMPTarget.h
+++ b/src/Particle/SoaDistanceTableAAOMPTarget.h
@@ -477,7 +477,6 @@ struct SoaDistanceTableAAOMPTarget : public DTD_BConds<T, D, SC>, public Distanc
     if (!(modes_ & DTModes::NEED_TEMP_DATA_ON_HOST))
       return;
 
-#pragma omp parallel for
     for (int iw = 0; iw < dt_list.size(); iw++)
       dt_list[iw].updatePartial(jat, from_temp[iw]);
   }
diff --git a/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp
index 4da7844190..b668e1c975 100644
--- a/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp
+++ b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp
@@ -715,7 +715,6 @@ void J2OMPTarget<FT>::mw_recompute(const RefVectorWithLeader<WaveFunctionCompone
 {
   auto& wfc_leader = wfc_list.getCastedLeader<J2OMPTarget<FT>>();
   assert(this == &wfc_leader);
-#pragma omp parallel for
   for (int iw = 0; iw < wfc_list.size(); iw++)
     if (recompute[iw])
       wfc_list[iw].recompute(p_list[iw]);
diff --git a/src/QMCWaveFunctions/SPOSet.cpp b/src/QMCWaveFunctions/SPOSet.cpp
index 93ade40335..8a1d2a6ca1 100644
--- a/src/QMCWaveFunctions/SPOSet.cpp
+++ b/src/QMCWaveFunctions/SPOSet.cpp
@@ -52,7 +52,6 @@ void SPOSet::mw_evaluateDetRatios(const RefVectorWithLeader<SPOSet>& spo_list,
                                   std::vector<std::vector<ValueType>>& ratios_list) const
 {
   assert(this == &spo_list.getLeader());
-#pragma omp parallel for
   for (int iw = 0; iw < spo_list.size(); iw++)
   {
     Vector<ValueType> invRow(const_cast<ValueType*>(invRow_ptr_list[iw]), psi_list[iw].get().size());
@@ -78,7 +77,6 @@ void SPOSet::mw_evaluateVGL(const RefVectorWithLeader<SPOSet>& spo_list,
                             const RefVector<ValueVector>& d2psi_v_list) const
 {
   assert(this == &spo_list.getLeader());
-#pragma omp parallel for
   for (int iw = 0; iw < spo_list.size(); iw++)
     spo_list[iw].evaluateVGL(P_list[iw], iat, psi_v_list[iw], dpsi_v_list[iw], d2psi_v_list[iw]);
 }
@@ -89,7 +87,6 @@ void SPOSet::mw_evaluateValue(const RefVectorWithLeader<SPOSet>& spo_list,
                               const RefVector<ValueVector>& psi_v_list) const
 {
   assert(this == &spo_list.getLeader());
-#pragma omp parallel for
   for (int iw = 0; iw < spo_list.size(); iw++)
     spo_list[iw].evaluateValue(P_list[iw], iat, psi_v_list[iw]);
 }
@@ -103,7 +100,6 @@ void SPOSet::mw_evaluateVGLWithSpin(const RefVectorWithLeader<SPOSet>& spo_list,
                                     const RefVector<ValueVector>& dspin_v_list) const
 {
   assert(this == &spo_list.getLeader());
-#pragma omp parallel for
   for (int iw = 0; iw < spo_list.size(); iw++)
     spo_list[iw].evaluateVGL_spin(P_list[iw], iat, psi_v_list[iw], dpsi_v_list[iw], d2psi_v_list[iw], dspin_v_list[iw]);
 }
@@ -168,7 +164,6 @@ void SPOSet::mw_evaluate_notranspose(const RefVectorWithLeader<SPOSet>& spo_list
                                      const RefVector<ValueMatrix>& d2logdet_list) const
 {
   assert(this == &spo_list.getLeader());
-#pragma omp parallel for
   for (int iw = 0; iw < spo_list.size(); iw++)
     spo_list[iw].evaluate_notranspose(P_list[iw], first, last, logdet_list[iw], dlogdet_list[iw], d2logdet_list[iw]);
 }
diff --git a/src/QMCWaveFunctions/SpinorSet.cpp b/src/QMCWaveFunctions/SpinorSet.cpp
index 7609800b5b..62b2fc4a47 100644
--- a/src/QMCWaveFunctions/SpinorSet.cpp
+++ b/src/QMCWaveFunctions/SpinorSet.cpp
@@ -195,7 +195,6 @@ void SpinorSet::mw_evaluateVGLWithSpin(const RefVectorWithLeader<SPOSet>& spo_li
   up_spo_leader.mw_evaluateVGL(up_spo_list, P_list, iat, up_psi_v_list, up_dpsi_v_list, up_d2psi_v_list);
   dn_spo_leader.mw_evaluateVGL(dn_spo_list, P_list, iat, dn_psi_v_list, dn_dpsi_v_list, dn_d2psi_v_list);
 
-#pragma omp parallel for
   for (int iw = 0; iw < nw; iw++)
   {
     ParticleSet::Scalar_t s = P_list[iw].activeSpin(iat);
@@ -327,7 +326,6 @@ void SpinorSet::mw_evaluate_notranspose(const RefVectorWithLeader<SPOSet>& spo_l
   dn_spo_leader.mw_evaluate_notranspose(dn_spo_list, P_list, first, last, dn_logdet_list, dn_dlogdet_list,
                                         dn_d2logdet_list);
 
-#pragma omp parallel for
   for (int iw = 0; iw < nw; iw++)
     for (int iat = 0; iat < nelec; iat++)
     {
diff --git a/src/QMCWaveFunctions/WaveFunctionComponent.cpp b/src/QMCWaveFunctions/WaveFunctionComponent.cpp
index 284fe17996..880f67684e 100644
--- a/src/QMCWaveFunctions/WaveFunctionComponent.cpp
+++ b/src/QMCWaveFunctions/WaveFunctionComponent.cpp
@@ -44,7 +44,6 @@ void WaveFunctionComponent::mw_evaluateLog(const RefVectorWithLeader<WaveFunctio
                                            const RefVector<ParticleSet::ParticleLaplacian>& L_list) const
 {
   assert(this == &wfc_list.getLeader());
-#pragma omp parallel for
   for (int iw = 0; iw < wfc_list.size(); iw++)
     wfc_list[iw].evaluateLog(p_list[iw], G_list[iw], L_list[iw]);
 }
@@ -62,7 +61,6 @@ void WaveFunctionComponent::mw_recompute(const RefVectorWithLeader<WaveFunctionC
                                          const std::vector<bool>& recompute) const
 {
   assert(this == &wfc_list.getLeader());
-#pragma omp parallel for
   for (int iw = 0; iw < wfc_list.size(); iw++)
     if (recompute[iw])
       wfc_list[iw].recompute(p_list[iw]);
@@ -73,7 +71,6 @@ void WaveFunctionComponent::mw_prepareGroup(const RefVectorWithLeader<WaveFuncti
                                             int ig) const
 {
   assert(this == &wfc_list.getLeader());
-#pragma omp parallel for
   for (int iw = 0; iw < wfc_list.size(); iw++)
     wfc_list[iw].prepareGroup(p_list[iw], ig);
 }
@@ -96,7 +93,6 @@ void WaveFunctionComponent::mw_evalGrad(const RefVectorWithLeader<WaveFunctionCo
                                         std::vector<GradType>& grad_now) const
 {
   assert(this == &wfc_list.getLeader());
-#pragma omp parallel for
   for (int iw = 0; iw < wfc_list.size(); iw++)
     grad_now[iw] = wfc_list[iw].evalGrad(p_list[iw], iat);
 }
@@ -108,7 +104,6 @@ void WaveFunctionComponent::mw_evalGradWithSpin(const RefVectorWithLeader<WaveFu
                                                 std::vector<ComplexType>& spingrad_now) const
 {
   assert(this == &wfc_list.getLeader());
-#pragma omp parallel for
   for (int iw = 0; iw < wfc_list.size(); iw++)
     grad_now[iw] = wfc_list[iw].evalGradWithSpin(p_list[iw], iat, spingrad_now[iw]);
 }
@@ -119,7 +114,6 @@ void WaveFunctionComponent::mw_calcRatio(const RefVectorWithLeader<WaveFunctionC
                                          std::vector<PsiValueType>& ratios) const
 {
   assert(this == &wfc_list.getLeader());
-#pragma omp parallel for
   for (int iw = 0; iw < wfc_list.size(); iw++)
     ratios[iw] = wfc_list[iw].ratio(p_list[iw], iat);
 }
@@ -151,7 +145,6 @@ void WaveFunctionComponent::mw_ratioGrad(const RefVectorWithLeader<WaveFunctionC
                                          std::vector<GradType>& grad_new) const
 {
   assert(this == &wfc_list.getLeader());
-#pragma omp parallel for
   for (int iw = 0; iw < wfc_list.size(); iw++)
     ratios[iw] = wfc_list[iw].ratioGrad(p_list[iw], iat, grad_new[iw]);
 }
@@ -164,7 +157,6 @@ void WaveFunctionComponent::mw_ratioGradWithSpin(const RefVectorWithLeader<WaveF
                                                  std::vector<ComplexType>& spingrad_new) const
 {
   assert(this == &wfc_list.getLeader());
-#pragma omp parallel for
   for (int iw = 0; iw < wfc_list.size(); iw++)
     ratios[iw] = wfc_list[iw].ratioGradWithSpin(p_list[iw], iat, grad_new[iw], spingrad_new[iw]);
 }
@@ -176,7 +168,6 @@ void WaveFunctionComponent::mw_accept_rejectMove(const RefVectorWithLeader<WaveF
                                                  bool safe_to_delay) const
 {
   assert(this == &wfc_list.getLeader());
-#pragma omp parallel for
   for (int iw = 0; iw < wfc_list.size(); iw++)
     if (isAccepted[iw])
       wfc_list[iw].acceptMove(p_list[iw], iat, safe_to_delay);
@@ -187,7 +178,6 @@ void WaveFunctionComponent::mw_accept_rejectMove(const RefVectorWithLeader<WaveF
 void WaveFunctionComponent::mw_completeUpdates(const RefVectorWithLeader<WaveFunctionComponent>& wfc_list) const
 {
   assert(this == &wfc_list.getLeader());
-#pragma omp parallel for
   for (int iw = 0; iw < wfc_list.size(); iw++)
     wfc_list[iw].completeUpdates();
 }
@@ -207,7 +197,6 @@ void WaveFunctionComponent::mw_evaluateGL(const RefVectorWithLeader<WaveFunction
                                           bool fromscratch) const
 {
   assert(this == &wfc_list.getLeader());
-#pragma omp parallel for
   for (int iw = 0; iw < wfc_list.size(); iw++)
     wfc_list[iw].evaluateGL(p_list[iw], G_list[iw], L_list[iw], fromscratch);
 }
@@ -248,7 +237,6 @@ void WaveFunctionComponent::mw_evaluateRatios(const RefVectorWithLeader<WaveFunc
                                               std::vector<std::vector<ValueType>>& ratios) const
 {
   assert(this == &wfc_list.getLeader());
-#pragma omp parallel for
   for (int iw = 0; iw < wfc_list.size(); iw++)
     wfc_list[iw].evaluateRatios(vp_list[iw], ratios[iw]);
 }

From 608cdab860c98f405b02c937ee26e9d217ea1813 Mon Sep 17 00:00:00 2001
From: Ye Luo <yeluo@anl.gov>
Date: Fri, 24 Jun 2022 01:44:50 -0500
Subject: [PATCH 31/36] Add MSD::mw_accept_rejectMove unit test

---
 .../Fermion/MultiSlaterDetTableMethod.cpp     |  3 ++
 .../tests/test_multi_slater_determinant.cpp   | 37 +++++++++++++++++++
 2 files changed, 40 insertions(+)

diff --git a/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.cpp b/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.cpp
index 4c8e73b905..9162e60ca7 100644
--- a/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.cpp
+++ b/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.cpp
@@ -1082,6 +1082,9 @@ void MultiSlaterDetTableMethod::mw_prepareGroup(const RefVectorWithLeader<WaveFu
                                                 const RefVectorWithLeader<ParticleSet>& p_list,
                                                 int ig) const
 {
+  if (!use_pre_computing_)
+    return;
+
   auto& det_leader = wfc_list.getCastedLeader<MultiSlaterDetTableMethod>();
   const size_t nw  = wfc_list.size();
   assert(this == &det_leader);
diff --git a/src/QMCWaveFunctions/tests/test_multi_slater_determinant.cpp b/src/QMCWaveFunctions/tests/test_multi_slater_determinant.cpp
index f980fda7e9..8c222f6ca6 100644
--- a/src/QMCWaveFunctions/tests/test_multi_slater_determinant.cpp
+++ b/src/QMCWaveFunctions/tests/test_multi_slater_determinant.cpp
@@ -276,6 +276,43 @@ void test_LiH_msd(const std::string& spo_xml_string,
     CHECK(grad_new.grads_positions[1][0] == ValueApprox(-0.8633778143));
     CHECK(grad_new.grads_positions[1][1] == ValueApprox(0.8245347691));
     CHECK(grad_new.grads_positions[1][2] == ValueApprox(-5.1513380151));
+
+    std::vector<bool> isAccepted{false, true};
+    TrialWaveFunction::mw_accept_rejectMove(wf_ref_list, p_ref_list, moved_elec_id, isAccepted);
+    ParticleSet::mw_accept_rejectMove(p_ref_list, moved_elec_id, isAccepted);
+
+    CHECK(wf_ref_list[0].getLogPsi() == Approx(-7.803347327300152));
+    CHECK(wf_ref_list[1].getLogPsi() == Approx(-7.321765331299484));
+
+    // move the next electron
+    TrialWaveFunction::mw_prepareGroup(wf_ref_list, p_ref_list, 1);
+    const int moved_elec_id_next = 2;
+    TrialWaveFunction::mw_evalGrad(wf_ref_list, p_ref_list, moved_elec_id_next, grad_old);
+
+    CHECK(grad_old.grads_positions[0][0] == ValueApprox(1.3325558736));
+    CHECK(grad_old.grads_positions[0][1] == ValueApprox(1.3327966725));
+    CHECK(grad_old.grads_positions[0][2] == ValueApprox(1.2157689586));
+    CHECK(grad_old.grads_positions[1][0] == ValueApprox(1.3222514142));
+    CHECK(grad_old.grads_positions[1][1] == ValueApprox(1.3230108868));
+    CHECK(grad_old.grads_positions[1][2] == ValueApprox(1.2035047435));
+
+    ParticleSet::mw_makeMove(p_ref_list, moved_elec_id_next, displ);
+    TrialWaveFunction::mw_calcRatio(wf_ref_list, p_ref_list, moved_elec_id_next, ratios);
+
+    CHECK(ratios[0] == ValueApprox(PsiValueType(2.1080036144)));
+    CHECK(ratios[1] == ValueApprox(PsiValueType(0.4947158435)));
+
+    TrialWaveFunction::mw_calcRatioGrad(wf_ref_list, p_ref_list, moved_elec_id_next, ratios, grad_new);
+
+    CHECK(ratios[0] == ValueApprox(PsiValueType(2.1080036144)));
+    CHECK(ratios[1] == ValueApprox(PsiValueType(0.4947158435)));
+
+    CHECK(grad_new.grads_positions[0][0] == ValueApprox(1.8412365668));
+    CHECK(grad_new.grads_positions[0][1] == ValueApprox(1.3736370007));
+    CHECK(grad_new.grads_positions[0][2] == ValueApprox(0.8043818454));
+    CHECK(grad_new.grads_positions[1][0] == ValueApprox(1.3553132105));
+    CHECK(grad_new.grads_positions[1][1] == ValueApprox(1.5552132255));
+    CHECK(grad_new.grads_positions[1][2] == ValueApprox(0.804301246));
   }
 }
 

From 2fce4436bd2208d5403e8291b7c559f03f0e1fce Mon Sep 17 00:00:00 2001
From: Ye Luo <yeluo@anl.gov>
Date: Sun, 12 Jun 2022 21:54:12 -0500
Subject: [PATCH 32/36] Add mock-up mw_accept_rejectMove in MSD.

---
 .../Fermion/MultiDiracDeterminant.cpp         | 13 +++++++++++
 .../Fermion/MultiDiracDeterminant.h           |  5 +++++
 .../Fermion/MultiSlaterDetTableMethod.cpp     | 22 +++++++++++++++++++
 .../Fermion/MultiSlaterDetTableMethod.h       |  6 +++++
 4 files changed, 46 insertions(+)

diff --git a/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.cpp b/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.cpp
index a42f88409b..9787e91d68 100644
--- a/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.cpp
+++ b/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.cpp
@@ -517,6 +517,19 @@ void MultiDiracDeterminant::restore(int iat)
   */
 }
 
+void MultiDiracDeterminant::mw_accept_rejectMove(const RefVectorWithLeader<MultiDiracDeterminant>& wfc_list,
+                                                 const RefVectorWithLeader<ParticleSet>& p_list,
+                                                 int iat,
+                                                 const std::vector<bool>& isAccepted)
+{
+  // TODO to be expanded to serve offload needs without relying on calling acceptMove and restore
+  for (int iw = 0; iw < wfc_list.size(); iw++)
+    if (isAccepted[iw])
+      wfc_list[iw].acceptMove(p_list[iw], iat, false);
+    else
+      wfc_list[iw].restore(iat);
+}
+
 // this has been fixed
 MultiDiracDeterminant::MultiDiracDeterminant(const MultiDiracDeterminant& s)
     : WaveFunctionComponent(s),
diff --git a/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.h b/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.h
index 690ba02368..b42a61df73 100644
--- a/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.h
+++ b/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.h
@@ -201,6 +201,11 @@ class MultiDiracDeterminant : public WaveFunctionComponent
    */
   void restore(int iat) override;
 
+  static void mw_accept_rejectMove(const RefVectorWithLeader<MultiDiracDeterminant>& wfc_list,
+                                                 const RefVectorWithLeader<ParticleSet>& p_list,
+                                                 int iat,
+                                                 const std::vector<bool>& isAccepted);
+
   void createResource(ResourceCollection& collection) const override;
   void acquireResource(ResourceCollection& collection,
                        const RefVectorWithLeader<MultiDiracDeterminant>& wfc_list) const;
diff --git a/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.cpp b/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.cpp
index 9162e60ca7..c86af93eeb 100644
--- a/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.cpp
+++ b/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.cpp
@@ -669,6 +669,28 @@ void MultiSlaterDetTableMethod::restore(int iat)
   curRatio = 1.0;
 }
 
+void MultiSlaterDetTableMethod::mw_accept_rejectMove(const RefVectorWithLeader<WaveFunctionComponent>& wfc_list,
+                          const RefVectorWithLeader<ParticleSet>& p_list,
+                          int iat,
+                          const std::vector<bool>& isAccepted,
+                          bool safe_to_delay) const
+{
+  ScopedTimer local_timer(AccRejTimer);
+  for (size_t iw = 0; iw < isAccepted.size(); iw++)
+  {
+    auto& det = wfc_list.getCastedElement<MultiSlaterDetTableMethod>(iw);
+    if (isAccepted[iw])
+    {
+      det.psi_ratio_to_ref_det_ = det.new_psi_ratio_to_new_ref_det_;
+      det.log_value_ += convertValueToLog(det.curRatio);
+    }
+    det.curRatio = 1.0;
+  }
+  const auto det_id = getDetID(iat);
+  const auto det_list(extract_DetRef_list(wfc_list, det_id));
+  Dets[det_id]->mw_accept_rejectMove(det_list, p_list, iat, isAccepted);
+}
+
 void MultiSlaterDetTableMethod::registerData(ParticleSet& P, WFBufferType& buf)
 {
   for (size_t id = 0; id < Dets.size(); id++)
diff --git a/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.h b/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.h
index 51c2a2b163..66d6699fb3 100644
--- a/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.h
+++ b/src/QMCWaveFunctions/Fermion/MultiSlaterDetTableMethod.h
@@ -159,6 +159,12 @@ class MultiSlaterDetTableMethod : public WaveFunctionComponent
   void acceptMove(ParticleSet& P, int iat, bool safe_to_delay = false) override;
   void restore(int iat) override;
 
+  void mw_accept_rejectMove(const RefVectorWithLeader<WaveFunctionComponent>& wfc_list,
+                            const RefVectorWithLeader<ParticleSet>& p_list,
+                            int iat,
+                            const std::vector<bool>& isAccepted,
+                            bool safe_to_delay = false) const override;
+
   void registerData(ParticleSet& P, WFBufferType& buf) override;
   LogValueType updateBuffer(ParticleSet& P, WFBufferType& buf, bool fromscratch = false) override;
   void copyFromBuffer(ParticleSet& P, WFBufferType& buf) override;

From 17160d4425abf312fcc68806a0d9edebe0209c0b Mon Sep 17 00:00:00 2001
From: Peter Doak <doakpw@ornl.gov>
Date: Fri, 24 Jun 2022 10:05:42 -0400
Subject: [PATCH 33/36] drop standalone-debug for offload builds

---
 CMake/ClangCompilers.cmake | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/CMake/ClangCompilers.cmake b/CMake/ClangCompilers.cmake
index c8ba9ddf9b..3688bb4d9d 100644
--- a/CMake/ClangCompilers.cmake
+++ b/CMake/ClangCompilers.cmake
@@ -104,8 +104,16 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
 
 # Set extra debug flags
-set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -fno-omit-frame-pointer -fstandalone-debug")
-set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fno-omit-frame-pointer -fstandalone-debug")
+set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -fno-omit-frame-pointer")
+set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fno-omit-frame-pointer")
+
+# unfortunately this removes standalone-debug altogether for offload builds
+# but until we discover how to use the ${OPENMP_OFFLOAD_COMPILE_OPTIONS} more selectively
+# this is the only way to avoid a warning per compilation unit that contains an omp symbol.
+if (NOT OFFLOAD_TARGET MATCHES "nvptx64")
+  set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -fstandalone-debug")
+  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fstandalone-debug")
+endif()
 
 #--------------------------------------
 # Special architectural flags

From 8019f7f21bb0e4db574f9c5cc42b82c4d867b476 Mon Sep 17 00:00:00 2001
From: Peter Doak <doakpw@ornl.gov>
Date: Fri, 24 Jun 2022 14:19:55 -0400
Subject: [PATCH 34/36] adding a message about -fstandalone-debug

---
 CMake/ClangCompilers.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CMake/ClangCompilers.cmake b/CMake/ClangCompilers.cmake
index 3688bb4d9d..ac439a4de8 100644
--- a/CMake/ClangCompilers.cmake
+++ b/CMake/ClangCompilers.cmake
@@ -111,6 +111,7 @@ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fno-omit-frame-pointer")
 # but until we discover how to use the ${OPENMP_OFFLOAD_COMPILE_OPTIONS} more selectively
 # this is the only way to avoid a warning per compilation unit that contains an omp symbol.
 if (NOT OFFLOAD_TARGET MATCHES "nvptx64")
+  message(STATUS "QMCPACK adds -fstandalone-debug for Debug builds")
   set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -fstandalone-debug")
   set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fstandalone-debug")
 endif()

From 179a2d0666751671967558590768e75da3624656 Mon Sep 17 00:00:00 2001
From: Paul Kent <kentpr@ornl.gov>
Date: Fri, 24 Jun 2022 15:56:27 -0400
Subject: [PATCH 35/36] Guard div by 0

---
 nexus/tests/unit/test_structure.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nexus/tests/unit/test_structure.py b/nexus/tests/unit/test_structure.py
index acdbd6798d..d7aef53980 100644
--- a/nexus/tests/unit/test_structure.py
+++ b/nexus/tests/unit/test_structure.py
@@ -1372,7 +1372,9 @@ def test_embed():
         r = np.linalg.norm(dr,axis=1)
         dilation = 2*r*np.exp(-r)
         for i in range(npos):
-            gr.pos[i] += dilation[i]/r[i]*dr[i]
+            if r[i]>0:
+                gr.pos[i] += dilation[i]/r[i]*dr[i]
+            #end if
         #end for
 
         # Represent the unrelaxed large cell

From b89a911fe2dcb19095286035c2d5c4f124ba3424 Mon Sep 17 00:00:00 2001
From: Jeongnim Kim <jeongnim.kim@gmail.com>
Date: Mon, 11 Jul 2022 11:06:31 -0700
Subject: [PATCH 36/36] Fix to avoid double-free with latest compilers.

---
 src/Platforms/SYCL/SYCLDeviceManager.cpp | 11 +++++++----
 src/Platforms/SYCL/SYCLDeviceManager.h   |  7 ++++---
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/Platforms/SYCL/SYCLDeviceManager.cpp b/src/Platforms/SYCL/SYCLDeviceManager.cpp
index 6372660deb..eb5082ac8c 100644
--- a/src/Platforms/SYCL/SYCLDeviceManager.cpp
+++ b/src/Platforms/SYCL/SYCLDeviceManager.cpp
@@ -41,11 +41,14 @@ SYCLDeviceManager::SYCLDeviceManager(int& default_device_num, int& num_devices,
     if(id == sycl_default_device_num)
     {
       int result;
-      default_device_queue.reset(static_cast<sycl::queue *> (omp_get_interop_ptr(interop, omp_ipr_targetsync, &result)));
+      sycl::queue* omp_queue = static_cast<sycl::queue *>(omp_get_interop_ptr(interop, omp_ipr_targetsync, &result));
       if(result != omp_irc_success)
         throw std::runtime_error("SYCLDeviceManager::SYCLDeviceManager fail to obtain sycl::queue by interop");
+
+      default_device_queue.reset(new sycl::queue(*omp_queue));
+      visible_devices[id].queue_ = omp_queue;
     }
-    visible_devices[id].interop=interop;
+    visible_devices[id].interop_ = interop;
   } 
 
 #else
@@ -92,8 +95,8 @@ SYCLDeviceManager::SYCLDeviceManager(int& default_device_num, int& num_devices,
       default_device_num = sycl_default_device_num;
     else if (default_device_num != sycl_default_device_num)
       throw std::runtime_error("Inconsistent assigned SYCL devices with the previous record!");
-    default_device_queue = std::make_unique<sycl::queue>(visible_devices[sycl_default_device_num].context,
-                                                         visible_devices[sycl_default_device_num].device);
+    default_device_queue = std::make_unique<sycl::queue>(visible_devices[sycl_default_device_num].context_,
+                                                         visible_devices[sycl_default_device_num].device_);
   }
 #endif
 }
diff --git a/src/Platforms/SYCL/SYCLDeviceManager.h b/src/Platforms/SYCL/SYCLDeviceManager.h
index 7055c13c3a..7a54391b9f 100644
--- a/src/Platforms/SYCL/SYCLDeviceManager.h
+++ b/src/Platforms/SYCL/SYCLDeviceManager.h
@@ -24,9 +24,10 @@ namespace qmcplusplus
 {
 struct syclDeviceInfo
 {
-  sycl::context context;
-  sycl::device device;
-  omp_interop_t interop;
+  sycl::context context_;
+  sycl::device device_;
+  sycl::queue* queue_ = nullptr;
+  omp_interop_t interop_;
 };
 
 /** SYCL device manager