diff --git a/hamr_buffer.h b/hamr_buffer.h
index 09ae4d4..16b8a1c 100644
--- a/hamr_buffer.h
+++ b/hamr_buffer.h
@@ -279,9 +279,11 @@ class HAMR_EXPORT buffer
      *                  driver API is used to determine the device that
      *                  allocated the memory.
      * @param[in] ptr   a pointer to the array
+     * @param[in] take  set non-zero if the buffer should delete the passed
+     *                  memory using the named allocator
      */
     buffer(allocator alloc, const hamr::stream &strm,
-        transfer sync, size_t size, int owner, T *ptr);
+        transfer sync, size_t size, int owner, T *ptr, int take = 1);
 
     /** Construct by directly providing the buffer contents. This can be used
      * for zero-copy transfer of data.  One must also name the allocator type
diff --git a/hamr_buffer_impl.h b/hamr_buffer_impl.h
index 3db9c51..00b83a4 100644
--- a/hamr_buffer_impl.h
+++ b/hamr_buffer_impl.h
@@ -242,14 +242,18 @@ buffer<T>::buffer(allocator alloc, const hamr::stream &strm, transfer sync,
 // --------------------------------------------------------------------------
 template <typename T>
 buffer<T>::buffer(allocator alloc, const hamr::stream &strm, transfer sync,
-    size_t size, int owner, T *ptr) : m_alloc(alloc), m_data(nullptr),
+    size_t size, int owner, T *ptr, int take) : m_alloc(alloc), m_data(nullptr),
     m_size(size), m_capacity(size), m_owner(owner), m_stream(strm),
     m_sync(sync)
 {
     assert_valid_allocator(alloc);
 
     // create the deleter for the passed allocator
-    if (alloc == allocator::cpp)
+    if (!take)
+    {
+        m_data = std::shared_ptr<T>(ptr, [](T*){});
+    }
+    else if (alloc == allocator::cpp)
     {
         m_data = std::shared_ptr<T>(ptr, new_deleter<T>(ptr, m_size));
     }
@@ -2147,8 +2151,8 @@ template <typename T>
 int buffer<T>::print() const
 {
     std::cerr << "m_alloc = " << get_allocator_name(m_alloc)
-        << ", m_size = " << m_size << ", m_capacity = " << m_capacity
-        << ", m_data = ";
+        << ", m_owner = " << m_owner << ", m_size = " << m_size
+        << ", m_capacity = " << m_capacity << ", m_data = ";
 
     if (m_size)
     {
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 77d5144..017e0bf 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -16,18 +16,18 @@ if (HAMR_ENABLE_CUDA)
     add_test(NAME test_hamr_pipeline_cuda COMMAND test_hamr_pipeline_cuda)
     set_tests_properties(test_hamr_pipeline_cuda PROPERTIES ENVIRONMENT HAMR_VERBOSE=1)
 
-    if (NOT HAMR_NVHPC_CUDA)
-        set_source_files_properties(test_hamr_stream_cuda.cpp PROPERTIES LANGUAGE CUDA)
-    endif()
-    add_executable(test_hamr_stream_cuda test_hamr_stream_cuda.cpp)
-    target_link_libraries(test_hamr_stream_cuda hamr CUDA::cublas)
-    if (NOT HAMR_NVHPC_CUDA)
-        set_target_properties(test_hamr_stream_cuda PROPERTIES CUDA_ARCHITECTURES "${HAMR_CUDA_ARCHITECTURES}")
-    endif()
-    add_test(NAME test_hamr_stream_cuda_async COMMAND test_hamr_stream_cuda 5000 async)
-    set_tests_properties(test_hamr_stream_cuda_async PROPERTIES ENVIRONMENT HAMR_VERBOSE=1)
-    add_test(NAME test_hamr_stream_cuda_default COMMAND test_hamr_stream_cuda 5000 default)
-    set_tests_properties(test_hamr_stream_cuda_default PROPERTIES ENVIRONMENT HAMR_VERBOSE=1)
+    #if (NOT HAMR_NVHPC_CUDA)
+    #    set_source_files_properties(test_hamr_stream_cuda.cpp PROPERTIES LANGUAGE CUDA)
+    #endif()
+    #add_executable(test_hamr_stream_cuda test_hamr_stream_cuda.cpp)
+    #target_link_libraries(test_hamr_stream_cuda hamr CUDA::cublas)
+    #if (NOT HAMR_NVHPC_CUDA)
+    #    set_target_properties(test_hamr_stream_cuda PROPERTIES CUDA_ARCHITECTURES "${HAMR_CUDA_ARCHITECTURES}")
+    #endif()
+    #add_test(NAME test_hamr_stream_cuda_async COMMAND test_hamr_stream_cuda 5000 async)
+    #set_tests_properties(test_hamr_stream_cuda_async PROPERTIES ENVIRONMENT HAMR_VERBOSE=1)
+    #add_test(NAME test_hamr_stream_cuda_default COMMAND test_hamr_stream_cuda 5000 default)
+    #set_tests_properties(test_hamr_stream_cuda_default PROPERTIES ENVIRONMENT HAMR_VERBOSE=1)
 endif()
 
 if (HAMR_ENABLE_HIP)
@@ -102,6 +102,21 @@ if (HAMR_ENABLE_OPENMP)
     set_tests_properties(test_hamr_openmp_allocator PROPERTIES ENVIRONMENT HAMR_VERBOSE=1)
 endif()
 
+if (HAMR_ENABLE_OPENMP AND HAMR_ENABLE_CUDA AND HAMR_NVHPC_CUDA)
+    #NOTE: nvhpc is the only compiler that can do cuda and openmp in the same translation unit
+    # this test will have to be refactored for clang into multiple source files
+    if (NOT HAMR_NVHPC_CUDA)
+       set_source_files_properties(test_hamr_openmp_cuda_interop.cpp PROPERTIES LANGUAGE CUDA)
+    endif()
+    add_executable(test_hamr_openmp_cuda_interop test_hamr_openmp_cuda_interop.cpp)
+    if (NOT HAMR_NVHPC_CUDA)
+        set_target_properties(test_hamr_openmp_cuda_interop PROPERTIES CUDA_ARCHITECTURES "${HAMR_CUDA_ARCHITECTURES}")
+    endif()
+    target_link_libraries(test_hamr_openmp_cuda_interop hamr)
+    add_test(NAME test_hamr_openmp_cuda_interop COMMAND test_hamr_openmp_cuda_interop 0 0)
+    set_tests_properties(test_hamr_openmp_cuda_interop PROPERTIES ENVIRONMENT HAMR_VERBOSE=1)
+endif()
+
 if (HAMR_ENABLE_PYTHON)
 
     add_test(NAME test_hamr_buffer_numpy_host
diff --git a/test/test_hamr_openmp_cuda_interop.cpp b/test/test_hamr_openmp_cuda_interop.cpp
new file mode 100644
index 0000000..c7e5e89
--- /dev/null
+++ b/test/test_hamr_openmp_cuda_interop.cpp
@@ -0,0 +1,297 @@
+#include "hamr_config.h"
+#include "hamr_buffer.h"
+#include "hamr_buffer_allocator.h"
+#include "hamr_buffer_util.h"
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <omp.h>
+
+#include <iostream>
+#include <fstream>
+
+using allocator = hamr::buffer_allocator;
+using transfer = hamr::buffer_transfer;
+
+void example1(size_t nElem, int devId)
+{
+  // OpenMP allocates this array on device memory
+  omp_set_default_device(devId);
+  double *devPtr = (double*)malloc(nElem*sizeof(double));
+  #pragma omp target enter data map(alloc: devPtr[0:nElem])
+
+  // OpenMP initializes the memory on the device
+  #pragma omp target teams distribute \
+    parallel for map(alloc: devPtr[0:nElem])
+  for (size_t i = 0; i < nElem; ++i)
+    devPtr[i] = -3.14;
+
+  // zero-copy construct with the device pointer
+  hamr::buffer<double> *simData;
+  #pragma omp target data use_device_addr(devPtr)
+  {
+  simData = new hamr::buffer<double>(allocator::openmp, hamr::stream(),
+                                     nElem, devId, devPtr, 0);
+  }
+
+  // do something with the buffer
+  simData->print();
+
+  // delete the array
+  delete simData;
+
+  // now it is safe to deallocate the device memory
+  #pragma omp target exit data map(release: devPtr[0:nElem])
+}
+
+
+void example2(size_t nElem, int devId)
+{
+  // allocate device memory
+  omp_set_default_device(devId);
+  double *devPtr = (double*)omp_target_alloc(nElem*sizeof(double), devId);
+
+  // wrap it in a shared pointer so it is eventually deallocated
+  std::shared_ptr<double> spDev(devPtr,
+    [devId](double *ptr){ omp_target_free(ptr, devId); });
+
+  // initialize the array on the device
+  #pragma omp target teams distribute parallel for is_device_ptr(devPtr)
+  for (size_t i = 0; i < nElem; ++i)
+    devPtr[i] = -3.14;
+
+  // zero-copy construct with coordinated life cycle management
+  auto simData = hamr::buffer<double>(allocator::openmp, hamr::stream(),
+                                      nElem, devId, spDev);
+
+  // do something with the buffer
+  simData.print();
+}
+
+void example3(size_t nElem, int srcDev, int destDev)
+{
+  // allocate device memory
+  omp_set_default_device(srcDev);
+  double *devPtr = (double*)omp_target_alloc(nElem*sizeof(double), srcDev);
+
+  // initialize
+  #pragma omp target teams distribute parallel for is_device_ptr(devPtr)
+  for (size_t i = 0; i < nElem; ++i)
+    devPtr[i] = -3.14;
+
+  // zero-copy construct from a device pointer, and take ownership
+  auto simData = hamr::buffer<double>(allocator::openmp, hamr::stream(),
+                                          nElem, srcDev, devPtr, 1);
+
+  // move to destDev in place
+  omp_set_default_device(destDev);
+  simData.move(allocator::openmp);
+
+  // do something with the buffer
+  simData.print();
+}
+
+void example4(size_t nElem, int srcDev, int destDev)
+{
+  // allocate and value initialize on one device using OpenMP
+  omp_set_default_device(srcDev);
+  auto simData = hamr::buffer<double>(allocator::openmp, hamr::stream(),
+                                          nElem, -3.14);
+
+  // deep-copy to another device using CUDA
+  cudaSetDevice(destDev);
+  auto dataCpy = hamr::buffer<double>(allocator::openmp, simData);
+
+  // make sure movement is complete before using
+  dataCpy.synchronize();
+
+  // do something with the data
+  dataCpy.print();
+}
+
+
+
+hamr::buffer<double>
+add_arrays_mp(int dev, hamr::buffer<double> &a1, hamr::buffer<double> &a2)
+{
+  // get a view of the incoming data on the device we will use
+  omp_set_default_device(dev);
+
+#if defined(STRUCTURED_BINDING)
+  auto [spa1, pa1] = hamr::get_openmp_accessible(a1);
+  auto [spa2, pa2] = hamr::get_openmp_accessible(a2);
+#else
+  auto spa1 = a1.get_openmp_accessible();
+  auto pa1 = spa1.get();
+
+  auto spa2 = a2.get_openmp_accessible();
+  auto pa2 = spa2.get();
+#endif
+
+  // allocate space for the result
+  size_t nElem = a1.size();
+
+  auto a3 = hamr::buffer<double>(allocator::openmp, nElem);
+
+  // direct access to the result since we know it is in place
+  auto pa3 = a3.data();
+
+  // do the calculation
+  #pragma omp target teams distribute parallel for is_device_ptr(pa1, pa2)
+  for (size_t i = 0; i < nElem; ++i)
+    pa3[i] = pa2[i] + pa1[i];
+
+  return a3;
+}
+
+void example5(size_t nElem, int dev1, int dev2)
+{
+  // this data is located in host main memory
+  auto a1 = hamr::buffer<double>(allocator::malloc, nElem, 1.0);
+
+  // this data is located in device 1 main memory
+  omp_set_default_device(dev1);
+  auto a2 = hamr::buffer<double>(allocator::openmp, hamr::stream(),
+                                 transfer::async, nElem, 2.0);
+
+  // do the calculation on device 2
+  auto a3 = add_arrays_mp(dev2, a1, a2);
+
+  // do something with the result
+  a3.print();
+}
+
+
+namespace libA {
+__global__
+void add(double *a3, const double *a1, const double *a2, size_t n)
+{
+  int i = threadIdx.x + blockIdx.x*blockDim.x;
+  if (i >= n) return;
+  a3[i] = a1[i] + a2[i];
+  //printf("a3[%d]=%g  a1[%d]=%g  a2[%d]=%g \n", i, a3[i], i, a1[i], i, a2[i]);
+}
+
+hamr::buffer<double>
+Add(int dev, const hamr::buffer<double> &a1, const hamr::buffer<double> &a2)
+{
+  // use this stream for the calculation
+  cudaStream_t strm = hamr::stream();
+
+  // get a view of the incoming data on the device we will use
+  cudaSetDevice(dev);
+
+#if defined(STRUCTURED_BINDING)
+  auto [spa1, pa1] = hamr::get_cuda_accessible(a1);
+  auto [spa2, pa2] = hamr::get_cuda_accessible(a2);
+#else
+  auto spa1 = a1.get_openmp_accessible();
+  auto pa1 = spa1.get();
+
+  auto spa2 = a2.get_openmp_accessible();
+  auto pa2 = spa2.get();
+#endif
+
+  // allocate space for the result
+  size_t nElem = a1.size();
+
+  auto a3 = hamr::buffer<double>(allocator::cuda_async, strm, transfer::async, nElem);
+
+  // direct access to the result since we know it is in place
+  auto pa3 = a3.data();
+
+  // make sure the data in flight, if it was moved, has arrived
+  a1.synchronize();
+  a2.synchronize();
+
+  // do the calculation
+  int threads = 128;
+  int blocks = nElem / threads + ( nElem % threads ? 1 : 0 );
+  add<<<blocks,threads,0,strm>>>(pa3, pa1, pa2, nElem);
+
+  return a3;
+}
+}
+
+namespace libB {
+void Write(std::ofstream &ofs, hamr::buffer<double> &a)
+{
+  // get a view of the data on the host
+  auto [spA, pA] = hamr::get_host_accessible(a);
+
+  // make sure the data if moved has arrived
+  a.synchronize();
+
+  // send the data to the file
+  size_t nElem = a.size();
+  for (size_t i = 0; i < nElem; ++i)
+    ofs << pA[i] << " ";
+  ofs << std::endl;
+}
+}
+
+void example6(size_t nElem, int dev1, int dev2)
+{
+  // this data is located in host memory, initialized to 1
+  auto a1 = hamr::buffer<double>(allocator::malloc, hamr::stream(),
+                                 transfer::async, nElem, 1.0);
+
+  // this data is located in device 1 memory, unitialized
+  omp_set_default_device(dev1);
+  auto a2 = hamr::buffer<double>(allocator::openmp, hamr::stream(),
+                                 transfer::async, nElem, 1.0);
+
+  // initialize with OpenMP target offload
+  auto pA2 = a2.data();
+
+  #pragma omp target teams distribute parallel for is_device_ptr(pA2)
+  for (size_t i = 0; i < nElem; ++i)
+    pA2[i] = 2.0;
+
+  // pass data to libA for the calculations
+  auto a3 = libA::Add(dev2, a1, a2);
+
+  // pass data to libB for I/O
+  auto ofile = std::ofstream("data.txt");
+  libB::Write(ofile, a1);
+  libB::Write(ofile, a2);
+  libB::Write(ofile, a3);
+  ofile.close();
+}
+
+
+
+
+
+int main(int argc, char **argv)
+{
+  if (argc != 3)
+  {
+      std::cerr << "usage: test_openmp_cuda_interop [device id] [device id]" << std::endl;
+      return -1;
+  }
+
+  size_t nElem = 64;
+  int dev = atoi(argv[1]);
+  int destDev = atoi(argv[2]);
+
+  std::cerr << "zero-copy construct manual life cycle management ... " << std::endl;
+  example1(nElem, dev);
+
+  std::cerr << "zero-copy construct automatic life cycle management ... " << std::endl;
+  example2(nElem, dev);
+
+  std::cerr << "move in place ..." << std::endl;
+  example3(nElem, dev, destDev);
+
+  std::cerr << "deep copy construct on another device ... " << std::endl;
+  example4(nElem, dev, destDev);
+
+  std::cerr << "add two arrays OpenMP ... " << std::endl;
+  example5(nElem, dev, destDev);
+
+  std::cerr << "add two arrays OpenMP CUDA interop ... " << std::endl;
+  example6(nElem, dev, destDev);
+
+  return 0;
+}