diff --git a/hamr_buffer.h b/hamr_buffer.h index 09ae4d4..16b8a1c 100644 --- a/hamr_buffer.h +++ b/hamr_buffer.h @@ -279,9 +279,11 @@ class HAMR_EXPORT buffer * driver API is used to determine the device that * allocated the memory. * @param[in] ptr a pointer to the array + * @param[in] take set non-zero if the buffer should delete the passed + * memory using the named allocator */ buffer(allocator alloc, const hamr::stream &strm, - transfer sync, size_t size, int owner, T *ptr); + transfer sync, size_t size, int owner, T *ptr, int take = 1); /** Construct by directly providing the buffer contents. This can be used * for zero-copy transfer of data. One must also name the allocator type diff --git a/hamr_buffer_impl.h b/hamr_buffer_impl.h index 3db9c51..00b83a4 100644 --- a/hamr_buffer_impl.h +++ b/hamr_buffer_impl.h @@ -242,14 +242,18 @@ buffer::buffer(allocator alloc, const hamr::stream &strm, transfer sync, // -------------------------------------------------------------------------- template buffer::buffer(allocator alloc, const hamr::stream &strm, transfer sync, - size_t size, int owner, T *ptr) : m_alloc(alloc), m_data(nullptr), + size_t size, int owner, T *ptr, int take) : m_alloc(alloc), m_data(nullptr), m_size(size), m_capacity(size), m_owner(owner), m_stream(strm), m_sync(sync) { assert_valid_allocator(alloc); // create the deleter for the passed allocator - if (alloc == allocator::cpp) + if (!take) + { + m_data = std::shared_ptr(ptr, [](T*){}); + } + else if (alloc == allocator::cpp) { m_data = std::shared_ptr(ptr, new_deleter(ptr, m_size)); } @@ -2147,8 +2151,8 @@ template int buffer::print() const { std::cerr << "m_alloc = " << get_allocator_name(m_alloc) - << ", m_size = " << m_size << ", m_capacity = " << m_capacity - << ", m_data = "; + << ", m_owner = " << m_owner << ", m_size = " << m_size + << ", m_capacity = " << m_capacity << ", m_data = "; if (m_size) { diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 77d5144..017e0bf 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -16,18 +16,18 @@ if (HAMR_ENABLE_CUDA) add_test(NAME test_hamr_pipeline_cuda COMMAND test_hamr_pipeline_cuda) set_tests_properties(test_hamr_pipeline_cuda PROPERTIES ENVIRONMENT HAMR_VERBOSE=1) - if (NOT HAMR_NVHPC_CUDA) - set_source_files_properties(test_hamr_stream_cuda.cpp PROPERTIES LANGUAGE CUDA) - endif() - add_executable(test_hamr_stream_cuda test_hamr_stream_cuda.cpp) - target_link_libraries(test_hamr_stream_cuda hamr CUDA::cublas) - if (NOT HAMR_NVHPC_CUDA) - set_target_properties(test_hamr_stream_cuda PROPERTIES CUDA_ARCHITECTURES "${HAMR_CUDA_ARCHITECTURES}") - endif() - add_test(NAME test_hamr_stream_cuda_async COMMAND test_hamr_stream_cuda 5000 async) - set_tests_properties(test_hamr_stream_cuda_async PROPERTIES ENVIRONMENT HAMR_VERBOSE=1) - add_test(NAME test_hamr_stream_cuda_default COMMAND test_hamr_stream_cuda 5000 default) - set_tests_properties(test_hamr_stream_cuda_default PROPERTIES ENVIRONMENT HAMR_VERBOSE=1) + #if (NOT HAMR_NVHPC_CUDA) + # set_source_files_properties(test_hamr_stream_cuda.cpp PROPERTIES LANGUAGE CUDA) + #endif() + #add_executable(test_hamr_stream_cuda test_hamr_stream_cuda.cpp) + #target_link_libraries(test_hamr_stream_cuda hamr CUDA::cublas) + #if (NOT HAMR_NVHPC_CUDA) + # set_target_properties(test_hamr_stream_cuda PROPERTIES CUDA_ARCHITECTURES "${HAMR_CUDA_ARCHITECTURES}") + #endif() + #add_test(NAME test_hamr_stream_cuda_async COMMAND test_hamr_stream_cuda 5000 async) + #set_tests_properties(test_hamr_stream_cuda_async PROPERTIES ENVIRONMENT HAMR_VERBOSE=1) + #add_test(NAME test_hamr_stream_cuda_default COMMAND test_hamr_stream_cuda 5000 default) + #set_tests_properties(test_hamr_stream_cuda_default PROPERTIES ENVIRONMENT HAMR_VERBOSE=1) endif() if (HAMR_ENABLE_HIP) @@ -102,6 +102,21 @@ if (HAMR_ENABLE_OPENMP) set_tests_properties(test_hamr_openmp_allocator PROPERTIES ENVIRONMENT HAMR_VERBOSE=1) endif() +if (HAMR_ENABLE_OPENMP AND HAMR_ENABLE_CUDA AND HAMR_NVHPC_CUDA) + #NOTE: nvhpc is the only compiler that can do cuda and openmp in the same translation unit + # this test will have to be refactored for clang into multiple source files + if (NOT HAMR_NVHPC_CUDA) + set_source_files_properties(test_hamr_openmp_cuda_interop.cpp PROPERTIES LANGUAGE CUDA) + endif() + add_executable(test_hamr_openmp_cuda_interop test_hamr_openmp_cuda_interop.cpp) + if (NOT HAMR_NVHPC_CUDA) + set_target_properties(test_hamr_openmp_cuda_interop PROPERTIES CUDA_ARCHITECTURES "${HAMR_CUDA_ARCHITECTURES}") + endif() + target_link_libraries(test_hamr_openmp_cuda_interop hamr) + add_test(NAME test_hamr_openmp_cuda_interop COMMAND test_hamr_openmp_cuda_interop 0 0) + set_tests_properties(test_hamr_openmp_cuda_interop PROPERTIES ENVIRONMENT HAMR_VERBOSE=1) +endif() + if (HAMR_ENABLE_PYTHON) add_test(NAME test_hamr_buffer_numpy_host diff --git a/test/test_hamr_openmp_cuda_interop.cpp b/test/test_hamr_openmp_cuda_interop.cpp new file mode 100644 index 0000000..c7e5e89 --- /dev/null +++ b/test/test_hamr_openmp_cuda_interop.cpp @@ -0,0 +1,297 @@ +#include "hamr_config.h" +#include "hamr_buffer.h" +#include "hamr_buffer_allocator.h" +#include "hamr_buffer_util.h" + +#include +#include +#include + +#include +#include + +using allocator = hamr::buffer_allocator; +using transfer = hamr::buffer_transfer; + +void example1(size_t nElem, int devId) +{ + // OpenMP allocates this array on device memory + omp_set_default_device(devId); + double *devPtr = (double*)malloc(nElem*sizeof(double)); + #pragma omp target enter data map(alloc: devPtr[0:nElem]) + + // OpenMP initializes the memory on the device + #pragma omp target teams distribute \ + parallel for map(alloc: devPtr[0:nElem]) + for (size_t i = 0; i < nElem; ++i) + devPtr[i] = -3.14; + + // zero-copy construct with the device pointer + hamr::buffer *simData; + #pragma omp target data use_device_addr(devPtr) + { + simData = new hamr::buffer(allocator::openmp, hamr::stream(), + nElem, devId, devPtr, 0); + } + + // do something with the buffer + simData->print(); + + // delete the array + delete simData; + + // now it is safe to deallocate the device memory + #pragma omp target exit data map(release: devPtr[0:nElem]) +} + + +void example2(size_t nElem, int devId) +{ + // allocate device memory + omp_set_default_device(devId); + double *devPtr = (double*)omp_target_alloc(nElem*sizeof(double), devId); + + // wrap it in a shared pointer so it is eventually deallocated + std::shared_ptr spDev(devPtr, + [devId](double *ptr){ omp_target_free(ptr, devId); }); + + // initialize the array on the device + #pragma omp target teams distribute parallel for is_device_ptr(devPtr) + for (size_t i = 0; i < nElem; ++i) + devPtr[i] = -3.14; + + // zero-copy construct with coordinated life cycle management + auto simData = hamr::buffer(allocator::openmp, hamr::stream(), + nElem, devId, spDev); + + // do something with the buffer + simData.print(); +} + +void example3(size_t nElem, int srcDev, int destDev) +{ + // allocate device memory + omp_set_default_device(srcDev); + double *devPtr = (double*)omp_target_alloc(nElem*sizeof(double), srcDev); + + // initialize + #pragma omp target teams distribute parallel for is_device_ptr(devPtr) + for (size_t i = 0; i < nElem; ++i) + devPtr[i] = -3.14; + + // zero-copy construct from a device pointer, and take ownership + auto simData = hamr::buffer(allocator::openmp, hamr::stream(), + nElem, srcDev, devPtr, 1); + + // move to destDev in place + omp_set_default_device(destDev); + simData.move(allocator::openmp); + + // do something with the buffer + simData.print(); +} + +void example4(size_t nElem, int srcDev, int destDev) +{ + // allocate and value initialize on one device using OpenMP + omp_set_default_device(srcDev); + auto simData = hamr::buffer(allocator::openmp, hamr::stream(), + nElem, -3.14); + + // deep-copy to another device using CUDA + cudaSetDevice(destDev); + auto dataCpy = hamr::buffer(allocator::openmp, simData); + + // make sure movement is complete before using + dataCpy.synchronize(); + + // do something with the data + dataCpy.print(); +} + + + +hamr::buffer +add_arrays_mp(int dev, hamr::buffer &a1, hamr::buffer &a2) +{ + // get a view of the incoming data on the device we will use + omp_set_default_device(dev); + +#if defined(STRUCTURED_BINDING) + auto [spa1, pa1] = hamr::get_openmp_accessible(a1); + auto [spa2, pa2] = hamr::get_openmp_accessible(a2); +#else + auto spa1 = a1.get_openmp_accessible(); + auto pa1 = spa1.get(); + + auto spa2 = a2.get_openmp_accessible(); + auto pa2 = spa2.get(); +#endif + + // allocate space for the result + size_t nElem = a1.size(); + + auto a3 = hamr::buffer(allocator::openmp, nElem); + + // direct access to the result since we know it is in place + auto pa3 = a3.data(); + + // do the calculation + #pragma omp target teams distribute parallel for is_device_ptr(pa1, pa2) + for (size_t i = 0; i < nElem; ++i) + pa3[i] = pa2[i] + pa1[i]; + + return a3; +} + +void example5(size_t nElem, int dev1, int dev2) +{ + // this data is located in host main memory + auto a1 = hamr::buffer(allocator::malloc, nElem, 1.0); + + // this data is located in device 1 main memory + omp_set_default_device(dev1); + auto a2 = hamr::buffer(allocator::openmp, hamr::stream(), + transfer::async, nElem, 2.0); + + // do the calculation on device 2 + auto a3 = add_arrays_mp(dev2, a1, a2); + + // do something with the result + a3.print(); +} + + +namespace libA { +__global__ +void add(double *a3, const double *a1, const double *a2, size_t n) +{ + int i = threadIdx.x + blockIdx.x*blockDim.x; + if (i >= n) return; + a3[i] = a1[i] + a2[i]; + //printf("a3[%d]=%g a1[%d]=%g a2[%d]=%g \n", i, a3[i], i, a1[i], i, a2[i]); +} + +hamr::buffer +Add(int dev, const hamr::buffer &a1, const hamr::buffer &a2) +{ + // use this stream for the calculation + cudaStream_t strm = hamr::stream(); + + // get a view of the incoming data on the device we will use + cudaSetDevice(dev); + +#if defined(STRUCTURED_BINDING) + auto [spa1, pa1] = hamr::get_cuda_accessible(a1); + auto [spa2, pa2] = hamr::get_cuda_accessible(a2); +#else + auto spa1 = a1.get_openmp_accessible(); + auto pa1 = spa1.get(); + + auto spa2 = a2.get_openmp_accessible(); + auto pa2 = spa2.get(); +#endif + + // allocate space for the result + size_t nElem = a1.size(); + + auto a3 = hamr::buffer(allocator::cuda_async, strm, transfer::async, nElem); + + // direct access to the result since we know it is in place + auto pa3 = a3.data(); + + // make sure the data in flight, if it was moved, has arrived + a1.synchronize(); + a2.synchronize(); + + // do the calculation + int threads = 128; + int blocks = nElem / threads + ( nElem % threads ? 1 : 0 ); + add<<>>(pa3, pa1, pa2, nElem); + + return a3; +} +} + +namespace libB { +void Write(std::ofstream &ofs, hamr::buffer &a) +{ + // get a view of the data on the host + auto [spA, pA] = hamr::get_host_accessible(a); + + // make sure the data if moved has arrived + a.synchronize(); + + // send the data to the file + size_t nElem = a.size(); + for (size_t i = 0; i < nElem; ++i) + ofs << pA[i] << " "; + ofs << std::endl; +} +} + +void example6(size_t nElem, int dev1, int dev2) +{ + // this data is located in host memory, initialized to 1 + auto a1 = hamr::buffer(allocator::malloc, hamr::stream(), + transfer::async, nElem, 1.0); + + // this data is located in device 1 memory, unitialized + omp_set_default_device(dev1); + auto a2 = hamr::buffer(allocator::openmp, hamr::stream(), + transfer::async, nElem, 1.0); + + // initialize with OpenMP target offload + auto pA2 = a2.data(); + + #pragma omp target teams distribute parallel for is_device_ptr(pA2) + for (size_t i = 0; i < nElem; ++i) + pA2[i] = 2.0; + + // pass data to libA for the calculations + auto a3 = libA::Add(dev2, a1, a2); + + // pass data to libB for I/O + auto ofile = std::ofstream("data.txt"); + libB::Write(ofile, a1); + libB::Write(ofile, a2); + libB::Write(ofile, a3); + ofile.close(); +} + + + + + +int main(int argc, char **argv) +{ + if (argc != 3) + { + std::cerr << "usage: test_openmp_cuda_interop [device id] [device id]" << std::endl; + return -1; + } + + size_t nElem = 64; + int dev = atoi(argv[1]); + int destDev = atoi(argv[2]); + + std::cerr << "zero-copy construct manual life cycle management ... " << std::endl; + example1(nElem, dev); + + std::cerr << "zero-copy construct automatic life cycle management ... " << std::endl; + example2(nElem, dev); + + std::cerr << "move in place ..." << std::endl; + example3(nElem, dev, destDev); + + std::cerr << "deep copy construct on another device ... " << std::endl; + example4(nElem, dev, destDev); + + std::cerr << "add two arrays OpenMP ... " << std::endl; + example5(nElem, dev, destDev); + + std::cerr << "add two arrays OpenMP CUDA interop ... " << std::endl; + example6(nElem, dev, destDev); + + return 0; +}