From d8b5a1b0731de732050812ff87980367f683ca5d Mon Sep 17 00:00:00 2001
From: Aurelien Bouteiller <bouteill@icl.utk.edu>
Date: Tue, 15 May 2018 02:44:00 -0400
Subject: [PATCH] Add an Open MP target device Rename gpu_* in device_cuda.[ch]
 Add a core dgemm that executes on hosted MIC with OpenMP target Add an OpenMP
 target body to LLT Configury to make it compile in target mode with icc

---
 CMakeLists.txt                             |   14 +-
 dplasma/cores/CMakeLists.txt               |   34 +
 dplasma/cores/omp_zgemm.c                  |   37 +
 dplasma/lib/CMakeLists.txt                 |    1 +
 dplasma/lib/zgeqrf.jdf                     |    4 +-
 dplasma/lib/zpotrf_U.jdf                   |   48 +
 dplasma/tools/PrecisionGenerator/subs.py   |    1 +
 parsec/CMakeLists.txt                      |   15 +
 parsec/class/parsec_list.c                 |    8 +
 parsec/devices/cuda/dev_cuda.c             |  281 ++-
 parsec/devices/cuda/dev_cuda.h             |   98 +-
 parsec/devices/device.c                    |   66 +-
 parsec/devices/device.h                    |   25 +-
 parsec/devices/openmp/dev_omp.c            | 1986 ++++++++++++++++++++
 parsec/devices/openmp/dev_omp.h            |  198 ++
 parsec/include/parsec/parsec_options.h.in  |    3 +
 parsec/interfaces/ptg/ptg-compiler/jdf2c.c |  313 ++-
 17 files changed, 2875 insertions(+), 257 deletions(-)
 create mode 100644 dplasma/cores/omp_zgemm.c
 create mode 100644 parsec/devices/openmp/dev_omp.c
 create mode 100644 parsec/devices/openmp/dev_omp.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b8006528c..1b78c02a7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -110,12 +110,9 @@ option(PARSEC_GPU_CUDA_ALLOC_PER_TILE
   "Tile based allocation engine for GPU memory (instead of internal management
   of a complete allocation)" OFF)
 mark_as_advanced(PARSEC_GPU_CUDA_ALLOC_PER_TILE)
-option(PARSEC_GPU_WITH_OPENCL
-  "Enable GPU support using OpenCL kernels" OFF)
-mark_as_advanced(PARSEC_GPU_WITH_OPENCL) # Hide this as it is not supported yet
-if(PARSEC_GPU_WITH_OPENCL)
-  message(WARNING "Open CL is not supported yet, ignored.")
-endif()
+option(PARSEC_GPU_WITH_OPENMP
+  "Enable GPU support using OpenMP target" OFF)
+mark_as_advanced(PARSEC_GPU_WITH_OPENMP) # Hide this as it is experimental now 
 
 ### Debug options
 if( "Debug" STREQUAL CMAKE_BUILD_TYPE )
@@ -625,6 +622,11 @@ IF( BUILD_PARSEC )
     endif (CUDA_FOUND)
   endif( PARSEC_GPU_WITH_CUDA )
 
+  if( PARSEC_GPU_WITH_OPENMP )
+    find_package(OpenMP 4.5)
+    set(PARSEC_HAVE_OPENMP ${OPENMP_FOUND})
+  endif( PARSEC_GPU_WITH_OPENMP )
+
   find_package(AYUDAME QUIET)
   set(PARSEC_HAVE_AYUDAME ${AYUDAME_FOUND})
 #
diff --git a/dplasma/cores/CMakeLists.txt b/dplasma/cores/CMakeLists.txt
index 1c1b50c31..6c2e61d42 100644
--- a/dplasma/cores/CMakeLists.txt
+++ b/dplasma/cores/CMakeLists.txt
@@ -5,6 +5,16 @@ set(generated_headers "")
 
 include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR})
 
+if( OPENMP_FOUND )
+    set(CORE_OMP_HEADERS
+        "")
+    set(CORE_OMP_SOURCES
+        omp_zgemm.c)
+else()
+    set(CORE_OMP_HEADERS "")
+    set(CORE_OMP_SOURCES "")
+endif()
+
 if( CUDA_FOUND )
     set(CORE_CUDA_HEADERS
     )
@@ -64,6 +74,30 @@ install(TARGETS dplasma_cores
         ARCHIVE DESTINATION ${DPLASMA_INSTALL_PREFIX}/lib
         LIBRARY DESTINATION ${DPLASMA_INSTALL_PREFIX}/lib)
 
+if( OPENMP_FOUND )
+  # generate the cores_omp library (hooks to mkl in omp target regions)
+  precisions_rules_py(generated_omp_files
+                 ${CORE_OMP_SOURCES}
+                 PRECISIONS "${DPLASMA_PRECISIONS}")
+
+  if( NOT "${generated_omp_files}" STREQUAL "" )
+    add_library(dplasma_cores_omp
+      ${generated_omp_files})
+    set_target_properties(dplasma_cores_omp PROPERTIES COMPILE_FLAGS "${OpenMP_C_FLAGS}")
+    set_target_properties(dplasma_cores_omp PROPERTIES LINK_FLAGS "-qoffload-option,mic,ld,\"-L$ENV{MKLROOT}/lib/mic -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core\"")
+    add_dependencies(dplasma_cores_omp
+      dplasma_includes
+      dplasma_cores_includes)
+    target_link_libraries(dplasma_cores_omp
+      ${COREBLAS_LIBRARIES}
+      ${EXTRA_LIBS})
+    target_link_libraries(dplasma_cores dplasma_cores_omp)
+    install(TARGETS dplasma_cores_omp
+      ARCHIVE DESTINATION ${DPLASMA_INSTALL_PREFIX}/lib
+      LIBRARY DESTINATION ${DPLASMA_INSTALL_PREFIX}/lib)
+  endif()
+endif( OPENMP_FOUND)
+
 if( CUDA_FOUND )
   # generate the cores_cuda library (hooks to cublas)
   precisions_rules_py(generated_cuda_files
diff --git a/dplasma/cores/omp_zgemm.c b/dplasma/cores/omp_zgemm.c
new file mode 100644
index 000000000..6beda8c4f
--- /dev/null
+++ b/dplasma/cores/omp_zgemm.c
@@ -0,0 +1,37 @@
+/*
+ * @precisions normal z -> c d s
+ */
+#include "dplasma.h"
+#include "dplasma_cores.h"
+#include "dplasma_zcores.h"
+#include <cblas.h>
+#include <omp.h>
+
+void CORE_omp_zgemm(PLASMA_enum transA, int transB,
+                int M, int N, int K,
+                PLASMA_Complex64_t alpha, const PLASMA_Complex64_t *A, int LDA,
+                                          const PLASMA_Complex64_t *B, int LDB,
+                PLASMA_Complex64_t beta, PLASMA_Complex64_t *C, int LDC, int device_index, int *device_stream) {
+//map(to: transA, transB, M, N, K, alpha, beta, LDA, LDB, LDC)
+#pragma omp target nowait device(device_index) depend(out: device_stream[0])  is_device_ptr(A, B, C) 
+{
+#pragma omp declare target
+void cblas_zgemm(const enum CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA,
+                 const CBLAS_TRANSPOSE TransB, const int M, const int N,
+                 const int K, const void *alpha, const void *A,
+                 const int lda, const void *B, const int ldb,
+                 const void *beta, void *C, const int ldc);
+#pragma omp end declare target
+    
+    printf("Gemm %d %d %d A=%p, B=%p, C=%p device=%s event=%p (%d)\n", M, N, K, A, B, C, omp_is_initial_device()? "host": "offloaded", device_stream, device_stream[0]);
+    cblas_zgemm(
+        CblasColMajor,
+        (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB,
+        M, N, K,
+        &alpha, A, LDA,
+        B, LDB,
+        &beta, C, LDC);
+}
+//#pragma omp taskwait
+}
+
diff --git a/dplasma/lib/CMakeLists.txt b/dplasma/lib/CMakeLists.txt
index 5b48d43fa..1860680a4 100644
--- a/dplasma/lib/CMakeLists.txt
+++ b/dplasma/lib/CMakeLists.txt
@@ -108,6 +108,7 @@ list(APPEND generated_jdf
 )
 
 jdf_rules(generated_files "${generated_jdf}")
+#set_source_files_properties(spotrf_U.c PROPERTIES COMPILE_FLAGS ${OpenMP_C_FLAGS})
 
 ### Generate the dplasma wrappers for all required precisions
 set(SOURCES
diff --git a/dplasma/lib/zgeqrf.jdf b/dplasma/lib/zgeqrf.jdf
index 689093423..5954522a1 100644
--- a/dplasma/lib/zgeqrf.jdf
+++ b/dplasma/lib/zgeqrf.jdf
@@ -402,8 +402,8 @@ BODY [type=CUDA device=%{ return n; %}]
     int ldak = BLKLDD( descA, k );
     int ldam = BLKLDD( descA, m );
 
-    WORK  = parsec_gpu_pop_workspace(gpu_device, gpu_stream, descA->nb * ib * sizeof(parsec_complex64_t));
-    WORKC = parsec_gpu_pop_workspace(gpu_device, gpu_stream, descA->mb * ib * sizeof(parsec_complex64_t));
+    WORK  = parsec_cuda_pop_workspace(gpu_device, gpu_stream, descA->nb * ib * sizeof(parsec_complex64_t));
+    WORKC = parsec_cuda_pop_workspace(gpu_device, gpu_stream, descA->mb * ib * sizeof(parsec_complex64_t));
 
     dplasma_cuda_ztsmqr( PlasmaLeft, PlasmaConjTrans,
                          descA->mb, tempnn, tempmm, tempnn, descA->nb, ib,
diff --git a/dplasma/lib/zpotrf_U.jdf b/dplasma/lib/zpotrf_U.jdf
index f96f1196f..682cd3e8d 100644
--- a/dplasma/lib/zpotrf_U.jdf
+++ b/dplasma/lib/zpotrf_U.jdf
@@ -290,6 +290,54 @@ RW    C <- (k == 0)   ? descA(m, n)  : C potrf_zgemm(m, n, k-1)
 
 ; (n >= (descA->nt - PRI_CHANGE)) ? (descA->nt - n) * (descA->nt - n) * (descA->nt - n) + 3 * ((2 * descA->nt) - m - n - 3) * (n - m) + 6 * (n - k) : PRI_MAX
 
+
+BODY [type=OPENMP
+      weight=(m+1-k)]
+{
+    int tempnn = n == descA->nt-1 ? descA->n - n * descA->nb : descA->nb;
+    int ldak = BLKLDD( descA, k );
+    int ldam = BLKLDD( descA, m );
+
+#if !defined(PARSEC_DRY_RUN)
+#if 0
+#pragma omp target nowait device(parsec_body.index) depend(out: parsec_openmp_stream[0])  is_device_ptr(A, B, C)
+//#pragma omp task depend(inout: parsec_openmp_stream[0])
+{
+    printf("Gemm %d %d %d A=%p, B=%p, C=%p device=%s event=%p (%d)\n", m, n, k, A, B, C, omp_is_initial_device()? "host": "offloaded", parsec_openmp_stream, parsec_openmp_stream[0]);
+    //for(int i = 0; i < m; i++) printf("C[0][%d]=%g\n", i, ((float*)C)[i]);
+#pragma omp declare target
+void CORE_zgemm(PLASMA_enum transA, int transB,
+                int M, int N, int K,
+                PLASMA_Complex64_t alpha, const PLASMA_Complex64_t *A, int LDA,
+                                          const PLASMA_Complex64_t *B, int LDB,
+                PLASMA_Complex64_t beta, PLASMA_Complex64_t *C, int LDC);
+#pragma omp end declare target
+    CORE_zgemm(PlasmaConjTrans, PlasmaNoTrans,
+               descA->mb, tempnn, descA->nb,
+               (parsec_complex64_t)-1.0, A /*A(k, m)*/, ldak,
+                                        B /*A(k, n)*/, ldak,
+               (parsec_complex64_t) 1.0, C /*A(m, n)*/, ldam);
+}
+#else
+    CORE_omp_zgemm(PlasmaConjTrans, PlasmaNoTrans,
+                   descA->mb, tempnn, descA->nb,
+                   (parsec_complex64_t)-1.0, A, ldak,
+                                             B, ldak,
+                   (parsec_complex64_t) 1.0, C, ldam, parsec_body.index, parsec_body.stream);
+#endif
+#endif  /* !defined(PARSEC_DRY_RUN) */
+
+printf("GEMM %d %d %d is on device %d", m, n, k, parsec_body.index);
+    printlog("CORE_zgemm( %d, %d, %d )\n\t( %s, %s, %d, %d, %d, %f, A(%d,%d)[%p], %d, A(%d,%d)[%p], %d, %f, A(%d,%d)[%p], %d)\n",
+             m, n, k,
+             plasma_const( PlasmaConjTrans ),  plasma_const( PlasmaNoTrans ),
+             descA->mb, tempnn, descA->nb,
+             -1.0, k, m, A, ldak,
+                   k, n, B, ldak,
+              1.0, m, n, C, ldam);
+}
+END
+
 BODY [type=CUDA
       dyld=cublasZgemm dyldtype=cublas_zgemm_t
       weight=(m+1-k)]
diff --git a/dplasma/tools/PrecisionGenerator/subs.py b/dplasma/tools/PrecisionGenerator/subs.py
index 89a7b0f9c..d93639892 100644
--- a/dplasma/tools/PrecisionGenerator/subs.py
+++ b/dplasma/tools/PrecisionGenerator/subs.py
@@ -265,6 +265,7 @@
     ('CORE_s',      'CORE_d',      'CORE_s',      'CORE_d'     ),
     ('core_ssy',    'core_dsy',    'core_che',    'core_zhe'   ),
     ('core_s',      'core_d',      'core_c',      'core_z'     ),
+    ('omp_s',       'omp_d',       'omp_c',       'omp_z'      ),
     ('coreblas_s',  'coreblas_d',  'coreblas_c',  'coreblas_z' ),
     ('cuda_s',      'cuda_d',      'cuda_s',      'cuda_d'     ),
     ('cuda_s',      'cuda_d',      'cuda_c',      'cuda_z'     ),
diff --git a/parsec/CMakeLists.txt b/parsec/CMakeLists.txt
index bedaa5eef..656c2dcc9 100644
--- a/parsec/CMakeLists.txt
+++ b/parsec/CMakeLists.txt
@@ -88,6 +88,20 @@ if (CUDA_FOUND)
   endif( PARSEC_WITH_DEVEL_HEADERS )
 endif (CUDA_FOUND)
 
+if (PARSEC_GPU_WITH_OPENMP)
+  find_package(OpenMP 4.5)
+  set(PARSEC_HAVE_OPENMP ${OPENMP_FOUND})
+  if (OPENMP_FOUND)
+    set_source_files_properties(devices/openmp/dev_omp.c PROPERTIES COMPILE_FLAGS ${OpenMP_C_FLAGS})
+    list(APPEND SOURCES devices/openmp/dev_omp.c)
+    if( PARSEC_WITH_DEVEL_HEADERS )
+        install(FILES
+          ${CMAKE_CURRENT_SOURCE_DIR}/devices/openmp/dev_omp.h
+          DESTINATION include/parsec/devices/openmp )
+    endif( PARSEC_WITH_DEVEL_HEADERS )
+  endif( OPENMP_FOUND )
+endif( PARSEC_GPU_WITH_OPENMP )
+
 if( PARSEC_PROF_GRAPHER )
   list(APPEND SOURCES parsec_prof_grapher.c)
 endif( PARSEC_PROF_GRAPHER )
@@ -112,6 +126,7 @@ if( BUILD_PARSEC )
   if (MPI_C_FOUND)
     set_target_properties(parsec PROPERTIES COMPILE_FLAGS "${MPI_C_COMPILE_FLAGS}")
   endif (MPI_C_FOUND)
+  set_target_properties(parsec PROPERTIES LINK_FLAGS "${OpenMP_C_FLAGS}")
   target_link_libraries(parsec parsec-base ${EXTRA_LIBS})
 
   install(TARGETS parsec
diff --git a/parsec/class/parsec_list.c b/parsec/class/parsec_list.c
index c9a86d2a9..ebdadaca0 100644
--- a/parsec/class/parsec_list.c
+++ b/parsec/class/parsec_list.c
@@ -49,3 +49,11 @@ OBJ_CLASS_INSTANCE(parsec_list_t, parsec_object_t,
                    parsec_list_construct, parsec_list_destruct);
 
 
+/* To be called from GDB, not from actual code */
+void parsec_list_debug_walker(parsec_list_t *list)
+{
+    parsec_list_item_t *p = (parsec_list_item_t *)list->ghost_element.list_next;
+    while (p != &(list->ghost_element)) {
+        p = (parsec_list_item_t *)p->list_next;
+    }
+}
diff --git a/parsec/devices/cuda/dev_cuda.c b/parsec/devices/cuda/dev_cuda.c
index 0944b1c40..df442a96e 100644
--- a/parsec/devices/cuda/dev_cuda.c
+++ b/parsec/devices/cuda/dev_cuda.c
@@ -28,6 +28,11 @@
 #include <dlfcn.h>
 #include <sys/stat.h>
 
+#define PARSEC_GPU_USE_PRIORITIES     1
+
+#define PARSEC_MAX_STREAMS            6
+#define PARSEC_MAX_EVENTS_PER_STREAM  4
+
 #if defined(PARSEC_PROF_TRACE)
 /* Accepted values are: PARSEC_PROFILE_CUDA_TRACK_DATA_IN | PARSEC_PROFILE_CUDA_TRACK_DATA_OUT |
  *                      PARSEC_PROFILE_CUDA_TRACK_OWN | PARSEC_PROFILE_CUDA_TRACK_EXEC
@@ -46,12 +51,12 @@ int parsec_cuda_output_stream = -1;
 static char* cuda_lib_path = NULL;
 
 static int
-parsec_cuda_memory_reserve( gpu_device_t* gpu_device,
+parsec_cuda_memory_reserve( parsec_cuda_device_t* gpu_device,
                            int           memory_percentage,
                            int           number_of_elements,
                            size_t        eltsize );
 static int
-parsec_cuda_memory_release( gpu_device_t* gpu_device );
+parsec_cuda_memory_release( parsec_cuda_device_t* gpu_device );
 
 static int cuda_legal_compute_capabilitites[] = {10, 11, 12, 13, 20, 21, 30, 32, 35, 37, 50, 52, 53, 60, 61, 62, 70};
 
@@ -113,7 +118,7 @@ static int parsec_cuda_device_lookup_cudamp_floprate(int major, int minor, int *
 
 static int parsec_cuda_device_fini(parsec_device_t* device)
 {
-    gpu_device_t* gpu_device = (gpu_device_t*)device;
+    parsec_cuda_device_t* gpu_device = (parsec_cuda_device_t*)device;
     cudaError_t status;
     int j, k;
 
@@ -129,7 +134,7 @@ static int parsec_cuda_device_fini(parsec_device_t* device)
 
     /* Release all streams */
     for( j = 0; j < gpu_device->max_exec_streams; j++ ) {
-        parsec_gpu_exec_stream_t* exec_stream = &(gpu_device->exec_stream[j]);
+        parsec_cuda_exec_stream_t* exec_stream = &(gpu_device->exec_stream[j]);
 
         exec_stream->max_events   = PARSEC_MAX_EVENTS_PER_STREAM;
         exec_stream->executed     = 0;
@@ -213,7 +218,7 @@ static int parsec_cuda_memory_unregister(parsec_device_t* device, parsec_data_co
 }
 
 
-void* cuda_find_incarnation(gpu_device_t* gpu_device,
+void* cuda_find_incarnation(parsec_cuda_device_t* gpu_device,
                                      const char* fname)
 {
     char library_name[FILENAME_MAX], function_name[FILENAME_MAX], *env;
@@ -324,7 +329,7 @@ void* cuda_find_incarnation(gpu_device_t* gpu_device,
 static int
 parsec_cuda_taskpool_register(parsec_device_t* device, parsec_taskpool_t* tp)
 {
-    gpu_device_t* gpu_device = (gpu_device_t*)device;
+    parsec_cuda_device_t* gpu_device = (parsec_cuda_device_t*)device;
     uint32_t i, j, dev_mask = 0x0;
     int32_t rc = PARSEC_ERR_NOT_FOUND;
 
@@ -348,7 +353,7 @@ parsec_cuda_taskpool_register(parsec_device_t* device, parsec_taskpool_t* tp)
                 else {
                     void* devf = cuda_find_incarnation(gpu_device, chores[j].dyld);
                     if( NULL != devf ) {
-                        chores[gpu_device->cuda_index].dyld_fn = devf;
+                        chores[j].dyld_fn = devf;
                         rc = PARSEC_SUCCESS;
                         dev_mask |= chores[j].type;
                     }
@@ -371,7 +376,7 @@ parsec_cuda_taskpool_unregister(parsec_device_t* device, parsec_taskpool_t* tp)
     return PARSEC_SUCCESS;
 }
 
-int parsec_gpu_init(parsec_context_t *parsec_context)
+int parsec_cuda_init(parsec_context_t *parsec_context)
 {
     int cuda_memory_block_size, cuda_memory_percentage, cuda_memory_number_of_blocks = -1;
     int show_caps_index, show_caps = 0;
@@ -461,7 +466,7 @@ int parsec_gpu_init(parsec_context_t *parsec_context)
 #endif  /* defined(PROFILING) */
 
     for( i = 0; i < ndevices; i++ ) {
-        gpu_device_t* gpu_device;
+        parsec_cuda_device_t* gpu_device;
         char *szName;
         int major, minor, concurrency, computemode, streaming_multiprocessor, drate, srate, trate, hrate;
         float clockRate;
@@ -483,7 +488,7 @@ int parsec_gpu_init(parsec_context_t *parsec_context)
         streaming_multiprocessor = prop.multiProcessorCount;
         computemode = prop.computeMode;
 
-        gpu_device = (gpu_device_t*)calloc(1, sizeof(gpu_device_t));
+        gpu_device = (parsec_cuda_device_t*)calloc(1, sizeof(parsec_cuda_device_t));
         OBJ_CONSTRUCT(gpu_device, parsec_list_item_t);
         gpu_device->cuda_index = (uint8_t)i;
         gpu_device->major      = (uint8_t)major;
@@ -492,10 +497,10 @@ int parsec_gpu_init(parsec_context_t *parsec_context)
 
         gpu_device->max_exec_streams = PARSEC_MAX_STREAMS;
         gpu_device->exec_stream =
-            (parsec_gpu_exec_stream_t*)malloc(gpu_device->max_exec_streams
-                                              * sizeof(parsec_gpu_exec_stream_t));
+            (parsec_cuda_exec_stream_t*)malloc(gpu_device->max_exec_streams
+                                              * sizeof(parsec_cuda_exec_stream_t));
         for( j = 0; j < gpu_device->max_exec_streams; j++ ) {
-            parsec_gpu_exec_stream_t* exec_stream = &(gpu_device->exec_stream[j]);
+            parsec_cuda_exec_stream_t* exec_stream = &(gpu_device->exec_stream[j]);
 
             /* Allocate the stream */
             cudastatus = cudaStreamCreate( &(exec_stream->cuda_stream) );
@@ -508,8 +513,8 @@ int parsec_gpu_init(parsec_context_t *parsec_context)
             exec_stream->end          = 0;
             exec_stream->fifo_pending = (parsec_list_t*)OBJ_NEW(parsec_list_t);
             OBJ_CONSTRUCT(exec_stream->fifo_pending, parsec_list_t);
-            exec_stream->tasks  = (parsec_gpu_context_t**)malloc(exec_stream->max_events
-                                                                 * sizeof(parsec_gpu_context_t*));
+            exec_stream->tasks  = (parsec_cuda_context_t**)malloc(exec_stream->max_events
+                                                                 * sizeof(parsec_cuda_context_t*));
             exec_stream->events = (cudaEvent_t*)malloc(exec_stream->max_events * sizeof(cudaEvent_t));
             /* and the corresponding events */
             for( k = 0; k < exec_stream->max_events; k++ ) {
@@ -592,27 +597,27 @@ int parsec_gpu_init(parsec_context_t *parsec_context)
 
 #if defined(PARSEC_HAVE_PEER_DEVICE_MEMORY_ACCESS)
     for( i = 0; i < ndevices; i++ ) {
-        gpu_device_t *source_gpu, *target_gpu;
+        parsec_cuda_device_t *source_gpu, *target_gpu;
         int canAccessPeer;
 
-        if( NULL == (source_gpu = (gpu_device_t*)parsec_devices_get(i)) ) continue;
+        if( NULL == (source_gpu = (parsec_cuda_device_t*)parsec_devices_get(i)) ) continue;
         /* Skip all non CUDA devices */
         if( PARSEC_DEV_CUDA != source_gpu->super.type ) continue;
 
         source_gpu->peer_access_mask = 0;
 
         for( j = 0; j < ndevices; j++ ) {
-            if( (NULL == (target_gpu = (gpu_device_t*)parsec_devices_get(j))) || (i == j) ) continue;
+            if( (NULL == (target_gpu = (parsec_cuda_device_t*)parsec_devices_get(j))) || (i == j) ) continue;
             /* Skip all non CUDA devices */
             if( PARSEC_DEV_CUDA != target_gpu->super.type ) continue;
 
             /* Communication mask */
             cudastatus = cudaDeviceCanAccessPeer( &canAccessPeer, source_gpu->cuda_index, target_gpu->cuda_index );
-            PARSEC_CUDA_CHECK_ERROR( "(parsec_gpu_init) cudaDeviceCanAccessPeer ", cudastatus,
+            PARSEC_CUDA_CHECK_ERROR( "(parsec_cuda_init) cudaDeviceCanAccessPeer ", cudastatus,
                                      {continue;} );
             if( 1 == canAccessPeer ) {
                 cudastatus = cudaDeviceEnablePeerAccess( target_gpu->cuda_index, 0 );
-                PARSEC_CUDA_CHECK_ERROR( "(parsec_gpu_init) cuCtxEnablePeerAccess ", cudastatus,
+                PARSEC_CUDA_CHECK_ERROR( "(parsec_cuda_init) cuCtxEnablePeerAccess ", cudastatus,
                                          {continue;} );
                 source_gpu->peer_access_mask = (int16_t)(source_gpu->peer_access_mask | (int16_t)(1 << target_gpu->cuda_index));
             }
@@ -623,13 +628,13 @@ int parsec_gpu_init(parsec_context_t *parsec_context)
     return 0;
 }
 
-int parsec_gpu_fini(void)
+int parsec_cuda_fini(void)
 {
-    gpu_device_t* gpu_device;
+    parsec_cuda_device_t* gpu_device;
     int i;
 
     for(i = 0; i < parsec_devices_enabled(); i++) {
-        if( NULL == (gpu_device = (gpu_device_t*)parsec_devices_get(i)) ) continue;
+        if( NULL == (gpu_device = (parsec_cuda_device_t*)parsec_devices_get(i)) ) continue;
         if(PARSEC_DEV_CUDA != gpu_device->super.type) continue;
         parsec_cuda_device_fini((parsec_device_t*)gpu_device);
         parsec_devices_remove((parsec_device_t*)gpu_device);
@@ -652,7 +657,7 @@ int parsec_gpu_fini(void)
  * can be reserved in a single allocation.
  */
 static int
-parsec_cuda_memory_reserve( gpu_device_t* gpu_device,
+parsec_cuda_memory_reserve( parsec_cuda_device_t* gpu_device,
                            int           memory_percentage,
                            int           number_blocks,
                            size_t        eltsize )
@@ -706,7 +711,7 @@ parsec_cuda_memory_reserve( gpu_device_t* gpu_device,
      */
     while( (free_mem > eltsize )
            && ((total_mem - free_mem) < how_much_we_allocate) ) {
-        parsec_gpu_data_copy_t* gpu_elem;
+        parsec_cuda_data_copy_t* gpu_elem;
         void *device_ptr;
 
         status = (cudaError_t)cudaMalloc( &device_ptr, eltsize);
@@ -767,13 +772,13 @@ parsec_cuda_memory_reserve( gpu_device_t* gpu_device,
     return PARSEC_SUCCESS;
 }
 
-static void parsec_cuda_memory_release_list(gpu_device_t* gpu_device,
+static void parsec_cuda_memory_release_list(parsec_cuda_device_t* gpu_device,
                                            parsec_list_t* list)
 {
     parsec_list_item_t* item;
 
     while(NULL != (item = parsec_list_nolock_fifo_pop(list)) ) {
-        parsec_gpu_data_copy_t* gpu_copy = (parsec_gpu_data_copy_t*)item;
+        parsec_cuda_data_copy_t* gpu_copy = (parsec_cuda_data_copy_t*)item;
         parsec_data_t* original = gpu_copy->original;
 
         PARSEC_DEBUG_VERBOSE(20, parsec_cuda_output_stream,
@@ -809,12 +814,12 @@ static void parsec_cuda_memory_release_list(gpu_device_t* gpu_device,
  * should be enough to enforce a clean release.
  */
 static int
-parsec_cuda_memory_release( gpu_device_t* gpu_device )
+parsec_cuda_memory_release( parsec_cuda_device_t* gpu_device )
 {
     cudaError_t status;
 
 #if 0
-    dump_GPU_state(gpu_device); // debug only
+    dump_cuda_device(gpu_device); // debug only
 #endif
     status = cudaSetDevice( gpu_device->cuda_index );
     PARSEC_CUDA_CHECK_ERROR( "(parsec_cuda_memory_release) cudaSetDevice ", status,
@@ -846,11 +851,11 @@ parsec_cuda_memory_release( gpu_device_t* gpu_device )
  *   -2: The task needs to rescheduled
  */
 static inline int
-parsec_gpu_data_reserve_device_space( gpu_device_t* gpu_device,
-                                     parsec_gpu_context_t *gpu_task )
+parsec_cuda_data_reserve_device_space( parsec_cuda_device_t* gpu_device,
+                                     parsec_cuda_context_t *gpu_task )
 {
     parsec_task_t *this_task = gpu_task->ec;
-    parsec_gpu_data_copy_t* temp_loc[MAX_PARAM_COUNT], *gpu_elem, *lru_gpu_elem;
+    parsec_cuda_data_copy_t* temp_loc[MAX_PARAM_COUNT], *gpu_elem, *lru_gpu_elem;
     parsec_data_t* master, *oldmaster;
     const parsec_flow_t *flow;
     int i, j;
@@ -886,7 +891,7 @@ parsec_gpu_data_reserve_device_space( gpu_device_t* gpu_device,
 
           find_another_data:
             /* Look for a data_copy to free */
-            lru_gpu_elem = (parsec_gpu_data_copy_t*)parsec_list_nolock_fifo_pop(&gpu_device->gpu_mem_lru);
+            lru_gpu_elem = (parsec_cuda_data_copy_t*)parsec_list_nolock_fifo_pop(&gpu_device->gpu_mem_lru);
             if( NULL == lru_gpu_elem ) {
                 /* We can't find enough room on the GPU. Insert the tiles in the begining of
                  * the LRU (in order to be reused asap) and return without scheduling the task.
@@ -912,7 +917,7 @@ parsec_gpu_data_reserve_device_space( gpu_device_t* gpu_device,
             assert( NULL != lru_gpu_elem );
 
             /* If there are pending readers, let the gpu_elem loose. This is a weak coordination
-             * protocol between here and the parsec_gpu_data_stage_in, where the readers don't necessarily
+             * protocol between here and the parsec_cuda_data_stage_in, where the readers don't necessarily
              * always remove the data from the LRU.
              */
             if( 0 != lru_gpu_elem->readers ) {
@@ -973,15 +978,15 @@ parsec_gpu_data_reserve_device_space( gpu_device_t* gpu_device,
  *   -1: A copy cannot be issued due to CUDA.
  */
 static inline int
-parsec_gpu_data_stage_in( gpu_device_t* gpu_device,
+parsec_cuda_data_stage_in( parsec_cuda_device_t* gpu_device,
                          int32_t type,
                          parsec_data_pair_t* task_data,
-                         parsec_gpu_context_t *gpu_task,
-                         parsec_gpu_exec_stream_t *gpu_stream )
+                         parsec_cuda_context_t *gpu_task,
+                         parsec_cuda_exec_stream_t *gpu_stream )
 {
     parsec_data_copy_t* in_elem = task_data->data_in;
     parsec_data_t* original = in_elem->original;
-    parsec_gpu_data_copy_t* gpu_elem = task_data->data_out;
+    parsec_cuda_data_copy_t* gpu_elem = task_data->data_out;
     int transfer_from = -1;
 
     /**
@@ -1067,18 +1072,18 @@ parsec_gpu_data_stage_in( gpu_device_t* gpu_device,
     return 0;
 }
 
-void* parsec_gpu_pop_workspace(gpu_device_t* gpu_device, parsec_gpu_exec_stream_t* gpu_stream, size_t size)
+void* parsec_cuda_pop_workspace(parsec_cuda_device_t* gpu_device, parsec_cuda_exec_stream_t* gpu_stream, size_t size)
 {
     (void)gpu_device; (void)gpu_stream; (void)size;
     void *work = NULL;
 
 #if !defined(PARSEC_GPU_CUDA_ALLOC_PER_TILE)
     if (gpu_stream->workspace == NULL) {
-        gpu_stream->workspace = (parsec_gpu_workspace_t *)malloc(sizeof(parsec_gpu_workspace_t));
-        gpu_stream->workspace->total_workspace = PARSEC_GPU_MAX_WORKSPACE;
-        gpu_stream->workspace->stack_head = PARSEC_GPU_MAX_WORKSPACE - 1;
+        gpu_stream->workspace = (parsec_cuda_workspace_t *)malloc(sizeof(parsec_cuda_workspace_t));
+        gpu_stream->workspace->total_workspace = PARSEC_CUDA_MAX_WORKSPACE;
+        gpu_stream->workspace->stack_head = PARSEC_CUDA_MAX_WORKSPACE - 1;
 
-        for( int i = 0; i < PARSEC_GPU_MAX_WORKSPACE; i++ ) {
+        for( int i = 0; i < PARSEC_CUDA_MAX_WORKSPACE; i++ ) {
             gpu_stream->workspace->workspace[i] = zone_malloc( gpu_device->memory, size);
         }
     }
@@ -1089,23 +1094,23 @@ void* parsec_gpu_pop_workspace(gpu_device_t* gpu_device, parsec_gpu_exec_stream_
     return work;
 }
 
-int parsec_gpu_push_workspace(gpu_device_t* gpu_device, parsec_gpu_exec_stream_t* gpu_stream)
+int parsec_cuda_push_workspace(parsec_cuda_device_t* gpu_device, parsec_cuda_exec_stream_t* gpu_stream)
 {
     (void)gpu_device; (void)gpu_stream;
 #if !defined(PARSEC_GPU_CUDA_ALLOC_PER_TILE)
     gpu_stream->workspace->stack_head ++;
-    assert (gpu_stream->workspace->stack_head < PARSEC_GPU_MAX_WORKSPACE);
+    assert (gpu_stream->workspace->stack_head < PARSEC_CUDA_MAX_WORKSPACE);
 #endif /* !defined(PARSEC_GPU_CUDA_ALLOC_PER_TILE) */
     return 0;
 }
 
-int parsec_gpu_free_workspace(gpu_device_t * gpu_device)
+int parsec_cuda_free_workspace(parsec_cuda_device_t * gpu_device)
 {
     (void)gpu_device;
 #if !defined(PARSEC_GPU_CUDA_ALLOC_PER_TILE)
     int i, j;
     for( i = 0; i < gpu_device->max_exec_streams; i++ ) {
-        parsec_gpu_exec_stream_t *gpu_stream = &(gpu_device->exec_stream[i]);
+        parsec_cuda_exec_stream_t *gpu_stream = &(gpu_device->exec_stream[i]);
         if (gpu_stream->workspace != NULL) {
             for (j = 0; j < gpu_stream->workspace->total_workspace; j++) {
                 zone_free( gpu_device->memory, gpu_stream->workspace->workspace[j] );
@@ -1118,7 +1123,7 @@ int parsec_gpu_free_workspace(gpu_device_t * gpu_device)
     return 0;
 }
 
-static inline int parsec_gpu_check_space_needed(gpu_device_t *gpu_device, parsec_gpu_context_t *gpu_task)
+static inline int parsec_cuda_check_space_needed(parsec_cuda_device_t *gpu_device, parsec_cuda_context_t *gpu_task)
 {
     int i;
     int space_needed = 0;
@@ -1142,16 +1147,8 @@ static inline int parsec_gpu_check_space_needed(gpu_device_t *gpu_device, parsec
     return space_needed;
 }
 
-void dump_list(parsec_list_t *list)
-{
-    parsec_list_item_t *p = (parsec_list_item_t *)list->ghost_element.list_next;
-    while (p != &(list->ghost_element)) {
-        p = (parsec_list_item_t *)p->list_next;
-    }
-}
 
-
-int parsec_gpu_sort_pending_list(gpu_device_t *gpu_device)
+static int parsec_cuda_sort_pending_list(parsec_cuda_device_t *gpu_device)
 {
     //parsec_list_t *sort_list = &(gpu_device->pending);
     parsec_list_t *sort_list = gpu_device->exec_stream[0].fifo_pending;
@@ -1182,12 +1179,12 @@ int parsec_gpu_sort_pending_list(gpu_device_t *gpu_device)
         }
         min_p = p; /* assume the minimum one is the first one p */
         q = (parsec_list_item_t*)min_p->list_next;
-        space_min = parsec_gpu_check_space_needed(gpu_device, (parsec_gpu_context_t*)min_p);
+        space_min = parsec_cuda_check_space_needed(gpu_device, (parsec_cuda_context_t*)min_p);
         for (j = i+1; j < NB_SORT; j++) {
             if ( q == &(sort_list->ghost_element) ) {
                 break;
             }
-            space_q = parsec_gpu_check_space_needed(gpu_device, (parsec_gpu_context_t*)q);
+            space_q = parsec_cuda_check_space_needed(gpu_device, (parsec_cuda_context_t*)q);
             if ( space_min > space_q ) {
                 min_p = q;
                 space_min = space_q;
@@ -1214,27 +1211,27 @@ int parsec_gpu_sort_pending_list(gpu_device_t *gpu_device)
 }
 
 /**
- * Transfer at most the PARSEC_GPU_W2R_NB_MOVE_OUT oldest data from the GPU back
+ * Transfer at most the PARSEC_CUDA_W2R_NB_MOVE_OUT oldest data from the GPU back
  * to main memory. Create a single task to move them all out, then switch the
  * GPU data copy in shared mode.
  */
-parsec_gpu_context_t* parsec_gpu_create_W2R_task(gpu_device_t *gpu_device, parsec_execution_stream_t *es)
+static parsec_cuda_context_t* parsec_cuda_create_W2R_task(parsec_cuda_device_t *gpu_device, parsec_execution_stream_t *es)
 {
-    parsec_gpu_context_t *w2r_task = NULL;
+    parsec_cuda_context_t *w2r_task = NULL;
     parsec_task_t *ec = NULL;
-    parsec_gpu_data_copy_t *gpu_copy;
+    parsec_cuda_data_copy_t *gpu_copy;
     parsec_data_t* original;
     parsec_list_item_t* item = (parsec_list_item_t*)gpu_device->gpu_mem_owned_lru.ghost_element.list_next;
     int nb_cleaned = 0;
 
     /* Find a data copy that has no pending users on the GPU, and can be
      * safely moved back on the main memory */
-    while(nb_cleaned < PARSEC_GPU_W2R_NB_MOVE_OUT) {
+    while(nb_cleaned < PARSEC_CUDA_W2R_NB_MOVE_OUT) {
         /* Break at the end of the list */
         if( item == &(gpu_device->gpu_mem_owned_lru.ghost_element) ) {
             break;
         }
-        gpu_copy = (parsec_gpu_data_copy_t*)item;
+        gpu_copy = (parsec_cuda_data_copy_t*)item;
         original = gpu_copy->original;
         if( (0 != gpu_copy->readers) || (0 != original->device_copies[0]->readers) ) {
             item = (parsec_list_item_t*)item->list_next;  /* conversion needed for volatile */
@@ -1256,7 +1253,7 @@ parsec_gpu_context_t* parsec_gpu_create_W2R_task(gpu_device_t *gpu_device, parse
     if( 0 == nb_cleaned )
         return NULL;
 
-    w2r_task = (parsec_gpu_context_t *)malloc(sizeof(parsec_gpu_context_t));
+    w2r_task = (parsec_cuda_context_t *)malloc(sizeof(parsec_cuda_context_t));
     OBJ_CONSTRUCT(w2r_task, parsec_list_item_t);
     ec->priority = INT32_MAX;
     ec->task_class = NULL;
@@ -1268,17 +1265,17 @@ parsec_gpu_context_t* parsec_gpu_create_W2R_task(gpu_device_t *gpu_device, parse
 /**
  * Complete a data copy transfer originated from the engine.
  */
-int parsec_gpu_W2R_task_fini(gpu_device_t *gpu_device,
-                            parsec_gpu_context_t *w2r_task,
+static int parsec_cuda_W2R_task_fini(parsec_cuda_device_t *gpu_device,
+                            parsec_cuda_context_t *w2r_task,
                             parsec_execution_stream_t *es)
 {
-    parsec_gpu_data_copy_t *gpu_copy, *cpu_copy;
+    parsec_cuda_data_copy_t *gpu_copy, *cpu_copy;
     parsec_task_t *ec = w2r_task->ec;
     parsec_data_t* original;
     int i;
 
     assert(w2r_task->task_type == GPU_TASK_TYPE_D2HTRANSFER);
-    for( i = 0; (i < PARSEC_GPU_W2R_NB_MOVE_OUT) && (NULL != (gpu_copy = ec->data[i].data_out)); i++ ) {
+    for( i = 0; (i < PARSEC_CUDA_W2R_NB_MOVE_OUT) && (NULL != (gpu_copy = ec->data[i].data_out)); i++ ) {
         gpu_copy->coherency_state = DATA_COHERENCY_SHARED;
         original = gpu_copy->original;
         cpu_copy = original->device_copies[0];
@@ -1297,69 +1294,7 @@ int parsec_gpu_W2R_task_fini(gpu_device_t *gpu_device,
 }
 
 
-/**
- * Try to find the best device to execute the kernel based on the compute
- * capability of the card.
- *
- * Returns:
- *  > 1    - if the kernel should be executed by the a GPU
- *  0 or 1 - if the kernel should be executed by some other meaning (in this case the
- *         execution context is not released).
- * -1      - if the kernel is scheduled to be executed on a GPU.
- */
-
-/**
- * This version is based on 4 streams: one for transfers from the memory to
- * the GPU, 2 for kernel executions and one for tranfers from the GPU into
- * the main memory. The synchronization on each stream is based on CUDA events,
- * such an event indicate that a specific epoch of the lifetime of a task has
- * been completed. Each type of stream (in, exec and out) has a pending FIFO,
- * where tasks ready to jump to the respective step are waiting.
- */
-int parsec_gpu_get_best_device( parsec_task_t* this_task, double ratio )
-{
-    int i, dev_index = -1, data_index = 0;
-    parsec_taskpool_t* tp = this_task->taskpool;
-
-    /* Step one: Find the first data in WRITE mode stored on a GPU */
-    for( i = 0; i < this_task->task_class->nb_flows; i++ ) {
-        if( (NULL != this_task->task_class->out[i]) &&
-            (this_task->task_class->out[i]->flow_flags & FLOW_ACCESS_WRITE) ) {
-            data_index = this_task->task_class->out[i]->flow_index;
-            dev_index  = this_task->data[data_index].data_in->original->owner_device;
-            if (dev_index > 1) {
-                break;
-            }
-        }
-    }
-    assert(dev_index >= 0);
-
-    /* 0 is CPU, and 1 is recursive device */
-    if( dev_index <= 1 ) {  /* This is the first time we see this data for a GPU.
-                             * Let's decide which GPU will work on it. */
-        int best_index = 0;  /* default value: first CPU device */
-        float weight, best_weight = parsec_device_load[0] + ratio * parsec_device_sweight[0];
-
-        /* Start at 2, to skip the recursive body */
-        for( dev_index = 2; dev_index < parsec_devices_enabled(); dev_index++ ) {
-            /* Skip the device if it is not configured */
-            if(!(tp->devices_mask & (1 << dev_index))) continue;
-            weight = parsec_device_load[dev_index] + ratio * parsec_device_sweight[dev_index];
-            if( best_weight > weight ) {
-                best_index = dev_index;
-                best_weight = weight;
-            }
-        }
-        parsec_device_load[best_index] += ratio * parsec_device_sweight[best_index];
-        assert( best_index != 1 );
-        dev_index = best_index;
-    }
-
-    return dev_index;
-}
-
 #if PARSEC_GPU_USE_PRIORITIES
-
 static inline parsec_list_item_t* parsec_fifo_push_ordered( parsec_list_t* fifo,
                                                           parsec_list_item_t* elem )
 {
@@ -1372,11 +1307,11 @@ static inline parsec_list_item_t* parsec_fifo_push_ordered( parsec_list_t* fifo,
 #endif
 
 static inline int
-progress_stream( gpu_device_t* gpu_device,
-                 parsec_gpu_exec_stream_t* exec_stream,
-                 advance_task_function_t progress_fct,
-                 parsec_gpu_context_t* task,
-                 parsec_gpu_context_t** out_task )
+progress_stream( parsec_cuda_device_t* gpu_device,
+                 parsec_cuda_exec_stream_t* exec_stream,
+                 advance_cuda_task_function_t progress_fct,
+                 parsec_cuda_context_t* task,
+                 parsec_cuda_context_t** out_task )
 {
     int saved_rc = 0, rc, i;
     *out_task = NULL;
@@ -1390,7 +1325,7 @@ progress_stream( gpu_device_t* gpu_device,
   grab_a_task:
     if( NULL == exec_stream->tasks[exec_stream->start] ) {
         /* get the best task */
-        task = (parsec_gpu_context_t*)parsec_list_nolock_fifo_pop(exec_stream->fifo_pending);
+        task = (parsec_cuda_context_t*)parsec_list_nolock_fifo_pop(exec_stream->fifo_pending);
     }
     if( NULL == task ) {
         /* No more room on the event list or no tasks. Keep moving */
@@ -1469,7 +1404,7 @@ progress_stream( gpu_device_t* gpu_device,
              * actually ready to use
              */
             if (exec_stream == &(gpu_device->exec_stream[0])) {  /* exec_stream[0] is the PUSH stream */
-                parsec_gpu_context_t *gtask = exec_stream->tasks[exec_stream->end];
+                parsec_cuda_context_t *gtask = exec_stream->tasks[exec_stream->end];
                 this_task = gtask->ec;
                 for( i = 0; i < this_task->task_class->nb_flows; i++ ) {
                     flow = gtask->flow[i];
@@ -1540,7 +1475,7 @@ progress_stream( gpu_device_t* gpu_device,
     return saved_rc;
 }
 
-void dump_exec_stream(parsec_gpu_exec_stream_t* exec_stream)
+void dump_cuda_stream(parsec_cuda_exec_stream_t* exec_stream)
 {
     char task_str[128];
     int i;
@@ -1557,7 +1492,7 @@ void dump_exec_stream(parsec_gpu_exec_stream_t* exec_stream)
     /* Don't yet dump the fifo_pending queue */
 }
 
-void dump_GPU_state(gpu_device_t* gpu_device)
+void dump_cuda_device(parsec_cuda_device_t* gpu_device)
 {
     int i;
 
@@ -1569,14 +1504,14 @@ void dump_GPU_state(gpu_device_t* gpu_device)
                  (unsigned long long)gpu_device->super.transferred_data_in, (unsigned long long)gpu_device->super.transferred_data_out,
                  (unsigned long long)gpu_device->super.required_data_in, (unsigned long long)gpu_device->super.required_data_out);
     for( i = 0; i < gpu_device->max_exec_streams; i++ ) {
-        dump_exec_stream(&gpu_device->exec_stream[i]);
+        dump_cuda_stream(&gpu_device->exec_stream[i]);
     }
     if( !parsec_list_nolock_is_empty(&gpu_device->gpu_mem_lru) ) {
         parsec_output(parsec_cuda_output_stream, "#\n# LRU list\n#\n");
         i = 0;
         PARSEC_LIST_NOLOCK_ITERATOR(&gpu_device->gpu_mem_lru, item,
                              {
-                                 parsec_gpu_data_copy_t* gpu_copy = (parsec_gpu_data_copy_t*)item;
+                                 parsec_cuda_data_copy_t* gpu_copy = (parsec_cuda_data_copy_t*)item;
                                  parsec_output(parsec_cuda_output_stream, "  %d. elem %p GPU mem %p\n", i, gpu_copy, gpu_copy->device_private);
                                  parsec_dump_data_copy(gpu_copy);
                                  i++;
@@ -1587,7 +1522,7 @@ void dump_GPU_state(gpu_device_t* gpu_device)
         i = 0;
         PARSEC_LIST_NOLOCK_ITERATOR(&gpu_device->gpu_mem_owned_lru, item,
                              {
-                                 parsec_gpu_data_copy_t* gpu_copy = (parsec_gpu_data_copy_t*)item;
+                                 parsec_cuda_data_copy_t* gpu_copy = (parsec_cuda_data_copy_t*)item;
                                  parsec_output(parsec_cuda_output_stream, "  %d. elem %p GPU mem %p\n", i, gpu_copy, gpu_copy->device_private);
                                  parsec_dump_data_copy(gpu_copy);
                                  i++;
@@ -1606,9 +1541,9 @@ void dump_GPU_state(gpu_device_t* gpu_device)
  *     -2: No more room on the GPU to move this data.
  */
 int
-parsec_gpu_kernel_push( gpu_device_t            *gpu_device,
-                       parsec_gpu_context_t     *gpu_task,
-                       parsec_gpu_exec_stream_t *gpu_stream)
+parsec_cuda_kernel_push( parsec_cuda_device_t            *gpu_device,
+                       parsec_cuda_context_t     *gpu_task,
+                       parsec_cuda_exec_stream_t *gpu_stream)
 {
     int i, ret = 0;
     parsec_task_t *this_task = gpu_task->ec;
@@ -1625,7 +1560,7 @@ parsec_gpu_kernel_push( gpu_device_t            *gpu_device,
     /**
      * First, let's reserve enough space on the device to transfer the data on the GPU.
      */
-    ret = parsec_gpu_data_reserve_device_space( gpu_device,
+    ret = parsec_cuda_data_reserve_device_space( gpu_device,
                                                gpu_task );
     if( ret < 0 ) {
         return ret;
@@ -1653,7 +1588,7 @@ parsec_gpu_kernel_push( gpu_device_t            *gpu_device,
                             "GPU[%1d]:\t\tIN  Data of %s <%x> on GPU\n",
                             gpu_device->cuda_index, flow->name,
                             this_task->data[i].data_out->original->key);
-        ret = parsec_gpu_data_stage_in( gpu_device, flow->flow_flags,
+        ret = parsec_cuda_data_stage_in( gpu_device, flow->flow_flags,
                                        &(this_task->data[i]), gpu_task, gpu_stream );
         if( ret < 0 ) {
             return ret;
@@ -1676,12 +1611,12 @@ parsec_gpu_kernel_push( gpu_device_t            *gpu_device,
  *           positive: the number of data to be moved.
  */
 int
-parsec_gpu_kernel_pop( gpu_device_t            *gpu_device,
-                      parsec_gpu_context_t     *gpu_task,
-                      parsec_gpu_exec_stream_t *gpu_stream)
+parsec_cuda_kernel_pop( parsec_cuda_device_t            *gpu_device,
+                      parsec_cuda_context_t     *gpu_task,
+                      parsec_cuda_exec_stream_t *gpu_stream)
 {
     parsec_task_t *this_task = gpu_task->ec;
-    parsec_gpu_data_copy_t     *gpu_copy;
+    parsec_cuda_data_copy_t     *gpu_copy;
     parsec_data_t              *original;
     const parsec_flow_t        *flow;
     int return_code = 0, how_many = 0, i;
@@ -1793,11 +1728,11 @@ parsec_gpu_kernel_pop( gpu_device_t            *gpu_device,
  * Make sure all data on the device is correctly put back into the queues.
  */
 int
-parsec_gpu_kernel_epilog( gpu_device_t        *gpu_device,
-                         parsec_gpu_context_t *gpu_task )
+parsec_cuda_kernel_epilog( parsec_cuda_device_t        *gpu_device,
+                         parsec_cuda_context_t *gpu_task )
 {
     parsec_task_t *this_task = gpu_task->ec;
-    parsec_gpu_data_copy_t     *gpu_copy, *cpu_copy;
+    parsec_cuda_data_copy_t     *gpu_copy, *cpu_copy;
     parsec_data_t              *original;
     int i;
 
@@ -1883,11 +1818,11 @@ parsec_gpu_kernel_epilog( gpu_device_t        *gpu_device,
  * @return Currently only success.
  */
 int
-parsec_gpu_kernel_cleanout( gpu_device_t        *gpu_device,
-                            parsec_gpu_context_t *gpu_task )
+parsec_cuda_kernel_cleanout( parsec_cuda_device_t        *gpu_device,
+                            parsec_cuda_context_t *gpu_task )
 {
     parsec_task_t *this_task = gpu_task->ec;
-    parsec_gpu_data_copy_t     *gpu_copy, *cpu_copy;
+    parsec_cuda_data_copy_t     *gpu_copy, *cpu_copy;
     parsec_data_t              *original;
     int i;
 
@@ -1937,19 +1872,19 @@ parsec_gpu_kernel_cleanout( gpu_device_t        *gpu_device,
  * where tasks ready to jump to the respective step are waiting.
  */
 parsec_hook_return_t
-parsec_gpu_kernel_scheduler( parsec_execution_stream_t *es,
-                            parsec_gpu_context_t    *gpu_task,
+parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es,
+                            parsec_cuda_context_t    *gpu_task,
                             int which_gpu )
 {
-    gpu_device_t* gpu_device;
+    parsec_cuda_device_t* gpu_device;
     cudaError_t status;
     int rc, exec_stream = 0;
-    parsec_gpu_context_t *progress_task, *out_task_push, *out_task_submit = NULL, *out_task_pop = NULL;
+    parsec_cuda_context_t *progress_task, *out_task_push, *out_task_submit = NULL, *out_task_pop = NULL;
 #if defined(PARSEC_DEBUG_NOISIER)
     char tmp[MAX_TASK_STRLEN];
 #endif
 
-    gpu_device = (gpu_device_t*)parsec_devices_get(which_gpu);
+    gpu_device = (parsec_cuda_device_t*)parsec_devices_get(which_gpu);
 
 #if defined(PARSEC_PROF_TRACE)
     PARSEC_PROFILING_TRACE_FLAGS( es->es_profile,
@@ -1974,7 +1909,7 @@ parsec_gpu_kernel_scheduler( parsec_execution_stream_t *es,
 #endif  /* defined(PARSEC_PROF_TRACE) */
 
     status = cudaSetDevice( gpu_device->cuda_index );
-    PARSEC_CUDA_CHECK_ERROR( "(parsec_gpu_kernel_scheduler) cudaSetDevice ", status,
+    PARSEC_CUDA_CHECK_ERROR( "(parsec_cuda_kernel_scheduler) cudaSetDevice ", status,
                             {return PARSEC_HOOK_RETURN_DISABLE;} );
 
   check_in_deps:
@@ -1987,7 +1922,7 @@ parsec_gpu_kernel_scheduler( parsec_execution_stream_t *es,
     }
     rc = progress_stream( gpu_device,
                           &(gpu_device->exec_stream[0]),
-                          parsec_gpu_kernel_push,
+                          parsec_cuda_kernel_push,
                           gpu_task, &progress_task );
     if( rc < 0 ) {
         if( -1 == rc )
@@ -2015,7 +1950,7 @@ parsec_gpu_kernel_scheduler( parsec_execution_stream_t *es,
                another incarnation of the task will be executed. */
             if( NULL != progress_task ) {
                 __parsec_reschedule(es, progress_task->ec);
-                parsec_gpu_kernel_cleanout(gpu_device, progress_task);
+                parsec_cuda_kernel_cleanout(gpu_device, progress_task);
                 gpu_task = progress_task;
                 progress_task = NULL;
                 goto remove_gpu_task;
@@ -2035,12 +1970,12 @@ parsec_gpu_kernel_scheduler( parsec_execution_stream_t *es,
                             gpu_task->ec->priority );
     }
     if (out_task_submit == NULL && out_task_push == NULL) {
-        gpu_task = parsec_gpu_create_W2R_task(gpu_device, es);
+        gpu_task = parsec_cuda_create_W2R_task(gpu_device, es);
     }
     /* Task is ready to move the data back to main memory */
     rc = progress_stream( gpu_device,
                           &(gpu_device->exec_stream[1]),
-                          parsec_gpu_kernel_pop,
+                          parsec_cuda_kernel_pop,
                           gpu_task, &progress_task );
     if( rc < 0 ) {
         if( -1 == rc )
@@ -2061,9 +1996,9 @@ parsec_gpu_kernel_scheduler( parsec_execution_stream_t *es,
   fetch_task_from_shared_queue:
     assert( NULL == gpu_task );
     if (out_task_submit == NULL && out_task_pop == NULL) {
-        parsec_gpu_sort_pending_list(gpu_device);
+        parsec_cuda_sort_pending_list(gpu_device);
     }
-    gpu_task = (parsec_gpu_context_t*)parsec_fifo_try_pop( &(gpu_device->pending) );
+    gpu_task = (parsec_cuda_context_t*)parsec_fifo_try_pop( &(gpu_device->pending) );
     if( NULL != gpu_task ) {
         PARSEC_DEBUG_VERBOSE(10, parsec_debug_output,  "GPU[%1d]:\tGet from shared queue %s priority %d", gpu_device->cuda_index,
                             parsec_task_snprintf(tmp, MAX_TASK_STRLEN, gpu_task->ec),
@@ -2079,11 +2014,11 @@ parsec_gpu_kernel_scheduler( parsec_execution_stream_t *es,
     /* Everything went fine so far, the result is correct and back in the main memory */
     PARSEC_LIST_ITEM_SINGLETON(gpu_task);
     if (gpu_task->task_type == GPU_TASK_TYPE_D2HTRANSFER) {
-        parsec_gpu_W2R_task_fini(gpu_device, gpu_task, es);
+        parsec_cuda_W2R_task_fini(gpu_device, gpu_task, es);
         gpu_task = progress_task;
         goto fetch_task_from_shared_queue;
     }
-    parsec_gpu_kernel_epilog( gpu_device, gpu_task );
+    parsec_cuda_kernel_epilog( gpu_device, gpu_task );
     __parsec_complete_execution( es, gpu_task->ec );
     gpu_device->super.executed_tasks++;
   remove_gpu_task:
diff --git a/parsec/devices/cuda/dev_cuda.h b/parsec/devices/cuda/dev_cuda.h
index 8cb45d61e..c895759b8 100644
--- a/parsec/devices/cuda/dev_cuda.h
+++ b/parsec/devices/cuda/dev_cuda.h
@@ -4,8 +4,8 @@
  *                         reserved.
  */
 
-#ifndef PARSEC_GPU_DATA_H_HAS_BEEN_INCLUDED
-#define PARSEC_GPU_DATA_H_HAS_BEEN_INCLUDED
+#ifndef PARSEC_CUDA_DATA_H_HAS_BEEN_INCLUDED
+#define PARSEC_CUDA_DATA_H_HAS_BEEN_INCLUDED
 
 #include "parsec/parsec_internal.h"
 #include "parsec/class/parsec_object.h"
@@ -23,12 +23,14 @@
 
 BEGIN_C_DECLS
 
-#define PARSEC_GPU_USE_PRIORITIES     1
+#define PARSEC_CUDA_MAX_WORKSPACE      2
+#define PARSEC_CUDA_W2R_NB_MOVE_OUT    1
 
-#define PARSEC_MAX_STREAMS            6
-#define PARSEC_MAX_EVENTS_PER_STREAM  4
-#define PARSEC_GPU_MAX_WORKSPACE      2
-#define PARSEC_GPU_W2R_NB_MOVE_OUT    1
+typedef struct __parsec_cuda_workspace {
+    void* workspace[PARSEC_CUDA_MAX_WORKSPACE];
+    int stack_head;
+    int total_workspace;
+} parsec_cuda_workspace_t;
 
 #if defined(PARSEC_PROF_TRACE)
 #define PARSEC_PROFILE_CUDA_TRACK_DATA_IN  0x0001
@@ -47,45 +49,37 @@ extern int parsec_cuda_own_GPU_key_end;
 
 #define GPU_TASK_TYPE_D2HTRANSFER 111
 
-extern float *device_load, *device_weight;
+struct __parsec_cuda_context;
+typedef struct __parsec_cuda_context parsec_cuda_context_t;
 
-typedef struct __parsec_gpu_workspace {
-    void* workspace[PARSEC_GPU_MAX_WORKSPACE];
-    int stack_head;
-    int total_workspace;
-} parsec_gpu_workspace_t;
-
-struct __parsec_gpu_context;
-typedef struct __parsec_gpu_context parsec_gpu_context_t;
-
-struct __parsec_gpu_exec_stream;
-typedef struct __parsec_gpu_exec_stream parsec_gpu_exec_stream_t;
+struct __parsec_cuda_exec_stream;
+typedef struct __parsec_cuda_exec_stream parsec_cuda_exec_stream_t;
 
 struct _gpu_device;
-typedef struct _gpu_device gpu_device_t;
+typedef struct _gpu_device parsec_cuda_device_t;
 
-typedef int (*advance_task_function_t)(gpu_device_t            *gpu_device,
-                                       parsec_gpu_context_t     *gpu_task,
-                                       parsec_gpu_exec_stream_t *gpu_stream);
+typedef int (*advance_cuda_task_function_t)(parsec_cuda_device_t            *gpu_device,
+                                       parsec_cuda_context_t     *gpu_task,
+                                       parsec_cuda_exec_stream_t *gpu_stream);
 
-struct __parsec_gpu_context {
+struct __parsec_cuda_context {
     parsec_list_item_t          list_item;
     parsec_task_t *ec;
-    advance_task_function_t    submit;
+    advance_cuda_task_function_t    submit;
     int                        task_type;
     int                        pushout[MAX_PARAM_COUNT];
     const parsec_flow_t        *flow[MAX_PARAM_COUNT];
 };
 
-struct __parsec_gpu_exec_stream {
-    struct __parsec_gpu_context **tasks;
+struct __parsec_cuda_exec_stream {
+    struct __parsec_cuda_context **tasks;
     cudaEvent_t *events;
     cudaStream_t cuda_stream;
     int32_t max_events;  /* number of potential events, and tasks */
     int32_t executed;    /* number of executed tasks */
     int32_t start, end;  /* circular buffer management start and end positions */
     parsec_list_t *fifo_pending;
-    parsec_gpu_workspace_t *workspace;
+    parsec_cuda_workspace_t *workspace;
 #if defined(PARSEC_PROF_TRACE)
     parsec_thread_profiling_t *profiling;
 #endif  /* defined(PROFILING) */
@@ -105,7 +99,7 @@ struct _gpu_device {
                                 *   the device to access directly the memory of
                                 *   the index of the set bit device.
                                 */
-    parsec_gpu_exec_stream_t* exec_stream;
+    parsec_cuda_exec_stream_t* exec_stream;
     volatile uint32_t mutex;
     parsec_list_t gpu_mem_lru;
     parsec_list_t gpu_mem_owned_lru;
@@ -124,14 +118,14 @@ struct _gpu_device {
         }                                                               \
     } while(0)
 
-int parsec_gpu_init(parsec_context_t *parsec_context);
-int parsec_gpu_fini(void);
+int parsec_cuda_init(parsec_context_t *parsec_context);
+int parsec_cuda_fini(void);
 
 /**
  * Debugging functions.
  */
-void dump_exec_stream(parsec_gpu_exec_stream_t* exec_stream);
-void dump_GPU_state(gpu_device_t* gpu_device);
+void dump_cuda_stream(parsec_cuda_exec_stream_t* exec_stream);
+void dump_cuda_device(parsec_cuda_device_t* gpu_device);
 
 /****************************************************
  ** GPU-DATA Specific Starts Here **
@@ -140,21 +134,15 @@ void dump_GPU_state(gpu_device_t* gpu_device);
 /**
  * Overload the default data_copy_t with a GPU specialized type
  */
-typedef parsec_data_copy_t parsec_gpu_data_copy_t;
+typedef parsec_data_copy_t parsec_cuda_data_copy_t;
 
 #include "parsec/data_distribution.h"
 
 /* GPU workspace  ONLY works when PARSEC_ALLOC_GPU_PER_TILE is OFF */
-int parsec_gpu_push_workspace(gpu_device_t* gpu_device, parsec_gpu_exec_stream_t* gpu_stream);
-void* parsec_gpu_pop_workspace(gpu_device_t* gpu_device, parsec_gpu_exec_stream_t* gpu_stream, size_t size);
-int parsec_gpu_free_workspace(gpu_device_t * gpu_device);
-
-int parsec_gpu_get_best_device( parsec_task_t* this_task, double ratio );
+int parsec_cuda_push_workspace(parsec_cuda_device_t* gpu_device, parsec_cuda_exec_stream_t* gpu_stream);
+void* parsec_cuda_pop_workspace(parsec_cuda_device_t* gpu_device, parsec_cuda_exec_stream_t* gpu_stream, size_t size);
+int parsec_cuda_free_workspace(parsec_cuda_device_t * gpu_device);
 
-/* sort pending task list by number of spaces needed */
-int parsec_gpu_sort_pending_list(gpu_device_t *gpu_device);
-parsec_gpu_context_t* parsec_gpu_create_W2R_task(gpu_device_t *gpu_device, parsec_execution_stream_t *es);
-int parsec_gpu_W2R_task_fini(gpu_device_t *gpu_device, parsec_gpu_context_t *w2r_task, parsec_execution_stream_t *es);
 
 /**
  * Progress
@@ -168,8 +156,8 @@ int parsec_gpu_W2R_task_fini(gpu_device_t *gpu_device, parsec_gpu_context_t *w2r
  * where tasks ready to jump to the respective step are waiting.
  */
 parsec_hook_return_t
-parsec_gpu_kernel_scheduler( parsec_execution_stream_t *es,
-                             parsec_gpu_context_t    *gpu_task,
+parsec_cuda_kernel_scheduler(parsec_execution_stream_t *es,
+                             parsec_cuda_context_t    *gpu_task,
                              int which_gpu );
 
 /**
@@ -186,9 +174,9 @@ parsec_gpu_kernel_scheduler( parsec_execution_stream_t *es,
  *     -2: No more room on the GPU to move this data.
  */
 int
-parsec_gpu_kernel_push( gpu_device_t            *gpu_device,
-                       parsec_gpu_context_t     *gpu_task,
-                       parsec_gpu_exec_stream_t *gpu_stream);
+parsec_cuda_kernel_push(parsec_cuda_device_t            *gpu_device,
+                       parsec_cuda_context_t     *gpu_task,
+                       parsec_cuda_exec_stream_t *gpu_stream);
 
 /**
  *  This function schedule the move of all the modified data for a
@@ -198,20 +186,20 @@ parsec_gpu_kernel_push( gpu_device_t            *gpu_device,
  *           positive: the number of data to be moved.
  */
 int
-parsec_gpu_kernel_pop( gpu_device_t            *gpu_device,
-                      parsec_gpu_context_t     *gpu_task,
-                      parsec_gpu_exec_stream_t *gpu_stream);
+parsec_cuda_kernel_pop(parsec_cuda_device_t            *gpu_device,
+                      parsec_cuda_context_t     *gpu_task,
+                      parsec_cuda_exec_stream_t *gpu_stream);
 
 /**
  * Make sure all data on the device is correctly put back into the queues.
  */
 int
-parsec_gpu_kernel_epilog( gpu_device_t        *gpu_device,
-                         parsec_gpu_context_t *gpu_task );
+parsec_cuda_kernel_epilog(parsec_cuda_device_t        *gpu_device,
+                         parsec_cuda_context_t *gpu_task );
 
 int
-parsec_gpu_kernel_cleanout( gpu_device_t        *gpu_device,
-                            parsec_gpu_context_t *gpu_task );
+parsec_cuda_kernel_cleanout(parsec_cuda_device_t        *gpu_device,
+                            parsec_cuda_context_t *gpu_task );
 
 END_C_DECLS
 
diff --git a/parsec/devices/device.c b/parsec/devices/device.c
index c49c7ed44..d3442e186 100644
--- a/parsec/devices/device.c
+++ b/parsec/devices/device.c
@@ -6,16 +6,18 @@
  */
 
 #include "parsec/parsec_config.h"
+#include "parsec/parsec_internal.h"
 #include "parsec/devices/device.h"
 #include "parsec/utils/mca_param.h"
 #include "parsec/constants.h"
 #include "parsec/utils/debug.h"
 #include "parsec/execution_stream.h"
 #include "parsec/utils/argv.h"
-
+#include "parsec/data_internal.h"
 #include <stdlib.h>
 
 #include <parsec/devices/cuda/dev_cuda.h>
+#include <parsec/devices/openmp/dev_omp.h>
 
 uint32_t parsec_nb_devices = 0;
 static uint32_t parsec_nb_max_devices = 0;
@@ -201,9 +203,14 @@ int parsec_devices_fini(parsec_context_t* parsec_context)
     parsec_device_tweight = NULL;
 
 #if defined(PARSEC_HAVE_CUDA)
-    (void)parsec_gpu_fini();
+    (void)parsec_cuda_fini();
 #endif  /* defined(PARSEC_HAVE_CUDA) */
 
+#if defined(PARSEC_HAVE_OPENMP)
+
+    (void)parsec_omp_fini();
+#endif /* defined(PARSEC_HAVE_OPENMP) */
+
     if( NULL != parsec_device_recursive ) {  /* Release recursive device */
         parsec_devices_remove(parsec_device_recursive);
         free(parsec_device_recursive); parsec_device_recursive = NULL;
@@ -278,6 +285,48 @@ int parsec_devices_freezed(parsec_context_t* context)
     return parsec_devices_are_freezed;
 }
 
+int parsec_devices_best_load( parsec_task_t* this_task, double ratio )
+{
+    int i, dev_index = -1, data_index = 0;
+    parsec_taskpool_t* tp = this_task->taskpool;
+
+    /* Step one: Find the first data in WRITE mode stored on a GPU */
+    for( i = 0; i < this_task->task_class->nb_flows; i++ ) {
+        if( (NULL != this_task->task_class->out[i]) &&
+            (this_task->task_class->out[i]->flow_flags & FLOW_ACCESS_WRITE) ) {
+            data_index = this_task->task_class->out[i]->flow_index;
+            dev_index  = this_task->data[data_index].data_in->original->owner_device;
+            if (dev_index > 1) {
+                break;
+            }
+        }
+    }
+    assert(dev_index >= 0);
+
+    /* 0 is CPU, and 1 is recursive device */
+    if( dev_index <= 1 ) {  /* This is the first time we see this data for a GPU.
+                             * Let's decide which GPU will work on it. */
+        int best_index = 0;  /* default value: first CPU device */
+        float weight, best_weight = parsec_device_load[0] + ratio * parsec_device_sweight[0];
+
+        /* Start at 2, to skip the recursive body */
+        for( dev_index = 2; dev_index < parsec_devices_enabled(); dev_index++ ) {
+            /* Skip the device if it is not configured */
+            if(!(tp->devices_mask & (1 << dev_index))) continue;
+            weight = parsec_device_load[dev_index] + ratio * parsec_device_sweight[dev_index];
+            if( best_weight > weight ) {
+                best_index = dev_index;
+                best_weight = weight;
+            }
+        }
+        parsec_device_load[best_index] += ratio * parsec_device_sweight[best_index];
+        assert( best_index != 1 );
+        dev_index = best_index;
+    }
+
+    return dev_index;
+}
+
 #if defined(__APPLE__)
 #include <sys/sysctl.h>
 #endif
@@ -404,6 +453,7 @@ static int cpu_weights(parsec_device_t* device, int nstreams) {
 
 int parsec_devices_select(parsec_context_t* context)
 {
+    int rc = PARSEC_SUCCESS;
     int nb_total_comp_threads = 0;
 
     for(int p = 0; p < context->nb_vp; p++) {
@@ -430,11 +480,17 @@ int parsec_devices_select(parsec_context_t* context)
         parsec_device_recursive->device_dweight = parsec_device_cpus->device_dweight;
         parsec_devices_add(context, parsec_device_recursive);
     }
+
+#if defined(PARSEC_HAVE_OPENMP)
+    rc = parsec_omp_init(context);
+    if( PARSEC_SUCCESS != rc ) return rc;
+#endif /* define(PARSEC_HAVE_OPENMP) */
+
 #if defined(PARSEC_HAVE_CUDA)
-    return parsec_gpu_init(context);
-#else
-    return PARSEC_SUCCESS;
+    rc = parsec_cuda_init(context);
+    if( PARSEC_SUCCESS != rc ) return rc;
 #endif  /* defined(PARSEC_HAVE_CUDA) */
+    return PARSEC_SUCCESS;
 }
 
 int parsec_devices_add(parsec_context_t* context, parsec_device_t* device)
diff --git a/parsec/devices/device.h b/parsec/devices/device.h
index b4e8ad36a..f7a0f763d 100644
--- a/parsec/devices/device.h
+++ b/parsec/devices/device.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2017 The University of Tennessee and The University
+ * Copyright (c) 2013-2018 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
  */
@@ -17,10 +17,11 @@
 #define PARSEC_DEV_NONE       ((uint8_t)    0x00)
 #define PARSEC_DEV_CPU        ((uint8_t)(1 << 0))
 #define PARSEC_DEV_RECURSIVE  ((uint8_t)(1 << 1))
-#define PARSEC_DEV_CUDA       ((uint8_t)(1 << 2))
-#define PARSEC_DEV_INTEL_PHI  ((uint8_t)(1 << 3))
-#define PARSEC_DEV_OPENCL     ((uint8_t)(1 << 4))
-#define PARSEC_DEV_ALL        ((uint8_t)    0x1f)
+#define PARSEC_DEV_OPENMP     ((uint8_t)(1 << 2))
+#define PARSEC_DEV_CUDA       ((uint8_t)(1 << 3))
+#define PARSEC_DEV_INTEL_PHI  ((uint8_t)(1 << 4))
+#define PARSEC_DEV_OPENCL     ((uint8_t)(1 << 5))
+#define PARSEC_DEV_ALL        ((uint8_t)    0x3f)
 
 typedef struct parsec_device_s parsec_device_t;
 
@@ -99,8 +100,20 @@ extern int parsec_devices_freeze(parsec_context_t*);
  * to parsec_devices_freeze().
  */
 extern int parsec_devices_freezed(parsec_context_t*);
-
 /**
+ * Return the device index of the device that would better balance the load 
+ * average (compute and h2d load) according to the capabilities of the 
+ * devices and current positionning of data copies for this task
+ *
+ * Returns:
+ *  > 1    - if the kernel should be executed by the a GPU
+ *  0 or 1 - if the kernel should be executed by some other meaning (in this case the
+ *         execution context is not released).
+ * -1      - if the kernel is scheduled to be executed on a GPU.
+ */
+extern int parsec_devices_best_load(parsec_task_t*, double load);
+
+    /**
  * Declare a new device with the runtime. The device will later provide a list
  * of supported operations.
  */
diff --git a/parsec/devices/openmp/dev_omp.c b/parsec/devices/openmp/dev_omp.c
new file mode 100644
index 000000000..312219253
--- /dev/null
+++ b/parsec/devices/openmp/dev_omp.c
@@ -0,0 +1,1986 @@
+/*
+ * Copyright (c) 2010-2018 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ */
+
+#include "parsec/parsec_config.h"
+#include "parsec/parsec_internal.h"
+#include "parsec/sys/atomic.h"
+
+#include "parsec/utils/mca_param.h"
+#include "parsec/constants.h"
+
+#if defined(PARSEC_HAVE_OPENMP)
+#include "parsec.h"
+#include "parsec/data_internal.h"
+#include "parsec/devices/openmp/dev_omp.h"
+#include "parsec/devices/device.h"
+#include "parsec/profiling.h"
+#include "parsec/execution_stream.h"
+#include "parsec/arena.h"
+#include "parsec/utils/debug.h"
+#include "parsec/utils/argv.h"
+#include "parsec/scheduling.h"
+
+#include <omp.h>
+#include <errno.h>
+#include <dlfcn.h>
+#include <sys/stat.h>
+
+#define PARSEC_GPU_USE_PRIORITIES     1
+//#define PARSEC_OPENMP_OMP_ALLOC_PER_TILE 1
+
+#define PARSEC_MAX_STREAMS            6
+#define PARSEC_MAX_EVENTS_PER_STREAM  4
+
+#if defined(PARSEC_PROF_TRACE)
+/* Accepted values are: PARSEC_PROFILE_OMP_TRACK_DATA_IN | PARSEC_PROFILE_OMP_TRACK_DATA_OUT |
+ *                      PARSEC_PROFILE_OMP_TRACK_OWN | PARSEC_PROFILE_OMP_TRACK_EXEC
+ */
+int parsec_omp_trackable_events = PARSEC_PROFILE_OMP_TRACK_EXEC | PARSEC_PROFILE_OMP_TRACK_DATA_OUT
+    | PARSEC_PROFILE_OMP_TRACK_DATA_IN | PARSEC_PROFILE_OMP_TRACK_OWN;
+int parsec_omp_movein_key_start;
+int parsec_omp_movein_key_end;
+int parsec_omp_moveout_key_start;
+int parsec_omp_moveout_key_end;
+int parsec_omp_own_GPU_key_start;
+int parsec_omp_own_GPU_key_end;
+#endif  /* defined(PROFILING) */
+
+int parsec_openmp_output_stream = -1;
+static char* omp_lib_path = NULL;
+
+static int
+parsec_omp_memory_reserve( parsec_openmp_device_t* gpu_device,
+                           int           memory_percentage,
+                           int           number_of_elements,
+                           size_t        eltsize );
+static int
+parsec_omp_memory_release( parsec_openmp_device_t* gpu_device );
+
+
+/* look up how many FMA per cycle in single/double, per omp MP
+ * precision.
+ * The following table provides updated values for future archs
+ * http://docs.nvidia.com/omp/omp-c-programming-guide/index.html#arithmetic-instructions
+ */
+static int parsec_omp_device_lookup_ompmp_floprate(int major, int minor, int *drate, int *srate, int *trate, int *hrate) {
+    //TODO find a way to compute this if possible
+    *hrate = *trate = *srate = *drate = 1;
+    return PARSEC_SUCCESS;
+}
+
+
+static int parsec_omp_device_fini(parsec_device_t* device)
+{
+    parsec_openmp_device_t* gpu_device = (parsec_openmp_device_t*)device;
+    int j, k;
+
+    /* Release the registered memory */
+    parsec_omp_memory_release(gpu_device);
+
+    /* Release pending queue */
+    OBJ_DESTRUCT(&gpu_device->pending);
+
+    /* Release all streams */
+    for( j = 0; j < gpu_device->max_exec_streams; j++ ) {
+        parsec_openmp_exec_stream_t* exec_stream = &(gpu_device->exec_stream[j]);
+
+        exec_stream->max_events   = PARSEC_MAX_EVENTS_PER_STREAM;
+        exec_stream->executed     = 0;
+        exec_stream->start        = 0;
+        exec_stream->end          = 0;
+
+        for( k = 0; k < exec_stream->max_events; k++ ) {
+            assert( NULL == exec_stream->tasks[k] );
+            assert( 0 == exec_stream->events[k] );
+        }
+        free(exec_stream->events); exec_stream->events = NULL;
+        free(exec_stream->tasks); exec_stream->tasks = NULL;
+        free(exec_stream->fifo_pending); exec_stream->fifo_pending = NULL;
+        /* Release the stream */
+        //TODO: omp-team-destroy? on exec_stream->omp_stream
+    }
+    free(gpu_device->exec_stream); gpu_device->exec_stream = NULL;
+
+    gpu_device->omp_index = -1;
+
+    /* Cleanup the GPU memory. */
+    OBJ_DESTRUCT(&gpu_device->gpu_mem_lru);
+    OBJ_DESTRUCT(&gpu_device->gpu_mem_owned_lru);
+
+    return PARSEC_SUCCESS;
+}
+
+static int parsec_omp_memory_register(parsec_device_t* device, parsec_data_collection_t* desc, void* ptr, size_t length)
+{
+    int rc = PARSEC_ERROR;
+
+    if (desc->memory_registration_status == MEMORY_STATUS_REGISTERED) {
+        rc = PARSEC_SUCCESS;
+        return rc;
+    }
+
+    /*
+     * We rely on the thread-safety of the OMP interface to register the memory
+     * as another thread might be submiting tasks at the same time
+     * (omp_scheduling.h), and we do not set a device since we register it for
+     * all devices.
+     */
+#if 0
+    status = ompHostRegister(ptr, length, ompHostRegisterPortable );
+    PARSEC_OMP_CHECK_ERROR( "(parsec_omp_memory_register) ompHostRegister ", status,
+                            { goto restore_and_return; } );
+#endif
+    rc = PARSEC_SUCCESS;
+    desc->memory_registration_status = MEMORY_STATUS_REGISTERED;
+
+  restore_and_return:
+    (void)device;
+    return rc;
+}
+
+static int parsec_omp_memory_unregister(parsec_device_t* device, parsec_data_collection_t* desc, void* ptr)
+{
+    int rc = PARSEC_ERROR;
+
+    if (desc->memory_registration_status == MEMORY_STATUS_UNREGISTERED) {
+        rc = PARSEC_SUCCESS;
+        return rc;
+    }
+
+    /*
+     * We rely on the thread-safety of the OMP interface to unregister the memory
+     * as another thread might be submiting tasks at the same time (omp_scheduling.h)
+     */
+#if 0
+    status = ompHostUnregister(ptr);
+    PARSEC_OMP_CHECK_ERROR( "(parsec_omp_memory_ununregister) ompHostUnregister ", status,
+                            {continue;} );
+#endif
+    rc = PARSEC_SUCCESS;
+    desc->memory_registration_status = MEMORY_STATUS_UNREGISTERED;
+
+    (void)device;
+    return rc;
+}
+
+
+void* omp_find_incarnation(parsec_openmp_device_t* gpu_device,
+                                     const char* fname)
+{
+    char library_name[FILENAME_MAX], function_name[FILENAME_MAX], *env;
+    int index = -1, capability = gpu_device->major * 10 + gpu_device->minor;
+    void *fn = NULL, *dlh = NULL;
+    char** argv = NULL, **target;
+
+    /**
+     * Prepare the list of PATH or FILE to be searched for a OMP shared library.
+     * In any case this list might be a list of ; separated possible targets,
+     * where each target can be either a directory or a specific file.
+     */
+    env = getenv("PARSEC_OMPCORES_LIB");
+    if( NULL != env ) {
+        argv = parsec_argv_split(env, ';');
+    } else if( NULL != omp_lib_path ) {
+        argv = parsec_argv_split(omp_lib_path, ';');
+    }
+
+  retry_lesser_version:
+    if( -1 == index ) {
+        capability = 0;
+        snprintf(function_name, FILENAME_MAX, "%s", fname);
+    }
+    else {
+        snprintf(function_name, FILENAME_MAX, "%s_%2d", fname, capability);
+    }
+
+    for( target = argv; (NULL != target) && (NULL != *target); target++ ) {
+        struct stat status;
+        if( 0 != stat(*target, &status) ) {
+            parsec_debug_verbose(10, parsec_openmp_output_stream,
+                                "Could not stat the %s path (%s)", *target, strerror(errno));
+            continue;
+        }
+        if( S_ISDIR(status.st_mode) ) {
+            if( capability )
+                snprintf(library_name,  FILENAME_MAX, "%s/libdplasma_cores_omp%d.so", *target, capability);
+            else
+                snprintf(library_name,  FILENAME_MAX, "%s/libdplasma_cores_omp.so", *target);
+        } else {
+            snprintf(library_name,  FILENAME_MAX, "%s", *target);
+        }
+
+        dlh = dlopen(library_name, RTLD_NOW | RTLD_NODELETE );
+        if(NULL == dlh) {
+            parsec_debug_verbose(10, parsec_openmp_output_stream,
+                                "Could not find %s dynamic library (%s)", library_name, dlerror());
+            continue;
+        }
+        fn = dlsym(dlh, function_name);
+        dlclose(dlh);
+        if( NULL != fn ) {
+            parsec_debug_verbose(4, parsec_openmp_output_stream,
+                                "Function %s found in shared library %s",
+                                function_name, library_name);
+            break;  /* we got one, stop here */
+        }
+    }
+    /* Couldn't load from named dynamic libs, try linked/static */
+    if(NULL == fn) {
+        parsec_output_verbose(10, parsec_openmp_output_stream,
+                             "No dynamic function %s found, trying from compile time linked in\n",
+                             function_name);
+        dlh = dlopen(NULL, RTLD_NOW | RTLD_NODELETE);
+        if(NULL != dlh) {
+            fn = dlsym(dlh, function_name);
+            if(NULL != fn) {
+                parsec_debug_verbose(4, parsec_openmp_output_stream,
+                                    "Function %s found in the application symbols",
+                                    function_name);
+            }
+            dlclose(dlh);
+        }
+    }
+
+    /* Still not found?? skip this GPU */
+    if(NULL == fn) {
+        parsec_debug_verbose(10, parsec_openmp_output_stream,
+                            "No function %s found for OMP device %s",
+                            function_name, gpu_device->super.name);
+        index--;
+        if(-1 <= index)
+            goto retry_lesser_version;
+    }
+
+    if( NULL != argv )
+        parsec_argv_free(argv);
+
+    return fn;
+}
+
+static int
+parsec_omp_taskpool_register(parsec_device_t* device, parsec_taskpool_t* tp)
+{
+    parsec_openmp_device_t* gpu_device = (parsec_openmp_device_t*)device;
+    uint32_t i, j, dev_mask = 0x0;
+    int32_t rc = PARSEC_ERR_NOT_FOUND;
+
+    /**
+     * Let's suppose it is not our job to detect if a particular body can
+     * run or not. We will need to add some properties that will allow the
+     * user to write the code to assess this.
+     */
+    assert(PARSEC_DEV_OPENMP == device->type);
+    for( i = 0; NULL != tp->task_classes_array[i]; i++ ) {
+        const parsec_task_class_t* tc = tp->task_classes_array[i];
+        __parsec_chore_t* chores = (__parsec_chore_t*)tc->incarnations;
+        for( dev_mask = j = 0; NULL != chores[j].hook; j++ ) {
+            if( chores[j].type == device->type ) {
+                if ( NULL == chores[j].dyld ) {
+                    /* No dynamic load for this kernel */
+                    chores[gpu_device->omp_index].dyld_fn = NULL;
+                    rc = PARSEC_SUCCESS;
+                    dev_mask |= chores[j].type;
+                }
+                else {
+                    void* devf = omp_find_incarnation(gpu_device, chores[j].dyld);
+                    if( NULL != devf ) {
+                        chores[gpu_device->omp_index].dyld_fn = devf;
+                        rc = PARSEC_SUCCESS;
+                        dev_mask |= chores[j].type;
+                    }
+                }
+            }
+        }
+    }
+    /* Not a single chore supports this device, there is no reason to check anything further */
+    if(PARSEC_SUCCESS != rc) {
+        tp->devices_mask &= ~(device->device_index);
+    }
+
+    return rc;
+}
+
+static int
+parsec_omp_taskpool_unregister(parsec_device_t* device, parsec_taskpool_t* tp)
+{
+    (void)device; (void)tp;
+    return PARSEC_SUCCESS;
+}
+
+int parsec_omp_init(parsec_context_t *parsec_context)
+{
+    int omp_memory_block_size, omp_memory_percentage, omp_memory_number_of_blocks = -1;
+    int show_caps_index, show_caps = 0;
+    int use_omp_index, use_omp;
+    int omp_mask, omp_verbosity;
+    int ndevices, i, j, k;
+
+    use_omp_index = parsec_mca_param_reg_int_name("device_omp", "enabled",
+                                                   "The number of OMP device to enable for the next PaRSEC context",
+                                                   false, false, -1, &use_omp);
+    parsec_mca_param_reg_int_name("device_omp", "mask",
+                                  "The bitwise mask of OMP devices to be enabled (default all)",
+                                  false, false, 0xffffffff, &omp_mask);
+    (void)parsec_mca_param_reg_int_name("device_omp", "verbose",
+                                        "Set the verbosity level of the OMP device (negative value: use debug verbosity), higher is less verbose)\n",
+                                        false, false, -1, &omp_verbosity);
+    (void)parsec_mca_param_reg_string_name("device_omp", "path",
+                                           "Path to the shared library files containing the OMP version of the hooks. It is a ;-separated list of either directories or .so files.\n",
+                                           false, false, PARSEC_LIB_OPENMP_PREFIX, &omp_lib_path);
+    (void)parsec_mca_param_reg_int_name("device_omp", "memory_block_size",
+                                        "The OMP memory page for PaRSEC internal management.",
+                                        false, false, 32*1024, &omp_memory_block_size);
+    (void)parsec_mca_param_reg_int_name("device_omp", "memory_use",
+                                        "The percentage of the total GPU memory to be used by this PaRSEC context",
+                                        false, false, 95, &omp_memory_percentage);
+    (void)parsec_mca_param_reg_int_name("device_omp", "memory_number_of_blocks",
+                                        "Alternative to device_omp_memory_use: sets exactly the number of blocks to allocate (-1 means to use a percentage of the available memory)",
+                                        false, false, -1, &omp_memory_number_of_blocks);
+    if( 0 == use_omp ) {
+        return -1;  /* Nothing to do around here */
+    }
+
+    if( omp_verbosity >= 0 ) {
+        parsec_openmp_output_stream = parsec_output_open(NULL);
+        parsec_output_set_verbosity(parsec_openmp_output_stream, omp_verbosity);
+    } else {
+        parsec_openmp_output_stream = parsec_debug_output;
+    }
+
+    assert(omp_is_initial_device());
+    ndevices = omp_get_num_devices();
+
+    if( -1 == use_omp ) {  /* use all OMP devices */
+        parsec_mca_param_set_int(use_omp_index, ndevices);
+        PARSEC_DEBUG_VERBOSE(20, parsec_debug_output,
+                             "device_omp_enabled updated to the total number of devices (%d)\n", ndevices);
+    } else {
+        if( ndevices > use_omp ) {
+            if( 0 < use_omp_index ) {
+                ndevices = use_omp;
+            }
+        } else if (ndevices < use_omp ) {
+            if( 0 < use_omp_index ) {
+                parsec_warning("User requested %d GPUs, but only %d are available in this machine. PaRSEC will enable all of them.",
+                               use_omp, ndevices);
+                parsec_mca_param_set_int(use_omp_index, ndevices);
+            }
+        }
+    }
+
+    /* Update the number of GPU for the upper layer */
+    use_omp = ndevices;
+    if( 0 == ndevices ) {
+        return -1;
+    }
+
+    show_caps_index = parsec_mca_param_find("device", NULL, "show_capabilities");
+    if(0 < show_caps_index) {
+        parsec_mca_param_lookup_int(show_caps_index, &show_caps);
+    }
+#if defined(PARSEC_PROF_TRACE)
+    parsec_profiling_add_dictionary_keyword( "movein", "fill:#33FF33",
+                                             sizeof(intptr_t), "pointer{int64_t}",
+                                             &parsec_omp_movein_key_start, &parsec_omp_movein_key_end);
+    parsec_profiling_add_dictionary_keyword( "moveout", "fill:#ffff66",
+                                             sizeof(intptr_t), "pointer{int64_t}",
+                                             &parsec_omp_moveout_key_start, &parsec_omp_moveout_key_end);
+    parsec_profiling_add_dictionary_keyword( "omp", "fill:#66ff66",
+                                             0, NULL,
+                                             &parsec_omp_own_GPU_key_start, &parsec_omp_own_GPU_key_end);
+#endif  /* defined(PROFILING) */
+
+    for( i = 0; i < ndevices; i++ ) {
+        parsec_openmp_device_t* gpu_device;
+        char *szName;
+        int major, minor, host_emu, streaming_multiprocessor, drate, srate, trate, hrate;
+        float clockRate;
+
+        /* Allow fine grain selection of the GPU's */
+        if( !((1 << i) & omp_mask) ) continue;
+
+#pragma omp target device(i) map(from: streaming_multiprocessor, host_emu)
+{
+        streaming_multiprocessor = omp_get_num_procs();
+        host_emu = omp_is_initial_device();
+}
+        major     = 4; /* omp target version */
+        minor     = 5;
+        //TODO: fill this with real values
+        clockRate = 2e3; /* in MHz */
+        asprintf(&szName, "OMP device %d (%s)", i, host_emu?"host emulation": "offload");
+        
+        gpu_device = (parsec_openmp_device_t*)calloc(1, sizeof(parsec_openmp_device_t));
+        OBJ_CONSTRUCT(gpu_device, parsec_list_item_t);
+        gpu_device->omp_index = (uint8_t)i;
+        gpu_device->major      = (uint8_t)major;
+        gpu_device->minor      = (uint8_t)minor;
+        gpu_device->super.name = strdup(szName);
+
+        gpu_device->max_exec_streams = PARSEC_MAX_STREAMS;
+        gpu_device->exec_stream =
+            (parsec_openmp_exec_stream_t*)malloc(gpu_device->max_exec_streams
+                                              * sizeof(parsec_openmp_exec_stream_t));
+        for( j = 0; j < gpu_device->max_exec_streams; j++ ) {
+            parsec_openmp_exec_stream_t* exec_stream = &(gpu_device->exec_stream[j]);
+
+#if 0
+            //TODO: create an OMP team to async the memcpy
+            /* Allocate the stream */
+            ompstatus = ompStreamCreate( &(exec_stream->omp_stream) );
+            PARSEC_OMP_CHECK_ERROR( "ompStreamCreate ", ompstatus,
+                                     {break;} );
+#else
+            exec_stream->omp_stream = j;
+#endif
+            exec_stream->workspace    = NULL;
+            exec_stream->max_events   = PARSEC_MAX_EVENTS_PER_STREAM;
+            exec_stream->executed     = 0;
+            exec_stream->start        = 0;
+            exec_stream->end          = 0;
+            exec_stream->fifo_pending = (parsec_list_t*)OBJ_NEW(parsec_list_t);
+            OBJ_CONSTRUCT(exec_stream->fifo_pending, parsec_list_t);
+            exec_stream->tasks  = (parsec_openmp_context_t**)malloc(exec_stream->max_events
+                                                                 * sizeof(parsec_openmp_context_t*));
+            exec_stream->events = (int*)malloc(exec_stream->max_events * sizeof(int));
+            /* and the corresponding events */
+            for( k = 0; k < exec_stream->max_events; k++ ) {
+                exec_stream->events[k] = 0;
+                exec_stream->tasks[k]  = NULL;
+            }
+#if defined(PARSEC_PROF_TRACE)
+            exec_stream->profiling = parsec_profiling_thread_init( 2*1024*1024, PARSEC_PROFILE_STREAM_STR, i, j );
+            if(j == 0) {
+                exec_stream->prof_event_track_enable = parsec_omp_trackable_events & PARSEC_PROFILE_OMP_TRACK_DATA_IN;
+                exec_stream->prof_event_key_start    = parsec_omp_movein_key_start;
+                exec_stream->prof_event_key_end      = parsec_omp_movein_key_end;
+            } else if(j == 1) {
+                exec_stream->prof_event_track_enable = parsec_omp_trackable_events & PARSEC_PROFILE_OMP_TRACK_DATA_OUT;
+                exec_stream->prof_event_key_start    = parsec_omp_moveout_key_start;
+                exec_stream->prof_event_key_end      = parsec_omp_moveout_key_end;
+            } else {
+                exec_stream->prof_event_track_enable = parsec_omp_trackable_events & PARSEC_PROFILE_OMP_TRACK_EXEC;
+                exec_stream->prof_event_key_start    = -1;
+                exec_stream->prof_event_key_end      = -1;
+            }
+#endif  /* defined(PARSEC_PROF_TRACE) */
+        }
+
+        gpu_device->super.type                 = PARSEC_DEV_OPENMP;
+        gpu_device->super.executed_tasks       = 0;
+        gpu_device->super.transferred_data_in  = 0;
+        gpu_device->super.transferred_data_out = 0;
+        gpu_device->super.required_data_in     = 0;
+        gpu_device->super.required_data_out    = 0;
+
+        gpu_device->super.device_fini                = parsec_omp_device_fini;
+        gpu_device->super.device_memory_register     = parsec_omp_memory_register;
+        gpu_device->super.device_memory_unregister   = parsec_omp_memory_unregister;
+        gpu_device->super.device_taskpool_register   = parsec_omp_taskpool_register;
+        gpu_device->super.device_taskpool_unregister = parsec_omp_taskpool_unregister;
+
+        if (parsec_omp_device_lookup_ompmp_floprate(major, minor, &drate, &srate, &trate, &hrate) == PARSEC_ERROR ) {
+            return -1;
+        }
+        gpu_device->super.device_hweight = (float)streaming_multiprocessor * (float)hrate * (float)clockRate * 4e-3f;
+        gpu_device->super.device_tweight = (float)streaming_multiprocessor * (float)trate * (float)clockRate * 2e-3f;
+        gpu_device->super.device_sweight = (float)streaming_multiprocessor * (float)srate * (float)clockRate * 4e-3f;
+        gpu_device->super.device_dweight = (float)streaming_multiprocessor * (float)drate * (float)clockRate * 2e-3f;
+
+        if( show_caps ) {
+            parsec_inform("OMP Device %d (capability %d.%d): %s\n"
+                          "\tSM                 : %d\n"
+                          "\tclockRate (GHz)    : %2.2f\n"
+                          "\tpeak Gflops         : double %2.4f, single %2.4f tensor %2.4f half %2.4f",
+                          i, major, minor, szName,
+                          clockRate*1e-3,
+                          streaming_multiprocessor,
+                          gpu_device->super.device_dweight, gpu_device->super.device_sweight, gpu_device->super.device_tweight, gpu_device->super.device_hweight);
+        }
+
+        if( PARSEC_SUCCESS != parsec_omp_memory_reserve(gpu_device,
+                                                         omp_memory_percentage,
+                                                         omp_memory_number_of_blocks,
+                                                         omp_memory_block_size) ) {
+            free(gpu_device);
+            continue;
+        }
+
+        /* Initialize internal lists */
+        OBJ_CONSTRUCT(&gpu_device->gpu_mem_lru,       parsec_list_t);
+        OBJ_CONSTRUCT(&gpu_device->gpu_mem_owned_lru, parsec_list_t);
+        OBJ_CONSTRUCT(&gpu_device->pending,           parsec_list_t);
+
+        gpu_device->sort_starting_p = NULL;
+        parsec_devices_add(parsec_context, &(gpu_device->super));
+    }
+
+#if 1 || defined(PARSEC_HAVE_PEER_DEVICE_MEMORY_ACCESS)
+    /* In OpenMP all devices have peer access, it may be emulated if
+     * needed */
+    for( i = 0; i < ndevices; i++ ) {
+        parsec_openmp_device_t *source_gpu, *target_gpu;
+
+        if( NULL == (source_gpu = (parsec_openmp_device_t*)parsec_devices_get(i)) ) continue;
+        /* Skip all non OMP devices */
+        if( PARSEC_DEV_OPENMP != source_gpu->super.type ) continue;
+
+        source_gpu->peer_access_mask = 0;
+
+        for( j = 0; j < ndevices; j++ ) {
+            if( (NULL == (target_gpu = (parsec_openmp_device_t*)parsec_devices_get(j))) || (i == j) ) continue;
+            /* Skip all non OMP devices */
+            if( PARSEC_DEV_OPENMP != target_gpu->super.type ) continue;
+
+            /* Communication mask */
+            /* In OpenMP all devices have peer access, it may be emulated if
+             * needed */
+            source_gpu->peer_access_mask = (int16_t)(source_gpu->peer_access_mask | (int16_t)(1 << target_gpu->omp_index));
+        }
+    }
+#endif
+
+    return 0;
+}
+
+int parsec_omp_fini(void)
+{
+    parsec_openmp_device_t* gpu_device;
+    int i;
+
+    for(i = 0; i < parsec_devices_enabled(); i++) {
+        if( NULL == (gpu_device = (parsec_openmp_device_t*)parsec_devices_get(i)) ) continue;
+        if(PARSEC_DEV_OPENMP != gpu_device->super.type) continue;
+        parsec_omp_device_fini((parsec_device_t*)gpu_device);
+        parsec_devices_remove((parsec_device_t*)gpu_device);
+        free(gpu_device);
+    }
+
+    if( parsec_debug_output != parsec_openmp_output_stream ) parsec_output_close(parsec_openmp_output_stream);
+    parsec_openmp_output_stream = -1;
+
+    if ( omp_lib_path ) {
+        free(omp_lib_path);
+    }
+
+    return PARSEC_SUCCESS;
+}
+
+/**
+ * This function reserve the memory_percentage of the total device memory for PaRSEC.
+ * This memory will be managed in chuncks of size eltsize. However, multiple chuncks
+ * can be reserved in a single allocation.
+ */
+static int
+parsec_omp_memory_reserve( parsec_openmp_device_t* gpu_device,
+                           int           memory_percentage,
+                           int           number_blocks,
+                           size_t        eltsize )
+{
+    (void)eltsize;
+
+    size_t how_much_we_allocate;
+    size_t total_mem, initial_free_mem;
+    uint32_t mem_elem_per_gpu = 0;
+
+    /* Determine how much memory we can allocate */
+    //ompMemGetInfo( &initial_free_mem, &total_mem );
+    //TODO: figure out a way to not have to hardcode this and autodetect
+    total_mem = initial_free_mem = 1024*1024*1024*(size_t)2;
+    if( number_blocks != -1 ) {
+        if( number_blocks == 0 ) {
+            parsec_warning("OMP: Invalid argument: requesting 0 bytes of memory on OMP device %s", gpu_device->super.name);
+            return PARSEC_ERROR;
+        } else {
+            how_much_we_allocate = number_blocks * eltsize;
+        }
+    } else {
+        /** number_blocks == -1 means memory_percentage is used */
+        how_much_we_allocate = (memory_percentage * initial_free_mem) / 100;
+    }
+    if( how_much_we_allocate > initial_free_mem ) {
+        /** Handle the case of jokers who require more than 100% of memory,
+         *  and eleventh case of computer scientists who don't know how
+         *  to divide a number by another
+         */
+        parsec_warning("OMP: Requested %d bytes on OMP device %s, but only %d bytes are available -- reducing allocation to max available",
+                      how_much_we_allocate, initial_free_mem);
+        how_much_we_allocate = initial_free_mem;
+    }
+    if( how_much_we_allocate < eltsize ) {
+        /** Handle another kind of jokers entirely, and cases of
+         *  not enough memory on the device
+         */
+        parsec_warning("OMP: Cannot allocate at least one element on OMP device %s", gpu_device->super.name);
+        return PARSEC_ERROR;
+    }
+
+#if defined(PARSEC_OPENMP_OMP_ALLOC_PER_TILE)
+    size_t free_mem = initial_free_mem;
+    /*
+     * We allocate a bunch of tiles that will be used
+     * during the computations
+     */
+    while( (free_mem > eltsize )
+           && ((total_mem - free_mem) < how_much_we_allocate) ) {
+        parsec_openmp_data_copy_t* gpu_elem;
+        void *device_ptr;
+
+        device_ptr = omp_target_alloc(eltsize, gpu_device->omp_index);
+        if( NULL == device_ptr ) {
+            size_t _free_mem = 0, _total_mem = 0;
+            //ompMemGetInfo( &_free_mem, &_total_mem );
+            parsec_inform("Per context: free mem %zu total mem %zu (allocated tiles %u)",
+                          _free_mem, _total_mem, mem_elem_per_gpu);
+        }
+        gpu_elem = OBJ_NEW(parsec_data_copy_t);
+        PARSEC_DEBUG_VERBOSE(20, parsec_openmp_output_stream,
+                            "Allocate OMP copy %p [ref_count %d] for data [%p]",
+                            gpu_elem, gpu_elem->super.super.obj_reference_count, NULL);
+        gpu_elem->device_private = (void*)(long)device_ptr;
+        gpu_elem->device_index = gpu_device->super.device_index;
+        mem_elem_per_gpu++;
+        OBJ_RETAIN(gpu_elem);
+        PARSEC_DEBUG_VERBOSE(20, parsec_openmp_output_stream,
+                            "Retain and insert OMP copy %p [ref_count %d] in LRU",
+                            gpu_elem, gpu_elem->super.super.obj_reference_count);
+        parsec_list_nolock_fifo_push( &gpu_device->gpu_mem_lru, (parsec_list_item_t*)gpu_elem );
+        //ompMemGetInfo( &free_mem, &total_mem );
+        free_mem -= eltsize;
+    }
+    if( 0 == mem_elem_per_gpu && parsec_list_nolock_is_empty( &gpu_device->gpu_mem_lru ) ) {
+        parsec_warning("GPU:\tRank %d Cannot allocate memory on GPU %d. Skip it!",
+                      gpu_device->super.context->my_rank, gpu_device->omp_index);
+    }
+    else {
+        PARSEC_DEBUG_VERBOSE(20, parsec_debug_output,  "GPU:\tAllocate %u tiles on the GPU memory", mem_elem_per_gpu );
+    }
+    PARSEC_DEBUG_VERBOSE(20, parsec_openmp_output_stream,
+                        "GPU:\tAllocate %u tiles on the GPU memory", mem_elem_per_gpu);
+#else
+    if( NULL == gpu_device->memory ) {
+        void* base_ptr;
+        /* We allocate all the memory on the GPU and we use our memory management */
+        mem_elem_per_gpu = (how_much_we_allocate + eltsize - 1 ) / eltsize;
+        size_t total_size = (size_t)mem_elem_per_gpu * eltsize;
+        base_ptr = omp_target_alloc(total_size, gpu_device->omp_index);
+        if( NULL == base_ptr ) {
+            parsec_warning("Allocating memory on the GPU device failed");
+        }
+        gpu_device->memory = zone_malloc_init( base_ptr, mem_elem_per_gpu, eltsize );
+
+        if( gpu_device->memory == NULL ) {
+            parsec_warning("GPU:\tRank %d Cannot allocate memory on GPU %d. Skip it!",
+                          gpu_device->super.context->my_rank, gpu_device->omp_index);
+            return PARSEC_ERROR;
+        }
+        PARSEC_DEBUG_VERBOSE(20, parsec_openmp_output_stream,
+                            "GPU:\tAllocate %u segment of size %d on the GPU memory",
+                            mem_elem_per_gpu, eltsize );
+    }
+#endif
+
+    return PARSEC_SUCCESS;
+}
+
+static void parsec_omp_memory_release_list(parsec_openmp_device_t* gpu_device,
+                                           parsec_list_t* list)
+{
+    parsec_list_item_t* item;
+
+    while(NULL != (item = parsec_list_nolock_fifo_pop(list)) ) {
+        parsec_openmp_data_copy_t* gpu_copy = (parsec_openmp_data_copy_t*)item;
+        parsec_data_t* original = gpu_copy->original;
+
+        PARSEC_DEBUG_VERBOSE(20, parsec_openmp_output_stream,
+                            "Release OMP copy %p (device_ptr %p) [ref_count %d: must be 1], attached to %p, in map %p",
+                            gpu_copy, gpu_copy->device_private, gpu_copy->super.super.obj_reference_count,
+                            original, (NULL != original ? original->dc : NULL));
+        assert( gpu_copy->device_index == gpu_device->super.device_index );
+        if( DATA_COHERENCY_OWNED == gpu_copy->coherency_state ) {
+            parsec_warning("GPU[%d] still OWNS the master memory copy for data %d and it is discarding it!",
+                          gpu_device->omp_index, original->key);
+        }
+#if defined(PARSEC_OPENMP_OMP_ALLOC_PER_TILE)
+        omp_target_free( gpu_copy->device_private, gpu_device->omp_index );
+#else
+        zone_free( gpu_device->memory, (void*)gpu_copy->device_private );
+#endif
+        gpu_copy->device_private = NULL;
+
+        /* At this point the data copies should have no attachement to a data_t. Thus,
+         * before we get here (aka below parsec_fini), the destructor of the data
+         * collection must have been called, releasing all the copies.
+         */
+        OBJ_RELEASE(gpu_copy); assert(NULL == gpu_copy);
+    }
+}
+
+/**
+ * This function release the OMP memory reserved for this device.
+ *
+ * One has to notice that all the data available on the GPU is stored in one of
+ * the two used to keep track of the allocated data, either the gpu_mem_lru or
+ * the gpu_mem_owner_lru. Thus, going over all the elements in these two lists
+ * should be enough to enforce a clean release.
+ */
+static int
+parsec_omp_memory_release( parsec_openmp_device_t* gpu_device )
+{
+
+#if 1
+    dump_omp_device(gpu_device); // debug only
+#endif
+    /* Free all memory on GPU */
+    parsec_omp_memory_release_list(gpu_device, &gpu_device->gpu_mem_lru);
+    parsec_omp_memory_release_list(gpu_device, &gpu_device->gpu_mem_owned_lru);
+
+#if !defined(PARSEC_OPENMP_OMP_ALLOC_PER_TILE)
+    assert( NULL != gpu_device->memory );
+    void* ptr = zone_malloc_fini(&gpu_device->memory);
+    omp_target_free(ptr, gpu_device->omp_index);
+#endif
+
+    return PARSEC_SUCCESS;
+}
+
+/**
+ * Try to find memory space to move all data on the GPU. We attach a device_elem to
+ * a memory_elem as soon as a device_elem is available. If we fail to find enough
+ * available elements, we push all the elements handled during this allocation
+ * back into the pool of available device_elem, to be picked up by another call
+ * (this call will remove them from the current task).
+ * Returns:
+ *    0: All gpu_mem/mem_elem have been initialized
+ *   -2: The task needs to rescheduled
+ */
+static inline int
+parsec_openmp_data_reserve_device_space( parsec_openmp_device_t* gpu_device,
+                                     parsec_openmp_context_t *gpu_task )
+{
+    parsec_task_t *this_task = gpu_task->ec;
+    parsec_openmp_data_copy_t* temp_loc[MAX_PARAM_COUNT], *gpu_elem, *lru_gpu_elem;
+    parsec_data_t* master, *oldmaster;
+    const parsec_flow_t *flow;
+    int i, j;
+
+    /**
+     * Parse all the input and output flows of data and ensure all have
+     * corresponding data on the GPU available.
+     */
+    for( i = 0; i < this_task->task_class->nb_flows; i++ ) {
+        flow = gpu_task->flow[i];
+        assert( flow && (flow->flow_index == i) );
+
+        /* Skip CTL flows only */
+        if(!(flow->flow_flags)) continue;
+
+        temp_loc[i] = NULL;
+        master   = this_task->data[i].data_in->original;
+        gpu_elem = PARSEC_DATA_GET_COPY(master, gpu_device->super.device_index);
+        this_task->data[i].data_out = gpu_elem;
+
+        /* There is already a copy on the device */
+        if( NULL != gpu_elem ) continue;
+
+#if !defined(PARSEC_OPENMP_OMP_ALLOC_PER_TILE)
+        gpu_elem = OBJ_NEW(parsec_data_copy_t);
+        PARSEC_DEBUG_VERBOSE(20, parsec_openmp_output_stream,
+                            "Allocate OMP copy %p [ref_count %d] for data %p",
+                            gpu_elem, gpu_elem->super.super.obj_reference_count, master);
+      malloc_data:
+        gpu_elem->device_private = zone_malloc(gpu_device->memory, master->nb_elts);
+        if( NULL == gpu_elem->device_private ) {
+#endif
+
+          find_another_data:
+            /* Look for a data_copy to free */
+            lru_gpu_elem = (parsec_openmp_data_copy_t*)parsec_list_nolock_fifo_pop(&gpu_device->gpu_mem_lru);
+            if( NULL == lru_gpu_elem ) {
+                /* We can't find enough room on the GPU. Insert the tiles in the begining of
+                 * the LRU (in order to be reused asap) and return without scheduling the task.
+                 */
+                parsec_warning("GPU:\tRequest space on GPU failed for %d out of %d data",
+                              this_task->task_class->nb_flows - i,
+                              this_task->task_class->nb_flows);
+                for( j = 0; j < i; j++ ) {
+                    if( NULL != temp_loc[j] ) {
+                        parsec_list_nolock_lifo_push(&gpu_device->gpu_mem_lru, (parsec_list_item_t*)temp_loc[j]);
+                    }
+                }
+#if !defined(PARSEC_OPENMP_OMP_ALLOC_PER_TILE)
+                OBJ_RELEASE(gpu_elem);
+#endif
+                return -2;
+            }
+
+            PARSEC_LIST_ITEM_SINGLETON(lru_gpu_elem);
+            PARSEC_DEBUG_VERBOSE(20, parsec_openmp_output_stream,
+                                "Release LRU-retrieved OMP copy %p [ref_count %d]",
+                                lru_gpu_elem, lru_gpu_elem->super.super.obj_reference_count);
+            assert( NULL != lru_gpu_elem );
+
+            /* If there are pending readers, let the gpu_elem loose. This is a weak coordination
+             * protocol between here and the parsec_openmp_data_stage_in, where the readers don't necessarily
+             * always remove the data from the LRU.
+             */
+            if( 0 != lru_gpu_elem->readers ) {
+                goto find_another_data; // TODO: add an assert of some sort to check for leaks here?
+            }
+
+            /* Make sure the new GPU element is clean and ready to be used */
+            assert( master != lru_gpu_elem->original );
+            assert( NULL   != lru_gpu_elem->original );
+
+            /* Let's check we're not trying to steal one of our own data */
+            oldmaster = lru_gpu_elem->original;
+            for( j = 0; j < this_task->task_class->nb_flows; j++ ) {
+                if( NULL == this_task->data[j].data_in ) continue;
+                if( this_task->data[j].data_in->original == oldmaster ) {
+                    goto find_another_data;
+                }
+            }
+
+            /* The data is not used, and it's not one of ours: we can free it or reuse it */
+            PARSEC_DEBUG_VERBOSE(20, parsec_openmp_output_stream,
+                                "GPU[%d]:\tRepurpose copy %p to mirror block %p (in task %s:i) instead of %p",
+                                gpu_device->omp_index, lru_gpu_elem, master, this_task->task_class->name, i, oldmaster);
+            parsec_data_copy_detach(oldmaster, lru_gpu_elem, gpu_device->super.device_index);
+
+#if !defined(PARSEC_OPENMP_OMP_ALLOC_PER_TILE)
+            /* Let's free this space, and try again to malloc some space */
+            zone_free( gpu_device->memory, (void*)(lru_gpu_elem->device_private) );
+            PARSEC_DEBUG_VERBOSE(20, parsec_openmp_output_stream,
+                                "Release LRU-retrieved OMP copy %p [ref_count %d: must be 0]",
+                                lru_gpu_elem, lru_gpu_elem->super.super.obj_reference_count);
+            OBJ_RELEASE(lru_gpu_elem); assert( NULL == lru_gpu_elem );
+            goto malloc_data;
+        }
+#else
+        gpu_elem = lru_gpu_elem;
+#endif
+        assert( 0 == gpu_elem->readers );
+        gpu_elem->coherency_state = DATA_COHERENCY_INVALID;
+        gpu_elem->version = 0;
+        parsec_data_copy_attach(master, gpu_elem, gpu_device->super.device_index);
+        this_task->data[i].data_out = gpu_elem;
+        temp_loc[i] = gpu_elem;
+        PARSEC_DEBUG_VERBOSE(20, parsec_openmp_output_stream,
+                            "Retain and insert OMP copy %p [ref_count %d] in LRU",
+                            gpu_elem, gpu_elem->super.super.obj_reference_count);
+        parsec_list_nolock_fifo_push(&gpu_device->gpu_mem_lru, (parsec_list_item_t*)gpu_elem);
+    }
+    return 0;
+}
+
+/**
+ * If the most current version of the data is not yet available on the GPU memory
+ * schedule a transfer.
+ * Returns:
+ *    0: The most recent version of the data is already available on the GPU
+ *    1: A copy has been scheduled on the corresponding stream
+ *   -1: A copy cannot be issued due to OMP.
+ */
+static inline int
+parsec_openmp_data_stage_in( parsec_openmp_device_t* gpu_device,
+                         int32_t type,
+                         parsec_data_pair_t* task_data,
+                         parsec_openmp_context_t *gpu_task,
+                         parsec_openmp_exec_stream_t *gpu_stream )
+{
+    parsec_data_copy_t* in_elem = task_data->data_in;
+    parsec_data_t* original = in_elem->original;
+    parsec_openmp_data_copy_t* gpu_elem = task_data->data_out;
+    int transfer_from = -1;
+
+    /**
+     * If the data will be accessed in write mode, remove it from any GPU data management
+     * lists until the task is completed.
+     */
+    if( FLOW_ACCESS_WRITE & type ) {
+        if (gpu_elem->readers > 0 ) {
+            parsec_warning("GPU[%d]:\tWrite access to data with existing readers (Possible anti-dependency, "
+                          "or concurrent accesses), please prevent that with CTL dependencies\n", gpu_device->omp_index);
+            return -86;
+        }
+        parsec_list_item_ring_chop((parsec_list_item_t*)gpu_elem);
+        PARSEC_LIST_ITEM_SINGLETON(gpu_elem);
+    }
+
+    /* DtoD copy, if data is read only, then we go back to CPU copy, and fetch data from CPU (HtoD) */
+    if( (in_elem != original->device_copies[0]) &&
+        (in_elem->version == original->device_copies[0]->version) ) {
+        /* We should never enter here, as long as we don't foward the GPU data to the input */
+        assert(!((in_elem != original->device_copies[0]) &&
+                 (in_elem->version == original->device_copies[0]->version)));
+        parsec_data_copy_release(in_elem);  /* release the copy in GPU1 */
+        task_data->data_in = original->device_copies[0];
+        in_elem = task_data->data_in;
+        OBJ_RETAIN(in_elem);  /* retain the corresponding CPU copy */
+    }
+
+    transfer_from = parsec_data_transfer_ownership_to_copy(original, gpu_device->super.device_index, (uint8_t)type);
+    gpu_device->super.required_data_in += original->nb_elts;
+    if( -1 != transfer_from ) {
+        PARSEC_DEBUG_VERBOSE(10, parsec_openmp_output_stream,
+                            "GPU[%d]:\t\tMove H2D data %x of %d bytes\t(Host: v:%d, ptr:%p / Device: v:%d, ptr:%p)",
+                            gpu_device->omp_index, original->key, original->nb_elts,
+                            in_elem->version, in_elem->device_private,
+                            gpu_elem->version, (void*)gpu_elem->device_private);
+
+        assert( gpu_elem->version <= in_elem->version );
+        assert((gpu_elem->version != in_elem->version) || (gpu_elem->data_transfer_status == DATA_STATUS_NOT_TRANSFER));
+
+#if defined(PARSEC_PROF_TRACE)
+        if( gpu_stream->prof_event_track_enable ) {
+            parsec_task_t *this_task = gpu_task->ec;
+
+            assert(-1 != gpu_stream->prof_event_key_start);
+            PARSEC_PROFILING_TRACE(gpu_stream->profiling,
+                                   gpu_stream->prof_event_key_start,
+                                   this_task->task_class->key_functions->key_hash(this_task->task_class->make_key(this_task->taskpool, this_task->locals), 64, NULL),
+                                   this_task->taskpool->taskpool_id,
+                                   &original);
+        }
+#endif
+
+        /* update the data version in GPU immediately, and mark the data under transfer */
+        assert((gpu_elem->version != in_elem->version) || (gpu_elem->data_transfer_status == DATA_STATUS_NOT_TRANSFER));
+        gpu_elem->version = in_elem->version;
+        gpu_elem->data_transfer_status = DATA_STATUS_UNDER_TRANSFER;
+        gpu_elem->push_task = gpu_task->ec;  /* only the task who does the transfer can modify the data status later. */
+        /* TODO: take ownership of the data */
+        
+        /* Push data into the GPU */
+        int* event = &gpu_stream->events[gpu_stream->start];
+        (*event)++;
+//#pragma omp target update nowait fromdepend(out: event[0]) device(gpu_device->omp_index)
+#pragma omp task depend(out: event[0]) firstprivate(gpu_device, original, in_elem, gpu_elem)
+{ //firstprivate(original, gpu_elem, in_elem, gpu_device)
+    PARSEC_DEBUG_VERBOSE(10, parsec_openmp_output_stream, 
+                        "GPU[%d]: starting omp_target_memcpy h2d from %p to %p on event %p (thread %d)",
+                        gpu_device->omp_index, in_elem->device_private, gpu_elem->device_private, event, omp_get_thread_num());
+    int status = omp_target_memcpy(gpu_elem->device_private, in_elem->device_private,
+                                   original->nb_elts, 0, 0,
+                                   gpu_device->omp_index, omp_get_initial_device());
+    if( 0 != status ) {
+        parsec_warning("<<%p>> -> <<%p>> [%d]", in_elem->device_private, gpu_elem->device_private, original->nb_elts);
+        //return -1; 
+    }
+}
+        gpu_device->super.transferred_data_in += original->nb_elts;
+        return 1;
+    }
+    else {
+        assert( gpu_elem->data_transfer_status == DATA_STATUS_COMPLETE_TRANSFER ||
+                gpu_elem->data_transfer_status == DATA_STATUS_UNDER_TRANSFER);
+
+        PARSEC_DEBUG_VERBOSE(10, parsec_openmp_output_stream,
+                            "GPU[%d]:\t\tNO Move H2D for data %x of %d bytes\n",
+                            gpu_device->omp_index, original->key, original->nb_elts);
+    }
+    /* TODO: data keeps the same coherence flags as before */
+    return 0;
+}
+
+void* parsec_openmp_pop_workspace(parsec_openmp_device_t* gpu_device, parsec_openmp_exec_stream_t* gpu_stream, size_t size)
+{
+    (void)gpu_device; (void)gpu_stream; (void)size;
+    void *work = NULL;
+
+#if !defined(PARSEC_OPENMP_OMP_ALLOC_PER_TILE)
+    if (gpu_stream->workspace == NULL) {
+        gpu_stream->workspace = (parsec_openmp_workspace_t *)malloc(sizeof(parsec_openmp_workspace_t));
+        gpu_stream->workspace->total_workspace = PARSEC_OPENMP_MAX_WORKSPACE;
+        gpu_stream->workspace->stack_head = PARSEC_OPENMP_MAX_WORKSPACE - 1;
+
+        for( int i = 0; i < PARSEC_OPENMP_MAX_WORKSPACE; i++ ) {
+            gpu_stream->workspace->workspace[i] = zone_malloc( gpu_device->memory, size);
+        }
+    }
+    assert (gpu_stream->workspace->stack_head >= 0);
+    work = gpu_stream->workspace->workspace[gpu_stream->workspace->stack_head];
+    gpu_stream->workspace->stack_head --;
+#endif /* !defined(PARSEC_OPENMP_OMP_ALLOC_PER_TILE) */
+    return work;
+}
+
+int parsec_openmp_push_workspace(parsec_openmp_device_t* gpu_device, parsec_openmp_exec_stream_t* gpu_stream)
+{
+    (void)gpu_device; (void)gpu_stream;
+#if !defined(PARSEC_OPENMP_OMP_ALLOC_PER_TILE)
+    gpu_stream->workspace->stack_head ++;
+    assert (gpu_stream->workspace->stack_head < PARSEC_OPENMP_MAX_WORKSPACE);
+#endif /* !defined(PARSEC_OPENMP_OMP_ALLOC_PER_TILE) */
+    return 0;
+}
+
+int parsec_openmp_free_workspace(parsec_openmp_device_t * gpu_device)
+{
+    (void)gpu_device;
+#if !defined(PARSEC_OPENMP_OMP_ALLOC_PER_TILE)
+    int i, j;
+    for( i = 0; i < gpu_device->max_exec_streams; i++ ) {
+        parsec_openmp_exec_stream_t *gpu_stream = &(gpu_device->exec_stream[i]);
+        if (gpu_stream->workspace != NULL) {
+            for (j = 0; j < gpu_stream->workspace->total_workspace; j++) {
+                zone_free( gpu_device->memory, gpu_stream->workspace->workspace[j] );
+            }
+            free(gpu_stream->workspace);
+            gpu_stream->workspace = NULL;
+        }
+    }
+#endif /* !defined(PARSEC_OPENMP_OMP_ALLOC_PER_TILE) */
+    return 0;
+}
+
+static inline int parsec_openmp_check_space_needed(parsec_openmp_device_t *gpu_device, parsec_openmp_context_t *gpu_task)
+{
+    int i;
+    int space_needed = 0;
+    parsec_task_t *this_task = gpu_task->ec;
+    parsec_data_t *original;
+    parsec_data_copy_t *data;
+    const parsec_flow_t *flow;
+
+    for( i = 0; i < this_task->task_class->nb_flows; i++ ) {
+        flow = gpu_task->flow[i];
+        if(!(flow->flow_flags)) continue;
+
+        data = this_task->data[i].data_in;
+        original = data->original;
+        if( NULL != PARSEC_DATA_GET_COPY(original, gpu_device->super.device_index) ) {
+            continue;
+        }
+        if(flow->flow_flags & FLOW_ACCESS_READ)
+            space_needed++;
+    }
+    return space_needed;
+}
+
+
+static int parsec_openmp_sort_pending_list(parsec_openmp_device_t *gpu_device)
+{
+    //parsec_list_t *sort_list = &(gpu_device->pending);
+    parsec_list_t *sort_list = gpu_device->exec_stream[0].fifo_pending;
+#if 0
+    if ( !parsec_atomic_trylock(&(sort_list->atomic_lock)) ) {
+        return 0;
+    }
+#endif
+    if (parsec_list_nolock_is_empty(sort_list) ) { /* list is empty */
+#if 0
+        parsec_atomic_unlock(&(sort_list->atomic_lock));
+#endif
+        return 0;
+    }
+
+    if (gpu_device->sort_starting_p == NULL || !parsec_list_nolock_contains(sort_list, gpu_device->sort_starting_p) ) {
+        gpu_device->sort_starting_p = (parsec_list_item_t*)sort_list->ghost_element.list_next;
+    }
+
+    /* p is head */
+    parsec_list_item_t *p = gpu_device->sort_starting_p;
+    int i, j, NB_SORT = 10, space_q, space_min;
+
+    parsec_list_item_t *q, *prev_p, *min_p;
+    for (i = 0; i < NB_SORT; i++) {
+        if ( p == &(sort_list->ghost_element) ) {
+            break;
+        }
+        min_p = p; /* assume the minimum one is the first one p */
+        q = (parsec_list_item_t*)min_p->list_next;
+        space_min = parsec_openmp_check_space_needed(gpu_device, (parsec_openmp_context_t*)min_p);
+        for (j = i+1; j < NB_SORT; j++) {
+            if ( q == &(sort_list->ghost_element) ) {
+                break;
+            }
+            space_q = parsec_openmp_check_space_needed(gpu_device, (parsec_openmp_context_t*)q);
+            if ( space_min > space_q ) {
+                min_p = q;
+                space_min = space_q;
+            }
+            q = (parsec_list_item_t*)q->list_next;
+
+        }
+        if (min_p != p) { /* minimum is not the first one, let's insert min_p before p */
+            /* take min_p out */
+            parsec_list_item_ring_chop(min_p);
+            PARSEC_LIST_ITEM_SINGLETON(min_p);
+            prev_p = (parsec_list_item_t*)p->list_prev;
+
+            /* insert min_p after prev_p */
+            parsec_list_nolock_add_after( sort_list, prev_p, min_p);
+        }
+        p = (parsec_list_item_t*)min_p->list_next;
+    }
+
+#if 0
+        parsec_atomic_unlock(&(sort_list->atomic_lock));
+#endif
+    return 0;
+}
+
+/**
+ * Transfer at most the PARSEC_OPENMP_W2R_NB_MOVE_OUT oldest data from the GPU back
+ * to main memory. Create a single task to move them all out, then switch the
+ * GPU data copy in shared mode.
+ */
+static parsec_openmp_context_t* parsec_openmp_create_W2R_task(parsec_openmp_device_t *gpu_device, parsec_execution_stream_t *es)
+{
+    parsec_openmp_context_t *w2r_task = NULL;
+    parsec_task_t *ec = NULL;
+    parsec_openmp_data_copy_t *gpu_copy;
+    parsec_data_t* original;
+    parsec_list_item_t* item = (parsec_list_item_t*)gpu_device->gpu_mem_owned_lru.ghost_element.list_next;
+    int nb_cleaned = 0;
+
+    /* Find a data copy that has no pending users on the GPU, and can be
+     * safely moved back on the main memory */
+    while(nb_cleaned < PARSEC_OPENMP_W2R_NB_MOVE_OUT) {
+        /* Break at the end of the list */
+        if( item == &(gpu_device->gpu_mem_owned_lru.ghost_element) ) {
+            break;
+        }
+        gpu_copy = (parsec_openmp_data_copy_t*)item;
+        original = gpu_copy->original;
+        if( (0 != gpu_copy->readers) || (0 != original->device_copies[0]->readers) ) {
+            item = (parsec_list_item_t*)item->list_next;  /* conversion needed for volatile */
+            continue;
+        }
+        if( NULL == ec ) {  /* allocate on-demand */
+            ec = (parsec_task_t*)parsec_thread_mempool_allocate(es->context_mempool);
+            if( NULL == ec )  /* we're running out of memory. Bail out. */
+                break;
+            ec->status = PARSEC_TASK_STATUS_NONE;
+        }
+        parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy);
+        PARSEC_LIST_ITEM_SINGLETON(gpu_copy);
+        gpu_copy->readers++;
+        ec->data[nb_cleaned].data_out = gpu_copy;
+        nb_cleaned++;
+    }
+
+    if( 0 == nb_cleaned )
+        return NULL;
+
+    w2r_task = (parsec_openmp_context_t *)malloc(sizeof(parsec_openmp_context_t));
+    OBJ_CONSTRUCT(w2r_task, parsec_list_item_t);
+    ec->priority = INT32_MAX;
+    ec->task_class = NULL;
+    w2r_task->ec = ec;
+    w2r_task->task_type = GPU_TASK_TYPE_D2HTRANSFER;
+    return w2r_task;
+}
+
+/**
+ * Complete a data copy transfer originated from the engine.
+ */
+int parsec_openmp_W2R_task_fini(parsec_openmp_device_t *gpu_device,
+                            parsec_openmp_context_t *w2r_task,
+                            parsec_execution_stream_t *es)
+{
+    parsec_openmp_data_copy_t *gpu_copy, *cpu_copy;
+    parsec_task_t *ec = w2r_task->ec;
+    parsec_data_t* original;
+    int i;
+
+    assert(w2r_task->task_type == GPU_TASK_TYPE_D2HTRANSFER);
+    for( i = 0; (i < PARSEC_OPENMP_W2R_NB_MOVE_OUT) && (NULL != (gpu_copy = ec->data[i].data_out)); i++ ) {
+        gpu_copy->coherency_state = DATA_COHERENCY_SHARED;
+        original = gpu_copy->original;
+        cpu_copy = original->device_copies[0];
+        cpu_copy->coherency_state =  DATA_COHERENCY_SHARED;
+        cpu_copy->version = gpu_copy->version;
+        PARSEC_DEBUG_VERBOSE(10, parsec_openmp_output_stream,
+                            "Mirror on CPU and move OMP copy %p [ref_count %d] in LRU",
+                            gpu_copy, gpu_copy->super.super.obj_reference_count);
+        parsec_list_nolock_fifo_push(&gpu_device->gpu_mem_lru, (parsec_list_item_t*)gpu_copy);
+        gpu_copy->readers--;
+        assert(gpu_copy->readers >= 0);
+    }
+    parsec_thread_mempool_free(es->context_mempool, w2r_task->ec);
+    free(w2r_task);
+    return 0;
+}
+
+#if PARSEC_GPU_USE_PRIORITIES
+static inline parsec_list_item_t* parsec_fifo_push_ordered( parsec_list_t* fifo,
+                                                          parsec_list_item_t* elem )
+{
+    parsec_list_nolock_push_sorted(fifo, elem, parsec_execution_context_priority_comparator);
+    return elem;
+}
+#define PARSEC_FIFO_PUSH  parsec_fifo_push_ordered
+#else
+#define PARSEC_FIFO_PUSH  parsec_list_nolock_fifo_push
+#endif
+
+static inline int
+progress_stream( parsec_openmp_device_t* gpu_device,
+                 parsec_openmp_exec_stream_t* exec_stream,
+                 advance_openmp_task_function_t progress_fct,
+                 parsec_openmp_context_t* task,
+                 parsec_openmp_context_t** out_task )
+{
+    int saved_rc = 0, rc, i;
+    *out_task = NULL;
+    parsec_task_t *this_task;
+    const parsec_flow_t *flow;
+
+    if( NULL != task ) {
+        PARSEC_FIFO_PUSH(exec_stream->fifo_pending, (parsec_list_item_t*)task);
+        task = NULL;
+    }
+  grab_a_task:
+    if( NULL == exec_stream->tasks[exec_stream->start] ) {
+        /* get the best task */
+        task = (parsec_openmp_context_t*)parsec_list_nolock_fifo_pop(exec_stream->fifo_pending);
+    }
+    if( NULL == task ) {
+        /* No more room on the event list or no tasks. Keep moving */
+        goto check_completion;
+    }
+    PARSEC_LIST_ITEM_SINGLETON((parsec_list_item_t*)task);
+
+    assert( NULL == exec_stream->tasks[exec_stream->start] );
+    /**
+     * In case the task is succesfully progressed, the corresponding profiling
+     * event is triggered.
+     */
+    if ( NULL == progress_fct ) {
+        /* Grab the submit function */
+        progress_fct = task->submit;
+#if defined(PARSEC_DEBUG_PARANOID)
+        for( i = 0; i < task->ec->task_class->nb_flows; i++ ) {
+            flow = task->flow[i];
+            assert( flow );
+            assert( flow->flow_index == i );
+            if(!flow->flow_flags) continue;
+            assert(task->ec->data[i].data_out->data_transfer_status == DATA_STATUS_COMPLETE_TRANSFER);
+        }
+#endif /* defined(PARSEC_DEBUG_PARANOID) */
+    }
+    assert( NULL != progress_fct );
+    rc = progress_fct( gpu_device, task, exec_stream );
+    if( 0 > rc ) {
+        if( PARSEC_HOOK_RETURN_AGAIN != rc) {
+            *out_task = task;
+            return rc;
+        }
+
+        PARSEC_FIFO_PUSH(exec_stream->fifo_pending, (parsec_list_item_t*)task);
+        PARSEC_DEBUG_VERBOSE(10, parsec_openmp_output_stream,
+                             "GPU[%d]: Reschedule %s(task %p) priority %d: no room available on the GPU for data",
+                             gpu_device->omp_index, task->ec->task_class->name, (void*)task->ec, task->ec->priority);
+    } else {
+        /**
+         * Do not skip the omp event generation. The problem is that some of the inputs
+         * might be in the pipe of being transferred to the GPU. If we activate this task
+         * too early, it might get executed before the data is available on the GPU.
+         * Obviously, this lead to incorrect results.
+         */
+        int* event = &exec_stream->events[exec_stream->start];
+        PARSEC_DEBUG_VERBOSE(10, parsec_openmp_output_stream,
+                             "GPU[%d]:  %s(task %p) target completion waited on event %p, posted %d actions in that event",
+                             gpu_device->omp_index, task->ec->task_class->name, (void*)task->ec, event, *event);
+#pragma omp task depend(inout: event[0])
+{
+    PARSEC_DEBUG_VERBOSE(10, parsec_openmp_output_stream, 
+                        "GPU[%d]: decreasing event counter %p from %d to 0",
+                        gpu_device->omp_index, event, *event);
+    assert(0 != *event);
+    (*event)=0;
+}
+        exec_stream->tasks[exec_stream->start] = task;
+        exec_stream->start = (exec_stream->start + 1) % exec_stream->max_events;
+#if defined(PARSEC_DEBUG_NOISIER)
+        if( task->task_type == GPU_TASK_TYPE_D2HTRANSFER ) {
+            PARSEC_DEBUG_VERBOSE(10, parsec_openmp_output_stream,
+                                "GPU[%d]: Submitted Transfer(task %p) on stream %d",
+                                gpu_device->omp_index, (void*)task->ec,
+                                exec_stream->omp_stream);
+        }
+        else {
+            PARSEC_DEBUG_VERBOSE(10, parsec_openmp_output_stream,
+                                "GPU[%d]: Submitted %s(task %p) priority %d on stream %d",
+                                gpu_device->omp_index,
+                                task->ec->task_class->name, (void*)task->ec, task->ec->priority,
+                                exec_stream->omp_stream);
+        }
+#endif
+    }
+    task = NULL;
+
+  check_completion:
+    if( (NULL == *out_task) && (NULL != exec_stream->tasks[exec_stream->end]) ) {
+#pragma omp taskyield
+        if(0 == exec_stream->events[exec_stream->end]) {
+            /**
+             * Even though omp event return success, the PUSH may not be
+             * completed if no PUSH is required by this task and the PUSH is
+             * actually done by another task, so we need to check if the data is
+             * actually ready to use
+             */
+            if (exec_stream == &(gpu_device->exec_stream[0])) {  /* exec_stream[0] is the PUSH stream */
+                parsec_openmp_context_t *gtask = exec_stream->tasks[exec_stream->end];
+                this_task = gtask->ec;
+                for( i = 0; i < this_task->task_class->nb_flows; i++ ) {
+                    flow = gtask->flow[i];
+                    assert( flow );
+                    assert( flow->flow_index == i );
+                    if(!flow->flow_flags) continue;
+                    if (this_task->data[i].data_out->push_task == this_task) {   /* only the task who did this PUSH can modify the status */
+                        this_task->data[i].data_out->data_transfer_status = DATA_STATUS_COMPLETE_TRANSFER;
+                        continue;
+                    }
+                    assert(this_task->data[i].data_out->data_transfer_status == DATA_STATUS_COMPLETE_TRANSFER);
+                    if (this_task->data[i].data_out->data_transfer_status != DATA_STATUS_COMPLETE_TRANSFER) {  /* data is not ready */
+                        /**
+                         * As long as we have only one stream to push the data
+                         * to the GPU, we should never end up in this case.
+                         * Remove previous assert if changed.
+                         */
+                        return saved_rc;
+                    }
+                }
+            }
+
+            /* Save the task for the next step */
+            task = *out_task = exec_stream->tasks[exec_stream->end];
+#if defined(PARSEC_DEBUG_NOISIER)
+            if( task->task_type == GPU_TASK_TYPE_D2HTRANSFER ) {
+                PARSEC_DEBUG_VERBOSE(19, parsec_openmp_output_stream,
+                                    "GPU[%d]: Completed Transfer(task %p) on stream %d",
+                                    gpu_device->omp_index,
+                                    (void*)task->ec,
+                                    exec_stream->omp_stream);
+            }
+            else {
+                PARSEC_DEBUG_VERBOSE(19, parsec_openmp_output_stream,
+                                    "GPU[%d]: Completed %s(task %p) priority %d on stream %d",
+                                    gpu_device->omp_index,
+                                    task->ec->task_class->name, (void*)task->ec, task->ec->priority,
+                                    exec_stream->omp_stream);
+            }
+#endif
+            exec_stream->tasks[exec_stream->end] = NULL;
+            exec_stream->end = (exec_stream->end + 1) % exec_stream->max_events;
+#if defined(PARSEC_PROF_TRACE)
+            if( exec_stream->prof_event_track_enable ) {
+                if( task->task_type == GPU_TASK_TYPE_D2HTRANSFER ) {
+                    assert( exec_stream->prof_event_key_end == parsec_omp_moveout_key_end );
+                    PARSEC_PROFILING_TRACE(exec_stream->profiling,
+                                          exec_stream->prof_event_key_end,
+                                          -1, 0, NULL);
+                } else {
+                    PARSEC_TASK_PROF_TRACE(exec_stream->profiling,
+                                          (-1 == exec_stream->prof_event_key_end ?
+                                           PARSEC_PROF_FUNC_KEY_END(task->ec->taskpool,
+                                                                    task->ec->task_class->task_class_id) :
+                                           exec_stream->prof_event_key_end),
+                                          task->ec);
+                }
+            }
+#endif /* (PARSEC_PROF_TRACE) */
+            task = NULL;  /* Try to schedule another task */
+            goto grab_a_task;
+        }
+    }
+    return saved_rc;
+}
+
+void dump_omp_stream(parsec_openmp_exec_stream_t* exec_stream)
+{
+    char task_str[128];
+    int i;
+
+    parsec_debug_verbose(0, parsec_openmp_output_stream,
+                        "Dump GPU exec stream %p [events = %d, start = %d, end = %d, executed = %d]",
+                        exec_stream, exec_stream->max_events, exec_stream->start, exec_stream->end,
+                        exec_stream->executed);
+    for( i = 0; i < exec_stream->max_events; i++ ) {
+        if( NULL == exec_stream->tasks[i] ) continue;
+        parsec_debug_verbose(0, parsec_openmp_output_stream,
+                            "    %d: %s", i, parsec_task_snprintf(task_str, 128, exec_stream->tasks[i]->ec));
+    }
+    /* Don't yet dump the fifo_pending queue */
+}
+
+void dump_omp_device(parsec_openmp_device_t* gpu_device)
+{
+    int i;
+
+    parsec_output(parsec_openmp_output_stream, "\n\n");
+    parsec_output(parsec_openmp_output_stream, "Device %d:%d (%p)\n", gpu_device->omp_index, gpu_device->super.device_index, gpu_device);
+    parsec_output(parsec_openmp_output_stream, "\tpeer mask %x executed tasks %llu max streams %d\n",
+                 gpu_device->peer_access_mask, (unsigned long long)gpu_device->super.executed_tasks, gpu_device->max_exec_streams);
+    parsec_output(parsec_openmp_output_stream, "\tstats transferred [in %llu out %llu] required [in %llu out %llu]\n",
+                 (unsigned long long)gpu_device->super.transferred_data_in, (unsigned long long)gpu_device->super.transferred_data_out,
+                 (unsigned long long)gpu_device->super.required_data_in, (unsigned long long)gpu_device->super.required_data_out);
+    for( i = 0; i < gpu_device->max_exec_streams; i++ ) {
+        dump_omp_stream(&gpu_device->exec_stream[i]);
+    }
+    if( !parsec_list_nolock_is_empty(&gpu_device->gpu_mem_lru) ) {
+        parsec_output(parsec_openmp_output_stream, "#\n# LRU list\n#\n");
+        i = 0;
+        PARSEC_LIST_NOLOCK_ITERATOR(&gpu_device->gpu_mem_lru, item,
+                             {
+                                 parsec_openmp_data_copy_t* gpu_copy = (parsec_openmp_data_copy_t*)item;
+                                 parsec_output(parsec_openmp_output_stream, "  %d. elem %p GPU mem %p\n", i, gpu_copy, gpu_copy->device_private);
+                                 parsec_dump_data_copy(gpu_copy);
+                                 i++;
+                             });
+    };
+    if( !parsec_list_nolock_is_empty(&gpu_device->gpu_mem_owned_lru) ) {
+        parsec_output(parsec_openmp_output_stream, "#\n# Owned LRU list\n#\n");
+        i = 0;
+        PARSEC_LIST_NOLOCK_ITERATOR(&gpu_device->gpu_mem_owned_lru, item,
+                             {
+                                 parsec_openmp_data_copy_t* gpu_copy = (parsec_openmp_data_copy_t*)item;
+                                 parsec_output(parsec_openmp_output_stream, "  %d. elem %p GPU mem %p\n", i, gpu_copy, gpu_copy->device_private);
+                                 parsec_dump_data_copy(gpu_copy);
+                                 i++;
+                             });
+    };
+    parsec_output(parsec_openmp_output_stream, "\n\n");
+}
+
+/**
+ *  This function schedule the move of all the data required for a
+ *  specific task from the main memory into the GPU memory.
+ *
+ *  Returns:
+ *     a positive number: the number of data to be moved.
+ *     -1: data cannot be moved into the GPU.
+ *     -2: No more room on the GPU to move this data.
+ */
+int
+parsec_openmp_kernel_push( parsec_openmp_device_t            *gpu_device,
+                       parsec_openmp_context_t     *gpu_task,
+                       parsec_openmp_exec_stream_t *gpu_stream)
+{
+    int i, ret = 0;
+    parsec_task_t *this_task = gpu_task->ec;
+    const parsec_flow_t        *flow;
+#if defined(PARSEC_DEBUG_NOISIER)
+    char tmp[MAX_TASK_STRLEN];
+#endif
+
+    PARSEC_DEBUG_VERBOSE(10, parsec_openmp_output_stream,
+                        "GPU[%d]: Try to Push %s",
+                        gpu_device->omp_index,
+                        parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task) );
+
+    /**
+     * First, let's reserve enough space on the device to transfer the data on the GPU.
+     */
+    ret = parsec_openmp_data_reserve_device_space( gpu_device,
+                                               gpu_task );
+    if( ret < 0 ) {
+        return ret;
+    }
+
+    /**
+     * Second, We have enough space, let's schedule the required transfer
+     */
+    PARSEC_TASK_PROF_TRACE_IF(gpu_stream->prof_event_track_enable,
+                             gpu_stream->profiling,
+                             (-1 == gpu_stream->prof_event_key_start ?
+                              PARSEC_PROF_FUNC_KEY_START(this_task->taskpool,
+                                                        this_task->task_class->task_class_id) :
+                              gpu_stream->prof_event_key_start),
+                             this_task);
+
+    for( i = 0; i < this_task->task_class->nb_flows; i++ ) {
+        flow = gpu_task->flow[i];
+        /* Skip CTL flows */
+        if(!(flow->flow_flags)) continue;
+
+        assert( NULL != parsec_data_copy_get_ptr(this_task->data[i].data_in) );
+
+        PARSEC_DEBUG_VERBOSE(20, parsec_openmp_output_stream,
+                            "GPU[%1d]:\t\tIN  Data of %s <%x> on GPU\n",
+                            gpu_device->omp_index, flow->name,
+                            this_task->data[i].data_out->original->key);
+        ret = parsec_openmp_data_stage_in( gpu_device, flow->flow_flags,
+                                       &(this_task->data[i]), gpu_task, gpu_stream );
+        if( ret < 0 ) {
+            return ret;
+        }
+    }
+
+    PARSEC_DEBUG_VERBOSE(10, parsec_openmp_output_stream,
+                        "GPU[%d]: Push task %s DONE",
+                        gpu_device->omp_index,
+                        parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task) );
+
+    return ret;
+}
+
+/**
+ *  This function schedule the move of all the modified data for a
+ *  specific task from the GPU memory into the main memory.
+ *
+ *  Returns: negative number if any error occured.
+ *           positive: the number of data to be moved.
+ */
+int
+parsec_openmp_kernel_pop( parsec_openmp_device_t            *gpu_device,
+                      parsec_openmp_context_t     *gpu_task,
+                      parsec_openmp_exec_stream_t *gpu_stream)
+{
+    parsec_task_t *this_task = gpu_task->ec;
+    parsec_openmp_data_copy_t     *gpu_copy;
+    parsec_data_t              *original;
+    const parsec_flow_t        *flow;
+    int return_code = 0, how_many = 0, i;
+#if defined(PARSEC_DEBUG_NOISIER)
+    char tmp[MAX_TASK_STRLEN];
+#endif
+
+    if (gpu_task->task_type == GPU_TASK_TYPE_D2HTRANSFER) {
+        for( i = 0; i < 1; i++ ) {
+            gpu_copy = this_task->data[i].data_out;
+            original = gpu_copy->original;
+            int* event = &gpu_stream->events[gpu_stream->start];
+            (*event)++;
+#pragma omp task depend(out: event[0]) firstprivate(this_task, gpu_device, original, gpu_copy)
+{ //firstprivate(original, gpu_elem, in_elem, gpu_device)
+    PARSEC_DEBUG_VERBOSE(10, parsec_openmp_output_stream, 
+                        "GPU[%d]: starting omp_target_memcpy d2h from %p to %p on event %p (thread %d)",
+                        gpu_device->omp_index, gpu_copy->device_private, original->device_copies[0]->device_private, event, omp_get_thread_num());
+    int status = omp_target_memcpy(original->device_copies[0]->device_private, 
+                                   gpu_copy->device_private,
+                                   original->nb_elts, 0, 0,
+                                   omp_get_initial_device(), gpu_device->omp_index);
+    if( 0 != status ) {
+        parsec_warning("data %s <<%p>> -> <<%p>> [%d]", this_task->task_class->out[i]->name, 
+                                                        gpu_copy->device_private,
+                                                        original->device_copies[0]->device_private,
+                                                        original->nb_elts);
+    }
+}
+        }
+        return return_code;
+    }
+
+    PARSEC_DEBUG_VERBOSE(10, parsec_openmp_output_stream,
+                        "GPU[%d]: Try to Pop %s",
+                        gpu_device->omp_index,
+                        parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task) );
+
+    for( i = 0; i < this_task->task_class->nb_flows; i++ ) {
+        /* Don't bother if there is no real data (aka. CTL or no output) */
+        if(NULL == this_task->data[i].data_out) continue;
+
+        flow = gpu_task->flow[i];
+
+        gpu_copy = this_task->data[i].data_out;
+        original = gpu_copy->original;
+        assert(original == this_task->data[i].data_in->original);
+
+        if( !(flow->flow_flags & FLOW_ACCESS_WRITE) ) {
+            /* Do not propagate GPU copies to successors (temporary solution) */
+            this_task->data[i].data_out = original->device_copies[0];
+        }
+
+        if( flow->flow_flags & FLOW_ACCESS_READ ) {
+            gpu_copy->readers--; assert(gpu_copy->readers >= 0);
+            if( (0 == gpu_copy->readers) &&
+                !(flow->flow_flags & FLOW_ACCESS_WRITE) ) {
+                parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy);
+                PARSEC_LIST_ITEM_SINGLETON(gpu_copy); /* TODO: singleton instead? */
+                parsec_list_nolock_fifo_push(&gpu_device->gpu_mem_lru, (parsec_list_item_t*)gpu_copy);
+                continue;  /* done with this element, go for the next one */
+            }
+        }
+        if( flow->flow_flags & FLOW_ACCESS_WRITE ) {
+            assert( gpu_copy == parsec_data_get_copy(gpu_copy->original, gpu_device->super.device_index) );
+
+            /* Stage the transfer of the data back to main memory */
+            gpu_device->super.required_data_out += original->nb_elts;
+            assert( ((parsec_list_item_t*)gpu_copy)->list_next == (parsec_list_item_t*)gpu_copy );
+            assert( ((parsec_list_item_t*)gpu_copy)->list_prev == (parsec_list_item_t*)gpu_copy );
+
+            PARSEC_DEBUG_VERBOSE(20, parsec_openmp_output_stream,
+                                "GPU[%1d]:\tOUT Data of %s\n", gpu_device->omp_index, flow->name);
+
+            if( gpu_task->pushout[i] ) {
+                /* TODO: make sure no readers are working on the CPU version */
+                original = gpu_copy->original;
+                PARSEC_DEBUG_VERBOSE(10, parsec_openmp_output_stream,
+                                    "GPU[%d]:\tMove D2H data <%s:%x> D:%p -> H:%p requested\n",
+                                    gpu_device->omp_index, flow->name, original->key,
+                                    (void*)gpu_copy->device_private, original->device_copies[0]->device_private);
+                PARSEC_TASK_PROF_TRACE_IF(gpu_stream->prof_event_track_enable,
+                                         gpu_stream->profiling,
+                                         (-1 == gpu_stream->prof_event_key_start ?
+                                          PARSEC_PROF_FUNC_KEY_START(this_task->taskpool,
+                                                                    this_task->task_class->task_class_id) :
+                                          gpu_stream->prof_event_key_start),
+                                         this_task);
+                /* Move the data back into main memory */
+                int* event = &gpu_stream->events[gpu_stream->start];
+                (*event)++;
+#pragma omp task depend(out: event[0]) firstprivate(this_task, gpu_device, original, gpu_copy)
+{ //firstprivate(original, gpu_elem, in_elem, gpu_device)
+    PARSEC_DEBUG_VERBOSE(10, parsec_openmp_output_stream, 
+                        "GPU[%d]: starting omp_target_memcpy d2h from %p to %p on event %p (thread %d)",
+                        gpu_device->omp_index, gpu_copy->device_private, original->device_copies[0]->device_private, event, omp_get_thread_num());
+    int status = omp_target_memcpy(original->device_copies[0]->device_private, 
+                                   gpu_copy->device_private,
+                                   original->nb_elts, 0, 0,
+                                   omp_get_initial_device(), gpu_device->omp_index);
+    if( 0 != status ) {
+        parsec_warning("data %s <<%p>> -> <<%p>> [%d]", this_task->task_class->out[i]->name, 
+                                                        gpu_copy->device_private,
+                                                        original->device_copies[0]->device_private,
+                                                        original->nb_elts);
+    }
+}
+                gpu_device->super.transferred_data_out += original->nb_elts; /* TODO: not hardcoded, use datatype size */
+                how_many++;
+            }
+        }
+    }
+
+    PARSEC_DEBUG_VERBOSE(10, parsec_openmp_output_stream,
+                        "GPU[%d]: Pop %s DONE",
+                        gpu_device->omp_index,
+                        parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task) );
+
+  release_and_return_error:
+    return (return_code < 0 ? return_code : how_many);
+}
+
+/**
+ * Make sure all data on the device is correctly put back into the queues.
+ */
+int
+parsec_openmp_kernel_epilog( parsec_openmp_device_t        *gpu_device,
+                         parsec_openmp_context_t *gpu_task )
+{
+    parsec_task_t *this_task = gpu_task->ec;
+    parsec_openmp_data_copy_t     *gpu_copy, *cpu_copy;
+    parsec_data_t              *original;
+    int i;
+
+#if defined(PARSEC_DEBUG_NOISIER)
+    char tmp[MAX_TASK_STRLEN];
+    PARSEC_DEBUG_VERBOSE(10, parsec_openmp_output_stream,
+                        "GPU[%d]: Epilog of %s",
+                        gpu_device->omp_index,
+                        parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task) );
+#endif
+
+    for( i = 0; i < this_task->task_class->nb_flows; i++ ) {
+        /* Don't bother if there is no real data (aka. CTL or no output) */
+        if(NULL == this_task->data[i].data_out) continue;
+
+        if( !(gpu_task->flow[i]->flow_flags & FLOW_ACCESS_WRITE) ) {
+            /* Warning data_out for read only flows has been overwritten in pop */
+            continue;
+        }
+
+        gpu_copy = this_task->data[i].data_out;
+        original = gpu_copy->original;
+        cpu_copy = original->device_copies[0];
+
+        /**
+         * There might be a race condition here. We can't assume the first CPU
+         * version is the corresponding CPU copy, as a new CPU-bound data
+         * might have been created meanwhile.
+         *
+         * WARNING: For now we always forward the cpu_copy to the next task, to
+         * do that, we lie to the engine by updating the CPU copy to the same
+         * status than the GPU copy without updating the data itself. Thus, the
+         * cpu copy is really invalid. this is related to Issue #88, and the
+         * fact that:
+         *      - we don't forward the gpu copy as output
+         *      - we always take a cpu copy as input, so it has to be in the
+         *        same state as the GPU to prevent an extra data movement.
+         */
+        assert( DATA_COHERENCY_OWNED == gpu_copy->coherency_state );
+        gpu_copy->coherency_state = DATA_COHERENCY_SHARED;
+        cpu_copy->coherency_state = DATA_COHERENCY_SHARED;
+
+        /**
+         *  The cpu_copy will be updated in the completion, and at that moment
+         *  the two versions will be identical.
+         */
+        cpu_copy->version = gpu_copy->version;
+        gpu_copy->version++;  /* on to the next version */
+
+        /**
+         * Let's lie to the engine by reporting that working version of this
+         * data (aka. the one that GEMM worked on) is now on the CPU.
+         */
+        this_task->data[i].data_out = cpu_copy;
+
+        if( gpu_task->pushout[i] ) {
+            parsec_list_nolock_fifo_push(&gpu_device->gpu_mem_lru, (parsec_list_item_t*)gpu_copy);
+            PARSEC_DEBUG_VERBOSE(20, parsec_openmp_output_stream,
+                                "OMP copy %p [ref_count %d] moved to the read LRU in %s\n",
+                                gpu_copy, gpu_copy->super.super.obj_reference_count, __func__);
+        } else {
+            parsec_list_nolock_fifo_push(&gpu_device->gpu_mem_owned_lru, (parsec_list_item_t*)gpu_copy);
+        }
+    }
+    return 0;
+}
+
+/** @brief Release the OMP copies of the data used in WRITE mode.
+ *
+ * @details This function can be used when the OMP task didn't run
+ *          to completion on the device (either due to an error, or
+ *          simply because the body requested a reexecution on a
+ *          different location). It releases the OMP copies of the
+ *          output data, allowing them to be reused by the runtime.
+ *          This function has the drawback of kicking in too late,
+ *          after all data transfers have been completed toward the
+ *          device.
+ *
+ * @param [IN] gpu_device, the GPU device the the task has been
+ *             supposed to execute.
+ * @param [IN] gpu_task, the task that has been cancelled, and which
+ *             needs it's data returned to the runtime.
+ * @return Currently only success.
+ */
+int
+parsec_openmp_kernel_cleanout( parsec_openmp_device_t        *gpu_device,
+                            parsec_openmp_context_t *gpu_task )
+{
+    parsec_task_t *this_task = gpu_task->ec;
+    parsec_openmp_data_copy_t     *gpu_copy, *cpu_copy;
+    parsec_data_t              *original;
+    int i;
+
+#if defined(PARSEC_DEBUG_NOISIER)
+    char tmp[MAX_TASK_STRLEN];
+    PARSEC_DEBUG_VERBOSE(10, parsec_openmp_output_stream,
+                        "GPU[%d]: Cleanup of %s",
+                        gpu_device->omp_index,
+                        parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task) );
+#endif
+
+    for( i = 0; i < this_task->task_class->nb_flows; i++ ) {
+        /* Don't bother if there is no real data (aka. CTL or no output) */
+        if(NULL == this_task->data[i].data_out) continue;
+        if( !(gpu_task->flow[i]->flow_flags & FLOW_ACCESS_WRITE) ) {
+            /* Warning data_out for read only flows has been overwritten in pop */
+            continue;
+        }
+
+        gpu_copy = this_task->data[i].data_out;
+        original = gpu_copy->original;
+        assert(gpu_copy->super.super.obj_reference_count > 1);
+        /* Issue #134 */
+        parsec_data_copy_detach(original, gpu_copy, gpu_device->super.device_index);
+        gpu_copy->coherency_state = DATA_COHERENCY_SHARED;
+        cpu_copy = original->device_copies[0];
+
+        /**
+         * Let's lie to the engine by reporting that working version of this
+         * data (aka. the one that GEMM worked on) is now on the CPU.
+         */
+        this_task->data[i].data_out = cpu_copy;
+        parsec_list_nolock_fifo_push(&gpu_device->gpu_mem_lru, (parsec_list_item_t*)gpu_copy);
+        PARSEC_DEBUG_VERBOSE(20, parsec_openmp_output_stream,
+                             "OMP copy %p [ref_count %d] moved to the read LRU in %s\n",
+                             gpu_copy, gpu_copy->super.super.obj_reference_count, __func__);
+    }
+    return 0;
+}
+
+/**
+ * This version is based on 4 streams: one for transfers from the memory to
+ * the GPU, 2 for kernel executions and one for tranfers from the GPU into
+ * the main memory. The synchronization on each stream is based on OMP events,
+ * such an event indicate that a specific epoch of the lifetime of a task has
+ * been completed. Each type of stream (in, exec and out) has a pending FIFO,
+ * where tasks ready to jump to the respective step are waiting.
+ */
+parsec_hook_return_t
+parsec_openmp_kernel_scheduler( parsec_execution_stream_t *es,
+                            parsec_openmp_context_t    *gpu_task,
+                            int which_gpu )
+{
+    parsec_openmp_device_t* gpu_device;
+    int rc, exec_stream = 0;
+    parsec_openmp_context_t *progress_task, *out_task_push, *out_task_submit = NULL, *out_task_pop = NULL;
+#if defined(PARSEC_DEBUG_NOISIER)
+    char tmp[MAX_TASK_STRLEN];
+#endif
+
+    gpu_device = (parsec_openmp_device_t*)parsec_devices_get(which_gpu);
+
+#if defined(PARSEC_PROF_TRACE)
+    PARSEC_PROFILING_TRACE_FLAGS( es->es_profile,
+                                  PARSEC_PROF_FUNC_KEY_END(gpu_task->ec->taskpool,
+                                                           gpu_task->ec->task_class->task_class_id),
+                                  gpu_task->ec->task_class->key_functions->key_hash(gpu_task->ec->task_class->make_key(gpu_task->ec->taskpool, gpu_task->ec->locals), 64, NULL),
+                                  gpu_task->ec->taskpool->taskpool_id, NULL,
+                                  PARSEC_PROFILING_EVENT_RESCHEDULED );
+#endif /* defined(PARSEC_PROF_TRACE) */
+
+    /* Check the GPU status */
+    rc = parsec_atomic_inc_32b( &(gpu_device->mutex) );
+    if( 1 != rc ) {  /* I'm not the only one messing with this GPU */
+        parsec_fifo_push( &(gpu_device->pending), (parsec_list_item_t*)gpu_task );
+        return PARSEC_HOOK_RETURN_ASYNC;
+    }
+
+#if defined(PARSEC_PROF_TRACE)
+    if( parsec_omp_trackable_events & PARSEC_PROFILE_OMP_TRACK_OWN )
+        PARSEC_PROFILING_TRACE( es->es_profile, parsec_omp_own_GPU_key_start,
+                               (unsigned long)es, PROFILE_OBJECT_ID_NULL, NULL );
+#endif  /* defined(PARSEC_PROF_TRACE) */
+
+    omp_set_default_device( gpu_device->omp_index );
+
+#pragma omp parallel num_threads(gpu_device->max_exec_streams) proc_bind(master)
+#pragma omp master
+{
+  check_in_deps:
+    if( NULL != gpu_task ) {
+        PARSEC_DEBUG_VERBOSE(10, parsec_openmp_output_stream,
+                            "GPU[%1d]:\tUpload data (if any) for %s priority %d",
+                            gpu_device->omp_index,
+                            parsec_task_snprintf(tmp, MAX_TASK_STRLEN, gpu_task->ec),
+                            gpu_task->ec->priority );
+    }
+    rc = progress_stream( gpu_device,
+                          &(gpu_device->exec_stream[0]),
+                          parsec_openmp_kernel_push,
+                          gpu_task, &progress_task );
+    if( rc < 0 ) {
+        if( -1 == rc )
+            goto disable_gpu;
+    }
+    gpu_task = progress_task;
+    out_task_push = progress_task;
+
+    /* Stage-in completed for this task: it is ready to be executed */
+    exec_stream = (exec_stream + 1) % (gpu_device->max_exec_streams - 2);  /* Choose an exec_stream */
+    if( NULL != gpu_task ) {
+        PARSEC_DEBUG_VERBOSE(10, parsec_debug_output,  "GPU[%1d]:\tExecute %s priority %d", gpu_device->omp_index,
+                            parsec_task_snprintf(tmp, MAX_TASK_STRLEN, gpu_task->ec),
+                            gpu_task->ec->priority );
+    }
+    rc = progress_stream( gpu_device,
+                          &(gpu_device->exec_stream[2+exec_stream]),
+                          NULL,
+                          gpu_task, &progress_task );
+    if( rc < 0 ) {
+        if( PARSEC_HOOK_RETURN_DISABLE == rc )
+            goto disable_gpu;
+        if( PARSEC_HOOK_RETURN_ASYNC != rc ) {
+            /* Reschedule the task. As the chore_id has been modified,
+               another incarnation of the task will be executed. */
+            if( NULL != progress_task ) {
+                __parsec_reschedule(es, progress_task->ec);
+                parsec_openmp_kernel_cleanout(gpu_device, progress_task);
+                gpu_task = progress_task;
+                progress_task = NULL;
+                goto remove_gpu_task;
+            }
+            gpu_task = NULL;
+            goto fetch_task_from_shared_queue;
+        }
+        progress_task = NULL;
+    }
+    gpu_task = progress_task;
+    out_task_submit = progress_task;
+
+    /* This task has completed its execution: we have to check if we schedule DtoN */
+    if( NULL != gpu_task ) {
+        PARSEC_DEBUG_VERBOSE(10, parsec_debug_output,  "GPU[%1d]:\tRetrieve data (if any) for %s priority %d", gpu_device->omp_index,
+                            parsec_task_snprintf(tmp, MAX_TASK_STRLEN, gpu_task->ec),
+                            gpu_task->ec->priority );
+    }
+    if (out_task_submit == NULL && out_task_push == NULL) {
+        gpu_task = parsec_openmp_create_W2R_task(gpu_device, es);
+    }
+    /* Task is ready to move the data back to main memory */
+    rc = progress_stream( gpu_device,
+                          &(gpu_device->exec_stream[1]),
+                          parsec_openmp_kernel_pop,
+                          gpu_task, &progress_task );
+    if( rc < 0 ) {
+        if( -1 == rc )
+            goto disable_gpu;
+    }
+    if( NULL != progress_task ) {
+        /* We have a succesfully completed task. However, it is not gpu_task, as
+         * it was just submitted into the data retrieval system. Instead, the task
+         * ready to move into the next level is the progress_task.
+         */
+        gpu_task = progress_task;
+        progress_task = NULL;
+        goto complete_task;
+    }
+    gpu_task = progress_task;
+    out_task_pop = progress_task;
+
+  fetch_task_from_shared_queue:
+    assert( NULL == gpu_task );
+    if (out_task_submit == NULL && out_task_pop == NULL) {
+        parsec_openmp_sort_pending_list(gpu_device);
+    }
+    gpu_task = (parsec_openmp_context_t*)parsec_fifo_try_pop( &(gpu_device->pending) );
+    if( NULL != gpu_task ) {
+        PARSEC_DEBUG_VERBOSE(10, parsec_debug_output,  "GPU[%1d]:\tGet from shared queue %s priority %d", gpu_device->omp_index,
+                            parsec_task_snprintf(tmp, MAX_TASK_STRLEN, gpu_task->ec),
+                            gpu_task->ec->priority );
+    }
+    goto check_in_deps;
+
+  complete_task:
+    assert( NULL != gpu_task );
+    PARSEC_DEBUG_VERBOSE(10, parsec_debug_output,  "GPU[%1d]:\tComplete %s priority %d", gpu_device->omp_index,
+                        parsec_task_snprintf(tmp, MAX_TASK_STRLEN, gpu_task->ec),
+                        gpu_task->ec->priority );
+    /* Everything went fine so far, the result is correct and back in the main memory */
+    PARSEC_LIST_ITEM_SINGLETON(gpu_task);
+    if (gpu_task->task_type == GPU_TASK_TYPE_D2HTRANSFER) {
+        parsec_openmp_W2R_task_fini(gpu_device, gpu_task, es);
+        gpu_task = progress_task;
+        goto fetch_task_from_shared_queue;
+    }
+    parsec_openmp_kernel_epilog( gpu_device, gpu_task );
+    __parsec_complete_execution( es, gpu_task->ec );
+    gpu_device->super.executed_tasks++;
+  remove_gpu_task:
+    parsec_device_load[gpu_device->super.device_index] -= parsec_device_sweight[gpu_device->super.device_index];
+    free( gpu_task );
+    rc = parsec_atomic_dec_32b( &(gpu_device->mutex) );
+    if( 0 == rc ) {  /* I was the last one */
+#if defined(PARSEC_PROF_TRACE)
+        if( parsec_omp_trackable_events & PARSEC_PROFILE_OMP_TRACK_OWN )
+            PARSEC_PROFILING_TRACE( es->es_profile, parsec_omp_own_GPU_key_end,
+                                   (unsigned long)es, PROFILE_OBJECT_ID_NULL, NULL );
+#endif  /* defined(PARSEC_PROF_TRACE) */
+
+        rc = PARSEC_HOOK_RETURN_ASYNC;
+        goto fini;
+    }
+    gpu_task = progress_task;
+    goto fetch_task_from_shared_queue;
+
+  disable_gpu:
+    /* Something wrong happened. Push all the pending tasks back on the
+     * cores, and disable the gpu.
+     */
+    parsec_warning("Critical issue related to the GPU discovered. Giving up\n");
+    rc = PARSEC_HOOK_RETURN_DISABLE;
+  fini:
+    (void)rc;
+} // omp parallel
+    return rc;
+}
+
+#endif /* PARSEC_HAVE_OPENMP */
diff --git a/parsec/devices/openmp/dev_omp.h b/parsec/devices/openmp/dev_omp.h
new file mode 100644
index 000000000..65282b307
--- /dev/null
+++ b/parsec/devices/openmp/dev_omp.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2010-2018 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ */
+
+#ifndef PARSEC_OPENMP_DATA_H_HAS_BEEN_INCLUDED
+#define PARSEC_OPENMP_DATA_H_HAS_BEEN_INCLUDED
+
+#include "parsec/parsec_internal.h"
+#include "parsec/class/parsec_object.h"
+#include "parsec/class/fifo.h"
+#include "parsec/devices/device.h"
+
+#if defined(PARSEC_HAVE_OPENMP)
+#include "parsec/class/list_item.h"
+#include "parsec/class/list.h"
+
+#include <omp.h>
+
+#include "parsec/utils/zone_malloc.h"
+
+BEGIN_C_DECLS
+
+#define PARSEC_OPENMP_MAX_WORKSPACE      2
+#define PARSEC_OPENMP_W2R_NB_MOVE_OUT    1
+
+typedef struct __parsec_openmp_workspace {
+    void* workspace[PARSEC_OPENMP_MAX_WORKSPACE];
+    int stack_head;
+    int total_workspace;
+} parsec_openmp_workspace_t;
+
+#if defined(PARSEC_PROF_TRACE)
+#define PARSEC_PROFILE_OMP_TRACK_DATA_IN  0x0001
+#define PARSEC_PROFILE_OMP_TRACK_DATA_OUT 0x0002
+#define PARSEC_PROFILE_OMP_TRACK_OWN      0x0004
+#define PARSEC_PROFILE_OMP_TRACK_EXEC     0x0008
+
+extern int parsec_omp_trackable_events;
+extern int parsec_omp_movein_key_start;
+extern int parsec_omp_movein_key_end;
+extern int parsec_omp_moveout_key_start;
+extern int parsec_omp_moveout_key_end;
+extern int parsec_omp_own_GPU_key_start;
+extern int parsec_omp_own_GPU_key_end;
+#endif  /* defined(PROFILING) */
+
+#define GPU_TASK_TYPE_D2HTRANSFER 111
+
+struct __parsec_openmp_context;
+typedef struct __parsec_openmp_context parsec_openmp_context_t;
+
+struct __parsec_openmp_exec_stream;
+typedef struct __parsec_openmp_exec_stream parsec_openmp_exec_stream_t;
+
+struct _parsec_openmp_gpu_device;
+typedef struct _parsec_openmp_gpu_device parsec_openmp_device_t;
+
+typedef int (*advance_openmp_task_function_t)(parsec_openmp_device_t            *parsec_openmp_gpu_device,
+                                       parsec_openmp_context_t     *gpu_task,
+                                       parsec_openmp_exec_stream_t *gpu_stream);
+
+struct __parsec_openmp_context {
+    parsec_list_item_t          list_item;
+    parsec_task_t *ec;
+    advance_openmp_task_function_t    submit;
+    int                        task_type;
+    int                        pushout[MAX_PARAM_COUNT];
+    const parsec_flow_t        *flow[MAX_PARAM_COUNT];
+};
+
+struct __parsec_openmp_exec_stream {
+    struct __parsec_openmp_context **tasks;
+    int *events;
+    int omp_stream;
+    int32_t max_events;  /* number of potential events, and tasks */
+    int32_t executed;    /* number of executed tasks */
+    int32_t start, end;  /* circular buffer management start and end positions */
+    parsec_list_t *fifo_pending;
+    parsec_openmp_workspace_t *workspace;
+#if defined(PARSEC_PROF_TRACE)
+    parsec_thread_profiling_t *profiling;
+#endif  /* defined(PROFILING) */
+#if defined(PARSEC_PROF_TRACE)
+    int prof_event_track_enable;
+    int prof_event_key_start, prof_event_key_end;
+#endif  /* defined(PROFILING) */
+};
+
+struct _parsec_openmp_gpu_device {
+    parsec_device_t super;
+    uint8_t omp_index;
+    uint8_t major;
+    uint8_t minor;
+    uint8_t max_exec_streams;
+    int16_t peer_access_mask;  /**< A bit set to 1 represent the capability of
+                                *   the device to access directly the memory of
+                                *   the index of the set bit device.
+                                */
+    parsec_openmp_exec_stream_t* exec_stream;
+    volatile uint32_t mutex;
+    parsec_list_t gpu_mem_lru;
+    parsec_list_t gpu_mem_owned_lru;
+    parsec_list_t pending;
+    zone_malloc_t *memory;
+    parsec_list_item_t *sort_starting_p;
+};
+
+
+int parsec_omp_init(parsec_context_t *parsec_context);
+int parsec_omp_fini(void);
+
+/**
+ * Debugging functions.
+ */
+void dump_omp_gpu_stream(parsec_openmp_exec_stream_t* exec_stream);
+void dump_omp_state(parsec_openmp_device_t* parsec_openmp_gpu_device);
+
+/****************************************************
+ ** GPU-DATA Specific Starts Here **
+ ****************************************************/
+
+/**
+ * Overload the default data_copy_t with a GPU specialized type
+ */
+typedef parsec_data_copy_t parsec_openmp_data_copy_t;
+
+#include "parsec/data_distribution.h"
+
+/* GPU workspace  ONLY works when PARSEC_ALLOC_GPU_PER_TILE is OFF */
+int parsec_openmp_push_workspace(parsec_openmp_device_t* parsec_openmp_gpu_device, parsec_openmp_exec_stream_t* gpu_stream);
+void* parsec_openmp_pop_workspace(parsec_openmp_device_t* parsec_openmp_gpu_device, parsec_openmp_exec_stream_t* gpu_stream, size_t size);
+int parsec_openmp_free_workspace(parsec_openmp_device_t * parsec_openmp_gpu_device);
+
+
+/**
+ * Progress
+ */
+/**
+ * This version is based on 4 streams: one for transfers from the memory to
+ * the GPU, 2 for kernel executions and one for tranfers from the GPU into
+ * the main memory. The synchronization on each stream is based on omp events,
+ * such an event indicate that a specific epoch of the lifetime of a task has
+ * been completed. Each type of stream (in, exec and out) has a pending FIFO,
+ * where tasks ready to jump to the respective step are waiting.
+ */
+parsec_hook_return_t
+parsec_openmp_kernel_scheduler(parsec_execution_stream_t *es,
+                             parsec_openmp_context_t    *gpu_task,
+                             int which_gpu );
+
+/**
+ * Predefined generic progress functions
+ */
+
+/**
+ *  This function schedule the move of all the data required for a
+ *  specific task from the main memory into the GPU memory.
+ *
+ *  Returns:
+ *     a positive number: the number of data to be moved.
+ *     -1: data cannot be moved into the GPU.
+ *     -2: No more room on the GPU to move this data.
+ */
+int
+parsec_openmp_kernel_push(parsec_openmp_device_t          *parsec_openmp_gpu_device,
+                       parsec_openmp_context_t     *gpu_task,
+                       parsec_openmp_exec_stream_t *gpu_stream);
+
+/**
+ *  This function schedule the move of all the modified data for a
+ *  specific task from the GPU memory into the main memory.
+ *
+ *  Returns: negative number if any error occured.
+ *           positive: the number of data to be moved.
+ */
+int
+parsec_openmp_kernel_pop(parsec_openmp_device_t          *parsec_openmp_gpu_device,
+                      parsec_openmp_context_t     *gpu_task,
+                      parsec_openmp_exec_stream_t *gpu_stream);
+
+/**
+ * Make sure all data on the device is correctly put back into the queues.
+ */
+int
+parsec_openmp_kernel_epilog(parsec_openmp_device_t      *parsec_openmp_gpu_device,
+                         parsec_openmp_context_t *gpu_task );
+
+int
+parsec_openmp_kernel_cleanout(parsec_openmp_device_t       *parsec_openmp_gpu_device,
+                            parsec_openmp_context_t *gpu_task );
+
+END_C_DECLS
+
+#endif /* defined(PARSEC_HAVE_OPENMP) */
+
+#endif
diff --git a/parsec/include/parsec/parsec_options.h.in b/parsec/include/parsec/parsec_options.h.in
index 0bc193a0f..ae6b98977 100644
--- a/parsec/include/parsec/parsec_options.h.in
+++ b/parsec/include/parsec/parsec_options.h.in
@@ -57,6 +57,7 @@
 #cmakedefine PARSEC_GPU_CUDA_ALLOC_PER_TILE
 #cmakedefine PARSEC_GPU_WITH_OPENCL
 #cmakedefine PARSEC_HAVE_PEER_DEVICE_MEMORY_ACCESS
+#cmakedefine PARSEC_GPU_WITH_OPENMP
 
 /* debug */
 #cmakedefine PARSEC_DEBUG
@@ -94,6 +95,7 @@
 #cmakedefine PARSEC_HAVE_HWLOC
 #cmakedefine PARSEC_HAVE_PAPI
 #cmakedefine PARSEC_HAVE_CUDA
+#cmakedefine PARSEC_HAVE_OPENMP
 #cmakedefine PARSEC_HAVE_OPENCL
 #cmakedefine PARSEC_HAVE_MPI
 #cmakedefine PARSEC_HAVE_MPI_20
@@ -102,6 +104,7 @@
 #define PARSEC_INSTALL_PREFIX "@CMAKE_INSTALL_PREFIX@"
 /* Default PATH to look for the CUDA .so files */
 #define PARSEC_LIB_CUDA_PREFIX ".;@CUDA_cublas_LIBRARY@;@CMAKE_BINARY_DIR@/dplasma/cores"
+#define PARSEC_LIB_OPENMP_PREFIX ".;@CUDA_cublas_LIBRARY@;@CMAKE_BINARY_DIR@/dplasma/cores"
 
 #define PARSEC_SIZEOF_VOID_P @CMAKE_SIZEOF_VOID_P@
 
diff --git a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c
index 61c5e5802..4bbb47dde 100644
--- a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c
+++ b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c
@@ -1238,6 +1238,10 @@ static void jdf_minimal_code_before_prologue(const jdf_t *jdf)
             "#include \"parsec/devices/cuda/dev_cuda.h\"\n"
             "extern int parsec_cuda_output_stream;\n"
             "#endif  /* defined(PARSEC_HAVE_CUDA) */\n"
+            "#if defined(PARSEC_HAVE_OPENMP)\n"
+            "#include \"parsec/devices/openmp/dev_omp.h\"\n"
+            "extern int parsec_openmp_output_stream;\n"
+            "#endif  /* defined(PARSEC_HAVE_OPENMP) */\n"
             "#include <alloca.h>\n\n"
             "#define PARSEC_%s_NB_TASK_CLASSES %d\n"
             "#define PARSEC_%s_NB_DATA %d\n\n"
@@ -4724,6 +4728,291 @@ jdf_generate_code_data_lookup(const jdf_t *jdf,
     string_arena_free(sa2);
 }
 
+static void jdf_generate_code_hook_openmp(const jdf_t *jdf,
+                                        const jdf_function_entry_t *f,
+                                        const jdf_body_t* body,
+                                        const char *name)
+{
+    jdf_def_list_t *type_property;
+    jdf_def_list_t *weight_property;
+    jdf_def_list_t *device_property;
+    const char *dyld;
+    const char *dyldtype;
+    const char *device;
+    const char *weight;
+    string_arena_t *sa, *sa2, *sa3;
+    assignment_info_t ai;
+    init_from_data_info_t ai2;
+    jdf_dataflow_t *fl;
+    expr_info_t info;
+    int di;
+    int profile_on;
+    char* output;
+
+    profile_on = profile_enabled(f->properties) && profile_enabled(body->properties);
+
+    jdf_find_property(body->properties, "type", &type_property);
+
+    /* Get the dynamic function properties */
+    dyld = jdf_property_get_string(body->properties, "dyld", NULL);
+    dyldtype = jdf_property_get_string(body->properties, "dyldtype", "void*");
+
+    sa  = string_arena_new(64);
+    sa2 = string_arena_new(64);
+    sa3 = string_arena_new(64);
+
+    ai.sa = sa2;
+    ai.holder = "this_task->locals.";
+    ai.expr = NULL;
+
+    string_arena_add_string(sa3, "%s",
+                            UTIL_DUMP_LIST(sa, f->locals, next,
+                                           dump_local_assignments, &ai, "", "  ", "\n", "\n"));
+
+    string_arena_add_string(sa3, "%s",
+                            UTIL_DUMP_LIST_FIELD(sa, f->locals, next, name,
+                                                 dump_string, NULL, "", "  (void)", ";", ";\n"));
+
+    /* Generate the openmp_kernel_submit structure and function */
+    coutput("struct parsec_body_omp_%s_%s_s {\n"
+            "  uint8_t      index;\n"
+            "  int*         stream;\n"
+            "  %s           dyld_fn;\n"
+            "};\n"
+            "\n"
+            "static int openmp_kernel_submit_%s_%s(parsec_openmp_device_t            *gpu_device,\n"
+            "                                   parsec_openmp_context_t     *gpu_task,\n"
+            "                                   parsec_openmp_exec_stream_t *gpu_stream )\n"
+            "{\n"
+            "  %s *this_task = (%s *)gpu_task->ec;\n"
+            "  __parsec_%s_internal_taskpool_t *__parsec_tp = (__parsec_%s_internal_taskpool_t *)this_task->taskpool;\n"
+            "  struct parsec_body_omp_%s_%s_s parsec_body = { gpu_device->omp_index, &gpu_stream->events[gpu_stream->start], NULL };\n"
+            "  int* parsec_openmp_stream = parsec_body.stream;\n"
+            "  assert(0 == *parsec_openmp_stream);\n"
+            "  *parsec_openmp_stream = 1;\n"
+            "%s\n"
+            "  (void)gpu_device; (void)gpu_stream; (void)__parsec_tp; (void)parsec_body;\n",
+            jdf_basename, f->fname,
+            dyldtype,
+            jdf_basename, f->fname,
+            parsec_get_name(jdf, f, "task_t"), parsec_get_name(jdf, f, "task_t"),
+            jdf_basename, jdf_basename,
+            jdf_basename, f->fname,
+            string_arena_get_string( sa3 ));
+
+    ai2.sa = sa2;
+    ai2.where = "out";
+    output = UTIL_DUMP_LIST(sa, f->dataflow, next,
+                            dump_data_initialization_from_data_array, &ai2, "", "", "", "");
+    if( 0 != strlen(output) ) {
+        coutput("  /** Declare the variables that will hold the data, and all the accounting for each */\n"
+                "%s\n",
+                output);
+    }
+
+    /**
+     * Generate code for the simulation.
+     */
+    coutput("  /** Update starting simulation date */\n"
+            "#if defined(PARSEC_SIM)\n"
+            "  {\n"
+            "    this_task->sim_exec_date = 0;\n");
+    for( di = 0, fl = f->dataflow; fl != NULL; fl = fl->next, di++ ) {
+
+        if(fl->flow_flags & JDF_FLOW_TYPE_CTL) continue;  /* control flow, nothing to store */
+
+        coutput("    data_repo_entry_t *e%s = this_task->data._f_%s.data_repo;\n"
+                "    if( (NULL != e%s) && (e%s->sim_exec_date > this_task->sim_exec_date) )\n"
+                "      this_task->sim_exec_date = e%s->sim_exec_date;\n",
+                fl->varname, fl->varname,
+                fl->varname, fl->varname,
+                fl->varname);
+    }
+    coutput("    if( this_task->task_class->sim_cost_fct != NULL ) {\n"
+            "      this_task->sim_exec_date += this_task->task_class->sim_cost_fct(this_task);\n"
+            "    }\n"
+            "    if( es->largest_simulation_date < this_task->sim_exec_date )\n"
+            "      es->largest_simulation_date = this_task->sim_exec_date;\n"
+            "  }\n"
+            "#endif\n");
+
+    jdf_generate_code_cache_awareness_update(jdf, f);
+
+    coutput("#if defined(PARSEC_DEBUG_NOISIER)\n"
+            "  {\n"
+            "    char tmp[MAX_TASK_STRLEN];\n"
+            "    PARSEC_DEBUG_VERBOSE(10, parsec_openmp_output_stream, \"GPU[%%1d]:\\tEnqueue on device %%s priority %%d\\n\", gpu_device->omp_index, \n"
+            "           parsec_task_snprintf(tmp, MAX_TASK_STRLEN, (parsec_task_t *)this_task),\n"
+            "           this_task->priority );\n"
+            "  }\n"
+            "#endif /* defined(PARSEC_DEBUG_NOISIER) */\n" );
+
+    jdf_generate_code_dry_run_before(jdf, f);
+    jdf_coutput_prettycomment('-', "%s BODY", f->fname);
+
+    if( profile_on ) {
+        coutput("  PARSEC_TASK_PROF_TRACE_IF(gpu_stream->prof_event_track_enable,\n"
+                "                           gpu_stream->profiling,\n"
+                "                           (-1 == gpu_stream->prof_event_key_start ?\n"
+                "                           PARSEC_PROF_FUNC_KEY_START(this_task->taskpool,\n"
+                "                                                     this_task->task_class->task_class_id) :\n"
+                "                           gpu_stream->prof_event_key_start),\n"
+                "                           (parsec_task_t*)this_task);\n");
+    }
+
+    dyld = jdf_property_get_string(body->properties, "dyld", NULL);
+    dyldtype = jdf_property_get_string(body->properties, "dyldtype", "void*");
+    if ( NULL != dyld ) {
+        coutput("  /* Pointer to dynamic gpu function */\n"
+                "  parsec_body.dyld_fn = (%s)this_task->task_class->incarnations[gpu_device->omp_index].dyld_fn;\n\n",
+                dyldtype );
+    }
+
+    coutput("%s\n", body->external_code);
+    if( !JDF_COMPILER_GLOBAL_ARGS.noline ) {
+        coutput("#line %d \"%s\"\n", cfile_lineno+1, jdf_cfilename);
+    }
+    jdf_coutput_prettycomment('-', "END OF %s BODY", f->fname);
+    jdf_generate_code_dry_run_after(jdf, f);
+    coutput("  return PARSEC_HOOK_RETURN_DONE;\n"
+            "}\n\n");
+
+    /* Generate the hook_openmp */
+    coutput("static int %s_%s(parsec_execution_stream_t *es, %s *this_task)\n"
+            "{\n"
+            "  __parsec_%s_internal_taskpool_t *__parsec_tp = (__parsec_%s_internal_taskpool_t *)this_task->taskpool;\n"
+            "  parsec_openmp_context_t *gpu_task;\n"
+            "  double ratio;\n"
+            "  int dev_index;\n"
+            "  %s\n"
+            "  (void)es; (void)__parsec_tp;\n"
+            "\n",
+            name, type_property->expr->jdf_var, parsec_get_name(jdf, f, "task_t"),
+            jdf_basename, jdf_basename,
+            string_arena_get_string( sa3 ));
+
+    info.sa = string_arena_new(64);
+    info.prefix = "";
+    info.suffix = "";
+    info.assignments = "&this_task->locals";
+
+    /* Get the ratio to  apply on the weight for this task */
+    jdf_find_property( body->properties, "weight", &weight_property );
+    if ( NULL != weight_property ) {
+        weight = dump_expr((void**)weight_property->expr, &info);
+    }
+    else {
+        weight = "1.";
+    }
+    coutput("  ratio = %s;\n", weight);
+    
+    /* Get the hint for statix and/or external gpu scheduling */
+    jdf_find_property( body->properties, "device", &device_property );
+    if ( NULL != device_property ) {
+        device = dump_expr((void**)device_property->expr, &info);
+        coutput("  dev_index = %s;\n"
+                "  if (dev_index < -1) {\n"
+                "    return PARSEC_HOOK_RETURN_NEXT;\n"
+                "  } else if (dev_index == -1) {\n"
+                "    dev_index = parsec_devices_best_load((parsec_task_t*)this_task, ratio);\n"
+                "  } else {\n"
+                "    dev_index = (dev_index %% (parsec_devices_enabled()-2)) + 2;\n"
+                "  }\n",
+                device);
+    } else {
+        coutput("  dev_index = parsec_devices_best_load((parsec_task_t*)this_task, ratio);\n");
+    }
+    coutput("  assert(dev_index >= 0);\n"
+            "  if( dev_index < 2 ) {\n"
+            "    return PARSEC_HOOK_RETURN_NEXT;  /* Fall back */\n"
+            "  }\n"
+            "  parsec_device_load[dev_index] += ratio * parsec_device_sweight[dev_index];\n"
+            "\n"
+            "  gpu_task = (parsec_openmp_context_t*)calloc(1, sizeof(parsec_openmp_context_t));\n"
+            "  OBJ_CONSTRUCT(gpu_task, parsec_list_item_t);\n"
+            "  gpu_task->ec = (parsec_task_t*)this_task;\n"
+            "  gpu_task->submit = &openmp_kernel_submit_%s_%s;\n"
+            "  gpu_task->task_type = 0;\n",
+            jdf_basename, f->fname);
+
+    /* Dump the dataflow */
+    for(fl = f->dataflow, di = 0; fl != NULL; fl = fl->next, di++) {
+        coutput("  gpu_task->pushout[%d] = 0;\n"
+                "  gpu_task->flow[%d]    = &%s;\n",
+                di,
+                di, JDF_OBJECT_ONAME( fl ));
+
+        if (fl->flow_flags & JDF_FLOW_TYPE_WRITE) {
+            jdf_dep_t *dl;
+            int testtrue, testfalse;
+
+            /**
+             * We force the pushout for every data that is not only going to the
+             * same kind of kernel in the future.
+             * (TODO: could be avoided with different GPU compliant kernels)
+             */
+            for(dl = fl->deps; dl != NULL; dl = dl->next) {
+                if( dl->dep_flags & JDF_DEP_FLOW_IN )
+                    continue;
+
+                testtrue = (dl->guard->calltrue != NULL) &&
+                    ((dl->guard->calltrue->var == NULL ) ||
+                     (strcmp(dl->guard->calltrue->func_or_mem, f->fname)));
+
+                testfalse = (dl->guard->callfalse != NULL) &&
+                    ((dl->guard->callfalse->var == NULL ) ||
+                     (strcmp(dl->guard->callfalse->func_or_mem, f->fname)));
+
+                switch( dl->guard->guard_type ) {
+                case JDF_GUARD_UNCONDITIONAL:
+                    if(testtrue) {
+                        coutput("  gpu_task->pushout[%d] = 1;\n", di);
+                        goto nextflow;
+                    }
+                    break;
+                case JDF_GUARD_BINARY:
+                    if(testtrue) {
+                        coutput("  if( %s ) {\n"
+                                "    gpu_task->pushout[%d] = 1;\n"
+                                "  }",
+                                dump_expr((void**)dl->guard->guard, &info), di);
+                    }
+                    break;
+                case JDF_GUARD_TERNARY:
+                    if( testtrue ) {
+                        if( testfalse ) {
+                            coutput("  gpu_task->pushout[%d] = 1;\n", di);
+                        } else {
+                            coutput("  if( %s ) {\n"
+                                    "    gpu_task->pushout[%d] = 1;\n"
+                                    "  }\n",
+                                    dump_expr((void**)dl->guard->guard, &info), di);
+                        }
+                    } else if ( testfalse ) {
+                        coutput("  if( !(%s) ) {\n"
+                                "    gpu_task->pushout[%d] = 1;\n"
+                                "  }\n",
+                                dump_expr((void**)dl->guard->guard, &info), di);
+                    }
+                    break;
+                }
+            }
+          nextflow:
+            ;
+        }
+    }
+    string_arena_free(info.sa);
+
+
+    coutput("\n"
+            "  return parsec_openmp_kernel_scheduler( es, gpu_task, dev_index );\n"
+            "}\n\n");
+
+    string_arena_free(sa);
+    string_arena_free(sa2);
+    string_arena_free(sa3);
+}
+
 static void jdf_generate_code_hook_cuda(const jdf_t *jdf,
                                         const jdf_function_entry_t *f,
                                         const jdf_body_t* body,
@@ -4769,16 +5058,16 @@ static void jdf_generate_code_hook_cuda(const jdf_t *jdf,
                             UTIL_DUMP_LIST_FIELD(sa, f->locals, next, name,
                                                  dump_string, NULL, "", "  (void)", ";", ";\n"));
 
-    /* Generate the gpu_kernel_submit structure and function */
+    /* Generate the cuda_kernel_submit structure and function */
     coutput("struct parsec_body_cuda_%s_%s_s {\n"
             "  uint8_t      index;\n"
             "  cudaStream_t stream;\n"
             "  %s           dyld_fn;\n"
             "};\n"
             "\n"
-            "static int gpu_kernel_submit_%s_%s(gpu_device_t            *gpu_device,\n"
-            "                                   parsec_gpu_context_t     *gpu_task,\n"
-            "                                   parsec_gpu_exec_stream_t *gpu_stream )\n"
+            "static int cuda_kernel_submit_%s_%s(parsec_cuda_device_t            *gpu_device,\n"
+            "                                   parsec_cuda_context_t     *gpu_task,\n"
+            "                                   parsec_cuda_exec_stream_t *gpu_stream )\n"
             "{\n"
             "  %s *this_task = (%s *)gpu_task->ec;\n"
             "  __parsec_%s_internal_taskpool_t *__parsec_tp = (__parsec_%s_internal_taskpool_t *)this_task->taskpool;\n"
@@ -4874,7 +5163,7 @@ static void jdf_generate_code_hook_cuda(const jdf_t *jdf,
     coutput("static int %s_%s(parsec_execution_stream_t *es, %s *this_task)\n"
             "{\n"
             "  __parsec_%s_internal_taskpool_t *__parsec_tp = (__parsec_%s_internal_taskpool_t *)this_task->taskpool;\n"
-            "  parsec_gpu_context_t *gpu_task;\n"
+            "  parsec_cuda_context_t *gpu_task;\n"
             "  double ratio;\n"
             "  int dev_index;\n"
             "  %s\n"
@@ -4907,13 +5196,13 @@ static void jdf_generate_code_hook_cuda(const jdf_t *jdf,
                 "  if (dev_index < -1) {\n"
                 "    return PARSEC_HOOK_RETURN_NEXT;\n"
                 "  } else if (dev_index == -1) {\n"
-                "    dev_index = parsec_gpu_get_best_device((parsec_task_t*)this_task, ratio);\n"
+                "    dev_index = parsec_devices_best_load((parsec_task_t*)this_task, ratio);\n"
                 "  } else {\n"
                 "    dev_index = (dev_index %% (parsec_devices_enabled()-2)) + 2;\n"
                 "  }\n",
                 device);
     } else {
-        coutput("  dev_index = parsec_gpu_get_best_device((parsec_task_t*)this_task, ratio);\n");
+        coutput("  dev_index = parsec_devices_best_load((parsec_task_t*)this_task, ratio);\n");
     }
     coutput("  assert(dev_index >= 0);\n"
             "  if( dev_index < 2 ) {\n"
@@ -4921,10 +5210,10 @@ static void jdf_generate_code_hook_cuda(const jdf_t *jdf,
             "  }\n"
             "  parsec_device_load[dev_index] += ratio * parsec_device_sweight[dev_index];\n"
             "\n"
-            "  gpu_task = (parsec_gpu_context_t*)calloc(1, sizeof(parsec_gpu_context_t));\n"
+            "  gpu_task = (parsec_cuda_context_t*)calloc(1, sizeof(parsec_cuda_context_t));\n"
             "  OBJ_CONSTRUCT(gpu_task, parsec_list_item_t);\n"
             "  gpu_task->ec = (parsec_task_t*)this_task;\n"
-            "  gpu_task->submit = &gpu_kernel_submit_%s_%s;\n"
+            "  gpu_task->submit = &cuda_kernel_submit_%s_%s;\n"
             "  gpu_task->task_type = 0;\n",
             jdf_basename, f->fname);
 
@@ -4998,7 +5287,7 @@ static void jdf_generate_code_hook_cuda(const jdf_t *jdf,
 
 
     coutput("\n"
-            "  return parsec_gpu_kernel_scheduler( es, gpu_task, dev_index );\n"
+            "  return parsec_cuda_kernel_scheduler( es, gpu_task, dev_index );\n"
             "}\n\n");
 
     string_arena_free(sa);
@@ -5048,6 +5337,10 @@ static void jdf_generate_code_hook(const jdf_t *jdf,
             jdf_generate_code_hook_cuda(jdf, f, body, name);
             goto hook_end_block;
         }
+        if (!strcmp(type_property->expr->jdf_var, "OPENMP")) {
+            jdf_generate_code_hook_openmp(jdf, f, body, name);
+            goto hook_end_block;
+        }
     }
     sa  = string_arena_new(64);
     sa2 = string_arena_new(64);